convert-hf-to-gguf.py

#!/usr/bin/env python3
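
# convert-hf-to-gguf.py: converts a Hugging Face transformers checkpoint (safetensors or
# PyTorch .bin shards) into a GGUF file for llama.cpp. Each supported architecture gets a
# Model subclass below that fills in the GGUF metadata, the vocab and the tensor data.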
from __future__ import annotations

import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional

import numpy as np
import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))

import gguf


###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6
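

# Base class: loads the HF checkpoint, writes generic GGUF metadata and performs the
# default tensor conversion. Architecture-specific subclasses override set_vocab(),
# set_gguf_parameters() and/or write_tensors() where the defaults are not sufficient.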
class Model:
    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.model_arch = self._get_model_architecture()
        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
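
    # Note: self.ftype mirrors the ftype_map at the bottom of this script:
    # gguf.GGMLQuantizationType.F32 (0) keeps every tensor as float32, while
    # gguf.GGMLQuantizationType.F16 (1) stores eligible 2D ".weight" tensors as float16
    # (see write_tensors below).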

    def set_vocab(self):
        self._set_vocab_gpt2()

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        for part_name in self.part_names:
            print(f"gguf: loading model part '{part_name}'")
            ctx: ContextManager[Any]
            if self.is_safetensors:
                from safetensors import safe_open
                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

            with ctx as model_part:
                for name in model_part.keys():
                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
                    yield name, data

    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams.get(
            "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
        ))
        if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
            self.gguf_writer.add_context_length(n_ctx)
        if (n_embd := self.hparams.get("hidden_size")) is not None:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

    def write(self):
        self.write_tensors()
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file()
        self.gguf_writer.close()

    def write_vocab(self):
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()

    @staticmethod
    def count_model_parts(dir_model: Path, prefix: str) -> int:
        num_parts = 0
        for filename in os.listdir(dir_model):
            if filename.endswith(prefix):
                num_parts += 1
        return num_parts

    @staticmethod
    def load_hparams(dir_model):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def from_model_architecture(model_architecture):
        if model_architecture == "GPTNeoXForCausalLM":
            return GPTNeoXModel
        if model_architecture == "BloomForCausalLM":
            return BloomModel
        if model_architecture == "MPTForCausalLM":
            return MPTModel
        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return BaichuanModel
        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
            return FalconModel
        if model_architecture == "GPTBigCodeForCausalLM":
            return StarCoderModel
        if model_architecture == "GPTRefactForCausalLM":
            return RefactModel
        if model_architecture == "PersimmonForCausalLM":
            return PersimmonModel
        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
        return Model

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0

    def _get_part_names(self):
        if self.is_safetensors:
            if self.num_parts == 1:  # there's only one .safetensors file
                return ("model.safetensors",)
            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))

        if self.num_parts == 1:  # there's only one .bin file
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
        arch = self.hparams["architectures"][0]
        if arch == "GPTNeoXForCausalLM":
            return gguf.MODEL_ARCH.GPTNEOX
        if arch == "BloomForCausalLM":
            return gguf.MODEL_ARCH.BLOOM
        if arch == "MPTForCausalLM":
            return gguf.MODEL_ARCH.MPT
        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return gguf.MODEL_ARCH.BAICHUAN
        if arch in ("FalconForCausalLM", "RWForCausalLM"):
            return gguf.MODEL_ARCH.FALCON
        if arch == "GPTBigCodeForCausalLM":
            return gguf.MODEL_ARCH.STARCODER
        if arch == "GPTRefactForCausalLM":
            return gguf.MODEL_ARCH.REFACT
        if arch == "PersimmonForCausalLM":
            return gguf.MODEL_ARCH.PERSIMMON
        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
        raise NotImplementedError(f'Architecture "{arch}" not supported!')
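
    # Shared vocab helper for BPE (GPT-2 style) tokenizers: loads the tokenizer via
    # transformers.AutoTokenizer and writes the token list, token types and merges to GGUF.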
    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer  # type: ignore[attr-defined]
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode('utf-8')
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special:
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)
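
    # SentencePiece-based vocab helper (LLaMA-style tokenizers): reads tokenizer.model plus an
    # optional added_tokens.json and writes tokens, scores and token types to GGUF.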
    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
                    tokens.append(key.encode("utf-8"))
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)


class GPTNeoXModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])


class BloomModel(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Bloom")
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams["n_layer"]
        tensors = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        has_lm_head = True
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))

        for name, data_torch in tensors.items():
            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
                has_lm_head = False

            name = re.sub(r'transformer\.', '', name)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
                # Map bloom-style qkv_linear to gpt-style qkv_linear
                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
                data = np.concatenate(
                    (
                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.weight")
            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
                data = np.concatenate(
                    (
                        qkv_bias[:, 0, :].reshape((n_embed,)),
                        qkv_bias[:, 1, :].reshape((n_embed,)),
                        qkv_bias[:, 2, :].reshape((n_embed,)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.bias")

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "word_embeddings.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


class MPTModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
            self.gguf_writer.add_head_count_kv(kv_n_heads)
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # note: MPT output is tied to (same as) wte in the original model;
            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
            if new_name == "token_embd.weight":
                self.gguf_writer.add_tensor("output.weight", data)


class BaichuanModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: cannot find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

        for i in range(block_count):
            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
                print(f"Unpacking and permuting layer {i}")
                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
                    self._reverse_hf_part(w, 2)
                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
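
    # The helpers below split the fused W_pack tensor into its Q/K/V parts and undo the
    # rotary head permutation used by the HF (LLaMA-style) checkpoint layout, so the
    # resulting tensors match what llama.cpp expects.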
    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head

        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def _reverse_hf_permute_part(
        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
    ) -> Tensor:
        r = weights.shape[0] // 3
        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)

    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
        r = weights.shape[0] // 3
        return weights[r * n_part:r * n_part + r, ...]


class FalconModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        self.gguf_writer.add_name("Falcon")
        self.gguf_writer.add_context_length(2048)  # not in config.json
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        head_dim = self.hparams["hidden_size"] // n_head
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
            if "query_key_value" in name:
                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class StarCoderModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("StarCoder")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)


class RefactModel(Model):
    def set_gguf_parameters(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("Refact")
        # Refact uses ALiBi, so this context length is simply the n_positions value from
        # config.json (presumably the context size used during training).
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(ff_dim)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)
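
    # Refact stores K/V in a fused attn.kv tensor and gate/up in a fused mlp.gate_up_proj
    # tensor; write_tensors splits them into the separate tensors expected by the GGUF mapping.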
    def write_tensors(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        n_head = self.hparams["n_head"]
        n_head_kv = 1
        head_dim = self.hparams["n_embd"] // n_head
        block_count = self.hparams["n_layer"]

        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        tensors = dict(self.get_tensors())
        for i in range(block_count):
            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
                del tensors[f"transformer.h.{i}.attn.kv.weight"]
            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
                del tensors[f"transformer.h.{i}.attn.q.weight"]
            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

        for name, data_torch in tensors.items():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class PersimmonModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = head_count
        hidden_size = self.hparams["hidden_size"]

        self.gguf_writer.add_name('persimmon-8b-chat')
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
        # self.gguf_writer.add_bos_token_id(71013)
        # self.gguf_writer.add_eos_token_id(71013)

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class StableLMModel(Model):
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(1e-5)


class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
        byte_encoder = bytes_to_unicode()
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
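
    # Greedily re-applies BPE merges from the tiktoken-style mergeable_ranks; with max_rank
    # set, merging stops just before the merge of that rank, which is how set_vocab() below
    # recovers the two parts each merge was built from.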
    @staticmethod
    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx = i
                    min_rank = rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            assert min_idx is not None
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts

    def set_vocab(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer  # type: ignore[attr-defined]
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size

        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[self.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(self.token_bytes_to_string, merged)))

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
        added_vocab = tokenizer.special_tokens

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])

    def write_tensors(self):
        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


###### CONVERSION LOGIC ######

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16"], default="f16",
        help="output format - use f32 for float32, f16 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on a big endian machine")
    parser.add_argument(
        "model", type=Path,
        help="directory containing the model files",
    )
    return parser.parse_args()
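

# Example invocation (paths are illustrative):
#   python convert-hf-to-gguf.py --outtype f16 --outfile ./my-model-f16.gguf ./my-hf-model
# With no --outfile, the GGUF file is written next to the model as ggml-model-<outtype>.gguf.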

args = parse_args()

dir_model = args.model
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file=sys.stderr)
    sys.exit(1)

ftype_map = {
    "f32": gguf.GGMLQuantizationType.F32,
    "f16": gguf.GGMLQuantizationType.F16,
}

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

print(f"Loading model: {dir_model.name}")

hparams = Model.load_hparams(dir_model)

with torch.inference_mode():
    model_class = Model.from_model_architecture(hparams["architectures"][0])
    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

    print("Set model parameters")
    model_instance.set_gguf_parameters()

    print("Set model tokenizer")
    model_instance.set_vocab()

    if args.vocab_only:
        print(f"Exporting model vocab to '{fname_out}'")
        model_instance.write_vocab()
    else:
        print(f"Exporting model to '{fname_out}'")
        model_instance.write()

    print(f"Model successfully exported to '{fname_out}'")