#!/usr/bin/env python3
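# GGUF file format constants and writer utilities: key/value metadata keys,
# model architecture / tensor-name mappings, and a GGUFWriter that serializes
# metadata plus aligned tensor data, as used by ggml/llama.cpp.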
from __future__ import annotations

import json
import os
import shutil
import struct
import sys
import tempfile
from enum import IntEnum, auto
from io import BufferedWriter
from pathlib import Path
from typing import IO, Any, BinaryIO, Callable, Sequence

import numpy as np
#
# constants
#

GGUF_MAGIC             = 0x46554747
GGUF_VERSION           = 2
GGUF_DEFAULT_ALIGNMENT = 32

# general
KEY_GENERAL_ARCHITECTURE         = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT            = "general.alignment"
KEY_GENERAL_NAME                 = "general.name"
KEY_GENERAL_AUTHOR               = "general.author"
KEY_GENERAL_URL                  = "general.url"
KEY_GENERAL_DESCRIPTION          = "general.description"
KEY_GENERAL_LICENSE              = "general.license"
KEY_GENERAL_SOURCE_URL           = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
KEY_GENERAL_FILE_TYPE            = "general.file_type"

# LLM
KEY_CONTEXT_LENGTH        = "{arch}.context_length"
KEY_EMBEDDING_LENGTH      = "{arch}.embedding_length"
KEY_BLOCK_COUNT           = "{arch}.block_count"
KEY_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"

# attention
KEY_ATTENTION_HEAD_COUNT        = "{arch}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV         = "{arch}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"

# tokenization
KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST       = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES     = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES     = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID     = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID     = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID     = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID     = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
#
# recommended mapping of model tensor names for storage in gguf
#

class MODEL_ARCH(IntEnum):
    LLAMA  : int = auto()
    FALCON : int = auto()
    GPT2   : int = auto()
    GPTJ   : int = auto()
    GPTNEOX: int = auto()
    MPT    : int = auto()

class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD   : int = auto()
    POS_EMBD     : int = auto()
    OUTPUT       : int = auto()
    OUTPUT_NORM  : int = auto()
    ROPE_FREQS   : int = auto()
    ATTN_Q       : int = auto()
    ATTN_K       : int = auto()
    ATTN_V       : int = auto()
    ATTN_QKV     : int = auto()
    ATTN_OUT     : int = auto()
    ATTN_NORM    : int = auto()
    ATTN_NORM_2  : int = auto()
    ATTN_ROT_EMBD: int = auto()
    FFN_GATE     : int = auto()
    FFN_DOWN     : int = auto()
    FFN_UP       : int = auto()
    FFN_NORM     : int = auto()

MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.LLAMA:   "llama",
    MODEL_ARCH.FALCON:  "falcon",
    MODEL_ARCH.GPT2:    "gpt2",
    MODEL_ARCH.GPTJ:    "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",
    MODEL_ARCH.MPT:     "mpt",
}

MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
    MODEL_ARCH.LLAMA: {
        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
        MODEL_TENSOR.OUTPUT:        "output",
        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPTNEOX: {
        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT:      "output",
        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_NORM:    "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.FALCON: {
        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT:      "output",
        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPT2: {
        # TODO
    },
    # TODO
}

# tensors that will not be serialized
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
}
class TensorNameMap:
    mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Token embeddings
        MODEL_TENSOR.TOKEN_EMBD: (
            "gpt_neox.embed_in",           # gptneox
            "transformer.wte",             # gpt2 mpt
            "transformer.word_embeddings", # falcon
            "model.embed_tokens",          # llama-hf
            "tok_embeddings",              # llama-pth
        ),

        # Position embeddings
        MODEL_TENSOR.POS_EMBD: (
            "transformer.wpe", # gpt2
        ),

        # Output
        MODEL_TENSOR.OUTPUT: (
            "embed_out", # gptneox
            "lm_head",   # gpt2 mpt falcon llama-hf
            "output",    # llama-pth
        ),

        # Output norm
        MODEL_TENSOR.OUTPUT_NORM: (
            "gpt_neox.final_layer_norm", # gptneox
            "transformer.ln_f",          # gpt2 falcon
            "model.norm",                # llama-hf
            "norm",                      # llama-pth
        ),

        # Rope frequencies
        MODEL_TENSOR.ROPE_FREQS: (
            "rope.freqs", # llama-pth
        ),
    }

    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Attention norm
        MODEL_TENSOR.ATTN_NORM: (
            "gpt_neox.layers.{bid}.input_layernorm", # gptneox
            "transformer.h.{bid}.ln_1",              # gpt2
            "transformer.blocks.{bid}.norm_1",       # mpt
            "transformer.h.{bid}.input_layernorm",   # falcon7b
            "transformer.h.{bid}.ln_mlp",            # falcon40b
            "model.layers.{bid}.input_layernorm",    # llama-hf
            "layers.{bid}.attention_norm",           # llama-pth
        ),

        # Attention norm 2
        MODEL_TENSOR.ATTN_NORM_2: (
            "transformer.h.{bid}.ln_attn", # falcon40b
        ),

        # Attention query-key-value
        MODEL_TENSOR.ATTN_QKV: (
            "gpt_neox.layers.{bid}.attention.query_key_value",    # gptneox
            "transformer.h.{bid}.attn.c_attn",                    # gpt2
            "transformer.blocks.{bid}.attn.Wqkv",                 # mpt
            "transformer.h.{bid}.self_attention.query_key_value", # falcon
        ),

        # Attention query
        MODEL_TENSOR.ATTN_Q: (
            "model.layers.{bid}.self_attn.q_proj", # llama-hf
            "layers.{bid}.attention.wq",           # llama-pth
        ),

        # Attention key
        MODEL_TENSOR.ATTN_K: (
            "model.layers.{bid}.self_attn.k_proj", # llama-hf
            "layers.{bid}.attention.wk",           # llama-pth
        ),

        # Attention value
        MODEL_TENSOR.ATTN_V: (
            "model.layers.{bid}.self_attn.v_proj", # llama-hf
            "layers.{bid}.attention.wv",           # llama-pth
        ),

        # Attention output
        MODEL_TENSOR.ATTN_OUT: (
            "gpt_neox.layers.{bid}.attention.dense",    # gptneox
            "transformer.h.{bid}.attn.c_proj",          # gpt2
            "transformer.blocks.{bid}.attn.out_proj",   # mpt
            "transformer.h.{bid}.self_attention.dense", # falcon
            "model.layers.{bid}.self_attn.o_proj",      # llama-hf
            "layers.{bid}.attention.wo",                # llama-pth
        ),

        # Rotary embeddings
        MODEL_TENSOR.ATTN_ROT_EMBD: (
            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",  # llama-hf
            "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
        ),

        # Feed-forward norm
        MODEL_TENSOR.FFN_NORM: (
            "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
            "transformer.h.{bid}.ln_2",                       # gpt2
            "transformer.blocks.{bid}.norm_2",                # mpt
            "model.layers.{bid}.post_attention_layernorm",    # llama-hf
            "layers.{bid}.ffn_norm",                          # llama-pth
        ),

        # Feed-forward up
        MODEL_TENSOR.FFN_UP: (
            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
            "transformer.h.{bid}.mlp.c_fc",            # gpt2
            "transformer.blocks.{bid}.ffn.up_proj",    # mpt
            "transformer.h.{bid}.mlp.dense_h_to_4h",   # falcon
            "model.layers.{bid}.mlp.up_proj",          # llama-hf
            "layers.{bid}.feed_forward.w3",            # llama-pth
        ),

        # Feed-forward gate
        MODEL_TENSOR.FFN_GATE: (
            "model.layers.{bid}.mlp.gate_proj", # llama-hf
            "layers.{bid}.feed_forward.w1",     # llama-pth
        ),

        # Feed-forward down
        MODEL_TENSOR.FFN_DOWN: (
            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
            "transformer.h.{bid}.mlp.c_proj",          # gpt2
            "transformer.blocks.{bid}.ffn.down_proj",  # mpt
            "transformer.h.{bid}.mlp.dense_4h_to_h",   # falcon
            "model.layers.{bid}.mlp.down_proj",        # llama-hf
            "layers.{bid}.feed_forward.w2",            # llama-pth
        ),
    }

    mapping: dict[str, tuple[MODEL_TENSOR, str]]
    tensor_names: dict[MODEL_TENSOR, str]

    def __init__(self, arch: MODEL_ARCH, n_blocks: int):
        mapping = self.mapping = {}
        tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
        for tensor, keys in self.mappings_cfg.items():
            tensor_name = tensor_names.get(tensor)
            if tensor_name is None:
                continue
            for key in keys:
                mapping[key] = (tensor, tensor_name)
        for bid in range(n_blocks):
            for tensor, keys in self.block_mappings_cfg.items():
                tensor_name = tensor_names.get(tensor)
                if tensor_name is None:
                    continue
                tensor_name = tensor_name.format(bid = bid)
                for key in keys:
                    key = key.format(bid = bid)
                    mapping[key] = (tensor, tensor_name)

    def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
        result = self.mapping.get(key)
        if result is not None:
            return result
        for suffix in try_suffixes:
            if key.endswith(suffix):
                result = self.mapping.get(key[:-len(suffix)])
                if result is not None:
                    return (result[0], result[1] + suffix)
        return None

    def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
        if result is None:
            return None
        return result[1]

    def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
        if result is None:
            return None
        return result[0]

    def __getitem__(self, key: str) -> str:
        try:
            return self.mapping[key][1]
        except KeyError:
            raise KeyError(key)

    def __contains__(self, key: str) -> bool:
        return key in self.mapping

    def __repr__(self) -> str:
        return repr(self.mapping)
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
    return TensorNameMap(arch, n_blocks)
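
# A minimal usage sketch (the checkpoint tensor name below is illustrative of
# an HF-style LLaMA model; any name in mappings_cfg / block_mappings_cfg
# resolves the same way):
#
#   tmap = get_tensor_name_map(MODEL_ARCH.LLAMA, n_blocks = 32)
#   tmap.get_name("model.layers.0.self_attn.q_proj.weight", try_suffixes = (".weight", ".bias"))
#   # -> "blk.0.attn_q.weight"
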
class TokenType(IntEnum):
    NORMAL       = 1
    UNKNOWN      = 2
    CONTROL      = 3
    USER_DEFINED = 4
    UNUSED       = 5
    BYTE         = 6
#
# implementation
#

class GGMLQuantizationType(IntEnum):
    F32  = 0
    F16  = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15

class GGUFValueType(IntEnum):
    UINT8   = 0
    INT8    = 1
    UINT16  = 2
    INT16   = 3
    UINT32  = 4
    INT32   = 5
    FLOAT32 = 6
    BOOL    = 7
    STRING  = 8
    ARRAY   = 9
    UINT64  = 10
    INT64   = 11
    FLOAT64 = 12
    @staticmethod
    def get_type(val):
        if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
            return GGUFValueType.STRING
        elif isinstance(val, list):
            return GGUFValueType.ARRAY
        elif isinstance(val, float):
            return GGUFValueType.FLOAT32
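        # bool must be checked before int: Python's bool is a subclass of int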
        elif isinstance(val, bool):
            return GGUFValueType.BOOL
        elif isinstance(val, int):
            return GGUFValueType.INT32
        # TODO: need help with 64-bit types in Python
        else:
            print("Unknown type: " + str(type(val)))
            sys.exit()
class GGUFWriter:
    fout: BufferedWriter
    arch: str
    offset_tensor = 0
    data_alignment = GGUF_DEFAULT_ALIGNMENT
    kv_data = b""
    kv_data_count = 0
    ti_data = b""
    ti_data_count = 0
    use_temp_file: bool
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
    tensors: list[tuple[np.ndarray[Any, Any], int]]

    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
        self.fout = open(path, "wb")
        self.arch = arch
        self.add_architecture()
        self.use_temp_file = use_temp_file
        self.tensors = []

    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
        self.fout.write(struct.pack("<I", GGUF_VERSION))
        self.fout.write(struct.pack("<Q", self.ti_data_count))
        self.fout.write(struct.pack("<Q", self.kv_data_count))
        self.flush()
        # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

    def write_kv_data_to_file(self):
        self.fout.write(self.kv_data)
        self.flush()

    def write_ti_data_to_file(self):
        self.fout.write(self.ti_data)
        self.flush()

    def add_key(self, key: str):
        self.add_val(key, GGUFValueType.STRING, add_vtype=False)

    def add_uint8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT8)

    def add_int8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT8)

    def add_uint16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT16)

    def add_int16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT16)

    def add_uint32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT32)

    def add_int32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT32)

    def add_float32(self, key: str, val: float):
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT32)

    def add_uint64(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT64)

    def add_int64(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT64)

    def add_float64(self, key: str, val: float):
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT64)

    def add_bool(self, key: str, val: bool):
        self.add_key(key)
        self.add_val(val, GGUFValueType.BOOL)

    def add_string(self, key: str, val: str):
        if len(val) == 0:
            return
        self.add_key(key)
        self.add_val(val, GGUFValueType.STRING)

    def add_array(self, key: str, val: Sequence[Any]):
        if not isinstance(val, Sequence):
            raise ValueError("Value must be a sequence for array type")
        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)
    _simple_value_packing = {
        GGUFValueType.UINT8:   "<B",
        GGUFValueType.INT8:    "<b",
        GGUFValueType.UINT16:  "<H",
        GGUFValueType.INT16:   "<h",
        GGUFValueType.UINT32:  "<I",
        GGUFValueType.INT32:   "<i",
        GGUFValueType.FLOAT32: "<f",
        GGUFValueType.UINT64:  "<Q",
        GGUFValueType.INT64:   "<q",
        GGUFValueType.FLOAT64: "<d",
        GGUFValueType.BOOL:    "?",
    }
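
    # Wire layout of one KV pair: the key is written first as a string
    # (u64 byte length + UTF-8 bytes, via add_key with add_vtype=False),
    # followed by a u32 GGUFValueType tag and the packed value; arrays
    # additionally carry a u32 element type and a u64 element count before
    # the elements themselves.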
    def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
        if vtype is None:
            vtype = GGUFValueType.get_type(val)

        if add_vtype:
            self.kv_data += struct.pack("<I", vtype)
            self.kv_data_count += 1

        pack_fmt = self._simple_value_packing.get(vtype)
        if pack_fmt is not None:
            self.kv_data += struct.pack(pack_fmt, val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
            self.kv_data += struct.pack("<Q", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
            ltype = GGUFValueType.get_type(val[0])
            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                raise ValueError("All items in a GGUF array should be of the same type")
            self.kv_data += struct.pack("<I", ltype)
            self.kv_data += struct.pack("<Q", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
            raise ValueError("Invalid GGUF metadata value type or value")
    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n
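    # e.g. ggml_pad(100, 32) == 128: sizes and offsets are rounded up to the
    # next multiple of the data alignment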
    def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None):
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
        self.ti_data += struct.pack("<Q", len(encoded_name))
        self.ti_data += encoded_name
        n_dims = len(tensor_shape)
        self.ti_data += struct.pack("<I", n_dims)
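        # shape is serialized in reverse order (GGML dimension order)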
        for i in range(n_dims):
            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
            dtype = raw_dtype
        self.ti_data += struct.pack("<I", dtype)
        self.ti_data += struct.pack("<Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1
    def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
        if self.use_temp_file and self.temp_file is None:
            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            fp.seek(0)
            self.temp_file = fp

        shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes

        if self.temp_file is None:
            self.tensors.append((tensor, pad))
            return

        tensor.tofile(self.temp_file)
        if pad != 0:
            self.temp_file.write(bytes([0] * pad))

    def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
        pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
        if pad != 0:
            fp.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
        self.write_padding(self.fout, self.fout.tell())
        tensor.tofile(self.fout)
        self.write_padding(self.fout, tensor.nbytes)

    def write_tensors_to_file(self):
        self.write_ti_data_to_file()

        self.write_padding(self.fout, self.fout.tell())

        if self.temp_file is None:
            for (currtensor, currpad) in self.tensors:
                currtensor.tofile(self.fout)
                if currpad != 0:
                    self.fout.write(bytes([0] * currpad))
            return

        self.temp_file.seek(0)

        shutil.copyfileobj(self.temp_file, self.fout)
        self.flush()
        self.temp_file.close()

    def flush(self):
        self.fout.flush()

    def close(self):
        self.fout.close()

    def add_architecture(self):
        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)

    def add_author(self, author: str):
        self.add_string(KEY_GENERAL_AUTHOR, author)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_url(self, url: str):
        self.add_string(KEY_GENERAL_URL, url)

    def add_description(self, description: str):
        self.add_string(KEY_GENERAL_DESCRIPTION, description)

    def add_source_url(self, url: str):
        self.add_string(KEY_GENERAL_SOURCE_URL, url)

    def add_source_hf_repo(self, repo: str):
        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)

    def add_file_type(self, ftype: int):
        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)

    def add_name(self, name: str):
        self.add_string(KEY_GENERAL_NAME, name)

    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
        self.add_uint32(
            KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int):
        self.data_alignment = alignment
        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)

    def add_context_length(self, length: int):
        self.add_uint32(
            KEY_CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int):
        self.add_uint32(
            KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int):
        self.add_uint32(
            KEY_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int):
        self.add_uint32(
            KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool):
        self.add_bool(
            KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_head_count(self, count: int):
        self.add_uint32(
            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int):
        self.add_uint32(
            KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)

    def add_max_alibi_bias(self, bias: float):
        self.add_float32(
            KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float):
        self.add_float32(
            KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)

    def add_layer_norm_eps(self, value: float):
        self.add_float32(
            KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float):
        self.add_float32(
            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_rope_dimension_count(self, count: int):
        self.add_uint32(
            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_freq_base(self, value: float):
        self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)

    def add_rope_scale_linear(self, value: float):
        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)

    def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
        self.add_array(KEY_TOKENIZER_LIST, tokens)

    def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
        self.add_array(KEY_TOKENIZER_MERGES, merges)

    def add_token_types(self, types: Sequence[TokenType] | Sequence[int]):
        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)

    def add_token_scores(self, scores: Sequence[float]):
        self.add_array(KEY_TOKENIZER_SCORES, scores)

    def add_bos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)

    def add_eos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)

    def add_unk_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)

    def add_sep_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)

    def add_pad_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
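
# Typical GGUFWriter call order, mirrored by the example at the bottom of this
# file: add all KV metadata and tensors first (their counts go into the
# header), then write_header_to_file(), write_kv_data_to_file(),
# write_tensors_to_file(), and finally close().
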
class SpecialVocab:
    load_merges: bool = False
    merges: list[str] = []
    special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
    special_token_ids: dict[str, int] = {}

    def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
        self.special_token_ids = {}
        self.load_merges = load_merges
        if special_token_types is not None:
            self.special_token_types = special_token_types
        self.load(path)

    def load(self, path: Path):
        if not self.try_load_from_tokenizer_json(path):
            self.try_load_from_config_json(path)

    def try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer_file = path / 'tokenizer.json'
        if not tokenizer_file.is_file():
            return False
        with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
            tokenizer = json.load(f)
        if self.load_merges:
            merges = tokenizer.get('model', {}).get('merges')
            if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str):
                self.merges = merges
        tokenizer_config_file = path / 'tokenizer_config.json'
        added_tokens = tokenizer.get('added_tokens')
        if added_tokens is None or not tokenizer_config_file.is_file():
            return True
        with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
            tokenizer_config = json.load(f)
        for typ in self.special_token_types:
            entry = tokenizer_config.get(f'{typ}_token')
            if isinstance(entry, str):
                tc_content = entry
            elif isinstance(entry, dict):
                entry_content = entry.get('content')
                if not isinstance(entry_content, str):
                    continue
                tc_content = entry_content
            else:
                continue
            for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
                if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
                    self.special_token_ids[typ] = maybe_token_id
                break
        return True

    def try_load_from_config_json(self, path: Path) -> bool:
        config_file = path / 'config.json'
        if not config_file.is_file():
            return False
        with open(config_file, 'r', encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
            maybe_token_id = config.get(f'{typ}_token_id')
            if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
                self.special_token_ids[typ] = maybe_token_id
        return True

    def add_to_gguf(self, gw: GGUFWriter):
        if len(self.merges) > 0:
            print(f'gguf: Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        for typ, tokid in self.special_token_ids.items():
            handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
            if handler is None:
                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
                continue
            print(f'gguf: Setting special token type {typ} to {tokid}')
            handler(tokid)

    def __repr__(self):
        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
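
# A usage sketch (assumes `model_dir` is a Path to a Hugging Face model
# directory containing tokenizer.json / tokenizer_config.json, or a
# config.json with *_token_id entries):
#
#   special_vocab = SpecialVocab(model_dir, load_merges = True)
#   special_vocab.add_to_gguf(gguf_writer)
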
# Example usage:
if __name__ == "__main__":
    # Example usage with a file
    gguf_writer = GGUFWriter("example.gguf", "llama")

    # note: GGUFWriter.__init__ already records the architecture KV pair, so
    # calling add_architecture() again here would write it twice
    gguf_writer.add_block_count(12)
    gguf_writer.add_uint32("answer", 42)              # Write a 32-bit integer
    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
    gguf_writer.add_custom_alignment(64)

    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
    tensor3 = np.ones((96,), dtype=np.float32) * 102.0

    gguf_writer.add_tensor("tensor1", tensor1)
    gguf_writer.add_tensor("tensor2", tensor2)
    gguf_writer.add_tensor("tensor3", tensor3)

    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()