gguf.py

#!/usr/bin/env python3
import shutil
import struct
import tempfile
import numpy as np
from enum import IntEnum, auto
from typing import Any, IO, List, Optional

#
# constants
#

GGUF_MAGIC             = 0x46554747
GGUF_VERSION           = 1
GGUF_DEFAULT_ALIGNMENT = 32

# general
KEY_GENERAL_ARCHITECTURE         = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT            = "general.alignment"
KEY_GENERAL_NAME                 = "general.name"
KEY_GENERAL_AUTHOR               = "general.author"
KEY_GENERAL_URL                  = "general.url"
KEY_GENERAL_DESCRIPTION          = "general.description"
KEY_GENERAL_LICENSE              = "general.license"
KEY_GENERAL_SOURCE_URL           = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO       = "general.source.huggingface.repository"
KEY_GENERAL_FILE_TYPE            = "general.file_type"
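
# note: the keys below contain an "{arch}" placeholder that is filled in with
# the architecture name via .format(arch=...) (see the GGUFWriter helpers)
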
# LLM
KEY_LLM_CONTEXT_LENGTH        = "{arch}.context_length"
KEY_LLM_EMBEDDING_LENGTH      = "{arch}.embedding_length"
KEY_LLM_BLOCK_COUNT           = "{arch}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"

# attention
KEY_ATTENTION_HEAD_COUNT        = "{arch}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV         = "{arch}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"

# tokenization
KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST       = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES     = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES     = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID     = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID     = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID     = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID     = "tokenizer.ggml.seperator_token_id"  # (sic) the misspelling is part of the key name readers expect
KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"


#
# recommended mapping of model tensor names for storage in gguf
#

class MODEL_ARCH(IntEnum):
    LLAMA   = auto()
    FALCON  = auto()
    GPT2    = auto()
    GPTJ    = auto()
    GPTNEOX = auto()
    MPT     = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD    = auto()
    POS_EMBD      = auto()
    OUTPUT        = auto()
    OUTPUT_NORM   = auto()
    ROPE_FREQS    = auto()
    ATTN_Q        = auto()
    ATTN_K        = auto()
    ATTN_V        = auto()
    ATTN_QKV      = auto()
    ATTN_OUT      = auto()
    ATTN_NORM     = auto()
    ATTN_NORM_2   = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE      = auto()
    FFN_DOWN      = auto()
    FFN_UP        = auto()
    FFN_NORM      = auto()


MODEL_ARCH_NAMES = {
    MODEL_ARCH.LLAMA:   "llama",
    MODEL_ARCH.FALCON:  "falcon",
    MODEL_ARCH.GPT2:    "gpt2",
    MODEL_ARCH.GPTJ:    "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",
    MODEL_ARCH.MPT:     "mpt",
}

MODEL_TENSOR_NAMES = {
    MODEL_ARCH.LLAMA: {
        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
        MODEL_TENSOR.OUTPUT:        "output",
        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPTNEOX: {
        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT:      "output",
        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_NORM:    "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.FALCON: {
        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT:      "output",
        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPT2: {
        # TODO
    },
    # TODO
}

# tensors that will not be serialized
MODEL_TENSOR_SKIP = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
}


# TODO: the following helper functions should be removed
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
# REMOVE
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
    for skip in MODEL_TENSOR_SKIP.get(arch, []):
        for i in range(n_blocks):
            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
                return True

    return False
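

# maps tensor names used by common source checkpoints (HF transformers,
# PyTorch .pth) to the GGUF names in MODEL_TENSOR_NAMES; a source name maps
# to None when the target architecture does not define that tensor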
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
    tensor_map = {}

    # Token embeddings
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
    tensor_map["gpt_neox.embed_in"] = mapped_to  # gptneox
    tensor_map["transformer.wte"] = mapped_to  # gpt2 mpt
    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
    tensor_map["model.embed_tokens"] = mapped_to  # llama-hf
    tensor_map["tok_embeddings"] = mapped_to  # llama-pth

    # Position embeddings
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
    tensor_map["transformer.wpe"] = mapped_to  # gpt2

    # Output
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
    tensor_map["embed_out"] = mapped_to  # gptneox
    tensor_map["lm_head"] = mapped_to  # gpt2 mpt falcon llama-hf
    tensor_map["output"] = mapped_to  # llama-pth

    # Output norm
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
    tensor_map["transformer.ln_f"] = mapped_to  # gpt2 falcon
    tensor_map["transformer.norm_f"] = mapped_to  # mpt
    tensor_map["model.norm"] = mapped_to  # llama-hf
    tensor_map["norm"] = mapped_to  # llama-pth

    # Rope frequencies
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
    tensor_map["rope.freqs"] = mapped_to  # llama-pth

    # Attention and feed-forward blocks
    for i in range(n_blocks):
        # Attention norm
        # TODO: is there a simpler way to write these 2 lines in Python?
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to  # falcon7b
        tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to  # falcon40b
        tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention_norm"] = mapped_to  # llama-pth

        # Attention norm 2
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to  # falcon40b

        # Attention query-key-value
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to  # falcon

        # Attention query
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.wq"] = mapped_to  # llama-pth

        # Attention key
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.wk"] = mapped_to  # llama-pth

        # Attention value
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.wv"] = mapped_to  # llama-pth

        # Attention output
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to  # falcon
        tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.wo"] = mapped_to  # llama-pth

        # Rotary embeddings
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth

        # Feed-forward norm
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to  # mpt
        tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to  # llama-pth

        # Feed-forward up
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # falcon
        tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to  # llama-pth

        # Feed-forward gate
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to  # llama-pth

        # Feed-forward down
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # falcon
        tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to  # llama-pth

    return tensor_map


class TokenType(IntEnum):
    NORMAL       = 1
    UNKNOWN      = 2
    CONTROL      = 3
    USER_DEFINED = 4
    UNUSED       = 5
    BYTE         = 6


#
# implementation
#

class GGMLQuantizationType(IntEnum):
    F32  = 0
    F16  = 1
    Q4_0 = 2
    Q4_1 = 3
    # values 4 and 5 are intentionally skipped (the removed Q4_2 / Q4_3 types)
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15


class GGUFValueType(IntEnum):
    UINT8   = 0
    INT8    = 1
    UINT16  = 2
    INT16   = 3
    UINT32  = 4
    INT32   = 5
    FLOAT32 = 6
    BOOL    = 7
    STRING  = 8
    ARRAY   = 9

    @staticmethod
    def get_type(val):
        if isinstance(val, (str, bytes, bytearray)):
            return GGUFValueType.STRING
        elif isinstance(val, list):
            return GGUFValueType.ARRAY
        elif isinstance(val, float):
            return GGUFValueType.FLOAT32
        # note: bool must be checked before int, since bool is a subclass of int
        elif isinstance(val, bool):
            return GGUFValueType.BOOL
        elif isinstance(val, int):
            return GGUFValueType.INT32
        else:
            raise ValueError("Unknown type: " + str(type(val)))


class GGUFWriter:
    def __init__(self, path: str, arch: str, use_temp_file: bool = True):
        self.fout = open(path, "wb")
        self.arch = arch
        self.offset_tensor = 0
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.kv_data = b""
        self.kv_data_count = 0
        self.ti_data = b""
        self.ti_data_count = 0
        self.add_architecture()
        self.use_temp_file = use_temp_file
        self.tensors = []
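
    # the GGUF header is four little-endian uint32 values:
    # magic, format version, tensor count, kv pair count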
    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
        self.fout.write(struct.pack("<I", GGUF_VERSION))
        self.fout.write(struct.pack("<I", self.ti_data_count))
        self.fout.write(struct.pack("<I", self.kv_data_count))
        self.flush()
        # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

    def write_kv_data_to_file(self):
        self.fout.write(self.kv_data)
        self.flush()

    def write_ti_data_to_file(self):
        self.fout.write(self.ti_data)
        self.flush()

    def add_key(self, key: str):
        self.add_val(key, GGUFValueType.STRING, add_vtype=False)

    def add_uint8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT8)

    def add_int8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT8)

    def add_uint16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT16)

    def add_int16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT16)

    def add_uint32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT32)

    def add_int32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT32)

    def add_float32(self, key: str, val: float):
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT32)

    def add_bool(self, key: str, val: bool):
        self.add_key(key)
        self.add_val(val, GGUFValueType.BOOL)

    def add_string(self, key: str, val: str):
        if len(val) == 0:
            return
        self.add_key(key)
        self.add_val(val, GGUFValueType.STRING)

    def add_array(self, key: str, val: list):
        if not isinstance(val, list):
            raise ValueError("Value must be a list for array type")
        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)
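
    # each kv pair is serialized as: key (length-prefixed utf-8 string), value
    # type (uint32), then the value itself; arrays store an element type and
    # element count, followed by the elements without per-item type tags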
    def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
        if vtype is None:
            vtype = GGUFValueType.get_type(val)

        if add_vtype:
            self.kv_data += struct.pack("<I", vtype)
            self.kv_data_count += 1

        if vtype == GGUFValueType.UINT8:
            self.kv_data += struct.pack("<B", val)
        elif vtype == GGUFValueType.INT8:
            self.kv_data += struct.pack("<b", val)
        elif vtype == GGUFValueType.UINT16:
            self.kv_data += struct.pack("<H", val)
        elif vtype == GGUFValueType.INT16:
            self.kv_data += struct.pack("<h", val)
        elif vtype == GGUFValueType.UINT32:
            self.kv_data += struct.pack("<I", val)
        elif vtype == GGUFValueType.INT32:
            self.kv_data += struct.pack("<i", val)
        elif vtype == GGUFValueType.FLOAT32:
            self.kv_data += struct.pack("<f", val)
        elif vtype == GGUFValueType.BOOL:
            self.kv_data += struct.pack("?", val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
            self.kv_data += struct.pack("<I", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY:
            ltype = {GGUFValueType.get_type(item) for item in val}
            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
            self.kv_data += struct.pack("<I", list(ltype)[0])
            self.kv_data += struct.pack("<I", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
            raise ValueError("Invalid GGUF metadata value type")
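
    # round x up to the next multiple of the alignment n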
    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n
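
    # a tensor info record is: name (length-prefixed utf-8 string), number of
    # dimensions (uint32), dimensions in reverse order (uint32 each), ggml
    # dtype (uint32), and the uint64 byte offset of the tensor data relative
    # to the start of the aligned data section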
    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
        self.ti_data += struct.pack("<I", len(encoded_name))
        self.ti_data += encoded_name

        n_dims = len(tensor_shape)
        self.ti_data += struct.pack("<I", n_dims)
        for i in range(n_dims):
            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])

        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
            dtype = raw_dtype
        self.ti_data += struct.pack("<I", dtype)
        self.ti_data += struct.pack("<Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1
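
    # tensor data is staged in a SpooledTemporaryFile (kept in memory up to
    # 256 MiB, then rolled over to disk) so that all tensor info records can
    # be written to the output file before the data itself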
    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
        if self.use_temp_file and not hasattr(self, "temp_file"):
            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            self.temp_file.seek(0)

        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes

        if not self.use_temp_file:
            self.tensors.append((tensor, pad))
            return

        tensor.tofile(self.temp_file)

        if pad != 0:
            self.temp_file.write(bytes([0] * pad))
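
    # pad the file to the data alignment, write the tensor, then pad the
    # tensor bytes so the next tensor starts on an aligned boundary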
    def write_tensor_data(self, tensor: np.ndarray):
        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
        if pad != 0:
            self.fout.write(bytes([0] * pad))

        tensor.tofile(self.fout)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
        if pad != 0:
            self.fout.write(bytes([0] * pad))

    def write_tensors_to_file(self):
        self.write_ti_data_to_file()

        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
        if pad != 0:
            self.fout.write(bytes([0] * pad))

        if not self.use_temp_file:
            for (currtensor, currpad) in self.tensors:
                currtensor.tofile(self.fout)
                if currpad != 0:
                    self.fout.write(bytes([0] * currpad))
            return

        self.temp_file.seek(0)

        shutil.copyfileobj(self.temp_file, self.fout)
        self.flush()
        self.temp_file.close()

    def flush(self):
        self.fout.flush()

    def close(self):
        self.fout.close()

    def add_architecture(self):
        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)

    def add_author(self, author: str):
        self.add_string(KEY_GENERAL_AUTHOR, author)

    def add_url(self, url: str):
        self.add_string(KEY_GENERAL_URL, url)

    def add_description(self, description: str):
        self.add_string(KEY_GENERAL_DESCRIPTION, description)

    def add_source_url(self, url: str):
        self.add_string(KEY_GENERAL_SOURCE_URL, url)

    def add_source_hf_repo(self, repo: str):
        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)

    def add_file_type(self, ftype: int):
        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)

    def add_name(self, name: str):
        self.add_string(KEY_GENERAL_NAME, name)

    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
        self.add_uint32(KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int):
        self.data_alignment = alignment
        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)

    def add_context_length(self, length: int):
        self.add_uint32(KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int):
        self.add_uint32(KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int):
        self.add_uint32(KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int):
        self.add_uint32(KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool):
        self.add_bool(KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_head_count(self, count: int):
        self.add_uint32(KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int):
        self.add_uint32(KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)

    def add_max_alibi_bias(self, bias: float):
        self.add_float32(KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float):
        self.add_float32(KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)

    def add_layer_norm_eps(self, value: float):
        self.add_float32(KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float):
        self.add_float32(KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_rope_dimension_count(self, count: int):
        self.add_uint32(KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_scale_linear(self, value: float):
        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)

    def add_token_list(self, tokens: List):
        self.add_array(KEY_TOKENIZER_LIST, tokens)

    def add_token_merges(self, merges: List):
        self.add_array(KEY_TOKENIZER_MERGES, merges)

    def add_token_types(self, types: List[int]):
        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)

    def add_token_scores(self, scores: List[float]):
        self.add_array(KEY_TOKENIZER_SCORES, scores)

    def add_bos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)

    def add_eos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)

    def add_unk_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)

    def add_sep_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)

    def add_pad_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)


# Example usage:
if __name__ == "__main__":
    # Example usage with a file
    gguf_writer = GGUFWriter("example.gguf", "llama")

    # note: "general.architecture" is already written by the GGUFWriter constructor
    gguf_writer.add_block_count(12)
    gguf_writer.add_uint32("answer", 42)              # Write a 32-bit integer
    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
    gguf_writer.add_custom_alignment(64)

    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
    tensor3 = np.ones((96,), dtype=np.float32) * 102.0

    gguf_writer.add_tensor("tensor1", tensor1)
    gguf_writer.add_tensor("tensor2", tensor2)
    gguf_writer.add_tensor("tensor3", tensor3)

    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()
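
    # quick sanity check: read back the four header fields written by
    # write_header_to_file() above (magic, version, tensor count, kv count)
    with open("example.gguf", "rb") as f:
        magic, version, n_tensors, n_kv = struct.unpack("<IIII", f.read(16))
    assert magic == GGUF_MAGIC and version == GGUF_VERSION
    print("read back:", n_tensors, "tensors,", n_kv, "kv pairs")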