import shutil
import sys
import struct
import tempfile
import numpy as np

from enum import IntEnum, auto
from typing import Any, IO, List, Optional

#
# constants
#

GGUF_MAGIC = 0x46554747
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32

# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository"
KEY_GENERAL_FILE_TYPE = "general.file_type"

# LLM
KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"

# attention
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"

# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
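
# Keys containing "{arch}" are templates, instantiated per model architecture, e.g.:
#
#   KEY_LLM_CONTEXT_LENGTH.format(arch="llama")  # -> "llama.context_length"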

#
# recommended mapping of model tensor names for storage in gguf
#

class MODEL_ARCH(IntEnum):
    LLAMA = auto()
    FALCON = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()

class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_NORM = auto()

MODEL_ARCH_NAMES = {
    MODEL_ARCH.LLAMA: "llama",
    MODEL_ARCH.FALCON: "falcon",
    MODEL_ARCH.GPT2: "gpt2",
    MODEL_ARCH.GPTJ: "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",
    MODEL_ARCH.MPT: "mpt",
}

MODEL_TENSOR_NAMES = {
    MODEL_ARCH.LLAMA: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
        MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
        MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPTNEOX: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.FALCON: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPT2: {
        # TODO
    },
    # TODO
}

# tensors that will not be serialized
MODEL_TENSOR_SKIP = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
}

# TODO: the following helper functions should be removed
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
# REMOVE
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
    for skip in MODEL_TENSOR_SKIP.get(arch, []):
        for i in range(n_blocks):
            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
                return True
    return False

def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
    tensor_map = {}

    # Token embeddings
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)

    tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
    tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
    tensor_map["transformer.word_embeddings"] = mapped_to # falcon
    tensor_map["model.embed_tokens"] = mapped_to # llama-hf
    tensor_map["tok_embeddings"] = mapped_to # llama-pth

    # Position embeddings
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)

    tensor_map["transformer.wpe"] = mapped_to # gpt2

    # Output
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)

    tensor_map["embed_out"] = mapped_to # gptneox
    tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
    tensor_map["output"] = mapped_to # llama-pth

    # Output norm
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)

    tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
    tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
    tensor_map["transformer.norm_f"] = mapped_to # mpt
    tensor_map["model.norm"] = mapped_to # llama-hf
    tensor_map["norm"] = mapped_to # llama-pth

    # Rope frequencies
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)

    tensor_map["rope.freqs"] = mapped_to # llama-pth

    # Attention and feed-forward blocks
    for i in range(n_blocks):
        # Attention norm
        # TODO: is there a simpler way to write these 2 lines in Python?
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  186. tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
  187. tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
  188. tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
  189. tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
  190. tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
  191. tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
  192. tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
  193. # Attention norm 2
  194. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
  195. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  196. tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
  197. # Attention query-key-value
  198. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
  199. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  200. tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
  201. tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
  202. tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
  203. tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
  204. # Attention query
  205. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
  206. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  207. tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
  208. tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
  209. # Attention key
  210. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
  211. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  212. tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
  213. tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
  214. # Attention value
  215. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
  216. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  217. tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
  218. tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
  219. # Attention output
  220. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
  221. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  222. tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
  223. tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
  224. tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
  225. tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
  226. tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
  227. tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
  228. # Rotary embeddings
  229. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
  230. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  231. tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf
  232. tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth
  233. # Feed-forward norm
  234. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
  235. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  236. tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
  237. tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
  238. tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
  239. tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
  240. tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
  241. # Feed-forward up
  242. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
  243. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  244. tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
  245. tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
  246. tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
  247. tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
  248. tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
  249. tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
  250. # Feed-forward gate
  251. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
  252. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  253. tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
  254. tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
  255. # Feed-forward down
  256. mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
  257. mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
  258. tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
  259. tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
  260. tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
  261. tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
  262. tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
  263. tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
  264. return tensor_map
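
# For example, with tmap = get_tensor_name_map(MODEL_ARCH.LLAMA, n_blocks=32),
# original checkpoint tensor names map to their GGUF names:
#
#   tmap["model.embed_tokens"]    # -> "token_embd"   (llama-hf)
#   tmap["layers.0.attention.wq"] # -> "blk.0.attn_q" (llama-pth)
#   tmap["transformer.wpe"]       # -> None (llama has no position embeddings)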

class TokenType(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6

#
# implementation
#

class GGMLQuantizationType(IntEnum):
    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15

class GGUFValueType(IntEnum):
    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9

    @staticmethod
    def get_type(val):
        if isinstance(val, (str, bytes, bytearray)):
            return GGUFValueType.STRING
        elif isinstance(val, list):
            return GGUFValueType.ARRAY
        elif isinstance(val, float):
            return GGUFValueType.FLOAT32
        # bool must be checked before int: bool is a subclass of int in Python
        elif isinstance(val, bool):
            return GGUFValueType.BOOL
        elif isinstance(val, int):
            return GGUFValueType.INT32
        else:
            print("Unknown type: " + str(type(val)))
            sys.exit()
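
    # e.g. GGUFValueType.get_type("hello") == STRING, get_type(3.14) == FLOAT32,
    # get_type(True) == BOOL (bool is matched before int), get_type(7) == INT32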

class GGUFWriter:
    def __init__(self, path: str, arch: str, use_temp_file: bool = True):
        self.fout = open(path, "wb")
        self.arch = arch
        self.offset_tensor = 0
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.kv_data = b""
        self.kv_data_count = 0
        self.ti_data = b""
        self.ti_data_count = 0
        self.add_architecture()
        self.use_temp_file = use_temp_file
        self.tensors = []

    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
        self.fout.write(struct.pack("<I", GGUF_VERSION))
        self.fout.write(struct.pack("<I", self.ti_data_count))
        self.fout.write(struct.pack("<I", self.kv_data_count))
        self.flush()
        # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

    def write_kv_data_to_file(self):
        self.fout.write(self.kv_data)
        self.flush()

    def write_ti_data_to_file(self):
        self.fout.write(self.ti_data)
        self.flush()

    def add_key(self, key: str):
        self.add_val(key, GGUFValueType.STRING, add_vtype=False)

    def add_uint8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT8)

    def add_int8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT8)

    def add_uint16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT16)

    def add_int16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT16)

    def add_uint32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT32)

    def add_int32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT32)

    def add_float32(self, key: str, val: float):
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT32)

    def add_bool(self, key: str, val: bool):
        self.add_key(key)
        self.add_val(val, GGUFValueType.BOOL)

    def add_string(self, key: str, val: str):
        if len(val) == 0:
            return
        self.add_key(key)
        self.add_val(val, GGUFValueType.STRING)

    def add_array(self, key: str, val: list):
        if not isinstance(val, list):
            raise ValueError("Value must be a list for array type")
        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)

    def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
        if vtype is None:
            vtype = GGUFValueType.get_type(val)

        if add_vtype:
            self.kv_data += struct.pack("<I", vtype)
            self.kv_data_count += 1

        if vtype == GGUFValueType.UINT8:
            self.kv_data += struct.pack("<B", val)
        elif vtype == GGUFValueType.INT8:
            self.kv_data += struct.pack("<b", val)
        elif vtype == GGUFValueType.UINT16:
            self.kv_data += struct.pack("<H", val)
        elif vtype == GGUFValueType.INT16:
            self.kv_data += struct.pack("<h", val)
        elif vtype == GGUFValueType.UINT32:
            self.kv_data += struct.pack("<I", val)
        elif vtype == GGUFValueType.INT32:
            self.kv_data += struct.pack("<i", val)
        elif vtype == GGUFValueType.FLOAT32:
            self.kv_data += struct.pack("<f", val)
        elif vtype == GGUFValueType.BOOL:
            self.kv_data += struct.pack("?", val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
            self.kv_data += struct.pack("<I", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY:
            ltype = set([GGUFValueType.get_type(item) for item in val])
            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
            self.kv_data += struct.pack("<I", list(ltype)[0])
            self.kv_data += struct.pack("<I", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
            raise ValueError("Invalid GGUF metadata value type")
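
    # On-disk layout of one KV pair as produced by the add_* helpers above
    # (little-endian throughout); e.g. add_uint32("answer", 42) appends:
    #
    #   uint32 key length (6), then the raw key bytes ("answer")
    #   uint32 value type (GGUFValueType.UINT32 == 4)
    #   uint32 value (42)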

    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n
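
    # ggml_pad rounds x up to the next multiple of n, e.g. ggml_pad(100, 32) == 128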

    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
        self.ti_data += struct.pack("<I", len(encoded_name))
        self.ti_data += encoded_name

        n_dims = len(tensor_shape)
        self.ti_data += struct.pack("<I", n_dims)
        for i in range(n_dims):
            # dimensions are written in reverse order (ggml convention)
            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])

        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
            dtype = raw_dtype
        self.ti_data += struct.pack("<I", dtype)
        self.ti_data += struct.pack("<Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1
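
    # Each tensor-info record appended above consists of, in order (little-endian):
    #   uint32 name length, then the raw name bytes
    #   uint32 n_dims, then n_dims uint32 dimensions (in reverse order)
    #   uint32 GGMLQuantizationType of the tensor data
    #   uint64 offset of the data, padded to data_alignment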

    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
        if self.use_temp_file and not hasattr(self, "temp_file"):
            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            self.temp_file.seek(0)

        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes

        if not self.use_temp_file:
            self.tensors.append((tensor, pad))
            return

        tensor.tofile(self.temp_file)

        if pad != 0:
            self.temp_file.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray):
        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
        if pad != 0:
            self.fout.write(bytes([0] * pad))

        tensor.tofile(self.fout)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
        if pad != 0:
            self.fout.write(bytes([0] * pad))

    def write_tensors_to_file(self):
        self.write_ti_data_to_file()

        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
        if pad != 0:
            self.fout.write(bytes([0] * pad))

        if not self.use_temp_file:
            for (currtensor, currpad) in self.tensors:
                currtensor.tofile(self.fout)
                if currpad != 0:
                    self.fout.write(bytes([0] * currpad))
            return

        self.temp_file.seek(0)

        shutil.copyfileobj(self.temp_file, self.fout)
        self.flush()
        self.temp_file.close()

    def flush(self):
        self.fout.flush()

    def close(self):
        self.fout.close()

    def add_architecture(self):
        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)

    def add_author(self, author: str):
        self.add_string(KEY_GENERAL_AUTHOR, author)

    def add_url(self, url: str):
        self.add_string(KEY_GENERAL_URL, url)

    def add_description(self, description: str):
        self.add_string(KEY_GENERAL_DESCRIPTION, description)

    def add_source_url(self, url: str):
        self.add_string(KEY_GENERAL_SOURCE_URL, url)

    def add_source_hf_repo(self, repo: str):
        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)

    def add_file_type(self, ftype: int):
        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)

    def add_name(self, name: str):
        self.add_string(KEY_GENERAL_NAME, name)

    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
        self.add_uint32(KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int):
        self.data_alignment = alignment
        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)

    def add_context_length(self, length: int):
        self.add_uint32(KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int):
        self.add_uint32(KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int):
        self.add_uint32(KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int):
        self.add_uint32(KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool):
        self.add_bool(KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_head_count(self, count: int):
        self.add_uint32(KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int):
        self.add_uint32(KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)

    def add_max_alibi_bias(self, bias: float):
        self.add_float32(KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float):
        self.add_float32(KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)

    def add_layer_norm_eps(self, value: float):
        self.add_float32(KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float):
        self.add_float32(KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_rope_dimension_count(self, count: int):
        self.add_uint32(KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_scale_linear(self, value: float):
        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)

    def add_token_list(self, tokens: List):
        self.add_array(KEY_TOKENIZER_LIST, tokens)

    def add_token_merges(self, merges: List):
        self.add_array(KEY_TOKENIZER_MERGES, merges)

    def add_token_types(self, types: List[int]):
        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)

    def add_token_scores(self, scores: List[float]):
        self.add_array(KEY_TOKENIZER_SCORES, scores)

    def add_bos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)

    def add_eos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)

    def add_unk_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)

    def add_sep_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)

    def add_pad_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)

# Example usage:
if __name__ == "__main__":
    # Example usage with a file
    # (the constructor already writes general.architecture, so it is not added again here)
    gguf_writer = GGUFWriter("example.gguf", "llama")

    gguf_writer.add_block_count(12)
    gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer
    gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float
    gguf_writer.add_custom_alignment(64)

    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
    tensor3 = np.ones((96,), dtype=np.float32) * 102.0

    gguf_writer.add_tensor("tensor1", tensor1)
    gguf_writer.add_tensor("tensor2", tensor2)
    gguf_writer.add_tensor("tensor3", tensor3)

    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()
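
    # A minimal read-back sanity check (a sketch, not part of the writer API):
    # write_header_to_file() emits four little-endian uint32s -- magic, version,
    # tensor count, and kv count.
    with open("example.gguf", "rb") as f:
        magic, version, n_tensors, n_kv = struct.unpack("<4I", f.read(16))
    assert magic == GGUF_MAGIC and version == GGUF_VERSION
    print("example.gguf: " + str(n_tensors) + " tensors, " + str(n_kv) + " kv pairs")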