#!/usr/bin/env python3
import shutil
import sys
import struct
import tempfile

import numpy as np

from enum import IntEnum, auto
from typing import Any, List, Optional

#
# constants
#

GGUF_MAGIC = 0x46554747  # "GGUF" in little-endian
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32

# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"  # sic: spelling matches the key written to file
KEY_GENERAL_FILE_TYPE = "general.file_type"

# LLM
KEY_CONTEXT_LENGTH = "{arch}.context_length"
KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_BLOCK_COUNT = "{arch}.block_count"
KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"

# attention
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"

# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"  # sic: spelling matches the key written to file
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
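
# The "{arch}" placeholder in the LLM/attention/RoPE keys above is filled in
# per architecture by the GGUFWriter setters. For example:
#   KEY_CONTEXT_LENGTH.format(arch="llama") -> "llama.context_length"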

#
# recommended mapping of model tensor names for storage in gguf
#

class MODEL_ARCH(IntEnum):
    LLAMA = auto()
    FALCON = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_NORM = auto()


MODEL_ARCH_NAMES = {
    MODEL_ARCH.LLAMA: "llama",
    MODEL_ARCH.FALCON: "falcon",
    MODEL_ARCH.GPT2: "gpt2",
    MODEL_ARCH.GPTJ: "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",
    MODEL_ARCH.MPT: "mpt",
}

MODEL_TENSOR_NAMES = {
    MODEL_ARCH.LLAMA: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
        MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
        MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPTNEOX: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.FALCON: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPT2: {
        # TODO
    },
    # TODO
}

# tensors that will not be serialized
MODEL_TENSOR_SKIP = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
}

# TODO: the following helper functions should be removed;
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
# REMOVE
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
    for skip in MODEL_TENSOR_SKIP.get(arch, []):
        for i in range(n_blocks):
            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
                return True
    return False

def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
    tensor_map = {}

    # Token embeddings
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
    tensor_map["gpt_neox.embed_in"] = mapped_to  # gptneox
    tensor_map["transformer.wte"] = mapped_to  # gpt2 mpt
    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
    tensor_map["model.embed_tokens"] = mapped_to  # llama-hf
    tensor_map["tok_embeddings"] = mapped_to  # llama-pth

    # Position embeddings
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
    tensor_map["transformer.wpe"] = mapped_to  # gpt2

    # Output
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
    tensor_map["embed_out"] = mapped_to  # gptneox
    tensor_map["lm_head"] = mapped_to  # gpt2 mpt falcon llama-hf
    tensor_map["output"] = mapped_to  # llama-pth

    # Output norm
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
    tensor_map["transformer.ln_f"] = mapped_to  # gpt2 falcon
    tensor_map["transformer.norm_f"] = mapped_to  # mpt
    tensor_map["model.norm"] = mapped_to  # llama-hf
    tensor_map["norm"] = mapped_to  # llama-pth

    # Rope frequencies
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
    tensor_map["rope.freqs"] = mapped_to  # llama-pth

    # Attention and feed-forward blocks
    for i in range(0, n_blocks):
        # Attention norm
        # TODO: is there a simpler way to write these two lines in Python?
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers." + str(i) + ".input_layernorm"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".ln_1"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".norm_1"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".input_layernorm"] = mapped_to  # falcon7b
        tensor_map["transformer.h." + str(i) + ".ln_mlp"] = mapped_to  # falcon40b
        tensor_map["model.layers." + str(i) + ".input_layernorm"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention_norm"] = mapped_to  # llama-pth

        # Attention norm 2
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["transformer.h." + str(i) + ".ln_attn"] = mapped_to  # falcon40b

        # Attention query-key-value
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers." + str(i) + ".attention.query_key_value"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".attn.c_attn"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".attn.Wqkv"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".self_attention.query_key_value"] = mapped_to  # falcon

        # Attention query
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers." + str(i) + ".self_attn.q_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.wq"] = mapped_to  # llama-pth

        # Attention key
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers." + str(i) + ".self_attn.k_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.wk"] = mapped_to  # llama-pth

        # Attention value
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers." + str(i) + ".self_attn.v_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.wv"] = mapped_to  # llama-pth

        # Attention output
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers." + str(i) + ".attention.dense"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".attn.c_proj"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".attn.out_proj"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".self_attention.dense"] = mapped_to  # falcon
        tensor_map["model.layers." + str(i) + ".self_attn.o_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.wo"] = mapped_to  # llama-pth

        # Rotary embeddings
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers." + str(i) + ".self_attn.rotary_emb.inv_freq"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth

        # Feed-forward norm
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers." + str(i) + ".post_attention_layernorm"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".ln_2"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".norm_2"] = mapped_to  # mpt
        tensor_map["model.layers." + str(i) + ".post_attention_layernorm"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".ffn_norm"] = mapped_to  # llama-pth

        # Feed-forward up
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers." + str(i) + ".mlp.dense_h_to_4h"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".mlp.c_fc"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".ffn.up_proj"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".mlp.dense_h_to_4h"] = mapped_to  # falcon
        tensor_map["model.layers." + str(i) + ".mlp.up_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".feed_forward.w3"] = mapped_to  # llama-pth

        # Feed-forward gate
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers." + str(i) + ".mlp.gate_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".feed_forward.w1"] = mapped_to  # llama-pth

        # Feed-forward down
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers." + str(i) + ".mlp.dense_4h_to_h"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".mlp.c_proj"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".ffn.down_proj"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".mlp.dense_4h_to_h"] = mapped_to  # falcon
        tensor_map["model.layers." + str(i) + ".mlp.down_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".feed_forward.w2"] = mapped_to  # llama-pth

    return tensor_map
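
# Example (illustrative): for a llama checkpoint with n_blocks=32,
#   tmap = get_tensor_name_map(MODEL_ARCH.LLAMA, 32)
#   tmap["model.layers.0.self_attn.q_proj"]  -> "blk.0.attn_q"
#   tmap["tok_embeddings"]                   -> "token_embd"
# Source names the architecture does not define are mapped to None.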

class TokenType(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6

#
# implementation
#

class GGMLQuantizationType(IntEnum):
    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15


class GGUFValueType(IntEnum):
    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9

    @staticmethod
    def get_type(val):
        if isinstance(val, (str, bytes, bytearray)):
            return GGUFValueType.STRING
        elif isinstance(val, list):
            return GGUFValueType.ARRAY
        elif isinstance(val, float):
            return GGUFValueType.FLOAT32
        elif isinstance(val, bool):
            return GGUFValueType.BOOL
        elif isinstance(val, int):
            return GGUFValueType.INT32
        else:
            print("Unknown type: " + str(type(val)))
            sys.exit()
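
# Example (illustrative):
#   GGUFValueType.get_type("hello") -> GGUFValueType.STRING
#   GGUFValueType.get_type(True)    -> GGUFValueType.BOOL
#   GGUFValueType.get_type(1.0)     -> GGUFValueType.FLOAT32
# bool is tested before int because bool is a subclass of int in Python;
# the int branch would otherwise capture True and False.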

class GGUFWriter:
    def __init__(self, path: str, arch: str, use_temp_file: bool = True):
        self.fout = open(path, "wb")
        self.arch = arch
        self.offset_tensor = 0
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.kv_data = b""
        self.kv_data_count = 0
        self.ti_data = b""
        self.ti_data_count = 0
        self.add_architecture()
        self.use_temp_file = use_temp_file
        self.tensors = []

    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
        self.fout.write(struct.pack("<I", GGUF_VERSION))
        self.fout.write(struct.pack("<I", self.ti_data_count))
        self.fout.write(struct.pack("<I", self.kv_data_count))
        self.flush()
        # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

    def write_kv_data_to_file(self):
        self.fout.write(self.kv_data)
        self.flush()

    def write_ti_data_to_file(self):
        self.fout.write(self.ti_data)
        self.flush()

    def add_key(self, key: str):
        self.add_val(key, GGUFValueType.STRING, add_vtype=False)

    def add_uint8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT8)

    def add_int8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT8)

    def add_uint16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT16)

    def add_int16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT16)

    def add_uint32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT32)

    def add_int32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT32)

    def add_float32(self, key: str, val: float):
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT32)

    def add_bool(self, key: str, val: bool):
        self.add_key(key)
        self.add_val(val, GGUFValueType.BOOL)

    def add_string(self, key: str, val: str):
        if len(val) == 0:
            # empty strings are silently skipped
            return
        self.add_key(key)
        self.add_val(val, GGUFValueType.STRING)

    def add_array(self, key: str, val: list):
        if not isinstance(val, list):
            raise ValueError("Value must be a list for array type")
        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)

    def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
        if vtype is None:
            vtype = GGUFValueType.get_type(val)

        if add_vtype:
            self.kv_data += struct.pack("<I", vtype)
            self.kv_data_count += 1

        if vtype == GGUFValueType.UINT8:
            self.kv_data += struct.pack("<B", val)
        elif vtype == GGUFValueType.INT8:
            self.kv_data += struct.pack("<b", val)
        elif vtype == GGUFValueType.UINT16:
            self.kv_data += struct.pack("<H", val)
        elif vtype == GGUFValueType.INT16:
            self.kv_data += struct.pack("<h", val)
        elif vtype == GGUFValueType.UINT32:
            self.kv_data += struct.pack("<I", val)
        elif vtype == GGUFValueType.INT32:
            self.kv_data += struct.pack("<i", val)
        elif vtype == GGUFValueType.FLOAT32:
            self.kv_data += struct.pack("<f", val)
        elif vtype == GGUFValueType.BOOL:
            self.kv_data += struct.pack("?", val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
            self.kv_data += struct.pack("<I", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY:
            ltype = {GGUFValueType.get_type(item) for item in val}
            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
            self.kv_data += struct.pack("<I", list(ltype)[0])
            self.kv_data += struct.pack("<I", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
            raise ValueError("Invalid GGUF metadata value type")

    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n
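
    # Example (illustrative): ggml_pad rounds a size up to the next multiple
    # of the alignment, e.g. ggml_pad(100, 32) == 128 and ggml_pad(128, 32) == 128.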

    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
        self.ti_data += struct.pack("<I", len(encoded_name))
        self.ti_data += encoded_name
        n_dims = len(tensor_shape)
        self.ti_data += struct.pack("<I", n_dims)
        # dimensions are written in reverse order
        for i in range(n_dims):
            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
            dtype = raw_dtype
        self.ti_data += struct.pack("<I", dtype)
        self.ti_data += struct.pack("<Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1

    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
        if self.use_temp_file and not hasattr(self, "temp_file"):
            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            self.temp_file.seek(0)

        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes

        if not self.use_temp_file:
            self.tensors.append((tensor, pad))
            return

        tensor.tofile(self.temp_file)
        if pad != 0:
            self.temp_file.write(bytes([0] * pad))
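
    # Note: with use_temp_file=True, tensor data is buffered in a
    # SpooledTemporaryFile (in memory up to 256 MiB, then spilled to disk)
    # until write_tensors_to_file() copies it into the output; with
    # use_temp_file=False, the arrays are kept in self.tensors and written
    # out at the end instead.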

    def write_tensor_data(self, tensor: np.ndarray):
        # align the start of the tensor data
        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
        if pad != 0:
            self.fout.write(bytes([0] * pad))

        tensor.tofile(self.fout)

        # pad the tensor data itself up to the alignment
        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
        if pad != 0:
            self.fout.write(bytes([0] * pad))

    def write_tensors_to_file(self):
        self.write_ti_data_to_file()

        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
        if pad != 0:
            self.fout.write(bytes([0] * pad))

        if not self.use_temp_file:
            for (currtensor, currpad) in self.tensors:
                currtensor.tofile(self.fout)
                if currpad != 0:
                    self.fout.write(bytes([0] * currpad))
            return

        if not hasattr(self, "temp_file"):
            # no tensors were added
            return

        self.temp_file.seek(0)
        shutil.copyfileobj(self.temp_file, self.fout)
        self.flush()
        self.temp_file.close()

    def flush(self):
        self.fout.flush()

    def close(self):
        self.fout.close()

    def add_architecture(self):
        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)

    def add_author(self, author: str):
        self.add_string(KEY_GENERAL_AUTHOR, author)

    def add_url(self, url: str):
        self.add_string(KEY_GENERAL_URL, url)

    def add_description(self, description: str):
        self.add_string(KEY_GENERAL_DESCRIPTION, description)

    def add_source_url(self, url: str):
        self.add_string(KEY_GENERAL_SOURCE_URL, url)

    def add_source_hf_repo(self, repo: str):
        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)

    def add_file_type(self, ftype: int):
        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)

    def add_name(self, name: str):
        self.add_string(KEY_GENERAL_NAME, name)

    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
        self.add_uint32(KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int):
        self.data_alignment = alignment
        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)

    def add_context_length(self, length: int):
        self.add_uint32(KEY_CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int):
        self.add_uint32(KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int):
        self.add_uint32(KEY_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int):
        self.add_uint32(KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool):
        self.add_bool(KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_head_count(self, count: int):
        self.add_uint32(KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int):
        self.add_uint32(KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)

    def add_max_alibi_bias(self, bias: float):
        self.add_float32(KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float):
        self.add_float32(KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)

    def add_layer_norm_eps(self, value: float):
        self.add_float32(KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float):
        self.add_float32(KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_rope_dimension_count(self, count: int):
        self.add_uint32(KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_freq_base(self, value: float):
        self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)

    def add_rope_scale_linear(self, value: float):
        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)

    def add_token_list(self, tokens: List):
        self.add_array(KEY_TOKENIZER_LIST, tokens)

    def add_token_merges(self, merges: List):
        self.add_array(KEY_TOKENIZER_MERGES, merges)

    def add_token_types(self, types: List[int]):
        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)

    def add_token_scores(self, scores: List[float]):
        self.add_array(KEY_TOKENIZER_SCORES, scores)

    def add_bos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)

    def add_eos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)

    def add_unk_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)

    def add_sep_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)

    def add_pad_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)

# Example usage with a file:
if __name__ == "__main__":
    gguf_writer = GGUFWriter("example.gguf", "llama")
    # general.architecture is already written by the GGUFWriter constructor

    gguf_writer.add_block_count(12)
    gguf_writer.add_uint32("answer", 42)              # write a 32-bit integer
    gguf_writer.add_float32("answer_in_float", 42.0)  # write a 32-bit float
    gguf_writer.add_custom_alignment(64)

    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
    tensor3 = np.ones((96,), dtype=np.float32) * 102.0

    gguf_writer.add_tensor("tensor1", tensor1)
    gguf_writer.add_tensor("tensor2", tensor2)
    gguf_writer.add_tensor("tensor3", tensor3)

    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()
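
    # Sanity check (illustrative): read back the fixed-size header that
    # write_header_to_file() produced -- magic, version, tensor count and
    # kv count, each a little-endian uint32.
    with open("example.gguf", "rb") as f:
        magic, version, n_tensors, n_kv = struct.unpack("<IIII", f.read(16))
    assert magic == GGUF_MAGIC and version == GGUF_VERSION
    print("tensors:", n_tensors, "kv pairs:", n_kv)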