# gguf.py

import shutil
import sys
import struct
import tempfile
import numpy as np

from enum import IntEnum, auto
from typing import Any, IO, List, Optional

#
# constants
#
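
# GGUF_MAGIC is the ASCII bytes "GGUF" read as a little-endian uint32.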
GGUF_MAGIC = 0x46554747
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32

# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository"

# LLM
KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"

# attention
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"

# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"

#
# recommended mapping of model tensor names for storage in gguf
#
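# Block tensors are named "blk.{bid}.<tensor_name>", where {bid} is the
# zero-based block (layer) index.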


class MODEL_ARCH(IntEnum):
    LLAMA = auto()
    FALCON = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_NORM = auto()


MODEL_ARCH_NAMES = {
    MODEL_ARCH.LLAMA: "llama",
    MODEL_ARCH.FALCON: "falcon",
    MODEL_ARCH.GPT2: "gpt2",
    MODEL_ARCH.GPTJ: "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",
    MODEL_ARCH.MPT: "mpt",
}

MODEL_TENSOR_NAMES = {
    MODEL_ARCH.LLAMA: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
        MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
        MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPTNEOX: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.FALCON: {
        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
        MODEL_TENSOR.OUTPUT: "output",
        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    },
    MODEL_ARCH.GPT2: {
        # TODO
    },
    # TODO
}

# tensors that will not be serialized
MODEL_TENSOR_SKIP = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
}


# TODO: the following helper functions should be removed
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
# REMOVE
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
    for skip in MODEL_TENSOR_SKIP.get(arch, []):
        for i in range(n_blocks):
            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
                return True
    return False


def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
    tensor_map = {}

    # Token embeddings
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
    tensor_map["gpt_neox.embed_in"] = mapped_to  # gptneox
    tensor_map["transformer.wte"] = mapped_to  # gpt2 mpt
    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
    tensor_map["model.embed_tokens"] = mapped_to  # llama-hf
    tensor_map["tok_embeddings"] = mapped_to  # llama-pth

    # Position embeddings
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
    tensor_map["transformer.wpe"] = mapped_to  # gpt2

    # Output
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
    tensor_map["embed_out"] = mapped_to  # gptneox
    tensor_map["lm_head"] = mapped_to  # gpt2 mpt falcon llama-hf
    tensor_map["output"] = mapped_to  # llama-pth

    # Output norm
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
    tensor_map["transformer.ln_f"] = mapped_to  # gpt2 falcon
    tensor_map["transformer.norm_f"] = mapped_to  # mpt
    tensor_map["model.norm"] = mapped_to  # llama-hf
    tensor_map["norm"] = mapped_to  # llama-pth

    # Rope frequencies
    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
    tensor_map["rope.freqs"] = mapped_to  # llama-pth

    # Attention and feed-forward blocks
    for i in range(0, n_blocks):
        # Attention norm
        # TODO: is there a simpler way to write these 2 lines in Python?
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
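        # One possible answer to the TODO above (an untested sketch, not in
        # the original): wrap the lookup-and-format in a small local helper:
        #   def fmt(tensor):
        #       name = MODEL_TENSOR_NAMES[arch].get(tensor)
        #       return name.format(bid=i) if name is not None else None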
        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to  # falcon7b
        tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to  # falcon40b
        tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention_norm"] = mapped_to  # llama-pth

        # Attention norm 2
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to  # falcon40b

        # Attention query-key-value
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to  # falcon

        # Attention query
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.wq"] = mapped_to  # llama-pth

        # Attention key
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.wk"] = mapped_to  # llama-pth

        # Attention value
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.wv"] = mapped_to  # llama-pth

        # Attention output
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to  # falcon
        tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.wo"] = mapped_to  # llama-pth

        # Rotary embeddings
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth

        # Feed-forward norm
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to  # mpt
        tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to  # llama-pth

        # Feed-forward up
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # falcon
        tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to  # llama-pth

        # Feed-forward gate
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to  # llama-pth

        # Feed-forward down
        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # gptneox
        tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to  # gpt2
        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to  # mpt
        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # falcon
        tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to  # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to  # llama-pth

    return tensor_map
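
# Example use of the mapping (a sketch, not in the original file):
#   tmap = get_tensor_name_map(MODEL_ARCH.LLAMA, n_blocks=32)
#   tmap["model.layers.0.self_attn.q_proj"]  # -> "blk.0.attn_q"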


class TokenType(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6

#
# implementation
#
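
# File layout: header (magic, version, tensor count, kv count), then the
# key-value metadata, then the tensor info records, then the tensor data
# itself, padded to the configured alignment.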


class GGMLQuantizationType(IntEnum):
    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15


class GGUFValueType(IntEnum):
    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9

    @staticmethod
    def get_type(val):
        if isinstance(val, (str, bytes, bytearray)):
            return GGUFValueType.STRING
        elif isinstance(val, list):
            return GGUFValueType.ARRAY
        elif isinstance(val, float):
            return GGUFValueType.FLOAT32
        elif isinstance(val, bool):
            return GGUFValueType.BOOL
        elif isinstance(val, int):
            return GGUFValueType.INT32
        else:
            print("Unknown type: " + str(type(val)))
            sys.exit()
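

# Typical usage (see the example at the bottom of this file): construct a
# GGUFWriter, add metadata and tensors, then write the header, the kv data
# and the tensor data in that order, and close the file.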
class GGUFWriter:
    def __init__(self, path: str, arch: str, use_temp_file: bool = True):
        self.fout = open(path, "wb")
        self.arch = arch
        self.offset_tensor = 0
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.kv_data = b""
        self.kv_data_count = 0
        self.ti_data = b""
        self.ti_data_count = 0
        self.add_architecture()
        self.use_temp_file = use_temp_file
        self.tensors = []

    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
        self.fout.write(struct.pack("<I", GGUF_VERSION))
        self.fout.write(struct.pack("<I", self.ti_data_count))
        self.fout.write(struct.pack("<I", self.kv_data_count))
        self.flush()
        # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

    def write_kv_data_to_file(self):
        self.fout.write(self.kv_data)
        self.flush()

    def write_ti_data_to_file(self):
        self.fout.write(self.ti_data)
        self.flush()

    def add_key(self, key: str):
        self.add_val(key, GGUFValueType.STRING, add_vtype=False)

    def add_uint8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT8)

    def add_int8(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT8)

    def add_uint16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT16)

    def add_int16(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT16)

    def add_uint32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT32)

    def add_int32(self, key: str, val: int):
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT32)

    def add_float32(self, key: str, val: float):
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT32)

    def add_bool(self, key: str, val: bool):
        self.add_key(key)
        self.add_val(val, GGUFValueType.BOOL)

    def add_string(self, key: str, val: str):
        if len(val) == 0:
            return
        self.add_key(key)
        self.add_val(val, GGUFValueType.STRING)

    def add_array(self, key: str, val: list):
        if not isinstance(val, list):
            raise ValueError("Value must be a list for array type")
        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)

    def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
        if vtype is None:
            vtype = GGUFValueType.get_type(val)

        if add_vtype:
            self.kv_data += struct.pack("<I", vtype)
            self.kv_data_count += 1

        if vtype == GGUFValueType.UINT8:
            self.kv_data += struct.pack("<B", val)
        elif vtype == GGUFValueType.INT8:
            self.kv_data += struct.pack("<b", val)
        elif vtype == GGUFValueType.UINT16:
            self.kv_data += struct.pack("<H", val)
        elif vtype == GGUFValueType.INT16:
            self.kv_data += struct.pack("<h", val)
        elif vtype == GGUFValueType.UINT32:
            self.kv_data += struct.pack("<I", val)
        elif vtype == GGUFValueType.INT32:
            self.kv_data += struct.pack("<i", val)
        elif vtype == GGUFValueType.FLOAT32:
            self.kv_data += struct.pack("<f", val)
        elif vtype == GGUFValueType.BOOL:
            self.kv_data += struct.pack("?", val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
            self.kv_data += struct.pack("<I", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY:
            ltype = {GGUFValueType.get_type(item) for item in val}
            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
            self.kv_data += struct.pack("<I", list(ltype)[0])
            self.kv_data += struct.pack("<I", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
            raise ValueError("Invalid GGUF metadata value type")
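
    # Round x up to the next multiple of n, e.g. ggml_pad(100, 32) == 128.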
    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n

    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
        self.ti_data += struct.pack("<I", len(encoded_name))
        self.ti_data += encoded_name

        n_dims = len(tensor_shape)
        self.ti_data += struct.pack("<I", n_dims)
        for i in range(n_dims):
            # dimensions are written in reverse order, matching ggml's ordering
            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])

        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
            dtype = raw_dtype
        self.ti_data += struct.pack("<I", dtype)
        self.ti_data += struct.pack("<Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1
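
    # Tensor data is buffered in a spooled temporary file (kept in memory up
    # to max_size, then spilled to disk) so that the header, kv data and
    # tensor info, whose sizes are not known up front, can be written first.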
    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
        if self.use_temp_file and not hasattr(self, "temp_file"):
            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            self.temp_file.seek(0)

        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes

        if not self.use_temp_file:
            self.tensors.append((tensor, pad))
            return

        tensor.tofile(self.temp_file)
        if pad != 0:
            self.temp_file.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray):
        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
        if pad != 0:
            self.fout.write(bytes([0] * pad))

        tensor.tofile(self.fout)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
        if pad != 0:
            self.fout.write(bytes([0] * pad))

    def write_tensors_to_file(self):
        self.write_ti_data_to_file()

        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
        if pad != 0:
            self.fout.write(bytes([0] * pad))

        if not self.use_temp_file:
            for (currtensor, currpad) in self.tensors:
                currtensor.tofile(self.fout)
                if currpad != 0:
                    self.fout.write(bytes([0] * currpad))
            return

        self.temp_file.seek(0)
        shutil.copyfileobj(self.temp_file, self.fout)
        self.flush()
        self.temp_file.close()

    def flush(self):
        self.fout.flush()

    def close(self):
        self.fout.close()

    def add_architecture(self):
        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)

    def add_author(self, author: str):
        self.add_string(KEY_GENERAL_AUTHOR, author)

    def add_url(self, url: str):
        self.add_string(KEY_GENERAL_URL, url)

    def add_description(self, description: str):
        self.add_string(KEY_GENERAL_DESCRIPTION, description)

    def add_source_url(self, url: str):
        self.add_string(KEY_GENERAL_SOURCE_URL, url)

    def add_source_hf_repo(self, repo: str):
        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)

    def add_name(self, name: str):
        self.add_string(KEY_GENERAL_NAME, name)

    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
        self.add_uint32(KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int):
        self.data_alignment = alignment
        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)

    def add_context_length(self, length: int):
        self.add_uint32(KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int):
        self.add_uint32(KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int):
        self.add_uint32(KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int):
        self.add_uint32(KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool):
        self.add_bool(KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_head_count(self, count: int):
        self.add_uint32(KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int):
        self.add_uint32(KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)

    def add_max_alibi_bias(self, bias: float):
        self.add_float32(KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float):
        self.add_float32(KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)

    def add_layer_norm_eps(self, value: float):
        self.add_float32(KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float):
        self.add_float32(KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_rope_dimension_count(self, count: int):
        self.add_uint32(KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_scale_linear(self, value: float):
        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)

    def add_token_list(self, tokens: List):
        self.add_array(KEY_TOKENIZER_LIST, tokens)

    def add_token_merges(self, merges: List):
        self.add_array(KEY_TOKENIZER_MERGES, merges)

    def add_token_types(self, types: List[int]):
        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)

    def add_token_scores(self, scores: List[float]):
        self.add_array(KEY_TOKENIZER_SCORES, scores)

    def add_bos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)

    def add_eos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)

    def add_unk_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)

    def add_sep_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)

    def add_pad_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)


# Example usage:
if __name__ == "__main__":
    # Example usage with a file
    gguf_writer = GGUFWriter("example.gguf", "llama")

    # note: the constructor already adds the general.architecture kv pair,
    # so it does not need to be added again here
    gguf_writer.add_block_count(12)
    gguf_writer.add_uint32("answer", 42)              # Write a 32-bit integer
    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
    gguf_writer.add_custom_alignment(64)

    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
    tensor3 = np.ones((96,), dtype=np.float32) * 102.0

    gguf_writer.add_tensor("tensor1", tensor1)
    gguf_writer.add_tensor("tensor2", tensor2)
    gguf_writer.add_tensor("tensor3", tensor3)

    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()
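
    # Read-back sanity check (a sketch, not part of the original example):
    # reopen the file just written and confirm the header fields.
    with open("example.gguf", "rb") as f:
        magic, version, n_tensors, n_kv = struct.unpack("<IIII", f.read(16))
        assert magic == GGUF_MAGIC and version == GGUF_VERSION
        print(f"read back: {n_tensors} tensors, {n_kv} kv pairs")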