convert-pth-to-ggml.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. # Convert a LLaMA model checkpoint to a ggml compatible file
  2. #
  3. # Load the model using Torch
  4. # Iterate over all variables and write them to a binary file.
  5. #
  6. # For each variable, write the following:
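  7. # - Number of dimensions (int)
  8. # - Name length (int), followed by the data type (int: 0 = float32, 1 = float16)
  9. # - Dimensions (int[n_dims]), written in reverse order
  10. # - Name (char[name_length]), followed by zero padding up to a 32-byte boundary
  11. # - Data (float[n_elements], where n_elements is the product of the dimensions)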
  12. #
  13. # At the start of the ggml file we write the model parameters
  14. # and vocabulary.
  15. #
  16. import argparse
  17. import os
  18. import sys
  19. import json
  20. import struct
  21. import numpy as np
  22. import torch
  23. from sentencepiece import SentencePieceProcessor
  24. QK = 32
  25. GGML_TYPE_Q4_0 = 0
  26. GGML_TYPE_Q4_1 = 1
  27. GGML_TYPE_I8 = 2
  28. GGML_TYPE_I16 = 3
  29. GGML_TYPE_I32 = 4
  30. GGML_TYPE_F16 = 5
  31. GGML_TYPE_F32 = 6
  32. WTYPES = {
  33. 0: GGML_TYPE_F32,
  34. 1: GGML_TYPE_F16,
  35. 2: GGML_TYPE_Q4_0,
  36. 3: GGML_TYPE_Q4_1,
  37. }
  38. GGML_BLCK_SIZE = {
  39. GGML_TYPE_Q4_0: QK,
  40. GGML_TYPE_Q4_1: QK,
  41. GGML_TYPE_I8: 1,
  42. GGML_TYPE_I16: 1,
  43. GGML_TYPE_I32: 1,
  44. GGML_TYPE_F16: 1,
  45. GGML_TYPE_F32: 1,
  46. }
  47. GGML_TYPE_SIZE = {
  48. GGML_TYPE_Q4_0: 4 + QK/2,
  49. GGML_TYPE_Q4_1: 4*2 + QK/2,
  50. GGML_TYPE_I8: 1,
  51. GGML_TYPE_I16: 2,
  52. GGML_TYPE_I32: 4,
  53. GGML_TYPE_F16: 2,
  54. GGML_TYPE_F32: 4,
  55. }
  56. def ggml_nelements(shape):
  57. r = 1
  58. for i in shape:
  59. r *= i
  60. return r
  61. def ggml_nbytes(shape, ftype):
  62. x = ggml_nelements(shape)
  63. t = WTYPES[ftype]
  64. x *= GGML_TYPE_SIZE[t]
  65. x //= GGML_BLCK_SIZE[t]
  66. return x
  67. def parse_args():
  68. parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
  69. parser.add_argument('dir_model', help='directory containing the model checkpoint')
  70. parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
  71. parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
  72. return parser.parse_args()
  73. def get_n_parts(dim):
  74. mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
  75. n_parts = mappings.get(dim)
  76. if n_parts is None:
  77. print(f"Invalid dim: {dim}")
  78. sys.exit(1)
  79. print(f"n_parts = {n_parts}\n")
  80. return n_parts
  81. def load_hparams_and_tokenizer(dir_model):
  82. # `dir_model` is something like `models/7B` or `models/7B/`.
  83. # "tokenizer.model" is expected under model's parent dir.
  84. # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
  85. # Let's use the model's parent dir directly.
  86. model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
  87. fname_hparams = f"{dir_model}/params.json"
  88. fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
  89. with open(fname_hparams, "r") as f:
  90. hparams = json.load(f)
  91. print(hparams)
  92. tokenizer = SentencePieceProcessor(fname_tokenizer)
  93. hparams.update({"vocab_size": tokenizer.vocab_size()})
  94. return hparams, tokenizer
  95. def write_header(fout, hparams, ftype):
  96. keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
  97. values = [
  98. 0x67676a74, # magic: ggjt in hex
  99. 1, # file version
  100. *[hparams[key] for key in keys],
  101. hparams["dim"] // hparams["n_heads"], # rot (obsolete)
  102. ftype
  103. ]
  104. fout.write(struct.pack("i" * len(values), *values))
  105. def write_tokens(fout, tokenizer):
  106. for i in range(tokenizer.vocab_size()):
  107. if tokenizer.is_unknown(i):
  108. text = " \u2047 ".encode("utf-8")
  109. elif tokenizer.is_control(i):
  110. text = b""
  111. elif tokenizer.is_byte(i):
  112. piece = tokenizer.id_to_piece(i)
  113. if len(piece) != 6:
  114. print(f"Invalid token: {piece}")
  115. sys.exit(1)
  116. byte_value = int(piece[3:-1], 16)
  117. text = struct.pack("B", byte_value)
  118. else:
  119. text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
  120. fout.write(struct.pack("i", len(text)))
  121. fout.write(text)
  122. fout.write(struct.pack("f", tokenizer.get_score(i)))
  123. def process_and_write_variables(fout, model, ftype, part_id, n_parts):
  124. for name, datao in model.items():
  125. if name.endswith("freqs"):
  126. continue
  127. # remove dimensions with a single element
  128. data = datao.numpy().squeeze()
  129. partshape = data.shape
  130. n_dims = len(data.shape)
  131. assert n_dims in (1, 2)
  132. print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
  133. # coerce single-dimensional tensors from float16 to float32
  134. ftype_cur = 1
  135. if ftype == 0 or n_dims == 1:
  136. print(" Converting to float32")
  137. data = data.astype(np.float32)
  138. ftype_cur = 0
  139. blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
  140. type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
  141. # determine dimension along which multipart tensor is sharded
  142. #
  143. # split_dim 0 regex:
  144. # - output.*
  145. # - layers.*.attention.wq.weight
  146. # - layers.*.attention.wk.weight
  147. # - layers.*.attention.wv.weight
  148. # - layers.*.feed_forward.w1.weight
  149. # - layers.*.feed_forward.w3.weight
  150. #
  151. # split_dim 1 regex:
  152. # - tok_embeddings.*
  153. # - layers.*.attention.wo.weight
  154. # - layers.*.feed_forward.w2.weight
  155. #
  156. if n_dims > 1:
  157. split_dim = 1
  158. if "tok_embeddings" in name:
  159. split_dim = 1
  160. elif "layers" in name:
  161. if "attention.wo.weight" in name:
  162. split_dim = 1
  163. elif "feed_forward.w2.weight" in name:
  164. split_dim = 1
  165. else:
  166. split_dim = 0
  167. elif "output" in name:
  168. split_dim = 0
  169. # output tensor header
  170. fullshape = list(partshape)
  171. if n_dims > 1:
  172. fullshape[split_dim] *= n_parts
  173. sname = name.encode('utf-8')
  174. fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
  175. for dim in reversed(fullshape):
  176. fout.write(struct.pack("i", dim))
  177. fout.write(sname)
  178. # ensure tensor data is aligned
  179. tensor_data_offset = fout.tell()
  180. while tensor_data_offset % QK != 0:
  181. fout.write(struct.pack("B", 0))
  182. tensor_data_offset += 1
  183. # output unified mappable tensor data
  184. if n_dims == 1 or n_parts == 1:
  185. # copy tensor which we thankfully received in one piece
  186. if part_id == 0:
  187. data.tofile(fout)
  188. elif split_dim == 0:
  189. # reassemble multifile tensor containing some of the rows
  190. rows_per_chunk = partshape[0]
  191. current_row = part_id * rows_per_chunk
  192. bytes_per_row = fullshape[1] // blck_size * type_size
  193. offset = current_row * bytes_per_row
  194. fout.seek(tensor_data_offset + offset)
  195. data.tofile(fout)
  196. elif split_dim == 1:
  197. # reassemble multifile tensor containing some of the cols
  198. cols_per_chunk = partshape[1]
  199. current_col = part_id * cols_per_chunk
  200. bytes_per_row = fullshape[1] // blck_size * type_size
  201. offset_current_col = current_col // blck_size * type_size
  202. for row in range(partshape[0]):
  203. offset_row = row * bytes_per_row
  204. offset = offset_row + offset_current_col
  205. fout.seek(tensor_data_offset + offset)
  206. data[row].tofile(fout)
  207. # advance file position to next tensor
  208. fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
  209. def main():
  210. args = parse_args()
  211. dir_model = args.dir_model
  212. ftype = args.ftype
  213. ftype_str = ["f32", "f16"]
  214. hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
  215. print(args)
  216. # if only writing vocab to file
  217. if args.vocab_only:
  218. fname_model = f"{dir_model}/consolidated.00.pth"
  219. fname_out = f"{dir_model}/ggml-vocab.bin"
  220. print(f"Extracting only the vocab from '{fname_model}'\n")
  221. model = torch.load(fname_model, map_location="cpu")
  222. with open(fname_out, "wb") as fout:
  223. write_header(fout, hparams, ftype)
  224. write_tokens(fout, tokenizer)
  225. del model
  226. print(f"Done. Output file: {fname_out}\n")
  227. return
  228. n_parts = get_n_parts(hparams["dim"])
  229. fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
  230. # we output a single file for ggml
  231. with open(fname_out, "wb") as fout:
  232. write_header(fout, hparams, ftype)
  233. write_tokens(fout, tokenizer)
  234. offset_of_tensors = fout.tell()
  235. # the tensors we load could be split across multiple files
  236. for part_id in range(n_parts):
  237. fout.seek(offset_of_tensors)
  238. print(f"Processing part {part_id+1} of {n_parts}\n")
  239. fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
  240. model = torch.load(fname_model, map_location="cpu")
  241. process_and_write_variables(fout, model, ftype, part_id, n_parts)
  242. del model
  243. print(f"Done. Output file: {fname_out}\n")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()