migrate-ggml-2023-03-30-pr613.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. # Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
  2. #
  3. # We caused a breaking change to the file format on 2023-03-30 in:
  4. # https://github.com/ggerganov/llama.cpp/pull/613
  5. #
  6. # (1) If you still have the Meta LLaMA .pth files, then close this
  7. # file now; you can just run `convert-pth-to-ggml.py` again to
  8. # migrate to the new format. The tool is easier to use too. It
  9. # isn't necessary anymore to manage split output files because
  10. # the new format always combines things into a single file.
  11. #
  12. # (2) If you deleted the Meta LLaMA .pth files to save on disk
  13. # space, then this tool is intended to help you. Please check
  14. # out the instructions below.
  15. #
  16. # USAGE
  17. #
  18. # python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
  19. #
  20. # PREREQUISITES
  21. #
  22. # pip install numpy
  23. # cd llama.cpp
  24. # make -j4
  25. #
  26. # EXAMPLE (7B MODEL)
  27. #
  28. # # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
  29. # python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
  30. #
  31. # # check that it works
  32. # ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
  33. #
  34. # # you can delete the old files
  35. # rm -f models/7B/ggml-model-f16.bin
  36. # mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
  37. #
  38. # EXAMPLE (13B MODEL)
  39. #
  40. # # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
  41. # python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
  42. #
  43. # # check that it works
  44. # ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
  45. #
  46. # # you can delete the old files
  47. # rm -f models/13B/ggml-model-f16.bin*
  48. # mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
  49. #
  50. import argparse
  51. import os
  52. import sys
  53. import json
  54. import struct
  55. import numpy as np
  56. QK = 32
  57. GGML_TYPE_Q4_0 = 0
  58. GGML_TYPE_Q4_1 = 1
  59. GGML_TYPE_I8 = 2
  60. GGML_TYPE_I16 = 3
  61. GGML_TYPE_I32 = 4
  62. GGML_TYPE_F16 = 5
  63. GGML_TYPE_F32 = 6
  64. WTYPE_NAMES = {
  65. 0: "F32",
  66. 1: "F16",
  67. 2: "Q4_0",
  68. 3: "Q4_1",
  69. }
  70. WTYPES = {
  71. 0: GGML_TYPE_F32,
  72. 1: GGML_TYPE_F16,
  73. 2: GGML_TYPE_Q4_0,
  74. 3: GGML_TYPE_Q4_1,
  75. }
  76. GGML_BLCK_SIZE = {
  77. GGML_TYPE_Q4_0: QK,
  78. GGML_TYPE_Q4_1: QK,
  79. GGML_TYPE_I8: 1,
  80. GGML_TYPE_I16: 1,
  81. GGML_TYPE_I32: 1,
  82. GGML_TYPE_F16: 1,
  83. GGML_TYPE_F32: 1,
  84. }
  85. GGML_TYPE_SIZE = {
  86. GGML_TYPE_Q4_0: 4 + QK//2,
  87. GGML_TYPE_Q4_1: 4*2 + QK//2,
  88. GGML_TYPE_I8: 1,
  89. GGML_TYPE_I16: 2,
  90. GGML_TYPE_I32: 4,
  91. GGML_TYPE_F16: 2,
  92. GGML_TYPE_F32: 4,
  93. }
  94. HPARAMS = [
  95. 'magic', # int32
  96. 'version', # int32
  97. 'n_vocab', # int32
  98. 'n_embd', # int32
  99. 'n_mult', # int32
  100. 'n_head', # int32
  101. 'n_layer', # int32
  102. 'n_rot', # int32
  103. 'f16', # int32
  104. ]
  105. def read_hparams(fin):
  106. struct_fmt = "i" * len(HPARAMS)
  107. struct_size = struct.calcsize(struct_fmt)
  108. buf = fin.read(struct_size)
  109. ints = struct.unpack(struct_fmt, buf)
  110. hparams = dict(zip(HPARAMS, ints))
  111. return hparams
  112. def write_hparams(fout, hparams):
  113. struct_fmt = "i" * len(HPARAMS)
  114. struct_size = struct.calcsize(struct_fmt)
  115. ints = [hparams[h] for h in HPARAMS]
  116. fout.write(struct.pack(struct_fmt, *ints))
  117. def read_tokens(fin, hparams):
  118. tokens = []
  119. for i in range(hparams['n_vocab']):
  120. len_b = fin.read(4)
  121. (length,) = struct.unpack("i", len_b)
  122. word = fin.read(length)
  123. score_b = fin.read(4)
  124. (score,) = struct.unpack("f", score_b)
  125. tokens.append((word, score))
  126. return tokens
  127. def write_tokens(fout, tokens):
  128. for word, score in tokens:
  129. fout.write(struct.pack("i", len(word)))
  130. fout.write(word)
  131. fout.write(struct.pack("f", score))
  132. def ggml_nelements(shape):
  133. r = 1
  134. for i in shape:
  135. r *= i
  136. return r
  137. def ggml_nbytes(shape, ftype):
  138. x = ggml_nelements(shape)
  139. t = WTYPES[ftype]
  140. x *= GGML_TYPE_SIZE[t]
  141. x //= GGML_BLCK_SIZE[t]
  142. return x
  143. def copy_tensors(fin, fout, part_id, n_parts):
  144. while True:
  145. b = fin.read(4)
  146. if not b: break
  147. (n_dims,) = struct.unpack("i", b)
  148. b = fin.read(4)
  149. (length,) = struct.unpack("i", b)
  150. b = fin.read(4)
  151. (ftype,) = struct.unpack("i", b)
  152. assert n_dims in (1, 2)
  153. partshape = list(range(n_dims))
  154. for i in range(n_dims):
  155. b = fin.read(4)
  156. partshape[i] = struct.unpack("i", b)[0]
  157. partshape = list(reversed(partshape))
  158. name = fin.read(length)
  159. data = fin.read(ggml_nbytes(partshape, ftype))
  160. blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
  161. type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
  162. print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
  163. # determine dimension along which multipart tensor is sharded
  164. #
  165. # split_dim 0 regex:
  166. # - output.*
  167. # - layers.*.attention.wq.weight
  168. # - layers.*.attention.wk.weight
  169. # - layers.*.attention.wv.weight
  170. # - layers.*.feed_forward.w1.weight
  171. # - layers.*.feed_forward.w3.weight
  172. #
  173. # split_dim 1 regex:
  174. # - tok_embeddings.*
  175. # - layers.*.attention.wo.weight
  176. # - layers.*.feed_forward.w2.weight
  177. #
  178. if n_dims > 1:
  179. split_dim = 1
  180. if b"tok_embeddings" in name:
  181. split_dim = 1
  182. elif b"layers" in name:
  183. if b"attention.wo.weight" in name:
  184. split_dim = 1
  185. elif b"feed_forward.w2.weight" in name:
  186. split_dim = 1
  187. else:
  188. split_dim = 0
  189. elif b"output" in name:
  190. split_dim = 0
  191. # output tensor header
  192. fullshape = list(partshape)
  193. if n_dims > 1:
  194. fullshape[split_dim] *= n_parts
  195. fout.write(struct.pack("iii", n_dims, len(name), ftype))
  196. for dim in reversed(fullshape):
  197. fout.write(struct.pack("i", dim))
  198. fout.write(name)
  199. # ensure tensor data is aligned
  200. tensor_data_offset = fout.tell()
  201. while tensor_data_offset % QK != 0:
  202. fout.write(struct.pack("B", 0))
  203. tensor_data_offset += 1
  204. # output unified mappable tensor data
  205. if n_dims == 1 or n_parts == 1:
  206. # copy tensor which we thankfully received in one piece
  207. if part_id == 0:
  208. fout.write(data)
  209. elif split_dim == 0:
  210. # reassemble multifile tensor containing some of the rows
  211. rows_per_chunk = partshape[0]
  212. current_row = part_id * rows_per_chunk
  213. bytes_per_row = fullshape[1] // blck_size * type_size
  214. offset = current_row * bytes_per_row
  215. fout.seek(tensor_data_offset + offset)
  216. fout.write(data)
  217. elif split_dim == 1:
  218. # reassemble multifile tensor containing some of the cols
  219. cols_per_chunk = partshape[1]
  220. current_col = part_id * cols_per_chunk
  221. bpr = partshape[1] // blck_size * type_size
  222. bytes_per_row = fullshape[1] // blck_size * type_size
  223. offset_current_col = current_col // blck_size * type_size
  224. for row in range(partshape[0]):
  225. offset_row = row * bytes_per_row
  226. offset = offset_row + offset_current_col
  227. fout.seek(tensor_data_offset + offset)
  228. fout.write(data[row * bpr:row * bpr + bpr])
  229. # advance file position to next tensor
  230. fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
  231. def parse_args():
  232. parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
  233. parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
  234. parser.add_argument('fout_path', help='your new ggjt file name')
  235. return parser.parse_args()
  236. def main():
  237. args = parse_args()
  238. assert args.fin_path
  239. assert args.fout_path
  240. assert args.fin_path != args.fout_path
  241. with open(args.fin_path, "rb") as fin:
  242. hparams = read_hparams(fin)
  243. tokens = read_tokens(fin, hparams)
  244. if hparams['magic'] == 0x67676a74: # ggjt
  245. print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
  246. sys.exit(1)
  247. if hparams['magic'] != 0x67676d66: # ggmf
  248. print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
  249. sys.exit(1)
  250. hparams['magic'] = 0x67676a74 # ggjt
  251. # count number of multipart files by convention
  252. n_parts = 1
  253. while True:
  254. if os.path.exists(f"{args.fin_path}.{n_parts}"):
  255. n_parts += 1
  256. else:
  257. break
  258. # we output a single file for ggml
  259. with open(args.fout_path, "wb") as fout:
  260. write_hparams(fout, hparams)
  261. write_tokens(fout, tokens)
  262. offset_of_tensors = fout.tell()
  263. # the tensors we load could be split across multiple files
  264. for part_id in range(n_parts):
  265. fout.seek(offset_of_tensors)
  266. print(f"Processing part {part_id+1} of {n_parts}\n")
  267. fin_path = args.fin_path
  268. if part_id > 0:
  269. fin_path += f".{part_id}"
  270. with open(fin_path, "rb") as fin:
  271. read_tokens(fin, read_hparams(fin))
  272. copy_tensors(fin, fout, part_id, n_parts)
  273. print(f"Done. Output file: {args.fout_path}\n")
  274. if __name__ == "__main__":
  275. main()