convert-unversioned-ggml-to-ggml.py

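This script upgrades old, unversioned ggml model files to the versioned ggmf format: it rewrites the file header with the new magic value and a file version field, re-serializes the vocabulary (with token scores) from the original LLaMA tokenizer.model via SentencePiece, and copies the remaining tensor data through unchanged. Each converted file is replaced in place, with the original kept under an .orig suffix.
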
#!/usr/bin/env python3
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys

from sentencepiece import SentencePieceProcessor

# hyperparameters stored in the old header, in order, between the magic value and rot/ftype
HPARAMS = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

def parse_args():
    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
    parser.add_argument('dir_model', help='directory containing ggml .bin files')
    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return parser.parse_args()

def read_header(f_in):
    # old unversioned header: magic, the five HPARAMS fields, rot and ftype, all int32
    struct_fmt = "i" * (3 + len(HPARAMS))
    struct_size = struct.calcsize(struct_fmt)
    buf = f_in.read(struct_size)
    return struct.unpack(struct_fmt, buf)

def write_header(f_out, header):
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header

    if magic != 0x67676d6c:
        raise Exception('Invalid file magic. Must be an old style ggml file.')

    values = [
        0x67676d66,  # magic: ggmf in hex
        1,           # file version
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype
    ]
    f_out.write(struct.pack("i" * len(values), *values))

def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            # byte tokens are pieces of the form "<0xNN>"
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        # each vocab entry in the new format: int32 length, raw bytes, float32 score
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

def read_tokens(f_in, tokenizer):
    # skip over the vocab stored in the old file; it is rewritten by write_tokens
    for i in range(tokenizer.vocab_size()):
        len_b = f_in.read(4)
        (length,) = struct.unpack("i", len_b)
        f_in.read(length)

def copy_all_data(f_out, f_in):
    while True:
        buf = f_in.read(1024 * 1024)
        if not buf:
            break
        f_out.write(buf)

def convert_one_file(path_in, tokenizer):
    path_tmp = f"{path_in}.tmp"
    path_orig = f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
        write_header(f_out, read_header(f_in))
        read_tokens(f_in, tokenizer)
        write_tokens(f_out, tokenizer)
        copy_all_data(f_out, f_in)
    os.rename(path_in, path_orig)
    os.rename(path_tmp, path_in)

def main():
    args = parse_args()
    files = []
    files.extend(glob.glob(f"{args.dir_model}/*.bin"))
    files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))

    tokenizer = SentencePieceProcessor(args.tokenizer_model)

    for file in files:
        convert_one_file(file, tokenizer)

if __name__ == "__main__":
    main()
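
A minimal usage sketch, assuming the .bin files live in models/7B and the tokenizer.model sits in models/ (both paths are illustrative):

python3 convert-unversioned-ggml-to-ggml.py models/7B models/tokenizer.model

To spot-check a converted file, the first eight bytes should decode to the new magic and version written by write_header (the file name below is an assumption, substitute your own converted .bin):

import struct
# read back the header fields written by write_header
with open("models/7B/ggml-model-f16.bin", "rb") as f:
    magic, version = struct.unpack("ii", f.read(8))
print(hex(magic), version)  # expected: 0x67676d66 ('ggmf') and 1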