convert-gpt4all-to-ggml.py

#!/usr/bin/env python3
#
# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
#
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
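#
# Usage:
#   python3 convert-gpt4all-to-ggml.py path/to/gpt4all-lora-quantized.bin path/to/tokenizer.model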
import argparse
import glob
import os
import struct
import sys

from sentencepiece import SentencePieceProcessor
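
# Hyperparameter fields stored in the old GPT4All/ggml header, in on-disk order.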
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
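
# Parse the two required positional arguments: the GPT4All model to upgrade and
# the LLaMA SentencePiece tokenizer.model used to rebuild the vocabulary.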
def parse_args():
    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return parser.parse_args()
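
# Read the old-style header as 3 + len(HPARAMS) ints: the magic, the HPARAMS
# fields, then rot and ftype.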
def read_header(f_in):
    struct_fmt = "i" * (3 + len(HPARAMS))
    struct_size = struct.calcsize(struct_fmt)
    buf = f_in.read(struct_size)
    return struct.unpack(struct_fmt, buf)
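
# Check the old 'ggml' magic and write the new-style header, which adds a file
# version field after the new 'ggmf' magic.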
  24. def write_header(f_out, header):
  25. (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
  26. if magic != 0x67676d6c:
  27. raise Exception('Invalid file magic. Must be an old style ggml file.')
  28. values = [
  29. 0x67676d66, # magic: ggml in hex
  30. 1, # file version
  31. vocab_size,
  32. dim,
  33. multiple_of,
  34. n_heads,
  35. n_layers,
  36. rot,
  37. ftype
  38. ]
  39. f_out.write(struct.pack("i" * len(values), *values))
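
# Re-emit the vocabulary from the SentencePiece model: for each token id, write
# its length, bytes and score. Unknown tokens become " \u2047 ", control tokens
# an empty string, and byte tokens ("<0xNN>" pieces) their raw byte value.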
def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

    # TODO: GPT4All - add extra <pad> token
    text = "<pad>".encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", 0.0))
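
# The old file stores its own token table (length-prefixed strings, no scores);
# read past it so f_in is positioned at the start of the tensor data.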
def read_tokens(f_in, tokenizer):
    for i in range(tokenizer.vocab_size()):
        len_b = f_in.read(4)
        (length,) = struct.unpack("i", len_b)
        f_in.read(length)
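
# Copy the remaining tensor data through unchanged, 1 MiB at a time.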
def copy_all_data(f_out, f_in):
    while True:
        buf = f_in.read(1024 * 1024)
        if not buf:
            break
        f_out.write(buf)
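
# Convert a single model file: write the upgraded copy to <path>.tmp, keep the
# original as <path>.orig, then move the new file into place.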
def convert_one_file(path_in, tokenizer):
    path_tmp = f"{path_in}.tmp"
    path_orig = f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
        write_header(f_out, read_header(f_in))
        read_tokens(f_in, tokenizer)
        write_tokens(f_out, tokenizer)
        copy_all_data(f_out, f_in)
    os.rename(path_in, path_orig)
    os.rename(path_tmp, path_in)
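
# Load the tokenizer and upgrade the model given on the command line.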
def main():
    args = parse_args()
    tokenizer = SentencePieceProcessor(args.tokenizer_model)
    convert_one_file(args.gpt4all_model, tokenizer)


if __name__ == "__main__":
    main()