# convert-pth-to-ggml.py
# Convert a LLaMA model checkpoint to a ggml compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
#   - Number of dimensions (int)
#   - Name length (int)
#   - Data type (int; 0: float32, 1: float16)
#   - Dimensions (int[n_dims], innermost dimension first)
#   - Name (char[name_length])
#   - Data (one value per element, i.e. product of the dimensions)
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by passing 0 as the "ftype" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
import argparse
import json
import os
import struct
import sys
from typing import BinaryIO

import numpy as np
import torch
from sentencepiece import SentencePieceProcessor
  27. def parse_args():
  28. parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
  29. parser.add_argument('dir_model', help='directory containing the model checkpoint')
  30. parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')
  31. return parser.parse_args()
  32. def get_n_parts(dim):
  33. mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
  34. n_parts = mappings.get(dim)
  35. if n_parts is None:
  36. print(f"Invalid dim: {dim}")
  37. sys.exit(1)
  38. print(f"n_parts = {n_parts}\n")
  39. return n_parts
  40. def load_hparams_and_tokenizer(dir_model):
  41. # `dir_model` is something like `models/7B` or `models/7B/`.
  42. # "tokenizer.model" is expected under model's parent dir.
  43. # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
  44. # Let's use the model's parent dir directly.
  45. model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
  46. fname_hparams = f"{dir_model}/params.json"
  47. fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
  48. with open(fname_hparams, "r") as f:
  49. hparams = json.load(f)
  50. print(hparams)
  51. tokenizer = SentencePieceProcessor(fname_tokenizer)
  52. hparams.update({"vocab_size": tokenizer.vocab_size()})
  53. return hparams, tokenizer
  54. def write_header(fout, hparams, ftype):
  55. keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
  56. values = [
  57. 0x67676d66, # magic: ggml in hex
  58. 1, # file version
  59. *[hparams[key] for key in keys],
  60. hparams["dim"] // hparams["n_heads"], # rot (obsolete)
  61. ftype
  62. ]
  63. fout.write(struct.pack("i" * len(values), *values))
  64. def write_tokens(fout, tokenizer):
  65. for i in range(tokenizer.vocab_size()):
  66. if tokenizer.is_unknown(i):
  67. text = " \u2047 ".encode("utf-8")
  68. elif tokenizer.is_control(i):
  69. text = b""
  70. elif tokenizer.is_byte(i):
  71. piece = tokenizer.id_to_piece(i)
  72. if len(piece) != 6:
  73. print(f"Invalid token: {piece}")
  74. sys.exit(1)
  75. byte_value = int(piece[3:-1], 16)
  76. text = struct.pack("B", byte_value)
  77. else:
  78. text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
  79. fout.write(struct.pack("i", len(text)))
  80. fout.write(text)
  81. fout.write(struct.pack("f", tokenizer.get_score(i)))
  82. def process_and_write_variables(fout, model, ftype):
  83. for name, datao in model.items():
  84. if name.endswith("freqs"):
  85. continue
  86. shape = datao.shape
  87. print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
  88. data = datao.numpy().squeeze()
  89. n_dims = len(shape)
  90. # default type is fp16
  91. ftype_cur = 1
  92. if ftype == 0 or n_dims == 1:
  93. print(" Converting to float32")
  94. data = data.astype(np.float32)
  95. ftype_cur = 0
  96. # header
  97. sname = name.encode('utf-8')
  98. fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
  99. for dim in reversed(data.shape):
  100. fout.write(struct.pack("i", dim))
  101. fout.write(sname)
  102. # data output to file
  103. data.tofile(fout)
  104. def main():
  105. args = parse_args()
  106. dir_model = args.dir_model
  107. ftype = args.ftype
  108. ftype_str = ["f32", "f16"]
  109. hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
  110. n_parts = get_n_parts(hparams["dim"])
  111. for p in range(n_parts):
  112. print(f"Processing part {p}\n")
  113. fname_model = f"{dir_model}/consolidated.0{p}.pth"
  114. fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
  115. model = torch.load(fname_model, map_location="cpu")
  116. with open(fname_out, "wb") as fout:
  117. write_header(fout, hparams, ftype)
  118. write_tokens(fout, tokenizer)
  119. process_and_write_variables(fout, model, ftype)
  120. del model
  121. print(f"Done. Output file: {fname_out}, (part {p})\n")
  122. if __name__ == "__main__":
  123. main()