| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- # Convert a LLaMA model checkpoint to a ggml compatible file
- #
- # Load the model using Torch
- # Iterate over all variables and write them to a binary file.
- #
- # For each variable, write the following:
- # - Number of dimensions (int)
- # - Name length (int)
- # - Data type (int; 0: float32, 1: float16)
- # - Dimensions (int[n_dims]), in reverse order
- # - Name (char[name_length])
- # - Data (the tensor's elements, as float32 or float16)
- #
- # At the start of the ggml file we write the model parameters
- # and vocabulary.
- #
- import argparse
- import os
- import sys
- import json
- import struct
- import numpy as np
- import torch
- from sentencepiece import SentencePieceProcessor
def parse_args():
    """Parse the command-line arguments of the conversion script.

    Positional arguments:
        dir_model:  directory containing the model checkpoint.
        ftype:      output file type (0: float32, 1: float16). Optional;
                    defaults to 1 when omitted.
        vocab_only: optional integer flag; a non-zero value means only the
                    vocabulary is written. Defaults to 0.

    Returns:
        argparse.Namespace with the attributes ``dir_model``, ``ftype`` and
        ``vocab_only``.
    """
    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    parser.add_argument('dir_model', help='directory containing the model checkpoint')
    # A positional argument only honours `default` when it is optional
    # (nargs='?'); without it the declared default of 1 could never apply.
    parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1, nargs='?')
    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
    return parser.parse_args()
def get_n_parts(dim):
    """Return the number of checkpoint parts for a model of width `dim`.

    Only the widths 4096, 5120, 6656 and 8192 are known. Any other value
    prints an error message and terminates the process with exit status 1.
    """
    parts_by_dim = {4096: 1, 5120: 2, 6656: 4, 8192: 8}

    # Guard clause: bail out early on an unknown model width.
    if dim not in parts_by_dim:
        print(f"Invalid dim: {dim}")
        sys.exit(1)

    n_parts = parts_by_dim[dim]
    print(f"n_parts = {n_parts}\n")
    return n_parts
def load_hparams_and_tokenizer(dir_model):
    """Load the model hyperparameters and the SentencePiece tokenizer.

    Args:
        dir_model: model directory, e.g. `models/7B` or `models/7B/`. It must
            contain "params.json"; "tokenizer.model" is expected in its
            parent directory.

    Returns:
        A `(hparams, tokenizer)` tuple. `hparams` is the dict read from
        "params.json" with an added "vocab_size" entry taken from the
        tokenizer.
    """
    # The parent directory is derived from the path string itself instead of
    # appending "..": when `dir_model` is a symlink, f"{dir_model}/../tokenizer.model"
    # would not be found.
    # abspath() also covers a bare relative name such as `7B`, whose dirname
    # would otherwise be "" and make the tokenizer path point at the root "/".
    model_parent_dir = os.path.dirname(os.path.abspath(dir_model))

    fname_hparams = os.path.join(dir_model, "params.json")
    fname_tokenizer = os.path.join(model_parent_dir, "tokenizer.model")

    with open(fname_hparams, "r", encoding="utf-8") as f:
        hparams = json.load(f)
        print(hparams)

    tokenizer = SentencePieceProcessor(fname_tokenizer)
    hparams.update({"vocab_size": tokenizer.vocab_size()})

    return hparams, tokenizer
def write_header(fout, hparams, ftype):
    """Write the file header to `fout` as a sequence of native C ints.

    Field order: magic, file version, vocab_size, dim, multiple_of, n_heads,
    n_layers, rot (dim // n_heads, obsolete), ftype.
    """
    header = [
        0x67676d66,  # magic: ggmf in hex
        1,           # file version
    ]
    header.extend(
        hparams[name]
        for name in ("vocab_size", "dim", "multiple_of", "n_heads", "n_layers")
    )
    header.append(hparams["dim"] // hparams["n_heads"])  # rot (obsolete)
    header.append(ftype)

    fout.write(struct.pack(f"{len(header)}i", *header))
def _encode_token(tokenizer, token_id):
    """Return the bytes stored in the vocabulary for one token id."""
    if tokenizer.is_unknown(token_id):
        return " \u2047 ".encode("utf-8")
    if tokenizer.is_control(token_id):
        return b""
    if tokenizer.is_byte(token_id):
        # Byte pieces must be exactly six characters; the two hex digits are
        # taken from piece[3:-1].
        piece = tokenizer.id_to_piece(token_id)
        if len(piece) != 6:
            print(f"Invalid token: {piece}")
            sys.exit(1)
        return struct.pack("B", int(piece[3:-1], 16))
    # Regular piece: the U+2581 marker is replaced by a plain space.
    return tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode("utf-8")


def write_tokens(fout, tokenizer):
    """Write every vocabulary entry of `tokenizer` to `fout`.

    Each entry is: byte length (int), the token bytes, the token score (float).
    An invalid byte token prints a message and exits with status 1.
    """
    for token_id in range(tokenizer.vocab_size()):
        token_bytes = _encode_token(tokenizer, token_id)
        fout.write(struct.pack("i", len(token_bytes)))
        fout.write(token_bytes)
        fout.write(struct.pack("f", tokenizer.get_score(token_id)))
def process_and_write_variables(fout, model, ftype):
    """Write every tensor of `model` to `fout`.

    Args:
        fout: binary file opened for writing. It must be a real file object,
            because the tensor data is written with `ndarray.tofile`.
        model: mapping of variable name to tensor; each tensor must provide
            `.shape`, `.dtype` and `.numpy()`.
        ftype: requested file type (0: float32, 1: float16).

    Variables whose name ends with "freqs" are skipped. For every other
    variable the record is: number of dimensions, name length and data type
    (three ints), the dimensions in reverse order (ints), the UTF-8 name, and
    the raw tensor data.
    """
    for name, datao in model.items():
        if name.endswith("freqs"):
            continue

        shape = datao.shape
        print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")

        data = datao.numpy().squeeze()
        # The float32 rule below looks at the rank of the original
        # (unsqueezed) shape; the header uses the squeezed shape.
        n_dims = len(shape)

        if ftype == 0 or n_dims == 1:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        else:
            # default type is fp16: convert so the data always matches the
            # type tag in the header. copy=False leaves tensors that are
            # already fp16 untouched.
            data = data.astype(np.float16, copy=False)
            ftype_cur = 1

        # header
        sname = name.encode('utf-8')
        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
        for dim in reversed(data.shape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

        # data output to file
        data.tofile(fout)
def main():
    """Convert the checkpoint in the given model directory to ggml files.

    With `vocab_only` set, writes only the header and vocabulary to
    "ggml-vocab.bin" in the model directory. Otherwise writes one
    "ggml-model-<f32|f16>.bin[.N]" file per checkpoint part, each containing
    the header, the vocabulary and that part's variables.
    """
    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]

    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:
        fname_model = f"{dir_model}/consolidated.00.pth"
        fname_out = f"{dir_model}/ggml-vocab.bin"
        print(f"Extracting only the vocab from '{fname_model}'\n")
        # The header and vocabulary come from the hyperparameters and the
        # tokenizer only, so the checkpoint itself is not loaded here.
        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
        print(f"Done. Output file: {fname_out}\n")
        return

    n_parts = get_n_parts(hparams["dim"])

    for p in range(n_parts):
        print(f"Processing part {p}\n")

        fname_model = f"{dir_model}/consolidated.0{p}.pth"
        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"

        # NOTE: torch.load unpickles the checkpoint, which can execute
        # arbitrary code; only convert checkpoints from a trusted source.
        model = torch.load(fname_model, map_location="cpu")

        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
            process_and_write_variables(fout, model, ftype)

        # Release this part before the next one is loaded.
        del model

        print(f"Done. Output file: {fname_out}, (part {p})\n")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|