@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+#
+# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
+#
+
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
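+# LLaMA hyperparameters stored in the old ggml header, in the order they are packed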
+HPARAMS = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
+    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
+    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    return parser.parse_args()
+
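+# Read the old-style header: magic, the five hparams, rot and ftype, all packed as int32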
+def read_header(f_in):
+    struct_fmt = "i" * (3 + len(HPARAMS))
+    struct_size = struct.calcsize(struct_fmt)
+    buf = f_in.read(struct_size)
+    return struct.unpack(struct_fmt, buf)
+
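+# Write the new-style header: new magic, a file version field, then the same fields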
+def write_header(f_out, header):
+    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
+
+    if magic != 0x67676d6c:
+        raise Exception('Invalid file magic. Must be an old style ggml file.')
+
+    values = [
+        0x67676d66,  # magic: "ggmf" in hex
+        1,  # file version
+        vocab_size,
+        dim,
+        multiple_of,
+        n_heads,
+        n_layers,
+        rot,
+        ftype
+    ]
+    f_out.write(struct.pack("i" * len(values), *values))
+
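+# Write the vocabulary in the new format: int32 length, UTF-8 text and a float32 score per token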
+def write_tokens(fout, tokenizer):
+    for i in range(tokenizer.vocab_size()):
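+        # Unknown, control and byte tokens need a special text representation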
+        if tokenizer.is_unknown(i):
+            text = " \u2047 ".encode("utf-8")
+        elif tokenizer.is_control(i):
+            text = b""
+        elif tokenizer.is_byte(i):
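+            # Byte tokens look like "<0xAB>"; store the raw byte value itself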
+            piece = tokenizer.id_to_piece(i)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            byte_value = int(piece[3:-1], 16)
+            text = struct.pack("B", byte_value)
+        else:
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
+
+    # TODO: GPT4All - add extra <pad> token
+    text = "<pad>".encode("utf-8")
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", 0.0))
+
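+# Skip over the old vocabulary block: length-prefixed token text, no scores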
+def read_tokens(f_in, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        len_b = f_in.read(4)
+        (length,) = struct.unpack("i", len_b)
+        f_in.read(length)
+
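+# Copy the remaining tensor data through unchanged, 1 MiB at a time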
+def copy_all_data(f_out, f_in):
+    while True:
+        buf = f_in.read(1024 * 1024)
+        if not buf:
+            break
+        f_out.write(buf)
+
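+# Convert in place: write to <path>.tmp, then keep the original as <path>.orig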
+def convert_one_file(path_in, tokenizer):
+    path_tmp = f"{path_in}.tmp"
+    path_orig = f"{path_in}.orig"
+    print(f"converting {path_in}")
+    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
+        write_header(f_out, read_header(f_in))
+        read_tokens(f_in, tokenizer)
+        write_tokens(f_out, tokenizer)
+        copy_all_data(f_out, f_in)
+    os.rename(path_in, path_orig)
+    os.rename(path_tmp, path_in)
+
+def main():
+    args = parse_args()
+
+    tokenizer = SentencePieceProcessor(args.tokenizer_model)
+
+    convert_one_file(args.gpt4all_model, tokenizer)
+
+if __name__ == "__main__":
+    main()