@@ -19,9 +19,10 @@ import numpy as np
#
 
GGUF_MAGIC = 0x46554747
-GGUF_VERSION = 2
+GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32
 
+
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
Q6_K = 14
Q8_K = 15
 
+class GGUFEndian(IntEnum):
+ LITTLE = 0
+ BIG = 1
+
 
class GGUFValueType(IntEnum):
UINT8 = 0
@@ -644,18 +649,41 @@ class GGUFWriter:
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
tensors: list[tuple[np.ndarray[Any, Any], int]]
 
- def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
+ @property
+ def pack_prefix(self):
+ if self.endianess==GGUFEndian.LITTLE:
+ return "<"
+ else:
+ return ">"
+
+ def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
self.fout = open(path, "wb")
self.arch = arch
+ self.endianess = endianess
+ self._simple_value_packing = {
+ GGUFValueType.UINT8: f"{self.pack_prefix}B",
+ GGUFValueType.INT8: f"{self.pack_prefix}b",
+ GGUFValueType.UINT16: f"{self.pack_prefix}H",
+ GGUFValueType.INT16: f"{self.pack_prefix}h",
+ GGUFValueType.UINT32: f"{self.pack_prefix}I",
+ GGUFValueType.INT32: f"{self.pack_prefix}i",
+ GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
+ GGUFValueType.UINT64: f"{self.pack_prefix}Q",
+ GGUFValueType.INT64: f"{self.pack_prefix}q",
+ GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
+ GGUFValueType.BOOL: "?" ,
+ }
self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
+ endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
+ print(f"This gguf file is for {endianess_str} only")
 
def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC))
- self.fout.write(struct.pack("<I", GGUF_VERSION))
- self.fout.write(struct.pack("<Q", self.ti_data_count))
- self.fout.write(struct.pack("<Q", self.kv_data_count))
+ self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
+ self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
+ self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
self.flush()
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
 
@@ -727,25 +755,12 @@ class GGUFWriter:
self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY)
 
- _simple_value_packing = {
- GGUFValueType.UINT8: "<B",
- GGUFValueType.INT8: "<b",
- GGUFValueType.UINT16: "<H",
- GGUFValueType.INT16: "<h",
- GGUFValueType.UINT32: "<I",
- GGUFValueType.INT32: "<i",
- GGUFValueType.FLOAT32: "<f",
- GGUFValueType.UINT64: "<Q",
- GGUFValueType.INT64: "<q",
- GGUFValueType.FLOAT64: "<d",
- GGUFValueType.BOOL: "?" ,
- }
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)
 
if add_vtype:
- self.kv_data += struct.pack("<I", vtype)
+ self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
self.kv_data_count += 1
 
pack_fmt = self._simple_value_packing.get(vtype)
@@ -753,14 +768,14 @@ class GGUFWriter:
self.kv_data += struct.pack(pack_fmt, val)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
- self.kv_data += struct.pack("<Q", len(encoded_val))
+ self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
- self.kv_data += struct.pack("<I", ltype)
- self.kv_data += struct.pack("<Q", len(val))
+ self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
+ self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
for item in val:
self.add_val(item, add_vtype=False)
else:
@@ -774,22 +789,24 @@ class GGUFWriter:
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
 
encoded_name = name.encode("utf8")
- self.ti_data += struct.pack("<Q", len(encoded_name))
+ self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
- self.ti_data += struct.pack("<I", n_dims)
+ self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
for i in range(n_dims):
- self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+ self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
- self.ti_data += struct.pack("<I", dtype)
- self.ti_data += struct.pack("<Q", self.offset_tensor)
+ self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
+ self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
 
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
+ if self.endianess == GGUFEndian.BIG:
+ tensor.byteswap(inplace=True)
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
@@ -815,6 +832,8 @@ class GGUFWriter:
fp.write(bytes([0] * pad))
 
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
+ if self.endianess==GGUFEndian.BIG:
+ tensor.byteswap(inplace=True)
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
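
Outside the diff itself, here is a minimal usage sketch of the new endianess parameter. It is illustrative only: the output file name, metadata key, and tensor name are made up, and it assumes GGUFEndian is importable from the gguf package alongside GGUFWriter. It simply drives the writer through write_header_to_file, write_kv_data_to_file, and write_tensors_to_file. Note that with a big-endian target, add_tensor byteswaps the passed numpy array in place.

import numpy as np
import gguf

# Sketch only: emit a small GGUF file whose multi-byte header/KV fields and
# tensor data are packed big-endian; the key and tensor names are hypothetical.
writer = gguf.GGUFWriter("example-be.gguf", "llama", endianess=gguf.GGUFEndian.BIG)
writer.add_uint32("example.block_count", 32)  # hypothetical metadata key
writer.add_tensor("example.weight", np.ones((4, 4), dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()  # tensor-info section, then the (byteswapped) tensor data
writer.close()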