#!/usr/bin/env python3
from __future__ import annotations

import argparse
import concurrent.futures
import enum
import faulthandler
import functools
import itertools
import json
import math
import mmap
import os
import pickle
import re
import signal
import struct
import sys
import time
import warnings
import zipfile
from abc import ABCMeta, abstractmethod
from argparse import ArgumentParser
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    Callable,
    Iterable,
    Literal,
    Optional,
    Tuple,
    TypeVar,
)

import numpy as np
from sentencepiece import SentencePieceProcessor

try:
    from transformers import AutoTokenizer
except ModuleNotFoundError as e:
    warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")

# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
if "NO_LOCAL_GGUF" not in os.environ:
    # Use absolute path to the gguf-py directory
    gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
    print(gguf_py_dir)  # NOTE: Remove this once path is verified after changes are completed
    if gguf_py_dir not in sys.path:
        sys.path.insert(1, gguf_py_dir)

# Import gguf module
try:
    import gguf
except ModuleNotFoundError as e:
    print(f"Could not import gguf: {e}")
    sys.exit(1)

if TYPE_CHECKING:  # NOTE: This isn't necessary.
    from typing import TypeAlias  # This can technically be omitted.

if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
    faulthandler.register(signal.SIGUSR1)

# NOTE: n-dimensional arrays should be directly referenced
NDArray: TypeAlias = "np.ndarray[Any, Any]"

# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
ARCH = gguf.MODEL_ARCH.LLAMA

DEFAULT_CONCURRENCY = 8

#
# data types
#
# TODO: Clean up and refactor data types

@dataclass(frozen=True)
class DataType:
    name: str
    dtype: np.dtype[Any]
    valid_conversions: list[str]

    def elements_to_bytes(self, n_elements: int) -> int:
        return n_elements * self.dtype.itemsize

@dataclass(frozen=True)
class UnquantizedDataType(DataType):
    pass

DT_F16  = UnquantizedDataType('F16',  dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
DT_F32  = UnquantizedDataType('F32',  dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_I32  = UnquantizedDataType('I32',  dtype = np.dtype(np.int16),   valid_conversions = [])
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16),  valid_conversions = ['F32', 'F16', 'Q8_0'])

@dataclass(frozen=True)
class QuantizedDataType(DataType):
    block_size: int
    quantized_dtype: np.dtype[Any]
    ggml_type: gguf.GGMLQuantizationType

    def quantize(self, arr: NDArray) -> NDArray:
        raise NotImplementedError(f'Quantization for {self.name} not implemented')

    def elements_to_bytes(self, n_elements: int) -> int:
        assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
        return self.quantized_dtype.itemsize * (n_elements // self.block_size)

@dataclass(frozen=True)
class Q8_0QuantizedDataType(QuantizedDataType):
    # Mini Q8_0 quantization in Python!
    def quantize(self, arr: NDArray) -> NDArray:
        assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
        assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
        n_blocks = arr.size // self.block_size
        blocks = arr.reshape((n_blocks, self.block_size))

        # Much faster implementation of block quantization contributed by @Cebtenzzre
        def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
            d = abs(blocks).max(axis = 1) / np.float32(127)
            with np.errstate(divide = 'ignore'):
                qs = (blocks / d[:, None]).round()
            qs[d == 0] = 0
            yield from zip(d, qs)
        return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)

DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
                                dtype = np.dtype(np.float32), valid_conversions = [],
                                ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
                                quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
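
# The quantized dtype above mirrors ggml's block_q8_0 layout: each block of 32
# float32 values is stored as one float16 scale `d` plus 32 int8 quants `qs`,
# i.e. 34 bytes per 32 elements. Illustrative example (not part of the
# conversion pipeline itself):
#   >>> arr = np.arange(32, dtype=np.float32)
#   >>> blk = DT_Q8_0.quantize(arr)[0]
#   >>> blk['d'] * blk['qs']          # approximately reconstructs `arr`
#   >>> DT_Q8_0.elements_to_bytes(64)
#   68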

# Quantized types skipped here because they may also map to np.float32
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
    if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
        raise ValueError(f'Invalid duplicate data type {dt}')
    NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt

SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
    'BF16': DT_BF16,
    'F16': DT_F16,
    'F32': DT_F32,
    'I32': DT_I32,
}

# TODO: match this with `llama_ftype`
# TODO: rename to LLAMAFileType
# TODO: move to `gguf.py`
class GGMLFileType(enum.IntEnum):
    AllF32     = 0
    MostlyF16  = 1  # except 1d tensors
    MostlyQ8_0 = 7  # except 1d tensors

    def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
        if dt is None:
            raise ValueError(self)
        # 1D tensors are always F32.
        return dt if len(tensor.shape) > 1 else DT_F32

GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
    GGMLFileType.AllF32    : DT_F32,
    GGMLFileType.MostlyF16 : DT_F16,
    GGMLFileType.MostlyQ8_0: DT_Q8_0,
}
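
# Illustrative example of the "except 1d tensors" rule above (assumes two
# hypothetical LazyTensors: `t2d` with shape [4096, 4096] and `t1d` with
# shape [4096]):
#   >>> GGMLFileType.MostlyQ8_0.type_for_tensor("blk.0.attn_q.weight", t2d)     # -> DT_Q8_0
#   >>> GGMLFileType.MostlyQ8_0.type_for_tensor("blk.0.attn_norm.weight", t1d)  # -> DT_F32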

#
# hparams loading
#

@dataclass
class Params:
    n_vocab: int
    n_embd: int
    n_layer: int
    n_ctx: int
    n_ff: int
    n_head: int
    n_head_kv: int
    f_norm_eps: Optional[float] = None
    n_experts: Optional[int] = None
    n_experts_used: Optional[int] = None
    rope_scaling_type: Optional[gguf.RopeScalingType] = None
    f_rope_freq_base: Optional[float] = None
    f_rope_scale: Optional[float] = None
    n_orig_ctx: Optional[int] = None
    rope_finetuned: Optional[bool] = None
    ftype: Optional[GGMLFileType] = None

    # path to the directory containing the model files
    path_model: Optional[Path] = None

    @staticmethod
    def guessed(model: LazyModel) -> "Params":
        # try transformer naming first
        n_vocab, n_embd = (
            model["model.embed_tokens.weight"].shape
            if "model.embed_tokens.weight" in model
            else model["tok_embeddings.weight"].shape
        )

        # try transformer naming first
        if "model.layers.0.self_attn.q_proj.weight" in model:
            n_layer = next(
                i
                for i in itertools.count()
                if f"model.layers.{i}.self_attn.q_proj.weight" not in model
            )
        elif (
            "model.layers.0.self_attn.W_pack.weight" in model
        ):  # next: try baichuan naming
            n_layer = next(
                i
                for i in itertools.count()
                if f"model.layers.{i}.self_attn.W_pack.weight" not in model
            )
        else:
            n_layer = next(
                i
                for i in itertools.count()
                if f"layers.{i}.attention.wq.weight" not in model
            )

        if n_layer < 1:
            raise Exception(
                "failed to guess 'n_layer'. This model is unknown or unsupported.\n"
                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
            )

        n_head = n_embd // 128  # guessed
        n_mult = 256  # guessed

        # TODO: verify this
        n_ff = int(2 * (4 * n_embd) / 3)
        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)

        return Params(
            n_vocab=n_vocab,
            n_embd=n_embd,
            n_layer=n_layer,
            n_ctx=-1,
            n_ff=n_ff,
            n_head=n_head,
            n_head_kv=n_head,
            f_norm_eps=1e-5,
        )
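
    # Sanity check for the guessed feed-forward size (illustrative, LLaMA-7B-like
    # shapes assumed): for n_embd = 4096, int(2 * (4 * 4096) / 3) = 10922, and
    # rounding up to the next multiple of n_mult = 256 gives n_ff = 11008, which
    # matches the published LLaMA-7B configuration.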

    @staticmethod
    def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
        config = json.load(open(config_path))

        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
        rope_scaling = config.get("rope_scaling")

        if rope_scaling is not None and (typ := rope_scaling.get("type")):
            rope_factor = rope_scaling.get("factor")
            f_rope_scale = rope_factor
            if typ == "linear":
                rope_scaling_type = gguf.RopeScalingType.LINEAR
            elif typ == "yarn":
                rope_scaling_type = gguf.RopeScalingType.YARN
                n_orig_ctx = rope_scaling["original_max_position_embeddings"]
                rope_finetuned = rope_scaling["finetuned"]
            else:
                raise NotImplementedError(f"Unknown rope scaling type: {typ}")

        if "max_sequence_length" in config:
            n_ctx = config["max_sequence_length"]
        elif "max_position_embeddings" in config:
            n_ctx = config["max_position_embeddings"]
        else:
            raise Exception(
                "failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
            )

        n_experts = None
        n_experts_used = None

        if "num_local_experts" in config:
            n_experts = config["num_local_experts"]
            n_experts_used = config["num_experts_per_tok"]

        return Params(
            n_vocab=config["vocab_size"],
            n_embd=config["hidden_size"],
            n_layer=config["num_hidden_layers"],
            n_ctx=n_ctx,
            n_ff=config["intermediate_size"],
            n_head=(n_head := config["num_attention_heads"]),
            n_head_kv=config.get("num_key_value_heads", n_head),
            n_experts=n_experts,
            n_experts_used=n_experts_used,
            f_norm_eps=config["rms_norm_eps"],
            f_rope_freq_base=config.get("rope_theta"),
            rope_scaling_type=rope_scaling_type,
            f_rope_scale=f_rope_scale,
            n_orig_ctx=n_orig_ctx,
            rope_finetuned=rope_finetuned,
        )

    # LLaMA v2 70B params.json
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
    @staticmethod
    def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
        config = json.load(open(config_path))

        n_experts = None
        n_experts_used = None
        f_rope_freq_base = None

        # hack to determine LLaMA v1 vs v2 vs CodeLlama
        if config.get("moe"):
            # Mixtral
            n_ctx = 32768
        elif config.get("rope_theta") == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
            # LLaMA v2
            n_ctx = 4096
        else:
            # LLaMA v1
            n_ctx = 2048

        if "layers.0.feed_forward.w1.weight" in model:
            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]

        if config.get("moe"):
            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
            n_experts = config["moe"]["num_experts"]
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6

        return Params(
            n_vocab=model["tok_embeddings.weight"].shape[0],
            n_embd=config["dim"],
            n_layer=config["n_layers"],
            n_ctx=n_ctx,
            n_ff=n_ff,
            n_head=(n_head := config["n_heads"]),
            n_head_kv=config.get("n_kv_heads", n_head),
            n_experts=n_experts,
            n_experts_used=n_experts_used,
            f_norm_eps=config["norm_eps"],
            f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
        )

    @staticmethod
    def load(model_plus: ModelPlus) -> "Params":
        hf_config_path = model_plus.paths[0].parent / "config.json"
        orig_config_path = model_plus.paths[0].parent / "params.json"

        if hf_config_path.exists():
            params = Params.load_transformers_config(model_plus.model, hf_config_path)
        elif orig_config_path.exists():
            params = Params.load_torch_params(model_plus.model, orig_config_path)
        elif model_plus.format != "none":
            params = Params.guessed(model_plus.model)
        else:
            raise ValueError("Cannot guess params when model format is none")

        params.path_model = model_plus.paths[0].parent
        return params

class BpeVocab:  # GPT
    def __init__(
        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
    ) -> None:
        self.bpe_tokenizer = json.loads(
            open(str(fname_tokenizer), encoding="utf-8").read()
        )
        self.vocab = self.bpe_tokenizer["model"]["vocab"]
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            # Fall back to trying to find the added tokens in tokenizer.json
            tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json"
            if not tokenizer_json_file.is_file():
                added_tokens = {}
            else:
                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
                added_tokens = dict(
                    (item["content"], item["id"])
                    for item in tokenizer_json.get("added_tokens", [])
                    # Added tokens here can be duplicates of the main vocabulary.
                    if item["content"] not in self.vocab
                )

        vocab_size: int = len(self.vocab)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
            raise Exception(
                f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
            )

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict = added_tokens
        self.added_tokens_list = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
        for i, _ in enumerate(self.vocab):
            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.bpe_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

class SentencePieceVocab:  # LLaMA
    def __init__(
        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
    ) -> None:
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            added_tokens = {}

        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()

        new_tokens = {
            id: piece for piece, id in added_tokens.items() if id >= vocab_size
        }
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
        actual_new_ids = sorted(new_tokens.keys())

        if expected_new_ids != actual_new_ids:
            raise ValueError(
                f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
            )

        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base = vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.id_to_piece(i)
            text: bytes = piece.encode("utf-8")
            score: float = tokenizer.get_score(i)

            toktype = gguf.TokenType.NORMAL
            if tokenizer.is_unknown(i):
                toktype = gguf.TokenType.UNKNOWN
            if tokenizer.is_control(i):
                toktype = gguf.TokenType.CONTROL

            # NOTE: I think added_tokens are user defined.
            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

            if tokenizer.is_unused(i):
                toktype = gguf.TokenType.UNUSED
            if tokenizer.is_byte(i):
                toktype = gguf.TokenType.BYTE

            yield text, score, toktype

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.sentencepiece_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

class HfVocab:
    def __init__(
        self,
        fname_tokenizer: Path,
        fname_added_tokens: Optional[Path] = None,
    ) -> None:
        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
        self.tokenizer = AutoTokenizer.from_pretrained(
            fname_tokenizer,
            cache_dir=fname_tokenizer,
            local_files_only=True,
        )

        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
        self.added_tokens_ids = set()

        # Process added tokens
        for tok, tokidx in sorted(
            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
        ):
            # Only consider added tokens that are not in the base vocabulary
            if tokidx >= self.tokenizer.vocab_size:
                self.added_tokens_list.append(tok)
                self.added_tokens_dict[tok] = tokidx
                self.added_tokens_ids.add(tokidx)

        # Store special tokens and their IDs
        self.specials = {
            tok: self.tokenizer.get_vocab()[tok]
            for tok in self.tokenizer.all_special_tokens
        }
        self.special_ids = set(self.tokenizer.all_special_ids)

        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }

        for token_id in range(self.vocab_size_base):
            # Skip processing added tokens here
            if token_id in self.added_tokens_ids:
                continue

            # Convert token text to bytes
            token_text = reverse_vocab[token_id].encode("utf-8")

            # Yield token text, score, and type
            yield token_text, self.get_token_score(token_id), self.get_token_type(
                token_id, self.special_ids  # Reuse already stored special IDs
            )

    def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
        # Determine token type based on whether it's a special token
        return (
            gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
        )

    def get_token_score(self, token_id: int) -> float:
        # Placeholder for actual logic to determine the token's score
        # This needs to be implemented based on specific requirements
        return -1000.0  # Default score

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            if text in self.specials:
                toktype = self.get_token_type(self.specials[text], self.special_ids)
                score = self.get_token_score(self.specials[text])
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0

            yield text.encode("utf-8"), score, toktype

    def has_newline_token(self):
        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.hf_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"

#
# data loading
# TODO: reuse (probably move to gguf.py?)
#

def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
    # print("permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_head_kv))
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))
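
# The reshape/swapaxes above undoes the row interleaving that Hugging Face
# checkpoints use for the rotary-embedding halves: rows are regrouped via a
# (n_head, 2, head_dim // 2, ...) view and flattened back in llama.cpp's
# expected order. Illustrative shapes (assumed, not from a specific model):
# for a q_proj weight of shape (4096, 4096) with n_head = 32, the intermediate
# view is (32, 2, 64, 4096) before being reshaped back to (4096, 4096).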

class Tensor(metaclass=ABCMeta):
    data_type: DataType

    @abstractmethod
    def astype(self, data_type: DataType) -> Tensor: ...
    @abstractmethod
    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
    @abstractmethod
    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
    @abstractmethod
    def part(self, n_part: int) -> UnquantizedTensor: ...
    @abstractmethod
    def to_ggml(self) -> GGMLCompatibleTensor: ...

def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
    fp32_arr = bf16_arr.astype(np.uint32) << 16
    return fp32_arr.view(np.float32)
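
# bfloat16 is the upper 16 bits of an IEEE-754 float32, so widening the raw
# uint16 values and shifting them into the high half reconstructs the original
# value exactly (with the low 16 mantissa bits zeroed).
# Illustrative example: 0x3F80 -> 0x3F800000 -> 1.0
#   >>> bf16_to_fp32(np.array([0x3F80], dtype=np.uint16))
#   array([1.], dtype=float32)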

class UnquantizedTensor(Tensor):
    def __init__(self, ndarray: NDArray) -> None:
        assert isinstance(ndarray, np.ndarray)
        self.ndarray = ndarray
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

    def astype(self, data_type: DataType) -> Tensor:
        dtype = data_type.dtype
        if self.data_type == DT_BF16:
            self.ndarray = bf16_to_fp32(self.ndarray)
        return UnquantizedTensor(self.ndarray.astype(dtype))

    def to_ggml(self) -> UnquantizedTensor:
        return self

    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
        r = self.ndarray.shape[0] // 3
        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))

    def part(self, n_part: int) -> UnquantizedTensor:
        r = self.ndarray.shape[0] // 3
        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

    def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
        return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))

def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
    tensor = lazy_tensor.load()
    assert isinstance(tensor, UnquantizedTensor)

    # double-check:
    actual_shape = list(tensor.ndarray.shape)
    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)

    if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
        if convert:
            tensor.ndarray = tensor.ndarray.astype(expected_dtype)
        else:
            raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')

    return tensor.ndarray

GGMLCompatibleTensor = UnquantizedTensor

@dataclass
class LazyTensor:
    _load: Callable[[], Tensor]
    shape: list[int]
    data_type: DataType
    description: str

    def load(self) -> Tensor:
        ret = self._load()
        # Should be okay if it maps to the same numpy type?
        assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
            (self.data_type, ret.data_type, self.description)
        return ret

    def astype(self, data_type: DataType) -> LazyTensor:
        self.validate_conversion_to(data_type)

        def load() -> Tensor:
            return self.load().astype(data_type)
        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')

    def validate_conversion_to(self, data_type: DataType) -> None:
        if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
            raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')

LazyModel: TypeAlias = 'dict[str, LazyTensor]'

@dataclass
class ModelPlus:
    model: LazyModel
    paths: list[Path]  # Where this was read from.
    format: Literal['ggml', 'torch', 'safetensors', 'none']
    vocab: Vocab | None  # For GGML models (which have vocab built in), the vocab.

def merge_sharded(models: list[LazyModel]) -> LazyModel:
    # Original LLaMA models have each file contain one part of each tensor.
    # Use a dict instead of a set to preserve order.
    names = {name: None for model in models for name in model}

    def convert(name: str) -> LazyTensor:
        lazy_tensors: list[LazyTensor] = [model[name] for model in models]
        if len(lazy_tensors) == 1:
            # only one file; don't go through this procedure since there might
            # be quantized tensors
            return lazy_tensors[0]
        if len(lazy_tensors[0].shape) == 1:
            # the tensor is just duplicated in every file
            return lazy_tensors[0]
        if name.startswith('tok_embeddings.') or \
           name.endswith('.attention.wo.weight') or \
           name.endswith('.feed_forward.w2.weight'):
            # split by columns
            axis = 1
        else:
            # split by rows
            axis = 0
        concatenated_shape = list(lazy_tensors[0].shape)
        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)

        def load() -> UnquantizedTensor:
            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
            return UnquantizedTensor(concatenated)
        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
    return {name: convert(name) for name in names}
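
# Sharding convention assumed above (matches the original LLaMA checkpoints):
# most weights are split along their output dimension and concatenated on
# axis 0, while tok_embeddings, attention.wo and feed_forward.w2 are split
# along the input dimension and concatenated on axis 1. Illustrative shapes
# for a 2-way shard of a (4096, 4096) weight: row-split shards are
# (2048, 4096) each, column-split shards are (4096, 2048) each; both
# concatenate back to (4096, 4096).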

def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    formats = set(mp.format for mp in models_plus)
    assert len(formats) == 1, "different formats?"
    format = formats.pop()
    paths = [path for mp in models_plus for path in mp.paths]
    # Use the first non-None vocab, if any.
    try:
        vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
    except StopIteration:
        vocab = None

    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
        # Transformers models put different tensors in different files, but
        # don't split individual tensors between files.
        model: LazyModel = {}
        for mp in models_plus:
            model.update(mp.model)
    else:
        model = merge_sharded([mp.model for mp in models_plus])

    return ModelPlus(model, paths, format, vocab)

def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
    def load() -> Tensor:
        return lazy_tensor.load().permute(n_head, n_head_kv)
    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
    def load() -> Tensor:
        return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
    s = lazy_tensor.shape.copy()
    s[0] = s[0] // 3
    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
    def load() -> Tensor:
        return lazy_tensor.load().part(n_part)
    s = lazy_tensor.shape.copy()
    s[0] = s[0] // 3
    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
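
# part_lazy / permute_part_lazy assume a fused QKV weight (e.g. Baichuan's
# W_pack) that stacks the query, key and value projections along dim 0, so
# each part is exactly one third of the rows; n_part = 0, 1, 2 selects Q, K, V
# respectively (see convert_model_names below).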

# Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of time of writing:
# - https://github.com/pytorch/pytorch/issues/64327
# This allows us to de-shard without multiplying RAM usage, and also
# conveniently drops the PyTorch dependency (though we still need numpy).

@dataclass
class LazyStorageKind:
    data_type: DataType

@dataclass
class LazyStorage:
    load: Callable[[int, int], NDArray]
    kind: LazyStorageKind
    description: str

class LazyUnpickler(pickle.Unpickler):
    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
        super().__init__(fp)
        self.data_base_path = data_base_path
        self.zip_file = zip_file

    def persistent_load(self, pid: Any) -> Any:
        assert pid[0] == 'storage'
        assert isinstance(pid[1], LazyStorageKind)
        data_type = pid[1].data_type
        filename_stem = pid[2]
        filename = f'{self.data_base_path}/{filename_stem}'
        info = self.zip_file.getinfo(filename)

        def load(offset: int, elm_count: int) -> NDArray:
            dtype = data_type.dtype
            fp = self.zip_file.open(info)
            fp.seek(offset * dtype.itemsize)
            size = elm_count * dtype.itemsize
            data = fp.read(size)
            assert len(data) == size
            return np.frombuffer(data, dtype)
        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
        return LazyStorage(load=load, kind=pid[1], description=description)

    @staticmethod
    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
        assert isinstance(storage, LazyStorage)

        def load() -> UnquantizedTensor:
            elm_count = stride[0] * size[0]
            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
        description = f'pickled storage_offset={storage_offset} in {storage.description}'
        return LazyTensor(load, list(size), storage.kind.data_type, description)

    @staticmethod
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

    CLASSES: dict[tuple[str, str], Any] = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ("torch._tensor", "_rebuild_from_type_v2"): getattr(
            rebuild_from_type_v2, "__func__"
        ),
        ("torch._utils", "_rebuild_tensor_v2"): getattr(
            lazy_rebuild_tensor_v2, "__func__"
        ),
        ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
        ("torch", "HalfStorage"): LazyStorageKind(DT_F16),
        ("torch", "FloatStorage"): LazyStorageKind(DT_F32),
        ("torch", "IntStorage"): LazyStorageKind(DT_I32),
        ("torch", "Tensor"): LazyTensor,
    }

    def find_class(self, module: str, name: str) -> Any:
        if not module.startswith('torch'):
            return super().find_class(module, name)
        return self.CLASSES[(module, name)]

def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
    zf = zipfile.ZipFile(outer_fp)
    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
    assert len(pickle_paths) == 1, pickle_paths
    pickle_fp = zf.open(pickle_paths[0], 'r')
    unpickler = LazyUnpickler(pickle_fp,
                              data_base_path=pickle_paths[0][:-4],
                              zip_file=zf)
    model = unpickler.load()
    if 'model' in model: model = model['model']
    as_dict = dict(model.items())
    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)

def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
    header_size, = struct.unpack('<Q', fp.read(8))
    header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
    # Use mmap for the actual data to avoid race conditions with the file offset.
    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
    byte_buf = mapped[8 + header_size:]

    def convert(info: dict[str, Any]) -> LazyTensor:
        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
        numpy_dtype = data_type.dtype
        shape: list[int] = info['shape']
        begin, end = info['data_offsets']
        assert 0 <= begin <= end <= len(byte_buf)
        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
        buf = byte_buf[begin:end]

        def load() -> UnquantizedTensor:
            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
        return LazyTensor(load, shape, data_type, description)
    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
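
# safetensors layout assumed above: the file starts with an 8-byte
# little-endian header length N, followed by N bytes of JSON metadata, then the
# raw tensor data. Each entry's 'data_offsets' are relative to the end of the
# header, which is why byte_buf starts at 8 + header_size.
# Illustrative header entry (values made up, F16 -> 2 bytes per element):
#   {"model.embed_tokens.weight": {"dtype": "F16", "shape": [32000, 4096],
#                                  "data_offsets": [0, 262144000]}}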

def must_read(fp: IO[bytes], length: int) -> bytes:
    ret = fp.read(length)
    if len(ret) < length:
        raise Exception("unexpectedly reached end of file")
    return ret

@functools.lru_cache(maxsize=None)
def lazy_load_file(path: Path) -> ModelPlus:
    fp = open(path, 'rb')
    first8 = fp.read(8)
    fp.seek(0)
    if first8[:2] == b'PK':
        # A zip file, i.e. PyTorch format
        return lazy_load_torch_file(fp, path)
    elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
        # Probably safetensors
        return lazy_load_safetensors_file(fp, path)
    else:
        raise ValueError(f"unknown format: {path}")

In = TypeVar('In')
Out = TypeVar('Out')

def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
    '''Parallel map, but with backpressure. If the caller doesn't call `next`
    fast enough, this will stop calling `func` at some point rather than
    letting results pile up in memory. Specifically, there is a max of one
    output value buffered per thread.'''
    if concurrency < 2:
        yield from map(func, iterable)
        return
    iterable = iter(iterable)
    executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
    if use_processpool_executor:
        executor_class = ProcessPoolExecutor
    else:
        executor_class = ThreadPoolExecutor
    with executor_class(max_workers = max_workers) as executor:
        futures: list[concurrent.futures.Future[Out]] = []
        done = False
        for _ in range(concurrency):
            try:
                futures.append(executor.submit(func, next(iterable)))
            except StopIteration:
                done = True
                break
        while futures:
            result = futures.pop(0).result()
            while not done and len(futures) < concurrency:
                try:
                    futures.append(executor.submit(func, next(iterable)))
                except StopIteration:
                    done = True
                    break
            yield result
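
# Usage sketch (illustrative; `slow_fn` and `items` are placeholders): at most
# `concurrency` calls are in flight and at most one finished result per worker
# is buffered, so memory use stays bounded even if the consumer is slow.
#   >>> for out in bounded_parallel_map(slow_fn, items, concurrency=8):
#   ...     handle(out)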

def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
    # Handle special case where the model's vocab size is not set
    if params.n_vocab == -1:
        raise ValueError(
            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
        )

    # Check for a vocab size mismatch
    if params.n_vocab == vocab.vocab_size:
        print("Ignoring added_tokens.json since model matches vocab size without it.")
        return

    if pad_vocab and params.n_vocab > vocab.vocab_size:
        pad_count = params.n_vocab - vocab.vocab_size
        print(
            f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
        )
        for i in range(1, pad_count + 1):
            vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
            vocab.added_tokens_list.append(f"<dummy{i:05}>")
        vocab.vocab_size = params.n_vocab
        return

    msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
    if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
        msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
    if vocab.vocab_size < params.n_vocab:
        msg += " Add the --pad-vocab option and try again."
    raise Exception(msg)
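
# Illustrative example of the padding path (values assumed): if params.n_vocab
# is 32016 but the tokenizer only provides 32000 tokens, running with
# --pad-vocab appends <dummy00001> ... <dummy00016> so the embedding and output
# matrices keep their expected row count; without --pad-vocab the mismatch is
# reported as an error instead.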

class OutputFile:
    def __init__(
        self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
    ) -> None:
        self.gguf = gguf.GGUFWriter(
            fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
        )

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"

        # TODO: better logic to determine model name
        if params.n_ctx == 4096:
            name = "LLaMA v2"
        elif params.path_model is not None:
            name = str(params.path_model.parent).split("/")[-1]

        self.gguf.add_name(name)
        self.gguf.add_context_length(params.n_ctx)
        self.gguf.add_embedding_length(params.n_embd)
        self.gguf.add_block_count(params.n_layer)
        self.gguf.add_feed_forward_length(params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count(params.n_head)
        self.gguf.add_head_count_kv(params.n_head_kv)

        if params.f_norm_eps is None:
            raise ValueError("f_norm_eps is None")

        self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)

        if params.n_experts:
            self.gguf.add_expert_count(params.n_experts)

        if params.n_experts_used:
            self.gguf.add_expert_used_count(params.n_experts_used)

        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)

        if params.rope_scaling_type:
            assert params.f_rope_scale is not None
            self.gguf.add_rope_scaling_type(params.rope_scaling_type)
            self.gguf.add_rope_scaling_factor(params.f_rope_scale)

        if params.n_orig_ctx is not None:
            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)

        if params.rope_finetuned is not None:
            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)

        if params.ftype is not None:
            self.gguf.add_file_type(params.ftype)

    def handle_tokenizer_model(self, vocab: Vocab) -> str:
        # Map the vocab types to the supported tokenizer models
        tokenizer_model = {
            SentencePieceVocab: "llama",
            HfVocab: "llama",
            BpeVocab: "gpt2",
        }.get(type(vocab))

        # Block if vocab type is not predefined
        if tokenizer_model is None:
            raise ValueError("Unknown vocab type: Not supported")

        return tokenizer_model

    def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
        tokens = []
        scores = []
        toktypes = []

        # NOTE: `all_tokens` returns the base vocabulary and added tokens
        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        assert len(tokens) == vocab.vocab_size

        return tokens, scores, toktypes

    def add_meta_vocab(self, vocab: Vocab) -> None:
        # Handle the tokenizer model
        tokenizer_model = self.handle_tokenizer_model(vocab)

        # Ensure that tokenizer_model is added to the GGUF model
        self.gguf.add_tokenizer_model(tokenizer_model)

        # Extract model vocabulary for model conversion
        tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)

        # Add extracted token information for model conversion
        self.gguf.add_token_list(tokens)
        self.gguf.add_token_scores(scores)
        self.gguf.add_token_types(toktypes)

    def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
        svocab.add_to_gguf(self.gguf)

    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
        n_elements = int(np.prod(tensor.shape))
        raw_dtype = getattr(tensor.data_type, "ggml_type", None)
        data_type = (
            getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
        )
        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
        self.gguf.add_tensor_info(
            name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
        )

    def write_meta(self) -> None:
        self.gguf.write_header_to_file()
        self.gguf.write_kv_data_to_file()

    def write_tensor_info(self) -> None:
        self.gguf.write_ti_data_to_file()

    def close(self) -> None:
        self.gguf.close()

    @staticmethod
    def write_vocab_only(
        fname_out: Path,
        params: Params,
        vocab: Vocab,
        svocab: gguf.SpecialVocab,
        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
        of.add_meta_arch(params)
        of.add_meta_vocab(vocab)
        of.add_meta_special_vocab(svocab)

        of.write_meta()

        of.close()

    @staticmethod
    def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
        name, lazy_tensor = item
        tensor = lazy_tensor.load().to_ggml()
        return (lazy_tensor.data_type, tensor.ndarray)

    @staticmethod
    def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
        dt, arr = item
        if not isinstance(dt, QuantizedDataType):
            return arr
        return dt.quantize(arr)

    @staticmethod
    def write_all(
        fname_out: Path,
        ftype: GGMLFileType,
        params: Params,
        model: LazyModel,
        vocab: Vocab,
        svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY,
        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
        of.add_meta_arch(params)
        of.add_meta_vocab(vocab)
        of.add_meta_special_vocab(svocab)

        # tensor info
        for name, lazy_tensor in model.items():
            of.add_tensor_info(name, lazy_tensor)

        of.write_meta()
        of.write_tensor_info()

        # tensor data
        ndarrays_inner = bounded_parallel_map(
            OutputFile.do_item, model.items(), concurrency=concurrency
        )
        if ftype == GGMLFileType.MostlyQ8_0:
            ndarrays = bounded_parallel_map(
                OutputFile.maybe_do_quantize,
                ndarrays_inner,
                concurrency=concurrency,
                max_workers=concurrency,
                use_processpool_executor=True,
            )
        else:
            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)

        start = time.time()
        for i, ((name, lazy_tensor), ndarray) in enumerate(
            zip(model.items(), ndarrays)
        ):
            elapsed = time.time() - start
            size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
            print(
                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
            )
            of.gguf.write_tensor_data(ndarray)

        of.close()
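
# Writing is a two-stage streaming pipeline: do_item loads (and de-shards)
# tensors lazily in a thread pool, and for Q8_0 output maybe_do_quantize is
# dispatched to a process pool (use_processpool_executor=True) so the
# CPU-heavy per-block quantization can run across cores. Both stages use
# bounded_parallel_map, so only a handful of tensors are materialized in RAM
# at any one time.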

def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type

    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
        return GGMLFileType.AllF32
    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
        return GGMLFileType.MostlyF16
    if output_type_str == "q8_0":
        return GGMLFileType.MostlyQ8_0

    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
    raise Exception(f"Unexpected combination of types: {name_to_type}")

def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
            for (name, tensor) in model.items()}

def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
    should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))

    tmp = model

    # HF models permute or pack some of the tensors, so we need to undo that
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
            print(f"Permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
            # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
            print(f"Unpacking and permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
        else:
            break

    out: LazyModel = {}
    for name, lazy_tensor in model.items():
        tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
        if name_new is None:
            raise Exception(f"Unexpected tensor name: {name}")

        if tensor_type in should_skip:
            print(f"skipping tensor {name_new}")
            continue

        print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
        out[name_new] = lazy_tensor
    return out

def nth_multifile_path(path: Path, n: int) -> Path | None:
    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
    the nth path in the model.
    '''
    # Support the following patterns:
    patterns: list[tuple[str, str]] = [
        # - x.00.pth, x.01.pth, etc.
        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
        (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
        # x.bin, x.bin.1, etc.
        (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
    ]
    for regex, replacement in patterns:
        if re.search(regex, path.name):
            new_path = path.with_name(re.sub(regex, replacement, path.name))
            if new_path.exists():
                return new_path
    return None

def find_multifile_paths(path: Path) -> list[Path]:
    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
    the whole list of paths in the model.
    '''
    ret: list[Path] = []
    for i in itertools.count():
        nth_path = nth_multifile_path(path, i)
        if nth_path is None:
            break
        ret.append(nth_path)
    if not ret:
        # No matches. This should only happen if the file was named, e.g.,
        # foo.0, and there was no file named foo. Oh well, try to process it
        # as a single file.
        return [path]
    return ret
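
# Illustrative example (hypothetical file names): given
# "model-00001-of-00003.safetensors", find_multifile_paths probes
# n = 0, 1, 2, ... via nth_multifile_path and collects every sibling shard
# that exists on disk, so the caller ends up with all "-0000N-of-00003" files
# without having to glob for them explicitly.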
  1080. def load_some_model(path: Path) -> ModelPlus:
  1081. '''Load a model of any supported format.'''
  1082. # Be extra-friendly and accept either a file or a directory:
  1083. if path.is_dir():
  1084. # Check if it's a set of safetensors files first
  1085. globs = ["model-00001-of-*.safetensors", "model.safetensors"]
  1086. files = [file for glob in globs for file in path.glob(glob)]
  1087. if not files:
  1088. # Try the PyTorch patterns too, with lower priority
  1089. globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
  1090. files = [file for glob in globs for file in path.glob(glob)]
  1091. if not files:
  1092. raise Exception(f"Can't find model in directory {path}")
  1093. if len(files) > 1:
  1094. raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
  1095. path = files[0]
  1096. paths = find_multifile_paths(path)
  1097. models_plus: list[ModelPlus] = []
  1098. for path in paths:
  1099. print(f"Loading model file {path}")
  1100. models_plus.append(lazy_load_file(path))
  1101. model_plus = merge_multifile_models(models_plus)
  1102. return model_plus


class VocabFactory:
    def __init__(self, path: Path):
        self.path = path
        self.files = {
            "tokenizer.model": None,
            "vocab.json": None,
            "tokenizer.json": None,
        }
        self._detect_files()

    def _detect_files(self):
        for file in self.files.keys():
            file_path = self.path / file
            parent_file_path = self.path.parent / file
            if file_path.exists():
                self.files[file] = file_path
            elif parent_file_path.exists():
                self.files[file] = parent_file_path
        print(f"Found vocab files: {self.files}")

    def _select_file(self, vocabtype: Optional[str]) -> Path:
        if vocabtype in ["spm", "bpe"]:
            for file_key in self.files.keys():
                if self.files[file_key]:
                    return self.files[file_key]
            raise FileNotFoundError(f"{vocabtype} vocab not found.")
        elif vocabtype == "hfft":
            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
            return self.path
        else:
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")

    def _create_special_vocab(
        self,
        vocab: Vocab,
        vocabtype: str,
        model_parent_path: Path,
    ) -> gguf.SpecialVocab:
        load_merges = vocabtype == "bpe"
        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
        return gguf.SpecialVocab(
            model_parent_path,
            load_merges=load_merges,
            special_token_types=None,  # Predetermined or passed as a parameter
            n_vocab=n_vocab,
        )

    def load_vocab(
        self, vocabtype: str, model_parent_path: Path
    ) -> Tuple[Vocab, gguf.SpecialVocab]:
        path = self._select_file(vocabtype)
        print(f"Loading vocab file '{path}', type '{vocabtype}'")
        added_tokens_path = path.parent / "added_tokens.json"
        if vocabtype == "bpe":
            vocab = BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
        elif vocabtype == "spm":
            vocab = SentencePieceVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
        elif vocabtype == "hfft":
            vocab = HfVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
        else:
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
        special_vocab = self._create_special_vocab(
            vocab,
            vocabtype,
            model_parent_path,
        )
        return vocab, special_vocab
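
# Illustrative usage (hypothetical paths; assumes a tokenizer.model is present in or
# next to the model directory):
#
#     vocab_factory = VocabFactory(Path("models/7B"))
#     vocab, special_vocab = vocab_factory.load_vocab("spm", Path("models/7B"))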


def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path:
    namestr = {
        GGMLFileType.AllF32: "f32",
        GGMLFileType.MostlyF16: "f16",
        GGMLFileType.MostlyQ8_0: "q8_0",
    }[file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
        sys.stderr.write(
            f"Error: Default output path ({ret}) would overwrite the input. "
            "Please explicitly specify a path using --outfile.\n"
        )
        sys.exit(1)
    return ret
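
# Illustrative example (hypothetical input path): for an f16 conversion, the default
# output file lands next to the input:
#
#     default_output_file([Path("models/7B/consolidated.00.pth")], GGMLFileType.MostlyF16)
#     # -> Path("models/7B/ggml-model-f16.gguf")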


def do_dump_model(model_plus: ModelPlus) -> None:
    print(f"model_plus.paths = {model_plus.paths!r}")
    print(f"model_plus.format = {model_plus.format!r}")
    print(f"model_plus.vocab = {model_plus.vocab!r}")
    for name, lazy_tensor in model_plus.model.items():
        print(
            f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
        )


def get_argument_parser() -> ArgumentParser:
    output_choices = ["f32", "f16"]
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")

    parser = argparse.ArgumentParser(
        description="Convert a LLaMA model to a GGML compatible file"
    )
    parser.add_argument(
        "model",
        type=Path,
        help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
    )
    parser.add_argument(
        "--awq-path",
        type=Path,
        help="Path to the Activation-aware Weight Quantization cache file",
        default=None,
    )
    parser.add_argument(
        "--dump",
        action="store_true",
        help="Display the model content without converting it",
    )
    parser.add_argument(
        "--dump-single",
        action="store_true",
        help="Display the content of a single model file without conversion",
    )
    parser.add_argument(
        "--vocab-only",
        action="store_true",
        help="Extract and output only the vocabulary",
    )
    parser.add_argument(
        "--outtype",
        choices=output_choices,
        help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
    )
    parser.add_argument(
        "--vocab-dir",
        type=Path,
        help="Directory containing the tokenizer.model, if separate from the model file",
    )
    parser.add_argument(
        "--vocab-type",
        choices=["spm", "bpe", "hfft"],  # hfft: Hugging Face Fast Tokenizer
        default="spm",
        help="The vocabulary format used to define the tokenizer model (default: spm)",
    )
    parser.add_argument(
        "--pad-vocab",
        action="store_true",
        help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
    )
    parser.add_argument(
        "--outfile",
        type=Path,
        help="Specify the path for the output file (default is based on input)",
    )
    parser.add_argument(
        "--ctx", type=int, help="Model training context (default is based on input)"
    )
    parser.add_argument(
        "--concurrency",
        type=int,
        help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
        default=DEFAULT_CONCURRENCY,
    )
    parser.add_argument(
        "--big-endian",
        action="store_true",
        help="Indicate that the model is executed on a big-endian machine",
    )

    return parser
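
# Illustrative invocations (hypothetical paths), using only the options defined above:
#
#     python convert.py models/7B --outtype f16 --outfile models/7B/ggml-model-f16.gguf
#     python convert.py models/7B --vocab-only --outfile models/7B/vocab.gguf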


def main(argv: Optional[list[str]] = None) -> None:
    parser = get_argument_parser()
    args = parser.parse_args(argv)

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py"))
        from awq.apply_awq import add_scale_weights

        tmp_model_path = args.model / "weighted_model"
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")
        args.model = tmp_model_path

    if args.dump_single:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
        return

    if not args.vocab_only:
        model_plus = load_some_model(args.model)
    else:
        model_plus = ModelPlus(
            model={}, paths=[args.model / "dummy"], format="none", vocab=None
        )

    if args.dump:
        do_dump_model(model_plus)
        return

    endianess = gguf.GGUFEndian.LITTLE
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG

    params = Params.load(model_plus)
    if params.n_ctx == -1:
        if args.ctx is None:
            raise Exception(
                "The model doesn't have a context size, and you didn't specify one with --ctx\n"
                "Please specify one with --ctx:\n"
                " - LLaMA v1: --ctx 2048\n"
                " - LLaMA v2: --ctx 4096\n"
            )
        params.n_ctx = args.ctx

    if args.outtype:
        params.ftype = {
            "f32": GGMLFileType.AllF32,
            "f16": GGMLFileType.MostlyF16,
            "q8_0": GGMLFileType.MostlyQ8_0,
        }[args.outtype]

    print(f"params = {params}")

    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)

    if args.vocab_only:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
        OutputFile.write_vocab_only(
            outfile,
            params,
            vocab,
            special_vocab,
            endianess=endianess,
            pad_vocab=args.pad_vocab,
        )
        print(f"Wrote {outfile}")
        return

    if model_plus.vocab is not None and args.vocab_dir is None:
        vocab = model_plus.vocab

    model = model_plus.model
    model = convert_model_names(model, params)
    ftype = pick_output_type(model, args.outtype)
    model = convert_to_output_type(model, ftype)
    outfile = args.outfile or default_output_file(model_plus.paths, ftype)

    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")

    OutputFile.write_all(
        outfile,
        ftype,
        params,
        model,
        vocab,
        special_vocab,
        concurrency=args.concurrency,
        endianess=endianess,
        pad_vocab=args.pad_vocab,
    )
    print(f"Wrote {outfile}")


if __name__ == "__main__":
    main(sys.argv[1:])  # Exclude the first element (script name) from sys.argv