convert_ggml_to_pth.py

# Author: github.com/ductai199x
import argparse
import os
import struct

import numpy as np
import torch
from numba import njit
from tqdm.auto import tqdm


def read_header(fin):
    # The header is nine 32-bit integers: two leading fields that are not used here,
    # followed by the hyperparameters and the overall file type (ftype).
    values = struct.unpack("i" * 9, fin.read(4 * 9))
    _, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
    return {
        "vocab_size": vocab_size,
        "dim": dim,
        "multiple_of": multiple_of,
        "n_heads": n_heads,
        "n_layers": n_layers,
    }, ftype


def read_tokens(fin, vocab_size):
    tokens = []
    for _ in range(vocab_size):
        text_len = struct.unpack("i", fin.read(4))[0]
        text_bytes = fin.read(text_len)
        try:
            text = text_bytes.decode("utf-8")
        except UnicodeDecodeError:
            text = text_bytes.decode("utf-8", "replace")
        score = struct.unpack("f", fin.read(4))[0]
        tokens.append((text, score))
    return tokens


@njit
def dequantize_weights_numba(fin_data, n_rows, n_cols):
    # Unpack q4_0-style blocks: each block of 32 weights stores one float32 scale
    # followed by 16 bytes of packed 4-bit values.
    qk = 32
    nb = n_cols // qk
    bs = 4 + (qk // 2)  # bytes per block (scale + packed nibbles)
    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
    data_pos = 0
    for row in range(n_rows):
        for block in range(nb):
            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
            data_pos += 4
            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
            data_pos += qk // 2
            for i in range(qk // 2):
                packed_value = packed_values[i]
                # Low and high nibbles each hold one value, offset by 8 and scaled by d.
                v0 = np.float32((packed_value & 0b00001111) - 8) * d
                v1 = np.float32((packed_value >> 4) - 8) * d
                weights[row, block * qk + 2 * i] = v0
                weights[row, block * qk + 2 * i + 1] = v1
    return weights


def dequantize_weights(fin, n_rows, n_cols):
    qk = 32
    nb = n_cols // qk
    # Total bytes for this tensor: half a byte per weight plus a 4-byte scale per block.
    data_size = n_rows * n_cols // 2 + n_rows * nb * 4
    fin_data = fin.read(data_size)
    return dequantize_weights_numba(fin_data, n_rows, n_cols)


def read_variables(fin):
    model = {}
    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
    while True:
        start_pos = fin.tell()
        try:
            n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
        except struct.error:
            # End of file: no more tensor records to read.
            break
        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
        shape = shape[::-1]
        name = fin.read(name_length).decode("utf-8")
        if ftype_cur == 2:
            # 4-bit quantized weights: dequantize to float32
            dtype = np.uint8
            data = dequantize_weights(fin, shape[0], shape[1])
            data = data.reshape(shape)
        elif ftype_cur == 0:
            # float32 weights
            dtype = np.float32
            data_size = np.prod(shape)
            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
        elif ftype_cur == 1:
            # float16 weights
            dtype = np.float16
            data_size = np.prod(shape)
            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
        pbar.update(fin.tell() - start_pos)
    return model


def convert_to_hf_format(model, hparams):
    # This works for LLaMA 7B; other model sizes still need testing.
    n_layers = hparams["n_layers"]
    n_heads = hparams["n_heads"]
    dim = hparams["dim"]
    dims_per_head = dim // n_heads
    base = 10000.0
    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))

    # permute for sliced rotary
    def permute(w):
        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)

    state_dict = {}
    for layer_i in range(n_layers):
        state_dict.update(
            {
                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
                    model[f"layers.{layer_i}.attention.wq.weight"]
                ),
                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
                    model[f"layers.{layer_i}.attention.wk.weight"]
                ),
                f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
                    f"layers.{layer_i}.attention.wv.weight"
                ],
                f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
                    f"layers.{layer_i}.attention.wo.weight"
                ],
                f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
                    f"layers.{layer_i}.feed_forward.w1.weight"
                ],
                f"model.layers.{layer_i}.mlp.down_proj.weight": model[
                    f"layers.{layer_i}.feed_forward.w2.weight"
                ],
                f"model.layers.{layer_i}.mlp.up_proj.weight": model[
                    f"layers.{layer_i}.feed_forward.w3.weight"
                ],
                f"model.layers.{layer_i}.input_layernorm.weight": model[
                    f"layers.{layer_i}.attention_norm.weight"
                ],
                f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
                    f"layers.{layer_i}.ffn_norm.weight"
                ],
            }
        )
        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
    state_dict.update(
        {
            "model.embed_tokens.weight": model["tok_embeddings.weight"],
            "model.norm.weight": model["norm.weight"],
            "lm_head.weight": model["output.weight"],
        }
    )
    return state_dict


def chat(model, hparams, llama_dir):
    from transformers import (GenerationConfig, LlamaForCausalLM,
                              LlamaTokenizer, StoppingCriteria,
                              StoppingCriteriaList)
    from transformers.models.llama.configuration_llama import LlamaConfig

    class StoppingCriteriaSub(StoppingCriteria):
        def __init__(self):
            super().__init__()

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
            # Stream the decoded text so far and stop once a newline token (id 13) is generated.
            print(tokenizer.decode(input_ids[0]), end="", flush=True)
            if input_ids[0][-1] == 13:
                return True
            return False

    # LlamaConfig takes hidden_size (not "dim"); other defaults (e.g. intermediate_size)
    # still assume the 7B model.
    config = LlamaConfig(
        vocab_size=hparams["vocab_size"],
        hidden_size=hparams["dim"],
        num_hidden_layers=hparams["n_layers"],
        num_attention_heads=hparams["n_heads"],
    )
    llama = LlamaForCausalLM(config=config)
    llama.load_state_dict(state_dict=model, strict=True)
    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)

    device = torch.device("cpu")
    llama = llama.to(device)

    ctx = """You are AI.
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
User: Hello, AI.
AI: Hello! How can I assist you today?
"""
    print(ctx.rstrip("\n"))
    while True:
        print("-" * 60)
        prompt = input("User: ")
        if ctx != "":
            ctx = ctx + "User: " + prompt + "\n"
        else:
            ctx = prompt + "\nAI:"
        # Keep the rolling context under the 2048-token generation limit.
        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
        print("-" * 60)
        if len(ctx.strip()) > 0:
            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
            generation_config = GenerationConfig(
                temperature=0.8,
                top_p=0.95,
                top_k=50,
                repetition_penalty=1.1764,
            )
            with torch.no_grad():
                generation_output = llama.generate(
                    input_ids=input_ids,
                    generation_config=generation_config,
                    return_dict_in_generate=True,
                    output_scores=True,
                    max_length=2048,
                    do_sample=True,
                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
                )
            s = generation_output.sequences[0]
            decoded = tokenizer.decode(s)
            ctx = decoded + "\n"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
    )
    parser.add_argument(
        "--prefix",
        "-p",
        type=str,
        required=True,
        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
    )
    parser.add_argument(
        "--hf",
        action="store_true",
        help="Whether to save the model in the huggingface format. (default: False)",
    )
    parser.add_argument(
        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
    )
    args = parser.parse_args()

    llama_dir = os.path.abspath(f"{args.input_dir}/../")
    ggml_files = sorted(
        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
    )

    fin = open(ggml_files[0], "rb")
    hparams, ftype = read_header(fin)
    tokens = read_tokens(fin, hparams["vocab_size"])
    model = read_variables(fin)

    # Additional ggml shards: read past their header and vocabulary, then merge their tensors.
    for f in tqdm(ggml_files[1:]):
        fin = open(f, "rb")
        read_header(fin)
        read_tokens(fin, hparams["vocab_size"])
        model.update(read_variables(fin))

    if args.hf:
        model = convert_to_hf_format(model, hparams)

    pth_ckpt = {
        "state_dict": model,
        "hparams": hparams,
        "tokens": tokens,
    }
    torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")

    if args.chat:
        if not args.hf:
            model = convert_to_hf_format(model, hparams)
        chat(model, hparams, llama_dir)


if __name__ == "__main__":
    main()
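

# Example usage (the paths below are placeholders; adjust to your model directory):
#   python convert_ggml_to_pth.py -i /path/to/llama/7B -p ggml-model-q4_0 --hf
#
# The saved checkpoint can then be reloaded with plain torch, e.g.:
#   ckpt = torch.load("/path/to/llama/7B/ggml-model-q4_0-to-torch.pth")
#   state_dict, hparams, tokens = ckpt["state_dict"], ckpt["hparams"], ckpt["tokens"]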