#!/usr/bin/env python3
# HF Refact --> GGUF conversion

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if "NO_LOCAL_GGUF" not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
import gguf
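

# Count sharded HF checkpoint files ("pytorch_model-*.bin") in the model directory;
# a single-file model ("pytorch_model.bin") yields 0 parts.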
def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a Refact model to a GGML compatible file"
    )
    parser.add_argument(
        "--vocab-only",
        action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile",
        type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model",
        type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype",
        type=int,
        choices=[0, 1],
        default=1,
        nargs="?",
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()
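

# Example invocation (the model path below is illustrative):
#   python convert-refact-hf-to-gguf.py /path/to/Refact-1_6B-fim 1
# With no --outfile, the output lands in the model directory as ggml-model-f16.gguf.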
args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f"Error: {args.model} is not a directory", file=sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTRefactForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.REFACT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
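# The writer collects KV metadata and tensors as they are added below and writes
# the .gguf file in the final section of this script.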

print("gguf: get model metadata")

# Get refact feed forward dimension
hidden_dim = hparams["n_embd"]
inner_dim = 4 * hidden_dim
hidden_dim = int(2 * inner_dim / 3)
multiple_of = 256
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
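# This is the LLaMA-style gated-MLP sizing: 2/3 of 4*n_embd, rounded up to a
# multiple of 256. ff_dim is reused below to split the fused gate_up_proj tensor.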

block_count = hparams["n_layer"]

gguf_writer.add_name("Refact")
# Refact uses ALiBi, so the context length written here is simply the training
# value taken from config.json (n_positions).
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])

gguf_writer.add_feed_forward_length(ff_dim)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[str] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running inference.
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
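
# Build the token list and per-token types: ids missing from the tokenizer become
# "[PADi]" placeholders so the list matches vocab_size; added tokens are CONTROL
# (special) or USER_DEFINED; all remaining tokens are NORMAL.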
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")
        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab=len(tokens))
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
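# Iterate over the checkpoint shards (or the single pytorch_model.bin), converting
# and adding the tensors from each one.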
for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for i in range(block_count):
        if f"transformer.h.{i}.attn.kv.weight" in model_part:
            data = model_part[f"transformer.h.{i}.attn.kv.weight"]
            model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
                : n_head_kv * head_dim
            ]
            model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
                n_head_kv * head_dim :
            ]
            del model_part[f"transformer.h.{i}.attn.kv.weight"]
        if f"transformer.h.{i}.attn.q.weight" in model_part:
            model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
                f"transformer.h.{i}.attn.q.weight"
            ]
            del model_part[f"transformer.h.{i}.attn.q.weight"]
        if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
            data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
            model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
            model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
            del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
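
    # The fused Refact tensors have now been split into LLaMA-style names:
    # attn.kv.weight holds one k head and one v head (multi-query attention), and
    # mlp.gate_up_proj.weight holds gate_proj followed by up_proj, so that
    # tensor_map below can translate them into GGUF names.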
    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            sys.exit(1)

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if (
            ftype == 1
            and data_dtype == np.float32
            and name.endswith(".weight")
            and n_dims == 2
        ):
            data = data.astype(np.float16)

        print(
            new_name
            + ", n_dims = "
            + str(n_dims)
            + ", "
            + str(old_dtype)
            + " --> "
            + str(data.dtype)
        )

        gguf_writer.add_tensor(new_name, data)
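
# Write the output file: header, KV metadata, then (unless --vocab-only) the tensor data.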
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")