convert-falcon-hf-to-gguf.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. #!/usr/bin/env python3
  2. # HF falcon--> gguf conversion
  3. from __future__ import annotations
  4. import argparse
  5. import json
  6. import os
  7. import struct
  8. import sys
  9. from pathlib import Path
  10. from typing import Any
  11. import numpy as np
  12. import torch
  13. from transformers import AutoTokenizer # type: ignore[import]
  14. if 'NO_LOCAL_GGUF' not in os.environ:
  15. sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
  16. import gguf
  17. def bytes_to_unicode():
  18. # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
  19. """
  20. Returns list of utf-8 byte and a corresponding list of unicode strings.
  21. The reversible bpe codes work on unicode strings.
  22. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
  23. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
  24. This is a significant percentage of your normal, say, 32K bpe vocab.
  25. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
  26. And avoids mapping to whitespace/control characters the bpe code barfs on.
  27. """
  28. bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
  29. cs = bs[:]
  30. n = 0
  31. for b in range(2**8):
  32. if b not in bs:
  33. bs.append(b)
  34. cs.append(2**8+n)
  35. n += 1
  36. return dict(zip(bs, (chr(n) for n in cs)))
  37. def count_model_parts(dir_model: Path) -> int:
  38. num_parts = 0
  39. for filename in os.listdir(dir_model):
  40. if filename.startswith("pytorch_model-"):
  41. num_parts += 1
  42. if num_parts > 0:
  43. print("gguf: found " + str(num_parts) + " model parts")
  44. return num_parts
  45. def parse_args() -> argparse.Namespace:
  46. parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
  47. parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
  48. parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
  49. parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
  50. parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
  51. return parser.parse_args()
  52. args = parse_args()
  53. dir_model = args.model
  54. ftype = args.ftype
  55. if not dir_model.is_dir():
  56. print(f'Error: {args.model} is not a directory', file = sys.stderr)
  57. sys.exit(1)
  58. # possible tensor data types
  59. # ftype == 0 -> float32
  60. # ftype == 1 -> float16
  61. # map from ftype to string
  62. ftype_str = ["f32", "f16"]
  63. if args.outfile is not None:
  64. fname_out = args.outfile
  65. else:
  66. # output in the same directory as the model by default
  67. fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
  68. print("gguf: loading model "+dir_model.name)
  69. with open(dir_model / "config.json", "r", encoding="utf-8") as f:
  70. hparams = json.load(f)
  71. if hparams["architectures"][0] != "RWForCausalLM":
  72. print("Model architecture not supported: " + hparams["architectures"][0])
  73. sys.exit(1)
  74. # get number of model parts
  75. num_parts = count_model_parts(dir_model)
  76. ARCH=gguf.MODEL_ARCH.FALCON
  77. gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
  78. print("gguf: get model metadata")
  79. block_count = hparams["n_layer"]
  80. gguf_writer.add_name("Falcon")
  81. gguf_writer.add_context_length(2048) # not in config.json
  82. gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
  83. gguf_writer.add_embedding_length(hparams["hidden_size"])
  84. gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
  85. gguf_writer.add_block_count(block_count)
  86. gguf_writer.add_head_count(hparams["n_head"])
  87. if "n_head_kv" in hparams:
  88. gguf_writer.add_head_count_kv(hparams["n_head_kv"])
  89. else:
  90. gguf_writer.add_head_count_kv(1)
  91. gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
  92. gguf_writer.add_file_type(ftype)
  93. # TOKENIZATION
  94. print("gguf: get tokenizer metadata")
  95. tokens: list[bytearray] = []
  96. scores: list[float] = []
  97. toktypes: list[int] = []
  98. tokenizer_json_file = dir_model / 'tokenizer.json'
  99. if not tokenizer_json_file.is_file():
  100. print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
  101. sys.exit(1)
  102. # gpt2 tokenizer
  103. gguf_writer.add_tokenizer_model("gpt2")
  104. with open(tokenizer_json_file, "r", encoding="utf-8") as f:
  105. tokenizer_json = json.load(f)
  106. print("gguf: get gpt2 tokenizer vocab")
  107. vocab_size = len(tokenizer_json["model"]["vocab"])
  108. # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
  109. tokenizer = AutoTokenizer.from_pretrained(dir_model)
  110. reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
  111. byte_encoder = bytes_to_unicode()
  112. byte_decoder = {v: k for k, v in byte_encoder.items()}
  113. for i in range(vocab_size):
  114. if i in reverse_vocab:
  115. try:
  116. text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
  117. except KeyError:
  118. text = bytearray()
  119. for c in reverse_vocab[i]:
  120. if ord(c) < 256: # single byte character
  121. text.append(byte_decoder[ord(c)])
  122. else: # multibyte special token character
  123. text.extend(c.encode('utf-8'))
  124. else:
  125. print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
  126. pad_token = f"[PAD{i}]".encode("utf8")
  127. text = bytearray(pad_token)
  128. tokens.append(text)
  129. scores.append(0.0) # dymmy
  130. toktypes.append(gguf.TokenType.NORMAL) # dummy
  131. gguf_writer.add_token_list(tokens)
  132. gguf_writer.add_token_scores(scores)
  133. gguf_writer.add_token_types(toktypes)
  134. special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
  135. special_vocab.add_to_gguf(gguf_writer)
  136. # TENSORS
  137. tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
  138. # params for qkv transform
  139. n_head = hparams["n_head"]
  140. n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
  141. head_dim = hparams["hidden_size"] // n_head
  142. # tensor info
  143. print("gguf: get tensor metadata")
  144. if num_parts == 0:
  145. part_names = iter(("pytorch_model.bin",))
  146. else:
  147. part_names = (
  148. f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
  149. )
  150. for part_name in part_names:
  151. if args.vocab_only:
  152. break
  153. print("gguf: loading model part '" + part_name + "'")
  154. model_part = torch.load(dir_model / part_name, map_location="cpu")
  155. for name in model_part.keys():
  156. data = model_part[name]
  157. old_dtype = data.dtype
  158. # convert any unsupported data types to float32
  159. if data.dtype != torch.float16 and data.dtype != torch.float32:
  160. data = data.to(torch.float32)
  161. # QKV tensor transform
  162. # The original query_key_value tensor contains n_head_kv "kv groups",
  163. # each consisting of n_head/n_head_kv query weights followed by one key
  164. # and one value weight (shared by all query heads in the kv group).
  165. # This layout makes it a big pain to work with in GGML.
  166. # So we rearrange them here,, so that we have n_head query weights
  167. # followed by n_head_kv key weights followed by n_head_kv value weights,
  168. # in contiguous fashion.
  169. # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
  170. if "query_key_value" in name:
  171. qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
  172. q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
  173. k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
  174. v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
  175. data = torch.cat((q,k,v)).reshape_as(data)
  176. data = data.squeeze().numpy()
  177. # map tensor names
  178. new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
  179. if new_name is None:
  180. print("Can not map tensor '" + name + "'")
  181. sys.exit()
  182. n_dims = len(data.shape)
  183. data_dtype = data.dtype
  184. # if f32 desired, convert any float16 to float32
  185. if ftype == 0 and data_dtype == np.float16:
  186. data = data.astype(np.float32)
  187. # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
  188. if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
  189. data = data.astype(np.float32)
  190. # if f16 desired, convert any float32 2-dim weight tensors to float16
  191. if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
  192. data = data.astype(np.float16)
  193. print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
  194. gguf_writer.add_tensor(new_name, data)
  195. print("gguf: write header")
  196. gguf_writer.write_header_to_file()
  197. print("gguf: write metadata")
  198. gguf_writer.write_kv_data_to_file()
  199. if not args.vocab_only:
  200. print("gguf: write tensors")
  201. gguf_writer.write_tensors_to_file()
  202. gguf_writer.close()
  203. print(f"gguf: model successfully exported to '{fname_out}'")
  204. print("")