convert-bloom-hf-to-gguf.py

#!/usr/bin/env python3
# HF bloom --> gguf conversion
from __future__ import annotations

import argparse
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


# Supported Models:
#   https://huggingface.co/bigscience/bloom-1b7
#   https://huggingface.co/bigscience/bloom-3b
#   https://huggingface.co/bigscience/bloom-7b1
#   https://huggingface.co/Langboat/bloom-1b4-zh
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default=1)
    return parser.parse_args()
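

# Example invocation (hypothetical paths; the model directory must contain
# config.json, the tokenizer files and the pytorch_model*.bin checkpoint shards):
#   python3 convert-bloom-hf-to-gguf.py ./models/bloom-1b7 1 --outfile ./models/bloom-1b7-f16.gguf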

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file=sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "BloomForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.BLOOM
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("Bloom")
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
gguf_writer.add_embedding_length(n_embed)
gguf_writer.add_feed_forward_length(4 * n_embed)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(n_head)
gguf_writer.add_head_count_kv(n_head)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
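
# For illustration (hypothetical values, not read from any real config): a model
# with hidden_size=2048, n_layer=24 and n_head=16 would be written with
# embedding_length=2048, feed_forward_length=8192 (Bloom's MLP expands 4x),
# block_count=24 and head_count=head_count_kv=16; context_length falls back to
# n_embed when the config has no "seq_length" entry.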

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running inference.
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")
        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)
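
# Sanity check: the loop above emits exactly one entry per id in range(vocab_size),
# padding ids missing from the tokenizer with "[PAD<i>]" placeholders so the token
# list lines up with the rows of the embedding matrix.
assert len(tokens) == vocab_size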

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab=len(tokens))
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head_kv = hparams.get("n_head_kv", n_head)
head_dim = n_embed // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    has_lm_head = True
    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
        has_lm_head = False

    for original_name in model_part.keys():
        data = model_part[original_name]
        name = re.sub(r'transformer\.', '', original_name)

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
            # Map bloom-style qkv_linear to gpt-style qkv_linear
            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
            data = np.concatenate(
                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
                axis=0
            )
            print("re-format attention.linear_qkv.weight")
        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
            data = np.concatenate(
                (qkv_bias[:, 0, :].reshape((n_embed,)),
                 qkv_bias[:, 1, :].reshape((n_embed,)),
                 qkv_bias[:, 2, :].reshape((n_embed,))),
                axis=0
            )
            print("re-format attention.linear_qkv.bias")
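
        # Shape illustration for the transform above (hypothetical n_embed=2048,
        # n_head=16, head_dim=128): Bloom's fused weight is (3 * n_embed, n_embed)
        # = (6144, 2048) with rows grouped per head as [q0 k0 v0 | q1 k1 v1 | ...];
        # reshaping to (16, 3, 128, 2048) and concatenating the three slices yields
        # (6144, 2048) again, now ordered as all Q rows, then all K rows, then all
        # V rows, which is the GPT-2 style fused qkv layout. The bias is reordered
        # the same way.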

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            sys.exit(1)

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)
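
        # Bloom ties its LM head to the token embeddings, so checkpoints that ship
        # no separate lm_head.weight reuse word_embeddings.weight; write that matrix
        # a second time as output.weight so llama.cpp finds an output projection.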
        if not has_lm_head and name == "word_embeddings.weight":
            gguf_writer.add_tensor("output.weight", data)
            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))  # noqa

print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")