#!/usr/bin/env python3
# HF bloom --> gguf conversion
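#
# Example invocation (illustrative; the model path and output name are placeholders):
#   python convert-bloom-hf-to-gguf.py /path/to/bloom-1b7 1 --outfile bloom-1b7-f16.gguf
# where the trailing "1" selects float16 output (0 = float32), as defined in parse_args() below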

from __future__ import annotations

import argparse
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


# Supported Models:
#   https://huggingface.co/bigscience/bloom-1b7
#   https://huggingface.co/bigscience/bloom-3b
#   https://huggingface.co/bigscience/bloom-7b1
#   https://huggingface.co/Langboat/bloom-1b4-zh
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default=1)
    return parser.parse_args()


args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file=sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "BloomForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.BLOOM
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("Bloom")
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
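# note: some BLOOM configs do not provide a seq_length field, so n_embed is used as the
# fallback context length below (this mirrors the hparams.get() default on the next line)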
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
gguf_writer.add_embedding_length(n_embed)
gguf_writer.add_feed_forward_length(4 * n_embed)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(n_head)
gguf_writer.add_head_count_kv(n_head)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running inference.
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
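# fill any index gaps with "[PAD{i}]" placeholder tokens so the token list length matches
# vocab_size (and therefore the first dimension of the embedding tensor)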
for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head_kv = hparams.get("n_head_kv", n_head)
head_dim = n_embed // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
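# sharded checkpoints use the Hugging Face naming scheme pytorch_model-00001-of-000NN.bin,
# which is what count_model_parts() detected above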
for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    has_lm_head = True
    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
        has_lm_head = False

    for original_name in model_part.keys():
        data = model_part[original_name]
        name = re.sub(r'transformer\.', '', original_name)

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
            # Map bloom-style qkv_linear to gpt-style qkv_linear
            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
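            # BLOOM stores the fused QKV weight interleaved per head as
            # (n_head, 3, head_dim, n_embed); split it along the "3" axis and
            # re-concatenate so all Q rows come first, then all K rows, then all V rows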
            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
            data = np.concatenate(
                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
                axis=0
            )
            print("re-format attention.linear_qkv.weight")
        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
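            # apply the same per-head split to the fused QKV bias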
            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
            data = np.concatenate(
                (qkv_bias[:, 0, :].reshape((n_embed,)),
                 qkv_bias[:, 1, :].reshape((n_embed,)),
                 qkv_bias[:, 2, :].reshape((n_embed,))),
                axis=0
            )
            print("re-format attention.linear_qkv.bias")

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)

        if not has_lm_head and name == "word_embeddings.weight":
            gguf_writer.add_tensor("output.weight", data)
            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))  # noqa

print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
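
# the resulting .gguf file can then be loaded by GGUF-aware runtimes such as llama.cpp
# (with --vocab-only, only the header and metadata are written and tensors are skipped)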