convert-starcoder-hf-to-gguf.py

#!/usr/bin/env python3
# HF starcoder --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]
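
# make `import gguf` resolve to the in-repo gguf-py package unless NO_LOCAL_GGUF is set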
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default=1, nargs="?")
    return parser.parse_args()
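
# Example invocation (the model path is illustrative):
#   python convert-starcoder-hf-to-gguf.py /path/to/starcoder-hf 1 --outfile ggml-model-f16.gguf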

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
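# StarCoder (GPTBigCode) uses multi-query attention: all query heads share a single KV head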
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[str] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running inference.
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
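
# pad ids missing from the tokenizer with placeholder tokens so that len(tokens) == vocab_size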
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")
        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_types(toktypes)
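
# SpecialVocab reads the BPE merges and the special token ids (bos/eos/unk/...) from the model directory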
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
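
# load each checkpoint shard and add its tensors to the writer, remapping names and converting dtypes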
for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")
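    # shards are loaded on the CPU; the conversion does not need a GPU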

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit(1)

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)

print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")