convert-starcoder-hf-to-gguf.py

#!/usr/bin/env python3
# HF starcoder --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]
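
# prefer the repo-local gguf-py package unless NO_LOCAL_GGUF is set in the environment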
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default=1, nargs='?')
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file=sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
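# StarCoder uses multi-query attention, so there is a single shared KV head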
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[str] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running inference.
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
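
# emit tokens in id order, padding any gaps with [PAD<id>] placeholders so the
# token list length matches vocab_size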
for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
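
# merges and any special token ids found in the model directory are added as well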
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")
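
# single-file checkpoints ship as pytorch_model.bin; sharded checkpoints as
# pytorch_model-00001-of-000NN.bin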
if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            sys.exit(1)

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)
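
# the GGUF file is written in three passes: header, KV metadata, then tensor data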
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")