convert_persimmon_to_gguf.py 4.9 KB

#!/usr/bin/env python3
import argparse
import os
import sys
from pathlib import Path
from pprint import pprint

import torch
from sentencepiece import SentencePieceProcessor
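
# Prefer the gguf-py package bundled with the repo over a system-installed one,
# unless NO_LOCAL_GGUF is set in the environment.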
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
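

# Recursively walk the (possibly nested) checkpoint state dict, collecting every
# tensor into `tensors` keyed by its dotted path.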
def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
    for key in dct.keys():
        new_prefix = prefix + '.' + key if prefix is not None else key
        if isinstance(dct[key], torch.Tensor):
            tensors[new_prefix] = dct[key]
        elif isinstance(dct[key], dict):
            _flatten_dict(dct[key], tensors, new_prefix)
        else:
            raise ValueError(type(dct[key]))
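

# Read the SentencePiece vocabulary shipped with the model and return token
# pieces, scores, and types. The integer token types follow GGUF's token-type
# codes: 1 = normal, 2 = unknown, 3 = control, 5 = unused, 6 = byte.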
def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
    print('gguf: adding tokens')
    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []

    for i in range(tokenizer.vocab_size()):
        text: bytes
        score: float

        piece = tokenizer.id_to_piece(i)
        text = piece.encode("utf-8")
        score = tokenizer.get_score(i)

        toktype = 1
        if tokenizer.is_unknown(i):
            toktype = 2
        if tokenizer.is_control(i):
            toktype = 3
        if tokenizer.is_unused(i):
            toktype = 5
        if tokenizer.is_byte(i):
            toktype = 6

        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)

    return tokens, scores, toktypes
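

# Example invocation (all paths illustrative, not taken from the original script):
#   python3 convert_persimmon_to_gguf.py --ckpt-path path/to/checkpoint.pt \
#       --model-dir 8b_chat_model_release --adept-inference-dir path/to/adept-inference \
#       --outfile persimmon-8b-chat.gguf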
def main():
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("--ckpt-path", type=Path, required=True, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir", type=Path, required=True, help="directory containing model e.g. 8b_chat_model_release")
    parser.add_argument("--adept-inference-dir", type=str, required=True, help="path to adept-inference code directory")
    args = parser.parse_args()
    if args.outfile is None:
        # the help text promises a default based on the input; derive it from the checkpoint path
        args.outfile = args.ckpt_path.with_suffix('.gguf')
    # the adept-inference code must be importable so the pickled hparams can be unpickled
    sys.path.append(str(args.adept_inference_dir))
    persimmon_model = torch.load(args.ckpt_path, map_location='cpu')  # map to CPU so conversion does not require a GPU
    hparams = persimmon_model['args']
    pprint(hparams)
    tensors: dict[str, torch.Tensor] = {}
    _flatten_dict(persimmon_model['model'], tensors, None)
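
    # Architecture metadata, taken from the hyperparameters stored in the checkpoint.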
    arch = gguf.MODEL_ARCH.PERSIMMON
    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])

    block_count = hparams.num_layers
    head_count = hparams.num_attention_heads
    head_count_kv = head_count
    ctx_length = hparams.seq_length
    hidden_size = hparams.hidden_size

    gguf_writer.add_name('persimmon-8b-chat')
    gguf_writer.add_context_length(ctx_length)
    gguf_writer.add_embedding_length(hidden_size)
    gguf_writer.add_block_count(block_count)
    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
    gguf_writer.add_head_count(head_count)
    gguf_writer.add_head_count_kv(head_count_kv)
    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
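
    # Tokenizer metadata: the vocabulary is stored as a llama-style SentencePiece
    # tokenizer, and the same special token id (71013) serves as both BOS and EOS.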
    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
    gguf_writer.add_tokenizer_model('llama')
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)
    gguf_writer.add_bos_token_id(71013)
    gguf_writer.add_eos_token_id(71013)
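
    # Translate checkpoint tensor names to GGUF names and stage the tensor data,
    # converting everything to float32 (see the FP16 TODO below).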
    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
        data = tensors[name]
        # inv_freq is recomputed at load time, so it is not exported
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
        old_dtype = data.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
        data = data.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            sys.exit(f"Can not map tensor '{name}'")
        n_dims = len(data.shape)
        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
        gguf_writer.add_tensor(new_name, data)
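
    # Serialize: header first, then the key/value metadata, then the tensor data.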
    print("gguf: write header")
    gguf_writer.write_header_to_file()
    print("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()

    print(f"gguf: model successfully exported to '{args.outfile}'")
    print("")


if __name__ == '__main__':
    main()