#!/usr/bin/env python3
# convert-persimmon-to-gguf.py
# Convert a Persimmon checkpoint from Adept (e.g. Persimmon 8B chat) to GGUF.
from __future__ import annotations

import argparse
import os
import sys
from pathlib import Path
from pprint import pprint

import torch
from sentencepiece import SentencePieceProcessor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


def _flatten_dict(dct, tensors, prefix=None):
    """Recursively flatten a nested dict of tensors into `tensors`,
    joining the keys along each path with '.'."""
    assert isinstance(dct, dict)
    for key in dct.keys():
        new_prefix = prefix + '.' + key if prefix is not None else key
        if isinstance(dct[key], torch.Tensor):
            tensors[new_prefix] = dct[key]
        elif isinstance(dct[key], dict):
            _flatten_dict(dct[key], tensors, new_prefix)
        else:
            raise ValueError(type(dct[key]))
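
# For example, _flatten_dict turns {'language_model': {'embedding': {'weight': t}}}
# into {'language_model.embedding.weight': t} (these key names are purely
# illustrative); the flattened names feed gguf's tensor name map below.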


def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
    print('gguf: adding tokens')
    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []
    for i in range(tokenizer.vocab_size()):
        text: bytes
        score: float

        piece = tokenizer.id_to_piece(i)
        text = piece.encode("utf-8")
        score = tokenizer.get_score(i)
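
        # Token type ids follow gguf's TokenType enum (the values match
        # llama.cpp's llama_token_type): 1 = NORMAL, 2 = UNKNOWN, 3 = CONTROL,
        # 5 = UNUSED, 6 = BYTE.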
        toktype = 1
        if tokenizer.is_unknown(i):
            toktype = 2
        if tokenizer.is_control(i):
            toktype = 3
        if tokenizer.is_unused(i):
            toktype = 5
        if tokenizer.is_byte(i):
            toktype = 6

        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)
    return tokens, scores, toktypes


def main():
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
    args = parser.parse_args()
    if args.outfile is None:
        # honor the documented default: derive the output path from the checkpoint
        args.outfile = args.ckpt_path.with_suffix('.gguf')
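    # Make the Adept inference code importable before torch.load: unpickling
    # the checkpoint may reference classes defined there.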
    sys.path.append(str(args.adept_inference_dir))
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
    tensors: dict[str, torch.Tensor] = {}
    _flatten_dict(persimmon_model['model'], tensors, None)

    arch = gguf.MODEL_ARCH.PERSIMMON
    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])

    block_count = hparams.num_layers
    head_count = hparams.num_attention_heads
    head_count_kv = head_count
    ctx_length = hparams.seq_length
    hidden_size = hparams.hidden_size

    gguf_writer.add_name('persimmon-8b-chat')
    gguf_writer.add_context_length(ctx_length)
    gguf_writer.add_embedding_length(hidden_size)
    gguf_writer.add_block_count(block_count)
    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
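    # The extra // 2 reflects Persimmon's partial rotary embeddings: only half
    # of each attention head's dimensions are rotated (see the PR linked above).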
    gguf_writer.add_head_count(head_count)
    gguf_writer.add_head_count_kv(head_count_kv)
    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)

    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
    gguf_writer.add_tokenizer_model('llama')
    gguf_writer.add_tokenizer_pre('default')
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)
    gguf_writer.add_bos_token_id(71013)
    gguf_writer.add_eos_token_id(71013)
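    # A single id serves as both BOS and EOS; 71013 is presumably the
    # end-of-text token in Adept's vocabulary.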

    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
        data_torch = tensors[name]
        # skip inv_freq buffers; rope frequencies are recomputed at load time
        # from rope_freq_base
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
        old_dtype = data_torch.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
        data = data_torch.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            sys.exit(f"Cannot map tensor '{name}'")
        n_dims = len(data.shape)
        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
        gguf_writer.add_tensor(new_name, data)

    print("gguf: write header")
    gguf_writer.write_header_to_file()
    print("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()
    gguf_writer.close()

    print(f"gguf: model successfully exported to '{args.outfile}'")
    print("")


if __name__ == '__main__':
    main()
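
# Example invocation (paths are illustrative and depend on where the Adept
# model release and inference code were unpacked):
#
#   python3 convert-persimmon-to-gguf.py \
#       --ckpt-path 8b_chat_model_release/iter_0001251/mp_rank_00/model_optim_rng.pt \
#       --model-dir 8b_chat_model_release \
#       --adept-inference-dir adept-inference \
#       --outfile persimmon-8b-chat.gguf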