convert-persimmon-to-gguf.py
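
# Convert an Adept Persimmon checkpoint (e.g. persimmon-8b-chat) to GGUF.
# Example invocation (the paths here are illustrative placeholders, derived
# from the argument help strings below, not a documented release layout):
#   python convert-persimmon-to-gguf.py \
#       --ckpt-path <release_dir>/model_checkpoint.pt \
#       --model-dir <release_dir> \
#       --adept-inference-dir <adept-inference checkout> \
#       --outfile persimmon-8b-chat.gguf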

import torch
import os
from pprint import pprint
import sys
import argparse
from pathlib import Path
from sentencepiece import SentencePieceProcessor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
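

# Recursively flatten a nested state dict into a flat
# {'dotted.key.path': tensor} mapping; the checkpoint nests tensors by module.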
def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
    for key in dct.keys():
        new_prefix = prefix + '.' + key if prefix is not None else key
        if isinstance(dct[key], torch.Tensor):
            tensors[new_prefix] = dct[key]
        elif isinstance(dct[key], dict):
            _flatten_dict(dct[key], tensors, new_prefix)
        else:
            raise ValueError(type(dct[key]))
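

# Read the SentencePiece vocab shipped with the model and collect each
# token's text, score, and type for the GGUF tokenizer metadata.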
def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
    print('gguf: adding tokens')
    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []

    for i in range(tokenizer.vocab_size()):
        piece = tokenizer.id_to_piece(i)
        text = piece.encode("utf-8")
        score = tokenizer.get_score(i)

        # Token types follow gguf's TokenType enum:
        # 1 = normal, 2 = unknown, 3 = control, 5 = unused, 6 = byte.
        toktype = 1
        if tokenizer.is_unknown(i):
            toktype = 2
        if tokenizer.is_control(i):
            toktype = 3
        if tokenizer.is_unused(i):
            toktype = 5
        if tokenizer.is_byte(i):
            toktype = 6

        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)
    return tokens, scores, toktypes
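

# Load the checkpoint, write the architecture and tokenizer metadata, then
# convert and append every tensor.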
def main():
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
    args = parser.parse_args()
    # The checkpoint pickle references classes from the adept-inference repo,
    # so that directory must be importable before torch.load().
    sys.path.append(str(args.adept_inference_dir))
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
    tensors = {}
    _flatten_dict(persimmon_model['model'], tensors, None)
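
    # Register the Persimmon architecture and copy the checkpoint's saved
    # hyperparameters into the corresponding GGUF metadata keys.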
    arch = gguf.MODEL_ARCH.PERSIMMON
    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])

    block_count = hparams.num_layers
    head_count = hparams.num_attention_heads
    head_count_kv = head_count
    ctx_length = hparams.seq_length
    hidden_size = hparams.hidden_size

    gguf_writer.add_name('persimmon-8b-chat')
    gguf_writer.add_context_length(ctx_length)
    gguf_writer.add_embedding_length(hidden_size)
    gguf_writer.add_block_count(block_count)
    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
    gguf_writer.add_head_count(head_count)
    gguf_writer.add_head_count_kv(head_count_kv)
    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
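
    # Persimmon reuses the llama-style SentencePiece tokenizer; the Adept
    # release uses token id 71013 for both BOS and EOS.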
    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
    gguf_writer.add_tokenizer_model('llama')
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)
    gguf_writer.add_bos_token_id(71013)
    gguf_writer.add_eos_token_id(71013)
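
    # Map checkpoint tensor names onto GGUF names and queue each tensor for
    # writing. The rotary inv_freq buffers are skipped: they can be rebuilt
    # from the rope freq base recorded above rather than stored in the file.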
    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
        data = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
        old_dtype = data.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
        data = data.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print(f"Can not map tensor '{name}'")
            sys.exit(1)
        n_dims = len(data.shape)
        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
        gguf_writer.add_tensor(new_name, data)
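
    # The GGUF file is written in three stages: header, key/value metadata,
    # then the tensor data itself.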
    print("gguf: write header")
    gguf_writer.write_header_to_file()
    print("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()

    print(f"gguf: model successfully exported to '{args.outfile}'")
    print("")


if __name__ == '__main__':
    main()