run-original-model.py

#!/usr/bin/env python3
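# NOTE: docstring added for clarity; the example invocation below is
# illustrative and the model path shown is hypothetical.
"""Run an original (unconverted) embedding model and dump its embeddings.

The token-level (or pooled) embeddings are printed and written to data/ so
they can be compared against the output of the converted .gguf model.

Example:
    python run-original-model.py --model-path /path/to/embedding-model \
        --prompts-file prompts.txt --device cpu
"""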
import argparse
import os
import sys
import numpy as np
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch


def parse_arguments():
    parser = argparse.ArgumentParser(description='Run original embedding model')
    parser.add_argument(
        '--model-path',
        '-m',
        help='Path to the model'
    )
    parser.add_argument(
        '--prompts-file',
        '-p',
        help='Path to file containing prompts (one per line)'
    )
    parser.add_argument(
        '--use-sentence-transformers',
        action='store_true',
        help=('Use SentenceTransformer to apply all numbered layers '
              '(01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
    )
    parser.add_argument(
        '--device',
        '-d',
        help='Device to use (cpu, cuda, mps, auto)',
        default='auto'
    )
    return parser.parse_args()
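
# Environment variables recognized elsewhere in this script (see main() and
# load_model_and_tokenizer()): EMBEDDING_MODEL_PATH takes precedence over
# --model-path when set, USE_SENTENCE_TRANSFORMERS ('1'/'true'/'yes') enables
# the SentenceTransformer path, and UNRELEASED_MODEL_NAME selects an
# unreleased transformers model class.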


def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device="auto"):
    if device == "cpu":
        device_map = {"": "cpu"}
        print("Forcing CPU usage")
    elif device == "auto":
        # On Mac, an "auto" device_map can cause issues with accelerate,
        # so we detect the best device manually.
        if torch.cuda.is_available():
            device_map = {"": "cuda"}
            print("Using CUDA")
        elif torch.backends.mps.is_available():
            device_map = {"": "mps"}
            print("Using MPS (Apple Metal)")
        else:
            device_map = {"": "cpu"}
            print("Using CPU")
    else:
        device_map = {"": device}

    if use_sentence_transformers:
        from sentence_transformers import SentenceTransformer
        print("Using SentenceTransformer to apply all numbered layers")
        model = SentenceTransformer(model_path)
        tokenizer = model.tokenizer
        config = model[0].auto_model.config  # type: ignore
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

        # The sliding window size can be overridden here for manual testing.
        # This is useful for verifying the sliding window attention mask in the
        # original model and comparing it with the converted .gguf model.
        if hasattr(config, 'sliding_window'):
            original_sliding_window = config.sliding_window
            # config.sliding_window = ...  # uncomment and set a value to override
            print(f"Sliding window: {original_sliding_window} -> {config.sliding_window}")

        unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
        if unreleased_model_name:
            print(f"Using unreleased model: {unreleased_model_name}")
            model_name_lower = unreleased_model_name.lower()
            unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
            class_name = f"{unreleased_model_name}Model"
            print(f"Importing unreleased model module: {unreleased_module_path}")
            try:
                model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
                model = model_class.from_pretrained(
                    model_path,
                    device_map=device_map,
                    offload_folder="offload",
                    trust_remote_code=True,
                    config=config
                )
            except (ImportError, AttributeError) as e:
                print(f"Failed to import or load model: {e}")
                sys.exit(1)
        else:
            model = AutoModel.from_pretrained(
                model_path,
                device_map=device_map,
                offload_folder="offload",
                trust_remote_code=True,
                config=config
            )

        print(f"Model class: {type(model)}")
        print(f"Model file: {type(model).__module__}")

        # Verify the model is using the correct sliding window
        if hasattr(model.config, 'sliding_window'):  # type: ignore
            print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
        else:
            print("Model config does not have sliding_window attribute")

    return model, tokenizer, config


def get_prompt(args):
    if args.prompts_file:
        try:
            with open(args.prompts_file, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except FileNotFoundError:
            print(f"Error: Prompts file '{args.prompts_file}' not found")
            sys.exit(1)
        except Exception as e:
            print(f"Error reading prompts file: {e}")
            sys.exit(1)
    else:
        return "Hello world today"


def main():
    args = parse_arguments()

    model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
    if model_path is None:
        print("Error: Model path must be specified either via --model-path argument "
              "or EMBEDDING_MODEL_PATH environment variable")
        sys.exit(1)

    # Determine if we should use SentenceTransformer
    use_st = (
        args.use_sentence_transformers
        or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
    )

    model, tokenizer, config = load_model_and_tokenizer(model_path, use_st, args.device)

    # Get the device the model is on
    if not use_st:
        device = next(model.parameters()).device
    else:
        # For SentenceTransformer, get device from the underlying model
        device = next(model[0].auto_model.parameters()).device  # type: ignore

    model_name = os.path.basename(model_path)

    prompt_text = get_prompt(args)
    texts = [prompt_text]

    with torch.no_grad():
        if use_st:
            embeddings = model.encode(texts, convert_to_numpy=True)
            all_embeddings = embeddings  # Shape: [batch_size, hidden_size]

            encoded = tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            )
            tokens = encoded['input_ids'][0]
            token_strings = tokenizer.convert_ids_to_tokens(tokens)
            for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
                print(f"{token_id:6d} -> '{token_str}'")

            print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
            print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
        else:
            # Standard approach: use base model output only
            encoded = tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            )
            tokens = encoded['input_ids'][0]
            token_strings = tokenizer.convert_ids_to_tokens(tokens)
            for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
                print(f"{token_id:6d} -> '{token_str}'")

            # Move inputs to the same device as the model
            encoded = {k: v.to(device) for k, v in encoded.items()}
            outputs = model(**encoded)
            hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]

            all_embeddings = hidden_states[0].float().cpu().numpy()  # Shape: [seq_len, hidden_size]

            print(f"Hidden states shape: {hidden_states.shape}")
            print(f"All embeddings shape: {all_embeddings.shape}")
            print(f"Embedding dimension: {all_embeddings.shape[1]}")

    if len(all_embeddings.shape) == 1:
        n_embd = all_embeddings.shape[0]  # type: ignore
        n_embd_count = 1
        all_embeddings = all_embeddings.reshape(1, -1)
    else:
        n_embd = all_embeddings.shape[1]  # type: ignore
        n_embd_count = all_embeddings.shape[0]  # type: ignore

    print()
    for j in range(n_embd_count):
        embedding = all_embeddings[j]
        print(f"embedding {j}: ", end="")
        # Print first 3 values
        for i in range(min(3, n_embd)):
            print(f"{embedding[i]:9.6f} ", end="")
        print(" ... ", end="")
        # Print last 3 values
        for i in range(n_embd - 3, n_embd):
            print(f"{embedding[i]:9.6f} ", end="")
        print()  # New line
    print()

    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)
    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"

    flattened_embeddings = all_embeddings.flatten()
    flattened_embeddings.astype(np.float32).tofile(bin_filename)

    with open(txt_filename, "w") as f:
        idx = 0
        for j in range(n_embd_count):
            for value in all_embeddings[j]:
                f.write(f"{idx}: {value:.6f}\n")
                idx += 1

    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
    print("")
    print(f"Saved bin embeddings to: {bin_filename}")
    print(f"Saved txt embeddings to: {txt_filename}")


if __name__ == "__main__":
    main()