run-original-model.py

#!/usr/bin/env python3
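"""Run an original (unconverted) embedding model on a prompt and dump its embeddings.

The model is loaded either directly with Hugging Face transformers (base model
output only) or through SentenceTransformer (which also applies the numbered
pooling/dense/normalize layers). Token ids and the resulting embedding values
are printed and saved via save_output_data so they can be compared against the
output of a converted model.
"""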

import argparse
import os
import sys
import importlib
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch

# Add parent directory to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from utils.common import save_output_data


def parse_arguments():
    parser = argparse.ArgumentParser(description='Run original embedding model')
    parser.add_argument(
        '--model-path',
        '-m',
        help='Path to the model'
    )
    parser.add_argument(
        '--prompts-file',
        '-p',
        help='Path to file containing prompts (one per line)'
    )
    parser.add_argument(
        '--use-sentence-transformers',
        action='store_true',
        help=('Use SentenceTransformer to apply all numbered layers '
              '(01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
    )
    parser.add_argument(
        '--device',
        '-d',
        help='Device to use (cpu, cuda, mps, auto)',
        default='auto'
    )
    return parser.parse_args()


def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device="auto"):
    if device == "cpu":
        device_map = {"": "cpu"}
        print("Forcing CPU usage")
    elif device == "auto":
        # On Mac, "auto" device_map can cause issues with accelerate,
        # so we detect the best device manually.
        if torch.cuda.is_available():
            device_map = {"": "cuda"}
            print("Using CUDA")
        elif torch.backends.mps.is_available():
            device_map = {"": "mps"}
            print("Using MPS (Apple Metal)")
        else:
            device_map = {"": "cpu"}
            print("Using CPU")
    else:
        device_map = {"": device}

    if use_sentence_transformers:
        from sentence_transformers import SentenceTransformer
        print("Using SentenceTransformer to apply all numbered layers")
        model = SentenceTransformer(model_path)
        tokenizer = model.tokenizer
        config = model[0].auto_model.config  # type: ignore
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

        # The sliding window size can be overridden here for manual testing,
        # which is useful for verifying the sliding window attention mask in the
        # original model and comparing it with the converted .gguf model.
        if hasattr(config, 'sliding_window'):
            original_sliding_window = config.sliding_window
            print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")

        unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
        print(f"Using unreleased model: {unreleased_model_name}")
        if unreleased_model_name:
            model_name_lower = unreleased_model_name.lower()
            unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
            class_name = f"{unreleased_model_name}Model"
            print(f"Importing unreleased model module: {unreleased_module_path}")

            try:
                model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
                model = model_class.from_pretrained(
                    model_path,
                    device_map=device_map,
                    offload_folder="offload",
                    trust_remote_code=True,
                    config=config
                )
            except (ImportError, AttributeError) as e:
                print(f"Failed to import or load model: {e}")
                sys.exit(1)
        else:
            model = AutoModel.from_pretrained(
                model_path,
                device_map=device_map,
                offload_folder="offload",
                trust_remote_code=True,
                config=config
            )

        print(f"Model class: {type(model)}")
        print(f"Model file: {type(model).__module__}")

        # Verify the model is using the correct sliding window
        if hasattr(model.config, 'sliding_window'):  # type: ignore
            print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
        else:
            print("Model config does not have sliding_window attribute")

    return model, tokenizer, config


def get_prompt(args):
    if args.prompts_file:
        try:
            with open(args.prompts_file, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except FileNotFoundError:
            print(f"Error: Prompts file '{args.prompts_file}' not found")
            sys.exit(1)
        except Exception as e:
            print(f"Error reading prompts file: {e}")
            sys.exit(1)
    else:
        return "Hello world today"


def main():
    args = parse_arguments()

    model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
    if model_path is None:
        print("Error: Model path must be specified either via --model-path argument "
              "or EMBEDDING_MODEL_PATH environment variable")
        sys.exit(1)

    # Determine if we should use SentenceTransformer
    use_st = (
        args.use_sentence_transformers
        or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
    )

    model, tokenizer, config = load_model_and_tokenizer(model_path, use_st, args.device)

    # Get the device the model is on
    if not use_st:
        device = next(model.parameters()).device
    else:
        # For SentenceTransformer, get device from the underlying model
        device = next(model[0].auto_model.parameters()).device  # type: ignore

    model_name = os.path.basename(model_path)
    prompt_text = get_prompt(args)
    texts = [prompt_text]

    with torch.no_grad():
        if use_st:
            # SentenceTransformer applies the pooling/dense/normalize layers and
            # returns one embedding per input text.
            embeddings = model.encode(texts, convert_to_numpy=True)
            all_embeddings = embeddings  # Shape: [batch_size, hidden_size]

            # Tokenize separately so the token ids can be printed and saved
            encoded = tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            )
            tokens = encoded['input_ids'][0]
            token_ids = tokens.cpu().tolist()
            token_strings = tokenizer.convert_ids_to_tokens(tokens)
            for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
                print(f"{token_id:6d} -> '{token_str}'")

            print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
            print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
        else:
            # Standard approach: use base model output only
            encoded = tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            )
            tokens = encoded['input_ids'][0]
            token_ids = tokens.cpu().tolist()
            token_strings = tokenizer.convert_ids_to_tokens(tokens)
            for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
                print(f"{token_id:6d} -> '{token_str}'")

            # Move inputs to the same device as the model
            encoded = {k: v.to(device) for k, v in encoded.items()}

            outputs = model(**encoded)
            hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]

            all_embeddings = hidden_states[0].float().cpu().numpy()  # Shape: [seq_len, hidden_size]
            print(f"Hidden states shape: {hidden_states.shape}")
            print(f"All embeddings shape: {all_embeddings.shape}")
            print(f"Embedding dimension: {all_embeddings.shape[1]}")

    if len(all_embeddings.shape) == 1:
        n_embd = all_embeddings.shape[0]  # type: ignore
        n_embd_count = 1
        all_embeddings = all_embeddings.reshape(1, -1)
    else:
        n_embd = all_embeddings.shape[1]  # type: ignore
        n_embd_count = all_embeddings.shape[0]  # type: ignore

    print()
    for j in range(n_embd_count):
        embedding = all_embeddings[j]
        print(f"embedding {j}: ", end="")

        # Print first 3 values
        for i in range(min(3, n_embd)):
            print(f"{embedding[i]:9.6f} ", end="")

        print(" ... ", end="")

        # Print last 3 values
        for i in range(n_embd - 3, n_embd):
            print(f"{embedding[i]:9.6f} ", end="")

        print()  # New line

    print()
    flattened_embeddings = all_embeddings.flatten()
    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
    print("")

    save_output_data(flattened_embeddings, token_ids, prompt_text, model_name, type_suffix="-embeddings")


if __name__ == "__main__":
    main()
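
# Example invocations (the paths below are placeholders, not files shipped with this script):
#   python run-original-model.py --model-path /path/to/embedding-model
#   python run-original-model.py -m /path/to/embedding-model -p prompts.txt --use-sentence-transformers
#   EMBEDDING_MODEL_PATH=/path/to/embedding-model python run-original-model.py --device cpu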