run-original-model.py

#!/usr/bin/env python3

import argparse
import os
import importlib

import numpy as np
import torch

from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModel
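
# Optional: the name of a model whose implementation exists in a local
# transformers checkout but has not been released yet. When set, the model
# class is imported directly from transformers.models.<name>.modular_<name>
# further down.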
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
parser.add_argument('--prompts-file', '-p', help='Path to a file containing the prompt')
parser.add_argument('--use-sentence-transformers', action='store_true',
                    help='Use SentenceTransformer to apply all numbered layers (01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
args = parser.parse_args()
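
# Read the whole prompts file and use its contents as a single prompt.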
def read_prompt_from_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: Prompts file '{file_path}' not found")
        exit(1)
    except Exception as e:
        print(f"Error reading prompts file: {e}")
        exit(1)

model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
if model_path is None:
    parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")

# Determine if we should use SentenceTransformer
use_sentence_transformers = args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')

if use_sentence_transformers:
    from sentence_transformers import SentenceTransformer
    print("Using SentenceTransformer to apply all numbered layers")
    model = SentenceTransformer(model_path)
    tokenizer = model.tokenizer
    config = model[0].auto_model.config  # type: ignore
else:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    config = AutoConfig.from_pretrained(model_path)

    # This can be used to override the sliding window size for manual testing.
    # This can be useful to verify the sliding window attention mask in the
    # original model and compare it with the converted .gguf model.
    if hasattr(config, 'sliding_window'):
        original_sliding_window = config.sliding_window
        # Uncomment to override the sliding window size, e.g.:
        #config.sliding_window = 6
        if config.sliding_window != original_sliding_window:
            print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
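
    # Unreleased models cannot be loaded via AutoModel, so the model class is
    # imported directly from a local transformers source tree.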
    if unreleased_model_name:
        print(f"Using unreleased model: {unreleased_model_name}")
        model_name_lower = unreleased_model_name.lower()
        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
        class_name = f"{unreleased_model_name}Model"
        print(f"Importing unreleased model module: {unreleased_module_path}")
        try:
            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
            model = model_class.from_pretrained(model_path, config=config)
        except (ImportError, AttributeError) as e:
            print(f"Failed to import or load model: {e}")
            exit(1)
    else:
        model = AutoModel.from_pretrained(model_path, config=config)
    print(f"Model class: {type(model)}")
    print(f"Model module: {type(model).__module__}")

# Verify the model is using the correct sliding window
if not use_sentence_transformers:
    if hasattr(model.config, 'sliding_window'):  # type: ignore
        print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
    else:
        print("Model config does not have sliding_window attribute")

model_name = os.path.basename(model_path)

if args.prompts_file:
    prompt_text = read_prompt_from_file(args.prompts_file)
    texts = [prompt_text]
else:
    texts = ["Hello world today"]

with torch.no_grad():
    if use_sentence_transformers:
        embeddings = model.encode(texts, convert_to_numpy=True)
        all_embeddings = embeddings  # Shape: [batch_size, hidden_size]
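        # model.encode() tokenizes internally; the tokenizer is run here only
        # so the token ids and strings can be printed for inspection.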
        encoded = tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        tokens = encoded['input_ids'][0]
        token_strings = tokenizer.convert_ids_to_tokens(tokens)
        for token_id, token_str in zip(tokens, token_strings):
            print(f"{token_id:6d} -> '{token_str}'")

        print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
        print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
    else:
        # Standard approach: use the base model output only
        encoded = tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        tokens = encoded['input_ids'][0]
        token_strings = tokenizer.convert_ids_to_tokens(tokens)
        for token_id, token_str in zip(tokens, token_strings):
            print(f"{token_id:6d} -> '{token_str}'")
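
        # Run the forward pass and keep the per-token hidden states (no
        # pooling), so they can be compared token by token with the output
        # of the converted model.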
        outputs = model(**encoded)
        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]
        all_embeddings = hidden_states[0].cpu().numpy()  # Shape: [seq_len, hidden_size]

        print(f"Hidden states shape: {hidden_states.shape}")
        print(f"All embeddings shape: {all_embeddings.shape}")
        print(f"Embedding dimension: {all_embeddings.shape[1]}")
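
# Normalize to a 2D [n_embd_count, n_embd] array so the printing and saving
# code below can treat the pooled and per-token cases uniformly.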
if len(all_embeddings.shape) == 1:
    n_embd = all_embeddings.shape[0]  # type: ignore
    n_embd_count = 1
    all_embeddings = all_embeddings.reshape(1, -1)
else:
    n_embd = all_embeddings.shape[1]  # type: ignore
    n_embd_count = all_embeddings.shape[0]  # type: ignore

print()
for j in range(n_embd_count):
    embedding = all_embeddings[j]
    print(f"embedding {j}: ", end="")
    # Print the first 3 values
    for i in range(min(3, n_embd)):
        print(f"{embedding[i]:9.6f} ", end="")
    print(" ... ", end="")
    # Print the last 3 values (guard against very small embeddings)
    for i in range(max(n_embd - 3, 0), n_embd):
        print(f"{embedding[i]:9.6f} ", end="")
    print()
print()
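# Save the embeddings under data/ in both binary (float32) and text form,
# e.g. so they can be compared against the converted model's output.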
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"

flattened_embeddings = all_embeddings.flatten()
flattened_embeddings.astype(np.float32).tofile(bin_filename)

with open(txt_filename, "w") as f:
    idx = 0
    for j in range(n_embd_count):
        for value in all_embeddings[j]:
            f.write(f"{idx}: {value:.6f}\n")
            idx += 1

print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
print()
print(f"Saved bin embeddings to: {bin_filename}")
print(f"Saved txt embeddings to: {txt_filename}")
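
# Example invocations (the model and prompt paths are illustrative):
#   ./run-original-model.py -m models/my-embedding-model
#   EMBEDDING_MODEL_PATH=models/my-embedding-model ./run-original-model.py -p prompt.txt
#   ./run-original-model.py -m models/my-embedding-model --use-sentence-transformers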