run-original-model.py 4.1 KB

#!/usr/bin/env python3

import argparse
import os
import numpy as np
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch

unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()

model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
if model_path is None:
    parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")

tokenizer = AutoTokenizer.from_pretrained(model_path)

if unreleased_model_name:
    model_name_lower = unreleased_model_name.lower()
    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
    class_name = f"{unreleased_model_name}Model"
    print(f"Importing unreleased model module: {unreleased_module_path}")
    try:
        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
        model = model_class.from_pretrained(model_path)  # Note: from_pretrained, not fromPretrained
    except (ImportError, AttributeError) as e:
        print(f"Failed to import or load model: {e}")
        exit(1)
else:
    model = AutoModel.from_pretrained(model_path)

print(f"Model class: {type(model)}")
#print(f"Model file: {type(model).__module__}")

config = AutoConfig.from_pretrained(model_path)
model_name = os.path.basename(model_path)

texts = ["Hello world today"]

encoded = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

tokens = encoded['input_ids'][0]
token_strings = tokenizer.convert_ids_to_tokens(tokens)
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
    print(f"{token_id:6d} -> '{token_str}'")

with torch.no_grad():
    outputs = model(**encoded)
    hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]

    # Extract embeddings for each token (matching LLAMA_POOLING_TYPE_NONE behavior)
    all_embeddings = hidden_states[0].cpu().numpy()  # Shape: [seq_len, hidden_size]

    print(f"Hidden states shape: {hidden_states.shape}")
    print(f"All embeddings shape: {all_embeddings.shape}")
    print(f"Embedding dimension: {all_embeddings.shape[1]}")

    # Print embeddings exactly like embedding.cpp does for LLAMA_POOLING_TYPE_NONE
    n_embd = all_embeddings.shape[1]
    n_embd_count = all_embeddings.shape[0]

    print()  # Empty line to match C++ output

    for j in range(n_embd_count):
        embedding = all_embeddings[j]
        print(f"embedding {j}: ", end="")

        # Print first 3 values
        for i in range(min(3, n_embd)):
            print(f"{embedding[i]:9.6f} ", end="")

        print(" ... ", end="")

        # Print last 3 values
        for i in range(n_embd - 3, n_embd):
            print(f"{embedding[i]:9.6f} ", end="")

        print()  # New line

    print()  # Final empty line to match C++ output

    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"

    # Save all embeddings flattened (matching what embedding.cpp would save if it did)
    flattened_embeddings = all_embeddings.flatten()
    flattened_embeddings.astype(np.float32).tofile(bin_filename)

    with open(txt_filename, "w") as f:
        f.write(f"# Model class: {model_name}\n")
        f.write(f"# Tokens: {token_strings}\n")
        f.write(f"# Shape: {all_embeddings.shape}\n")
        f.write(f"# n_embd_count: {n_embd_count}, n_embd: {n_embd}\n\n")
        for j in range(n_embd_count):
            f.write(f"# Token {j} ({token_strings[j]}):\n")
            for i, value in enumerate(all_embeddings[j]):
                f.write(f"{j}_{i}: {value:.6f}\n")
            f.write("\n")

    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} tokens × {n_embd} dimensions)")
    print("")
    print(f"Saved bin embeddings to: {bin_filename}")
    print(f"Saved txt embeddings to: {txt_filename}")