compare-embeddings-logits.sh

#!/usr/bin/env bash
set -e
# Args (all optional): <model_path> <model_name> <converted_model_path> <converted_model_name>.
# Missing arguments fall back to the MODEL_PATH and CONVERTED_MODEL environment variables.
MODEL_PATH="${1:-"$MODEL_PATH"}"
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
CONVERTED_MODEL_PATH="${3:-"$CONVERTED_MODEL"}"
CONVERTED_MODEL_NAME="${4:-$(basename "$CONVERTED_MODEL_PATH" ".gguf")}"
# If stdin is a terminal, read embeddings from the pre-generated binary file;
# otherwise convert piped JSON data to the same binary layout.
if [ -t 0 ]; then
    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
else
    # Process piped JSON data and convert to binary (matching logits.cpp format)
    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.bin)
    trap 'rm -f "$TEMP_FILE"' EXIT
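    # Expected stdin shape (an assumption inferred from the flattening below):
    # a JSON array of results, each carrying a per-token embedding matrix, e.g.
    #   [{"embedding": [[0.1, 0.2, ...], [0.3, 0.4, ...]]}, ...]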
    python3 -c "
import json
import sys
import struct

data = json.load(sys.stdin)

# Flatten all embeddings completely
flattened = []
for item in data:
    embedding = item['embedding']
    for token_embedding in embedding:
        flattened.extend(token_embedding)

print(f'Total embedding values: {len(flattened)}', file=sys.stderr)

# Write as binary floats - matches the logits.cpp fwrite format
with open('$TEMP_FILE', 'wb') as f:
    for value in flattened:
        f.write(struct.pack('f', value))
"
    CPP_EMBEDDINGS="$TEMP_FILE"
fi
python scripts/utils/semantic_check.py --model-path "$MODEL_PATH" \
    --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin" \
    --cpp-embeddings "$CPP_EMBEDDINGS" \
    --prompt "Hello world today" \
    --causal
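
# Example usage (a sketch; the model paths and JSON file below are hypothetical,
# and the piped JSON is assumed to match the shape documented above):
#
#   # Compare against pre-generated binaries in data/:
#   MODEL_PATH=models/my-model CONVERTED_MODEL=models/my-model.gguf \
#       ./compare-embeddings-logits.sh
#
#   # Or pipe JSON embeddings in, converting them to binary on the fly:
#   cat embeddings.json | MODEL_PATH=models/my-model ./compare-embeddings-logits.sh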