#!/usr/bin/env bash
# Compare llama.cpp embeddings against the PyTorch reference embeddings by
# delegating to scripts/utils/semantic_check.py.
#
# Usage:
#   run.sh [MODEL_PATH] [MODEL_NAME]            # uses pre-saved embeddings file
#   producer | run.sh [MODEL_PATH] [MODEL_NAME] # converts piped JSON embeddings
#
# MODEL_PATH may alternatively be supplied via the environment.
set -euo pipefail

MODEL_PATH="${1:-${MODEL_PATH:-}}"
if [[ -z "$MODEL_PATH" ]]; then
  echo "error: MODEL_PATH must be given as \$1 or set in the environment" >&2
  exit 1
fi
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"

if [ -t 0 ]; then
  # stdin is a terminal: use the pre-generated llama.cpp embeddings file.
  CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
else
  # Process piped JSON data and convert to binary (matching logits.cpp format).
  # Note: mktemp template must end in the X run, so no suffix is used here.
  TEMP_FILE="$(mktemp)"
  trap 'rm -f "$TEMP_FILE"' EXIT
  # The output path is passed via argv (not interpolated into the code) so
  # unusual characters in the path cannot break or inject into the script.
  python3 -c "
import json
import struct
import sys

out_path = sys.argv[1]
data = json.load(sys.stdin)

# Flatten all embeddings completely
flattened = []
for item in data:
    for token_embedding in item['embedding']:
        flattened.extend(token_embedding)

print(f'Total embedding values: {len(flattened)}', file=sys.stderr)

# Write as binary floats - matches logits.cpp fwrite format
with open(out_path, 'wb') as f:
    f.write(struct.pack(f'{len(flattened)}f', *flattened))
" "$TEMP_FILE"
  CPP_EMBEDDINGS="$TEMP_FILE"
fi

python scripts/utils/semantic_check.py --model-path "$MODEL_PATH" \
  --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin" \
  --cpp-embeddings "$CPP_EMBEDDINGS" \
  --prompt "Hello world today" \
  --causal