| 1234567891011121314151617181920212223242526272829303132333435363738394041 |
#!/bin/bash
#
# Run the same input file through the Python tokenizer script and the
# llama.cpp tokenizer test binary, then compare the two token dumps.
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#
# Arguments:
#   <name>   selects ./models/tokenizers/<name> (passed to the Python script)
#            and ./models/ggml-vocab-<name>.gguf (passed to the C++ binary)
#   <input>  text file handed to both tools; the comparison is done on
#            <input>.tok and <input>.tokcpp (presumably written by the Python
#            and C++ tools respectively -- confirm against those tools)
#
# Logs:
#   /tmp/test-tokenizer-0-<name>-py.log   stdout+stderr of the Python run
#   /tmp/test-tokenizer-0-<name>-cpp.log  stdout+stderr of the C++ run
#
# Exit status:
#   0   token dumps are identical
#   1   bad usage, a tokenizer run failed, or the token dumps differ
#   >1  diff itself could not compare the dumps (e.g. a dump is missing)
#

if [ $# -ne 2 ]; then
    # Pass $0 as an argument, never as part of the format string.
    printf 'Usage: %s <name> <input>\n' "$0" >&2
    exit 1
fi

name=$1
input=$2

py_log="/tmp/test-tokenizer-0-${name}-py.log"
cpp_log="/tmp/test-tokenizer-0-${name}-cpp.log"

# The build step is best-effort (the binary may have been produced by another
# build system), but a failure is reported instead of being silently ignored.
if ! make -j tests/test-tokenizer-0; then
    printf 'warning: "make -j tests/test-tokenizer-0" failed; using existing ./tests/test-tokenizer-0 if present\n' >&2
fi

printf 'Testing %s on %s ...\n' "$name" "$input"

set -e

printf 'Tokenizing using (py) Python AutoTokenizer ...\n'
python3 ./tests/test-tokenizer-0.py "./models/tokenizers/${name}" --fname-tok "$input" > "$py_log" 2>&1 || {
    # All tool output went to the log, so point the user at it before bailing.
    printf 'error: Python tokenizer failed, see %s\n' "$py_log" >&2
    exit 1
}

printf 'Tokenizing using (cpp) llama.cpp ...\n'
./tests/test-tokenizer-0 "./models/ggml-vocab-${name}.gguf" "$input" > "$cpp_log" 2>&1 || {
    printf 'error: llama.cpp tokenizer failed, see %s\n' "$cpp_log" >&2
    exit 1
}

# Show the timing line from each log; a missing line aborts the run (set -e).
grep "tokenized in" "$py_log"
grep "tokenized in" "$cpp_log"

set +e

# diff exits 0 when identical, 1 when different, >1 on trouble.
diff -- "${input}.tok" "${input}.tokcpp" > /dev/null 2>&1
diff_status=$?

case $diff_status in
    0)
        printf 'Tokenization is correct!\n'
        ;;
    1)
        # Show only the beginning of the diff to keep the output readable.
        diff -- "${input}.tok" "${input}.tokcpp" | head -n 32
        printf 'Tokenization differs!\n'
        exit 1
        ;;
    *)
        # Re-run without silencing stderr so diff's own error message is shown.
        diff -- "${input}.tok" "${input}.tokcpp" > /dev/null
        printf 'error: could not compare %s and %s\n' "${input}.tok" "${input}.tokcpp" >&2
        exit "$diff_status"
        ;;
esac
|