#!/usr/bin/env bash
#
# Tokenize <input> with both the Python AutoTokenizer script and the llama.cpp
# tokenizer test binary, print each side's "tokenized in" line from its log,
# then diff the two token dumps.
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#
#   <name>   selects ./models/tokenizers/<name> (py side) and
#            ./models/ggml-vocab-<name>.gguf (cpp side)
#   <input>  file to tokenize; <input>.tok and <input>.tokcpp are compared
#            afterwards (presumably written by the py and cpp runs
#            respectively -- confirm against those programs)
#
# Logs are written to /tmp/test-tokenizer-0-<name>-{py,cpp}.log.
#
# Exit status: 1 on bad usage; non-zero if a tokenizer run fails or a log has
# no "tokenized in" line; 0 otherwise -- including when the tokenizations
# differ (the result is only reported on stdout).
#

if [[ $# -ne 2 ]]; then
  # Pass $0 as an argument, not as part of the format string, so a '%' or
  # backslash in the script path cannot corrupt the message.
  printf 'Usage: %s <name> <input>\n' "$0" >&2
  exit 1
fi

name=$1
input=$2

py_log="/tmp/test-tokenizer-0-${name}-py.log"
cpp_log="/tmp/test-tokenizer-0-${name}-cpp.log"

# Report a failed tokenizer run and exit with its status. The command's own
# output went to the log file, so without this the script would stop silently
# under `set -e`.
#   $1 - exit status of the failed command
#   $2 - path of the log file holding its output
fail_with_log() {
  local status=$1
  local log=$2
  printf 'Command failed with exit status %d; see %s\n' "$status" "$log" >&2
  exit "$status"
}

# Intentionally run before `set -e`: a failed build does not abort the script
# (make reports its own errors), and an already-built binary is used as-is.
make -j tests/test-tokenizer-0

printf 'Testing %s on %s ...\n' "$name" "$input"

set -e

printf 'Tokenizing using (py) Python AutoTokenizer ...\n'
python3 ./tests/test-tokenizer-0.py "./models/tokenizers/${name}" --fname-tok "$input" \
  > "$py_log" 2>&1 || fail_with_log "$?" "$py_log"

printf 'Tokenizing using (cpp) llama.cpp ...\n'
./tests/test-tokenizer-0 "./models/ggml-vocab-${name}.gguf" "$input" \
  > "$cpp_log" 2>&1 || fail_with_log "$?" "$cpp_log"

# Still under `set -e`: a log without a "tokenized in" line aborts the script.
grep "tokenized in" "$py_log"
grep "tokenized in" "$cpp_log"

# From here on a non-zero status (diff reporting differences) is expected.
set +e

if diff -- "${input}.tok" "${input}.tokcpp" > /dev/null 2>&1; then
  printf 'Tokenization is correct!\n'
else
  # Show only the first 32 lines of the diff to keep the output readable.
  diff -- "${input}.tok" "${input}.tokcpp" | head -n 32
  printf 'Tokenization differs!\n'
fi