# test_tokenize.py — server /tokenize and /detokenize endpoint tests
  1. import pytest
  2. from utils import *
  3. server = ServerPreset.tinyllama2()
  4. @pytest.fixture(scope="module", autouse=True)
  5. def create_server():
  6. global server
  7. server = ServerPreset.tinyllama2()
  8. def test_tokenize_detokenize():
  9. global server
  10. server.start()
  11. # tokenize
  12. content = "What is the capital of France ?"
  13. res_tok = server.make_request("POST", "/tokenize", data={
  14. "content": content
  15. })
  16. assert res_tok.status_code == 200
  17. assert len(res_tok.body["tokens"]) > 5
  18. # detokenize
  19. res_detok = server.make_request("POST", "/detokenize", data={
  20. "tokens": res_tok.body["tokens"],
  21. })
  22. assert res_detok.status_code == 200
  23. assert res_detok.body["content"].strip() == content
  24. def test_tokenize_with_bos():
  25. global server
  26. server.start()
  27. # tokenize
  28. content = "What is the capital of France ?"
  29. bosId = 1
  30. res_tok = server.make_request("POST", "/tokenize", data={
  31. "content": content,
  32. "add_special": True,
  33. })
  34. assert res_tok.status_code == 200
  35. assert res_tok.body["tokens"][0] == bosId
  36. def test_tokenize_with_pieces():
  37. global server
  38. server.start()
  39. # tokenize
  40. content = "This is a test string with unicode 媽 and emoji 🤗"
  41. res_tok = server.make_request("POST", "/tokenize", data={
  42. "content": content,
  43. "with_pieces": True,
  44. })
  45. assert res_tok.status_code == 200
  46. for token in res_tok.body["tokens"]:
  47. assert "id" in token
  48. assert token["id"] > 0
  49. assert "piece" in token
  50. assert len(token["piece"]) > 0