unicode.cpp 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. #include "unicode.h"
  2. // implementation adopted from src/unicode.cpp
  // Returns the length, in bytes, that a UTF-8 sequence is expected to have,
  // judged solely from the upper four bits of its first byte:
  //
  //   high nibble 0x0-0xB -> 1   (ASCII; continuation bytes 10xxxxxx also land here)
  //   high nibble 0xC-0xD -> 2
  //   high nibble 0xE     -> 3
  //   high nibble 0xF     -> 4   (this includes 0xF8-0xFF)
  //
  // No validation is performed: the result is always in the range 1-4.
  size_t utf8_sequence_length(unsigned char first_byte) {
      switch (first_byte >> 4) {
          case 0xF:
              return 4;
          case 0xE:
              return 3;
          case 0xC:
          case 0xD:
              return 2;
          default:
              return 1;
      }
  }
  8. utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
  9. if (offset >= input.size()) {
  10. return utf8_parse_result(utf8_parse_result::INCOMPLETE);
  11. }
  12. // ASCII fast path
  13. if (!(input[offset] & 0x80)) {
  14. return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
  15. }
  16. // Invalid: continuation byte as first byte
  17. if (!(input[offset] & 0x40)) {
  18. return utf8_parse_result(utf8_parse_result::INVALID);
  19. }
  20. // 2-byte sequence
  21. if (!(input[offset] & 0x20)) {
  22. if (offset + 1 >= input.size()) {
  23. return utf8_parse_result(utf8_parse_result::INCOMPLETE);
  24. }
  25. if ((input[offset + 1] & 0xc0) != 0x80) {
  26. return utf8_parse_result(utf8_parse_result::INVALID);
  27. }
  28. auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f);
  29. return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2);
  30. }
  31. // 3-byte sequence
  32. if (!(input[offset] & 0x10)) {
  33. if (offset + 2 >= input.size()) {
  34. return utf8_parse_result(utf8_parse_result::INCOMPLETE);
  35. }
  36. if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) {
  37. return utf8_parse_result(utf8_parse_result::INVALID);
  38. }
  39. auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f);
  40. return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3);
  41. }
  42. // 4-byte sequence
  43. if (!(input[offset] & 0x08)) {
  44. if (offset + 3 >= input.size()) {
  45. return utf8_parse_result(utf8_parse_result::INCOMPLETE);
  46. }
  47. if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) {
  48. return utf8_parse_result(utf8_parse_result::INVALID);
  49. }
  50. auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f);
  51. return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4);
  52. }
  53. // Invalid first byte
  54. return utf8_parse_result(utf8_parse_result::INVALID);
  55. }