// unicode.h
#pragma once

#include <cstddef>
#include <cstdint>
#include <string_view>
// UTF-8 parsing utilities for streaming-aware Unicode support.
  5. struct utf8_parse_result {
  6. uint32_t codepoint; // Decoded codepoint (only valid if status == SUCCESS)
  7. size_t bytes_consumed; // How many bytes this codepoint uses (1-4)
  8. enum status { SUCCESS, INCOMPLETE, INVALID } status;
  9. utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0)
  10. : codepoint(cp), bytes_consumed(bytes), status(s) {}
  11. };
  12. // Determine the expected length of a UTF-8 sequence from its first byte
  13. // Returns 0 for invalid first bytes
  14. size_t utf8_sequence_length(unsigned char first_byte);
  15. // Parse a single UTF-8 codepoint from input
  16. utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);