// byte_encoder.go
  1. package tokenizer
  2. import "strings"
  3. // ByteEncoder handles GPT-2 style byte-to-unicode mapping
  4. // This allows representing all bytes as printable unicode characters
  5. // buildByteEncoder creates the byte <-> unicode mapping used in GPT-2/Qwen tokenizers
  6. func buildByteEncoder() (encoder map[byte]rune, decoder map[rune]byte) {
  7. encoder = make(map[byte]rune)
  8. decoder = make(map[rune]byte)
  9. // GPT-2 byte encoder: maps bytes to unicode chars
  10. // Printable ASCII bytes map to themselves
  11. // Others map to unicode starting at U+0100
  12. n := 0
  13. for b := 0; b < 256; b++ {
  14. // Printable ASCII and some extended
  15. if (b >= 33 && b <= 126) || (b >= 161 && b <= 172) || (b >= 174 && b <= 255) {
  16. encoder[byte(b)] = rune(b)
  17. decoder[rune(b)] = byte(b)
  18. } else {
  19. // Map to U+0100 + n
  20. encoder[byte(b)] = rune(256 + n)
  21. decoder[rune(256+n)] = byte(b)
  22. n++
  23. }
  24. }
  25. return encoder, decoder
  26. }
  27. // bytesToTokens converts bytes to unicode representation used by BPE vocab
  28. func (t *Tokenizer) bytesToTokens(b []byte) string {
  29. var sb strings.Builder
  30. for _, c := range b {
  31. sb.WriteRune(t.byteEncoder[c])
  32. }
  33. return sb.String()
  34. }
  35. // tokensToBytes converts unicode-encoded tokens back to bytes
  36. func (t *Tokenizer) tokensToBytes(text string) []byte {
  37. var result []byte
  38. for _, r := range text {
  39. if b, ok := t.byteDecoder[r]; ok {
  40. result = append(result, b)
  41. } else if r < 256 {
  42. // Fallback: direct rune to byte if within ASCII
  43. result = append(result, byte(r))
  44. }
  45. }
  46. return result
  47. }