package tokenizer

import "strings"

// buildByteEncoder creates the byte <-> unicode mapping used by GPT-2/Qwen
// style tokenizers. Every byte value is mapped to a printable unicode rune,
// so arbitrary byte sequences can be represented as strings in a BPE vocab.
//
// Printable ASCII (0x21-0x7E) and two Latin-1 ranges (0xA1-0xAC, 0xAE-0xFF)
// map to themselves; the remaining 68 bytes (controls, space, etc.) map to
// consecutive code points starting at U+0100. This is why space (0x20) shows
// up as 'Ġ' (U+0120) and newline (0x0A) as 'Ċ' (U+010A) in GPT-2 vocabularies.
func buildByteEncoder() (encoder map[byte]rune, decoder map[rune]byte) {
	encoder = make(map[byte]rune, 256)
	decoder = make(map[rune]byte, 256)
	n := 0
	for b := 0; b < 256; b++ {
		if (b >= 33 && b <= 126) || (b >= 161 && b <= 172) || (b >= 174 && b <= 255) {
			// Printable byte: identity mapping.
			encoder[byte(b)] = rune(b)
			decoder[rune(b)] = byte(b)
		} else {
			// Non-printable byte: map to U+0100 + n.
			encoder[byte(b)] = rune(256 + n)
			decoder[rune(256+n)] = byte(b)
			n++
		}
	}
	return encoder, decoder
}

// bytesToTokens converts raw bytes to the unicode representation used by the
// BPE vocabulary.
func (t *Tokenizer) bytesToTokens(b []byte) string {
	var sb strings.Builder
	for _, c := range b {
		sb.WriteRune(t.byteEncoder[c])
	}
	return sb.String()
}

// tokensToBytes converts unicode-encoded token text back to raw bytes.
func (t *Tokenizer) tokensToBytes(text string) []byte {
	result := make([]byte, 0, len(text))
	for _, r := range text {
		if b, ok := t.byteDecoder[r]; ok {
			result = append(result, b)
		} else if r < 256 {
			// Fallback: runes below U+0100 that are somehow missing from the
			// decoder map are passed through as their raw byte value.
			result = append(result, byte(r))
		}
		// Runes >= U+0100 with no mapping are silently dropped.
	}
	return result
}
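
// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original file). The real
// Tokenizer presumably lives elsewhere in this package and carries the full
// BPE state; the minimal shape below exists only to show how buildByteEncoder
// wires into bytesToTokens / tokensToBytes for a lossless byte round trip.
// ---------------------------------------------------------------------------

// Tokenizer is an assumed minimal shape: just the two fields the methods
// above actually touch. The production struct would also hold the BPE vocab,
// merge ranks, and so on.
type Tokenizer struct {
	byteEncoder map[byte]rune
	byteDecoder map[rune]byte
}

// newByteTokenizer is a hypothetical constructor wiring the maps together.
func newByteTokenizer() *Tokenizer {
	enc, dec := buildByteEncoder()
	return &Tokenizer{byteEncoder: enc, byteDecoder: dec}
}

// roundTripDemo shows the mapping on concrete input: the UTF-8 bytes of
// "héllo world" encode to "hÃ©lloĠworld" (0xC3 0xA9 -> 'Ã' '©', space 0x20 ->
// 'Ġ'), and decoding recovers the original bytes exactly.
func roundTripDemo() bool {
	t := newByteTokenizer()
	in := []byte("héllo world")
	encoded := t.bytesToTokens(in)      // "hÃ©lloĠworld"
	decoded := t.tokensToBytes(encoded) // original bytes restored
	return string(decoded) == string(in)
}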