package tokenizer

import "strings"

// GPT-2 style byte-to-unicode mapping.
// This allows representing all bytes as printable unicode characters,
// so byte-level BPE can operate on ordinary strings.
// buildByteEncoder creates the byte <-> unicode mapping used in GPT-2/Qwen tokenizers.
func buildByteEncoder() (encoder map[byte]rune, decoder map[rune]byte) {
	encoder = make(map[byte]rune)
	decoder = make(map[rune]byte)

	// Printable ASCII and printable Latin-1 bytes map to themselves;
	// all other bytes map to unicode code points starting at U+0100.
	n := 0
	for b := 0; b < 256; b++ {
		// Printable ASCII ('!'..'~') and printable Latin-1, excluding
		// NBSP (0xA0) and soft hyphen (0xAD).
		if (b >= 33 && b <= 126) || (b >= 161 && b <= 172) || (b >= 174 && b <= 255) {
			encoder[byte(b)] = rune(b)
			decoder[rune(b)] = byte(b)
		} else {
			// Map to U+0100 + n.
			encoder[byte(b)] = rune(256 + n)
			decoder[rune(256+n)] = byte(b)
			n++
		}
	}
	return encoder, decoder
}
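
// Illustrative sketch (not part of the original file): the two maps form a
// bijection over all 256 byte values, so every byte round-trips. For example,
// the newline byte 0x0A is not printable and lands at U+010A ('Ċ'), since ten
// non-printable bytes (0x00..0x09) precede it:
//
//	enc, dec := buildByteEncoder()
//	r := enc[0x0A] // 'Ċ' (U+010A)
//	b := dec[r]    // 0x0A again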
// bytesToTokens converts raw bytes to the unicode representation used by the BPE vocab.
func (t *Tokenizer) bytesToTokens(b []byte) string {
	var sb strings.Builder
	for _, c := range b {
		sb.WriteRune(t.byteEncoder[c])
	}
	return sb.String()
}
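
// Illustrative sketch (assumes t.byteEncoder was populated by buildByteEncoder;
// that wiring lives elsewhere in the package): the space byte 0x20 is not in
// the printable set, so it is remapped to U+0120 ('Ġ'), which is why GPT-2
// style vocabularies show leading-space tokens as "Ġword":
//
//	s := t.bytesToTokens([]byte("hello world")) // "helloĠworld"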
// tokensToBytes converts unicode-encoded tokens back to raw bytes.
func (t *Tokenizer) tokensToBytes(text string) []byte {
	var result []byte
	for _, r := range text {
		if b, ok := t.byteDecoder[r]; ok {
			result = append(result, b)
		} else if r < 256 {
			// Fallback: pass the rune through as a byte if it fits in the
			// Latin-1 range but is missing from the decoder table.
			result = append(result, byte(r))
		}
	}
	return result
}
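
// Illustrative sketch (same assumptions as above): tokensToBytes inverts
// bytesToTokens, so arbitrary byte sequences, including multi-byte UTF-8,
// survive the round trip unchanged:
//
//	raw := []byte("héllo\n")
//	enc := t.bytesToTokens(raw)  // "hÃ©lloĊ" in the remapped alphabet
//	back := t.tokensToBytes(enc) // bytes.Equal(raw, back) == true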