// Package tokenizer provides BPE tokenization for LLM models
package tokenizer

import (
	"regexp"
	"sort"
	"strings"
)

// Tokenizer handles text tokenization using byte-level BPE
type Tokenizer struct {
	vocab       map[string]int // token -> id
	idToToken   map[int]string // id -> token
	merges      map[string]int // "a b" -> rank
	byteEncoder map[byte]rune  // byte -> unicode
	byteDecoder map[rune]byte  // unicode -> byte
	addedTokens map[string]int // special tokens
	eosID       int            // end-of-sequence token ID
	prePattern  *regexp.Regexp // pre-tokenizer regex
}

// Encode converts text to token IDs
func (t *Tokenizer) Encode(text string) []int {
	var ids []int

	// Split on special tokens first, so they map directly to their IDs
	// instead of being run through BPE
	segments := t.splitOnSpecialTokens(text)

	// Process each segment
	for _, seg := range segments {
		if seg.isSpecial {
			ids = append(ids, seg.id)
		} else {
			ids = append(ids, t.encodeText(seg.text)...)
		}
	}
	return ids
}

// encodeText tokenizes regular text (non-special tokens)
func (t *Tokenizer) encodeText(text string) []int {
	var ids []int

	// Pre-tokenization: split the text into chunks for the BPE loop
	var chunks []string
	if t.prePattern != nil {
		chunks = t.prePattern.FindAllString(text, -1)
	} else {
		chunks = []string{text}
	}

	// BPE for each chunk; tokens missing from the vocab are silently dropped
	for _, chunk := range chunks {
		byteRep := t.bytesToTokens([]byte(chunk))
		tokens := t.bpe(byteRep)
		for _, tok := range tokens {
			if id, ok := t.vocab[tok]; ok {
				ids = append(ids, id)
			}
		}
	}
	return ids
}

// Decode converts token IDs back to text
func (t *Tokenizer) Decode(ids []int) string {
	var tokens []string
	for _, id := range ids {
		if tok, ok := t.idToToken[id]; ok {
			tokens = append(tokens, tok)
		}
	}
	text := strings.Join(tokens, "")
	return string(t.tokensToBytes(text))
}

// EosID returns the end-of-sequence token ID
func (t *Tokenizer) EosID() int {
	return t.eosID
}

// VocabSize returns the vocabulary size, including added special tokens
func (t *Tokenizer) VocabSize() int {
	return len(t.vocab) + len(t.addedTokens)
}

// GetToken returns the token string for a given ID
func (t *Tokenizer) GetToken(id int) (string, bool) {
	tok, ok := t.idToToken[id]
	return tok, ok
}

// AddedTokenStrings returns the special token strings in sorted order
func (t *Tokenizer) AddedTokenStrings() []string {
	out := make([]string, 0, len(t.addedTokens))
	for s := range t.addedTokens {
		out = append(out, s)
	}
	sort.Strings(out)
	return out
}
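
// The helpers below are NOT part of the original excerpt. They are minimal
// sketches of the unexported methods that Encode and Decode call, written so
// the file is self-contained; the segment field names (text, id, isSpecial)
// come from Encode, but the scanning logic itself is an assumption.

// segment is one piece of the input: either a run of ordinary text or a
// single special token that maps directly to its ID.
type segment struct {
	text      string
	id        int
	isSpecial bool
}

// splitOnSpecialTokens splits text into alternating plain-text and
// special-token segments. Sketch only: a naive earliest-match scan that
// breaks ties in favor of the longest special token.
func (t *Tokenizer) splitOnSpecialTokens(text string) []segment {
	var segs []segment
	for len(text) > 0 {
		// Find the earliest (and, on position ties, longest) special token
		bestID, bestPos, bestLen := -1, len(text), 0
		for tok, id := range t.addedTokens {
			if pos := strings.Index(text, tok); pos >= 0 {
				if pos < bestPos || (pos == bestPos && len(tok) > bestLen) {
					bestID, bestPos, bestLen = id, pos, len(tok)
				}
			}
		}
		if bestID < 0 {
			segs = append(segs, segment{text: text})
			break
		}
		if bestPos > 0 {
			segs = append(segs, segment{text: text[:bestPos]})
		}
		segs = append(segs, segment{id: bestID, isSpecial: true})
		text = text[bestPos+bestLen:]
	}
	return segs
}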
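
// bytesToTokens and tokensToBytes are likewise sketches, not the original
// implementations. They assume byteEncoder/byteDecoder hold the GPT-2-style
// byte-to-unicode tables, giving BPE a lossless string form of arbitrary bytes.

// bytesToTokens maps each raw byte to its printable unicode stand-in
func (t *Tokenizer) bytesToTokens(b []byte) string {
	var sb strings.Builder
	for _, c := range b {
		sb.WriteRune(t.byteEncoder[c])
	}
	return sb.String()
}

// tokensToBytes reverses bytesToTokens. Runes without a byteDecoder entry
// (e.g. special tokens decoded verbatim) are passed through as UTF-8.
func (t *Tokenizer) tokensToBytes(s string) []byte {
	var out []byte
	for _, r := range s {
		if b, ok := t.byteDecoder[r]; ok {
			out = append(out, b)
		} else {
			out = append(out, string(r)...)
		}
	}
	return out
}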
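
// bpe is a sketch of the standard greedy merge loop, assuming merges is keyed
// by "left right" pairs as the struct comment indicates: repeatedly merge the
// adjacent pair with the lowest rank until no recorded pair remains. This is
// the plain O(n^2) formulation, not an optimized one.
func (t *Tokenizer) bpe(word string) []string {
	// Start from individual runes (the byte stand-ins)
	parts := strings.Split(word, "")
	for len(parts) > 1 {
		// Find the adjacent pair with the lowest merge rank
		bestRank, bestIdx := -1, -1
		for i := 0; i < len(parts)-1; i++ {
			if rank, ok := t.merges[parts[i]+" "+parts[i+1]]; ok {
				if bestIdx < 0 || rank < bestRank {
					bestRank, bestIdx = rank, i
				}
			}
		}
		if bestIdx < 0 {
			break // no applicable merges left
		}
		// Merge the winning pair in place
		merged := parts[bestIdx] + parts[bestIdx+1]
		parts = append(parts[:bestIdx+1], parts[bestIdx+2:]...)
		parts[bestIdx] = merged
	}
	return parts
}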