// Package tokenizer provides BPE tokenization for LLM models
package tokenizer

import (
	"regexp"
	"sort"
	"strings"
)

// Tokenizer handles text tokenization using byte-level BPE
type Tokenizer struct {
	vocab       map[string]int // token -> id
	idToToken   map[int]string // id -> token
	merges      map[string]int // "a b" -> rank
	byteEncoder map[byte]rune  // byte -> unicode
	byteDecoder map[rune]byte  // unicode -> byte
	addedTokens map[string]int // special tokens
	eosID       int            // end of sequence token
	prePattern  *regexp.Regexp // pre-tokenizer regex
}

// Encode converts text to token IDs
func (t *Tokenizer) Encode(text string) []int {
	var ids []int

	// Split on special tokens first
	segments := t.splitOnSpecialTokens(text)

	// Process each segment
	for _, seg := range segments {
		if seg.isSpecial {
			ids = append(ids, seg.id)
		} else {
			ids = append(ids, t.encodeText(seg.text)...)
		}
	}
	return ids
}
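
// NOTE: the segment type and splitOnSpecialTokens are referenced above but are
// not part of this excerpt. What follows is a minimal, hedged sketch of what
// they might look like; the actual definitions in the package may differ, and
// the names and fields here are assumptions.

// segment is one piece of the input: either a literal special token carrying
// its ID, or a run of ordinary text to be BPE-encoded.
type segment struct {
	text      string
	id        int
	isSpecial bool
}

// splitOnSpecialTokens splits text around occurrences of added/special token
// strings so Encode can emit their IDs verbatim. Map iteration order is not
// significant unless special tokens overlap.
func (t *Tokenizer) splitOnSpecialTokens(text string) []segment {
	segs := []segment{{text: text}}
	for tok, id := range t.addedTokens {
		if tok == "" {
			continue
		}
		var next []segment
		for _, s := range segs {
			if s.isSpecial {
				next = append(next, s)
				continue
			}
			rest := s.text
			for {
				i := strings.Index(rest, tok)
				if i < 0 {
					if rest != "" {
						next = append(next, segment{text: rest})
					}
					break
				}
				if i > 0 {
					next = append(next, segment{text: rest[:i]})
				}
				next = append(next, segment{id: id, isSpecial: true})
				rest = rest[i+len(tok):]
			}
		}
		segs = next
	}
	return segs
}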
// encodeText tokenizes regular text (non-special tokens)
func (t *Tokenizer) encodeText(text string) []int {
	var ids []int

	// Pre-tokenization
	var chunks []string
	if t.prePattern != nil {
		chunks = t.prePattern.FindAllString(text, -1)
	} else {
		chunks = []string{text}
	}

	// BPE for each chunk
	for _, chunk := range chunks {
		byteRep := t.bytesToTokens([]byte(chunk))
		tokens := t.bpe(byteRep)
		for _, tok := range tokens {
			if id, ok := t.vocab[tok]; ok {
				ids = append(ids, id)
			}
		}
	}
	return ids
}
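
// NOTE: bytesToTokens and bpe are called above but not shown in this excerpt.
// Below is a hedged sketch of a straightforward implementation (GPT-2 style
// byte-level encoding plus greedy rank-ordered merging); the package's real
// code may be organized differently.

// bytesToTokens maps each raw byte to its printable unicode stand-in via
// byteEncoder, producing one initial single-rune token per byte.
func (t *Tokenizer) bytesToTokens(b []byte) []string {
	out := make([]string, len(b))
	for i, c := range b {
		out[i] = string(t.byteEncoder[c])
	}
	return out
}

// bpe repeatedly merges the adjacent token pair with the lowest merge rank
// (keys in t.merges are "a b") until no adjacent pair has a recorded merge.
func (t *Tokenizer) bpe(tokens []string) []string {
	for len(tokens) > 1 {
		best, bestRank := -1, 0
		for i := 0; i < len(tokens)-1; i++ {
			if rank, ok := t.merges[tokens[i]+" "+tokens[i+1]]; ok {
				if best == -1 || rank < bestRank {
					best, bestRank = i, rank
				}
			}
		}
		if best == -1 {
			break // no applicable merge left
		}
		// Merge the chosen pair into a single token and drop the right half.
		tokens[best] += tokens[best+1]
		tokens = append(tokens[:best+1], tokens[best+2:]...)
	}
	return tokens
}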
// Decode converts token IDs back to text
func (t *Tokenizer) Decode(ids []int) string {
	var tokens []string
	for _, id := range ids {
		if tok, ok := t.idToToken[id]; ok {
			tokens = append(tokens, tok)
		}
	}
	text := strings.Join(tokens, "")
	return string(t.tokensToBytes(text))
}
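
// NOTE: tokensToBytes is used by Decode but not shown in this excerpt. A
// hedged sketch of the inverse byte-level mapping is given here.

// tokensToBytes maps every rune of the joined token string back to its
// original byte via byteDecoder, dropping runes with no mapping.
func (t *Tokenizer) tokensToBytes(s string) []byte {
	out := make([]byte, 0, len(s))
	for _, r := range s {
		if b, ok := t.byteDecoder[r]; ok {
			out = append(out, b)
		}
	}
	return out
}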
// EosID returns the end-of-sequence token ID
func (t *Tokenizer) EosID() int {
	return t.eosID
}

// VocabSize returns the vocabulary size
func (t *Tokenizer) VocabSize() int {
	return len(t.vocab) + len(t.addedTokens)
}

// GetToken returns the token string for a given ID
func (t *Tokenizer) GetToken(id int) (string, bool) {
	tok, ok := t.idToToken[id]
	return tok, ok
}
// AddedTokenStrings returns the added (special) token strings in sorted order
func (t *Tokenizer) AddedTokenStrings() []string {
	out := make([]string, 0, len(t.addedTokens))
	for s := range t.addedTokens {
		out = append(out, s)
	}
	sort.Strings(out)
	return out
}
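
// Illustrative usage (hedged; this excerpt does not show how a Tokenizer is
// constructed, so loadTokenizer below is a hypothetical placeholder):
//
//	tok := loadTokenizer(path)          // hypothetical loader
//	ids := tok.Encode("Hello, world!")  // text -> token IDs
//	text := tok.Decode(ids)             // round-trips back to the text
//	ids = append(ids, tok.EosID())      // terminate a sequence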