// Package tokenizer provides BPE tokenization for LLM models
package tokenizer

import (
	"regexp"
	"sort"
	"strings"
)

// Tokenizer handles text tokenization using byte-level BPE
type Tokenizer struct {
	vocab       map[string]int // token -> id
	idToToken   map[int]string // id -> token
	merges      map[string]int // "a b" -> rank
	byteEncoder map[byte]rune  // byte -> unicode
	byteDecoder map[rune]byte  // unicode -> byte
	addedTokens map[string]int // special tokens
	eosID       int            // end of sequence token
	prePattern  *regexp.Regexp // pre-tokenizer regex
}

// Encode converts text to token IDs
func (t *Tokenizer) Encode(text string) []int {
	var ids []int

	// Split on special tokens first
	segments := t.splitOnSpecialTokens(text)

	// Process each segment
	for _, seg := range segments {
		if seg.isSpecial {
			ids = append(ids, seg.id)
		} else {
			ids = append(ids, t.encodeText(seg.text)...)
		}
	}
	return ids
}
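
// NOTE: the segment type and splitOnSpecialTokens are referenced above but are
// not part of this excerpt. What follows is a minimal, hedged sketch of what
// they might look like; the actual definitions in the package may differ, and
// the names and fields here are assumptions.

// segment is one piece of the input: either a literal special token carrying
// its ID, or a run of ordinary text to be BPE-encoded.
type segment struct {
	text      string
	id        int
	isSpecial bool
}

// splitOnSpecialTokens splits text around occurrences of added/special token
// strings so Encode can emit their IDs verbatim. Map iteration order is not
// significant unless special tokens overlap.
func (t *Tokenizer) splitOnSpecialTokens(text string) []segment {
	segs := []segment{{text: text}}
	for tok, id := range t.addedTokens {
		if tok == "" {
			continue
		}
		var next []segment
		for _, s := range segs {
			if s.isSpecial {
				next = append(next, s)
				continue
			}
			rest := s.text
			for {
				i := strings.Index(rest, tok)
				if i < 0 {
					if rest != "" {
						next = append(next, segment{text: rest})
					}
					break
				}
				if i > 0 {
					next = append(next, segment{text: rest[:i]})
				}
				next = append(next, segment{id: id, isSpecial: true})
				rest = rest[i+len(tok):]
			}
		}
		segs = next
	}
	return segs
}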
// encodeText tokenizes regular text (non-special tokens)
func (t *Tokenizer) encodeText(text string) []int {
	var ids []int

	// Pre-tokenization
	var chunks []string
	if t.prePattern != nil {
		chunks = t.prePattern.FindAllString(text, -1)
	} else {
		chunks = []string{text}
	}

	// BPE for each chunk
	for _, chunk := range chunks {
		byteRep := t.bytesToTokens([]byte(chunk))
		tokens := t.bpe(byteRep)
		for _, tok := range tokens {
			if id, ok := t.vocab[tok]; ok {
				ids = append(ids, id)
			}
		}
	}
	return ids
}
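
// NOTE: bytesToTokens and bpe are called above but not shown in this excerpt.
// Below is a hedged sketch of a straightforward implementation (GPT-2 style
// byte-level encoding plus greedy rank-ordered merging); the package's real
// code may be organized differently.

// bytesToTokens maps each raw byte to its printable unicode stand-in via
// byteEncoder, producing one initial single-rune token per byte.
func (t *Tokenizer) bytesToTokens(b []byte) []string {
	out := make([]string, len(b))
	for i, c := range b {
		out[i] = string(t.byteEncoder[c])
	}
	return out
}

// bpe repeatedly merges the adjacent token pair with the lowest merge rank
// (keys in t.merges are "a b") until no adjacent pair has a recorded merge.
func (t *Tokenizer) bpe(tokens []string) []string {
	for len(tokens) > 1 {
		best, bestRank := -1, 0
		for i := 0; i < len(tokens)-1; i++ {
			if rank, ok := t.merges[tokens[i]+" "+tokens[i+1]]; ok {
				if best == -1 || rank < bestRank {
					best, bestRank = i, rank
				}
			}
		}
		if best == -1 {
			break // no applicable merge left
		}
		// Merge the chosen pair into a single token and drop the right half.
		tokens[best] += tokens[best+1]
		tokens = append(tokens[:best+1], tokens[best+2:]...)
	}
	return tokens
}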
// Decode converts token IDs back to text
func (t *Tokenizer) Decode(ids []int) string {
	var tokens []string
	for _, id := range ids {
		if tok, ok := t.idToToken[id]; ok {
			tokens = append(tokens, tok)
		}
	}
	text := strings.Join(tokens, "")
	return string(t.tokensToBytes(text))
}
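
// NOTE: tokensToBytes is used by Decode but not shown in this excerpt. A
// hedged sketch of the inverse byte-level mapping is given here.

// tokensToBytes maps every rune of the joined token string back to its
// original byte via byteDecoder, dropping runes with no mapping.
func (t *Tokenizer) tokensToBytes(s string) []byte {
	out := make([]byte, 0, len(s))
	for _, r := range s {
		if b, ok := t.byteDecoder[r]; ok {
			out = append(out, b)
		}
	}
	return out
}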
// EosID returns the end-of-sequence token ID
func (t *Tokenizer) EosID() int {
	return t.eosID
}

// VocabSize returns the vocabulary size
func (t *Tokenizer) VocabSize() int {
	return len(t.vocab) + len(t.addedTokens)
}

// GetToken returns the token string for a given ID
func (t *Tokenizer) GetToken(id int) (string, bool) {
	tok, ok := t.idToToken[id]
	return tok, ok
}
// AddedTokenStrings returns the added (special) token strings in sorted order
func (t *Tokenizer) AddedTokenStrings() []string {
	out := make([]string, 0, len(t.addedTokens))
	for s := range t.addedTokens {
		out = append(out, s)
	}
	sort.Strings(out)
	return out
}
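
// Illustrative usage (hedged; this excerpt does not show how a Tokenizer is
// constructed, so loadTokenizer below is a hypothetical placeholder):
//
//	tok := loadTokenizer(path)          // hypothetical loader
//	ids := tok.Encode("Hello, world!")  // text -> token IDs
//	text := tok.Decode(ids)             // round-trips back to the text
//	ids = append(ids, tok.EosID())      // terminate a sequence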