package tokenizer

// bpe applies byte-pair encoding to a token string.
// It returns the merged tokens based on the learned merge rules.
func (t *Tokenizer) bpe(token string) []string {
	if len(token) == 0 {
		return nil
	}

	// Start with individual Unicode characters.
	word := make([]string, 0, len(token))
	for _, r := range token {
		word = append(word, string(r))
	}
	if len(word) == 1 {
		return word
	}
	// Iteratively merge the best-ranked pair until no merge applies.
	for {
		// Find the adjacent pair with the lowest (best) merge rank.
		bestRank := -1
		bestIdx := -1
		for i := 0; i < len(word)-1; i++ {
			pair := word[i] + " " + word[i+1]
			if rank, ok := t.merges[pair]; ok {
				if bestRank == -1 || rank < bestRank {
					bestRank = rank
					bestIdx = i
				}
			}
		}
		if bestIdx == -1 {
			break // No more merges possible.
		}

		// Merge the winning pair directly by index; re-splitting the joined
		// key on a space would break if a piece itself contained one.
		merged := word[bestIdx] + word[bestIdx+1]
		newWord := make([]string, 0, len(word)-1)
		for i := 0; i < len(word); {
			if i == bestIdx {
				newWord = append(newWord, merged)
				i += 2
			} else {
				newWord = append(newWord, word[i])
				i++
			}
		}
		word = newWord
		if len(word) == 1 {
			break
		}
	}
	return word
}
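
// bpeExampleUsage is a minimal usage sketch, not part of the original file.
// It assumes the Tokenizer's merges field is a map[string]int keyed by
// space-separated pairs (matching the lookups in bpe above) and that a bare
// struct literal is enough to construct one; the ranks below are made up.
// With them, bpe collapses "hello" into a single piece over four merges.
func bpeExampleUsage() []string {
	t := &Tokenizer{merges: map[string]int{
		"h e":    0, // applied first (lowest rank)
		"he l":   1,
		"hel l":  2,
		"hell o": 3,
	}}
	return t.bpe("hello") // -> ["hello"]
}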