package tokenizer import "strings" // bpe applies byte-pair encoding to a token string // Returns merged tokens based on learned merge rules func (t *Tokenizer) bpe(token string) []string { if len(token) == 0 { return nil } // Start with individual unicode chars word := make([]string, 0, len(token)) for _, r := range token { word = append(word, string(r)) } if len(word) == 1 { return word } // Iteratively merge best pairs for { // Find best pair (lowest merge rank) bestPair := "" bestRank := -1 bestIdx := -1 for i := 0; i < len(word)-1; i++ { pair := word[i] + " " + word[i+1] if rank, ok := t.merges[pair]; ok { if bestRank == -1 || rank < bestRank { bestPair = pair bestRank = rank bestIdx = i } } } if bestIdx == -1 { break // No more merges possible } // Merge the pair parts := strings.SplitN(bestPair, " ", 2) merged := parts[0] + parts[1] newWord := make([]string, 0, len(word)-1) i := 0 for i < len(word) { if i == bestIdx { newWord = append(newWord, merged) i += 2 } else { newWord = append(newWord, word[i]) i++ } } word = newWord if len(word) == 1 { break } } return word }