bpe.go

package tokenizer

import "strings"

// bpe applies byte-pair encoding to a token string.
// Returns merged tokens based on learned merge rules.
func (t *Tokenizer) bpe(token string) []string {
	if len(token) == 0 {
		return nil
	}

	// Start with individual Unicode characters.
	word := make([]string, 0, len(token))
	for _, r := range token {
		word = append(word, string(r))
	}
	if len(word) == 1 {
		return word
	}

	// Iteratively merge the best-ranked adjacent pair.
	for {
		// Find the best pair (lowest merge rank).
		bestPair := ""
		bestRank := -1
		bestIdx := -1
		for i := 0; i < len(word)-1; i++ {
			pair := word[i] + " " + word[i+1]
			if rank, ok := t.merges[pair]; ok {
				if bestRank == -1 || rank < bestRank {
					bestPair = pair
					bestRank = rank
					bestIdx = i
				}
			}
		}
		if bestIdx == -1 {
			break // No more merges possible.
		}

		// Merge the pair at bestIdx into a single token.
		parts := strings.SplitN(bestPair, " ", 2)
		merged := parts[0] + parts[1]
		newWord := make([]string, 0, len(word)-1)
		i := 0
		for i < len(word) {
			if i == bestIdx {
				newWord = append(newWord, merged)
				i += 2
			} else {
				newWord = append(newWord, word[i])
				i++
			}
		}
		word = newWord
		if len(word) == 1 {
			break
		}
	}
	return word
}
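
// Usage sketch (illustrative, not part of the original file): this assumes
// Tokenizer carries a merges field of type map[string]int keyed by
// space-separated pairs ("left right"), where a lower rank means the pair
// is merged earlier, as the lookup above implies. The example ranks and
// tokens below are made up for illustration.
//
//	t := &Tokenizer{merges: map[string]int{
//		"l o":  0, // merge "l"+"o" first
//		"lo w": 1, // then "lo"+"w"
//	}}
//	t.bpe("low")   // -> ["low"]
//	t.bpe("lower") // -> ["low", "e", "r"] (no rule covers "e" or "r")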