bpe.go

package tokenizer

import "strings"

// bpe applies byte-pair encoding to a token string.
// Returns merged tokens based on learned merge rules.
func (t *Tokenizer) bpe(token string) []string {
	if len(token) == 0 {
		return nil
	}

	// Start with individual Unicode characters.
	word := make([]string, 0, len(token))
	for _, r := range token {
		word = append(word, string(r))
	}
	if len(word) == 1 {
		return word
	}

	// Iteratively merge the best-ranked adjacent pair.
	for {
		// Find the best pair (lowest merge rank).
		bestPair := ""
		bestRank := -1
		bestIdx := -1
		for i := 0; i < len(word)-1; i++ {
			pair := word[i] + " " + word[i+1]
			if rank, ok := t.merges[pair]; ok {
				if bestRank == -1 || rank < bestRank {
					bestPair = pair
					bestRank = rank
					bestIdx = i
				}
			}
		}
		if bestIdx == -1 {
			break // No more merges possible.
		}

		// Merge the pair at bestIdx into a single token.
		parts := strings.SplitN(bestPair, " ", 2)
		merged := parts[0] + parts[1]
		newWord := make([]string, 0, len(word)-1)
		i := 0
		for i < len(word) {
			if i == bestIdx {
				newWord = append(newWord, merged)
				i += 2
			} else {
				newWord = append(newWord, word[i])
				i++
			}
		}
		word = newWord
		if len(word) == 1 {
			break
		}
	}
	return word
}
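
// Usage sketch (illustrative, not part of the original file): this assumes
// Tokenizer carries a merges field of type map[string]int keyed by
// space-separated pairs ("left right"), where a lower rank means the pair
// is merged earlier, as the lookup above implies. The example ranks and
// tokens below are made up for illustration.
//
//	t := &Tokenizer{merges: map[string]int{
//		"l o":  0, // merge "l"+"o" first
//		"lo w": 1, // then "lo"+"w"
//	}}
//	t.bpe("low")   // -> ["low"]
//	t.bpe("lower") // -> ["low", "e", "r"] (no rule covers "e" or "r")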