special_tokens.go

package tokenizer

import "strings"

// textSegment represents a piece of text that is either a special token or
// regular text.
type textSegment struct {
	text      string
	isSpecial bool
	id        int
}

// splitOnSpecialTokens finds special tokens in text and splits around them.
// This ensures special tokens like <|im_start|> are emitted as single
// segments instead of being broken apart by regular tokenization.
func (t *Tokenizer) splitOnSpecialTokens(text string) []textSegment {
	var segments []textSegment
	for len(text) > 0 {
		// Find the earliest occurrence of any special token. Map
		// iteration order is randomized in Go, so ties at the same
		// index are broken by preferring the longer token; otherwise
		// overlapping tokens (one a prefix of another) would match
		// nondeterministically.
		earliestIdx := -1
		earliestToken := ""
		earliestID := 0
		for tok, id := range t.addedTokens {
			idx := strings.Index(text, tok)
			if idx < 0 {
				continue
			}
			if earliestIdx == -1 || idx < earliestIdx ||
				(idx == earliestIdx && len(tok) > len(earliestToken)) {
				earliestIdx = idx
				earliestToken = tok
				earliestID = id
			}
		}
		if earliestIdx == -1 {
			// No more special tokens; the rest is regular text
			// (non-empty, guaranteed by the loop condition).
			segments = append(segments, textSegment{text: text, isSpecial: false})
			break
		}
		// Add any regular text before the special token.
		if earliestIdx > 0 {
			segments = append(segments, textSegment{text: text[:earliestIdx], isSpecial: false})
		}
		// Add the special token itself, carrying its token ID.
		segments = append(segments, textSegment{isSpecial: true, id: earliestID})
		// Continue scanning after the special token.
		text = text[earliestIdx+len(earliestToken):]
	}
	return segments
}
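
// The sketch below shows how splitOnSpecialTokens might be exercised from a
// test in this package. It is illustrative only: it assumes Tokenizer can be
// constructed with just the addedTokens map used above, and the token strings
// and IDs are made up rather than taken from a real vocabulary.
//
//	func TestSplitOnSpecialTokens(t *testing.T) {
//		tok := &Tokenizer{addedTokens: map[string]int{
//			"<|im_start|>": 100,
//			"<|im_end|>":   101,
//		}}
//		segs := tok.splitOnSpecialTokens("<|im_start|>hello<|im_end|>")
//		// Expect three segments: special(100), text("hello"), special(101).
//		if len(segs) != 3 || !segs[0].isSpecial || segs[1].text != "hello" || !segs[2].isSpecial {
//			t.Fatalf("unexpected segments: %#v", segs)
//		}
//	}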