package tokenizer

import "strings"

// textSegment represents a piece of text that is either a single special
// token or a run of regular text.
type textSegment struct {
	text      string
	isSpecial bool
	id        int
}
// splitOnSpecialTokens finds special tokens in text and splits around them.
// Matching them verbatim here ensures tokens like <|im_start|> map to their
// reserved ids instead of being broken apart by the regular encoding.
func (t *Tokenizer) splitOnSpecialTokens(text string) []textSegment {
	var segments []textSegment
	for len(text) > 0 {
		// Find the earliest special token. At equal positions, prefer the
		// longer token so that overlapping entries (one token a prefix of
		// another) resolve deterministically; relying on map iteration
		// order alone would make the choice random.
		earliestIdx := -1
		earliestToken := ""
		earliestID := 0
		for tok, id := range t.addedTokens {
			idx := strings.Index(text, tok)
			if idx < 0 {
				continue
			}
			if earliestIdx == -1 || idx < earliestIdx ||
				(idx == earliestIdx && len(tok) > len(earliestToken)) {
				earliestIdx = idx
				earliestToken = tok
				earliestID = id
			}
		}
		if earliestIdx == -1 {
			// No special tokens remain; emit the rest as regular text.
			// (The loop condition guarantees text is non-empty here.)
			segments = append(segments, textSegment{text: text, isSpecial: false})
			break
		}
		// Add the text before the special token, if any.
		if earliestIdx > 0 {
			segments = append(segments, textSegment{text: text[:earliestIdx], isSpecial: false})
		}
		// Add the special token itself, keeping the matched text for
		// debugging; encoding only needs the id.
		segments = append(segments, textSegment{text: earliestToken, isSpecial: true, id: earliestID})
		// Continue scanning after the matched token.
		text = text[earliestIdx+len(earliestToken):]
	}
	return segments
}
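
// A minimal usage sketch (illustrative only: it sets the addedTokens map
// directly on a zero-value Tokenizer, and the ids below are hypothetical,
// not drawn from any real vocabulary):
//
//	t := &Tokenizer{addedTokens: map[string]int{
//		"<|im_start|>": 1,
//		"<|im_end|>":   2,
//	}}
//	segs := t.splitOnSpecialTokens("<|im_start|>user\nHello<|im_end|>")
//	// segs:
//	//   {text: "<|im_start|>", isSpecial: true,  id: 1}
//	//   {text: "user\nHello",  isSpecial: false, id: 0}
//	//   {text: "<|im_end|>",   isSpecial: true,  id: 2}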