package tokenizer

import "strings"

// textSegment represents a piece of text that is either a special token
// (carrying its token ID) or a run of regular text to be tokenized normally.
type textSegment struct {
	text      string
	isSpecial bool
	id        int
}

// splitOnSpecialTokens scans text for special tokens and splits around them,
// so that tokens like <|im_start|> map directly to their IDs instead of
// being broken apart by the regular encoding pass.
func (t *Tokenizer) splitOnSpecialTokens(text string) []textSegment {
	var segments []textSegment

	for len(text) > 0 {
		// Find the earliest special token occurrence. On a tie (two special
		// tokens starting at the same index), prefer the longer token:
		// Go randomizes map iteration order, so without this tie-break the
		// result would be nondeterministic whenever one special token is a
		// prefix of another.
		earliestIdx := -1
		earliestToken := ""
		earliestID := 0
		for tok, id := range t.addedTokens {
			idx := strings.Index(text, tok)
			if idx < 0 {
				continue
			}
			if earliestIdx == -1 || idx < earliestIdx ||
				(idx == earliestIdx && len(tok) > len(earliestToken)) {
				earliestIdx = idx
				earliestToken = tok
				earliestID = id
			}
		}

		if earliestIdx == -1 {
			// No more special tokens; the rest is regular text.
			segments = append(segments, textSegment{text: text, isSpecial: false})
			break
		}

		// Add any regular text preceding the special token.
		if earliestIdx > 0 {
			segments = append(segments, textSegment{text: text[:earliestIdx], isSpecial: false})
		}

		// Add the special token itself, carrying its ID.
		segments = append(segments, textSegment{isSpecial: true, id: earliestID})

		// Continue scanning after the special token.
		text = text[earliestIdx+len(earliestToken):]
	}

	return segments
}
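
// Usage sketch (a hedged illustration, not part of the original file): it
// assumes Tokenizer declares an addedTokens map[string]int field, which the
// loop above implies but which is defined elsewhere in the package. The
// token ID below is illustrative. Placed in a tokenizer_test.go file in the
// same package, an example could look like:
//
//	package tokenizer
//
//	import "fmt"
//
//	func ExampleTokenizer_splitOnSpecialTokens() {
//		t := &Tokenizer{addedTokens: map[string]int{"<|im_start|>": 100264}}
//		for _, seg := range t.splitOnSpecialTokens("<|im_start|>user") {
//			if seg.isSpecial {
//				fmt.Println("special:", seg.id)
//			} else {
//				fmt.Println("text:", seg.text)
//			}
//		}
//		// Output:
//		// special: 100264
//		// text: user
//	}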