package tokenizer

import (
	"fmt"
	"regexp"
)

// Pre-tokenizer patterns for different model families.
//
// NOTE(review): every pattern below contains the negative lookahead
// `\s+(?!\S)`, which Go's regexp package (RE2 syntax) does not support, so
// regexp.Compile rejects all of them as written. KimiPattern additionally
// uses `&&[^...]` inside character classes, which is not an RE2 class
// operator. Confirm how callers are expected to consume these strings.
const (
	// QwenPattern is the pre-tokenizer regex for Qwen/Qwen2/Qwen3 models.
	QwenPattern = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	// KimiPattern is the pre-tokenizer regex selected for model types
	// containing "kimi".
	KimiPattern = `[\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	// LlamaPattern is the pre-tokenizer regex for Llama models.
	LlamaPattern = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	// GPT2Pattern is the pre-tokenizer regex for GPT-2 style models.
	GPT2Pattern = `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`
)

// TryCompilePattern compiles a pre-tokenizer pattern with Go's regexp package
// and reports why compilation failed instead of discarding the error.
//
// On failure the returned *regexp.Regexp is nil and the error wraps the one
// produced by regexp.Compile.
func TryCompilePattern(pattern string) (*regexp.Regexp, error) {
	re, err := regexp.Compile(pattern)
	if err != nil {
		return nil, fmt.Errorf("compiling pre-tokenizer pattern: %w", err)
	}
	return re, nil
}

// CompilePattern compiles a pre-tokenizer pattern.
//
// It returns nil when the pattern is not valid regexp syntax; callers must
// check for nil before using the result. Use TryCompilePattern to obtain the
// underlying error.
func CompilePattern(pattern string) *regexp.Regexp {
	re, err := TryCompilePattern(pattern)
	if err != nil {
		// Best-effort contract kept for existing callers: signal failure
		// with a nil regexp rather than panicking.
		return nil
	}
	return re
}

// DetectPattern returns the appropriate pattern based on model type.
//
// modelType is matched case-insensitively (ASCII only) against the substrings
// "kimi", "qwen", "llama" and "gpt", in that order; the first match wins.
// Anything else falls back to QwenPattern.
func DetectPattern(modelType string) string {
	switch {
	case contains(modelType, "kimi"):
		return KimiPattern
	case contains(modelType, "qwen"):
		return QwenPattern
	case contains(modelType, "llama"):
		return LlamaPattern
	case contains(modelType, "gpt"):
		return GPT2Pattern
	default:
		return QwenPattern // Default to Qwen pattern
	}
}

// contains reports whether substr occurs in s, ignoring ASCII letter case.
// An empty substr is contained in every string, including the empty string.
func contains(s, substr string) bool {
	// containsLower already covers the "substr longer than s", "equal
	// strings" and "empty string" cases, so no extra guards are needed.
	return containsLower(s, substr)
}

// containsLower scans every byte offset of s at which substr could start and
// reports whether substr matches there, ignoring ASCII letter case.
func containsLower(s, substr string) bool {
	last := len(s) - len(substr) // negative when substr cannot fit in s
	for start := 0; start <= last; start++ {
		if eqFoldAt(s, start, substr) {
			return true
		}
	}
	return false
}

// eqFoldAt reports whether substr matches s beginning at byte offset i,
// ignoring ASCII letter case. The caller must ensure i+len(substr) <= len(s).
func eqFoldAt(s string, i int, substr string) bool {
	for j := 0; j < len(substr); j++ {
		if toLower(s[i+j]) != toLower(substr[j]) {
			return false
		}
	}
	return true
}

// toLower maps the ASCII letters 'A'-'Z' to 'a'-'z' and returns every other
// byte unchanged.
func toLower(c byte) byte {
	if c >= 'A' && c <= 'Z' {
		return c + ('a' - 'A')
	}
	return c
}