| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
- package tokenizer
- import "regexp"
// Pre-tokenizer regular expressions, one per supported model family.
//
// NOTE(review): every pattern below contains the negative lookahead
// `\s+(?!\S)`. Go's standard regexp package implements RE2 syntax, which has
// no lookaround, so these strings are rejected by regexp.Compile as written.
// Confirm that they are rewritten, or executed by a different regex engine,
// before they are used.
const (
	// QwenPattern is the pre-tokenizer regex for Qwen/Qwen2/Qwen3 models.
	QwenPattern = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	// KimiPattern is the pre-tokenizer regex that DetectPattern selects for
	// model types containing "kimi".
	//
	// NOTE(review): the `&&[^\p{Han}]` character-class intersection used here
	// is not RE2 syntax either; verify how the consuming engine treats it.
	KimiPattern = `[\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	// LlamaPattern is the pre-tokenizer regex for Llama models.
	LlamaPattern = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	// GPT2Pattern is the pre-tokenizer regex for GPT-2 style models.
	GPT2Pattern = `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`
)
// CompilePattern compiles a pre-tokenizer pattern with the standard regexp
// package and returns the compiled expression.
//
// The compile error is not reported to the caller: when pattern is not valid
// for Go's regexp engine the function returns nil, so callers must nil-check
// the result before using it.
//
// NOTE(review): Go's regexp implements RE2 syntax and rejects lookaround such
// as `(?!\S)`, which appears in the pattern constants in this file; passing
// them here unchanged yields nil.
func CompilePattern(pattern string) *regexp.Regexp {
	compiled, err := regexp.Compile(pattern)
	if err != nil {
		// Deliberately swallowed to keep the single-value signature; nil is
		// the only failure signal callers receive.
		return nil
	}
	return compiled
}
- // DetectPattern returns the appropriate pattern based on model type
- func DetectPattern(modelType string) string {
- switch {
- case contains(modelType, "kimi"):
- return KimiPattern
- case contains(modelType, "qwen"):
- return QwenPattern
- case contains(modelType, "llama"):
- return LlamaPattern
- case contains(modelType, "gpt"):
- return GPT2Pattern
- default:
- return QwenPattern // Default to Qwen pattern
- }
- }
- func contains(s, substr string) bool {
- return len(s) >= len(substr) && (s == substr || len(s) > 0 && containsLower(s, substr))
- }
- func containsLower(s, substr string) bool {
- for i := 0; i <= len(s)-len(substr); i++ {
- if eqFoldAt(s, i, substr) {
- return true
- }
- }
- return false
- }
- func eqFoldAt(s string, i int, substr string) bool {
- for j := 0; j < len(substr); j++ {
- c1, c2 := s[i+j], substr[j]
- if c1 != c2 && toLower(c1) != toLower(c2) {
- return false
- }
- }
- return true
- }
// toLower maps an ASCII upper-case letter ('A'-'Z') to its lower-case
// counterpart and returns every other byte unchanged.
func toLower(c byte) byte {
	if c < 'A' || c > 'Z' {
		return c
	}
	// Upper- and lower-case ASCII letters are a fixed distance apart.
	return c + ('a' - 'A')
}
|