package tokenizer

import "regexp"

// Pre-tokenizer patterns for different model families

// QwenPattern is the pre-tokenizer regex for Qwen/Qwen2/Qwen3 models.
//
// Its alternatives, left to right, match: an English contraction suffix
// ('s, 't, 're, 've, 'm, 'll, 'd; case-insensitive); a run of letters with an
// optional single leading character that is not a letter, number, CR or LF; a
// single numeric character; a run of characters that are neither whitespace,
// letters nor numbers, with an optional leading space and any trailing CR/LF;
// line breaks with optional leading whitespace; and remaining whitespace.
//
// NOTE(review): the `\s+(?!\S)` alternative is a negative lookahead, which the
// standard library regexp package (RE2 syntax) does not support.
// regexp.Compile rejects this pattern as written, and CompilePattern in this
// file discards that error and returns nil. Confirm how callers deal with that.
const QwenPattern = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

// KimiPattern is the pre-tokenizer regex returned by DetectPattern for model
// types containing "kimi".
//
// Its alternatives, left to right, match: a run of Han characters; two
// word-shaped alternatives built from upper/title/modifier/other letters and
// marks followed by lower/modifier/other letters and marks, each with an
// optional leading non-letter/non-number character and an optional
// case-insensitive English contraction suffix; one to three numeric
// characters; a run of characters that are neither whitespace, letters nor
// numbers; line breaks; and remaining whitespace.
//
// NOTE(review): the `&&[^\p{Han}]` fragments look like character-class
// intersection ("these classes, but not Han") as found in Java-style regex
// dialects; Go's regexp package does not appear to implement intersection, so
// they would not carry that meaning there — confirm. Independently of that,
// `\s+(?!\S)` is a negative lookahead, which Go's regexp (RE2 syntax) does not
// support, so regexp.Compile rejects this pattern as written and
// CompilePattern in this file returns nil for it.
const KimiPattern = `[\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

// LlamaPattern is the pre-tokenizer regex for Llama models.
//
// Its alternatives, left to right, match: an English contraction suffix
// ('s, 't, 're, 've, 'm, 'll, 'd; case-insensitive); a run of letters with an
// optional single leading character that is not a letter, number, CR or LF;
// one to three numeric characters; a run of characters that are neither
// whitespace, letters nor numbers, with an optional leading space and any
// trailing CR/LF; line breaks with optional leading whitespace; and remaining
// whitespace.
//
// NOTE(review): the `\s+(?!\S)` alternative is a negative lookahead, which the
// standard library regexp package (RE2 syntax) does not support.
// regexp.Compile rejects this pattern as written, and CompilePattern in this
// file discards that error and returns nil. Confirm how callers deal with that.
const LlamaPattern = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

// GPT2Pattern is the pre-tokenizer regex for GPT-2 style models.
//
// Its alternatives, left to right, match: an English contraction suffix
// ('s, 't, 're, 've, 'm, 'll, 'd; case-sensitive, unlike the other patterns
// in this file); a run of letters, a run of numeric characters, or a run of
// characters that are neither whitespace, letters nor numbers, each with an
// optional leading space; and remaining whitespace.
//
// NOTE(review): the `\s+(?!\S)` alternative is a negative lookahead, which the
// standard library regexp package (RE2 syntax) does not support.
// regexp.Compile rejects this pattern as written, and CompilePattern in this
// file discards that error and returns nil. Confirm how callers deal with that.
const GPT2Pattern = `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`

// CompilePattern compiles a pre-tokenizer pattern with the standard library
// regexp package and returns the compiled expression.
//
// Compilation errors are not reported to the caller: when pattern is not
// valid for Go's regexp engine the result is nil, so callers must nil-check
// the return value before using it. Go's regexp syntax has no lookaround
// support, so a pattern containing a lookahead such as `(?!\S)` is rejected
// and yields nil.
func CompilePattern(pattern string) *regexp.Regexp {
	compiled, err := regexp.Compile(pattern)
	if err != nil {
		// The signature has no error return; nil is the only failure signal.
		return nil
	}
	return compiled
}

// DetectPattern returns the appropriate pattern based on model type
func DetectPattern(modelType string) string {
	switch {
	case contains(modelType, "kimi"):
		return KimiPattern
	case contains(modelType, "qwen"):
		return QwenPattern
	case contains(modelType, "llama"):
		return LlamaPattern
	case contains(modelType, "gpt"):
		return GPT2Pattern
	default:
		return QwenPattern // Default to Qwen pattern
	}
}

// contains reports whether substr occurs in s, ignoring ASCII letter case.
// An empty substr is contained in every string, including the empty one.
func contains(s, substr string) bool {
	// A needle longer than the haystack can never match.
	if len(substr) > len(s) {
		return false
	}
	// Exact equality needs no scan; this also covers two empty strings.
	if s == substr {
		return true
	}
	return containsLower(s, substr)
}

// containsLower scans s from left to right and reports whether substr matches
// at any byte offset, using eqFoldAt for the comparison at each offset.
// It returns false when substr is longer than s, because no start offset
// exists in that case.
func containsLower(s, substr string) bool {
	// Largest offset at which substr still fits inside s; negative if it
	// does not fit at all, which makes the loop body never run.
	lastStart := len(s) - len(substr)
	for start := 0; start <= lastStart; start++ {
		if !eqFoldAt(s, start, substr) {
			continue
		}
		return true
	}
	return false
}

// eqFoldAt reports whether the bytes of s starting at offset i equal substr
// when compared byte by byte through toLower. An empty substr always matches.
//
// Bytes are indexed one at a time, so the function returns false at the first
// mismatch without touching later positions of s. The caller is expected to
// pass an offset at which substr fits inside s.
func eqFoldAt(s string, i int, substr string) bool {
	pos := i
	for k := 0; k < len(substr); k, pos = k+1, pos+1 {
		have, want := s[pos], substr[k]
		if toLower(have) == toLower(want) {
			continue
		}
		return false
	}
	return true
}

// toLower maps an ASCII uppercase letter ('A'..'Z') to its lowercase
// counterpart and returns every other byte value unchanged.
func toLower(c byte) byte {
	if 'A' <= c && c <= 'Z' {
		// Upper- and lowercase ASCII letters sit a fixed distance apart.
		return c + ('a' - 'A')
	}
	return c
}