6 months ago · 4a4f426944
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -840,6 +840,9 @@ class TextModel(ModelBase):
 
				         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
			
 
				             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
			
 
				             res = "lfm2"
			
 
				+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
			
 
				+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
			
 
				+            res = "kimi-k2"
			
 
				 
			
 
				         if res is None:
			
 
				             logger.warning("\n")
			
@@ -5739,7 +5742,58 @@ class DeepseekV2Model(TextModel):
 
				     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
			
 
				 
			
 
				     def set_vocab(self):
			
 
				-        self._set_vocab_gpt2()
			
 
				+        try:
			
 
				+            self._set_vocab_gpt2()
			
 
				+            return
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+
			
 
				+        from transformers import AutoTokenizer
			
 
				+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
			
 
				+        tokpre = self.get_vocab_base_pre(tokenizer)
			
 
				+
			
 
				+        if tokpre == "kimi-k2":
			
 
				+            # Build merges list using the approach similar to HunYuanMoE
			
 
				+            merges = []
			
 
				+            vocab = {}
			
 
				+            mergeable_ranks = tokenizer.model._mergeable_ranks
			
 
				+            for token, rank in mergeable_ranks.items():
			
 
				+                vocab[QwenModel.token_bytes_to_string(token)] = rank
			
 
				+                if len(token) == 1:
			
 
				+                    continue
			
 
				+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
			
 
				+                if len(merged) == 2:
			
 
				+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
			
 
				+
			
 
				+            # Build token list
			
 
				+            vocab_size = self.hparams["vocab_size"]
			
 
				+            special_tokens = tokenizer.special_tokens
			
 
				+            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
			
 
				+            tokens: list[str] = []
			
 
				+            toktypes: list[int] = []
			
 
				+
			
 
				+            for i in range(vocab_size):
			
 
				+                if i not in reverse_vocab:
			
 
				+                    tokens.append(f"[PAD{i}]")
			
 
				+                    toktypes.append(gguf.TokenType.UNUSED)
			
 
				+                else:
			
 
				+                    token = reverse_vocab[i]
			
 
				+                    tokens.append(token)
			
 
				+                    if i in special_tokens.values():
			
 
				+                        toktypes.append(gguf.TokenType.CONTROL)
			
 
				+                    else:
			
 
				+                        toktypes.append(gguf.TokenType.NORMAL)
			
 
				+
			
 
				+            self.gguf_writer.add_tokenizer_model("gpt2")
			
 
				+            self.gguf_writer.add_tokenizer_pre(tokpre)
			
 
				+            self.gguf_writer.add_token_list(tokens)
			
 
				+            self.gguf_writer.add_token_types(toktypes)
			
 
				+            self.gguf_writer.add_token_merges(merges)
			
 
				+
			
 
				+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
			
 
				+            special_vocab.add_to_gguf(self.gguf_writer)
			
 
				+        else:
			
 
				+            raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
			
 
				 
			
 
				     def set_gguf_parameters(self):
			
 
				 
			
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -146,6 +146,7 @@ pre_computed_hashes = [
 
				     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
			
 
				     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
			
 
				     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
			
 
				+    {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",   "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
			
 
				 ]
			
 
				 
			
 
				 
			
--- a/models/templates/moonshotai-Kimi-K2.jinja
+++ b/models/templates/moonshotai-Kimi-K2.jinja
@@ -0,0 +1,43 @@
 
				+{%- if tools -%}

			
 
				+  <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>

			
 
				+{%- endif -%}

			
 
				+{%- for message in messages -%}

			
 
				+  {%- if loop.first and messages[0]['role'] != 'system' -%}

			
 
				+    <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>

			
 
				+  {%- endif -%}

			
 
				+  {%- if message['role'] == 'system' -%}

			
 
				+    <|im_system|>system<|im_middle|>

			
 
				+  {%- elif message['role'] == 'user' -%}

			
 
				+    <|im_user|>user<|im_middle|>

			
 
				+  {%- elif message['role'] == 'assistant' -%}

			
 
				+    <|im_assistant|>assistant<|im_middle|>

			
 
				+  {%- elif message['role'] == 'tool' -%}

			
 
				+    <|im_system|>tool<|im_middle|>

			
 
				+  {%- endif -%}

			
 
				+  {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}

			
 
				+    {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}

			
 
				+    <|tool_calls_section_begin|>

			
 
				+    {%- for tool_call in message['tool_calls'] -%}

			
 
				+      {%- set func_name = tool_call['function']['name'] -%}

			
 
				+      {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}

			
 
				+      <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>

			
 
				+    {%- endfor -%}

			
 
				+    <|tool_calls_section_end|>

			
 
				+  {%- elif message['role'] == 'tool' -%}

			
 
				+    ## Return of {{ message.tool_call_id }}\n{{ message['content'] }}

			
 
				+  {%- elif message['content'] is string -%}

			
 
				+    {{ message['content'] }}

			
 
				+  {%- elif message['content'] is not none -%}

			
 
				+    {% for content in message['content'] -%}

			
 
				+      {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}

			
 
				+        <|media_start|>image<|media_content|><|media_pad|><|media_end|>

			
 
				+      {% else -%}

			
 
				+        {{ content['text'] }}

			
 
				+      {%- endif -%}

			
 
				+    {%- endfor -%}

			
 
				+  {%- endif -%}

			
 
				+  <|im_end|>

			
 
				+{%- endfor -%}

			
 
				+{%- if add_generation_prompt -%}

			
 
				+  <|im_assistant|>assistant<|im_middle|>

			
 
				+{%- endif -%}

			
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -65,6 +65,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
 
				     { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
			
 
				     { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM           },
			
 
				     { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
			
 
				+    { "kimi-k2",           LLM_CHAT_TEMPLATE_KIMI_K2           },
			
 
				 };
			
 
				 
			
 
				 llm_chat_template llm_chat_template_from_str(const std::string & name) {
			
@@ -188,6 +189,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
 
				         return LLM_CHAT_TEMPLATE_DOTS1;
			
 
				     } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
			
 
				         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
			
 
				+    } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
			
 
				+        return LLM_CHAT_TEMPLATE_KIMI_K2;
			
 
				     }
			
 
				     return LLM_CHAT_TEMPLATE_UNKNOWN;
			
 
				 }
			
@@ -680,6 +683,26 @@ int32_t llm_chat_apply_template(
 
				                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
			
 
				             }
			
 
				         }
			
 
				+    } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
				+        // moonshotai/Kimi-K2-Instruct
				+        for (auto message : chat) {
				+            std::string role(message->role);
				+            if (role == "system") {
				+                ss << "<|im_system|>system<|im_middle|>";
				+            } else if (role == "user") {
				+                ss << "<|im_user|>user<|im_middle|>";
				+            } else if (role == "assistant") {
				+                ss << "<|im_assistant|>assistant<|im_middle|>";
				+            } else if (role == "tool") {
				+                ss << "<|im_system|>tool<|im_middle|>";
				+            }
				+            // any other role gets no header; its content and the end marker are still emitted
				+            ss << message->content << "<|im_end|>";
				+        }
				+        // the generation prompt is appended once, after the whole conversation (not after every message)
				+        if (add_ass) {
				+            ss << "<|im_assistant|>assistant<|im_middle|>";
				+        }
			
 
				     } else {
			
 
				         // template not supported
			
 
				         return -1;
			
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -45,6 +45,7 @@ enum llm_chat_template {
 
				     LLM_CHAT_TEMPLATE_SMOLVLM,
			
 
				     LLM_CHAT_TEMPLATE_DOTS1,
			
 
				     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
			
 
				+    LLM_CHAT_TEMPLATE_KIMI_K2,
			
 
				     LLM_CHAT_TEMPLATE_UNKNOWN,
			
 
				 };
			
 
				 
			
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -6,7 +6,7 @@
 
				 
			
 
				 // bump if necessary
			
 
				 #define LLAMA_MAX_LAYERS  512
			
 
				-#define LLAMA_MAX_EXPERTS 256  // DeepSeekV3
			
 
				+#define LLAMA_MAX_EXPERTS 384  // Kimi-K2
			
 
				 
			
 
				 enum llama_expert_gating_func_type {
			
 
				     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
			
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -405,6 +405,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 
				                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
			
 
				                 };
			
 
				                 break;
			
 
				+            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
			
 
				+                regex_exprs = {
			
 
				+                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
			
 
				+                    // The custom handler implements all K2 patterns with proper Han character exclusion
			
 
				+                    "\\p{Han}+",
			
 
				+                };
			
 
				+                break;
			
 
				             case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
			
 
				                 regex_exprs = {
			
 
				                     "\\p{N}+",
			
@@ -1954,6 +1961,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
				                 tokenizer_pre == "hunyuan") {
			
 
				                 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
			
 
				                 clean_spaces = false;
			
 
				+            } else if (
			
 
				+                tokenizer_pre == "kimi-k2") {
			
 
				+                pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
			
 
				+                clean_spaces = false;
			
 
				             } else {
			
 
				                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
			
 
				             }
			
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -45,6 +45,7 @@ enum llama_vocab_pre_type {
 
				     LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
			
 
				     LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
			
 
				     LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
			
 
				 };
			
 
				 
			
 
				 struct LLM_KV;
			
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -557,6 +557,178 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
 
				     return bpe_offsets;
			
 
				 }
			
 
				 
			
 
				+// Hand-written splitter for the Kimi-K2 pre-tokenizer regex (from tokenization_kimi.py, reproduced on the next line); selected in unicode_regex_split_custom by the "\\p{Han}+" trigger expression. 'offsets' holds the lengths, in code points, of consecutive spans of 'text'; the result holds the lengths of the pieces each span is cut into, in order.
				+// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
				+static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) {
				+    std::vector<size_t> bpe_offsets;
				+    bpe_offsets.reserve(offsets.size());
				+    // all positions below index into this code point array, not into the UTF-8 bytes of 'text'
				+    const auto cpts = unicode_cpts_from_utf8(text);
				+
				+    size_t start = 0; // index in cpts where the current span begins
				+    for (auto offset : offsets) {
				+        const size_t offset_ini = start;
				+        const size_t offset_end = start + offset;
				+        assert(offset_end <= cpts.size());
				+        start = offset_end;
				+        // bounds-checked accessors: a position outside the current span reads as OUT_OF_RANGE / empty flags
				+        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
				+        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
				+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
				+        };
				+
				+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
				+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
				+        };
				+        // emits [_prev_end, end) as one piece (empty pieces are skipped) and returns its length
				+        size_t _prev_end = offset_ini;
				+        auto _add_token = [&] (const size_t end) -> size_t {
				+            assert(_prev_end <= end && end <= offset_end);
				+            size_t len = end - _prev_end;
				+            if (len > 0) {
				+                bpe_offsets.push_back(len);
				+            }
				+            _prev_end = end;
				+            return len;
				+        };
				+        // pos is advanced by whichever pattern matches first; each match emits its piece(s) and restarts the loop
				+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
				+            const uint32_t cpt = _get_cpt(pos);
				+            const auto flags = _get_flags(pos);
				+
				+            // Pattern 1: [\p{Han}]+ (run of Han characters, as classified by unicode_cpt_is_han)
				+            if (unicode_cpt_is_han(cpt)) {
				+                while (unicode_cpt_is_han(_get_cpt(pos))) {
				+                    pos++;
				+                }
				+                _add_token(pos);
				+                continue;
				+            }
				+
				+            // Pattern 2 & 3: Letter words excluding Han characters with optional contractions
				+            // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
				+            // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
				+            // Check if current char is a letter OR if current char could be a leading char and next char is a letter
				+            bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) ||
				+                                     (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
				+                                      _get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1)));
				+            // NOTE(review): only is_letter is tested below; the Lu/Lt vs Ll case classes and \p{M} of the reference regex are not distinguished - confirm this is intended
				+            if (is_letter_pattern) {
				+                // Handle optional leading non-letter/non-number character
				+                bool has_leading_char = false;
				+                if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) {
				+                    has_leading_char = true;
				+                    pos++;
				+                }
				+
				+                // Match letter sequence (excluding Han characters)
				+                bool has_letters = false;
				+                while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
				+                    has_letters = true;
				+                    pos++;
				+                }
				+                // NOTE(review): given is_letter_pattern, has_letters looks always true here, so the second operand below and the backtrack branch appear unreachable - verify
				+                // Only proceed if we found letters (after potentially skipping leading char)
				+                if (has_letters || (!has_leading_char && _get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos)))) {
				+                    if (!has_letters) pos++; // consume the first letter if we didn't already
				+
				+                    // Continue consuming letters
				+                    while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
				+                        pos++;
				+                    }
				+
				+                    // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d), compared case-insensitively via unicode_tolower
				+                    if (_get_cpt(pos) == '\'' && pos + 1 < offset_end) {
				+                        uint32_t cpt_next = unicode_tolower(_get_cpt(pos + 1));
				+                        if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
				+                            pos += 2;
				+                        } else if (pos + 2 < offset_end) {
				+                            uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos + 2));
				+                            if ((cpt_next == 'r' && cpt_next_next == 'e') ||
				+                                (cpt_next == 'v' && cpt_next_next == 'e') ||
				+                                (cpt_next == 'l' && cpt_next_next == 'l')) {
				+                                pos += 3;
				+                            }
				+                        }
				+                    }
				+
				+                    _add_token(pos);
				+                    continue;
				+                } else if (has_leading_char) {
				+                    // We consumed a leading char but found no letters, backtrack
				+                    pos--;
				+                }
				+            }
				+
				+            // Pattern 4: \p{N}{1,3} (a run of number characters is emitted in groups of at most 3)
				+            if (flags.is_number) {
				+                size_t ini = pos;
				+                while (_get_flags(pos).is_number) {
				+                    if (++pos - ini >= 3) {
				+                        _add_token(pos);
				+                        ini = pos;
				+                    }
				+                }
				+                _add_token(pos);
				+                continue;
				+            }
				+
				+            // Pattern 5:  ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines)
				+            auto flags2 = (cpt == ' ' ? _get_flags(pos + 1) : flags);
				+            if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
				+                pos += (cpt == ' ');
				+                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
				+                    flags2 = _get_flags(++pos);
				+                }
				+                // Match optional [\r\n]*
				+                uint32_t cpt2 = _get_cpt(pos);
				+                while (cpt2 == '\r' || cpt2 == '\n') {
				+                    cpt2 = _get_cpt(++pos);
				+                }
				+                _add_token(pos);
				+                continue;
				+            }
				+
				+            // Count the whitespace run starting at pos and remember the position just past its last \r or \n
				+            size_t num_whitespaces = 0;
				+            size_t last_end_r_or_n = 0;
				+            while (_get_flags(pos + num_whitespaces).is_whitespace) {
				+                uint32_t cpt2 = _get_cpt(pos + num_whitespaces);
				+                if (cpt2 == '\r' || cpt2 == '\n') {
				+                    last_end_r_or_n = pos + num_whitespaces + 1;
				+                }
				+                num_whitespaces++;
				+            }
				+
				+            // Pattern 6: \s*[\r\n]+ (whitespace with newlines; the piece ends right after the last newline of the run)
				+            if (last_end_r_or_n > 0) {
				+                pos = last_end_r_or_n;
				+                _add_token(pos);
				+                continue;
				+            }
				+
				+            // Pattern 7: \s+(?!\S) (whitespace run followed by more text in this span: emit all but the last whitespace)
				+            if (num_whitespaces > 1 && _get_cpt(pos + num_whitespaces) != OUT_OF_RANGE) {
				+                pos += num_whitespaces - 1;
				+                _add_token(pos);
				+                continue;
				+            }
				+
				+            // Pattern 8: \s+ (a single whitespace, or a whitespace run that reaches the end of the span)
				+            if (num_whitespaces > 0) {
				+                pos += num_whitespaces;
				+                _add_token(pos);
				+                continue;
				+            }
				+
				+            // No matches - consume single character
				+            _add_token(++pos);
				+        }
				+    }
				+
				+    return bpe_offsets;
				+}
			
 
				+
			
 
				 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
			
 
				     std::vector<size_t> bpe_offsets;
			
 
				 
			
@@ -567,6 +739,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
 
				             regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
			
 
				 
			
 
				         bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
			
 
				+    } else if (regex_expr == "\\p{Han}+") {
			
 
				+        // K2's first pattern - handle all K2 patterns together
			
 
				+        bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
			
 
				     }
			
 
				 
			
 
				     return bpe_offsets;
			
@@ -672,6 +847,38 @@ uint32_t unicode_tolower(uint32_t cpt) {
 
				     return cpt;  // Return the original code point if no lowercase mapping is found
			
 
				 }
			
 
				 
			
 
				+bool unicode_cpt_is_han(uint32_t cpt) {
				+    // Returns true when cpt lies in one of the inclusive ranges below (Han ideographs, radicals, Han-script symbols).
				+    // Most entries are whole Unicode blocks, so unassigned code points inside a block also report true.
				+    static const uint32_t k_han_ranges[][2] = {
				+        { 0x2E80,  0x2E99  }, // CJK Radicals Supplement
				+        { 0x2E9B,  0x2EF3  }, // CJK Radicals Supplement (continued)
				+        { 0x2F00,  0x2FD5  }, // Kangxi Radicals
				+        { 0x3005,  0x3005  }, // Ideographic iteration mark
				+        { 0x3007,  0x3007  }, // Ideographic number zero
				+        { 0x3021,  0x3029  }, // Hangzhou numerals one..nine
				+        { 0x3038,  0x303B  }, // Hangzhou numerals ten/twenty/thirty, vertical iteration mark
				+        { 0x3400,  0x4DBF  }, // CJK Extension A
				+        { 0x4E00,  0x9FFF  }, // CJK Unified Ideographs (most common)
				+        { 0xF900,  0xFAFF  }, // CJK Compatibility Ideographs
				+        { 0x20000, 0x2A6DF }, // CJK Extension B
				+        { 0x2A700, 0x2B73F }, // CJK Extension C
				+        { 0x2B740, 0x2B81F }, // CJK Extension D
				+        { 0x2B820, 0x2CEAF }, // CJK Extension E
				+        { 0x2CEB0, 0x2EBEF }, // CJK Extension F
				+        { 0x2EBF0, 0x2EE5F }, // CJK Extension I
				+        { 0x2F800, 0x2FA1F }, // CJK Compatibility Ideographs Supplement
				+        { 0x30000, 0x3134F }, // CJK Extension G
				+        { 0x31350, 0x323AF }, // CJK Extension H
				+    };
				+    for (const auto & range : k_han_ranges) {
				+        if (cpt >= range[0] && cpt <= range[1]) {
				+            return true;
				+        }
				+    }
				+    return false;
				+}
			
 
				+
			
 
				 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
			
 
				     // unicode categories
			
 
				     static const std::map<std::string, int> k_ucat_enum = {
			
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -63,4 +63,6 @@ uint8_t     unicode_utf8_to_byte(const std::string & utf8);
 
				 
			
 
				 uint32_t unicode_tolower(uint32_t cpt);
			
 
				 
			
 
				+bool unicode_cpt_is_han(uint32_t cpt);
			
 
				+
			
 
				 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);