|
|
@@ -9,6 +9,7 @@
|
|
|
#include <stdexcept>
|
|
|
#include <string>
|
|
|
#include <unordered_map>
|
|
|
+#include <unordered_set>
|
|
|
#include <utility>
|
|
|
#include <vector>
|
|
|
#include <locale>
|
|
|
@@ -111,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
|
|
|
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
|
|
std::unordered_map<uint32_t, int> cpt_types;
|
|
|
for (auto p : unicode_ranges_number) {
|
|
|
- for (auto i = p.first; i <= p.second; ++ i) {
|
|
|
+ for (auto i = p.first; i <= p.second; ++i) {
|
|
|
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
|
|
|
}
|
|
|
}
|
|
|
for (auto p : unicode_ranges_letter) {
|
|
|
- for (auto i = p.first; i <= p.second; ++ i) {
|
|
|
+ for (auto i = p.first; i <= p.second; ++i) {
|
|
|
cpt_types[i] = CODEPOINT_TYPE_LETTER;
|
|
|
}
|
|
|
}
|
|
|
- for (auto p : unicode_ranges_whitespace) {
|
|
|
- for (auto i = p.first; i <= p.second; ++ i) {
|
|
|
- cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
|
|
|
+ for (auto p : unicode_ranges_separator) {
|
|
|
+ for (auto i = p.first; i <= p.second; ++i) {
|
|
|
+ cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
|
|
|
}
|
|
|
}
|
|
|
for (auto p : unicode_ranges_accent_mark) {
|
|
|
- for (auto i = p.first; i <= p.second; ++ i) {
|
|
|
+ for (auto i = p.first; i <= p.second; ++i) {
|
|
|
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
|
|
}
|
|
|
}
|
|
|
for (auto p : unicode_ranges_punctuation) {
|
|
|
- for (auto i = p.first; i <= p.second; ++ i) {
|
|
|
+ for (auto i = p.first; i <= p.second; ++i) {
|
|
|
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
|
|
}
|
|
|
}
|
|
|
@@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
|
|
}
|
|
|
}
|
|
|
for (auto p : unicode_ranges_control) {
|
|
|
- for (auto i = p.first; i <= p.second; ++ i) {
|
|
|
+ for (auto i = p.first; i <= p.second; ++i) {
|
|
|
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
|
|
|
}
|
|
|
}
|
|
|
@@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|
|
std::vector<size_t> bpe_offsets; // store the offset of each word
|
|
|
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
|
|
|
|
|
- size_t start = 0;
|
|
|
-
|
|
|
const auto cpts = unicode_cpts_from_utf8(text);
|
|
|
|
|
|
+ size_t start = 0;
|
|
|
for (auto offset : offsets) {
|
|
|
- std::string token;
|
|
|
+ const size_t offset_ini = start;
|
|
|
+ const size_t offset_end = start + offset;
|
|
|
+ assert(offset_end <= cpts.size());
|
|
|
+ start = offset_end;
|
|
|
+
|
|
|
+ auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
|
|
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
|
|
+ };
|
|
|
+
|
|
|
+ auto _get_cpt_type = [&] (const size_t pos) -> int {
|
|
|
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
|
|
+ };
|
|
|
+
|
|
|
+ size_t _prev_end = offset_ini;
|
|
|
+ auto _add_token = [&] (const size_t end) -> size_t {
|
|
|
+ assert(_prev_end <= end && end <= offset_end);
|
|
|
+ size_t len = end - _prev_end;
|
|
|
+ if (len > 0) {
|
|
|
+ bpe_offsets.push_back(len);
|
|
|
+ }
|
|
|
+ _prev_end = end;
|
|
|
+ //if (len > 0) {
|
|
|
+ // std::string s = "";
|
|
|
+ // for(size_t p = end-len; p < end; p++)
|
|
|
+ // s += unicode_cpt_to_utf8(cpts[p]);
|
|
|
+ // printf(">>> '%s'\n", s.c_str());
|
|
|
+ //}
|
|
|
+ return len;
|
|
|
+ };
|
|
|
+
|
|
|
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
|
|
+ const char32_t cpt = _get_cpt(pos);
|
|
|
+ const int cpt_type = _get_cpt_type(pos);
|
|
|
+
|
|
|
+ // regex: 's|'t|'re|'ve|'m|'ll|'d
|
|
|
+ if (cpt == '\'' && pos+1 < offset_end) {
|
|
|
+ char32_t cpt_next = _get_cpt(pos+1);
|
|
|
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
|
|
+ pos += _add_token(pos+2);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if (pos+2 < offset_end) {
|
|
|
+ char32_t cpt_next_next = _get_cpt(pos+2);
|
|
|
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
|
|
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
|
|
|
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
|
|
|
+ pos += _add_token(pos+3);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- bool collecting_numeric = false;
|
|
|
- bool collecting_letter = false;
|
|
|
- bool collecting_special = false;
|
|
|
- bool collecting_whitespace_lookahead = false;
|
|
|
- bool collecting = false;
|
|
|
+ char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
|
|
+ int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
|
|
+ // regex: <space>?\p{L}+
|
|
|
+ if (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
|
|
+ pos += (cpt == ' ');
|
|
|
+ while (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
|
|
+ cpt2_type = _get_cpt_type(++pos);
|
|
|
+ }
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // regex: <space>?\p{N}+
|
|
|
+ if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
|
|
+ pos += (cpt == ' ');
|
|
|
+ while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
|
|
+ cpt2_type = _get_cpt_type(++pos);
|
|
|
+ }
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // regex: <space>?[^\s\p{L}\p{N}]+
|
|
|
+ if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
|
|
+ pos += (cpt == ' ');
|
|
|
+ while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
|
|
+ cpt2_type = _get_cpt_type(++pos);
|
|
|
+ cpt2 = _get_cpt(pos);
|
|
|
+ }
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
|
|
|
- std::vector<std::string> text_utf;
|
|
|
- text_utf.reserve(offset);
|
|
|
+ size_t num_whitespaces = 0;
|
|
|
+ while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
|
|
+ num_whitespaces++;
|
|
|
+ }
|
|
|
|
|
|
- for (size_t i = start; i < start + offset; ++i) {
|
|
|
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
|
|
|
+ // regex: \s+(?!\S)
|
|
|
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
|
|
+ pos += num_whitespaces - 1;
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // regex: \s+
|
|
|
+ if (num_whitespaces > 0) {
|
|
|
+ pos += num_whitespaces;
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // no matches
|
|
|
+ _add_token(++pos);
|
|
|
}
|
|
|
+ }
|
|
|
+
|
|
|
+ return bpe_offsets;
|
|
|
+}
|
|
|
|
|
|
- for (int i = 0; i < (int)text_utf.size(); i++) {
|
|
|
- const std::string & utf_char = text_utf[i];
|
|
|
- bool split_condition = false;
|
|
|
- int bytes_remain = text_utf.size() - i;
|
|
|
+// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
|
|
|
+static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
|
|
|
+ std::vector<size_t> bpe_offsets; // store the offset of each word
|
|
|
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
|
|
|
|
|
- // forward backward lookups
|
|
|
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
|
|
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
|
|
+ const auto cpts = unicode_cpts_from_utf8(text);
|
|
|
|
|
|
- // handling contractions
|
|
|
- if (!split_condition && bytes_remain >= 2) {
|
|
|
- // 's|'t|'m|'d
|
|
|
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
|
|
- split_condition = true;
|
|
|
+ size_t start = 0;
|
|
|
+ for (auto offset : offsets) {
|
|
|
+ const size_t offset_ini = start;
|
|
|
+ const size_t offset_end = start + offset;
|
|
|
+ assert(offset_end <= cpts.size());
|
|
|
+ start = offset_end;
|
|
|
+
|
|
|
+ auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
|
|
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
|
|
+ };
|
|
|
+
|
|
|
+ auto _get_cpt_type = [&] (const size_t pos) -> int {
|
|
|
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
|
|
+ };
|
|
|
+
|
|
|
+ size_t _prev_end = offset_ini;
|
|
|
+ auto _add_token = [&] (const size_t end) -> size_t {
|
|
|
+ assert(_prev_end <= end && end <= offset_end);
|
|
|
+ size_t len = end - _prev_end;
|
|
|
+ if (len > 0) {
|
|
|
+ bpe_offsets.push_back(len);
|
|
|
+ }
|
|
|
+ _prev_end = end;
|
|
|
+ //if (len > 0) {
|
|
|
+ // std::string s = "";
|
|
|
+ // for(size_t p = end-len; p < end; p++)
|
|
|
+ // s += unicode_cpt_to_utf8(cpts[p]);
|
|
|
+ // printf(">>> '%s'\n", s.c_str());
|
|
|
+ //}
|
|
|
+ return len;
|
|
|
+ };
|
|
|
+
|
|
|
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
|
|
+ const char32_t cpt = _get_cpt(pos);
|
|
|
+ const int cpt_type = _get_cpt_type(pos);
|
|
|
+
|
|
|
+ // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
|
|
+ if (cpt == '\'' && pos+1 < offset_end) {
|
|
|
+ char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
|
|
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
|
|
+ pos += _add_token(pos+2);
|
|
|
+ continue;
|
|
|
}
|
|
|
- if (split_condition) {
|
|
|
- if (token.size()) {
|
|
|
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
|
|
+ if (pos+2 < offset_end) {
|
|
|
+ char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
|
|
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
|
|
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
|
|
|
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
|
|
|
+ pos += _add_token(pos+3);
|
|
|
+ continue;
|
|
|
}
|
|
|
- token = utf_char + utf_char_next;
|
|
|
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
|
|
- token = "";
|
|
|
- i++;
|
|
|
- continue;
|
|
|
}
|
|
|
}
|
|
|
- if (!split_condition && bytes_remain >= 3) {
|
|
|
- // 're|'ve|'ll
|
|
|
- if (utf_char == "\'" && (
|
|
|
- (utf_char_next == "r" && utf_char_next_next == "e") ||
|
|
|
- (utf_char_next == "v" && utf_char_next_next == "e") ||
|
|
|
- (utf_char_next == "l" && utf_char_next_next == "l"))
|
|
|
- ) {
|
|
|
- split_condition = true;
|
|
|
- }
|
|
|
- if (split_condition) {
|
|
|
- // current token + next token can be defined
|
|
|
- if (token.size()) {
|
|
|
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
|
|
- }
|
|
|
- token = utf_char;
|
|
|
- token += utf_char_next;
|
|
|
- token += utf_char_next_next;
|
|
|
|
|
|
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
|
|
- token = "";
|
|
|
- i += 2;
|
|
|
+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
|
|
+ if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
|
|
|
+ if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
|
|
|
+ pos++;
|
|
|
+ while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
|
|
|
+ pos++;
|
|
|
+ }
|
|
|
+ _add_token(pos);
|
|
|
continue;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (!split_condition && !collecting) {
|
|
|
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
|
|
|
- collecting_letter = true;
|
|
|
- collecting = true;
|
|
|
- }
|
|
|
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
|
|
|
- collecting_numeric = true;
|
|
|
- collecting = true;
|
|
|
- }
|
|
|
- else if (
|
|
|
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
|
|
|
- (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
|
|
|
- ) {
|
|
|
- collecting_special = true;
|
|
|
- collecting = true;
|
|
|
- }
|
|
|
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
|
|
- collecting_whitespace_lookahead = true;
|
|
|
- collecting = true;
|
|
|
- }
|
|
|
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
|
|
- split_condition = true;
|
|
|
+ // regex: \p{N}{1,3}
|
|
|
+ if (cpt_type == CODEPOINT_TYPE_NUMBER) {
|
|
|
+ size_t ini = pos;
|
|
|
+ while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
|
|
|
+ if (++pos - ini >= 3 ) {
|
|
|
+ _add_token(pos);
|
|
|
+ ini = pos;
|
|
|
+ }
|
|
|
}
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
}
|
|
|
- else if (!split_condition && collecting) {
|
|
|
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
|
|
|
- split_condition = true;
|
|
|
- }
|
|
|
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
|
|
|
- split_condition = true;
|
|
|
+
|
|
|
+ // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
|
|
+ char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
|
|
+ int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
|
|
+ if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
|
|
+ pos += (cpt == ' ');
|
|
|
+ while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
|
|
+ cpt2_type = _get_cpt_type(++pos);
|
|
|
+ cpt2 = _get_cpt(pos);
|
|
|
}
|
|
|
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
|
|
- split_condition = true;
|
|
|
+ while (cpt2 == '\r' || cpt2 == '\n') {
|
|
|
+ cpt2 = _get_cpt(++pos);
|
|
|
}
|
|
|
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
|
|
|
- split_condition = true;
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ size_t num_whitespaces = 0;
|
|
|
+ size_t last_end_r_or_n = 0;
|
|
|
+ while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
|
|
+ char32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
|
|
+ if (cpt2 == '\r' || cpt2 == '\n') {
|
|
|
+ last_end_r_or_n = pos + num_whitespaces + 1;
|
|
|
}
|
|
|
+ num_whitespaces++;
|
|
|
}
|
|
|
|
|
|
- if (utf_char_next == "") {
|
|
|
- split_condition = true; // final
|
|
|
- token += utf_char;
|
|
|
+ // regex: \s*[\r\n]+
|
|
|
+ if (last_end_r_or_n > 0) {
|
|
|
+ pos = last_end_r_or_n;
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
}
|
|
|
|
|
|
- if (split_condition) {
|
|
|
- if (token.size()) {
|
|
|
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
|
|
- }
|
|
|
- token = utf_char;
|
|
|
- collecting = false;
|
|
|
- collecting_letter = false;
|
|
|
- collecting_numeric = false;
|
|
|
- collecting_special = false;
|
|
|
- collecting_whitespace_lookahead = false;
|
|
|
+ // regex: \s+(?!\S)
|
|
|
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
|
|
+ pos += num_whitespaces - 1;
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
}
|
|
|
- else {
|
|
|
- token += utf_char;
|
|
|
+
|
|
|
+ // regex: \s+
|
|
|
+ if (num_whitespaces > 0) {
|
|
|
+ pos += num_whitespaces;
|
|
|
+ _add_token(pos);
|
|
|
+ continue;
|
|
|
}
|
|
|
- }
|
|
|
|
|
|
- start += offset;
|
|
|
+ // no matches
|
|
|
+ _add_token(++pos);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
return bpe_offsets;
|
|
|
@@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
|
|
|
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
|
|
std::vector<size_t> bpe_offsets;
|
|
|
|
|
|
- (void)(text);
|
|
|
- (void)(regex_expr);
|
|
|
- (void)(offsets);
|
|
|
- // TODO: this implementation is actually wrong, uncomment and run:
|
|
|
- // make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
|
|
|
- //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
|
|
- // bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
|
|
|
- //}
|
|
|
+ if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
|
|
+ bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
|
|
|
+ } else if (
|
|
|
+ regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
|
|
|
+ regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
|
|
+
|
|
|
+ bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
|
|
|
+ }
|
|
|
|
|
|
return bpe_offsets;
|
|
|
}
|
|
|
@@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
|
|
|
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
|
|
|
}
|
|
|
|
|
|
+bool unicode_cpt_is_whitespace(uint32_t cp) {
|
|
|
+ static const std::unordered_set<uint32_t> is_whitespace = [] {
|
|
|
+ std::unordered_set<uint32_t> is_whitespace;
|
|
|
+ for (auto p : unicode_ranges_whitespace) {
|
|
|
+ for (auto i = p.first; i <= p.second; ++i) {
|
|
|
+ is_whitespace.insert(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return is_whitespace;
|
|
|
+ }();
|
|
|
+ return (bool)is_whitespace.count(cp);
|
|
|
+}
|
|
|
+
|
|
|
std::string unicode_byte_to_utf8(uint8_t byte) {
|
|
|
static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
|
|
|
return map.at(byte);
|