|
|
@@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
|
|
result.reserve(utf8.size());
|
|
|
size_t offset = 0;
|
|
|
while (offset < utf8.size()) {
|
|
|
- result.push_back(unicode_cpt_from_utf8(utf8, offset));
|
|
|
+ try {
|
|
|
+ result.push_back(unicode_cpt_from_utf8(utf8, offset));
|
|
|
+ }
|
|
|
+ catch (const std::invalid_argument & /*ex*/) {
|
|
|
+ // Invalid UTF-8 input: skip one byte and emit U+FFFD instead, so the exception does not leak beyond llama_tokenize
|
|
|
+ ++offset;
|
|
|
+ result.emplace_back(0xFFFD); // replacement character
|
|
|
+ }
|
|
|
}
|
|
|
return result;
|
|
|
}
|