11 months ago · 2d219b389e
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
 
				     result.reserve(utf8.size());
			
 
				     size_t offset = 0;
			
 
				     while (offset < utf8.size()) {
			
 
				-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
			
 
				+        try {
			
 
				+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
			
 
				+        }
			
 
				+        catch (const std::invalid_argument & /*ex*/) {
			
 
				+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
			
 
				+            ++offset;
			
 
				+            result.emplace_back(0xFFFD); // replacement character
			
 
				+        }
			
 
				     }
			
 
				     return result;
			
 
				 }