|
@@ -2221,14 +2221,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
|
|
|
// find the first occurrence of a given special token in this fragment
|
|
// find the first occurrence of a given special token in this fragment
|
|
|
// passing offset argument only limits the "search area" but match coordinates
|
|
// passing offset argument only limits the "search area" but match coordinates
|
|
|
// are still relative to the source full raw_text
|
|
// are still relative to the source full raw_text
|
|
|
- auto match = raw_text.find(text, raw_text_base_offset);
|
|
|
|
|
|
|
+ // string_view begins at pos 0 for the same reason
|
|
|
|
|
+ auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
|
|
|
|
|
|
|
|
// no occurrences found, stop processing this fragment for a given special token
|
|
// no occurrences found, stop processing this fragment for a given special token
|
|
|
if (match == std::string::npos) break;
|
|
if (match == std::string::npos) break;
|
|
|
|
|
|
|
|
- // check if match is within bounds of offset <-> length
|
|
|
|
|
- if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
|
|
|
|
|
-
|
|
|
|
|
#ifdef PRETOKENIZERDEBUG
|
|
#ifdef PRETOKENIZERDEBUG
|
|
|
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
|
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
|
|
#endif
|
|
#endif
|