@@ -215,6 +215,7 @@ bool llama_batch_allocr::init(
         /*.n_seq_tokens =*/ (uint32_t) 1,
         /*.n_seqs       =*/ (uint32_t) batch.n_tokens,
         /*.n_seqs_unq   =*/ (uint32_t) this->seq_id_unq.size(),
+        /*.n_pos        =*/ n_pos_per_embd,
         /*.token        =*/ batch.token,
         /*.embd         =*/ batch.embd,
         /*.pos          =*/ batch.pos,
@@ -251,46 +252,58 @@ bool llama_batch_allocr::init(
     // consistency checks
     //

-    for (uint32_t s = 0; s < n_seq_max; ++s) {
-        if (seq_pos[s].empty()) {
-            continue;
+    if (n_pos_per_embd > 1) {
+        // M-RoPE case: allow position to "jump" forward only (non-continuous positions are allowed)
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }
+
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+
+            if (p0 >= 0 && p0 >= seq_pos_min(s)) {
+                LLAMA_LOG_ERROR(
+                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                    " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                    " for M-RoPE, it is required that the position satisfies: X < Y\n",
+                    __func__, s, s, p0, s, seq_pos_min(s));
+
+                return false;
+            }
         }
+    } else {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }

-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;

-        if (p0 >= 0) {
-            bool ok = true;
+            if (p0 >= 0) {
+                bool ok = true;

-            if (batch.token) {
                 if (seq_pos_min(s) != p0 + 1) {
                     ok = false;
                 }
-            } else {
-                assert(batch.embd);

-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
+                if (!ok) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
                 }
             }

-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                    " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                    " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                    __func__, s, s, p0, s, seq_pos_min(s));
-
+            if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+                LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
                 return false;
             }
         }
-
-        if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
-            LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
-            return false;
-        }
     }

     if (memory) {
@@ -389,6 +402,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
         /*.n_seq_tokens =*/ n_seq_tokens,
         /*.n_seqs       =*/ n_seqs,
         /*.n_seqs_unq   =*/ n_seqs,
+        /*.n_pos        =*/ n_pos_per_embd,

         /*.token        =*/ udata->token.data(),
         /*.embd         =*/ nullptr,
@@ -710,6 +724,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         /*.n_seq_tokens =*/ n_tokens/n_seqs,
         /*.n_seqs       =*/ n_seqs,
         /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
+        /*.n_pos        =*/ n_pos_per_embd,

         /*.token        =*/ batch.token ? udata->token.data() : nullptr,
         /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,