@@ -130,110 +130,97 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
     if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens     = ubatch->n_tokens;
         const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-        const int64_t n_seqs       = ubatch->n_seqs;
+        const int64_t n_seqs_unq   = ubatch->n_seqs_unq;

         GGML_ASSERT(mean);
         GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer));

         float * data = (float *) mean->data;
-        memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean));
+        memset(mean->data, 0, n_tokens*n_seqs_unq*ggml_element_size(mean));

-        std::vector<uint64_t> sum(n_tokens, 0);
+        std::vector<uint64_t> sums(n_seqs_unq, 0);
+        for (int i = 0; i < n_tokens; i += n_seq_tokens) {
+            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                const llama_seq_id seq_id  = ubatch->seq_id[i][s];
+                const int32_t      seq_idx = ubatch->seq_idx[seq_id];

-        // TODO: fix indexing [UBATCH_IDX]
-        for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-            // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
-            sum[seq_id] += ubatch->n_seq_tokens;
+                sums[seq_idx] += ubatch->n_seq_tokens;
+            }
         }

-        std::vector<float> div(n_tokens, 0.0f);
-        for (int i = 0; i < n_tokens; ++i) {
-            const uint64_t s = sum[i];
-            if (s > 0) {
-                div[i] = 1.0f/float(s);
+        std::vector<float> div(n_seqs_unq, 0.0f);
+        for (int s = 0; s < n_seqs_unq; ++s) {
+            const uint64_t sum = sums[s];
+            if (sum > 0) {
+                div[s] = 1.0f/float(sum);
            }
         }

-        // TODO: fix indexing [UBATCH_IDX]
-        for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+        for (int i = 0; i < n_tokens; i += n_seq_tokens) {
+            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                const llama_seq_id seq_id  = ubatch->seq_id[i][s];
+                const int32_t      seq_idx = ubatch->seq_idx[seq_id];

-            for (int i = 0; i < n_seq_tokens; ++i) {
-                data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
+                for (int j = 0; j < n_seq_tokens; ++j) {
+                    data[seq_idx*n_tokens + i + j] = div[seq_idx];
+                }
             }
         }
     }
 }
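The rewritten `mean` input is an `[n_tokens, n_seqs_unq]` weight matrix, one row per unique sequence, rather than an `[n_tokens, n_tokens]` matrix indexed by raw `seq_id`. A minimal standalone sketch of what that matrix computes when contracted with the token embeddings (in the graph this contraction is a `ggml_mul_mat`); the batch layout and values below are made up for illustration:

```cpp
// Minimal sketch, not llama.cpp code: a hypothetical ubatch with 4 tokens and
// 2 unique sequences of 2 tokens each, embedding size 3.
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 4, n_seqs_unq = 2, n_embd = 3;

    // mean[s*n_tokens + i] = 1/len(s) for the tokens i owned by sequence s,
    // exactly what set_input() writes via div[seq_idx]
    const std::vector<float> mean = {
        0.5f, 0.5f, 0.0f, 0.0f,  // sequence 0 owns tokens 0,1
        0.0f, 0.0f, 0.5f, 0.5f,  // sequence 1 owns tokens 2,3
    };
    const std::vector<float> embd = { // n_tokens x n_embd, one row per token
        1, 2, 3,   3, 4, 5,   10, 10, 10,   20, 30, 40,
    };

    for (int s = 0; s < n_seqs_unq; ++s) {       // one pooled row per sequence
        for (int e = 0; e < n_embd; ++e) {
            float acc = 0.0f;
            for (int i = 0; i < n_tokens; ++i) { // weighted sum == per-seq mean
                acc += mean[s*n_tokens + i]*embd[i*n_embd + e];
            }
            printf("%s%.1f", e ? " " : "", acc);
        }
        printf("\n");                            // seq 0: 2.0 3.0 4.0
    }                                            // seq 1: 15.0 20.0 25.0
}
```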

 void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
-    if (cparams.embeddings && (
-                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
-                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
-        const int64_t n_tokens     = ubatch->n_tokens;
-        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-        const int64_t n_seqs       = ubatch->n_seqs;
+    const int64_t n_tokens     = ubatch->n_tokens;
+    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+    const int64_t n_seqs_unq   = ubatch->n_seqs_unq;

+    if (cparams.embeddings && (
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK
+            )) {
         GGML_ASSERT(cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));

         uint32_t * data = (uint32_t *) cls->data;
-        memset(cls->data, 0, n_tokens * ggml_element_size(cls));
+        memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));

-        // TODO: fix indexing [UBATCH_IDX]
-        for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+        for (int i = 0; i < n_tokens; i += n_seq_tokens) {
+            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                const llama_seq_id seq_id  = ubatch->seq_id[i][s];
+                const int32_t      seq_idx = ubatch->seq_idx[seq_id];

-            // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
-
-            for (int i = 0; i < n_seq_tokens; ++i) {
-                const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
-
-                if (pos == 0) {
-                    data[seq_id] = s*n_seq_tokens + i;
-                }
+                data[seq_idx] = i;
             }
         }
     }

     if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
-        const int64_t n_tokens     = ubatch->n_tokens;
-        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-        const int64_t n_seqs       = ubatch->n_seqs;
-
         GGML_ASSERT(cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));

         uint32_t * data = (uint32_t *) cls->data;
-        memset(cls->data, 0, n_tokens * ggml_element_size(cls));
+        memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));

-        std::vector<int> last_pos(n_tokens, -1);
-        std::vector<int> last_row(n_tokens, -1);
+        std::vector<int> last_pos(n_seqs_unq, -1);
+        std::vector<int> last_row(n_seqs_unq, -1);

-        // TODO: fix indexing [UBATCH_IDX]
-        for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-            // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_pos pos = ubatch->pos[i];

-            for (int i = 0; i < n_seq_tokens; ++i) {
-                const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
+            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                const llama_seq_id seq_id  = ubatch->seq_id[i][s];
+                const int32_t      seq_idx = ubatch->seq_idx[seq_id];

-                if (pos >= last_pos[seq_id]) {
-                    last_pos[seq_id] = pos;
-                    last_row[seq_id] = s*n_seq_tokens + i;
+                if (pos >= last_pos[seq_idx]) {
+                    last_pos[seq_idx] = pos;
+                    last_row[seq_idx] = i;
                 }
             }
         }

-        for (int i = 0; i < n_tokens; ++i) {
-            if (last_row[i] >= 0) {
-                data[i] = last_row[i];
+        for (int s = 0; s < n_seqs_unq; ++s) {
+            if (last_row[s] >= 0) {
+                data[s] = last_row[s];
             }
         }
     }
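In both the CLS/RANK and LAST branches, `cls` now holds one token-row index per unique sequence, so pooling reduces to a row gather (in the graph this is a gather op, e.g. `ggml_get_rows`). A minimal standalone sketch with made-up shapes:

```cpp
// Minimal sketch, not llama.cpp code: `cls` stores one row index per unique
// sequence; pooling just picks those rows out of the token embeddings.
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 2;
    // token embeddings, one row per token (4 tokens)
    const std::vector<float> embd = { 1, 1,   2, 2,   3, 3,   4, 4 };
    // e.g. LAST pooling over two sequences: seq 0 ends at row 1, seq 1 at row 3
    const std::vector<int> rows = { 1, 3 };

    for (int s = 0; s < (int) rows.size(); ++s) {
        const float * e = &embd[rows[s]*n_embd]; // the in-graph gather does this
        printf("seq %d -> (%.0f, %.0f)\n", s, e[0], e[1]);
    }
    // prints: seq 0 -> (2, 2)
    //         seq 1 -> (4, 4)
}
```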
@@ -266,89 +253,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
-    if (kq_mask) {
-        if (cparams.causal_attn) {
-            const int64_t n_kv         = ubatch->n_tokens;
-            const int64_t n_tokens     = ubatch->n_tokens;
-            const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-            const int64_t n_seqs       = ubatch->n_seqs;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-            float * data = (float *) kq_mask->data;
-
-            for (int h = 0; h < 1; ++h) {
-                for (int s1 = 0; s1 < n_seqs; ++s1) {
-                    const llama_seq_id seq_id = ubatch->seq_id[s1][0];
-
-                    for (int j = 0; j < n_seq_tokens; ++j) {
-                        const int32_t tj = s1*n_seq_tokens + j;
-
-                        for (int s0 = 0; s0 < n_seqs; ++s0) {
-                            for (int i = 0; i < n_seq_tokens; ++i) {
-                                const int32_t ti = s0*n_seq_tokens + i;
-                                float f = -INFINITY;
-
-                                // TODO: fix indexing [UBATCH_IDX]
-                                for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
-                                    if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
-                                        if (hparams.use_alibi) {
-                                            f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
-                                        } else {
-                                            f = 0.0f;
-                                        }
-                                        break;
-                                    }
-                                }
-
-                                data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f;
-                            }
-                        }
-                    }
-                }
-            }
-        } else {
-            const int64_t n_tokens     = ubatch->n_tokens;
-            const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-            const int64_t n_seqs       = ubatch->n_seqs;
-            const int64_t n_stride     = ubatch->n_tokens;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-
-            float * data = (float *) kq_mask->data;
-
-            for (int h = 0; h < 1; ++h) {
-                for (int s1 = 0; s1 < n_seqs; ++s1) {
-                    const llama_seq_id seq_id = ubatch->seq_id[s1][0];
-
-                    for (int j = 0; j < n_seq_tokens; ++j) {
-                        const int32_t tj = s1*n_seq_tokens + j;
-
-                        for (int s0 = 0; s0 < n_seqs; ++s0) {
-                            for (int i = 0; i < n_seq_tokens; ++i) {
-                                const int32_t ti = s0*n_seq_tokens + i;
-                                float f = -INFINITY;
-
-                                // TODO: fix indexing [UBATCH_IDX]
-                                for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
-                                    if (ubatch->seq_id[s0][s] == seq_id) {
-                                        if (hparams.use_alibi) {
-                                            f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
-                                        } else {
-                                            f = 0.0f;
-                                        }
-                                        break;
-                                    }
-                                }
-
-                                data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
-                            }
-                        }
+    const int64_t n_kv     = ubatch->n_tokens;
+    const int64_t n_tokens = ubatch->n_tokens;
+
+    GGML_ASSERT(kq_mask);
+    GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
+
+    float * data = (float *) kq_mask->data;
+
+    for (int h = 0; h < 1; ++h) {
+        for (int i1 = 0; i1 < n_tokens; ++i1) {
+            const llama_seq_id s1 = ubatch->seq_id[i1][0];
+
+            for (int i0 = 0; i0 < n_tokens; ++i0) {
+                float f = -INFINITY;
+
+                for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
+                    const llama_seq_id s0 = ubatch->seq_id[i0][s];

-                        for (int i = n_tokens; i < n_stride; ++i) {
-                            data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
+                    // TODO: reimplement this like in llama_kv_cache_unified
+                    if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
+                        if (hparams.use_alibi) {
+                            f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
+                        } else {
+                            f = 0.0f;
                         }
+                        break;
                     }
                 }
+
+                data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
             }
         }
     }
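The merged loop computes the same mask for the causal and non-causal paths; only the `ubatch->pos[i0] <= ubatch->pos[i1]` term, guarded by `cparams.causal_attn`, differs. A standalone sketch of the mask it produces for a tiny, made-up ubatch:

```cpp
// Minimal sketch, not llama.cpp code: the mask for a hypothetical 3-token
// ubatch with tokens 0 and 1 in sequence 0 and token 2 in sequence 1,
// causal attention, no ALiBi. Row = query index i1, column = key index i0.
#include <cstdio>

int main() {
    const int n_tokens = 3;
    const int seq_of[n_tokens] = { 0, 0, 1 }; // one seq id per token
    const int pos_of[n_tokens] = { 0, 1, 0 };

    for (int i1 = 0; i1 < n_tokens; ++i1) {
        for (int i0 = 0; i0 < n_tokens; ++i0) {
            // same sequence and key not in the future -> visible (0.0f)
            const bool ok = seq_of[i0] == seq_of[i1] && pos_of[i0] <= pos_of[i1];
            printf("%6s", ok ? "0" : "-inf");
        }
        printf("\n");
    }
    // output:
    //      0  -inf  -inf
    //      0     0  -inf
    //   -inf  -inf     0
}
```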
@@ -371,34 +305,36 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
 }

 void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
-    if (cross_kq_mask) {
-        const int64_t n_enc = cross_kq_mask->ne[0];
-        const int64_t n_tokens = ubatch->n_tokens;
+    GGML_ASSERT(cross_kq_mask);

-        GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer));
-        GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
+    const int64_t n_enc    = cross_kq_mask->ne[0];
+    const int64_t n_tokens = ubatch->n_tokens;

-        float * data = (float *) cross_kq_mask->data;
+    GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer));
+    GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing

-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_enc; ++i) {
-                    float f = -INFINITY;
-                    // TODO: fix indexing [UBATCH_IDX]
-                    for (int s = 0; s < ubatch->n_seq_id[j]; ++s) {
-                        const llama_seq_id seq_id = ubatch->seq_id[j][s];
-                        if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) {
-                            f = 0.0f;
-                        }
+    float * data = (float *) cross_kq_mask->data;
+
+    for (int h = 0; h < 1; ++h) {
+        for (int i = 0; i < n_tokens; ++i) {
+            for (int j = 0; j < n_enc; ++j) {
+                float f = -INFINITY;
+
+                for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                    const llama_seq_id seq_id = ubatch->seq_id[i][s];
+
+                    if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
+                        f = 0.0f;
                     }
-                    data[h*(n_enc*n_tokens) + j*n_enc + i] = f;
                 }
+
+                data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
             }
+        }

-            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                for (int j = 0; j < n_enc; ++j) {
-                    data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
-                }
+        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+            for (int j = 0; j < n_enc; ++j) {
+                data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
             }
         }
     }
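The trailing loop force-masks padding rows: the mask tensor is allocated with its row count rounded up by `GGML_PAD`, so rows in `[n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))` never hold real tokens and must stay at `-INFINITY`. A quick sketch of the arithmetic; the pad value used below is hypothetical:

```cpp
// Minimal sketch, not llama.cpp code: GGML_PAD rounds up to a multiple of the
// pad value (for power-of-two pads). The value 32 is made up for illustration;
// the real GGML_KQ_MASK_PAD constant is defined in llama.cpp's sources.
#include <cstdio>

#define PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) // same shape as ggml's GGML_PAD

int main() {
    const int n_tokens = 70;
    printf("%d rows allocated, %d masked-out padding rows\n",
           PAD(n_tokens, 32), PAD(n_tokens, 32) - n_tokens); // 96, 26
}
```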
@@ -467,10 +403,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }

-int64_t llm_graph_context::n_pos_per_embd() const {
-    return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
-}
-
 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
     if (cb_func) {
         cb_func(ubatch, cur, name, il);
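The helper is not deleted outright: the `build_inp_pos()` hunk below now calls `hparams.n_pos_per_embd()`, so the logic has evidently moved onto `llama_hparams`. A sketch of what that member plausibly looks like (the actual declaration is outside this diff, and the return type here is a guess):

```cpp
// Presumed new home of the helper on llama_hparams (sketch only; the real
// definition lives in llama-hparams.* and may differ, e.g. in return type):
uint32_t llama_hparams::n_pos_per_embd() const {
    // M-RoPE tracks 4 position components per embedding instead of 1
    return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
}
```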
@@ -915,11 +847,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
+    auto inp = std::make_unique<llm_graph_input_pos>(hparams.n_pos_per_embd());

     auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
     ggml_set_input(cur);

     res->add_input(std::move(inp));
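Sizing note: with M-RoPE each token carries 4 position components instead of 1, which is why the flat `pos` tensor is `n_tokens*n_pos_per_embd()` elements long. The layout below is illustrative only; the exact section order is owned by the batch code, which this diff does not show:

```cpp
// Illustrative layout of the flattened pos buffer (not taken from the patch):
//
//   regular RoPE, 1 per token:  pos = [ p_0, p_1, ..., p_{n-1} ]
//   M-RoPE, 4 per token:        pos = [ component 0 (n values) | component 1
//                                       | component 2 | component 3 ]
```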
@@ -959,7 +891,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {

     auto & cur = inp->mean;

-    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, ubatch.n_seqs_unq);
     ggml_set_input(cur);

     res->add_input(std::move(inp));
@@ -972,7 +904,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {

     auto & cur = inp->cls;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_seqs_unq);
     ggml_set_input(cur);

     res->add_input(std::move(inp));
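Finally, the invariant the whole patch leans on: `ubatch->seq_idx[seq_id]` maps a global (and possibly sparse) sequence id to a dense index in `[0, n_seqs_unq)`, so per-sequence outputs fit in compact `n_seqs_unq`-sized buffers such as `mean` and `cls` above. A minimal sketch with made-up ids:

```cpp
// Minimal sketch, not llama.cpp code: how a dense seq_id -> row mapping like
// ubatch->seq_idx can be built. The ids 3 and 7 are made up for illustration.
#include <cstdio>

int main() {
    const int n_seqs_unq   = 2;
    const int seq_id_unq[] = { 3, 7 }; // the distinct seq ids in this ubatch
    int seq_idx[8];                    // seq_id -> dense index, -1 = absent
    for (int & v : seq_idx) v = -1;

    for (int s = 0; s < n_seqs_unq; ++s) {
        seq_idx[seq_id_unq[s]] = s;    // one compact row per unique sequence
    }
    printf("seq_id 7 -> row %d\n", seq_idx[7]); // prints: seq_id 7 -> row 1
}
```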