@@ -55,7 +55,18 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
 
-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd > 1) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the other dimensions are all 0, they are unused for text tokens
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd, 0);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[i] = ubatch->pos[i];
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
 
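Note on the buffer layout in the M-RoPE branch above: `pos_data` is dimension-major, so the real text positions fill the first `n_tokens` entries and the remaining `3*n_tokens` entries stay zero. The `else` branch keeps the direct copy for ordinary 1D positions, and the `ubatch->token` check presumably leaves the multimodal path alone, since image embeddings already arrive with multi-dimensional positions. A minimal standalone sketch of what the conversion produces, using plain `int32_t` in place of `llama_pos` (a typedef for it) and made-up positions:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n_tokens       = 3;
    const int64_t n_pos_per_embd = 4;         // M-RoPE case
    const int32_t pos[] = {5, 6, 7};          // stand-in for ubatch->pos

    // same conversion as the patch: first dimension holds the text positions,
    // the other three dimensions stay zero
    std::vector<int32_t> pos_data(n_tokens*n_pos_per_embd, 0);
    for (int i = 0; i < n_tokens; ++i) {
        pos_data[i] = pos[i];
    }

    // prints: 5 6 7 0 0 0 0 0 0 0 0 0
    for (int32_t p : pos_data) {
        printf("%d ", p);
    }
    printf("\n");
    return 0;
}
```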
@@ -71,7 +82,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
             ) * f_attn_temp_scale + 1.0;
         }
 
-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
 
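The size fix above follows from `attn_scale` holding exactly one float per token, so the `n_pos_per_token` factor never belonged in the byte count. For context, a hedged sketch of the per-token temperature scale that could fill `attn_scale_data`: only the `* f_attn_temp_scale + 1.0` tail is visible in this hunk, so the log/floor form and all concrete values below are assumptions, not code from the patch:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int   n_tokens = 4;
    const int   pos[]    = {0, 1000, 8000, 32000}; // example token positions
    const float n_attn_temp_floor_scale = 8192.0f; // hypothetical hparams
    const float f_attn_temp_scale       = 0.1f;

    // one scale per token, hence the buffer has n_tokens elements
    std::vector<float> attn_scale_data(n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
        attn_scale_data[i] = std::log(
            std::floor((pos[i] + 1.0f) / n_attn_temp_floor_scale) + 1.0f
        ) * f_attn_temp_scale + 1.0f;
        printf("pos=%5d scale=%f\n", pos[i], attn_scale_data[i]);
    }
    return 0;
}
```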
@@ -592,7 +603,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }
 
-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
 
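The renamed `n_pos_per_embd()` returns 4 only for Qwen2-VL, whose M-RoPE splits each position into multiple components (commonly described as temporal, height, and width sections plus a spare). Combined with the dimension-major layout from the first hunk, a hypothetical accessor for component `d` of token `i` would look like:

```cpp
#include <cstdint>

// Hypothetical helper, not part of the patch: read position component d of
// token i from the dimension-major buffer written by
// llm_graph_input_pos::set_input (all tokens' component 0 first, then 1, ...).
int32_t pos_component(const int32_t * pos_data, int64_t n_tokens, int64_t i, int64_t d) {
    return pos_data[d*n_tokens + i];
}
```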
@@ -1018,11 +1029,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
 
     auto & cur = inp->pos;
 
-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1031,11 +1042,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
 
     auto & cur = inp->attn_scale;
 
-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this needs to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
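On the new comment's point about broadcasting: `ggml_mul` repeats its second operand across the first, so a `[1, 1, n_tokens]` scale applied to an activation shaped `[n_embd_head, n_head, n_tokens]` multiplies every value of token `i` by `attn_scale[i]`. A sketch of that usage (the function and tensor names are illustrative, not taken from the patch):

```cpp
#include "ggml.h"

// Illustrative only: scale per-token activations by the 1x1xN attn_scale tensor.
// qcur:       [n_embd_head, n_head, n_tokens]
// attn_scale: [1, 1, n_tokens] -- broadcast over the first two dimensions
static ggml_tensor * apply_attn_scale(ggml_context * ctx, ggml_tensor * qcur, ggml_tensor * attn_scale) {
    return ggml_mul(ctx, qcur, attn_scale);
}
```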