@@ -359,6 +359,11 @@ static void llama_params_fit_impl(
 
         // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
         layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+
+        uint32_t n_full() const {
+            assert(n_layer >= n_part);
+            return n_layer - n_part;
+        }
     };
 
     const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -382,7 +387,7 @@ static void llama_params_fit_impl(
 
     size_t itbo = 0;
     for (size_t id = 0; id < nd; id++) {
-        il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
+        il0 += ngl_per_device[id].n_full();
         for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
             if (itbo + 1 >= ntbo) {
                 tensor_buft_overrides[itbo].pattern = nullptr;
@@ -393,7 +398,7 @@ static void llama_params_fit_impl(
                     + std::to_string(ntbo) + " is insufficient for model");
             }
             tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-            tensor_buft_overrides[itbo].buft = overflow_bufts[id];
+            tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
             itbo++;
         }
         il0 += ngl_per_device[id].n_part;
@@ -468,20 +473,14 @@ static void llama_params_fit_impl(
         LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }
 
-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
+    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
     overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd - 1; ++id) {
-        overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
+    for (size_t id = 0; id < nd; id++) {
+        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
     }
-    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
 
     std::vector<ngl_t> ngl_per_device(nd);
     std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
-    if (hp_nex > 0) {
-        for (size_t id = 0; id < nd; id++) {
-            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
-        }
-    }
 
     // optimize the number of layers per device using the method of false position:
     // - ngl_per_device has 0 layers for each device, lower bound
@@ -512,9 +511,6 @@ static void llama_params_fit_impl(
         if (mem_high[id] > targets[id]) {
             assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
             uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-            if (hp_nex > 0 && size_t(id) == nd - 1) {
-                delta--;
-            }
             LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
             while (delta > 1) {
                 uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
@@ -524,7 +520,8 @@ static void llama_params_fit_impl(
                 std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                 ngl_per_device_test[id].n_layer += step_size;
                 if (hp_nex) {
-                    ngl_per_device_test[id].n_part += step_size;
+                    ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+                        step_size - 1 : step_size; // the first layer is the output layer which must always be full
                 }
                 const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
 
@@ -573,7 +570,7 @@ static void llama_params_fit_impl(
     assert(id_dense_start < nd);
 
     LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start; id++) {
+    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         for (size_t jd = id_dense_start; jd < nd; jd++) {
             const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
@@ -585,12 +582,8 @@ static void llama_params_fit_impl(
         std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
 
         if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
-            assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
-            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
-            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
             while (delta > 1) {
                 uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                 step_size = std::max(step_size, uint32_t(1));
@@ -606,7 +599,7 @@ static void llama_params_fit_impl(
                     ngl_per_device_test[id].n_layer += n_convert_jd;
                     n_converted_test += n_convert_jd;
 
-                    if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
+                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
                         break;
                     }
                 }
@@ -625,8 +618,8 @@ static void llama_params_fit_impl(
                     LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                         __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                 }
-                delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                    - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
             }
         } else {
             ngl_per_device = ngl_per_device_high;
@@ -644,14 +637,19 @@ static void llama_params_fit_impl(
             ngl_per_device_test[id_dense_start_test].n_part--;
             ngl_per_device_test[id].n_layer++;
             ngl_per_device_test[id].n_part++;
-            if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
+            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
                 id_dense_start_test++;
             }
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
+            if (id < nd - 1) {
+                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
+            }
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
             if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                 ngl_per_device = ngl_per_device_test;
+                overflow_bufts = overflow_bufts_test;
                 mem = mem_test;
                 id_dense_start = id_dense_start_test;
                 LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -659,9 +657,10 @@ static void llama_params_fit_impl(
 
                 ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                 LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                 if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                     ngl_per_device = ngl_per_device_test;
+                    overflow_bufts = overflow_bufts_test;
                     mem = mem_test;
                     id_dense_start = id_dense_start_test;
                     LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -670,9 +669,10 @@ static void llama_params_fit_impl(
                 } else {
                     ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                     LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                    mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                    mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                     if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                         ngl_per_device = ngl_per_device_test;
+                        overflow_bufts = overflow_bufts_test;
                         mem = mem_test;
                         id_dense_start = id_dense_start_test;
                         LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -687,6 +687,14 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }
 
+    // print info for devices that were not changed during the conversion from dense only to full layers:
+    for (size_t id = id_dense_start + 1; id < nd; id++) {
+        const int64_t projected_margin = dmds_full[id].free - mem[id];
+        LLAMA_LOG_INFO(
+            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+    }
+
     set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }
 
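Note on the arithmetic the hunks above keep as context: the layer search is a false-position (regula falsi) bracket between a per-device layer count that is known to fit and one that is known to overshoot the memory target, with the next trial count interpolated linearly from the projected memory use of the two bounds (step_size = delta * (target - mem) / (mem_high - mem)). The standalone C++ sketch below illustrates only that update rule; memory_for() is a hypothetical, monotonic stand-in for get_memory_for_layers(), and the single-device setup deliberately omits the per-device targets, dense/partial layer split, and overflow buffer types handled by the real code.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for get_memory_for_layers(): projected memory use in bytes
// when n_layers layers are assigned to the device. Assumed monotonically increasing.
static int64_t memory_for(uint32_t n_layers) {
    return int64_t(n_layers) * 512 * 1024 * 1024; // pretend each layer costs ~512 MiB
}

// False-position search for the largest layer count whose projected memory use
// stays <= target. 'lo' must fit, 'hi' must overshoot; each iteration interpolates
// a trial count between them, mirroring the step_size computation in the patch.
static uint32_t fit_layers(uint32_t lo, uint32_t hi, int64_t target) {
    int64_t mem_lo = memory_for(lo);
    int64_t mem_hi = memory_for(hi);
    assert(hi > lo && mem_lo <= target && mem_hi > target);

    uint32_t delta = hi - lo;
    while (delta > 1) {
        uint32_t step = uint32_t(int64_t(delta) * (target - mem_lo) / (mem_hi - mem_lo));
        step = std::max(step, uint32_t(1));  // always make progress
        step = std::min(step, delta - 1);    // stay strictly inside the bracket

        const uint32_t test     = lo + step;
        const int64_t  mem_test = memory_for(test);
        if (mem_test <= target) {
            lo     = test;      // still fits -> raise the lower bound
            mem_lo = mem_test;
        } else {
            hi     = test;      // overshoots -> lower the upper bound
            mem_hi = mem_test;
        }
        delta = hi - lo;
    }
    return lo;
}

int main() {
    const int64_t target = int64_t(20) * 1024 * 1024 * 1024; // 20 GiB budget
    std::printf("layers that fit: %u\n", fit_layers(0, 48, target));
    return 0;
}

Clamping the step into [1, delta - 1] keeps each trial strictly inside the bracket so both bounds keep tightening; the patch applies the same lower clamp via std::max(step_size, uint32_t(1)).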