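// the K/V cache element types must evenly divide the attention head sizes, since
// quantized ggml types store their data in fixed-size blocks (ggml_blck_size)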
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);

if (!hparams.vocab_only) {
    // GPU backends
    for (auto * dev : model->devices) {
        ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
        if (backend == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
            llama_free(ctx);
            return nullptr;
        }
        ctx->backends.emplace_back(backend);
    }

    // add ACCEL backends (such as BLAS)
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
                llama_free(ctx);
                return nullptr;
            }
            ctx->backends.emplace_back(backend);
        }
    }

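    // the CPU backend is always added and serves as the fallback for operations that the
    // other backends do not support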
    // add CPU backend
    ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
    if (ctx->backend_cpu == nullptr) {
        LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
        llama_free(ctx);
        return nullptr;
    }
    ctx->backends.emplace_back(ctx->backend_cpu);

    // create a list of the set_n_threads functions in the backends
    for (auto & backend : ctx->backends) {
        ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
        ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
        if (reg) {
            auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
            if (ggml_backend_set_n_threads_fn) {
                ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
            }
        }
    }

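    // register the user-provided abort callback so that long-running graph computations can be interrupted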
    llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);

    if (!llama_kv_cache_init(ctx->kv_self, ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) {
        LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
        llama_free(ctx);
        return nullptr;
    }

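    // report the memory used by the K and V tensors of the self-attention cache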
    {
        size_t memory_size_k = 0;
        size_t memory_size_v = 0;

        for (auto & k : ctx->kv_self.k_l) {
            memory_size_k += ggml_nbytes(k);
        }

        for (auto & v : ctx->kv_self.v_l) {
            memory_size_v += ggml_nbytes(v);
        }

        LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
    }

    // graph outputs buffer
    {
        // resized during inference when a batch uses more outputs
        if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
            LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

        LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
                ggml_backend_buffer_name(ctx->buf_output.get()),
                ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0);
    }

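    // the scheduler assigns graph nodes to backends, splits the graph where tensors have to be
    // copied between them, and allocates a compute buffer per backend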
    // scheduler and compute buffers
    {
        // buffer types used for the compute buffer of each backend
        std::vector<ggml_backend_buffer_type_t> backend_buft;
        std::vector<ggml_backend_t> backend_ptrs;
        for (auto & backend : ctx->backends) {
            auto * buft = ggml_backend_get_default_buffer_type(backend.get());
            auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
            if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
                // use the host buffer of the first device CPU for faster transfer of the intermediate state
                auto * dev = model->devices[0];
                auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
                if (host_buft) {
                    buft = host_buft;
                }
            }
            backend_buft.push_back(buft);
            backend_ptrs.push_back(backend.get());
        }

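        // upper bound on the number of graph nodes, used to size both the graph meta buffer and the scheduler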
        const size_t max_nodes = model->max_nodes();

        // buffer used to store the computation graph and the tensor meta data
        ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

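        // pipeline parallelism is only worth enabling when the model is split by layer across
        // multiple devices and all layers (including the KV cache) are offloaded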
        // TODO: move these checks to ggml_backend_sched
        // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
        bool pipeline_parallel =
            model->n_devices() > 1 &&
            model->params.n_gpu_layers > (int)model->hparams.n_layer &&
            model->params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
            params.offload_kqv;

        // pipeline parallelism requires support for async compute and events in all devices
        if (pipeline_parallel) {
            for (auto & backend : ctx->backends) {
                auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
                if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
                    // ignore CPU backend
                    continue;
                }
                auto * dev = ggml_backend_get_device(backend.get());
                ggml_backend_dev_props props;
                ggml_backend_dev_get_props(dev, &props);
                if (!props.caps.async || !props.caps.events) {
                    // device does not support async compute or events
                    pipeline_parallel = false;
                    break;
                }
            }
        }

        ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));

        if (pipeline_parallel) {
            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get()));
        }

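        // two representative graphs are reserved: a prompt-processing (pp) graph with a full
        // ubatch and a token-generation (tg) graph with a single token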
        // initialize scheduler with the worst-case graph
        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
        llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph

        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
        ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);

        // reserve pp graph first so that buffers are only allocated once
        ggml_backend_sched_reserve(ctx->sched.get(), gf_pp);
        int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get());
        int n_nodes_pp = ggml_graph_n_nodes(gf_pp);

        // reserve with tg graph to get the number of splits and nodes
        llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
        ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
        ggml_backend_sched_reserve(ctx->sched.get(), gf_tg);
        int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get());
        int n_nodes_tg = ggml_graph_n_nodes(gf_tg);

        // reserve again with pp graph to avoid ggml-alloc reallocations during inference
        gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
        if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) {
            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

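        // log the compute buffer size allocated for each backend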
        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
            ggml_backend_t backend = backend_ptrs[i];
            ggml_backend_buffer_type_t buft = backend_buft[i];
            size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend);
            if (size > 1) {
                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                        ggml_backend_buft_name(buft),
                        size / 1024.0 / 1024.0);
            }
        }

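        // report node and split counts for the pp (bs = n_tokens) and tg (bs = 1) graphs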
        if (n_nodes_pp == n_nodes_tg) {
            LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
        } else {
            LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
        }
        if (n_splits_pp == n_splits_tg) {
            LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
        } else {
            LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
        }
    }
}