diff --git a/common/arg.cpp b/common/arg.cpp index a465eb36234e7..836f5e80e130e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -1434,7 +1435,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.kv_overrides.back().key[0] = 0; } - if (!params.tensor_buft_overrides.empty()) { + // pad tensor_buft_overrides for llama_params_fit: + const size_t ntbo = llama_max_tensor_buft_overrides(); + while (params.tensor_buft_overrides.size() < ntbo) { params.tensor_buft_overrides.push_back({nullptr, nullptr}); } @@ -2961,6 +2964,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_env("LLAMA_ARG_MAIN_GPU")); + add_opt(common_arg( + { "-fit", "--fit" }, "[on|off]", + string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"), + [](common_params & params, const std::string & value) { + if (is_truthy(value)) { + params.fit_params = true; + } else if (is_falsey(value)) { + params.fit_params = false; + } else { + throw std::runtime_error( + string_format("error: unknown value for --fit: '%s'\n", value.c_str())); + } + } + ).set_env("LLAMA_ARG_FIT")); + add_opt(common_arg( + { "-fitm", "--fit-margin" }, "MiB", + string_format("target margin per device for --fit option, default: %zu", params.fit_params_margin/(1024*1024)), + [](common_params & params, int value) { + params.fit_params_margin = value * size_t(1024*1024); + } + ).set_env("LLAMA_ARG_FIT_MARGIN")); + add_opt(common_arg( + { "-fitc", "--fit-ctx" }, "N", + string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx), + [](common_params & params, int value) { + params.fit_params_min_ctx = value; + } + ).set_env("LLAMA_ARG_FIT_CTX")); add_opt(common_arg( {"--check-tensors"}, string_format("check model tensor data for invalid values (default: 
%s)", params.check_tensors ? "true" : "false"), diff --git a/common/common.cpp b/common/common.cpp index b0591e84b0668..b60655f8c12e0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -916,6 +916,19 @@ std::string fs_get_cache_file(const std::string & filename) { struct common_init_result common_init_from_params(common_params & params) { common_init_result iparams; auto mparams = common_model_params_to_llama(params); + auto cparams = common_context_params_to_llama(params); + + if (params.fit_params) { + const bool fit_successful = llama_params_fit(params.model.path.c_str(), &mparams, &cparams, + params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_margin, params.fit_params_min_ctx, + params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + + if (fit_successful) { + LOG_INF("%s: successfully fit parameters to device memory\n", __func__); + } else { + LOG_WRN("%s: failed to fit parameters to device memory, may crash during allocation\n", __func__); + } + } llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { @@ -926,8 +939,6 @@ struct common_init_result common_init_from_params(common_params & params) { const llama_vocab * vocab = llama_model_get_vocab(model); - auto cparams = common_context_params_to_llama(params); - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", diff --git a/common/common.h b/common/common.h index a8cb630ea5805..15ef3651c7cbc 100644 --- a/common/common.h +++ b/common/common.h @@ -274,8 +274,8 @@ struct lr_opt { struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata); struct common_params { - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 4096; // context size + int32_t n_predict = -1; // max. 
number of new tokens to predict, -1 == no limit + int32_t n_ctx = 0; // context size, 0 == context the model was trained with int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt @@ -296,9 +296,12 @@ struct common_params { // offload params std::vector devices; // devices to use for offloading - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + bool fit_params = true; // whether to fit unset model/context parameters to free device memory + size_t fit_params_margin = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory + int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 2cb150fd2a313..78aa059dde380 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); // call with a worst-case graph to avoid buffer reallocations // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed // returns false if the buffer allocation failed +// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by 
ggml_gallocr_reserve_n to sizes GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +GGML_API void ggml_gallocr_reserve_n_size( + ggml_gallocr_t galloc, + struct ggml_cgraph * graph, + const int * node_buffer_ids, + const int * leaf_buffer_ids, + size_t * sizes); GGML_API bool ggml_gallocr_reserve_n( ggml_gallocr_t galloc, struct ggml_cgraph * graph, @@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i // Utils // Create a buffer and allocate all the tensors in a ggml_context +// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft +GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index f1b740785914e..4ed5f35774ffc 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -307,6 +307,7 @@ extern "C" { GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph + GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d948b00cc7f30..c5b378e9e5fde 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2509,7 +2509,8 @@ extern "C" { // Set callback for all future logging events. 
// If this is not called, or NULL is supplied, everything is output on stderr. - GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); + GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data); + GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 91aff205f1832..45f014d846ba0 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -602,7 +602,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { } static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { - return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; + return t->data != NULL // tensor data already set externally + || t->buffer // tensor on external buffer (but not yet allocated) + || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc } // free the extra space at the end if the new tensor is smaller @@ -820,7 +822,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } -bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { +static bool ggml_gallocr_reserve_n_impl( + ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) { size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; @@ -922,14 +925,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c if (realloc) { #ifndef NDEBUG size_t cur_size = galloc->buffers[i] ? 
ggml_vbuffer_size(galloc->buffers[i]) : 0; - GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", + __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif ggml_vbuffer_free(galloc->buffers[i]); - galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); - if (galloc->buffers[i] == NULL) { - GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); - return false; + if (no_alloc) { + galloc->buffers[i] = NULL; + } else { + galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); + if (galloc->buffers[i] == NULL) { + GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + return false; + } } } } @@ -937,6 +945,21 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c return true; } +void ggml_gallocr_reserve_n_size( + ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) { + GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true)); + for (int i = 0; i < galloc->n_buffers; i++) { + sizes[i] = 0; + for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) { + sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size; + } + } +} + +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { + return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false); +} + bool 
ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); } @@ -1139,7 +1162,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return true; } -ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { +static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl( + struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) { GGML_ASSERT(ggml_get_no_alloc(ctx) == true); size_t alignment = ggml_backend_buft_get_alignment(buft); @@ -1147,6 +1171,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte ggml_backend_buffer_t * buffers = NULL; size_t n_buffers = 0; + *nbytes_total = 0; size_t cur_buf_size = 0; struct ggml_tensor * first = ggml_get_first_tensor(ctx); @@ -1158,10 +1183,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) { // allocate tensors in the current buffer - if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { + if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { return NULL; } first = t; + *nbytes_total += cur_buf_size; cur_buf_size = this_size; } else { cur_buf_size += this_size; @@ -1170,15 +1196,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte // allocate remaining tensors if (cur_buf_size > 0) { - if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { + *nbytes_total += cur_buf_size; + if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { return NULL; } } + if (no_alloc) { + return NULL; + } + if (n_buffers == 0) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); #endif + 
GGML_ASSERT(!buffers); return NULL; } @@ -1188,10 +1220,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } else { buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers); } - free(buffers); + if (buffers) { + free(buffers); // can be NULL if context is empty or no_alloc + } return buffer; } +size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true); + GGML_ASSERT(!buf); + return nbytes_total; +} + +ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false); +} + ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) { return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend)); } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index ff9135fe2d878..a4507da93363c 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { } ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_ASSERT(buft); if (size == 0) { // return a dummy buffer for zero-sized allocations return ggml_backend_buffer_init(buft, {}, NULL, 0); } - - GGML_ASSERT(buft); return buft->iface.alloc_buffer(buft, size); } @@ -1694,6 +1693,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched->is_alloc = false; } +void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) { + GGML_ASSERT(sched); + 
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); + GGML_ASSERT(sizes); + + ggml_backend_sched_reset(sched); + + ggml_backend_sched_synchronize(sched); + + ggml_backend_sched_split_graph(sched, measure_graph); + + ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes); +} + bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { GGML_ASSERT(sched); GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9be35c1be8456..bb53290fdacea 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7345,6 +7345,11 @@ size_t ggml_quantize_chunk( //////////////////////////////////////////////////////////////////////////////// +void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) { + *log_callback = g_logger_state.log_callback; + *user_data = g_logger_state.log_callback_user_data; +} + void ggml_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? 
log_callback : ggml_log_callback_default; g_logger_state.log_callback_user_data = user_data; diff --git a/include/llama.h b/include/llama.h index a0a660bff88da..d54a9b62ee96b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -297,6 +297,7 @@ extern "C" { bool check_tensors; // validate model tensor data bool use_extra_bufts; // use extra buffer types (used for weight repacking) bool no_host; // bypass host buffer allowing extra buffers to be used + bool no_alloc; // only load metadata and simulate memory allocations }; // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations @@ -450,10 +451,23 @@ extern "C" { // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); + // fits mparams and cparams to free device memory (assumes system memory is unlimited) + // returns true if the parameters could be successfully modified to fit device memory + LLAMA_API bool llama_params_fit( + const char * path_model, + struct llama_model_params * mparams, + struct llama_context_params * cparams, + float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements + struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements + size_t margin, // margin of memory to leave per device in bytes + uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use + enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log + LLAMA_API int64_t llama_time_us(void); LLAMA_API size_t llama_max_devices(void); LLAMA_API size_t llama_max_parallel_sequences(void); + LLAMA_API size_t llama_max_tensor_buft_overrides(void); LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mlock (void); @@ -1332,7 +1346,8 @@ extern "C" { // Set callback for all future logging events. 
// If this is not called, or NULL is supplied, everything is output on stderr. - LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); + LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data); + LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); // // Performance utils diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f6192a36e0ee5..7c789a46bc1e6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -208,6 +208,7 @@ llama_context::llama_context( backend_buft.clear(); backend_ptrs.clear(); + backend_buf_exp_size.clear(); for (auto & backend : backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); @@ -224,6 +225,7 @@ llama_context::llama_context( backend_buft.push_back(buft); backend_ptrs.push_back(backend.get()); + backend_buf_exp_size.push_back(0); } LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size()); @@ -339,7 +341,8 @@ llama_context::llama_context( // reserve pp (prompt processing) graph first so that buffers are only allocated once { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), + model.hparams.no_alloc, model.hparams.no_alloc ? 
backend_buf_exp_size.data() : nullptr); if (!gf) { if (pipeline_parallel) { LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__); @@ -357,7 +360,7 @@ llama_context::llama_context( // reserve with tg (token generation) graph to get the number of splits and nodes { - auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get()); + auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc); if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -372,7 +375,7 @@ llama_context::llama_context( // // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get()); // - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -381,11 +384,13 @@ llama_context::llama_context( for (size_t i = 0; i < backend_ptrs.size(); ++i) { ggml_backend_t backend = backend_ptrs[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { + if (!model.hparams.no_alloc) { + backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend); + } + if (backend_buf_exp_size[i] > 1) { LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); + backend_buf_exp_size[i] / 1024.0 / 1024.0); } } @@ -404,6 +409,22 @@ llama_context::llama_context( } llama_context::~llama_context() { + if (!model.hparams.no_alloc) { + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + + const size_t size_exp = backend_buf_exp_size[i]; + const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size_exp == size_act) { + LLAMA_LOG_DEBUG("%s: 
%10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n", + __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); + } else { + LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n", + __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); + } + } + } ggml_opt_free(opt_ctx); } @@ -1374,7 +1395,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const { return static_cast(gf_res_reserve.get()); } -ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) { +ggml_cgraph * llama_context::graph_reserve( + uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) { LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); GGML_ASSERT(n_outputs >= 1); @@ -1411,8 +1433,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u // initialize scheduler with the specified graph if (split_only) { - ggml_backend_sched_split_graph(sched.get(), gf); + if (sizes) { + ggml_backend_sched_reserve_size(sched.get(), gf, sizes); + } else { + ggml_backend_sched_split_graph(sched.get(), gf); + } } else if (!ggml_backend_sched_reserve(sched.get(), gf)) { + GGML_ASSERT(!sizes); LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); return nullptr; } @@ -2034,15 +2061,26 @@ void llama_context::perf_reset() { std::map llama_context::memory_breakdown() const { std::map ret; - for (const auto & buft_size : model.memory_breakdown()) { - ret[buft_size.first].model += buft_size.second; + for (const auto & [buft, size] : model.memory_breakdown()) { + ret[buft].model += size; } - for (const auto & buft_size : memory->memory_breakdown()) { - 
ret[buft_size.first].context += buft_size.second; + if (memory) { + for (const auto & [buft, size] : memory->memory_breakdown()) { + ret[buft].context += size; + } } - for (const auto & backend_ptr : backends) { - ggml_backend_t backend = backend_ptr.get(); - ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (model.hparams.no_alloc) { + for (size_t i = 0; i < backends.size(); ++i) { + ggml_backend_t backend = backends[i].get(); + ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend); + ret[buft].compute += backend_buf_exp_size[i]; + } + } else { + for (const auto & backend_ptr : backends) { + ggml_backend_t backend = backend_ptr.get(); + ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend); + ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend); + } } return ret; } diff --git a/src/llama-context.h b/src/llama-context.h index ed6d82cb396f9..e03cc01483279 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -26,6 +26,10 @@ struct llama_memory_breakdown_data { size_t model = 0; // memory allocated for the model size_t context = 0; // memory allocated for the context size_t compute = 0; // memory allocated for temporary compute buffers + + size_t total() const { + return model + context + compute; + } }; struct llama_context { @@ -206,7 +210,8 @@ struct llama_context { ggml_status graph_compute(ggml_cgraph * gf, bool batched); // reserve a graph with a dummy ubatch of the specified size - ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false); + ggml_cgraph * graph_reserve( + uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr); private: llm_graph_params graph_params( @@ -281,9 +286,10 @@ struct 
llama_context { std::vector> set_n_threads_fns; - // buffer types used for the compute buffer of each backend + // pointers and buffer types used for the compute buffer of each backend std::vector backend_ptrs; std::vector backend_buft; + std::vector backend_buf_exp_size; // expected buffer sizes llm_graph_result_ptr gf_res_prev; llm_graph_result_ptr gf_res_reserve; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6fcf91b7daa47..b0b373e9af99f 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -34,6 +34,7 @@ struct llama_hparams_convnext { struct llama_hparams { bool vocab_only; + bool no_alloc; bool rope_finetuned; bool use_par_res; bool swin_norm; diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd323a6..87b9d8bb27a55 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -25,6 +25,10 @@ time_meas::~time_meas() { } } +void llama_log_get(ggml_log_callback * log_callback, void ** user_data) { + ggml_log_get(log_callback, user_data); +} + void llama_log_set(ggml_log_callback log_callback, void * user_data) { ggml_log_set(log_callback, user_data); g_logger_state.log_callback = log_callback ? 
log_callback : llama_log_callback_default; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index add74391f0c47..b0338d302c331 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache( // allocate tensors and initialize the buffers to avoid NaNs in the padding for (auto & [buft, ctx] : ctx_map) { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); + ggml_backend_buffer_t buf; + if (model.hparams.no_alloc) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { + t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer + } if (!buf) { throw std::runtime_error("failed to allocate buffer for kv cache"); } @@ -476,9 +484,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { std::map llama_kv_cache::memory_breakdown() const { std::map ret; - for (const auto & [_, buf] : ctxs_bufs) { - ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + for (const auto & [ctx, buf] : ctxs_bufs) { + ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get()); + + if (hparams.no_alloc) { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr); + ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft); + } else { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); + ret[buft] += ggml_backend_buffer_get_size(buf.get()); + } } + return ret; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index aa3a65f87a542..ca2ea2461d223 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader( std::vector & 
splits, bool use_mmap, bool check_tensors, + bool no_alloc, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { int trace = 0; @@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader( this->use_mmap = use_mmap; this->check_tensors = check_tensors; + this->no_alloc = no_alloc; } std::string llama_model_loader::get_arch_name() const { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index c9189f6cb4466..0380c92fde0e3 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -71,6 +71,7 @@ struct llama_model_loader { bool use_mmap = false; bool check_tensors; + bool no_alloc; llama_files files; llama_ftype ftype; @@ -97,6 +98,7 @@ struct llama_model_loader { std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, bool check_tensors, + bool no_alloc, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bb83a04e96055..a0cedcd415275 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6187,9 +6187,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { std::vector bufs; if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) { + GGML_ASSERT(!ml.no_alloc); for (uint32_t idx = 0; idx < ml.files.size(); idx++) { // only the mmap region containing the tensors in the model is mapped to the backend buffer - // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, + // then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size void * addr = nullptr; size_t 
first, last; // NOLINT @@ -6205,9 +6207,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) { bufs.emplace_back(buf); buf_map.emplace(idx, buf); } - } - else { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + } else { + ggml_backend_buffer_t buf; + if (ml.no_alloc) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer + } if (buf == nullptr) { throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); } @@ -6262,6 +6271,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + if (ml.no_alloc) { + return true; + } + // load tensor data for (auto & [ctx, buf_map] : ctx_buf_maps) { if (!ml.load_all_data(ctx, buf_map, use_mlock ? 
&pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { @@ -6304,9 +6317,18 @@ size_t llama_model::n_devices() const { std::map llama_model::memory_breakdown() const { std::map ret; - for (const auto & [_, bufs] : pimpl->ctxs_bufs) { - for (const auto & buf : bufs) { - ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) { + if (hparams.no_alloc) { + GGML_ASSERT(bufs.size() == 1); + ggml_backend_buffer_t buf = bufs[0].get(); + GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr); + ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf); + ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft); + } else { + for (const auto & buf : bufs) { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); + ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + } } } return ret; @@ -6351,6 +6373,7 @@ void llama_model::print_info() const { // hparams LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str()); LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only); + LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc); if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); @@ -20218,6 +20241,7 @@ llama_model_params llama_model_default_params() { /*.check_tensors =*/ false, /*.use_extra_bufts =*/ true, /*.no_host =*/ false, + /*.no_alloc =*/ false, }; return result; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6dd40412b488e..7c560aac23f97 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); + llama_model_loader ml(fname_inp, splits, use_mmap, 
/*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index ab2e9868af468..f4c94df0f3f96 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,6 +1,9 @@ +#include "llama.h" + #include "llama-impl.h" #include "llama-chat.h" +#include "llama-context.h" #include "llama-mmap.h" #include "llama-vocab.h" #include "llama-model-loader.h" @@ -11,11 +14,14 @@ #include "ggml-backend.h" #include +#include +#include #include #include #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -37,6 +43,589 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty GGML_ABORT("fatal error"); } +struct llama_device_memory_data { + int64_t total; + int64_t free; + llama_memory_breakdown_data mb; +}; + +static std::vector llama_get_device_memory_data( + const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams, + std::vector & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert, + uint32_t & hp_n_layers_dense_lead, const ggml_log_level log_level) { + struct user_data_t { + struct { + ggml_log_callback callback; + void * user_data; + } original_logger; + ggml_log_level min_level; // prints below this log level go to debug log + }; + user_data_t ud; + llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data); + ud.min_level = log_level; + + llama_log_set([](ggml_log_level level, const char * text, void * user_data) { + const user_data_t * ud = (const user_data_t *) user_data; + const ggml_log_level level_eff = level >= ud->min_level ? 
level : GGML_LOG_LEVEL_DEBUG; + ud->original_logger.callback(level_eff, text, ud->original_logger.user_data); + }, &ud); + + llama_model_params mparams_copy = *mparams; + mparams_copy.no_alloc = true; + mparams_copy.use_mmap = false; + + llama_model * model = llama_model_load_from_file(path_model, mparams_copy); + if (model == nullptr) { + throw std::runtime_error("failed to load model"); + } + + llama_context * ctx = llama_init_from_model(model, *cparams); + if (ctx == nullptr) { + llama_model_free(model); + throw std::runtime_error("failed to create llama_context from model"); + } + + std::vector ret(model->devices.size()); + + std::map memory_breakdown = ctx->memory_breakdown(); + + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const llama_memory_breakdown_data & mb = buft_mb.second; + + if (ggml_backend_buft_is_host(buft)) { + continue; + } + + ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); + if (!dev) { + continue; + } + for (size_t i = 0; i < ret.size(); i++) { + if (model->devices[i] == dev) { + ret[i].mb.model += mb.model; + ret[i].mb.context += mb.context; + ret[i].mb.compute += mb.compute; + break; + } + } + } + for (size_t i = 0; i < ret.size(); i++) { + size_t free, total; + ggml_backend_dev_memory(model->devices[i], &free, &total); + ret[i].free = free; + ret[i].total = total; + } + + devs = model->devices; + hp_ngl = model->hparams.n_layer; + hp_n_ctx_train = model->hparams.n_ctx_train; + hp_n_expert = model->hparams.n_expert; + hp_n_layers_dense_lead = model->hparams.n_layer_dense_lead; + + llama_memory_breakdown_print(ctx); // goes to debug log + + llama_free(ctx); + llama_model_free(model); + llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); + return ret; +} + + +static void llama_params_fit_impl( + const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, + float * tensor_split, struct 
llama_model_tensor_buft_override * tensor_buft_overrides, + size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { + constexpr int64_t MiB = 1024*1024; + const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits + typedef std::vector dmds_t; + const llama_model_params default_mparams = llama_model_default_params(); + + std::vector devs; + uint32_t hp_ngl = 0; // hparams.n_gpu_layers + uint32_t hp_nct = 0; // hparams.n_ctx_train + uint32_t hp_nex = 0; // hparams.n_expert + uint32_t hp_nldl = 0; // hparams.n_layers_dense_lead + + // step 1: get data for default parameters and check whether any changes are necessary in the first place + + LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__); + const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, hp_nldl, log_level); + const size_t nd = devs.size(); // number of devices + if (nd == 0) { + LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__); + return; + } + + std::vector dev_names; + { + dev_names.reserve(nd); + size_t max_length = 0; + for (ggml_backend_dev_t dev : devs) { + std::string name = ggml_backend_dev_name(dev); + name += " ("; + name += ggml_backend_dev_description(dev); + name += ")"; + dev_names.push_back(name); + max_length = std::max(max_length, name.length()); + } + for (std::string & dn : dev_names) { + dn.insert(dn.end(), max_length - dn.length(), ' '); + } + } + + int64_t sum_total = 0; + int64_t sum_projected_free = 0; + int64_t min_projected_free = INT64_MAX; + int64_t sum_projected_used = 0; + int64_t sum_projected_ctx = 0; + + if (nd > 1) { + LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); + } + for (size_t id = 0; id < nd; id++) { + const llama_device_memory_data & dmd = dmds_full[id]; + + const int64_t projected_used = dmd.mb.total(); + const int64_t 
projected_free = dmd.free - projected_used; + + sum_total += dmd.total; + sum_projected_used += projected_used; + sum_projected_free += projected_free; + min_projected_free = std::min(min_projected_free, projected_free); + sum_projected_ctx += dmd.mb.context; + + if (nd > 1) { + LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n", + __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB, + projected_free >= 0 ? "surplus" : "deficit"); + } + } + assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0); + assert(sum_projected_used >= sum_projected_ctx); + LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", + __func__, sum_projected_used/MiB, sum_total/MiB); + if (min_projected_free >= margin) { + if (nd == 1) { + LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", + __func__, min_projected_free/MiB, margin/MiB); + return; + } + LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n", + __func__, min_projected_free/MiB, margin/MiB); + return; + } + + // step 2: try reducing memory use by reducing the context size + + int64_t global_memory_reduction_vs_full = 0; + { + int64_t global_surplus = sum_projected_free - int64_t(nd)*margin; + if (global_surplus < 0) { + LLAMA_LOG_INFO(nd == 1 ? 
+ "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" : + "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n", + __func__, margin/MiB, -global_surplus/MiB); + if (cparams->n_ctx == 0) { + if (hp_nct > n_ctx_min) { + const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct; + const uint32_t ctx_reduction = std::min( + uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min); + cparams->n_ctx = hp_nct - ctx_reduction; + const int64_t memory_reduction = ctx_reduction * bytes_per_ctx; + global_surplus += memory_reduction; + global_memory_reduction_vs_full += memory_reduction; + LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", + __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); + } else { + LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", + __func__, hp_nct, n_ctx_min); + } + } else { + LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); + } + } + if (global_surplus > 0) { + LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__); + return; + } + } + + if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { + throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); + } + if (nd > 1) { + if (!tensor_split) { + throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort"); + } + if (mparams->tensor_split) { + for (size_t id = 0; id < nd; id++) { + if (mparams->tensor_split[id] != 0.0f) { + throw std::runtime_error("model_params::tensor_split already set by user, abort"); + } + } + } + if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { + throw std::runtime_error("changing weight allocation for 
LLAMA_SPLIT_MODE_ROW not implemented, abort"); + } + if (hp_ngl < 2*nd) { + throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least " + + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort"); + } + } + if (hp_nex > 0 && !tensor_buft_overrides) { + throw std::runtime_error("did not provide buffer to set tensor_buft_overrides for MoE model, abort"); + } + if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) { + throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort"); + } + + // utility function that returns the memory use per device for given numbers of layers per device + auto get_memory_for_layers = [&](const std::vector & layers_per_device) -> std::vector { + llama_model_params mparams_copy = *mparams; + mparams_copy.n_gpu_layers = 0; + for (const uint32_t & ngl : layers_per_device) { + mparams_copy.n_gpu_layers += ngl; + } + assert(uint32_t(mparams_copy.n_gpu_layers) == hp_ngl + 1); + if (nd > 1) { + for (size_t id = 0; id < nd; id++) { + tensor_split[id] = layers_per_device[id]; + } + } + mparams_copy.tensor_split = tensor_split; + const dmds_t dmd_nl = llama_get_device_memory_data( + path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, hp_nldl, log_level); + std::vector ret; + ret.reserve(nd); + for (const llama_device_memory_data & dmd : dmd_nl) { + ret.push_back(dmd.mb.total()); + } + return ret; + }; + + if (hp_nex > 0) { + // utility function that returns a static C string matching the MoE tensors for a specific layer: + auto get_moe_pattern = [&](const size_t il) -> const char * { + static std::vector patterns; + while (patterns.size() <= il) { + patterns.push_back("blk\\." 
+ std::to_string(patterns.size()) + "\\.ffn_(up|down|gate)_(ch|)exps"); + } + return patterns[il].c_str(); + }; + + const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors + ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type(); + tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft}; + tensor_buft_overrides[1] = {nullptr, nullptr}; + mparams->tensor_buft_overrides = tensor_buft_overrides; + + LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__); + const dmds_t dmds_cpu_moe = llama_get_device_memory_data( + path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, hp_nldl, log_level); + + // reset + tensor_buft_overrides[0] = {nullptr, nullptr}; + mparams->tensor_buft_overrides = tensor_buft_overrides; + + int64_t global_surplus = 0; + for (const llama_device_memory_data & dmd : dmds_cpu_moe) { + global_surplus += dmd.free; + global_surplus -= int64_t(dmd.mb.total()) + margin; + } + if (global_surplus >= 0) { + LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n", __func__, global_surplus/MiB); + + // step 3a: for MoE models and a single device, if at least the dense tensors can be fit, simply interpolate: + if (nd == 1) { + const int64_t projected_full = int64_t(dmds_full[0].mb.total()) - global_memory_reduction_vs_full; + const int64_t diff_total = projected_full - int64_t(dmds_cpu_moe[0].mb.total()); + const int64_t diff_per_layer = diff_total / int64_t(hp_ngl - hp_nldl); + const uint32_t layers_full = global_surplus / diff_per_layer + hp_nldl + 1; // extra "layer" for non-repeating tensors is always dense + const uint32_t layers_part = hp_ngl + 1 - layers_full; + + { + const size_t ntbo = llama_max_tensor_buft_overrides(); + size_t itbo = 0; + for (uint32_t il = hp_nldl; il < layers_part; il++) { + if (itbo + 1 >= ntbo) { + throw 
std::runtime_error("llama_max_tensor_buft_overrides() == " + + std::to_string(ntbo) + " is insufficient for model\n"); + } + tensor_buft_overrides[itbo].pattern = get_moe_pattern(il); + tensor_buft_overrides[itbo].buft = cpu_buft; + itbo++; + } + tensor_buft_overrides[itbo].pattern = nullptr; + tensor_buft_overrides[itbo].buft = nullptr; + itbo++; + mparams->tensor_buft_overrides = tensor_buft_overrides; + } + + const int64_t projected_use = projected_full - layers_part*diff_per_layer; + const int64_t projected_margin = dmds_full[0].free - projected_use; + LLAMA_LOG_INFO("%s: set to use %u dense-only layers and %u full layers, %" PRId64 " MiB used, %" PRId64 " MiB free\n", + __func__, layers_part, layers_full, projected_use/MiB, projected_margin/MiB); + return; + } + + // step 3b: for MoE models and multiple devices, if at least the dense tensors can be fit, + // try fitting as many full layers as possible by iteratively adjusting layers per device + + struct ngl_t { + uint32_t part = 0; + uint32_t full = 0; + + explicit operator std::string() const { + return "[" + std::to_string(part) + ", " + std::to_string(full) + "]"; + } + }; + const size_t ntbo = llama_max_tensor_buft_overrides(); + + // utility function that sets tensor buft overrides to produce a given layer distribution + auto set_tensor_buft_overrides = [&](const std::vector & ngl_per_device) { + size_t itbo = 0; + uint32_t il0 = 0; + for (size_t id = 0; id < nd && itbo + 1 < ntbo; id++) { + // on last device one of the "full layers" is the non-repeating layers + const uint32_t il0_loop = id < nd - 1 ?
il0 + ngl_per_device[id].full : il0 + ngl_per_device[id].full - 1; + for (uint32_t il = il0_loop; il < il0_loop + ngl_per_device[id].part; il++) { + if (itbo + 1 >= ntbo) { + throw std::runtime_error("llama_max_tensor_buft_overrides() == " + + std::to_string(ntbo) + " is insufficient for model\n"); + } + assert(il >= hp_nldl); + assert(il < hp_ngl); + tensor_buft_overrides[itbo].pattern = get_moe_pattern(il); + tensor_buft_overrides[itbo].buft = cpu_buft; + itbo++; + } + const uint32_t ngl = ngl_per_device[id].part + ngl_per_device[id].full; + tensor_split[id] = ngl; + il0 += ngl; + } + tensor_buft_overrides[itbo].pattern = nullptr; + tensor_buft_overrides[itbo].buft = nullptr; + itbo++; + mparams->tensor_buft_overrides = tensor_buft_overrides; + }; + + // utility function that returns the projected memory use for a given layer assignment + auto get_memory_for_layers_moe = [&](const char * func_name, const std::vector & ngl_per_device) -> std::vector { + set_tensor_buft_overrides(ngl_per_device); + + std::vector total_ngl_per_device; + total_ngl_per_device.reserve(nd); + for (const ngl_t & ngl : ngl_per_device) { + total_ngl_per_device.push_back(ngl.full + ngl.part); + } + const auto mem = get_memory_for_layers(total_ngl_per_device); + + LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name); + for (size_t id = 0; id < nd; id++) { + LLAMA_LOG_DEBUG("%s: id=%zu, ngl_full=%" PRIu32 ", ngl_part=%" PRIu32 ", mem=%" PRId64 " MiB\n", + func_name, id, ngl_per_device[id].full, ngl_per_device[id].part, mem[id]/MiB); + } + + // reset + tensor_buft_overrides[0].pattern = nullptr; + tensor_buft_overrides[0].buft = nullptr; + mparams->tensor_buft_overrides = tensor_buft_overrides; + + return mem; + }; + + std::vector ngl_per_device(nd); + ngl_per_device.back().part = 1; // memory on first device can increase if last device has a partial layer, so start with it + ngl_per_device.back().full = hp_ngl + 1 - 1; // 1 "layer for non-repeating tensors" +
std::vector targets; + targets.reserve(nd); + for (size_t id = 0; id < nd; id++) { + targets.push_back(dmds_full[id].free - margin); + } + std::vector mem; + + // utility function that iteratively tries moving layers from the last device to other devices + // initially use a larger step size in order to do fewer test allocations + auto distribute_layers = [&](const char * func_name, const uint32_t & initial_step_size, const bool convert) { + uint32_t step_size = initial_step_size; + std::vector device_is_full(nd - 1, false); + + for (size_t id = 0; step_size > 0; id = (id + 1) % (nd - 1)) { + if (device_is_full[id]) { + continue; + } + if (ngl_per_device.back().full - 1 < step_size) { + step_size /= 2; + std::fill(device_is_full.begin(), device_is_full.end(), false); + continue; + } + + const std::vector ngl_per_device_prev = ngl_per_device; + if (convert) { + ngl_per_device[id].part += step_size; + } else { + ngl_per_device[id].full += step_size; + } + ngl_per_device.back().full -= step_size; + + mem = get_memory_for_layers_moe(func_name, ngl_per_device); + + // if the allocation fits the last device the step size may still be too high and waste VRAM capacity + if (mem.back() < targets.back()) { + if (step_size == 1 && mem[id] <= targets[id]) { + return; // memory targets on all devices met and we cannot be more efficient with a smaller step size + } + ngl_per_device = ngl_per_device_prev; + step_size /= 2; + std::fill(device_is_full.begin(), device_is_full.end(), false); + continue; + } + + // check if test allocation is ok + if (mem[id] < targets[id]) { + // if we already halved the step size once we know that another increment would fail + if (step_size < initial_step_size) { + device_is_full[id] = true; + if (std::all_of(device_is_full.begin(), device_is_full.end(), [](bool b){ return b; })) { + step_size /= 2; + std::fill(device_is_full.begin(), device_is_full.end(), false); + } + } + continue; + } + + // target device is full, revert changes + 
device_is_full[id] = true; + ngl_per_device = ngl_per_device_prev; + if (std::all_of(device_is_full.begin(), device_is_full.end(), [](bool b){ return b; })) { + step_size /= 2; + std::fill(device_is_full.begin(), device_is_full.end(), false); + } + } + }; + + assert(ngl_per_device.back().full >= 1); + { + uint32_t initial_step_size = 1; + while (initial_step_size < std::min((ngl_per_device.back().full - 1) / uint32_t(nd - 1), uint32_t(4))) { + initial_step_size *= 2; + } + distribute_layers(__func__, initial_step_size, /*convert =*/ false); + } + assert(ngl_per_device.back().full >= 1); + { + uint32_t initial_step_size = 1; + while (initial_step_size < std::min((ngl_per_device.back().full - 1) / uint32_t(nd - 1), uint32_t(4))) { + initial_step_size *= 2; + } + distribute_layers(__func__, initial_step_size, /*convert =*/ true); + } + assert(ngl_per_device.back().full >= 1); + + if (mem.back() > targets.back()) { + std::vector ngl_per_device_high = ngl_per_device; + std::vector mem_high = get_memory_for_layers_moe(__func__, ngl_per_device_high); + + std::vector ngl_per_device_low = ngl_per_device; + ngl_per_device_low.back().part += ngl_per_device.back().full - 1; + ngl_per_device_low.back().full = 1; + std::vector mem_low = get_memory_for_layers_moe(__func__, ngl_per_device_low); + + const int64_t diff = mem_high.back() - mem_low.back(); + const int64_t diff_per_full = diff / + (int64_t(ngl_per_device_high.back().full) - int64_t(ngl_per_device_low.back().full)); + + const uint32_t ngl_full = 1 + (targets.back() - mem_low.back()) / diff_per_full; + ngl_per_device.back().part = ngl_per_device.back().part + ngl_per_device.back().full - ngl_full; + ngl_per_device.back().full = ngl_full; + mem = get_memory_for_layers_moe(__func__, ngl_per_device); + } + + set_tensor_buft_overrides(ngl_per_device); + uint32_t global_ngl_part = 0; + uint32_t global_ngl_full = 0; + for (size_t id = 0; id < nd; id++) { + global_ngl_part += ngl_per_device[id].part; + global_ngl_full += 
ngl_per_device[id].full; + } + + LLAMA_LOG_INFO("%s: set to use %u dense-only and %u full GPU layers in total, projected memory use:\n", + __func__, global_ngl_part, global_ngl_full); + for (size_t id = 0; id < nd; id++) { + const int64_t projected_margin = dmds_full[id].free - mem[id]; + LLAMA_LOG_INFO("%s: - %s: %2" PRIu32 " dense-only layers, %2" PRIu32 " full layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", + __func__, dev_names[id].c_str(), ngl_per_device[id].part, ngl_per_device[id].full, mem[id]/MiB, projected_margin/MiB); + } + return; + } + + LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n", __func__, -global_surplus/MiB); + } + + // step 4: if the model only has dense tensors or there is insufficient memory to fit all dense tensors, + // all layers are the same so simply extrapolate how many layers will fit per device + + struct memory_scaling { + int64_t base = 0; + int64_t per_layer = 0; + }; + + std::vector ms(nd); + { + const uint32_t ngl_per_dev = hp_ngl / nd; + std::vector nl_scaling; + { + nl_scaling.reserve(nd); + for (size_t id = 0; id < nd; id++) { + nl_scaling.push_back(ngl_per_dev); + } + } + LLAMA_LOG_DEBUG("%s: getting device memory data for 1 full layer:\n", __func__); + auto tmp1 = get_memory_for_layers(std::vector(nd, 1)); + LLAMA_LOG_DEBUG("%s: getting device memory data for ~%" PRIu32 " full layers/device:\n", __func__, nl_scaling[0]); + auto tmpn = get_memory_for_layers(nl_scaling); + for (size_t id = 0; id < nd; id++) { + ms[id].per_layer = (tmpn[id] - tmp1[id]) / int64_t(ngl_per_dev - 1); + ms[id].base = tmp1[id] - ms[id].per_layer; + } + } + + mparams->n_gpu_layers = 0; + std::vector ngl_per_device; + ngl_per_device.reserve(nd); + for (size_t id = 0; id < nd; id++) { + const uint32_t ngl = (dmds_full[id].free - margin - ms[id].base) / ms[id].per_layer; + mparams->n_gpu_layers += ngl; + ngl_per_device.push_back(ngl); + } + LLAMA_LOG_INFO("%s: set n_gpu_layers to 
%" PRIu32 ", projected memory use:\n", __func__, mparams->n_gpu_layers); + for (size_t id = 0; id < nd; id++) { + const int64_t projected_use = ms[id].base + int64_t(ngl_per_device[id])*ms[id].per_layer; + const int64_t projected_margin = dmds_full[id].free - projected_use; + LLAMA_LOG_INFO("%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", + __func__, dev_names[id].c_str(), ngl_per_device[id], projected_use/MiB, projected_margin/MiB); + } +} + +bool llama_params_fit( + const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, + float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, + size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { + const int64_t t0_us = llama_time_us(); + bool ok = true; + try { + llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level); + LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__); + } catch (const std::runtime_error & e) { + LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what()); + ok = false; + } + const int64_t t1_us = llama_time_us(); + LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); + return ok; +} + struct llama_sampler_chain_params llama_sampler_chain_default_params() { struct llama_sampler_chain_params result = { /*.no_perf =*/ true, @@ -49,6 +638,10 @@ size_t llama_max_devices(void) { return 16; } +size_t llama_max_tensor_buft_overrides() { + return 4096; +} + bool llama_supports_mmap(void) { return llama_mmap::SUPPORTED; } @@ -108,11 +701,12 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); + llama_model_loader ml(fname, splits, 
params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); model.hparams.vocab_only = params.vocab_only; + model.hparams.no_alloc = params.no_alloc; try { model.load_arch(ml);