diff --git a/common/fit.cpp b/common/fit.cpp index c10cb7f08b4..668d892e908 100644 --- a/common/fit.cpp +++ b/common/fit.cpp @@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error { using std::runtime_error::runtime_error; }; -static std::vector<llama_device_memory_data> common_get_device_memory_data( +std::vector<llama_device_memory_data> common_get_device_memory_data( const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams, diff --git a/common/fit.h b/common/fit.h index e066092ec6c..643d3420095 100644 --- a/common/fit.h +++ b/common/fit.h @@ -1,6 +1,11 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" +#include "llama.h" +#include "../src/llama-ext.h" + +#include <vector> enum common_params_fit_status { COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit @@ -30,3 +35,14 @@ void common_fit_print( struct llama_context_params * cparams); void common_memory_breakdown_print(const struct llama_context * ctx); + +// Load a model + context with no_alloc and return the per-device memory breakdown. 
+std::vector<llama_device_memory_data> common_get_device_memory_data( + const char * path_model, + const struct llama_model_params * mparams, + const struct llama_context_params * cparams, + std::vector<ggml_backend_dev_t> & devs, + uint32_t & hp_ngl, + uint32_t & hp_n_ctx_train, + uint32_t & hp_n_expert, + enum ggml_log_level log_level); diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json index d2629fc4de9..d37100764f1 100644 --- a/docs/backend/snapdragon/CMakeUserPresets.json +++ b/docs/backend/snapdragon/CMakeUserPresets.json @@ -33,8 +33,8 @@ "name": "arm64-windows-snapdragon", "inherits": [ "base", "arm64-windows-llvm" ], "cacheVariables": { - "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md index f5bb3d11c48..a90f7da303e 100644 --- a/docs/backend/snapdragon/README.md +++ b/docs/backend/snapdragon/README.md @@ -24,7 +24,7 @@ Native Windows 11 arm64 builds has the following tools dependencies: - UCRT and Driver Kit - LLVM core libraries and Clang compiler (winget) - CMake, Git, Python (winget) -- Hexagon SDK Community Edition 6.4 or later (see windows.md) +- Hexagon SDK Community Edition 6.6 or later (see windows.md) - OpenCL SDK 2.3 or later (see windows.md) Note: The rest of the **Windows** build process assumes that you're running natively in Powershell. 
@@ -45,7 +45,7 @@ Preset CMake variables: GGML_HEXAGON="ON" GGML_OPENCL="ON" GGML_OPENMP="OFF" - HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2" + HEXAGON_SDK_ROOT="/opt/hexagon/6.6.0.0" ... -- Including OpenCL backend -- Including Hexagon backend diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md index 6307e1b69f1..aa731413c90 100644 --- a/docs/backend/snapdragon/windows.md +++ b/docs/backend/snapdragon/windows.md @@ -28,15 +28,15 @@ c:\Qualcomm\OpenCL_SDK\2.3.2 Either use the trimmed down version (optimized for CI) from - https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz + https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz Or download the complete official version from - https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2 + https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.6.0.0 Unzip/untar the archive into ``` -c:\Qualcomm\Hexagon_SDK\6.4.0.2 +c:\Qualcomm\Hexagon_SDK\6.6.0.0 ``` ## Install the latest Adreno GPU driver @@ -123,10 +123,10 @@ The overall Hexagon backend build procedure for Windows on Snapdragon is the sam However, additional settings are required for generating and signing HTP Ops libraries. ``` > $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2" -> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2" -> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04" +> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0" +> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0\tools\HEXAGON_Tools\19.0.07" > $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx" -> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64" +> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0" > cmake --preset arm64-windows-snapdragon-release -B build-wos ... 
diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index c4ec5c524e9..5c9305b1237 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -1308,7 +1308,8 @@ def do_dump_model(model_plus: ModelPlus) -> None: def main(args_in: list[str] | None = None) -> None: output_choices = ["f32", "f16"] - if np.uint32(1) == np.uint32(1).newbyteorder("<"): + dummy_val = np.uint32(1) + if dummy_val == dummy_val.view(dummy_val.dtype.newbyteorder("<")): # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file") diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 78aa059dde3..a7926a21a9a 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -76,6 +76,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i // Utils // Create a buffer and allocate all the tensors in a ggml_context // ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft +// ggml_backend_alloc_ctx_tensors_from_buft returns NULL on failure or if all tensors in ctx are already allocated or zero-sized GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index df0f405ed9f..5f9ae9c1bc5 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -1275,6 +1275,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg for 
(size_t j = 0; j < n_bufs; j++) { ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1]; + if (chunk_size_j == 0) { + continue; + } const size_t simple_offset = i_start * chunk_size_j; ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full); offset_j += chunk_size_j; @@ -1382,6 +1385,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co for (size_t j = 0; j < n_bufs; j++){ const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1]; + if (chunk_size_j == 0) { + continue; + } const size_t simple_offset = i_start * chunk_size_j; ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full); offset_j += chunk_size_j; @@ -1445,6 +1451,7 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac buf_ctx->buf_configs.reserve(n_simple_bufts); for (size_t i = 0; i < n_simple_bufts; i++) { ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size); + GGML_ASSERT(simple_buf != nullptr); max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf)); buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf); } @@ -1474,8 +1481,27 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc t->data = (void *) 0x2000000000000000; // FIXME } for (size_t i = 0; i < n_simple_bufts; i++) { - meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft( - meta_buf_ctx->buf_configs[i].ctx, ggml_backend_meta_buft_simple_buft(buft, i)); + ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx; + ggml_backend_buffer_type_t simple_buft = 
ggml_backend_meta_buft_simple_buft(buft, i); + + // If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL. + // For those edge cases, allocate a dummy buffer instead. + bool any_nonzero_slice = false; + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (ggml_nelements(t) != 0) { + any_nonzero_slice = true; + break; + } + } + if (any_nonzero_slice) { + meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft); + } else { + meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = meta_buf_ctx->buf_configs[i].buf; + } + } + GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr); meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf)); } return meta_buf; @@ -1605,6 +1631,9 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j); ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1]; + if (chunk_size_j == 0) { + continue; + } ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full); offset_j += chunk_size_j; @@ -1646,6 +1675,9 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j); const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1]; + if (chunk_size_j == 0) { + continue; + } ggml_backend_tensor_get_2d_async(simple_backend, 
simple_tensor, (char *) data + offset_j, offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full); offset_j += chunk_size_j; diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c index 4a4ff0b331d..9e1b778b01f 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c @@ -852,9 +852,10 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) { v_s_rowmax1 = hvx_vec_reduce_max_f16(v_s_rowmax1); // Splat m_prev[r], m_prev[r+1] from the per-row accumulator. - // vror brings the target lane to lane 0, then extract + re-splat. - HVX_Vector v_m_prev0 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2))); - HVX_Vector v_m_prev1 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2))); + // vror brings the target lane to lane 0, then vdelta replicates it + // across all lanes — stays in the vector domain (no store/reload). + HVX_Vector v_m_prev0 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2)); + HVX_Vector v_m_prev1 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2)); // HVX max — both operands are splats, so result is splat of m_new. 
HVX_Vector v_dup_m0 = Q6_Vhf_vmax_VhfVhf(v_m_prev0, v_s_rowmax0); diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index ea0b44feea2..42286435bc6 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -661,11 +661,10 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mm_iq4_nl_f32_l4_lm; std::vector<ProfilingInfo> profiling_info; + std::vector<ProfilingInfo> profiling_results; - void write_profiling_info() { - FILE * fperf = fopen("cl_profiling.csv", "w"); - if (!fperf) { - GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); + void flush_profiling_batch() { + if (profiling_info.empty()) { return; } @@ -689,6 +688,7 @@ struct ggml_backend_opencl_context { CL_CHECK(clGetEventProfilingInfo( info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); CL_CHECK(clReleaseEvent(info.evt)); + info.evt = nullptr; char kernel_name[512]; CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, @@ -706,10 +706,26 @@ struct ggml_backend_opencl_context { info.cmd_complete_duration_ns = cmd_complete - cmd_end; info.cmd_total_duration_ns = cmd_complete - cmd_queued; } + profiling_results.insert(profiling_results.end(), + std::make_move_iterator(profiling_info.begin()), + std::make_move_iterator(profiling_info.end())); + profiling_info.clear(); + } + + void write_profiling_info() { + if (profiling_results.empty()) { + return; + } // Dump a csv + FILE * fperf = fopen("cl_profiling.csv", "w"); + if (!fperf) { + GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); + return; + } + fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n"); - for (const ProfilingInfo & info : profiling_info) { + for (const ProfilingInfo & info : profiling_results) { fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", info.op_name.c_str(), info.kernel_name.c_str(), info.cmd_duration_ns/1.e6f, @@ -720,14 +736,14 @@ struct ggml_backend_opencl_context { 
fclose(fperf); // Dump a simple chrome trace - FILE* ftrace = fopen("cl_trace.json", "w"); + FILE * ftrace = fopen("cl_trace.json", "w"); if (!ftrace) { GGML_LOG_ERROR("Failed to open cl_trace.json\n"); return; } fprintf(ftrace, "[\n"); - for (const ProfilingInfo & info : profiling_info) { + for (const ProfilingInfo & info : profiling_results) { fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n", info.kernel_name.c_str(), info.cmd_queued/1000); fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n", @@ -738,6 +754,7 @@ struct ggml_backend_opencl_context { fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n", info.kernel_name.c_str(), info.cmd_end/1000); } + fprintf(ftrace, "]\n"); fclose(ftrace); } @@ -758,6 +775,9 @@ struct ggml_backend_opencl_context { profiling_info.emplace_back(); populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); + if (profiling_info.size() >= 2048) { + flush_profiling_batch(); + } #else GGML_UNUSED(tensor); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); @@ -804,7 +824,7 @@ struct ggml_backend_opencl_context { if (ref_count == 0) { #ifdef GGML_OPENCL_PROFILING write_profiling_info(); - profiling_info.clear(); + profiling_results.clear(); #endif } } diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 1d9d9ab7d70..80966b6ef15 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -28,6 +28,7 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: 
np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: rows = arr.reshape((-1, arr.shape[-1])) + assert len(rows.shape) osize = 1 for dim in oshape: osize *= dim diff --git a/scripts/snapdragon/windows/setup-build.ps1 b/scripts/snapdragon/windows/setup-build.ps1 index 0f3244cc9d2..d8ef24d4413 100644 --- a/scripts/snapdragon/windows/setup-build.ps1 +++ b/scripts/snapdragon/windows/setup-build.ps1 @@ -7,10 +7,10 @@ $ErrorActionPreference = "Stop" $BaseDir = "C:\Qualcomm" # SDK 1: Hexagon -$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz" +$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz" $HexagonParent = Join-Path $BaseDir "Hexagon_SDK" -$HexagonSdkVersion = "6.4.0.2" -$HexagonToolsVersion = "19.0.04" +$HexagonSdkVersion = "6.6.0.0" +$HexagonToolsVersion = "19.0.07" $HexagonSdkTarget = Join-Path $HexagonParent $HexagonSdkVersion $HexagonToolsTarget = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion" diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b939e3b75eb..c3daafd0d92 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -8,6 +8,7 @@ #include "build-info.h" #include "common.h" +#include "fit.h" #include "llama.h" #include "log.h" #include "sampling.h" @@ -775,7 +776,7 @@ struct server_context_impl { for (auto & [dev, size] : mmproj_mem) { total += size; } - SRV_INF("[mtmd] estimated memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0)); + SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0)); GGML_ASSERT(!params_base.fit_params_target.empty()); for (auto & [dev, size] : mmproj_mem) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) { @@ -793,6 +794,82 @@ struct server_context_impl { } } + // optionally reserve VRAM for the draft / 
MTP context before fitting the target model + if (params_base.fit_params) { + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + const bool has_draft = params_base.speculative.has_dft(); + + if (has_draft || spec_mtp) { + common_params params_dft = params_base; + bool measure_model_bytes = true; + + if (has_draft) { + const auto & params_spec = params_base.speculative.draft; + params_dft.devices = params_spec.devices; + params_dft.model = params_spec.mparams; + params_dft.n_gpu_layers = params_spec.n_gpu_layers; + params_dft.cache_type_k = params_spec.cache_type_k; + params_dft.cache_type_v = params_spec.cache_type_v; + params_dft.tensor_buft_overrides = params_spec.tensor_buft_overrides; + } else { + // MTP draft context lives on the target model, only context+compute are new + measure_model_bytes = false; + } + + auto mparams_dft = common_model_params_to_llama(params_dft); + auto cparams_dft = common_context_params_to_llama(params_dft); + if (spec_mtp) { + cparams_dft.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + } + cparams_dft.n_rs_seq = 0; + + std::vector<ggml_backend_dev_t> devs; + uint32_t hp_ngl = 0; + uint32_t hp_nct = 0; + uint32_t hp_nex = 0; + try { + auto dmd = common_get_device_memory_data( + params_dft.model.path.c_str(), &mparams_dft, &cparams_dft, + devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); + + GGML_ASSERT(!params_base.fit_params_target.empty()); + size_t total = 0; + + std::vector<ggml_backend_dev_t> tgt_devices = params.devices; + + if (tgt_devices.empty()) { + for(size_t i = 0; i < ggml_backend_dev_count(); ++i) { + tgt_devices.push_back(ggml_backend_dev_get(i)); + } + } + + for (size_t j = 0; j < devs.size(); ++j) { + const size_t bytes = + (measure_model_bytes ? 
dmd[j].mb.model : 0) + + dmd[j].mb.context + + dmd[j].mb.compute; + total += bytes; + for (size_t i = 0; i < tgt_devices.size(); i++) { + if (tgt_devices[i] == devs[j]) { + SRV_DBG("[spec] adding %.2f MiB to fit_params_target for device %s\n", + bytes / (1024.0 * 1024.0), ggml_backend_dev_name(devs[j])); + params_base.fit_params_target[i] += bytes; + break; + } + } + } + SRV_INF("[spec] estimated memory usage of %s is %.2f MiB\n", + has_draft ? "draft model" : "MTP context", + total / (1024.0 * 1024.0)); + } catch (const std::exception & e) { + SRV_ERR("[spec] failed to measure %s memory: %s\n", + has_draft ? "draft model" : "MTP context", e.what()); + } + } + } + llama_init = common_init_from_params(params_base); model_tgt = llama_init->model(); diff --git a/tools/ui/CMakeLists.txt b/tools/ui/CMakeLists.txt index d4cf3580234..60d9020da38 100644 --- a/tools/ui/CMakeLists.txt +++ b/tools/ui/CMakeLists.txt @@ -43,9 +43,9 @@ if(CMAKE_CROSSCOMPILING) message(STATUS "UI: building llama-ui-embed with host compiler ${HOST_CXX_COMPILER}") if(CMAKE_HOST_WIN32) - set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed.exe") + set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed-host.exe") else() - set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed") + set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed-host") endif() add_custom_command( @@ -56,6 +56,8 @@ if(CMAKE_CROSSCOMPILING) COMMENT "Building llama-ui-embed (host)" VERBATIM ) + + # phony target to tie it into the dependency graph add_custom_target(llama-ui-embed DEPENDS "${LLAMA_UI_EMBED_EXE}") else() add_executable(llama-ui-embed embed.cpp) @@ -93,6 +95,10 @@ add_library(${TARGET} STATIC ${UI_CPP} ${UI_H}) target_compile_features(${TARGET} PRIVATE cxx_std_17) add_dependencies(${TARGET} llama-ui-assets) +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + target_include_directories(${TARGET} PUBLIC 
${CMAKE_CURRENT_BINARY_DIR} )