diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index dc7aebe24ca..2d3fa163d23 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -31,7 +31,7 @@ jobs: uses: actions/setup-python@v6 with: python-version: "3.11" - pip-install: -r requirements/requirements-all.txt ty==0.0.26 + pip-install: -r requirements/requirements-all.txt ty==0.0.33 # - name: Type-check with Pyright # uses: jakebailey/pyright-action@v2 # with: diff --git a/common/arg.cpp b/common/arg.cpp index 943d0766fb2..c21598e7687 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3499,7 +3499,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_MIN")); add_opt(common_arg( - {"--spec--draft-p-split", "--draft-p-split"}, "P", + {"--spec-draft-p-split", "--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.draft.p_split), [](common_params & params, const std::string & value) { params.speculative.draft.p_split = std::stof(value); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 90c2b7094c7..5287c4df941 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6658,7 +6658,7 @@ def _xlmroberta_set_vocab(self) -> None: tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment] + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size if isinstance(tokenizer, SentencePieceProcessor): for token_id in range(tokenizer.vocab_size()): diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh index 928b856f9d2..585f2c22853 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cuh +++ b/ggml/src/ggml-cuda/fattn-tile.cuh @@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64) - GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 64, 64) + GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 64, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64) @@ -130,7 +130,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128) GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64) - GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 32, 64) + GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 32, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 32, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 32, 64) @@ -1124,7 +1124,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm constexpr size_t nbytes_shared = 0; #ifdef GGML_USE_HIP - if constexpr (DV <= 128) { + if constexpr (DKQ <= 128) { if (Q->ne[1] > 32/ncols2) { constexpr int cols_per_block = 64; const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size; @@ -1138,7 +1138,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm #endif // GGML_USE_HIP #ifndef GGML_USE_HIP - if constexpr (DV <= 256) + if constexpr (DKQ <= 256) #endif // GGML_USE_HIP { if (Q->ne[1] > 16/ncols2) { @@ -1220,11 +1220,22 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX; const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0; - if constexpr (DKQ == 320) { // Mistral Small 4 + if constexpr (DKQ == 320) { + // This branch is only used for Mistral Small 4 which has a GQA ratio of 32. + // On AMD, simply use that GQA ratio with 32 columns / block since we always have enough SRAM. + // On NVIDIA however, the tile kernel is only used for GPUs that can't use the mma kernel (Pascal and older). + // Therefore, use a GQA ratio of 16 with 16 columns / block to stay below 48 kiB of SRAM / block. +#ifdef GGML_USE_HIP if (use_gqa_opt && gqa_ratio % 32 == 0) { launch_fattn_tile_switch_ncols1(ctx, dst); return; } +#else + if (use_gqa_opt && gqa_ratio % 16 == 0) { + launch_fattn_tile_switch_ncols1(ctx, dst); + return; + } +#endif // GGML_USE_HIP GGML_ABORT("flash-attn tile (320/256): expected GQA ratio multiple of 32"); } diff --git a/scripts/jinja/jinja-tester.py b/scripts/jinja/jinja-tester.py index 4f79b8da3db..a83f025411a 100755 --- a/scripts/jinja/jinja-tester.py +++ b/scripts/jinja/jinja-tester.py @@ -20,6 +20,7 @@ from jinja2 import TemplateSyntaxError from jinja2.sandbox import ImmutableSandboxedEnvironment from datetime import datetime +from typing import Callable def format_template_content(template_content): @@ -395,7 +396,7 @@ def raise_exception(text: str) -> str: ensure_ascii=ensure_ascii, ) ) - env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) # ty: ignore[invalid-assignment] + env.globals["strftime_now"]: Callable[[str], str] = lambda format: datetime.now().strftime(format) env.globals["raise_exception"] = raise_exception # ty: ignore[invalid-assignment] try: template = env.from_string(template_str) diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py index ff1dd075303..d2a8a50b5de 100755 --- a/scripts/sync_vendor.py +++ b/scripts/sync_vendor.py @@ -5,7 +5,7 @@ import sys import subprocess -HTTPLIB_VERSION = "refs/tags/v0.43.1" +HTTPLIB_VERSION = "refs/tags/v0.43.2" vendor = { "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp", diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp index 95bf0eb1bb5..66c0b6ebd16 100644 --- a/vendor/cpp-httplib/httplib.cpp +++ b/vendor/cpp-httplib/httplib.cpp @@ -1464,8 +1464,9 @@ bool mmap::open(const char *path) { auto wpath = u8string_to_wstring(path); if (wpath.empty()) { return false; } - hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, - OPEN_EXISTING, NULL); + hFile_ = + ::CreateFile2(wpath.c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE, OPEN_EXISTING, NULL); if (hFile_ == INVALID_HANDLE_VALUE) { return false; } @@ -2052,56 +2053,50 @@ int getaddrinfo_with_timeout(const char *node, const char *service, return 0; #elif defined(_GNU_SOURCE) && defined(__GLIBC__) && \ (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)) - // Linux implementation using getaddrinfo_a for asynchronous DNS resolution - struct gaicb request; + // #2431: gai_cancel() is non-blocking and may return EAI_NOTCANCELED while + // the resolver worker still references the stack-local gaicb. The cancel + // path therefore waits (gai_suspend with no timeout) for the worker to + // actually finish before letting the stack frame go. The trade-off is that + // a wedged DNS server can hold this thread for the system resolver timeout + // (~30s by default) past the caller's connection timeout. + struct gaicb request {}; struct gaicb *requests[1] = {&request}; - struct sigevent sevp; - struct timespec timeout; + struct sigevent sevp {}; + struct timespec timeout { + timeout_sec, 0 + }; - // Initialize the request structure - memset(&request, 0, sizeof(request)); request.ar_name = node; request.ar_service = service; request.ar_request = hints; - - // Set up timeout - timeout.tv_sec = timeout_sec; - timeout.tv_nsec = 0; - - // Initialize sigevent structure (not used, but required) - memset(&sevp, 0, sizeof(sevp)); sevp.sigev_notify = SIGEV_NONE; - // Start asynchronous resolution - int start_result = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp); - if (start_result != 0) { return start_result; } + int rc = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp); + if (rc != 0) { return rc; } - // Wait for completion with timeout - int wait_result = - gai_suspend((const struct gaicb *const *)requests, 1, &timeout); + auto cleanup = scope_exit([&] { + if (request.ar_result) { freeaddrinfo(request.ar_result); } + }); + + int wait_result = gai_suspend(requests, 1, &timeout); if (wait_result == 0 || wait_result == EAI_ALLDONE) { - // Completed successfully, get the result int gai_result = gai_error(&request); if (gai_result == 0) { *res = request.ar_result; + request.ar_result = nullptr; return 0; - } else { - // Clean up on error - if (request.ar_result) { freeaddrinfo(request.ar_result); } - return gai_result; } - } else if (wait_result == EAI_AGAIN) { - // Timeout occurred, cancel the request - gai_cancel(&request); - return EAI_AGAIN; - } else { - // Other error occurred - gai_cancel(&request); - return wait_result; + return gai_result; + } + + gai_cancel(&request); + while (gai_error(&request) == EAI_INPROGRESS) { + gai_suspend(requests, 1, nullptr); } + return wait_result; #else - // Fallback implementation using thread-based timeout for other Unix systems + // Fallback implementation using thread-based timeout for other Unix systems. struct GetAddrInfoState { ~GetAddrInfoState() { @@ -14142,6 +14137,9 @@ ssize_t read(session_t session, void *buf, size_t len, TlsError &err) { err.code = impl::map_mbedtls_error(ret, err.sys_errno); err.backend_code = static_cast(-ret); impl::mbedtls_last_error() = ret; + // mbedTLS signals a clean close_notify via a negative error code rather + // than 0; surface it as a clean EOF the way OpenSSL/wolfSSL do. + if (err.code == ErrorCode::PeerClosed) { return 0; } return -1; } diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h index 8581d1695a8..7e530961b9c 100644 --- a/vendor/cpp-httplib/httplib.h +++ b/vendor/cpp-httplib/httplib.h @@ -8,8 +8,8 @@ #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.43.1" -#define CPPHTTPLIB_VERSION_NUM "0x002b01" +#define CPPHTTPLIB_VERSION "0.43.2" +#define CPPHTTPLIB_VERSION_NUM "0x002b02" #ifdef _WIN32 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00