Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-type-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
pip-install: -r requirements/requirements-all.txt ty==0.0.26
pip-install: -r requirements/requirements-all.txt ty==0.0.33
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:
Expand Down
2 changes: 1 addition & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3499,7 +3499,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_MIN"));

add_opt(common_arg(
{"--spec--draft-p-split", "--draft-p-split"}, "P",
{"--spec-draft-p-split", "--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.draft.p_split),
[](common_params & params, const std::string & value) {
params.speculative.draft.p_split = std::stof(value);
Expand Down
2 changes: 1 addition & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6658,7 +6658,7 @@ def _xlmroberta_set_vocab(self) -> None:

tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment]
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

if isinstance(tokenizer, SentencePieceProcessor):
for token_id in range(tokenizer.vocab_size()):
Expand Down
21 changes: 16 additions & 5 deletions ggml/src/ggml-cuda/fattn-tile.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64)

GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 64, 64)

GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
Expand Down Expand Up @@ -130,7 +130,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64)

GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 32, 64)

GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 32, 64)
Expand Down Expand Up @@ -1124,7 +1124,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
constexpr size_t nbytes_shared = 0;

#ifdef GGML_USE_HIP
if constexpr (DV <= 128) {
if constexpr (DKQ <= 128) {
if (Q->ne[1] > 32/ncols2) {
constexpr int cols_per_block = 64;
const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
Expand All @@ -1138,7 +1138,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
#endif // GGML_USE_HIP

#ifndef GGML_USE_HIP
if constexpr (DV <= 256)
if constexpr (DKQ <= 256)
#endif // GGML_USE_HIP
{
if (Q->ne[1] > 16/ncols2) {
Expand Down Expand Up @@ -1220,11 +1220,22 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;

if constexpr (DKQ == 320) { // Mistral Small 4
if constexpr (DKQ == 320) {
// This branch is only used for Mistral Small 4, which has a GQA ratio of 32.
// On AMD, simply use that GQA ratio with 32 columns / block since we always have enough SRAM.
// On NVIDIA, however, the tile kernel is only used for GPUs that can't use the mma kernel (Pascal and older).
// Therefore, use a GQA ratio of 16 with 16 columns / block to stay below 48 KiB of SRAM / block.
#ifdef GGML_USE_HIP
if (use_gqa_opt && gqa_ratio % 32 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 32, use_logit_softcap>(ctx, dst);
return;
}
#else
if (use_gqa_opt && gqa_ratio % 16 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
return;
}
#endif // GGML_USE_HIP
GGML_ABORT("flash-attn tile (320/256): expected GQA ratio multiple of 32");
}

Expand Down
3 changes: 2 additions & 1 deletion scripts/jinja/jinja-tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from jinja2 import TemplateSyntaxError
from jinja2.sandbox import ImmutableSandboxedEnvironment
from datetime import datetime
from typing import Callable


def format_template_content(template_content):
Expand Down Expand Up @@ -395,7 +396,7 @@ def raise_exception(text: str) -> str:
ensure_ascii=ensure_ascii,
)
)
env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) # ty: ignore[invalid-assignment]
env.globals["strftime_now"]: Callable[[str], str] = lambda format: datetime.now().strftime(format)
env.globals["raise_exception"] = raise_exception # ty: ignore[invalid-assignment]
try:
template = env.from_string(template_str)
Expand Down
2 changes: 1 addition & 1 deletion scripts/sync_vendor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
import subprocess

HTTPLIB_VERSION = "refs/tags/v0.43.1"
HTTPLIB_VERSION = "refs/tags/v0.43.2"

vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
Expand Down
68 changes: 33 additions & 35 deletions vendor/cpp-httplib/httplib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1464,8 +1464,9 @@ bool mmap::open(const char *path) {
auto wpath = u8string_to_wstring(path);
if (wpath.empty()) { return false; }

hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ,
OPEN_EXISTING, NULL);
hFile_ =
::CreateFile2(wpath.c_str(), GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE, OPEN_EXISTING, NULL);

if (hFile_ == INVALID_HANDLE_VALUE) { return false; }

Expand Down Expand Up @@ -2052,56 +2053,50 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
return 0;
#elif defined(_GNU_SOURCE) && defined(__GLIBC__) && \
(__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2))
// Linux implementation using getaddrinfo_a for asynchronous DNS resolution
struct gaicb request;
// #2431: gai_cancel() is non-blocking and may return EAI_NOTCANCELED while
// the resolver worker still references the stack-local gaicb. The cancel
// path therefore waits (gai_suspend with no timeout) for the worker to
// actually finish before letting the stack frame go. The trade-off is that
// a wedged DNS server can hold this thread for the system resolver timeout
// (~30s by default) past the caller's connection timeout.
struct gaicb request {};
struct gaicb *requests[1] = {&request};
struct sigevent sevp;
struct timespec timeout;
struct sigevent sevp {};
struct timespec timeout {
timeout_sec, 0
};

// Initialize the request structure
memset(&request, 0, sizeof(request));
request.ar_name = node;
request.ar_service = service;
request.ar_request = hints;

// Set up timeout
timeout.tv_sec = timeout_sec;
timeout.tv_nsec = 0;

// Initialize sigevent structure (not used, but required)
memset(&sevp, 0, sizeof(sevp));
sevp.sigev_notify = SIGEV_NONE;

// Start asynchronous resolution
int start_result = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp);
if (start_result != 0) { return start_result; }
int rc = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp);
if (rc != 0) { return rc; }

// Wait for completion with timeout
int wait_result =
gai_suspend((const struct gaicb *const *)requests, 1, &timeout);
auto cleanup = scope_exit([&] {
if (request.ar_result) { freeaddrinfo(request.ar_result); }
});

int wait_result = gai_suspend(requests, 1, &timeout);

if (wait_result == 0 || wait_result == EAI_ALLDONE) {
// Completed successfully, get the result
int gai_result = gai_error(&request);
if (gai_result == 0) {
*res = request.ar_result;
request.ar_result = nullptr;
return 0;
} else {
// Clean up on error
if (request.ar_result) { freeaddrinfo(request.ar_result); }
return gai_result;
}
} else if (wait_result == EAI_AGAIN) {
// Timeout occurred, cancel the request
gai_cancel(&request);
return EAI_AGAIN;
} else {
// Other error occurred
gai_cancel(&request);
return wait_result;
return gai_result;
}

gai_cancel(&request);
while (gai_error(&request) == EAI_INPROGRESS) {
gai_suspend(requests, 1, nullptr);
}
return wait_result;
#else
// Fallback implementation using thread-based timeout for other Unix systems
// Fallback implementation using thread-based timeout for other Unix systems.

struct GetAddrInfoState {
~GetAddrInfoState() {
Expand Down Expand Up @@ -14142,6 +14137,9 @@ ssize_t read(session_t session, void *buf, size_t len, TlsError &err) {
err.code = impl::map_mbedtls_error(ret, err.sys_errno);
err.backend_code = static_cast<uint64_t>(-ret);
impl::mbedtls_last_error() = ret;
// mbedTLS signals a clean close_notify via a negative error code rather
// than 0; surface it as a clean EOF the way OpenSSL/wolfSSL do.
if (err.code == ErrorCode::PeerClosed) { return 0; }
return -1;
}

Expand Down
4 changes: 2 additions & 2 deletions vendor/cpp-httplib/httplib.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H

#define CPPHTTPLIB_VERSION "0.43.1"
#define CPPHTTPLIB_VERSION_NUM "0x002b01"
#define CPPHTTPLIB_VERSION "0.43.2"
#define CPPHTTPLIB_VERSION_NUM "0x002b02"

#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
Expand Down
Loading