diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 97dcc42312f6..bbc896ec6819 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -82,6 +82,14 @@ if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"} fi +if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then + commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} +fi + +if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then + commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"} +fi + if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} fi diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 75e3ef264095..037897e53dbe 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -75,3 +75,4 @@ else fi aws s3 cp "$wheel" "s3://vllm-wheels/$version/" +aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1040d1e1b801..1459156f63db 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -216,7 +216,6 @@ steps: - pytest -v -s v1/spec_decode - pytest -v -s v1/kv_connector/unit - pytest -v -s v1/test_serial_utils.py - - pytest -v -s v1/test_stats.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py # TODO: accuracy does not match, whether setting @@ -456,7 +455,7 @@ steps: ##### models test ##### - label: Basic Models Test # 24min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] torch_nightly: true source_file_dependencies: - vllm/ @@ -528,7 +527,7 @@ steps: - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - label: Multi-Modal Models Test (Extended) 3 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] optional: true source_file_dependencies: - vllm/ @@ -538,7 +537,7 @@ steps: - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - label: Quantized Models Test - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/model_executor/layers/quantization - tests/models/quantization diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 88275dbdd83a..55e659679701 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -70,6 +70,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { int64_t num_tokens = input.numel() / input.size(-1); \ dim3 grid(num_tokens); \ dim3 
block(std::min(d, 1024)); \ + if (num_tokens == 0) { \ + return; \ + } \ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ VLLM_DISPATCH_FLOATING_TYPES( \ diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index eb216dc8baf1..79a546554fa1 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -172,7 +172,7 @@ __device__ void paged_attention_kernel( // Load the query to registers. // Each thread in a thread group has a different part of the query. - // For example, if the the thread group size is 4, then the first thread in + // For example, if the thread group size is 4, then the first thread in // the group has 0, 4, 8, ... th vectors of the query, and the second thread // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because // q is split from a qkv tensor, it may not be contiguous. @@ -259,7 +259,7 @@ __device__ void paged_attention_kernel( // Load a key to registers. // Each thread in a thread group has a different part of the key. - // For example, if the the thread group size is 4, then the first thread in + // For example, if the thread group size is 4, then the first thread in // the group has 0, 4, 8, ... th vectors of the key, and the second thread // has 1, 5, 9, ... th vectors of the key, and so on. for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index dc6e0769b878..f7b75c48373f 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -65,5 +65,19 @@ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) +#define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__) + #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__)) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index d7be769458e3..6b6a9d04a60f 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -326,7 +326,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, } if (use_global_memory) { - VLLM_DISPATCH_INTEGRAL_TYPES( + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] { // calc needed amount of shared mem for `tokens_cnts` and `cumsum` // tensors @@ -351,7 +351,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, cumsum_buffer.data_ptr()); }); } else if (use_i16) { - VLLM_DISPATCH_INTEGRAL_TYPES( + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { // set dynamic shared mem auto kernel = @@ -366,7 +366,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, topk_ids.numel()); }); } else { - VLLM_DISPATCH_INTEGRAL_TYPES( + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { auto kernel = vllm::moe::moe_align_block_size_kernel; @@ -391,7 +391,7 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, TORCH_CHECK(num_experts == 256, "sgl_moe_align_block_size kernel only supports deepseek v3."); - VLLM_DISPATCH_INTEGRAL_TYPES( + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] { // calc needed amount of shared mem for `cumsum` tensors auto options_int = diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index de9747b60252..a9379032245d 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -108,9 +108,17 @@ __launch_bounds__(TPB) __global__ } } -template -__launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output, - int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert) +template +__launch_bounds__(TPB) __global__ void moeTopK( + const float* inputs_after_softmax, + const bool* finished, + float* output, + IndType* indices, + int* source_rows, + const int num_experts, + const int k, + const int start_expert, + const int end_expert) { using cub_kvp = cub::KeyValuePair; @@ -182,9 +190,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax 2) This implementation assumes k is small, but will work for any k. */ -template +template __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ - void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices, + void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices, int* source_rows, const int k, const int start_expert, const int end_expert) { // We begin by enforcing compile time assertions and setting up compile time constants. 
@@ -397,8 +405,8 @@ struct TopkConstants }; } // namespace detail -template -void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices, +template +void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices, int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream) { static constexpr std::size_t MAX_BYTES_PER_LDG = 16; @@ -421,10 +429,11 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f token_expert_indices, num_tokens, topk, 0, num_experts, \ stream); +template void topkGatingSoftmaxKernelLauncher( const float* gating_output, float* topk_weights, - int* topk_indicies, + IndType* topk_indicies, int* token_expert_indices, float* softmax_workspace, const int num_tokens, @@ -493,14 +502,32 @@ void topk_softmax( const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options()); - vllm::moe::topkGatingSoftmaxKernelLauncher( - gating_output.data_ptr(), - topk_weights.data_ptr(), - topk_indices.data_ptr(), - token_expert_indices.data_ptr(), - softmax_workspace.data_ptr(), - num_tokens, - num_experts, - topk, - stream); + + if(topk_indices.scalar_type() == at::ScalarType::Int) + { + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } + else + { + assert(topk_indices.scalar_type() == at::ScalarType::UInt32); + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } } diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index ef6dd1c0978d..266f2a0667a2 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -44,7 +44,8 @@ inline __device__ void apply_rotary_embedding( // head_size] const scalar_t* cache_ptr, const int head_size, const int num_heads, const int num_kv_heads, const int rot_dim, const int token_idx, - const int64_t query_stride, const int64_t key_stride) { + const int64_t query_stride, const int64_t key_stride, + const int64_t head_stride) { const int embed_dim = rot_dim / 2; const scalar_t* cos_ptr = cache_ptr; const scalar_t* sin_ptr = cache_ptr + embed_dim; @@ -52,7 +53,8 @@ inline __device__ void apply_rotary_embedding( const int nq = num_heads * embed_dim; for (int i = threadIdx.x; i < nq; i += blockDim.x) { const int head_idx = i / embed_dim; - const int64_t token_head = token_idx * query_stride + head_idx * head_size; + const int64_t token_head = + token_idx * query_stride + head_idx * head_stride; const int rot_offset = i % embed_dim; apply_token_rotary_embedding( query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); @@ -62,7 +64,8 @@ inline __device__ void apply_rotary_embedding( const int nk = num_kv_heads * embed_dim; for (int i = threadIdx.x; i < nk; i += blockDim.x) { const int head_idx = i / embed_dim; - const int64_t token_head = token_idx * key_stride + head_idx * head_size; + const int64_t token_head = + token_idx * key_stride + head_idx * head_stride; const int rot_offset = i % embed_dim; 
apply_token_rotary_embedding( key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); @@ -84,7 +87,8 @@ __global__ void rotary_embedding_kernel( const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // // 2] const int rot_dim, const int64_t query_stride, const int64_t key_stride, - const int num_heads, const int num_kv_heads, const int head_size) { + const int64_t head_stride, const int num_heads, const int num_kv_heads, + const int head_size) { // Each thread block is responsible for one token. const int token_idx = blockIdx.x; int64_t pos = positions[token_idx]; @@ -92,7 +96,7 @@ __global__ void rotary_embedding_kernel( apply_rotary_embedding( query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, - token_idx, query_stride, key_stride); + token_idx, query_stride, key_stride, head_stride); } template @@ -109,9 +113,9 @@ __global__ void batched_rotary_embedding_kernel( const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // // 2] const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] - // or [num_tokens] const int rot_dim, const int64_t query_stride, const int64_t key_stride, - const int num_heads, const int num_kv_heads, const int head_size) { + const int64_t head_stride, const int num_heads, const int num_kv_heads, + const int head_size) { // Each thread block is responsible for one token. const int token_idx = blockIdx.x; int64_t pos = positions[token_idx]; @@ -121,7 +125,7 @@ __global__ void batched_rotary_embedding_kernel( apply_rotary_embedding( query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, - token_idx, query_stride, key_stride); + token_idx, query_stride, key_stride, head_stride); } } // namespace vllm @@ -179,6 +183,12 @@ void rotary_embedding( int seq_dim_idx = positions_ndim - 1; int64_t query_stride = query.stride(seq_dim_idx); int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0; + // Determine head stride: for [*, heads, head_size] use stride of last dim; + // for flat [*, heads*head_size], heads blocks are contiguous of size + // head_size + int query_ndim = query.dim(); + int64_t head_stride = + (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size; dim3 grid(num_tokens); dim3 block(std::min(num_heads * rot_dim / 2, 512)); @@ -190,14 +200,14 @@ void rotary_embedding( positions.data_ptr(), query.data_ptr(), key.has_value() ? key->data_ptr() : nullptr, cos_sin_cache.data_ptr(), rot_dim, query_stride, key_stride, - num_heads, num_kv_heads, head_size); + head_stride, num_heads, num_kv_heads, head_size); } else { vllm::rotary_embedding_kernel <<>>( positions.data_ptr(), query.data_ptr(), key.has_value() ? key->data_ptr() : nullptr, cos_sin_cache.data_ptr(), rot_dim, query_stride, - key_stride, num_heads, num_kv_heads, head_size); + key_stride, head_stride, num_heads, num_kv_heads, head_size); } }); } @@ -263,6 +273,12 @@ void batched_rotary_embedding( int seq_dim_idx = positions_ndim - 1; int64_t query_stride = query.stride(seq_dim_idx); int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0; + // Determine head stride: for [*, heads, head_size] use stride of last dim; + // for flat [*, heads*head_size], heads blocks are contiguous of size + // head_size + int query_ndim = query.dim(); + int64_t head_stride = + (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size; dim3 grid(num_tokens); dim3 block(std::min(num_heads * rot_dim / 2, 512)); @@ -276,7 +292,7 @@ void batched_rotary_embedding( key.has_value() ? 
key->data_ptr() : nullptr, cos_sin_cache.data_ptr(), cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, - key_stride, num_heads, num_kv_heads, head_size); + key_stride, head_stride, num_heads, num_kv_heads, head_size); } else { vllm::batched_rotary_embedding_kernel <<>>( positions.data_ptr(), query.data_ptr(), key.has_value() ? key->data_ptr() : nullptr, cos_sin_cache.data_ptr(), cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, - key_stride, num_heads, num_kv_heads, head_size); + key_stride, head_stride, num_heads, num_kv_heads, head_size); } }); } diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md index 6708f2c4135f..9744f5f4d362 100644 --- a/docs/source/deployment/frameworks/index.md +++ b/docs/source/deployment/frameworks/index.md @@ -10,6 +10,7 @@ chatbox dify dstack helm +lobe-chat lws modal open-webui diff --git a/docs/source/deployment/frameworks/lobe-chat.md b/docs/source/deployment/frameworks/lobe-chat.md new file mode 100644 index 000000000000..6d86b7fa9cce --- /dev/null +++ b/docs/source/deployment/frameworks/lobe-chat.md @@ -0,0 +1,13 @@ +(deployment-lobe-chat)= + +# Lobe Chat + +[Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework. + +It supports speech synthesis, multi-modal interaction, and an extensible (function call) plugin system. + +It offers one-click free deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application. + +It supports vLLM as an AI model provider to efficiently serve large language models. + +For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm). diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 4759d0c26c35..3c2571298e4f 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -141,10 +141,10 @@ Remember to check whether the `reasoning_content` exists in the response before The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now. ```bash -VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 ``` -Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine. +The following is an example client: ```python from openai import OpenAI diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 25189b006c26..298ba59f7d8b 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -19,8 +19,8 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/ It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`.
After installing `uv`, you can create a new Python environment and install vLLM using the following commands: ```console -uv venv myenv --python 3.12 --seed -source myenv/bin/activate +uv venv --python 3.12 --seed +source .venv/bin/activate uv pip install vllm ``` diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index 15519bfed9cb..b532bf42adfb 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'): "partly cloudly, with highs in the 90's.") -tool_funtions = {"get_current_weather": get_current_weather} +tool_functions = {"get_current_weather": get_current_weather} tools = [{ "type": "function", @@ -122,7 +122,7 @@ def get_current_weather(city: str, state: str, unit: 'str'): # above defined function tool_calls = json.loads(output) tool_answers = [ - tool_funtions[call['name']](**call['arguments']) for call in tool_calls + tool_functions[call['name']](**call['arguments']) for call in tool_calls ] # append the answer as a tool message and let the LLM give you an answer diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 965915beaf58..f636a08c0b09 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -65,11 +65,17 @@ def parse_args(): type=int, default=0, help="Master node port") + parser.add_argument("--enforce-eager", + action='store_true', + help="Enforce eager mode execution.") + parser.add_argument("--trust-remote-code", + action='store_true', + help="Trust remote code.") return parser.parse_args() def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, - dp_master_port, GPUs_per_dp_rank): + dp_master_port, GPUs_per_dp_rank, enforce_eager, trust_remote_code): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) @@ -109,10 +115,13 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, max_tokens=[16, 20][global_dp_rank % 2]) # Create an LLM. - llm = LLM(model=model, - tensor_parallel_size=GPUs_per_dp_rank, - enforce_eager=True, - enable_expert_parallel=True) + llm = LLM( + model=model, + tensor_parallel_size=GPUs_per_dp_rank, + enforce_eager=enforce_eager, + enable_expert_parallel=True, + trust_remote_code=trust_remote_code, + ) outputs = llm.generate(prompts, sampling_params) # Print the outputs. for i, output in enumerate(outputs): @@ -155,7 +164,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, proc = Process(target=main, args=(args.model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, - tp_size)) + tp_size, args.enforce_eager, + args.trust_remote_code)) proc.start() procs.append(proc) exit_code = 0 diff --git a/examples/offline_inference/disaggregated-prefill-v1/README.md b/examples/offline_inference/disaggregated-prefill-v1/README.md new file mode 100644 index 000000000000..f708eb253838 --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/README.md @@ -0,0 +1,9 @@ +# Disaggregated Prefill V1 + +This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM. + +## Files + +- `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially. 
+- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`. +- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`. diff --git a/examples/offline_inference/openai/openai_batch.md b/examples/offline_inference/openai_batch/README.md similarity index 94% rename from examples/offline_inference/openai/openai_batch.md rename to examples/offline_inference/openai_batch/README.md index d271573aa96f..42a19f71e9de 100644 --- a/examples/offline_inference/openai/openai_batch.md +++ b/examples/offline_inference/openai_batch/README.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -30,13 +30,13 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ```console -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ```console -$ cat offline_inference/openai/openai_example_batch.jsonl +$ cat offline_inference/openai_batch/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -48,7 +48,7 @@ The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` ```console -python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -65,10 +65,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run ```console -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -89,13 +89,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ```console -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ```console -$ cat offline_inference/openai/openai_example_batch.jsonl +$ cat offline_inference/openai_batch/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -103,7 +103,7 @@ $ cat offline_inference/openai/openai_example_batch.jsonl Now upload your batch file to your S3 bucket. ```console -aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/offline_inference/openai/openai_example_batch.jsonl b/examples/offline_inference/openai_batch/openai_example_batch.jsonl similarity index 100% rename from examples/offline_inference/openai/openai_example_batch.jsonl rename to examples/offline_inference/openai_batch/openai_example_batch.jsonl diff --git a/examples/online_serving/disaggregated_serving/README.md b/examples/online_serving/disaggregated_serving/README.md new file mode 100644 index 000000000000..090afd7515ee --- /dev/null +++ b/examples/online_serving/disaggregated_serving/README.md @@ -0,0 +1,8 @@ +# Disaggregated Serving + +This example contains scripts that demonstrate the disaggregated serving features of vLLM. + +## Files + +- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances). +- `kv_events.sh` - Demonstrates KV cache event publishing. 
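+
+## Usage
+
+A typical invocation, following the usage notes at the top of `disagg_proxy_demo.py`, assumes two prefill instances (ports 8100/8101) and two decode instances (ports 8200/8201) are already running, and that `$model_name` is set to the served model; it looks roughly like:
+
+```console
+python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \
+    --model $model_name \
+    --prefill localhost:8100 localhost:8101 \
+    --decode localhost:8200 localhost:8201
+```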
diff --git a/examples/online_serving/disagg_examples/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py similarity index 99% rename from examples/online_serving/disagg_examples/disagg_proxy_demo.py rename to examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index a701636f357a..1bf4d50e2c92 100644 --- a/examples/online_serving/disagg_examples/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -4,7 +4,7 @@ example usage of XpYd disaggregated prefilling. We can launch multiple vllm instances (2 for prefill and 2 for decode), and launch this proxy demo through: - python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \ + python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \ --model $model_name \ --prefill localhost:8100 localhost:8101 \ --decode localhost:8200 localhost:8201 \ diff --git a/examples/online_serving/kv_events.sh b/examples/online_serving/disaggregated_serving/kv_events.sh similarity index 100% rename from examples/online_serving/kv_events.sh rename to examples/online_serving/disaggregated_serving/kv_events.sh diff --git a/examples/online_serving/opentelemetry/Otel.md b/examples/online_serving/opentelemetry/README.md similarity index 100% rename from examples/online_serving/opentelemetry/Otel.md rename to examples/online_serving/opentelemetry/README.md diff --git a/pyproject.toml b/pyproject.toml index 46cf7a801fd6..0b803a26b658 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,16 +71,15 @@ exclude = [ "vllm/third_party/**" = ["ALL"] "vllm/version.py" = ["F401"] "vllm/_version.py" = ["ALL"] -# Python 3.8 typing. TODO: Remove these excludes after v1.0.0 +# Python 3.8 typing - skip V0 code "vllm/attention/**/*.py" = ["UP006", "UP035"] "vllm/core/**/*.py" = ["UP006", "UP035"] "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] -"vllm/model_executor/model_loader/**/*.py" = ["UP006", "UP035"] -"vllm/model_executor/models/**/*.py" = ["UP006", "UP035"] "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] "vllm/spec_decode/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] +# Python 3.8 typing - skip utils for ROCm "vllm/utils.py" = ["UP006", "UP035"] [tool.ruff.lint] @@ -170,3 +169,9 @@ plugins.md013.enabled = false # line-length plugins.md041.enabled = false # first-line-h1 plugins.md033.enabled = false # inline-html plugins.md024.allow_different_nesting = true # no-duplicate-headers + +[tool.ty] +respect-ignore-files = true + +[tool.ty.environment] +python = "./.venv" diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index abd4212c6e35..25f950a99ece 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -22,4 +22,10 @@ decord==0.6.0 #sentence-transformers # required by entrypoints/openai/test_score.py sentence-transformers==3.4.1 +# Basic Models Test +matplotlib==3.10.3 + +# Multi-Modal Models Test (Extended) 3 +blobfile==3.0.0 + diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 663b722426c5..9773f3e45b99 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -145,6 +145,83 @@ async def test_tokenize_chat( } +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name,tokenizer_name", + [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + indirect=["tokenizer_name"], +) +async def test_tokenize_chat_with_tools( + 
server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode="fast") + + for add_generation in [False, True]: + for add_special in [False, True]: + conversation = [{ + "role": + "user", + "content": + "What's the weather like in Paris today?", + }] + + tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string" + } + }, + }, + }, + }] + + for continue_final in [False, True]: + if add_generation and continue_final: + continue + if continue_final: + conversation.append({ + "role": "assistant", + "content": "Sure," + }) + + prompt = tokenizer.apply_chat_template( + add_generation_prompt=add_generation, + continue_final_message=continue_final, + conversation=conversation, + tools=tools, + tokenize=False, + ) + tokens = tokenizer.encode(prompt, + add_special_tokens=add_special) + + response = requests.post( + server.url_for("tokenize"), + json={ + "add_generation_prompt": add_generation, + "continue_final_message": continue_final, + "add_special_tokens": add_special, + "messages": conversation, + "model": model_name, + "tools": tools, + }, + ) + response.raise_for_status() + + assert response.json() == { + "tokens": tokens, + "count": len(tokens), + "max_model_len": 8192, + } + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name,tokenizer_name", diff --git a/tests/kernels/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py similarity index 98% rename from tests/kernels/test_triton_unified_attention.py rename to tests/kernels/attention/test_triton_unified_attention.py index 50da8e5fd5cd..4e15d00255a4 100644 --- a/tests/kernels/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -99,6 +99,9 @@ def test_triton_unified_attn( ) -> None: torch.set_default_device("cuda") + if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32: + pytest.skip("block size must be at least 32 for fp8") + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index d81c7487b88c..383a3c83b84a 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -29,12 +29,20 @@ def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int, return (batch_size, seq_len, num_heads * head_size) +# For testing sliced tensors +def _get_padded_tensor_shape(batch_size: int, seq_len: int, num_heads: int, + head_size: int) -> tuple[int, ...]: + return (batch_size, seq_len, num_heads, head_size + 64) + + def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int, head_size: int) -> tuple[int, ...]: return (batch_size, seq_len, num_heads, head_size) -TENSORS_SHAPES_FN = [_get_batch_tensor_shape, _get_flat_tensor_shape] +TENSORS_SHAPES_FN = [ + _get_batch_tensor_shape, _get_flat_tensor_shape, _get_padded_tensor_shape +] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -79,6 +87,10 @@ def test_rotary_embedding( query = torch.randn(query_shape, dtype=dtype) key = torch.randn_like(query) if use_key else None + # slice tensor if required, noop otherwise + query = query[..., :head_size] + key = key[..., :head_size] if use_key else None + # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is 
in-place. ref_query, ref_key = rope.forward_native(positions, query, key) diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index 4e54861005f2..8383f943b9fa 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -38,9 +38,10 @@ def rotary_embedding_opcheck(rot, @pytest.mark.parametrize("head_size", [32, 108]) @pytest.mark.parametrize("seq_len", [11, 1024]) @pytest.mark.parametrize("use_key", [True, False]) +@pytest.mark.parametrize("head_stride_is_contingous", [True, False]) def test_rotary_embedding_opcheck(dist_init, device, max_position, is_neox_style, rotary_dim, head_size, - seq_len, use_key): + seq_len, use_key, head_stride_is_contingous): batch_size = 1 base = 10000 num_heads = 7 @@ -50,15 +51,27 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position, positions = torch.randint(0, max_position, (batch_size, seq_len), device=device) + head_stride = head_size + (64 if head_stride_is_contingous else 0) + query = torch.randn(batch_size, seq_len, - num_heads * head_size, + num_heads, + head_stride, dtype=torch.float32, device=device) key = torch.randn_like(query) if use_key else None + query = query[..., :head_size] + key = key[..., :head_size] if use_key else None rotary_embedding_opcheck(rot, positions, query, key) offsets = torch.zeros(batch_size * seq_len, device=device, dtype=torch.long) rotary_embedding_opcheck(rot, positions, query, key, offsets) + + # if we have a contiguous head stride, test the alternate + # [..., num_heads * head_dim] shape/layout + if head_stride_is_contingous: + rotary_embedding_opcheck( + rot, positions, query.flatten(start_dim=-2), + key.flatten(start_dim=-2) if use_key else None) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py new file mode 100644 index 000000000000..7d369edfc86a --- /dev/null +++ b/tests/kernels/moe/test_batched_moe.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass + +import pytest +import torch +import triton.language as tl + +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + invoke_moe_batched_triton_kernel) + + +@dataclass +class BatchedMMConfig: + dtype: torch.dtype + num_experts: int + max_tokens_per_expert: int + K: int + N: int + + +@dataclass +class BatchedMMTensors: + A: torch.Tensor # [E, max_tokens, K] + B: torch.Tensor # [E, K, N] - column major + C: torch.Tensor # [E, max_tokens, N] + num_expert_tokens: torch.Tensor # [E] + + @staticmethod + def make_tensors(config: BatchedMMConfig): + A = torch.randn( + (config.num_experts, config.max_tokens_per_expert, config.K), + device="cuda", + dtype=config.dtype) / 10 + B = torch.randn((config.num_experts, config.N, config.K), + device="cuda", + dtype=config.dtype) + C = torch.zeros( + (config.num_experts, config.max_tokens_per_expert, config.N), + device="cuda", + dtype=config.dtype) + num_expert_tokens = torch.randint(low=0, + high=config.max_tokens_per_expert, + size=(config.num_experts, ), + device="cuda", + dtype=torch.int32) + return BatchedMMTensors(A, B, C, num_expert_tokens) + + +def ref_impl(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, + num_expert_tokens: torch.Tensor) -> torch.Tensor: + + num_expert_tokens_cpu = num_expert_tokens.clone() + num_expert_tokens_cpu = num_expert_tokens_cpu.to(device="cpu") + num_experts = num_expert_tokens.size(0) + + for e in range(num_experts): + num_tokens = num_expert_tokens_cpu[e] + C[e, :num_tokens, :] = 
A[e, :num_tokens, :] @ B[e].transpose(0, 1) + + return C + + +@pytest.mark.parametrize("num_experts", [16, 32]) +@pytest.mark.parametrize("max_tokens_per_expert", + [32, 64, 128, 192, 224, 256, 512]) +@pytest.mark.parametrize("K", [128, 256, 1024]) +@pytest.mark.parametrize("N", [128, 256, 512, 1024]) +@pytest.mark.parametrize("dtype", + [torch.float32, torch.float16, torch.bfloat16]) +def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, + N: int, dtype: torch.dtype): + + config = BatchedMMConfig(dtype, num_experts, max_tokens_per_expert, K, N) + tensors = BatchedMMTensors.make_tensors(config) + + test_output = tensors.C + ref_output = test_output.clone() + + compute_tl_dtype = { + torch.float16: tl.float16, + torch.bfloat16: tl.bfloat16, + torch.float32: tl.float32 + }[test_output.dtype] + invoke_moe_batched_triton_kernel( + tensors.A, + tensors.B, + test_output, + tensors.num_expert_tokens, + compute_tl_dtype, + # Quantization data + None, + None, + None, + # Quantization schemes + False, + False, + False, + config={ + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 16 + }) + + ref_output = ref_impl(tensors.A, tensors.B, ref_output, + tensors.num_expert_tokens) + + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[test_output.dtype] + + torch.testing.assert_close(test_output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 975cd418a171..7db4fe0f46e3 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -30,6 +30,11 @@ (224, 3072, 1536), ] +vllm_config = VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1)) +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + @dataclasses.dataclass class MOETensors: @@ -190,7 +195,7 @@ def run_8_bit(moe_tensors: MOETensors8Bit, 'w1_q': moe_tensors.w1_q.transpose(1, 2), # type: ignore[union-attr] 'w2_q': moe_tensors.w2_q.transpose(1, 2), # type: ignore[union-attr] 'topk_weights': topk_weights, - 'topk_ids_': topk_ids, + 'topk_ids': topk_ids, 'ab_strides1': moe_tensors.ab_strides1, 'c_strides1': moe_tensors.c_strides1, 'ab_strides2': moe_tensors.ab_strides2, @@ -231,18 +236,15 @@ def test_cutlass_moe_8_bit_no_graph( per_out_ch: bool, ): current_platform.seed_everything(7) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - + with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) score = torch.randn((m, e), device="cuda", dtype=torch.half) - topk_weights, topk_ids = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, + score, + topk, + renormalize=False) # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. 
@@ -276,20 +278,17 @@ def test_cutlass_moe_8_bit_cuda_graph( per_out_ch: bool, ): current_platform.seed_everything(7) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - + with set_current_vllm_config(vllm_config): dtype = torch.half mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, + score, + topk, + renormalize=False) # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. @@ -334,18 +333,15 @@ def test_cutlass_moe_8_bit_EP( ep_size: int, ): current_platform.seed_everything(7) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - + with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_channel) score = torch.randn((m, e), device="cuda", dtype=torch.half) - topk_weights, topk_ids = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, + score, + topk, + renormalize=False) # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 96b090136e3c..43ddc79fcb81 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -12,6 +12,7 @@ import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import opcheck, stack_and_dev, torch_moe +from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( @@ -32,6 +33,10 @@ EP_SIZE = [1, 4] TOP_KS = [2, 6] +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + @pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128]) @pytest.mark.parametrize("n", [128, 1024, 2048]) @@ -70,31 +75,33 @@ def test_fused_moe( else: e_map = None - torch_output = torch_moe(a, w1, w2, score, topk, e_map) - iterative_output = iterative_moe(a, - w1, - w2, - score, - topk, - global_num_experts=e, - expert_map=e_map, - renormalize=False) + with set_current_vllm_config(vllm_config): + torch_output = torch_moe(a, w1, w2, score, topk, e_map) + iterative_output = iterative_moe(a, + w1, + w2, + score, + topk, + global_num_experts=e, + expert_map=e_map, + renormalize=False) + + # Pad the weight if moe padding is enabled + if padding: + w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128] + torch.cuda.empty_cache() + w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128] + torch.cuda.empty_cache() + + triton_output = fused_moe(a, + w1, + w2, + score, + topk, + global_num_experts=e, + expert_map=e_map, + renormalize=False) - # Pad the weight if moe padding is enabled - if padding: - w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128] - torch.cuda.empty_cache() - w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128] - torch.cuda.empty_cache() - - triton_output = fused_moe(a, - w1, - w2, - score, - topk, - global_num_experts=e, - expert_map=e_map, - renormalize=False) torch.testing.assert_close(triton_output, torch_output, 
atol=2e-2, rtol=0) torch.testing.assert_close(iterative_output, torch_output, @@ -115,7 +122,6 @@ def test_fused_moe( def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, ep_size: int, dtype: torch.dtype, group_size: int, has_zp: bool, weight_bits: int): - print(m, n, k, e, topk, dtype, group_size, has_zp, weight_bits) a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 @@ -194,22 +200,24 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, else: e_map = None - triton_output = fused_moe(a, - w1_qweight, - w2_qweight, - score, - topk, - renormalize=False, - use_int4_w4a16=weight_bits == 4, - use_int8_w8a16=weight_bits == 8, - global_num_experts=e, - expert_map=e_map, - w1_scale=w1_scales, - w2_scale=w2_scales, - w1_zp=w1_qzeros if has_zp else None, - w2_zp=w2_qzeros if has_zp else None, - block_shape=[0, group_size]) - torch_output = torch_moe(a, w1_ref, w2_ref, score, topk, e_map) + with set_current_vllm_config(vllm_config): + triton_output = fused_moe(a, + w1_qweight, + w2_qweight, + score, + topk, + renormalize=False, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + global_num_experts=e, + expert_map=e_map, + w1_scale=w1_scales, + w2_scale=w2_scales, + w1_zp=w1_qzeros if has_zp else None, + w2_zp=w2_qzeros if has_zp else None, + block_shape=[0, group_size]) + torch_output = torch_moe(a, w1_ref, w2_ref, score, topk, e_map) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) @@ -515,7 +523,8 @@ def test_fused_marlin_moe( topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) - torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map) + with set_current_vllm_config(vllm_config): + torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map) marlin_output = torch.ops.vllm.fused_marlin_moe( a, diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py new file mode 100644 index 000000000000..8c4a2c3fa440 --- /dev/null +++ b/tests/kernels/moe/test_pplx_moe.py @@ -0,0 +1,691 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the MOE layers. + +Run `pytest tests/kernels/test_pplx_moe.py`. 
+""" +import dataclasses +import os +import traceback +from typing import Callable, Optional + +import pytest +import torch + +try: + from pplx_kernels import AllToAll + from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, + nvshmem_finalize, nvshmem_get_unique_id, + nvshmem_init) + has_pplx = True +except ImportError: + has_pplx = False + +from torch.multiprocessing import ( + spawn) # pyright: ignore[reportPrivateImportUsage] +from typing_extensions import Concatenate, ParamSpec + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import override_config +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedExperts, BatchedPrepareAndFinalize, BatchedTritonExperts) +from vllm.model_executor.layers.fused_moe.fused_moe import (fused_topk, + get_default_config) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.platforms import current_platform + +PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512), + (222, 2048, 1024)] + +PPLX_MOE_COMBOS = [ + (1, 128, 128), + (2, 128, 512), + (3, 1024, 2048), + (32, 128, 1024), + (45, 512, 2048), + (64, 1024, 1024), + (222, 1024, 2048), +] + +NUM_EXPERTS = [8, 64] +EP_SIZE = [1, 4] +TOP_KS = [1, 2, 6] + +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + +P = ParamSpec("P") + +requires_pplx = pytest.mark.skipif( + not has_pplx, + reason="Requires PPLX kernels", +) + + +@dataclasses.dataclass +class ProcessGroupInfo: + world_size: int + world_local_size: int + rank: int + node_rank: int + local_rank: int + device: torch.device + + +def _worker_parallel_launch( + local_rank: int, + world_size: int, + world_local_size: int, + node_rank: int, + init_method: str, + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + rank = node_rank * world_local_size + local_rank + torch.cuda.set_device(local_rank) + device = torch.device("cuda", local_rank) + torch.distributed.init_process_group( + backend="cpu:gloo,cuda:nccl", + init_method=init_method, + rank=rank, + world_size=world_size, + device_id=device, + ) + barrier = torch.tensor([rank], device=device) + torch.distributed.all_reduce(barrier) + + try: + worker( + ProcessGroupInfo( + world_size=world_size, + world_local_size=world_local_size, + rank=rank, + node_rank=node_rank, + local_rank=local_rank, + device=device, + ), + *args, + **kwargs, + ) + except Exception as ex: + print(ex) + traceback.print_exc() + raise + finally: + torch.distributed.destroy_process_group() + + +def parallel_launch( + world_size: int, + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + assert not kwargs + spawn( + _worker_parallel_launch, + args=( + world_size, + world_size, + 0, + "tcp://localhost:29500", + worker, + ) + args, + nprocs=world_size, + join=True, + ) + + +def parallel_launch_from_env( + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + """ + Launches a worker function in parallel across all processes in the current + environment. The environment must have the following variables set: + - WORLD_SIZE: The total number of processes. + - WORLD_LOCAL_SIZE: The number of processes on the current node. 
+ - NODE_RANK: The rank of the current + - MASTER_ADDR: The address of the master process. + - MASTER_PORT: The port of the master process. + """ + assert not kwargs + world_size = int(os.environ["WORLD_SIZE"]) + world_local_size = int(os.environ["WORLD_LOCAL_SIZE"]) + node_rank = int(os.environ["NODE_RANK"]) + assert "MASTER_ADDR" in os.environ + assert "MASTER_PORT" in os.environ + spawn( + _worker_parallel_launch, + args=( + world_size, + world_local_size, + node_rank, + "env://", + worker, + ) + args, + nprocs=world_local_size, + join=True, + ) + + +def torch_prepare( + a: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + max_num_tokens: Optional[int] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + assert topk_ids.dim() == 2 + assert topk_ids.shape[0] == a.shape[0] + + num_tokens, hidden_dim = a.shape + topk = topk_ids.shape[1] + + tokens_per_expert = torch.bincount(topk_ids.view(-1), + minlength=num_experts) + + assert tokens_per_expert.numel() == num_experts + + if max_num_tokens is None: + max_num_tokens = int(tokens_per_expert.max().item()) + + b_a = torch.zeros((num_experts, max_num_tokens, hidden_dim), + dtype=a.dtype, + device=a.device) + + token_counts = torch.zeros(num_experts, dtype=torch.int, device=a.device) + + for token in range(num_tokens): + for j in range(topk): + expert_id = topk_ids[token, j] + idx = token_counts[expert_id] + b_a[expert_id, idx:idx + 1, :] = a[token, :] + token_counts[expert_id] = token_counts[expert_id] + 1 + + return b_a, tokens_per_expert + + +def torch_finalize(b_out: torch.Tensor, topk_weight: torch.Tensor, + topk_ids: torch.Tensor) -> torch.Tensor: + num_tokens = topk_ids.shape[0] + num_experts = b_out.shape[0] + K = b_out.shape[-1] + out = torch.zeros((num_tokens, K), dtype=b_out.dtype, device=b_out.device) + expert_counts = torch.zeros(num_experts, + dtype=torch.int, + device=b_out.device) + for token in range(num_tokens): + expert_ids = topk_ids[token] + for i in range(expert_ids.numel()): + expert_id = expert_ids[i] + idx = expert_counts[expert_id] + out[token, :] = out[token, :] + b_out[expert_id, idx:idx + + 1, :] * topk_weight[token, i] + expert_counts[expert_id] = expert_counts[expert_id] + 1 + + return out + + +def torch_batched_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, +) -> torch.Tensor: + num_experts = w1.shape[0] + b_a, tokens_per_expert = torch_prepare(a, topk_ids, num_experts) + assert b_a.dim() == 3 + num_tokens, topk = topk_ids.shape + _, max_num_tokens, K = b_a.shape + assert num_experts == b_a.shape[0] and w2.shape[1] == K + out = torch.zeros((num_experts, max_num_tokens, K), + dtype=b_a.dtype, + device=b_a.device) + tmp = torch.empty((max_num_tokens, w1.shape[1] // 2), + dtype=b_a.dtype, + device=b_a.device) + for expert in range(num_experts): + num = tokens_per_expert[expert] + if num > 0: + torch.ops._C.silu_and_mul( + tmp[:num], b_a[expert, :num, :] @ w1[expert].transpose(0, 1)) + out[expert, :num, :] = tmp[:num] @ w2[expert].transpose(0, 1) + + return torch_finalize(out, topk_weight, topk_ids) + + +def batched_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, +) -> torch.Tensor: + num_experts = w1.shape[0] + + fused_experts = FusedMoEModularKernel( + BatchedPrepareAndFinalize(a.shape[0], world_size=1, dp_size=1, rank=0), + BatchedExperts(max_num_tokens=a.shape[0], dp_size=1, world_size=1)) + + return fused_experts(a, w1, w2, topk_weight, topk_ids, num_experts) + 
+ +# Note: same as torch_moe but with fused_topk factored out. +def torch_moe2( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, +) -> torch.Tensor: + M, K = a.shape + topk = topk_ids.shape[1] + a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K) + out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device) + num_experts = w1.shape[0] + for i in range(num_experts): + mask = (topk_ids == i).view(-1) + if mask.sum(): + out[mask] = SiluAndMul()( + a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) + + return (out.view(M, -1, w2.shape[1]) * + topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1) + + +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) +@pytest.mark.parametrize("k", [128, 512, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +def test_fused_moe_batched_experts( + m: int, + n: int, + k: int, + e: int, + topk: int, + dtype: torch.dtype, +): + current_platform.seed_everything(7) + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + score = torch.randn((m, e), device="cuda", dtype=dtype) + + with set_current_vllm_config(vllm_config): + topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + baseline_output = torch_moe2(a, w1, w2, topk_weight, topk_ids) + torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids) + batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids) + + torch.testing.assert_close(baseline_output, + torch_output, + atol=2e-2, + rtol=0) + torch.testing.assert_close(baseline_output, + batched_output, + atol=2e-2, + rtol=0) + + +def rank_chunk(num: int, r: int, w: int) -> int: + rem = num % w + return (num // w) + (1 if r < rem else 0) + + +def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor: + chunk = rank_chunk(t.shape[0], r, w) + return t[(r * chunk):(r + 1) * chunk] + + +def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor, + topk_weight: torch.Tensor, topk_ids: torch.Tensor, + num_experts: int) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( + PplxPrepareAndFinalize) + + assert torch.cuda.current_device() == pgi.local_rank + + topk = topk_ids.shape[1] + num_tokens, hidden_dim = a.shape + block_size = 128 + device = pgi.device + rank = pgi.rank + world_size = pgi.world_size + max_num_tokens = rank_chunk(num_tokens, 0, world_size) + + ata = AllToAll.internode( + max_num_tokens=max_num_tokens, + num_experts=num_experts, + experts_per_token=topk, + rank=rank, + world_size=world_size, + dp_size=dp_size, + hidden_dim=hidden_dim, + hidden_dim_bytes=hidden_dim * a.dtype.itemsize, + hidden_dim_scale_bytes=(0 if a.dtype.itemsize != 1 else + ((hidden_dim + block_size - 1) // block_size * + torch.float32.itemsize)), + ) + + topk_ids = topk_ids.to(dtype=torch.uint32) + + prepare_finalize = PplxPrepareAndFinalize( + ata, + max_num_tokens, + world_size, + rank, + dp_size, + a.dtype, + ) + + a_chunk = chunk_by_rank(a, rank, world_size).to(device) + chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) + chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) + + b_a, b_a_scale, expert_num_tokens = prepare_finalize.prepare( + a_chunk, + None, + None, + chunk_topk_weight, + 
chunk_topk_ids, + num_experts, + None, + False, + ) + + b_a = b_a * 1.5 + + out = torch.full( + (max_num_tokens, hidden_dim), + torch.nan, + dtype=a.dtype, + device=device, + ) + + prepare_finalize.finalize( + out, + b_a, + chunk_topk_weight, + chunk_topk_ids, + False, + ) + + torch.cuda.synchronize() + + ata.destroy() + + num_tokens = a_chunk.shape[0] + + return out[:num_tokens] + + +def _pplx_prepare_finalize( + pgi: ProcessGroupInfo, + dp_size: int, + a: torch.Tensor, + score: torch.Tensor, + topk: torch.Tensor, + num_experts: int, +): + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + device = pgi.device + + topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + k = a.shape[1] + + a_rep = torch.repeat_interleave(a, topk, dim=0).to(device) + + torch_output = (a_rep.view(-1, topk, k) * 1.5 * + topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to( + a.dtype) + + pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids, + num_experts) + + torch_output = chunk_by_rank(torch_output, pgi.rank, + pgi.world_size).to(pplx_output.device) + + torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0) + + nvshmem_finalize() + + +# TODO (bnell): this test point does not work for odd M due to how the test is +# written, not due to limitations of the pplx kernels. The pplx_moe +# test below is able to deal with odd M. +@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@requires_pplx +def test_pplx_prepare_finalize( + mnk: tuple[int, int, int], + e: int, + topk: int, + dtype: torch.dtype, + world_dp_size: tuple[int, int], +): + current_platform.seed_everything(7) + m, n, k = mnk + world_size, dp_size = world_dp_size + device = "cuda" + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + score = torch.randn((m, e), device=device, dtype=dtype) + + parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score, + topk, e) + + +def pplx_moe( + rank: int, + world_size: int, + dp_size: int, + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + use_compile: bool = True, + use_cudagraphs: bool = True, +) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( + PplxPrepareAndFinalize) + + device = torch.device("cuda", rank) + hidden_dim = a.shape[1] + num_experts = w1.shape[0] + block_size = 128 + topk = topk_ids.shape[1] + max_num_tokens = rank_chunk(a.shape[0], 0, world_size) + + ata = AllToAll.internode( + max_num_tokens=max_num_tokens, + num_experts=num_experts, + experts_per_token=topk, + rank=rank, + world_size=world_size, + dp_size=dp_size, + hidden_dim=hidden_dim, + hidden_dim_bytes=hidden_dim * a.dtype.itemsize, + hidden_dim_scale_bytes=(0 if a.dtype.itemsize != 1 else + ((hidden_dim + block_size - 1) // block_size * + torch.float32.itemsize)), + ) + + topk_ids = topk_ids.to(dtype=torch.uint32) + + prepare_finalize = PplxPrepareAndFinalize( + ata, + max_num_tokens, + world_size, + rank, + dp_size, + ) + + experts = BatchedTritonExperts(max_num_tokens=a.shape[0], + world_size=world_size, + dp_size=dp_size) + + fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + # Note: workers with the same dp_rank must use 
the exact same inputs. + a_chunk = chunk_by_rank(a, rank, world_size).to(device) + chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) + chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) + + # Chunking weights like this only works for batched format + w1_chunk = chunk_by_rank(w1, rank, world_size).to(device) + w2_chunk = chunk_by_rank(w2, rank, world_size).to(device) + + if use_compile: + _fused_experts = torch.compile(fused_experts, + backend='inductor', + fullgraph=True) + else: + _fused_experts = fused_experts + + out = _fused_experts(a_chunk, + w1_chunk, + w2_chunk, + chunk_topk_weight, + chunk_topk_ids, + global_num_experts=num_experts) + + if use_cudagraphs: + out.fill_(0) + stream = torch.cuda.Stream() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream): + out = _fused_experts(a_chunk, + w1_chunk, + w2_chunk, + chunk_topk_weight, + chunk_topk_ids, + global_num_experts=num_experts) + + torch.cuda.synchronize() + graph.replay() + + torch.cuda.synchronize() + + ata.destroy() + + return out + + +def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids): + assert torch.cuda.current_device() == pgi.local_rank + + num_experts = w1.shape[0] + device = pgi.device + rank = pgi.rank + world_size = pgi.world_size + max_num_tokens = rank_chunk(a.shape[0], 0, world_size) + + prepare_finalize = BatchedPrepareAndFinalize( + max_num_tokens=max_num_tokens, + world_size=world_size, + dp_size=dp_size, + rank=rank, + ) + + experts = BatchedExperts(max_num_tokens=a.shape[0], + world_size=1, + dp_size=1) + + fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + # Note: workers with the same dp_rank must use the exact same inputs. + a_chunk = chunk_by_rank(a, rank, world_size).to(device) + chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) + chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) + + out = fused_experts( + a_chunk, + # Chunking weights like this only works for batched format + chunk_by_rank(w1, rank, world_size).to(device), + chunk_by_rank(w2, rank, world_size).to(device), + chunk_topk_weight, + chunk_topk_ids, + global_num_experts=num_experts) + + return out + + +def _pplx_moe( + pgi: ProcessGroupInfo, + dp_size: int, + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + score: torch.Tensor, + topk: int, +): + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + + m, k = a.shape + e, _, n = w2.shape + + moe_config = get_default_config(m, e, n, k, topk, a.dtype, False) + + with set_current_vllm_config(vllm_config), override_config(moe_config): + topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids) + pplx_output = pplx_moe(pgi.rank, pgi.world_size, dp_size, a, w1, w2, + topk_weight, topk_ids) + # TODO (bnell): fix + re-enable + #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, + # topk_ids) + + torch_output = chunk_by_rank(torch_output, pgi.rank, + pgi.world_size).to(pplx_output.device) + + torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0) + #torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0) + + nvshmem_finalize() + + +@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", 
[torch.bfloat16]) +@pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@requires_pplx +def test_pplx_moe( + mnk: tuple[int, int, int], + e: int, + topk: int, + dtype: torch.dtype, + world_dp_size: tuple[int, int], +): + current_platform.seed_everything(7) + m, n, k = mnk + world_size, dp_size = world_dp_size + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + score = torch.randn((m, e), device="cuda", dtype=dtype) + + parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk) diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index 44734e9340aa..3b5838a99fa1 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -7,6 +7,7 @@ import torch from vllm import _custom_ops as ops +from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.platforms import current_platform @@ -15,6 +16,10 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): """Matrix multiplication function that supports per-token input @@ -137,20 +142,21 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale score = torch.randn((M, E), dtype=dtype) - ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) - out = fused_moe( - a, - w1, - w2, - score, - topk, - renormalize=False, - use_fp8_w8a8=True, # using fp8 - per_channel_quant=True, - w1_scale=w1_s, - w2_scale=w2_s, - block_shape=None, # Not using block quantization - ) + with set_current_vllm_config(vllm_config): + ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_fp8_w8a8=True, # using fp8 + per_channel_quant=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=None, # Not using block quantization + ) # Check results rel_diff = (torch.mean( diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index 38c7e461bb9c..ef1d7e47ef81 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -11,7 +11,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( - deep_gemm_moe_fp8) + _valid_deep_gemm_shape, deep_gemm_moe_fp8) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( moe_align_block_size) @@ -30,6 +30,10 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + # Test configurations DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] NUM_TOKENS = [7, 83, 2048] @@ -210,7 +214,6 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): score = 
torch.randn((M, E), dtype=dtype) # Set the context to avoid lots of warning spam. - vllm_config = VllmConfig() with set_current_vllm_config(vllm_config): out = fused_moe( a, @@ -258,6 +261,7 @@ def per_block_cast_to_fp8( @pytest.mark.parametrize( "M,N,K,block_size,out_dtype,seed", itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) +@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): # only aligned sizes @@ -381,15 +385,11 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed): block_size = [block_m, block_m] dtype = torch.bfloat16 - # only aligned sizes - if (N % block_m != 0 or K % block_m != 0 or topk > E): - pytest.skip( - f"Skipping test; bad size m={M}, n={N}, k={K}, topk={topk}, E={E}") - - if N <= 512: - pytest.skip("Skipping N <= 512 until performance issues solved.") + if topk > E: + pytest.skip(f"Skipping test: topk={topk} > E={E}") - vllm_config = VllmConfig() + if not _valid_deep_gemm_shape(M, N, K): + pytest.skip(f"Skipping test: invalid size m={M}, n={N}, k={K}") torch.manual_seed(seed) fp8_info = torch.finfo(torch.float8_e4m3fn) diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py index 104f23fd7cd2..a4e9f83f0eaf 100644 --- a/tests/kernels/quantization/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -18,6 +18,10 @@ pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + # For test def native_per_token_group_quant_int8(x, @@ -174,7 +178,6 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): score = torch.randn((M, E), dtype=dtype) # Set the context to avoid lots of warning spam. - vllm_config = VllmConfig() with set_current_vllm_config(vllm_config): out = fused_moe( a, diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 0875128c4ff1..90498c47fb10 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_path = get_adapter_absolute_path(lora_name) - # lora loading should work for either absolute path and hugggingface id. + # lora loading should work for either absolute path and huggingface id. 
peft_helper = PEFTHelper.from_local_dir(lora_path, 4096) lora_model = LoRAModel.from_local_checkpoint( lora_path, diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index 11dfe4d4995d..bdaba22c3c7a 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation(): try: # enable hf hub transfer if available import hf_transfer # type: ignore # noqa - HF_TRANFER_ACTIVE = True + HF_TRANSFER_ACTIVE = True except ImportError: - HF_TRANFER_ACTIVE = False + HF_TRANSFER_ACTIVE = False assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == - HF_TRANFER_ACTIVE) + HF_TRANSFER_ACTIVE) def test_download_weights_from_hf(): diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 6da488897be5..6e38c4c7cadb 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -2,6 +2,8 @@ """Test the functionality of the Transformers backend.""" import pytest +from vllm.platforms import current_platform + from ..conftest import HfRunner, VllmRunner from ..utils import multi_gpu_test from .utils import check_logprobs_close @@ -33,6 +35,9 @@ def check_implementation( ) +@pytest.mark.skipif( + current_platform.is_rocm(), + reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.") @pytest.mark.parametrize( "model,model_impl", [ @@ -64,6 +69,9 @@ def test_distributed( "meta-llama/Llama-3.2-1B-Instruct", **kwargs) +@pytest.mark.skipif( + current_platform.is_rocm(), + reason="bitsandbytes quantization is currently not supported in rocm.") @pytest.mark.parametrize("model, quantization_kwargs", [ ( "meta-llama/Llama-3.2-1B-Instruct", diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py new file mode 100644 index 000000000000..e67624ecefcb --- /dev/null +++ b/tests/multimodal/test_video.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +import numpy.typing as npt +import pytest + +from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader + +NUM_FRAMES = 10 +FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3) +FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3) + + +@VIDEO_LOADER_REGISTRY.register("test_video_loader_1") +class TestVideoLoader1(VideoLoader): + + @classmethod + def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray: + return FAKE_OUTPUT_1 + + +@VIDEO_LOADER_REGISTRY.register("test_video_loader_2") +class TestVideoLoader2(VideoLoader): + + @classmethod + def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray: + return FAKE_OUTPUT_2 + + +def test_video_loader_registry(): + custom_loader_1 = VIDEO_LOADER_REGISTRY.load("test_video_loader_1") + output_1 = custom_loader_1.load_bytes(b"test") + np.testing.assert_array_equal(output_1, FAKE_OUTPUT_1) + + custom_loader_2 = VIDEO_LOADER_REGISTRY.load("test_video_loader_2") + output_2 = custom_loader_2.load_bytes(b"test") + np.testing.assert_array_equal(output_2, FAKE_OUTPUT_2) + + +def test_video_loader_type_doesnt_exist(): + with pytest.raises(AssertionError): + VIDEO_LOADER_REGISTRY.load("non_existing_video_loader") diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 1a20228765e8..6571fc9e471b 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -31,9 +31,6 @@ def test_pre_quantized_model(vllm_runner): ]) def test_opt_125m_int4wo_model_loading_with_params(vllm_runner, 
pt_load_map_location): - """ - Test loading roberta-base model with no lm_head. - """ torch._dynamo.reset() model_name = "jerryzh168/opt-125m-int4wo" with vllm_runner(model_name=model_name, @@ -47,5 +44,20 @@ def test_opt_125m_int4wo_model_loading_with_params(vllm_runner, print(output) +@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") +def test_opt_125m_int4wo_model_per_module_quant(vllm_runner): + torch._dynamo.reset() + model_name = "jerryzh168/opt-125m-int4wo-per-module" + with vllm_runner(model_name=model_name, + quantization="torchao", + dtype="bfloat16", + pt_load_map_location="cuda:0") as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + + assert output + print(output) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py index eee535a146f4..2814bb6d3773 100644 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ b/tests/spec_decode/e2e/test_eagle_correctness.py @@ -178,6 +178,8 @@ def test_eagle_e2e_greedy_correctness_cuda_graph( batch_size, output_len, seed) +# TRACKING: https://github.com/vllm-project/vllm/issues/18166 +@pytest.mark.skip(reason="RE-ENABLE: Failing on main.") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/test_utils.py b/tests/test_utils.py index deff33e5c3ca..ea7db0a79c86 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,7 +17,7 @@ from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot, PlaceholderModule, StoreBoolean, bind_kv_cache, deprecate_kwargs, get_open_port, - make_zmq_socket, memory_profiling, + make_zmq_path, make_zmq_socket, memory_profiling, merge_async_iterators, sha256, split_zmq_path, supports_kw, swap_dict_values) @@ -714,3 +714,8 @@ def test_make_zmq_socket_ipv6(): # Clean up zsock.close() ctx.term() + + +def test_make_zmq_path(): + assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555" + assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555" diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 1cdc80dd3546..43a27da2dbe4 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import importlib import pytest import torch @@ -10,8 +11,7 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager # disable yapf here as it formats differently than isort such that both fail # yapf: disable -from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType, - FreeKVCacheBlockQueue, KVCacheBlock, +from vllm.v1.core.kv_cache_utils import (FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, @@ -19,7 +19,8 @@ hash_request_tokens, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor) + KVCacheGroupSpec, KVCacheTensor, + SlidingWindowSpec) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -54,21 +55,39 @@ def new_kv_cache_spec(block_size=16, num_kv_heads=2, head_size=64, dtype=torch.float32, - use_mla=False): + use_mla=False, + sliding_window=None): return FullAttentionSpec(block_size=block_size, num_kv_heads=num_kv_heads, head_size=head_size, dtype=dtype, - use_mla=use_mla) + use_mla=use_mla, + sliding_window=sliding_window) -def test_none_hash(): - assert NONE_HASH is not 
None - assert isinstance(NONE_HASH, int) - assert NONE_HASH != 0 +def test_none_hash(monkeypatch): + import vllm.v1.core.kv_cache_utils + + # case 1: PYTHONHASHSEED is not set, use random + with monkeypatch.context() as m: + m.delenv('PYTHONHASHSEED', raising=False) + reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils) + assert reloaded_kv_cache_utils.NONE_HASH is not None + assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int) + assert reloaded_kv_cache_utils.NONE_HASH != 0 + + # case 2: PYTHONHASHSEED is set, use the seed + with monkeypatch.context() as m: + m.setenv('PYTHONHASHSEED', 'python hash seed') + reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils) + assert reloaded_kv_cache_utils.NONE_HASH is not None + assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int) + assert sha256('python hash seed') == reloaded_kv_cache_utils.NONE_HASH def test_kv_cache_block(): + import vllm.v1.core.kv_cache_utils + # Test KVCacheBlock initialization block = KVCacheBlock(block_id=0) assert block.block_id == 0 @@ -82,7 +101,8 @@ def test_kv_cache_block(): assert block.ref_cnt == 0 # Test block hash setting and resetting - block_hash = BlockHashType(hash_value=123, token_ids=(1, 2, 3)) + block_hash = vllm.v1.core.kv_cache_utils.BlockHashType(hash_value=123, + token_ids=(1, 2, 3)) block.block_hash = block_hash assert block.block_hash == block_hash @@ -256,13 +276,14 @@ def test_generate_block_hash_extra_keys_cache_salt(): @pytest.mark.parametrize("hash_fn", [sha256, hash]) def test_hash_block_tokens(hash_fn): + import vllm.v1.core.kv_cache_utils parent_block_hash = 123 curr_block_token_ids = (1, 2, 3) extra_keys = ("key1", "key2") block_hash = hash_block_tokens(hash_fn, parent_block_hash, curr_block_token_ids, extra_keys) - assert isinstance(block_hash, BlockHashType) + assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHashType) assert block_hash.hash_value == hash_fn( (parent_block_hash, curr_block_token_ids, extra_keys)) assert block_hash.token_ids == curr_block_token_ids @@ -271,6 +292,7 @@ def test_hash_block_tokens(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, hash]) def test_hash_request_tokens(hash_fn): + import vllm.v1.core.kv_cache_utils request = make_request( request_id=0, prompt_token_ids=[_ for _ in range(6)], @@ -285,8 +307,10 @@ def test_hash_request_tokens(hash_fn): block_hashes = hash_request_tokens(hash_fn, block_size, request) assert len(block_hashes) == 2 - assert isinstance(block_hashes[0], BlockHashType) - assert isinstance(block_hashes[1], BlockHashType) + assert isinstance(block_hashes[0], + vllm.v1.core.kv_cache_utils.BlockHashType) + assert isinstance(block_hashes[1], + vllm.v1.core.kv_cache_utils.BlockHashType) # Check the first block assert block_hashes[0].token_ids == (0, 1, 2) @@ -471,6 +495,68 @@ def test_unify_kv_cache_configs(): unify_kv_cache_configs(diff_kv_cache_config) +def test_merge_kv_cache_spec(): + same_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=32), + ] + merged_layer_spec = same_layer_specs[0].merge(same_layer_specs) + assert merged_layer_spec.block_size == 16 + assert merged_layer_spec.num_kv_heads == 32 + assert merged_layer_spec.head_size == 64 + assert merged_layer_spec.dtype == torch.float32 + assert merged_layer_spec.sliding_window is None + + different_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=16), + ] + with pytest.raises(AssertionError): + different_layer_specs[0].merge(different_layer_specs) + + 
full_spec = new_kv_cache_spec(num_kv_heads=32) + different_type_layer_specs = [ + full_spec, + SlidingWindowSpec( + block_size=full_spec.block_size, + num_kv_heads=full_spec.num_kv_heads, + head_size=full_spec.head_size, + dtype=full_spec.dtype, + use_mla=full_spec.use_mla, + sliding_window=1, + ), + ] + with pytest.raises(AssertionError): + different_type_layer_specs[0].merge(different_type_layer_specs) + with pytest.raises(AssertionError): + different_type_layer_specs[1].merge(different_type_layer_specs) + + different_sliding_window_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=2), + ] + with pytest.raises(ValueError): + different_sliding_window_layer_specs[0].merge( + different_sliding_window_layer_specs) + + same_sliding_window_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + ] + merged_layer_spec = same_sliding_window_layer_specs[0].merge( + same_sliding_window_layer_specs) + assert merged_layer_spec.sliding_window == 1 + + same_sliding_window_layer_spec_with_none = [ + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=None), + ] + merged_layer_spec = same_sliding_window_layer_spec_with_none[0].merge( + same_sliding_window_layer_spec_with_none) + assert merged_layer_spec.sliding_window == 1 + + @pytest.mark.parametrize( ("model_id", "max_model_len", "want_estimated_max_len"), [ ("Qwen/Qwen1.5-7B", 16385, 16384), diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 2d7411381e16..3da27786b1f2 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -84,7 +84,7 @@ def test_prefill(hash_algo): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] # Check full block metadata parent_block_hash = None @@ -107,13 +107,13 @@ def test_prefill(hash_algo): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] for block in computed_blocks.blocks: assert block.ref_cnt == 2 @@ -141,13 +141,13 @@ def test_prefill(hash_algo): req2 = make_request("2", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(manager.req_to_block_hashes[req2.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req2, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [6] + assert blocks.get_block_ids() == [[6]] # Although we only have 6 free blocks, we have 8 blocks in # the free block queue due to lazy removal. 
@@ -171,7 +171,7 @@ def test_prefill(hash_algo): len(computed_blocks.blocks) * 16, computed_blocks) # This block ID order also checks the eviction order. - assert blocks.get_block_ids() == [7, 8, 9, 10, 4, 5, 6, 3, 2, 1] + assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]] assert manager.block_pool.free_block_queue.num_free_blocks == 0 assert manager.block_pool.free_block_queue.free_list_head is None assert manager.block_pool.free_block_queue.free_list_tail is None @@ -208,7 +208,7 @@ def test_prefill_plp(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0_block_hashes = [b.block_hash for b in blocks.blocks] # Check full block metadata @@ -233,13 +233,13 @@ def test_prefill_plp(): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] for block in computed_blocks.blocks: assert block.ref_cnt == 2 @@ -277,11 +277,11 @@ def test_prefill_plp(): block_ids = blocks.get_block_ids() # Duplicate cached blocks have different ids but same hashes vs request #0 assert [b.block_hash for b in blocks.blocks] == req0_block_hashes - assert block_ids != [1, 2, 3, 4] + assert block_ids != [[1, 2, 3, 4]] # Request #2 block hashes are valid since request #0 hashes are. # Check block reference counts. - for block_id in block_ids: + for block_id in block_ids[0]: assert manager.block_pool.blocks[block_id].ref_cnt == 1 manager.free(req2) @@ -307,7 +307,7 @@ def test_decode(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] # Append slots without allocating a new block. req0.num_computed_tokens = 55 @@ -379,12 +379,12 @@ def test_evict(): # Touch the first 2 blocks. req2 = make_request("2", list(range(2 * 16 + 3))) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert computed_blocks.get_block_ids() == [1, 2] + assert computed_blocks.get_block_ids() == [[1, 2]] assert num_computed_tokens == 2 * 16 blocks = manager.allocate_slots(req2, 3, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [10] + assert blocks.get_block_ids() == [[10]] assert manager.block_pool.free_block_queue.num_free_blocks == 7 @@ -625,7 +625,7 @@ def test_mm_prefix_caching(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0.num_computed_tokens = 59 # Append slots without allocating a new block. @@ -686,7 +686,7 @@ def test_cache_key_salting(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0.num_computed_tokens = 59 # Append slots without allocating a new block. 
@@ -797,7 +797,7 @@ def test_reset_prefix_cache(): all_token_ids = full_block_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) blocks = manager.allocate_slots(req0, 55) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids @@ -808,7 +808,7 @@ def test_reset_prefix_cache(): blocks = manager.allocate_slots(req1, 7, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] # Failed to reset prefix cache because some blocks are not freed yet. assert not manager.reset_prefix_cache() diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 452fe1e37e2c..8bea032f656f 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -290,7 +290,6 @@ def test_kv_cache_events( log_stats=False, ) endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") - time.sleep(0.1) subscriber = MockSubscriber(endpoint, topic=publisher_config.topic, decode_type=KVEventBatch) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 5c116598ff3f..25bbcd901d6a 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -1,3 +1,4 @@ +# ruff: noqa: E501 # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -5,17 +6,22 @@ import json import re from enum import Enum -from typing import Any +from typing import TYPE_CHECKING, Any import jsonschema import pytest from pydantic import BaseModel +from tests.reasoning.utils import run_reasoning_extraction from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.platforms import current_platform +from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager from vllm.sampling_params import GuidedDecodingParams, SamplingParams +if TYPE_CHECKING: + from vllm.config import TokenizerMode + NGRAM_SPEC_CONFIG = { "model": "[ngram]", "num_speculative_tokens": 5, @@ -444,7 +450,7 @@ def test_structured_output( prompt = """ You have access to the following function to retrieve the weather in a city: - + { "name": "get_weather", "parameters": { @@ -455,7 +461,7 @@ def test_structured_output( } } } - + If a you choose to call a function ONLY reply in the following format: <{start_tag}={function_name}>{parameters}{end_tag} where @@ -476,7 +482,7 @@ def test_structured_output( - Always add your sources when using search results to answer the user query You are a helpful assistant. - + Given the previous instructions, what is the weather in New York City? \ Make the response as short as possible. 
""" @@ -514,6 +520,88 @@ def test_structured_output( f"{generated_text!r}\nError: {str(e)}") +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize( + "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 + [ + ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto", + "deepseek_r1", NGRAM_SPEC_CONFIG), + ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None), + ], +) +def test_structured_output_with_reasoning_matrices( + monkeypatch: pytest.MonkeyPatch, + guided_decoding_backend: str, + tokenizer_mode: TokenizerMode, + reasoning_parser: str, + model_name: str, + speculative_config: dict[str, Any] | None, +): + monkeypatch.setenv("VLLM_USE_V1", "1") + + if current_platform.is_tpu() and speculative_config: + pytest.skip("TPU does not support speculative decoding") + + # Use a single LLM instance for several scenarios to + # speed up the test suite. + llm = LLM( + model=model_name, + # Don't use eager execution on TPUs because we want to test for no + # recompilation at runtime + enforce_eager=bool(not current_platform.is_tpu()), + max_model_len=1024, + max_num_seqs=16, + guided_decoding_backend=guided_decoding_backend, + guided_decoding_disable_any_whitespace=True, + tokenizer_mode=tokenizer_mode, + reasoning_parser=reasoning_parser, + speculative_config=speculative_config, + ) + tokenizer = llm.get_tokenizer(None) + reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)( + tokenizer=tokenizer) + + reasoning_prompt = "Solve the following math problem step-by-step, then provide the final answer as JSON object with a single key 'result'. Make sure to correct your reasoning if there are any issue should it arise.\nProblem: What is 5 * 8 + 2?" # noqa: E501 + reasoning_schema = { + "type": "object", + "properties": { + "result": { + "type": "integer" + } + }, + "required": ["result"], + "additionalProperties": False + } + if "Qwen3" in model_name: + reasoning_prompt += "\n" + + sampling_params = SamplingParams( + temperature=0.1, + max_tokens=8192, + guided_decoding=GuidedDecodingParams(json=reasoning_schema), + ) + outputs = llm.generate( + [reasoning_prompt], + sampling_params=sampling_params, + use_tqdm=True, + ) + + assert outputs is not None + output = outputs[0] + assert output is not None and isinstance(output, RequestOutput) + prompt = output.prompt + generated_text = output.outputs[0].text + reasoning_content, content = run_reasoning_extraction( + reasoner, [generated_text]) + print( + f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}" + ) + + assert content is not None and reasoning_content is not None + output_json = json.loads(content) + jsonschema.validate(instance=output_json, schema=reasoning_schema) + + @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py new file mode 100644 index 000000000000..64da0d79bf33 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -0,0 +1,241 @@ +# SPDX-License-Identifier: Apache-2.0 +import filecmp +import shutil +import tempfile +from collections import defaultdict +from pathlib import Path + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig, VllmConfig +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from 
vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa + SharedStorageConnector) + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + +PROMPT_CONTEXT = "Hi " * 100 +PROMPTS = [ + PROMPT_CONTEXT + "Hello, my name is", + PROMPT_CONTEXT + "The capital of France is", +] + +SAMPLING_PARAMS = SamplingParams(temperature=0, max_tokens=20) + + +class TestSharedStorageConnector(SharedStorageConnector): + + def __init__(self, config: VllmConfig, role): + self.name = config.kv_transfer_config.kv_connector_extra_config["name"] + self._connector = SharedStorageConnector(config, role) + self.call_record: dict[str, int] = defaultdict(int) + # Use a unique temp file per connector + self._event_file = tempfile.gettempdir( + ) + f"/connector_{self.name}_events.log" + # Start with an empty file + with open(self._event_file, "w") as _: + pass + + def __getattribute__(self, name): + if name in ("_connector", "call_record", "name", "_event_file", + "__class__", "__dict__", "__getattribute__", + "__init__"): # avoid recursion + return object.__getattribute__(self, name) + if not hasattr(self._connector, name): + return object.__getattribute__(self, name) + attr = getattr(self._connector, name) + + # Intercept calls to the connector interface and write an event + # for each one to a file, which can be read back in the main test proc. + if callable(attr): + + def wrapper(*args, **kwargs): + self.call_record[name] += 1 + # Log the event as a line to the file + try: + with open(self._event_file, "a") as f: + f.write(name + "\n") + except Exception as e: + print(f"[ERROR] Could not log event {name} " + f"for {self.name}: {e}") + return attr(*args, **kwargs) + + return wrapper + return attr + + +KVConnectorFactory.register_connector("TestSharedStorageConnector", + TestSharedStorageConnector.__module__, + TestSharedStorageConnector.__name__) + + +# Helper function to compare directories recursively +def _compare_directories(dir1: Path, dir2: Path) -> bool: + """Compares two directories recursively for identical content.""" + dcmp = filecmp.dircmp(dir1, dir2) + if dcmp.left_only or dcmp.right_only or dcmp.diff_files: + print(f"Differences found between {dir1} and {dir2}:") + print(f" Left only: {dcmp.left_only}") + print(f" Right only: {dcmp.right_only}") + print(f" Different files: {dcmp.diff_files}") + return False + for sub_dir in dcmp.common_dirs: + if not _compare_directories(dir1 / sub_dir, dir2 / sub_dir): + return False + return True + + +def test_multi_shared_storage_connector_consistency(): + """ + Tests that MultiConnector with two SharedStorageConnectors saves + identical KV cache data to separate storage locations. 
+ """ + storage_1_path = Path("storage_1/") + storage_2_path = Path("storage_2/") + shutil.rmtree(storage_1_path, ignore_errors=True) + shutil.rmtree(storage_2_path, ignore_errors=True) + storage_1_path.mkdir() + storage_2_path.mkdir() + + # Configure MultiConnector with two SharedStorageConnectors + kv_transfer_config = KVTransferConfig( + kv_connector="MultiConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "connectors": [{ + "kv_connector": "TestSharedStorageConnector", + "kv_role": "kv_both", + "kv_connector_extra_config": { + "shared_storage_path": str(storage_1_path), + "name": "storage1", + } + }, { + "kv_connector": "TestSharedStorageConnector", + "kv_role": "kv_both", + "kv_connector_extra_config": { + "shared_storage_path": str(storage_2_path), + "name": "storage2", + } + }] + }, + ) + + llm = LLM( + model=MODEL_NAME, + enforce_eager=True, + gpu_memory_utilization=0.5, + kv_transfer_config=kv_transfer_config, + ) + # Run generation - this should trigger saving KV cache + _ = llm.generate(PROMPTS, SAMPLING_PARAMS) + + # --- Verification --- + + # Check that both storage directories were populated + local_subdirs = list(storage_1_path.iterdir()) + external_subdirs = list(storage_2_path.iterdir()) + + assert len( + local_subdirs + ) > 0, f"Local storage path {storage_1_path} is empty after generation." + assert len(external_subdirs) > 0, ( + f"External storage path {storage_2_path} is empty after generation.") + assert len(local_subdirs) == len(external_subdirs), ( + f"Mismatch in number of cache entries: " + f"Local={len(local_subdirs)}, External={len(external_subdirs)}") + + # The subdirectories should correspond to the prompt hashes + # Since prompts are the same, the hash directories should be the same name + local_subdir_names = sorted([d.name for d in local_subdirs]) + external_subdir_names = sorted([d.name for d in external_subdirs]) + assert local_subdir_names == external_subdir_names, ( + "Cache directory names do not match between local and external storage" + ) + + # Compare the contents of each corresponding cache directory + for subdir_name in local_subdir_names: + print(f"Comparing contents of cache directory: {subdir_name}") + assert _compare_directories(storage_1_path / subdir_name, + storage_2_path / subdir_name), \ + (f"Contents differ for cache directory '{subdir_name}' between " + f"{storage_1_path} and {storage_2_path}") + + events = get_connector_events() + # get_num_new_matched_tokens will be called on each connector in turn. + # neither of them have hits so update_state_after_alloc won't be called. + assert events["storage1"][:3] == [ + 'get_num_new_matched_tokens', 'build_connector_meta', + 'bind_connector_metadata' + ] + assert events["storage2"][:3] == [ + 'get_num_new_matched_tokens', 'build_connector_meta', + 'bind_connector_metadata' + ] + + # Reset prefix cache or else we'll just get the tokens back from there. + llm.reset_prefix_cache() + + # Run generation again - this should trigger loading from the first + # connector. + _ = llm.generate(PROMPTS, SAMPLING_PARAMS) + + events = get_connector_events() + # get_num_new_matched_tokens will return new tokens from the first + # connector so update_state_after_alloc will be called once blocks + # are allocated for the first connector. + # get_num_new_matched_tokens *won't* be called on the second connector + # in this case. 
+    assert events["storage1"][:4] == [
+        'get_num_new_matched_tokens', 'update_state_after_alloc',
+        'build_connector_meta', 'bind_connector_metadata'
+    ]
+    assert events["storage2"][:2] == [
+        'build_connector_meta', 'bind_connector_metadata'
+    ]
+
+    # Delete storage1 connector state
+    shutil.rmtree(storage_1_path)
+
+    # Reset prefix cache or else we'll just get the tokens back from there.
+    llm.reset_prefix_cache()
+
+    # Run generation again - this should trigger loading from the second
+    # connector.
+    _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
+
+    events = get_connector_events()
+    # get_num_new_matched_tokens will be called for the first connector but it
+    # won't have a hit so update_state_after_alloc won't be called.
+    # get_num_new_matched_tokens will also be called on the second connector,
+    # but it should have a hit so update_state_after_alloc will be called.
+    assert events["storage1"][:3] == [
+        'get_num_new_matched_tokens', 'build_connector_meta',
+        'bind_connector_metadata'
+    ]
+    assert events["storage2"][:4] == [
+        'get_num_new_matched_tokens', 'update_state_after_alloc',
+        'build_connector_meta', 'bind_connector_metadata'
+    ]
+
+    # Clean up
+    shutil.rmtree(storage_1_path)
+    shutil.rmtree(storage_2_path)
+
+
+def get_connector_events() -> dict[str, list[str]]:
+    # Read in connector events and reset the files.
+    import glob
+    event_files = glob.glob(tempfile.gettempdir() + "/connector_*_events.log")
+    connector_events = {}
+    for fname in event_files:
+        name = fname.split("connector_")[1].split("_events.log")[0]
+        try:
+            with open(fname, "r+") as f:
+                connector_events[name] = [
+                    line.strip() for line in f if line.strip()
+                ]
+                f.truncate(0)
+        except Exception as e:
+            print(f"[ERROR] Could not read connector events for {name}: {e}")
+
+    return connector_events
diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py
deleted file mode 100644
index 48419d8a2791..000000000000
--- a/tests/v1/test_stats.py
+++ /dev/null
@@ -1,302 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-from vllm.sampling_params import SamplingParams
-from vllm.v1.stats.common import RequestStats, RequestStatsUpdate
-
-
-def make_update(
-    request_id: str,
-    update_type: RequestStatsUpdate.Type,
-    monotonic_ts_s: float,
-    **kwargs,
-):
-    if update_type == RequestStatsUpdate.Type.INPUT_PROCESSED:
-        kwargs.setdefault("sampling_params", SamplingParams(n=1))
-        kwargs.setdefault("num_prompt_tokens", 10)
-    elif update_type == RequestStatsUpdate.Type.PREFILLING:
-        kwargs.setdefault("num_computed_tokens", 10)
-        kwargs.setdefault("num_cached_tokens", 10)
-    elif update_type == RequestStatsUpdate.Type.DETOKENIZED:
-        kwargs.setdefault("num_new_tokens", 10)
-    elif update_type == RequestStatsUpdate.Type.FINISHED:
-        kwargs.setdefault("finish_reason", "test_reason")
-
-    return RequestStatsUpdate(
-        request_id=request_id,
-        type=update_type,
-        monotonic_ts_s=monotonic_ts_s,
-        **kwargs,
-    )
-
-
-def test_invalid_request_update():
-    request_id = "test_request"
-    update_specific_required_fields = {
-        RequestStatsUpdate.Type.INPUT_PROCESSED: [
-            "sampling_params",
-            "num_prompt_tokens",
-        ],
-        RequestStatsUpdate.Type.PREFILLING: [
-            "num_computed_tokens",
-            "num_cached_tokens",
-        ],
-        RequestStatsUpdate.Type.DETOKENIZED: ["num_new_tokens"],
-        RequestStatsUpdate.Type.FINISHED: ["finish_reason"],
-    }
-
-    # Missing a required field should raise an assertion error.
- for update_type in RequestStatsUpdate.Type: - required_fields = update_specific_required_fields.get(update_type, []) - - # Try to miss one of the required fields. - kwargs = {field: object() for field in required_fields} - for field in required_fields: - copy_kwargs = kwargs.copy() - copy_kwargs.pop(field) - with pytest.raises(ValueError): - RequestStatsUpdate( - request_id=request_id, - type=update_type, - **copy_kwargs, - ) - - -def test_invalid_request_update_transition(): - # Test invalid transition type. - for src in RequestStatsUpdate.Type: - for dst in RequestStatsUpdate.Type: - if dst not in RequestStatsUpdate._VALID_TRANSITIONS[src]: - with pytest.raises(AssertionError): - RequestStatsUpdate.check_valid_update( - make_update( - update_type=dst, - request_id="test_request", - monotonic_ts_s=1, - ), - last_update_type=src, - last_updated_ts_s=0, - ) - else: - RequestStatsUpdate.check_valid_update( - make_update( - request_id="test_request", - update_type=dst, - monotonic_ts_s=1, - ), - last_update_type=src, - last_updated_ts_s=0, - ) - - # Test invalid timestamp. - with pytest.raises(AssertionError): - RequestStatsUpdate.check_valid_update( - make_update( - request_id="test_request", - update_type=RequestStatsUpdate.Type.ARRIVED, - monotonic_ts_s=1, - ), - last_update_type=None, - last_updated_ts_s=2, - ) - - -def test_lifecycle_updates(): - request_id = "test_request" - stats = RequestStats(request_id=request_id) - - # Test the below scenario: - arrived_ts = 0 - input_processed_ts = 1 - queued_ts = 2 - prefilling_ts = 3 - decoded_ts = 5 - detokenized_ts = 6 - decoded_2_ts = 7 - detokenized_2_ts = 8 - preempted_ts = 9 - resumed_ts = 10 - decoded_3_ts = 11 - detokenized_3_ts = 12 - finished_ts = 13 - - # Test ARRIVED - arrived_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.ARRIVED, - monotonic_ts_s=arrived_ts, - ) - stats.update_from(arrived_update) - assert stats.arrival_ts_s == arrived_ts - assert stats.last_updated_ts_s == arrived_ts - - # Test INPUT_PROCESSED - sampling_params = SamplingParams(n=1) - input_processed_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.INPUT_PROCESSED, - monotonic_ts_s=input_processed_ts, - sampling_params=sampling_params, - num_prompt_tokens=6, - ) - stats.update_from(input_processed_update) - assert stats.input_processor_end_ts_s == input_processed_ts - assert stats.last_updated_ts_s == input_processed_ts - assert stats.num_prompt_tokens == 6 - assert stats.sampling_params == sampling_params - - assert stats.first_token_ts_s is None - assert stats.prefill_ts_s is None - - # Test QUEUED - queued_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.QUEUED, - monotonic_ts_s=queued_ts, - ) - stats.update_from(queued_update) - assert stats.queued_ts_s == queued_ts - assert stats.last_updated_ts_s == queued_ts - - # Test PREFILLING - prefilling_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.PREFILLING, - monotonic_ts_s=prefilling_ts, - num_computed_tokens=3, - num_cached_tokens=1, - ) - stats.update_from(prefilling_update) - assert stats.prefill_ts_s == prefilling_ts - assert stats.num_computed_tokens == 3 - assert stats.num_cached_tokens == 1 - assert stats.queue_duration_s == prefilling_ts - queued_ts - - # Test DECODING - decoded_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DECODING, - monotonic_ts_s=decoded_ts, - ) - stats.update_from(decoded_update) - assert 
stats.last_updated_ts_s == decoded_ts - - # Test DETOKENIZED - detokenized_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DETOKENIZED, - monotonic_ts_s=detokenized_ts, - num_new_tokens=1, - ) - stats.update_from(detokenized_update) - assert stats.last_updated_ts_s == detokenized_ts - assert stats.num_output_tokens == 1 - # Since arrival - assert stats.first_token_latency_s == detokenized_ts - arrived_ts - # Since first scheduled - assert stats.prefill_latency_s == detokenized_ts - prefilling_ts - - # Test another DECODING and DETOKENIZED should - # yield correct inter token latency - decoded_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DECODING, - monotonic_ts_s=decoded_2_ts, - ) - stats.update_from(decoded_update) - - detokenized_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DETOKENIZED, - monotonic_ts_s=detokenized_2_ts, - num_new_tokens=1, - ) - stats.update_from(detokenized_update) - assert stats.output_token_latency_s_lst == [ - detokenized_2_ts - detokenized_ts, - ] - assert stats.num_output_tokens == 2 - - # Test PREEMPTED - preempted_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.PREEMPTED, - monotonic_ts_s=preempted_ts, - ) - stats.update_from(preempted_update) - assert stats.last_updated_ts_s == preempted_ts - assert stats.preempted_ts_s_lst == [preempted_ts] - # States should be reset - assert stats.num_computed_tokens == 0 - assert stats.num_cached_tokens == 0 - # These states should not be reset - assert stats.num_output_tokens == 2 - assert stats.output_token_latency_s_lst == [ - detokenized_2_ts - detokenized_ts, - ] - assert stats.prefill_latency_s == prefilling_ts - arrived_ts - assert stats.num_prompt_tokens == 6 - assert stats.prefill_start_ts_s_lst == [prefilling_ts] - - # Test resumed - resumed_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.PREFILLING, - monotonic_ts_s=resumed_ts, - num_computed_tokens=6, - num_cached_tokens=2, - ) - stats.update_from(resumed_update) - # prefill timestamp should not be updated since it's a resumed prefill - assert stats.prefill_ts_s == prefilling_ts - assert stats.num_computed_tokens == 6 - assert stats.num_cached_tokens == 2 - assert stats.prefill_start_ts_s_lst == [ - prefilling_ts, - resumed_ts, - ] - assert stats.last_updated_ts_s == resumed_ts - - # Test another DECODED/DETOKENIZED should yield correct first token latency. 
- decoded_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DECODING, - monotonic_ts_s=decoded_3_ts, - ) - detokenized_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DETOKENIZED, - monotonic_ts_s=detokenized_3_ts, - num_new_tokens=1, - ) - stats.update_from(decoded_update) - stats.update_from(detokenized_update) - assert stats.first_token_ts_s == detokenized_ts - arrived_ts - assert stats.num_output_tokens == 3 - assert stats.output_token_latency_s_lst == [ - detokenized_2_ts - detokenized_ts, - detokenized_3_ts - detokenized_2_ts, - ] - - # Test FINISHED - finished_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.FINISHED, - monotonic_ts_s=finished_ts, - finish_reason="test_reason", - ) - stats.update_from(finished_update) - assert stats.last_updated_ts_s == finished_ts - assert stats.e2e_latency_s == finished_ts - arrived_ts - assert stats.inference_latency_s == finished_ts - prefilling_ts - assert stats.prefill_latency_s == detokenized_ts - prefilling_ts - assert stats.decode_latency_s == finished_ts - detokenized_ts - assert stats.first_token_latency_s == detokenized_ts - arrived_ts - assert stats.queue_duration_s == prefilling_ts - queued_ts - assert stats.is_finished - assert stats.finish_reason == "test_reason" - - # TODO(rickyx): Add model forward/execute time. - assert stats.model_forward_duration_s == 0.0 - assert stats.model_execute_duration_s == 0.0 diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 7b1359c8576f..638f5bedcfca 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -9,9 +9,11 @@ from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.worker.gpu_input_batch import (BlockTable, CachedRequestState, - InputBatch) +from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 @@ -22,6 +24,27 @@ MAX_NUM_PROMPT_TOKENS = 64 +def get_kv_cache_config() -> KVCacheConfig: + return KVCacheConfig( + num_blocks=10, + tensors={ + "layer.0": KVCacheTensor(size=1024), + }, + kv_cache_groups=[ + KVCacheGroupSpec( + layer_names=["layer.0"], + kv_cache_spec=FullAttentionSpec( + block_size=1, + num_kv_heads=1, + head_size=16, + dtype=torch.float16, + use_mla=False, + ), + ), + ], + ) + + def _compare_objs(obj1, obj2): attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a))) attr_names = set([ @@ -41,6 +64,10 @@ def _compare_objs(obj1, obj2): elif isinstance(a, np.ndarray): if np.allclose(a, b): is_same = True + elif isinstance(a, MultiGroupBlockTable): + for a_i, b_i in zip(a.block_tables, b.block_tables): + _compare_objs(a_i, b_i) + is_same = True elif isinstance(a, (BlockTable, SamplingMetadata)): _compare_objs(a, b) is_same = True # if we make it here must be same @@ -198,7 +225,7 @@ def _construct_cached_request_state(req_id_suffix: int): sampling_params=_create_sampling_params(), mm_inputs=[], mm_positions=[], - block_ids=[], + block_ids=[[]], generator=None, num_computed_tokens=len(output_token_ids), output_token_ids=output_token_ids, @@ -220,11 +247,11 @@ def test_sampling_metadata_in_input_batch(device: 
str, batch_size: int): input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + kv_cache_config=get_kv_cache_config(), ) reqs: list[CachedRequestState] = [] req_id_reqs = {} @@ -310,20 +337,20 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + kv_cache_config=get_kv_cache_config(), ) ref_input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + kv_cache_config=get_kv_cache_config(), ) reqs: list[CachedRequestState] = [] diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 725747294fd8..e44660525763 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,15 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 -import weakref import pytest -import torch -from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, + SchedulerConfig, VllmConfig) from vllm.sampling_params import SamplingParams from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) -from vllm.v1.kv_cache_interface import FullAttentionSpec +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -17,13 +18,34 @@ def initialize_kv_cache(runner: GPUModelRunner): """ Only perform necessary steps in GPUModelRunner.initialize_kv_cache() """ - kv_cache_spec = FullAttentionSpec(block_size=16, - num_kv_heads=1, - head_size=64, - dtype=torch.float16, - use_mla=False) - runner.attn_metadata_builder = runner.attn_backend.get_builder_cls()( - weakref.proxy(runner), kv_cache_spec, runner.input_batch.block_table) + kv_cache_config = KVCacheConfig( + num_blocks=10, + tensors={ + "layer.0": KVCacheTensor(size=1024), + }, + kv_cache_groups=[ + KVCacheGroupSpec( + layer_names=["layer.0"], + kv_cache_spec=FullAttentionSpec( + block_size=16, + num_kv_heads=runner.model_config.get_num_kv_heads( + runner.parallel_config), + head_size=runner.model_config.get_head_size(), + dtype=runner.kv_cache_dtype, + use_mla=False, + )) + ]) + runner.kv_cache_config = kv_cache_config + runner.input_batch = InputBatch( + max_num_reqs=runner.max_num_reqs, + max_model_len=runner.max_model_len, + max_num_batched_tokens=runner.max_num_tokens, + device=runner.device, + pin_memory=runner.pin_memory, + vocab_size=runner.model_config.get_vocab_size(), + kv_cache_config=kv_cache_config, + ) + runner.initialize_attn_backend(kv_cache_config) @pytest.fixture @@ -48,10 +70,12 @@ def model_runner(): swap_space=0, cache_dtype="auto", ) + parallel_config = ParallelConfig() vllm_config = VllmConfig( model_config=model_config, cache_config=cache_config, scheduler_config=scheduler_config, + parallel_config=parallel_config, ) device = "cuda" @@ -73,7 +97,7 @@ def 
_schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), - block_ids=[0], + block_ids=[[0]], num_computed_tokens=0, lora_request=None, )) @@ -111,13 +135,14 @@ def _is_sampling_metadata_changed(model_runner, def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: req_index = model_runner.input_batch.req_id_to_index[req_id] - block_table = model_runner.input_batch.block_table + block_table = model_runner.input_batch.block_table[0] req_state = model_runner.requests[req_id] - if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids): + if block_table.num_blocks_per_row[req_index] != len( + req_state.block_ids[0]): return False num_blocks = block_table.num_blocks_per_row[req_index] return (block_table.block_table_np[req_index, :num_blocks] == - req_state.block_ids).all() + req_state.block_ids[0]).all() def test_update_states_new_request(model_runner): @@ -200,7 +225,7 @@ def test_update_states_request_resumed(model_runner): req_id=req_id, resumed_from_preemption=False, new_token_ids=[], - new_block_ids=[], + new_block_ids=[[]], num_computed_tokens=0, ) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index 1b797074096e..9164f8595346 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -2,7 +2,7 @@ gptq_marlin, robertgshaw2/zephyr-7b-beta-channelwise-gptq, main gptq_marlin, TheBloke/Llama-2-7B-GPTQ, main gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, main gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit--1g-actorder_True -gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True +#gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True gptq_marlin, TechxGenus/gemma-1.1-2b-it-GPTQ, main gptq, robertgshaw2/zephyr-7b-beta-channelwise-gptq, main gptq, TheBloke/Llama-2-7B-GPTQ, main diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index c81300db5657..e74d139ab980 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -254,14 +254,8 @@ def rotary_embedding( cos_sin_cache: torch.Tensor, is_neox: bool, ) -> None: - # TODO: Remove this contiguous call when the kernel is updated to support tensor slices - query_contiguous = query.contiguous() - key_contiguous = key.contiguous() if key is not None else None - torch.ops._C.rotary_embedding(positions, query_contiguous, key_contiguous, - head_size, cos_sin_cache, is_neox) - query.copy_(query_contiguous) - if key is not None: - key.copy_(key_contiguous) + torch.ops._C.rotary_embedding(positions, query, key, head_size, + cos_sin_cache, is_neox) def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, @@ -269,16 +263,9 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, cos_sin_cache: torch.Tensor, is_neox: bool, rot_dim: int, cos_sin_cache_offsets: torch.Tensor) -> None: - # TODO: Remove this contiguous call when the kernel is updated to support tensor slices - query_contiguous = query.contiguous() - key_contiguous = key.contiguous() if key is not None else None - torch.ops._C.batched_rotary_embedding(positions, query_contiguous, - key_contiguous, head_size, + torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox, rot_dim, cos_sin_cache_offsets) - query.copy_(query_contiguous) - if key is not None: - key.copy_(key_contiguous) # layer norm ops diff --git a/vllm/attention/ops/triton_unified_attention.py 
b/vllm/attention/ops/triton_unified_attention.py index 8c0cf9267f35..241e84ca669d 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -56,11 +56,11 @@ def kernel_unified_attention_2d( stride_k_cache_0: tl.int64, # int stride_k_cache_1: tl.int64, # int stride_k_cache_2: tl.int64, # int - stride_k_cache_3: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int stride_v_cache_0: tl.int64, # int stride_v_cache_1: tl.int64, # int stride_v_cache_2: tl.int64, # int - stride_v_cache_3: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int query_start_len_ptr, # [num_seqs+1] BLOCK_Q: tl.constexpr, # int num_seqs: tl.int32, @@ -268,6 +268,10 @@ def unified_attention( assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" + block_size = v.shape[1] + assert q.element_size() >= 2 or block_size >= 32, \ + "Block size must be at least 32 for fp8" + use_alibi_slopes = alibi_slopes is not None block_size = v.shape[1] diff --git a/vllm/config.py b/vllm/config.py index c5d61405c839..dddfdabd126a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -297,7 +297,7 @@ class ModelConfig: - 1K -> 1024\n - 25.6k -> 25,600""" spec_target_max_model_len: Optional[int] = None - """Specify the the maximum length for spec decoding draft models.""" + """Specify the maximum length for spec decoding draft models.""" quantization: Optional[QuantizationMethods] = None """Method used to quantize the weights. If `None`, we first check the `quantization_config` attribute in the model config file. If that is @@ -906,12 +906,17 @@ def _verify_quantization(self) -> None: def _verify_cuda_graph(self) -> None: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) + # CUDAGraph capture not supported for enc-dec models and mllama on ROCm ROCM_UNSUPPORTED_MODELS = ['mllama'] - if (self.hf_config.model_type in ROCM_UNSUPPORTED_MODELS - and not self.enforce_eager and current_platform.is_rocm()): + unsupported_rocm = (self.hf_config.model_type + in ROCM_UNSUPPORTED_MODELS + or self.is_encoder_decoder) + + if (unsupported_rocm and not self.enforce_eager + and current_platform.is_rocm()): logger.warning( "CUDA graph is not supported for %s on ROCm yet, fallback " - "to the eager mode.", self.hf_config.model_type) + "to eager mode.", self.hf_config.model_type) self.enforce_eager = True def _verify_bnb_config(self) -> None: @@ -2332,7 +2337,7 @@ class SpeculativeConfig: `TypicalAcceptanceSampler`.""" speculative_token_tree: Optional[str] = None - """Specifies the tree structure for speculative token generation. + """Specifies the tree structure for speculative token generation. 
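# Illustrative sketch, not part of this diff: the assertion added to
# unified_attention above, in standalone form. The kernel reads the paged-KV
# block size from v.shape[1] and only supports fp8 (1-byte) queries when that
# block size is at least 32. The tensor shapes below are hypothetical.
import torch

def _check_fp8_block_size(q: torch.Tensor, v: torch.Tensor) -> None:
    block_size = v.shape[1]
    assert q.element_size() >= 2 or block_size >= 32, \
        "Block size must be at least 32 for fp8"

q_fp8 = torch.zeros(8, 4, 64, dtype=torch.float8_e4m3fn)
v_small_blocks = torch.zeros(4, 16, 4, 64, dtype=torch.float8_e4m3fn)
# _check_fp8_block_size(q_fp8, v_small_blocks)  # would raise: block size 16 < 32 with fp8 q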
""" # required configuration params passed from engine target_model_config: ModelConfig = field(default=None, @@ -3945,11 +3950,12 @@ def init_with_cudagraph_sizes(self, self.cudagraph_capture_sizes = cudagraph_capture_sizes else: # de-duplicate the sizes provided by the config - self.cudagraph_capture_sizes = list( - set(self.cudagraph_capture_sizes)) - logger.info(("cudagraph sizes specified by model runner" - " %s is overridden by config %s"), - cudagraph_capture_sizes, self.cudagraph_capture_sizes) + dedup_sizes = list(set(self.cudagraph_capture_sizes)) + if len(dedup_sizes) < len(self.cudagraph_capture_sizes): + logger.info(("cudagraph sizes specified by model runner" + " %s is overridden by config %s"), + cudagraph_capture_sizes, dedup_sizes) + self.cudagraph_capture_sizes = dedup_sizes computed_compile_sizes = [] if self.compile_sizes is not None: @@ -4024,7 +4030,7 @@ class VllmConfig: """LoRA configuration.""" speculative_config: Optional[SpeculativeConfig] = None """Speculative decoding configuration.""" - decoding_config: Optional[DecodingConfig] = None + decoding_config: DecodingConfig = field(default_factory=DecodingConfig) """Decoding configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 1141a8e53c3b..29c6a70c4d26 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -130,6 +130,7 @@ def __init__( self._endpoint = endpoint self._replay_endpoint = replay_endpoint self._hwm = hwm + self._socket_setup() # Payload self._seq_gen = count() @@ -207,7 +208,6 @@ def _socket_setup(self) -> None: def _publisher_thread(self) -> None: """Background thread that processes the event queue.""" self._pack = msgspec.msgpack.Encoder() - self._socket_setup() assert self._pub is not None # narrows type for mypy diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 6766d5a24542..f998f5dd7b15 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -110,3 +110,8 @@ def create_connector_v1( "NixlConnector", "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector", "NixlConnector") + +KVConnectorFactory.register_connector( + "MultiConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.multi_connector", + "MultiConnector") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 03c99f20e775..9fdb5340f0e2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -22,7 +22,6 @@ import enum from abc import ABC, abstractmethod -from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional import torch @@ -48,7 +47,6 @@ class KVConnectorRole(enum.Enum): WORKER = 1 -@dataclass class KVConnectorMetadata: """ Abstract Metadata used to communicate between the diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py new file mode 100644 index 000000000000..cc4a7fbadf5c --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 +import copy +from typing import TYPE_CHECKING, Any, Optional + +import torch + +from vllm.config import KVTransferConfig, VllmConfig +from 
vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +class MultiKVConnectorMetadata(tuple[KVConnectorMetadata, ...], + KVConnectorMetadata): + pass + + +class MultiConnector(KVConnectorBase_V1): + """ + A wrapper for using multiple KVConnectors at the same time. + + The current logic is: + - Load KV from the first connector that advertises available tokens from + get_num_new_matched_tokens(), based on the order in the config. + - Save to all connectors. + """ + + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + super().__init__(vllm_config=vllm_config, role=role) + self._connectors = [] + ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( + "connectors") + assert ktcs is not None + for ktc in ktcs: + temp_config = copy.copy(vllm_config) + temp_config.kv_transfer_config = KVTransferConfig(**ktc) + self._connectors.append( + KVConnectorFactory.create_connector_v1(temp_config, role)) + + # A mapping from request id to the connector that is assigned to it. + self._requests_to_connector: dict[str, KVConnectorBase_V1] = {} + + # Keeps track of *additional* remaining async saves (beyond 1) to be + # finished per request. Not needed for async loads since we only allow + # a single connector to load. + self._extra_async_saves: dict[str, int] = {} + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + for c in self._connectors: + c.register_kv_caches(kv_caches) + + # We must override the base class method here because we need to bind + # the metadata to each connector in the order of the connectors in the + # MultiKVConnectorMetadata. + def bind_connector_metadata( + self, connector_metadata: KVConnectorMetadata) -> None: + assert isinstance(connector_metadata, MultiKVConnectorMetadata) + for c, cm in zip(self._connectors, connector_metadata): + c.bind_connector_metadata(cm) + + def clear_connector_metadata(self) -> None: + for c in self._connectors: + c.clear_connector_metadata() + + # ============================== + # Worker-side methods + # ============================== + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + for c in self._connectors: + c.start_load_kv(forward_context, **kwargs) + + def wait_for_layer_load(self, layer_name: str) -> None: + for c in self._connectors: + c.wait_for_layer_load(layer_name) + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + for c in self._connectors: + c.save_kv_layer(layer_name, kv_layer, attn_metadata, **kwargs) + + def wait_for_save(self): + for c in self._connectors: + c.wait_for_save() + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + finished_recving: set[str] = set() + finished_sending: set[str] = set() + for c in self._connectors: + recving, sending = c.get_finished(finished_req_ids) + if not recving and not sending: + continue + # Aggregate finished recving request ids. 
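# Illustrative walk-through, not part of this diff, of the draining logic just
# below: suppose two connectors both async-save request "req-0", so
# request_finished() stored _extra_async_saves["req-0"] = 1. The request is
# only reported as finished sending once both connectors have reported it,
# whether that happens in the same get_finished() call or in later ones.
extra_async_saves = {"req-0": 1}
finished_sending: set[str] = set()
for req_id in ("req-0", "req-0"):  # two reports of the same request over time
    extra = extra_async_saves.get(req_id)
    if extra is None:
        finished_sending.add(req_id)       # last outstanding save completed
    elif extra == 1:
        del extra_async_saves[req_id]      # one more report still expected
    else:
        extra_async_saves[req_id] = extra - 1
assert finished_sending == {"req-0"}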
+ finished_recving.update(recving or ()) + # Aggregate finished sending request ids - only include + # once we've drained the "extra" count (for cases where + # more than one connector is async-saving the same request). + for req_id in sending or (): + extra_pending = self._extra_async_saves.get(req_id) + if extra_pending is None: + finished_sending.add(req_id) + continue + assert extra_pending > 0 + if extra_pending == 1: + del self._extra_async_saves[req_id] + else: + self._extra_async_saves[req_id] = extra_pending - 1 + + return finished_recving or None, finished_sending or None + + # ============================== + # Scheduler-side methods + # ============================== + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> tuple[int, bool]: + for c in self._connectors: + toks, load_async = c.get_num_new_matched_tokens( + request, num_computed_tokens) + # The first connector that has new matched tokens will be assigned + # to this request. + if toks > 0: + self._requests_to_connector[request.request_id] = c + return toks, load_async + return 0, False + + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int): + # If the request is not assigned to any connector, we do nothing. + if request.request_id not in self._requests_to_connector: + return + # We assume that the request is assigned to only one connector. + c = self._requests_to_connector.pop(request.request_id) + c.update_state_after_alloc(request, blocks, num_external_tokens) + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput) -> MultiKVConnectorMetadata: + return MultiKVConnectorMetadata( + c.build_connector_meta(scheduler_output) for c in self._connectors) + + def request_finished( + self, + request: "Request", + blocks: "KVCacheBlocks", + ) -> tuple[bool, Optional[dict[str, Any]]]: + async_saves = 0 + kv_txfer_params = None + for c in self._connectors: + async_save, txfer_params = c.request_finished(request, blocks) + if async_save: + async_saves += 1 + if txfer_params is not None: + if kv_txfer_params is not None: + #TODO we can probably change this to merge the dicts here, + # checking for key clashes. + raise RuntimeError( + "Only one connector can produce KV transfer params") + kv_txfer_params = txfer_params + if async_saves > 1: + self._extra_async_saves[request.request_id] = async_saves - 1 + return async_saves > 0, kv_txfer_params diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index abd1ea2bea82..c0c03efcdbf4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -21,7 +21,7 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) from vllm.logger import init_logger -from vllm.utils import round_down +from vllm.utils import make_zmq_path, make_zmq_socket, round_down from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus @@ -379,7 +379,7 @@ def _nixl_handshake_listener(metadata: NixlAgentMetadata, # hack to keeps us moving. We will switch when moving to etcd # or where we have a single ZMQ socket in the scheduler. 
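# Illustrative sketch, not part of this diff: how the MultiConnector added
# above might be configured. MultiConnector.__init__ reads the "connectors"
# list from kv_connector_extra_config and builds a KVTransferConfig from each
# entry; the nested entries and any extra-config keys are placeholders.
from vllm.config import KVTransferConfig

kv_transfer_config = KVTransferConfig(
    kv_connector="MultiConnector",
    kv_role="kv_both",
    kv_connector_extra_config={
        "connectors": [
            {"kv_connector": "NixlConnector", "kv_role": "kv_both"},
            {"kv_connector": "SharedStorageConnector", "kv_role": "kv_both"},
        ],
    },
)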
port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + rank - path = f"tcp://{host}:{port}" + path = make_zmq_path("tcp", host, port) logger.debug("Starting listening on path: %s", path) with zmq_ctx(zmq.ROUTER, path) as sock: ready_event.set() @@ -397,7 +397,7 @@ def _nixl_handshake(self, host: str, port: int): # NOTE(rob): we need each rank to have a unique port. This is # a hack to keep us moving. We will switch when moving to etcd # or where we have a single ZMQ socket in the scheduler. - path = f"tcp://{host}:{port + self.rank}" + path = make_zmq_path("tcp", host, port + self.rank) logger.debug("Querying metadata on path: %s", path) with zmq_ctx(zmq.REQ, path) as sock: # Send query for the request. @@ -741,20 +741,16 @@ def _get_block_descs_ids(self, engine_id: str, def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: """Context manager for a ZMQ socket""" + if socket_type not in (zmq.ROUTER, zmq.REQ): + raise ValueError(f"Unexpected socket type: {socket_type}") + ctx: Optional[zmq.Context] = None try: ctx = zmq.Context() # type: ignore[attr-defined] - - if socket_type == zmq.ROUTER: - socket = ctx.socket(zmq.ROUTER) - socket.bind(addr) - elif socket_type == zmq.REQ: - socket = ctx.socket(zmq.REQ) - socket.connect(addr) - else: - raise ValueError(f"Unexpected socket type: {socket_type}") - - yield socket + yield make_zmq_socket(ctx=ctx, + path=addr, + socket_type=socket_type, + bind=socket_type == zmq.ROUTER) finally: if ctx is not None: ctx.destroy(linger=0) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 0fedb6fd5ed9..0421a65a2c81 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -288,7 +288,7 @@ def build_connector_meta( for new_req in scheduler_output.scheduled_new_reqs: if new_req.req_id in self._requests_need_load: meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, + block_ids=new_req.block_ids[0], block_size=self._block_size, is_store=False) total_need_load += 1 @@ -299,7 +299,7 @@ def build_connector_meta( # the original prompt tokens. if not self._found_match_for_request(new_req): meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, + block_ids=new_req.block_ids[0], block_size=self._block_size, is_store=True) @@ -319,7 +319,7 @@ def build_connector_meta( # NOTE(rob): For resumed req, new_block_ids is all # of the block_ids for the request. 
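# Note (illustration, not part of this diff): request block IDs are now
# grouped per KV cache group, which is why the connector indexes block_ids[0]
# above and just below, and why the tests earlier use literals like [[0]].
# For a single-group model, what used to be block_ids = [3, 7, 9] is now
# block_ids = [[3, 7, 9]], and block_ids[0] recovers the flat per-group list.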
- block_ids = cached_req.new_block_ids + block_ids = cached_req.new_block_ids[0] meta.add_request(token_ids=token_ids, block_ids=block_ids, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 2041a54e8c0d..51c519d8f862 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -23,6 +23,7 @@ """ import contextlib import gc +import importlib.util import pickle import weakref from collections import namedtuple @@ -42,7 +43,7 @@ from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname, - supports_custom_op) + run_once, supports_custom_op) @dataclass @@ -936,9 +937,49 @@ def init_distributed_environment( "world group already initialized with a different world size") +PPLX_DID_INIT: bool = False + + +@run_once +def pplx_init(rank, world_size): + has_pplx = importlib.util.find_spec("pplx_kernels") is not None + + if has_pplx and world_size > 1: + from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, + nvshmem_get_unique_id, nvshmem_init) + try: + global PPLX_DID_INIT + logger.debug( + "Initialize NVSHMEM for PPLX kernels: rank=%d, " + "world size=%d", rank, world_size) + uid = nvshmem_get_unique_id( + ) if rank == 0 else nvshmem_alloc_empty_unique_id() + uid_gpu = uid.cuda() + get_world_group().broadcast(uid_gpu, src=0) + uid = uid_gpu.to(device='cpu') + logger.debug("PPLX NVSHMEM UID = %s", uid) + nvshmem_init(uid, rank, world_size) + PPLX_DID_INIT = True + except Exception as ex: + logger.error("Failed to initialize NVSHMEM for PPLX: %s", ex) + + +@run_once +def pplx_finalize(): + global PPLX_DID_INIT + if PPLX_DID_INIT: + from pplx_kernels.nvshmem import nvshmem_finalize + logger.debug("PPLX NVSHMEM finalize") + from vllm.model_executor.layers.fused_moe.layer import ( + _all_to_all_cache) + _all_to_all_cache.destroy() + nvshmem_finalize() + + def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, + enable_expert_parallel: bool = False, backend: Optional[str] = None, ) -> None: """ @@ -1041,10 +1082,14 @@ def initialize_model_parallel( _DP.rank_in_group, _PP.rank_in_group, _TP.rank_in_group, _EP.rank_in_group) + if enable_expert_parallel: + pplx_init(rank, world_size) + def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, + enable_expert_parallel: bool = False, backend: Optional[str] = None, ) -> None: """Helper to initialize model parallel groups if they are not initialized, @@ -1055,7 +1100,8 @@ def ensure_model_parallel_initialized( get_world_group().device_group) if not model_parallel_is_initialized(): initialize_model_parallel(tensor_model_parallel_size, - pipeline_model_parallel_size, backend) + pipeline_model_parallel_size, + enable_expert_parallel, backend) return assert ( @@ -1133,6 +1179,9 @@ def get_tensor_model_parallel_rank(): def destroy_model_parallel(): """Set the groups to none and destroy them.""" global _TP + + pplx_finalize() + if _TP: _TP.destroy() _TP = None diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 68983b91b2be..6bb323d79d64 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -23,7 +23,7 @@ import vllm.envs as envs from vllm.logger import init_logger -from vllm.utils import get_tcp_uri +from vllm.utils import get_tcp_uri, is_torch_equal_or_newer logger = init_logger(__name__) @@ -362,12 +362,11 @@ def 
stateless_destroy_torch_distributed_process_group( Destroy ProcessGroup returned by stateless_init_torch_distributed_process_group(). """ - # Lazy import for non-CUDA backends. - try: - # pytorch <= 2.6 + if is_torch_equal_or_newer("2.7"): + pg.shutdown() + else: + # Lazy import for non-CUDA backends. from torch.distributed.distributed_c10d import _shutdown_backend _shutdown_backend(pg) - except ImportError: - # pytorch >= 2.7 - pg.shutdown() + _unregister_process_group(pg.group_name) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 6f5514a6f801..adfacf2b4719 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -387,7 +387,6 @@ def resolve_hf_chat_template( def _resolve_chat_template_content_format( chat_template: Optional[str], tools: Optional[list[dict[str, Any]]], - given_format: ChatTemplateContentFormatOption, tokenizer: AnyTokenizer, *, model_config: ModelConfig, @@ -408,7 +407,7 @@ def _resolve_chat_template_content_format( detected_format = ("string" if jinja_text is None else _detect_content_format(jinja_text, default="string")) - return detected_format if given_format == "auto" else given_format + return detected_format @lru_cache @@ -451,7 +450,6 @@ def resolve_chat_template_content_format( detected_format = _resolve_chat_template_content_format( chat_template, tools, - given_format, tokenizer, model_config=model_config, ) @@ -462,7 +460,8 @@ def resolve_chat_template_content_format( detected_format=detected_format, ) - return detected_format + return detected_format if given_format == "auto" else given_format + ModalityStr = Literal["image", "audio", "video", "image_embeds"] diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5b3df0faccf6..0ab6fcdca1a4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,8 +17,10 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus +from json import JSONDecodeError from typing import Annotated, Optional, Union +import prometheus_client import uvloop from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError @@ -305,15 +307,18 @@ async def validate_json_request(raw_request: Request): content_type = raw_request.headers.get("content-type", "").lower() media_type = content_type.split(";", maxsplit=1)[0] if media_type != "application/json": - raise HTTPException( - status_code=HTTPStatus.UNSUPPORTED_MEDIA_TYPE, - detail="Unsupported Media Type: Only 'application/json' is allowed" - ) + raise RequestValidationError(errors=[ + "Unsupported Media Type: Only 'application/json' is allowed" + ]) router = APIRouter() +class PrometheusResponse(Response): + media_type = prometheus_client.CONTENT_TYPE_LATEST + + def mount_metrics(app: FastAPI): # Lazy import for prometheus multiprocessing. # We need to set PROMETHEUS_MULTIPROC_DIR environment variable @@ -332,6 +337,10 @@ def mount_metrics(app: FastAPI): registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) + # `response_class=PrometheusResponse` is needed to return an HTTP response + # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8" + # instead of the default "application/json" which is incorrect. 
+ # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364 Instrumentator( excluded_handlers=[ "/metrics", @@ -342,7 +351,7 @@ def mount_metrics(app: FastAPI): "/server_info", ], registry=registry, - ).add().instrument(app).expose(app) + ).add().instrument(app).expose(app, response_class=PrometheusResponse) # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) @@ -401,11 +410,11 @@ def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client -@router.get("/health") -async def health(raw_request: Request) -> JSONResponse: +@router.get("/health", response_class=Response) +async def health(raw_request: Request) -> Response: """Health check.""" await engine_client(raw_request).check_health() - return JSONResponse(content={}, status_code=200) + return Response(status_code=200) @router.get("/load") @@ -427,18 +436,42 @@ async def get_server_load_metrics(request: Request): content={'server_load': request.app.state.server_load_metrics}) -@router.api_route("/ping", methods=["GET", "POST"]) -async def ping(raw_request: Request) -> JSONResponse: +@router.get("/ping", response_class=Response) +@router.post("/ping", response_class=Response) +async def ping(raw_request: Request) -> Response: """Ping check. Endpoint required for SageMaker""" return await health(raw_request) -@router.post("/tokenize", dependencies=[Depends(validate_json_request)]) +@router.post("/tokenize", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_FOUND.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_IMPLEMENTED.value: { + "model": ErrorResponse + }, + }) @with_cancellation async def tokenize(request: TokenizeRequest, raw_request: Request): handler = tokenization(raw_request) - generator = await handler.create_tokenize(request, raw_request) + try: + generator = await handler.create_tokenize(request, raw_request) + except NotImplementedError as e: + raise HTTPException(status_code=HTTPStatus.NOT_IMPLEMENTED.value, + detail=str(e)) from e + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -448,12 +481,31 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): assert_never(generator) -@router.post("/detokenize", dependencies=[Depends(validate_json_request)]) +@router.post("/detokenize", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_FOUND.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation async def detokenize(request: DetokenizeRequest, raw_request: Request): handler = tokenization(raw_request) - generator = await handler.create_detokenize(request, raw_request) + try: + generator = await handler.create_detokenize(request, raw_request) + except OverflowError as e: + raise RequestValidationError(errors=[str(e)]) from e + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), 
status_code=generator.code) @@ -478,7 +530,23 @@ async def show_version(): @router.post("/v1/chat/completions", - dependencies=[Depends(validate_json_request)]) + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: { + "content": { + "text/event-stream": {} + } + }, + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_FOUND.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + } + }) @with_cancellation @load_aware_call async def create_chat_completion(request: ChatCompletionRequest, @@ -500,7 +568,24 @@ async def create_chat_completion(request: ChatCompletionRequest, return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/completions", dependencies=[Depends(validate_json_request)]) +@router.post("/v1/completions", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: { + "content": { + "text/event-stream": {} + } + }, + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_FOUND.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_completion(request: CompletionRequest, raw_request: Request): @@ -509,7 +594,15 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Completions API") - generator = await handler.create_completion(request, raw_request) + try: + generator = await handler.create_completion(request, raw_request) + except OverflowError as e: + raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, + detail=str(e)) from e + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -519,7 +612,16 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/embeddings", dependencies=[Depends(validate_json_request)]) +@router.post("/v1/embeddings", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_embedding(request: EmbeddingRequest, raw_request: Request): @@ -566,7 +668,16 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): assert_never(generator) -@router.post("/pooling", dependencies=[Depends(validate_json_request)]) +@router.post("/pooling", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_pooling(request: PoolingRequest, raw_request: Request): @@ -606,7 +717,16 @@ async def create_classify(request: ClassificationRequest, assert_never(generator) -@router.post("/score", dependencies=[Depends(validate_json_request)]) +@router.post("/score", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: 
{ + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_score(request: ScoreRequest, raw_request: Request): @@ -625,7 +745,16 @@ async def create_score(request: ScoreRequest, raw_request: Request): assert_never(generator) -@router.post("/v1/score", dependencies=[Depends(validate_json_request)]) +@router.post("/v1/score", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_score_v1(request: ScoreRequest, raw_request: Request): @@ -636,12 +765,28 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) -@router.post("/v1/audio/transcriptions") +@router.post("/v1/audio/transcriptions", + responses={ + HTTPStatus.OK.value: { + "content": { + "text/event-stream": {} + } + }, + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.UNPROCESSABLE_ENTITY.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call -async def create_transcriptions(request: Annotated[TranscriptionRequest, - Form()], - raw_request: Request): +async def create_transcriptions(raw_request: Request, + request: Annotated[TranscriptionRequest, + Form()]): handler = transcription(raw_request) if handler is None: return base(raw_request).create_error_response( @@ -661,7 +806,16 @@ async def create_transcriptions(request: Annotated[TranscriptionRequest, return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/rerank", dependencies=[Depends(validate_json_request)]) +@router.post("/rerank", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def do_rerank(request: RerankRequest, raw_request: Request): @@ -679,7 +833,16 @@ async def do_rerank(request: RerankRequest, raw_request: Request): assert_never(generator) -@router.post("/v1/rerank", dependencies=[Depends(validate_json_request)]) +@router.post("/v1/rerank", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation async def do_rerank_v1(request: RerankRequest, raw_request: Request): logger.warning_once( @@ -690,7 +853,16 @@ async def do_rerank_v1(request: RerankRequest, raw_request: Request): return await do_rerank(request, raw_request) -@router.post("/v2/rerank", dependencies=[Depends(validate_json_request)]) +@router.post("/v2/rerank", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation async def do_rerank_v2(request: RerankRequest, raw_request: Request): return await do_rerank(request, raw_request) @@ -770,12 +942,29 @@ async def is_sleeping(raw_request: Request): return JSONResponse(content={"is_sleeping": is_sleeping}) -@router.post("/invocations", dependencies=[Depends(validate_json_request)]) +@router.post("/invocations", + dependencies=[Depends(validate_json_request)], + responses={ + 
HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.UNSUPPORTED_MEDIA_TYPE.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) async def invocations(raw_request: Request): """ For SageMaker, routes requests to other handlers based on model `task`. """ - body = await raw_request.json() + try: + body = await raw_request.json() + except JSONDecodeError as e: + raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}") from e + task = raw_request.app.state.task if task not in TASK_HANDLERS: @@ -866,10 +1055,26 @@ def build_app(args: Namespace) -> FastAPI: allow_headers=args.allowed_headers, ) + @app.exception_handler(HTTPException) + async def http_exception_handler(_: Request, exc: HTTPException): + err = ErrorResponse(message=exc.detail, + type=HTTPStatus(exc.status_code).phrase, + code=exc.status_code) + return JSONResponse(err.model_dump(), status_code=exc.status_code) + @app.exception_handler(RequestValidationError) - async def validation_exception_handler(_, exc): - err = ErrorResponse(message=str(exc), - type="BadRequestError", + async def validation_exception_handler(_: Request, + exc: RequestValidationError): + exc_str = str(exc) + errors_str = str(exc.errors()) + + if exc.errors() and errors_str and errors_str != exc_str: + message = f"{exc_str} {errors_str}" + else: + message = exc_str + + err = ErrorResponse(message=message, + type=HTTPStatus.BAD_REQUEST.phrase, code=HTTPStatus.BAD_REQUEST) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 8ac6534875dd..cd6ee3670117 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1593,6 +1593,10 @@ class TokenizeChatRequest(OpenAIBaseModel): default=None, description=("Additional kwargs to pass to the HF processor."), ) + tools: Optional[list[ChatCompletionToolsParam]] = Field( + default=None, + description=("A list of tools the model may call."), + ) @model_validator(mode="before") @classmethod diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a9ba0e4d68ce..ee18e0b0a454 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -197,7 +197,7 @@ async def create_chat_completion( except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) + return self.create_error_response(f"{e} {e.__cause__}") request_id = "chatcmpl-" \ f"{self._base_request_id(raw_request, request.request_id)}" diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index c642fc51005e..349e0ac9e68b 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -65,6 +65,8 @@ async def create_tokenize( tokenizer = await self.engine_client.get_tokenizer(lora_request) if isinstance(request, TokenizeChatRequest): + tool_dicts = (None if request.tools is None else + [tool.model_dump() for tool in request.tools]) ( _, request_prompts, @@ -73,6 +75,7 @@ async def create_tokenize( request, tokenizer, request.messages, + tool_dicts=tool_dicts, chat_template=request.chat_template or self.chat_template, chat_template_content_format=self. 
chat_template_content_format, @@ -91,7 +94,7 @@ async def create_tokenize( ) except (ValueError, TypeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) + return self.create_error_response(f"{e} {e.__cause__}") input_ids: list[int] = [] for i, engine_prompt in enumerate(engine_prompts): diff --git a/vllm/envs.py b/vllm/envs.py index 9d585bf3578e..fe3fa91fbe33 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -55,6 +55,7 @@ VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_AUDIO_FETCH_TIMEOUT: int = 10 + VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MM_INPUT_CACHE_GIB: int = 8 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None @@ -446,6 +447,16 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), + # Backend for Video IO + # - "opencv": Default backend that uses OpenCV stream buffered backend. + # + # Custom backend implementations can be registered + # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and + # imported at runtime. + # If a non-existing backend is used, an AssertionError will be thrown. + "VLLM_VIDEO_LOADER_BACKEND": + lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"), + # Cache size (in GiB) for multimodal input cache # Default is 4 GiB "VLLM_MM_INPUT_CACHE_GIB": diff --git a/vllm/forward_context.py b/vllm/forward_context.py index eb1e1f5694bb..5d2d95f18d2f 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -27,6 +27,7 @@ @dataclass class DPMetadata: + max_tokens_across_dp_cpu: torch.Tensor cu_tokens_across_dp_cpu: torch.Tensor @@ -90,8 +91,10 @@ def set_forward_context(attn_metadata: Any, dtype=torch.int32) from vllm.distributed.parallel_state import get_dp_group dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) + max_tokens_across_dp_cpu = torch.max(num_tokens_tensor) cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0) - dp_metadata = DPMetadata(cu_tokens_across_dp_cpu) + dp_metadata = DPMetadata(max_tokens_across_dp_cpu, + cu_tokens_across_dp_cpu) global _forward_context prev_context = _forward_context diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index aecddbcd7515..148b3558c15e 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -159,7 +159,7 @@ def call_hf_processor( msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={merged_kwargs}") - raise RuntimeError(msg) from exc + raise ValueError(msg) from exc class DummyData(NamedTuple): diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py index 13ddaaf961f7..9feb9e462459 100644 --- a/vllm/lora/ops/triton_ops/lora_expand_op.py +++ b/vllm/lora/ops/triton_ops/lora_expand_op.py @@ -153,7 +153,7 @@ def _lora_expand( lora_token_start_loc (torch.Tensor): A cumulative sum of num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that lora_token_start_loc[i], along with num_tokens_per_lora[i] - identifies the the region in token_indices_sorted_by_lora_ids that + identifies the region in token_indices_sorted_by_lora_ids that LoRA lora_ids[i] should process. lora_ids (torch.Tensor): LoRA ids to process. 
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 08be9de62621..5c262287f7dd 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -38,8 +38,8 @@ def get_config() -> Optional[dict[str, Any]]: from vllm.model_executor.layers.fused_moe.cutlass_moe import ( cutlass_moe_fp4, cutlass_moe_fp8) from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, fused_moe, fused_topk, get_config_file_name, - grouped_topk) + TritonExperts, fused_experts, fused_moe, fused_topk, + get_config_file_name, grouped_topk) __all__ += [ "fused_moe", @@ -49,4 +49,5 @@ def get_config() -> Optional[dict[str, Any]]: "grouped_topk", "cutlass_moe_fp8", "cutlass_moe_fp4", + "TritonExperts", ] diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 7f96a4012716..aff108112b61 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -5,10 +5,176 @@ import torch +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.model_executor.layers.fused_moe.utils import _fp8_perm, _resize_cache from vllm.scalar_type import scalar_types +class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__( + self, + ab_strides1: torch.Tensor, + c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides2: torch.Tensor, + out_dtype: torch.dtype, + ): + super().__init__() + self.ab_strides1 = ab_strides1 + self.c_strides1 = c_strides1 + self.ab_strides2 = ab_strides2 + self.c_strides2 = c_strides2 + self.out_dtype = out_dtype + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + # Note that K, N are transposed + N, K = K, N + workspace1 = M * topk * max(2 * N, K) + workspace2 = M * topk * N + return (workspace1, workspace2, self.out_dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + a1q = hidden_states + + assert w1_scale is not None + assert w2_scale is not None + assert w1.dtype == torch.float8_e4m3fn + assert w2.dtype == torch.float8_e4m3fn + assert a1q.shape[1] == w1.shape[1], "Hidden size mismatch w1" + assert w1.shape[2] == w2.shape[1] * 2, "Hidden size mismatch w2" + assert w1.shape[0] == w2.shape[0], "Expert number mismatch" + assert a1q_scale is None or a1q_scale.dim( + ) == 0 or a1q_scale.shape[0] == 1 or a1q_scale.shape[0] == a1q.shape[ + 0], "Input scale shape mismatch" + assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[ + 1] == w1.shape[2], "W1 scale shape mismatch" + assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[ + 1] == w2.shape[2], "W2 scale shape mismatch" + assert 
w1.shape[0] == w2.shape[0], "Weights expert number mismatch" + assert w1.shape[0] == w1_scale.shape[ + 0], "w1 scales expert number mismatch" + assert w1.shape[0] == w2_scale.shape[ + 0], "w2 scales expert number mismatch" + assert a2_scale is None or a1q_scale is None or a2_scale.shape == a1q_scale.shape, "Intermediate scale shape mismatch" # noqa: E501 + assert self.ab_strides1.shape[0] == w1.shape[ + 0], "AB Strides 1 expert number mismatch" + assert self.c_strides1.shape[0] == w1.shape[ + 0], "C Strides 1 expert number mismatch" + assert self.ab_strides2.shape[0] == w2.shape[ + 0], "AB Strides 2 expert number mismatch" + assert self.c_strides2.shape[0] == w2.shape[ + 0], "C Strides 2 expert number mismatch" + assert self.out_dtype in [torch.half, + torch.bfloat16], "Invalid output dtype" + + M = a1q.shape[0] + _, N, K = w2.shape # because w1 + w2 are transposed + device = a1q.device + + assert w1.shape[1] == K + assert global_num_experts != -1 + assert a1q_scale is not None + + if expert_map is not None: + "Translate info from expert_map to topk_ids" + local_topk_ids = torch.where(expert_map[topk_ids] != -1, + expert_map[topk_ids], -1) + else: + local_topk_ids = topk_ids + + topk = local_topk_ids.shape[1] + + per_act_token = a1q_scale.numel() != 1 if a1q_scale is not None else ( + a2_scale.numel() != 1 if a2_scale is not None else False) + + expert_offsets = torch.empty((global_num_experts + 1), + dtype=torch.int32, + device=device) + problem_sizes1 = torch.empty((global_num_experts, 3), + dtype=torch.int32, + device=device) + problem_sizes2 = torch.empty((global_num_experts, 3), + dtype=torch.int32, + device=device) + + # With expert_map each Rank processes only a subset of experts. As + # a result not all of a_map and c2 tensors are filled. We fill it + # zeros for correctness. + if expert_map is not None: + a_map = torch.zeros((local_topk_ids.numel()), + dtype=torch.int32, + device=device) + else: + a_map = torch.empty((local_topk_ids.numel()), + dtype=torch.int32, + device=device) + + c_map = torch.empty((local_topk_ids.numel()), + dtype=torch.int32, + device=device) + + ops.get_cutlass_moe_mm_data(local_topk_ids, expert_offsets, + problem_sizes1, problem_sizes2, a_map, + c_map, global_num_experts, N, K) + + a1q = _fp8_perm(a1q, a_map) + a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale + + c1 = _resize_cache(workspace13, (M * topk, N * 2)) + c2 = _resize_cache(workspace2, (M * topk, N)) + c3 = _resize_cache(workspace13, (M * topk, K)) + + ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, + expert_offsets[:-1], problem_sizes1, + self.ab_strides1, self.ab_strides1, self.c_strides1) + + self.activation(activation, c2, c1) + + a2q, a2q_scale = ops.scaled_fp8_quant( + c2, a2_scale, use_per_token_if_dynamic=per_act_token) + + if expert_map is not None: + c3.fill_(0) + + ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale, + expert_offsets[:-1], problem_sizes2, + self.ab_strides2, self.ab_strides2, self.c_strides2) + + c3 = c3[c_map] + + return c3 + + #TODO make the grouped gemm kernel consistent with scaled gemm kernel def cutlass_moe_fp8( a: torch.Tensor, @@ -17,7 +183,7 @@ def cutlass_moe_fp8( w1_scale: torch.Tensor, w2_scale: torch.Tensor, topk_weights: torch.Tensor, - topk_ids_: torch.Tensor, + topk_ids: torch.Tensor, ab_strides1: torch.Tensor, c_strides1: torch.Tensor, ab_strides2: torch.Tensor, @@ -59,7 +225,7 @@ def cutlass_moe_fp8( - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize the intermediate result between the gemms. 
Shape: scalar or [M] - - out_dtype (torch.Tensor): The output tensor type. + - out_dtype (torch.dtype): The output tensor type. - expert_map (Optional[torch.Tensor]): In the case of Expert parallel, every Rank is responsible for a subset of experts. expert_map is a mapping from global expert-id to local expert-id. When expert_map[i] @@ -71,115 +237,36 @@ def cutlass_moe_fp8( Returns: - torch.Tensor: The fp16 output tensor after applying the MoE layer. """ - - assert topk_weights.shape == topk_ids_.shape, "topk shape mismatch" - assert w1_q.dtype == torch.float8_e4m3fn - assert w2_q.dtype == torch.float8_e4m3fn - assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1" - assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2" - assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch" - assert a1_scale is None or a1_scale.dim( - ) == 0 or a1_scale.shape[0] == 1 or a1_scale.shape[0] == a.shape[ - 0], "Input scale shape mismatch" - assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[ - 1] == w1_q.shape[2], "W1 scale shape mismatch" - assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[ - 1] == w2_q.shape[2], "W2 scale shape mismatch" - assert w1_q.shape[0] == w2_q.shape[0], "Weights expert number mismatch" - assert w1_q.shape[0] == w1_scale.shape[ - 0], "w1 scales expert number mismatch" - assert w1_q.shape[0] == w2_scale.shape[ - 0], "w2 scales expert number mismatch" - assert a2_scale is None or a1_scale is None or a2_scale.shape == a1_scale.shape, "Intermediate scale shape mismatch" # noqa: E501 - assert ab_strides1.shape[0] == w1_q.shape[ - 0], "AB Strides 1 expert number mismatch" - assert c_strides1.shape[0] == w1_q.shape[ - 0], "C Strides 1 expert number mismatch" - assert ab_strides2.shape[0] == w2_q.shape[ - 0], "AB Strides 2 expert number mismatch" - assert c_strides2.shape[0] == w2_q.shape[ - 0], "C Strides 2 expert number mismatch" - assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype" - - num_experts = w1_q.size(0) - m = a.size(0) - k = w1_q.size(1) - n = w2_q.size(1) - - local_topk_ids = topk_ids_ - if expert_map is not None: - "Translate info from expert_map to topk_ids" - local_topk_ids = torch.where(expert_map[topk_ids_] != -1, - expert_map[topk_ids_], -1) - - topk = local_topk_ids.size(1) - per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( a2_scale.numel() != 1 if a2_scale is not None else False) - if apply_router_weight_on_input: - assert topk == 1, \ - "apply_router_weight_on_input is only implemented for topk=1" - # TODO: this only works for topK=1, will need to update for topK>1 - a = a * topk_weights.to(out_dtype) - - a_q, a1_scale = ops.scaled_fp8_quant( - a, a1_scale, use_per_token_if_dynamic=per_act_token) - device = a_q.device - - expert_offsets = torch.empty((num_experts + 1), - dtype=torch.int32, - device=device) - problem_sizes1 = torch.empty((num_experts, 3), - dtype=torch.int32, - device=device) - problem_sizes2 = torch.empty((num_experts, 3), - dtype=torch.int32, - device=device) - - a_map_initializer = torch.empty - c2_initializer = torch.empty - if expert_map is not None: - # With expert_map each Rank processes only a subset of experts. As - # a result not all of a_map and c2 tensors are filled. We fill it - # zeros for correctness. 
- a_map_initializer = torch.zeros - c2_initializer = torch.zeros - - a_map = a_map_initializer((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - c_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - ops.get_cutlass_moe_mm_data(local_topk_ids, expert_offsets, problem_sizes1, - problem_sizes2, a_map, c_map, num_experts, n, - k) - - rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype) - rep_a1_scales = a1_scale[a_map] if per_act_token else a1_scale - - c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype) - c2 = c2_initializer((m * topk, k), device=device, dtype=out_dtype) - - ops.cutlass_moe_mm(c1, rep_a_q, w1_q, rep_a1_scales, w1_scale, - expert_offsets[:-1], problem_sizes1, ab_strides1, - ab_strides1, c_strides1) - - intermediate = torch.empty((m * topk, n), device=device, dtype=out_dtype) - torch.ops._C.silu_and_mul(intermediate, c1) - - intemediate_q, a2_scale = ops.scaled_fp8_quant( - intermediate, a2_scale, use_per_token_if_dynamic=per_act_token) - - ops.cutlass_moe_mm(c2, intemediate_q, w2_q, a2_scale, w2_scale, - expert_offsets[:-1], problem_sizes2, ab_strides2, - ab_strides2, c_strides2) - # Gather tokens - c2 = c2[c_map].view(m, topk, k) - if not apply_router_weight_on_input: - c2 = c2 * topk_weights.view(m, topk, 1).to(out_dtype) - return c2.sum(dim=1) + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP( + per_channel_quant=per_act_token, + quant_dtype=torch.float8_e4m3fn, + ), + CutlassExpertsFp8( + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + out_dtype, + ), + ) + + return fn( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + ) FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 5098e15dc5a4..46a814e6ecc3 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -1,16 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 +import functools import importlib.util from typing import Optional import torch -import vllm.envs as envs -from vllm import _custom_ops as ops +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( - moe_align_block_size) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, - _fp8_quantize, +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + _moe_permute) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, _resize_cache) from vllm.utils import round_up @@ -19,6 +20,19 @@ has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None +@functools.cache +def deep_gemm_block_shape() -> list[int]: + # Lazy import to avoid CUDA initialization problems. 
+ import deep_gemm as dg + block = dg.get_m_alignment_for_contiguous_layout() + return [block, block] + + +def _valid_deep_gemm_shape(M: int, N: int, K: int): + align = deep_gemm_block_shape()[0] + return align <= M and N % align == 0 and K % align == 0 + + def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -29,89 +43,112 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, aligned by `dg.get_m_alignment_for_contiguous_layout()`. """ if not has_deep_gemm: + logger.debug("DeepGemm disabled: deep_gemm not available.") return False - # Lazy import to avoid CUDA initialization problems. - import deep_gemm as dg - - # Expert maps not supported yet. if expert_map is not None: + logger.debug("DeepGemm disabled: expert map NYI.") return False - align = dg.get_m_alignment_for_contiguous_layout() - M = hidden_states.shape[0] - _, K, N = w2.shape - - # For now, disable DeepGemm for small N until better permute/unpermute - # ops are available. - if N <= 512: + M = hidden_states.size(0) + _, K, N = w2.size() + if not _valid_deep_gemm_shape(M, N, K): + logger.debug("DeepGemm disabled: unalinged problem size.") return False - if align > M or N % align != 0 or K % align != 0: + if (w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn): + logger.debug("DeepGemm disabled: invalid weight dtype(s).") return False - return (hidden_states.is_contiguous() and w1.is_contiguous() - and w2.is_contiguous()) - - -def _moe_permute( - curr_hidden_states: torch.Tensor, - a1q_scale: Optional[torch.Tensor], - curr_topk_ids: torch.Tensor, - global_num_experts: int, - expert_map: Optional[torch.Tensor], - block_m: int, -) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: - """ - Determine the sorted_token_ids, expert_ids for the given problem size. - Permute the hidden states and scales according to `sorted_token_ids`. 
- """ - top_k_num = curr_topk_ids.shape[1] - - tokens_in_chunk, _ = curr_hidden_states.shape + if (not hidden_states.is_contiguous() or not w1.is_contiguous() + or not w2.is_contiguous()): + logger.debug( + "DeepGemm disabled: weights or activations not contiguous.") + return False - sorted_token_ids, expert_ids, num_tokens_post_padded = ( - moe_align_block_size(curr_topk_ids, - block_m, - global_num_experts, - expert_map, - pad_sorted_ids=True)) + return True + + +class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__(self): + super().__init__() + self.block_shape = deep_gemm_block_shape() + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + block_m = self.block_shape[0] + M_sum = (M * topk) + num_experts * (block_m - 1) + M_sum = round_up(M_sum, block_m) + workspace1 = M_sum * max(N * 2, K) + workspace2 = M_sum * N + return (workspace1, workspace2, a.dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + import deep_gemm as dg + + a1q = hidden_states + _, N, K = w1.size() + + assert global_num_experts != -1 + assert w2.size(1) == K + + a1q, a1q_scale, _, expert_ids, inv_perm = _moe_permute( + a1q, + a1q_scale, + topk_ids, + global_num_experts, + expert_map, + self.block_shape[0], + ) + + # Note: M_sum is different than the pre-permuted shape of a1q. + M_sum = a1q.size(0) + workspace1 = _resize_cache(workspace13, (M_sum, N)) + workspace2 = _resize_cache(workspace2, (M_sum, N // 2)) + workspace3 = _resize_cache(workspace13, (M_sum, K)) - inv_perm: Optional[torch.Tensor] = None + dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( + (a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids) - num_tokens = top_k_num * tokens_in_chunk - sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1) - expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0) - inv_perm = torch.argsort(sorted_token_ids)[:num_tokens] + self.activation(activation, workspace2, workspace1.view(-1, N)) - # Permute according to sorted token ids. - curr_hidden_states = _fp8_perm(curr_hidden_states, - sorted_token_ids // top_k_num) + a2q_scale: Optional[torch.Tensor] = None - if a1q_scale is not None: - a1q_scale = a1q_scale[sorted_token_ids // top_k_num] + a2q, a2q_scale = _fp8_quantize(workspace2, a2_scale, False, + self.block_shape) - return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids, - inv_perm) + dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( + (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids) + workspace3 = workspace3[inv_perm, ...] -def _moe_unpermute_and_reduce( - out: torch.Tensor, - curr_hidden: torch.Tensor, - inv_perm: Optional[torch.Tensor], - topk_weight: torch.Tensor, -) -> None: - """ - Unpermute the final result and apply topk_weights, then perform the final - reduction on the hidden states. - """ - M, topk = topk_weight.shape - K = curr_hidden.shape[1] - curr_hidden = curr_hidden[inv_perm, ...] 
- curr_hidden = curr_hidden.view(-1, topk, K) - curr_hidden.mul_(topk_weight.view(M, -1, 1)) - ops.moe_sum(curr_hidden, out) + return workspace3 def deep_gemm_moe_fp8( @@ -128,6 +165,7 @@ def deep_gemm_moe_fp8( expert_map: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, + apply_router_weight_on_input=False, ) -> torch.Tensor: """ This function computes a a8w8-quantized Mixture of Experts (MoE) layer @@ -166,129 +204,24 @@ def deep_gemm_moe_fp8( Returns: - torch.Tensor: The bfloat16 output tensor after applying the MoE layer. """ - # Lazy import to avoid CUDA initialization problems. - import deep_gemm as dg - - assert expert_map is None, "Expert maps not supported yet" - - assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" - - assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" - assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" - assert w1.stride(-1) == 1, "Stride of last dimension must be 1" - assert w2.stride(-1) == 1, "Stride of last dimension must be 1" - assert hidden_states.dtype in [ - torch.float32, torch.float16, torch.bfloat16 - ] - assert w1.dtype == torch.float8_e4m3fn - assert w2.dtype == torch.float8_e4m3fn - assert w1.shape[0] == w2.shape[0], "Expert number mismatch" - assert w1.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch" - assert w1.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch" - assert a1_scale is None or a1_scale.dim( - ) == 0 or a1_scale.shape[0] == 1 or a1_scale.shape[ - 0] == hidden_states.shape[0], "Input scale shape mismatch" - assert a2_scale is None or a1_scale is None or a2_scale.shape == a1_scale.shape, "Intermediate scale shape mismatch" # noqa: E501 - - num_tokens, _ = hidden_states.shape - E, N, _ = w1.shape - K = w2.shape[1] - if global_num_experts == -1: - global_num_experts = E - - # We execute the fused_moe kernel in chunks to circumvent this issue: - # https://github.com/vllm-project/vllm/issues/5938 - CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE - - assert _valid_deep_gemm(hidden_states, w1, w2, expert_map) - - if inplace: - out_hidden_states = hidden_states - else: - out_hidden_states = torch.empty_like(hidden_states) - - block_m = dg.get_m_alignment_for_contiguous_layout() - block_shape = [block_m, block_m] - - assert w1_scale is not None - assert w2_scale is not None - - # We attempt to transpose and align offline in Fp8MoEMethod, in which - # case these calls will be nops. Otherwise, they'll be performed every - # time the layer is executed. 
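# Sketch of the workspace-reuse trick behind the _resize_cache calls: one
# flat buffer backs both the first and the last intermediate, because the
# first is dead by the time the last is written. resize_cache below is a
# simplified stand-in for the real helper.
import math
import torch


def resize_cache(buf: torch.Tensor, shape: tuple[int, ...]) -> torch.Tensor:
    numel = math.prod(shape)
    assert numel <= buf.numel()
    return buf.flatten()[:numel].view(*shape)


M_sum, N, K = 128, 512, 256
workspace13 = torch.empty(M_sum * max(N, K))
cache1 = resize_cache(workspace13, (M_sum, N))  # used by the first GEMM
cache3 = resize_cache(workspace13, (M_sum, K))  # same storage, reused later
assert cache1.data_ptr() == cache3.data_ptr()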
- w1_scale = dg.get_col_major_tma_aligned_tensor(w1_scale).contiguous() - w2_scale = dg.get_col_major_tma_aligned_tensor(w2_scale).contiguous() - - M_sum = topk_ids.numel() + global_num_experts * (block_m - 1) - M_sum = round_up(M_sum, block_m) - - num_chunks = (num_tokens // CHUNK_SIZE) + 1 - - # We can reuse the memory between cache1 and cache3 because by the time - # we need cache3, we're done with cache1 - workspace13 = torch.empty(M_sum * max(N, K), - device=hidden_states.device, - dtype=hidden_states.dtype) - - workspace1 = workspace13[:M_sum * N].view(M_sum, N) - workspace2 = torch.empty((M_sum, N // 2), - device=hidden_states.device, - dtype=hidden_states.dtype) - workspace3 = workspace13[:M_sum * K].view(M_sum, K) - - for chunk in range(num_chunks): - begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE, - min((chunk + 1) * CHUNK_SIZE, - num_tokens)) - curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] - tokens_in_chunk, _ = curr_hidden_states.shape - - if tokens_in_chunk == 0: - break - - curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] - curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] - - a1q_scale: Optional[torch.Tensor] = None - - qcurr_hidden_states, a1q_scale = _fp8_quantize(curr_hidden_states, - a1_scale, block_shape) - - (qcurr_hidden_states, a1q_scale, sorted_token_ids, expert_ids, - inv_perm) = _moe_permute(qcurr_hidden_states, a1q_scale, - curr_topk_ids, global_num_experts, - expert_map, block_m) - - # Adjust the intermediate cache size and config for the last chunk. - # Note that in most cases we only have one chunk so the cache size - # and config are already set correctly and do not need to be adjusted. - if tokens_in_chunk < CHUNK_SIZE and chunk > 0: - curr_M = sorted_token_ids.numel() - workspace1 = _resize_cache(workspace1, (curr_M, N)) - workspace2 = _resize_cache(workspace2, (curr_M, N // 2)) - workspace3 = _resize_cache(workspace3, (curr_M, K)) - - dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( - (qcurr_hidden_states, a1q_scale), (w1, w1_scale), workspace1, - expert_ids) - - if activation == "silu": - torch.ops._C.silu_and_mul(workspace2, workspace1.view(-1, N)) - elif activation == "gelu": - torch.ops._C.gelu_and_mul(workspace2, workspace1.view(-1, N)) - else: - raise ValueError(f"Unsupported FusedMoe activation: {activation}") - - a2q_scale: Optional[torch.Tensor] = None - - qworkspace2, a2q_scale = _fp8_quantize(workspace2, a2_scale, - block_shape) - - dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( - (qworkspace2, a2q_scale), (w2, w2_scale), workspace3, expert_ids) - - _moe_unpermute_and_reduce( - out_hidden_states[begin_chunk_idx:end_chunk_idx], - workspace3.view(*workspace3.shape), inv_perm, curr_topk_weights) - - return out_hidden_states + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(quant_dtype=torch.float8_e4m3fn, + block_shape=deep_gemm_block_shape()), + DeepGemmExperts(), + ) + return fn( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + inplace, + activation, + global_num_experts, + expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py new file mode 100644 index 000000000000..c2db79365931 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -0,0 +1,755 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Fused batched MoE kernel.""" +from 
typing import Optional + +import torch +import triton +import triton.language as tl + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.fused_moe import ( + get_config_dtype_str, try_get_optimal_moe_config) +from vllm.model_executor.layers.fused_moe.utils import _resize_cache + + +@triton.jit +def moe_mmk( + a_ptrs, + b_ptrs, + K, + expert_id, + a_scale_ptr, + b_scale_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_ak, + stride_bk, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Offsets and masks + offs_m, + offs_n, + mask_m, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + # Meta-parameters + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + compute_type: tl.constexpr, + use_w8a8: tl.constexpr, + use_w8a16: tl.constexpr): + + offs_k = tl.arange(0, BLOCK_K) + + if use_w8a16: + b_scale_ptrs = b_scale_ptr + expert_id * stride_bse + offs_n[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + + if use_w8a8: + # block-wise + if group_k > 0 and group_n > 0: + a_scale_ptrs = a_scale_ptr + offs_m * stride_asm + offs_bsn = offs_n // group_n + b_scale_ptrs = (b_scale_ptr + expert_id * stride_bse + + offs_bsn * stride_bsn) + # tensor-wise + else: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr + expert_id) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + a = tl.load(a_ptrs, + mask=mask_m[:, None] & (offs_k[None, :] < K - k * BLOCK_K), + other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_K, other=0.0) + # We accumulate along the K dimension. + if use_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_K + offs_ks = k_start // group_k + a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask, + mask=mask_m, + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) + + accumulator += tl.dot(a, b) * a_scale[:, + None] * b_scale[None, :] + else: + if use_w8a8: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + else: + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. 
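# Plain-PyTorch reference for the block-wise w8a8 scaling performed inside
# this K loop: each K-group carries a per-row activation scale and a
# per-[group_k x group_n]-block weight scale, and the scaled partial
# products are accumulated in fp32. Toy integer data stands in for the
# actual fp8/int8 values.
import torch

M, N, K, group_k, group_n = 4, 8, 16, 4, 4
a = torch.randint(-8, 8, (M, K)).float()
b = torch.randint(-8, 8, (K, N)).float()
a_scale = torch.rand(M, K // group_k)             # per token, per K-group
b_scale = torch.rand(K // group_k, N // group_n)  # per weight block

acc = torch.zeros(M, N)
for kg in range(K // group_k):
    a_blk = a[:, kg * group_k:(kg + 1) * group_k]
    b_blk = b[kg * group_k:(kg + 1) * group_k, :]
    bs = b_scale[kg].repeat_interleave(group_n)   # expand over N columns
    acc += (a_blk @ b_blk) * a_scale[:, kg:kg + 1] * bs.unsqueeze(0)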
+ a_ptrs += BLOCK_K * stride_ak + b_ptrs += BLOCK_K * stride_bk + + if use_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_w8a8: + if group_k > 0 and group_n > 0: + accumulator = accumulator.to(compute_type) + else: + accumulator = (accumulator * a_scale * b_scale).to(compute_type) + else: + accumulator = accumulator.to(compute_type) + + return accumulator + + +@triton.jit +def expert_triton_kernel( + a_ptr, #[max_tokens, K] + b_ptr, #[K, N] + c_ptr, #[max_tokens, N] + expert_id, + compute_type: tl.constexpr, + # Dimensions + M, + N, + K, + # Quantization data + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # strides + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Blockwise quantization data + group_n, + group_k, + # Quantization schemes + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + # Kernel config + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr): + + offs_m = tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) % N + offs_k = tl.arange(0, BLOCK_K) + mask_m = offs_m < M + + a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak + b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn + + accumulator = moe_mmk( + a_ptrs, + b_ptrs, + K, + expert_id, + a_scale_ptr, + b_scale_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_ak, + stride_bk, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Offsets and masks + offs_m, + offs_n, + mask_m, + # Block size for block-wise quantization + group_n, + group_k, + # Meta-parameters + BLOCK_M, + BLOCK_N, + BLOCK_K, + compute_type, + use_fp8_w8a8, + use_int8_w8a16) + + # store in C + offs_cn = tl.arange(0, BLOCK_N) + c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_cn[None, :] * stride_cn + c_mask = mask_m[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@triton.jit +def batched_triton_kernel( + a_ptr, # [E, max_num_tokens, K] + b_ptr, # [E, K, N] + c_ptr, # [E, max_num_tokens, N] + expert_num_tokens, # [E] + compute_type: tl.constexpr, + # Dimensions + max_num_tokens, + K, + N, + # Quantization data + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). 
+ stride_ae, + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_ce, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Blockwise quantization data + group_n: tl.constexpr, + group_k: tl.constexpr, + # Quantization schemes + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + # Kernel config + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr): + expert_id = tl.program_id(axis=0) + e_num_tokens = tl.load(expert_num_tokens + expert_id) + if e_num_tokens == 0: + # Early exit + return + + pid_mn = tl.program_id(axis=1) + #num_pid_m = tl.cdiv(max_num_tokens, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + pid_m = pid_mn // num_pid_n + pid_n = pid_mn % num_pid_n + + cta_m_start = pid_m * BLOCK_M + cta_n_start = pid_n * BLOCK_N + if cta_m_start >= e_num_tokens: + # Early exit + return + + cta_m_size = min(BLOCK_M, e_num_tokens - cta_m_start) + cta_n_size = min(BLOCK_N, N - cta_n_start) + + a_ptr = a_ptr + expert_id * stride_ae + cta_m_start * stride_am + b_ptr = b_ptr + expert_id * stride_be + cta_n_start * stride_bn + c_ptr = (c_ptr + expert_id * stride_ce + cta_m_start * stride_cm + + cta_n_start * stride_cn) + + expert_triton_kernel( + a_ptr, + b_ptr, + c_ptr, + expert_id, + compute_type, + cta_m_size, # M + cta_n_size, # N + K, # K + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # Strides + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Blockwise quantization data + group_n, + group_k, + # Quantization schemes + use_fp8_w8a8, + use_int8_w8a16, + # Kernel config + BLOCK_M, + BLOCK_N, + BLOCK_K) + + +def invoke_moe_batched_triton_kernel( + A: torch.Tensor, # [E, max_tokens, K] + B: torch.Tensor, # [E, K, N] + C: torch.Tensor, # [E, max_tokens, N] + expert_num_tokens: torch.Tensor, # [E] + compute_type: tl.dtype, + # Quantization data + A_scale: torch.Tensor, + B_scale: torch.Tensor, + B_zp: torch.Tensor, + # Quantization schemes + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + config: dict[str, int], + block_shape: Optional[list[int]] = None): + + assert not use_int4_w4a16 + max_num_tokens = A.size(1) + K = A.size(2) + N = C.size(2) + + BLOCK_M = config['BLOCK_SIZE_M'] + BLOCK_N = config['BLOCK_SIZE_N'] + BLOCK_K = config['BLOCK_SIZE_K'] + assert (torch.compiler.is_compiling() + or torch.cuda.is_current_stream_capturing() + or max_num_tokens % BLOCK_M == 0) + + grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) * + triton.cdiv(B.size(1), BLOCK_N)) + + batched_triton_kernel[grid]( + A, + B, + C, + expert_num_tokens, + compute_type, + # Dimensions + max_num_tokens, + K, + N, + # Quantization data + A_scale, + B_scale, + B_zp, + # Strides + A.stride(0), + A.stride(1), + A.stride(2), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(0), + C.stride(1), + C.stride(2), + A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, + # Blockwise quantization data + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + # Quantization schemes + use_fp8_w8a8, + use_int8_w8a16, + # Kernel config + BLOCK_M=BLOCK_M, + 
BLOCK_N=BLOCK_N, + BLOCK_K=BLOCK_K) + + +class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """ + A reference prepare/finalize class that reorganizes the tokens into + expert batched format, i.e. E x max_num_tokens x K. This is the format + that the PPLX dispatch/combine kernels use. + """ + + def __init__(self, max_num_tokens: Optional[int], world_size: int, + dp_size: int, rank: int): + super().__init__() + self.world_size = world_size + self.dp_size = dp_size + self.rank = rank + self.max_num_tokens = max_num_tokens + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + assert a1.dim() == 2 + assert topk_ids.dim() == 2 + assert topk_ids.size(0) == a1.size(0) + + if apply_router_weight_on_input: + topk = topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, \ + "apply_router_weight_on_input is only implemented for topk=1" + a1.mul_(topk_weights.to(a1.dtype)) + + num_tokens, hidden_dim = a1.size() + topk = topk_ids.size(1) + + if self.max_num_tokens is None: + tokens_per_expert = torch.bincount(topk_ids.view(-1), + minlength=num_experts) + self.max_num_tokens = int(tokens_per_expert.max().item()) + else: + tokens_per_expert = torch.zeros(num_experts, + dtype=torch.int, + device=a1.device) + + assert num_experts % self.world_size == 0 + + num_local_experts = num_experts // self.world_size + + b_a1 = torch.zeros( + (num_local_experts, self.max_num_tokens, hidden_dim), + dtype=a1.dtype, + device=a1.device) + + first_expert = num_local_experts * self.rank + last_expert = first_expert + num_local_experts + + for expert_id in range(first_expert, last_expert): + topks = torch.any(topk_ids == expert_id, dim=1).flatten() + rows = torch.count_nonzero(topks.flatten()) + b_a1[expert_id - + first_expert, :rows, :] = a1[:topks.numel()][topks] + tokens_per_expert[expert_id - first_expert] = rows + + return b_a1, a1_scale, tokens_per_expert + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> None: + num_tokens = topk_ids.size(0) + num_local_experts = fused_expert_output.size(0) + K = fused_expert_output.size(-1) + assert output.size(0) == num_tokens and output.size(1) == K + + output.fill_(0) + + first_expert = num_local_experts * self.rank + last_expert = first_expert + num_local_experts + + for expert_id in range(first_expert, last_expert): + matching_tokens = topk_ids == expert_id + topks = torch.any(matching_tokens, dim=1).flatten() + rows = torch.count_nonzero(topks) + rhs = fused_expert_output[expert_id - first_expert, :rows, :] + if not apply_router_weight_on_input: + rhs.mul_(topk_weights[matching_tokens].view(rhs.size(0), 1)) + output[topks] = output[topks] + rhs + + +class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): + """ + A reference MoE expert class that operates on expert batched format, + i.e. E x max_num_tokens x K. This is the format that the pplx + dispatch/combine kernels use. 
+ """ + + def __init__( + self, + world_size: int, + dp_size: int, + max_num_tokens: Optional[int] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + block_shape: Optional[list[int]] = None, + block_m: Optional[int] = None, + ): + super().__init__() + assert block_shape is None + assert block_m is None + assert not use_fp8_w8a8, "NYI" + assert not use_int8_w8a8, "NYI" + assert not use_int8_w8a16, "NYI" + assert not use_int4_w4a16, "NYI" + self.max_num_tokens = max_num_tokens + self.world_size = world_size + self.dp_size = dp_size + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + assert a.dim() == 2 + num_dp = self.world_size // self.dp_size + max_num_tokens = a.size( + 0) if self.max_num_tokens is None else self.max_num_tokens + #print(f"WORKSPACE {max_num_tokens} {num_dp}") + workspace13 = num_experts * max_num_tokens * num_dp * K + workspace2 = max_num_tokens * num_dp * N + return (workspace13, workspace2, a.dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + assert hidden_states.dim() == 3 + assert expert_num_tokens is not None + hidden_dim = hidden_states.size(-1) + + if self.max_num_tokens is None: + max_num_tokens = hidden_states.size(1) + else: + max_num_tokens = self.max_num_tokens + + num_dp = self.world_size // self.dp_size + num_experts = global_num_experts + out = _resize_cache(workspace13, + (num_experts, max_num_tokens * num_dp, hidden_dim)) + num_local_experts = w1.size(0) + assert num_local_experts == w1.size(0), ( + f"{num_local_experts} == {w1.size(0)}") + + N = w1.size(1) // 2 + + # Not cudagraph friendly + assert (torch.compiler.is_compiling() + or torch.cuda.is_current_stream_capturing() + or torch.all(expert_num_tokens <= max_num_tokens * num_dp)), ( + f"{expert_num_tokens} <= {max_num_tokens * num_dp}") + + for expert in range(num_local_experts): + # Indexing expert_num_tokens doesn't work w/cudagraphs or inductor + if (torch.compiler.is_compiling() + or torch.cuda.is_current_stream_capturing()): + num = max_num_tokens * num_dp + else: + num = int(expert_num_tokens[expert].item()) + tmp = _resize_cache(workspace2, (num, N)) + input = hidden_states[expert, :num, :] @ w1[expert].transpose(0, 1) + self.activation(activation, tmp, input) + out[expert, :num, :] = tmp @ w2[expert].transpose(0, 1) + + return out + + +class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + """ + A Triton based MoE expert class that operates on expert batched format, + i.e. E x max_num_tokens x K. This is the format that the pplx + dispatch/combine kernels use. 
+ """ + + def __init__( + self, + max_num_tokens: Optional[int] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + block_shape: Optional[list[int]] = None, + world_size: int = 1, + dp_size: int = 1, + ): + super().__init__() + self.use_fp8_w8a8 = use_fp8_w8a8 + self.use_int8_w8a8 = use_int8_w8a8 + self.use_int4_w4a16 = use_int4_w4a16 + self.use_int8_w8a16 = use_int8_w8a16 + self.block_shape = block_shape + self.max_num_tokens = max_num_tokens + assert not use_int8_w8a8, "NYI" + assert not use_int4_w4a16, "NYI" + self.world_size = world_size + self.dp_size = dp_size + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + assert a.dim() == 2 + num_dp = self.world_size // self.dp_size + max_num_tokens = a.size( + 0) if self.max_num_tokens is None else self.max_num_tokens + workspace13 = num_experts * max_num_tokens * num_dp * max(K, N) + workspace2 = num_experts * max_num_tokens * num_dp * (N // 2) + return (workspace13, workspace2, a.dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + # Check constraints. + if self.use_int4_w4a16: + assert hidden_states.size(-1) // 2 == w1.size(2), ( + "Hidden size mismatch") + else: + assert hidden_states.size(-1) == w1.size(2), ( + f"Hidden size mismatch {hidden_states.size(-1)} " + f"!= {w1.size(2)}") + + assert hidden_states.is_contiguous( + ), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn + ] + + # TODO: num_tokens -> max_num_tokens? 
+ E, num_tokens, N, K, top_k_num = mk._moe_problem_size( + hidden_states, w1, w2, topk_ids) + + assert w1.size(0) == E + assert w2.size(0) == E + + config_dtype = get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + dtype=hidden_states.dtype) + + config = try_get_optimal_moe_config( + w1.size(), + w2.size(), + top_k_num, + config_dtype, + num_tokens, + block_shape=self.block_shape, + ) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + elif hidden_states.dtype == torch.float8_e4m3fn: + compute_type = tl.bfloat16 + else: + raise ValueError( + f"Unsupported compute_type: {hidden_states.dtype}") + + #print(f"shape: E={E}, M={num_tokens}, N={N}, K={K}, top_k={top_k_num}") + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + intermediate_cache1 = _resize_cache(workspace13, (E, num_tokens, N)) + intermediate_cache2 = _resize_cache(workspace2, + (E, num_tokens, N // 2)) + intermediate_cache3 = _resize_cache(workspace13, (E, num_tokens, K)) + + # MM1 + invoke_moe_batched_triton_kernel(A=hidden_states, + B=w1, + C=intermediate_cache1, + expert_num_tokens=expert_num_tokens, + compute_type=compute_type, + A_scale=a1q_scale, + B_scale=w1_scale, + B_zp=w1_zp, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + config=config, + block_shape=self.block_shape) + + # TODO: would be nice to use expert_num_tokens here to reduce + # garbage compute + self.activation(activation, intermediate_cache2.view(-1, N // 2), + intermediate_cache1.view(-1, N)) + + #qintermediate_cache2 = intermediate_cache2 + a2q_scale = a2_scale + # TODO (varun) : support w8a8 + assert not self.use_fp8_w8a8 + #if self.use_fp8_w8a8: + # qintermediate_cache2, a2q_scale = _fp8_quantize( + # intermediate_cache2, a2_scale, self.block_shape) + + invoke_moe_batched_triton_kernel(A=intermediate_cache2, + B=w2, + C=intermediate_cache3, + expert_num_tokens=expert_num_tokens, + compute_type=compute_type, + A_scale=a2q_scale, + B_scale=w2_scale, + B_zp=w2_zp, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + config=config, + block_shape=self.block_shape) + + return intermediate_cache3 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 7bf4243305ac..78f8eb926dc8 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -8,16 +8,17 @@ import torch import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( _valid_deep_gemm, deep_gemm_moe_fp8) from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( moe_align_block_size) -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) -from vllm.model_executor.layers.quantization.utils.int8_utils import ( - per_token_group_quant_int8, per_token_quant_int8) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, 
moe_kernel_quantize_input) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op @@ -484,6 +485,20 @@ def invoke_fused_moe_kernel(A: torch.Tensor, assert topk_weights is None or topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 + if use_fp8_w8a8 or use_int8_w8a8: + assert B_scale is not None + assert (block_shape is None or triton.cdiv(B.shape[-2], block_shape[0]) + == B_scale.shape[-2]) + assert (block_shape is None or triton.cdiv(B.shape[-1], block_shape[1]) + == B_scale.shape[-1]) + + elif use_int8_w8a16 or use_int4_w4a16: + assert B_scale is not None + assert block_shape is None or block_shape[0] == 0 + else: + assert A_scale is None + assert B_scale is None + M = A.shape[0] num_tokens = M * top_k @@ -855,6 +870,7 @@ def fused_topk( gating_output: torch.Tensor, topk: int, renormalize: bool, + indices_type: Optional[torch.dtype] = None, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: assert hidden_states.shape[0] == gating_output.shape[0], ( "Number of tokens mismatch") @@ -865,10 +881,11 @@ def fused_topk( topk, dtype=torch.float32, device=hidden_states.device) - topk_ids = torch.empty(M, - topk, - dtype=torch.int32, - device=hidden_states.device) + topk_ids = torch.empty( + M, + topk, + dtype=torch.int32 if indices_type is None else indices_type, + device=hidden_states.device) token_expert_indices = torch.empty(M, topk, dtype=torch.int32, @@ -962,6 +979,20 @@ def get_config_dtype_str( return None +# TODO (bnell): use scalar_type instead of bools? +def get_config_qtype( + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, +) -> Optional[torch.dtype]: + if use_fp8_w8a8: + return torch.float8_e4m3fn + elif use_int8_w8a8: + return torch.int8 + return None + + def inplace_fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -1128,7 +1159,10 @@ def fused_experts(hidden_states: torch.Tensor, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, allow_deep_gemm: bool = False) -> torch.Tensor: - if (allow_deep_gemm and use_fp8_w8a8 + # For now, disable DeepGemm for small N (<= 512) until better + # permute/unpermute ops are available. 
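# Worked example for the new scale-shape checks on block-quantized weights:
# with block_shape = [block_n, block_k], every [block_n x block_k] tile of
# an expert weight carries one scale, so the scale tensor has ceil-divided
# trailing dims. Shapes below are illustrative.
import math

E, N, K = 8, 4096, 7168       # expert weight shape [E, N, K]
block_n, block_k = 128, 128   # block_shape
scale_shape = (E, math.ceil(N / block_n), math.ceil(K / block_k))
assert scale_shape == (8, 32, 56)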
+ N = w1.shape[1] + if (allow_deep_gemm and use_fp8_w8a8 and N > 512 and _valid_deep_gemm(hidden_states, w1, w2, expert_map)): assert apply_router_weight_on_input is False return deep_gemm_moe_fp8( @@ -1145,6 +1179,7 @@ def fused_experts(hidden_states: torch.Tensor, w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, ) else: return dispatch_fused_experts_func(inplace)( @@ -1171,87 +1206,37 @@ def fused_experts(hidden_states: torch.Tensor, block_shape=block_shape) -def moe_kernel_prepare_input( - A: torch.Tensor, - B: torch.Tensor, - A_scale: Optional[torch.Tensor], - B_scale: Optional[torch.Tensor], - use_fp8_w8a8: bool, - use_int8_w8a8: bool, - use_int8_w8a16: bool, - use_int4_w4a16: bool, - per_channel_quant: bool, +def fused_experts_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, -) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - if use_fp8_w8a8: - assert B_scale is not None - if block_shape is None: - # If weights are per-channel (per_channel_quant=True), then - # activations apply per-token quantization. 
Otherwise, assume - # activation tensor-wise fp8 quantization, dynamic or static - A, A_scale = ops.scaled_fp8_quant( - A, A_scale, use_per_token_if_dynamic=per_channel_quant) - else: - # activation block-wise fp8 quantization - assert len(block_shape) == 2 - _, block_k = block_shape[0], block_shape[1] - A, A_scale = per_token_group_quant_fp8(A, block_k) - assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1] - # assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2] - # assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1] - elif use_int8_w8a8: - assert B_scale is not None - if block_shape is None: - # activation channel-wise int8 quantization - assert (per_channel_quant - ), "int8 quantization only supports block or channel-wise" - A, A_scale = per_token_quant_int8(A) - else: - # activation block-wise int8 quantization - assert len(block_shape) == 2 - _, block_k = block_shape[0], block_shape[1] - A, A_scale = per_token_group_quant_int8(A, block_k) - assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1] - # assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2] - # assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1] - elif use_int8_w8a16 or use_int4_w4a16: - assert B_scale is not None - assert block_shape is None or block_shape[0] == 0 - else: - assert A_scale is None - assert B_scale is None - - return A, A_scale - - -def fused_experts_impl(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None): +) -> torch.Tensor: # Check constraints. 
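# Sketch of the per-tensor vs per-token dynamic fp8 activation scaling the
# quantize-input helper chooses between (per_channel_quant selects the
# per-token variant). Scales are derived from amax; the actual fp8 cast is
# omitted here.
import torch

A = torch.randn(16, 4096)
fp8_max = torch.finfo(torch.float8_e4m3fn).max

per_tensor_scale = A.abs().amax() / fp8_max                     # one scalar
per_token_scale = A.abs().amax(dim=-1, keepdim=True) / fp8_max  # one per row
assert per_token_scale.shape == (16, 1)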
if use_int4_w4a16: assert hidden_states.shape[1] // 2 == w1.shape[ 2], "Hidden size mismatch" else: - assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + assert hidden_states.shape[1] == w1.shape[2], ( + f"Hidden size mismatch {hidden_states.shape[1]} != {w1.shape[2]}") assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" @@ -1261,7 +1246,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, torch.float32, torch.float16, torch.bfloat16 ] - num_tokens, _ = hidden_states.shape + num_tokens = hidden_states.shape[0] E, N, _ = w1.shape K = w2.shape[1] if global_num_experts == -1: @@ -1276,6 +1261,11 @@ def fused_experts_impl(hidden_states: torch.Tensor, use_int4_w4a16=use_int4_w4a16, dtype=hidden_states.dtype) + qtype = get_config_qtype(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16) + get_config_func = functools.partial( try_get_optimal_moe_config, w1.shape, @@ -1338,15 +1328,10 @@ def fused_experts_impl(hidden_states: torch.Tensor, curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] - qcurr_hidden_states, qa1_scale = moe_kernel_prepare_input( + qcurr_hidden_states, a1q_scale = moe_kernel_quantize_input( A=curr_hidden_states, - B=w1, A_scale=a1_scale, - B_scale=w1_scale, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, + qtype=qtype, per_channel_quant=per_channel_quant, block_shape=block_shape) @@ -1357,7 +1342,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, invoke_fused_moe_kernel(qcurr_hidden_states, w1, intermediate_cache1, - qa1_scale, + a1q_scale, w1_scale, w1_zp, curr_topk_weights, @@ -1384,22 +1369,17 @@ def fused_experts_impl(hidden_states: torch.Tensor, else: raise ValueError(f"Unsupported FusedMoe activation: {activation}") - qintermediate_cache2, qa2_scale = moe_kernel_prepare_input( + qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( A=intermediate_cache2, - B=w2, A_scale=a2_scale, - B_scale=w2_scale, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, + qtype=qtype, per_channel_quant=per_channel_quant, block_shape=block_shape) invoke_fused_moe_kernel(qintermediate_cache2, w2, intermediate_cache3, - qa2_scale, + a2q_scale, w2_scale, w2_zp, curr_topk_weights, @@ -1534,3 +1514,209 @@ def fused_moe( a1_scale=a1_scale, a2_scale=a2_scale, block_shape=block_shape) + + +class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__( + self, + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + per_channel_quant: bool, + block_shape: Optional[list[int]] = None, + block_m: Optional[int] = None, + ): + super().__init__() + self.use_fp8_w8a8 = use_fp8_w8a8 + self.use_int4_w4a16 = use_int4_w4a16 + self.use_int8_w8a8 = use_int8_w8a8 + self.use_int8_w8a16 = use_int8_w8a16 + self.block_shape = block_shape + self.block_m = block_m + self.qtype = get_config_qtype(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16) + self.per_channel_quant = per_channel_quant + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + factor = num_experts if a.dim() == 3 else 1 + 
workspace1 = M * topk * max(N * 2, K) * factor + workspace2 = M * topk * N * factor + return (workspace1, workspace2, a.dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + # Check constraints. + if self.use_int4_w4a16: + assert hidden_states.size(-1) // 2 == w1.size(2), ( + "Hidden size mismatch") + else: + assert hidden_states.size(-1) == w1.size(2), \ + (f"Hidden size mismatch {hidden_states.size(-1)} " + f"!= {w1.size(2)}") + + assert hidden_states.is_contiguous( + ), "Hidden_states must be contiguous" + assert hidden_states.dim() == 2 + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn + ] + + E, num_tokens, N, K, top_k_num = mk._moe_problem_size( + hidden_states, w1, w2, topk_ids) + + if global_num_experts == -1: + global_num_experts = E + + config_dtype = get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + dtype=hidden_states.dtype) + + config = try_get_optimal_moe_config( + w1.shape, + w2.shape, + top_k_num, + config_dtype, + num_tokens, + block_shape=self.block_shape, + ) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + elif hidden_states.dtype == torch.float8_e4m3fn: + compute_type = tl.bfloat16 + else: + raise ValueError( + f"Unsupported compute_type: {hidden_states.dtype}") + + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + intermediate_cache1 = _resize_cache(workspace13, + (num_tokens, top_k_num, N)) + intermediate_cache2 = _resize_cache(workspace2, + (num_tokens * top_k_num, N // 2)) + intermediate_cache3 = _resize_cache(workspace13, + (num_tokens, top_k_num, K)) + + sorted_token_ids, expert_ids, num_tokens_post_padded = ( + moe_align_block_size(topk_ids, config['BLOCK_SIZE_M'], + global_num_experts, expert_map)) + + invoke_fused_moe_kernel(hidden_states, + w1, + intermediate_cache1, + a1q_scale, + w1_scale, + w1_zp, + None, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + False, + top_k_num, + config, + compute_type=compute_type, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a8=self.use_int8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + per_channel_quant=self.per_channel_quant, + block_shape=self.block_shape) + + self.activation(activation, intermediate_cache2, + intermediate_cache1.view(-1, N)) + + a2q_scale: Optional[torch.Tensor] = None + + qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( + intermediate_cache2, a2_scale, self.qtype, self.per_channel_quant, + self.block_shape) + + invoke_fused_moe_kernel(qintermediate_cache2, + w2, + intermediate_cache3, + a2q_scale, + w2_scale, + w2_zp, + None, + sorted_token_ids, + expert_ids, + 
num_tokens_post_padded, + False, + 1, + config, + compute_type=compute_type, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a8=self.use_int8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + per_channel_quant=self.per_channel_quant, + block_shape=self.block_shape) + + return intermediate_cache3 + + +def modular_triton_fused_moe( + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + per_channel_quant: bool, + block_shape: Optional[list[int]] = None, +) -> mk.FusedMoEModularKernel: + qtype = get_config_qtype( + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + ) + return mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP( + quant_dtype=qtype, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + ), + TritonExperts( + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + ), + ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 14f360e3bbf3..f1cb77f64eae 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,15 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 +import importlib +import threading from abc import abstractmethod +from dataclasses import dataclass from enum import Enum from typing import Callable, Optional +from weakref import WeakValueDictionary import torch import torch.nn.functional as F from torch.nn.parameter import UninitializedParameter import vllm.envs as envs -from vllm.config import get_current_vllm_config +from vllm.config import ParallelConfig, get_current_vllm_config from vllm.distributed import (get_dp_group, get_ep_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -26,10 +30,20 @@ from vllm.platforms.interface import CpuArchEnum from vllm.utils import direct_register_custom_op +has_pplx = importlib.util.find_spec("pplx_kernels") is not None + if current_platform.is_cuda_alike(): - from .fused_moe import fused_experts + from .fused_batched_moe import (BatchedPrepareAndFinalize, + BatchedTritonExperts) + from .fused_moe import TritonExperts, fused_experts + from .modular_kernel import (FusedMoEModularKernel, + FusedMoEPermuteExpertsUnpermute, + FusedMoEPrepareAndFinalize) + if has_pplx: + from .pplx_prepare_finalize import PplxPrepareAndFinalize else: fused_experts = None # type: ignore + FusedMoEPrepareAndFinalize = None # type: ignore if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_biased_group_topk as grouped_topk) @@ -42,6 +56,179 @@ fused_moe_pallas = None # type: ignore logger = init_logger(__name__) +# Note: this limit is somewhat arbitrary and might be changed later. +# The size of the activations will be E x MOE_DP_CHUNK_SIZE x hidden_dim. +MOE_DP_CHUNK_SIZE = 256 + + +@dataclass +class FusedMoEParallelConfig: + tp_size: int + dp_size: int + ep_size: int + tp_rank: int + dp_rank: int + ep_rank: int + + use_ep: bool # whether to use EP or not + + @property + def use_pplx_kernels(self): + return self.dp_size > 1 and self.use_ep and has_pplx + + @staticmethod + def make(tp_size_: int, dp_size_: int, + vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig": + """ + Determine MoE parallel configuration. 
Based on the input tp_size_, + dp_size_, ep_size_ and vllm's parallel config, determine what + level's of parallelism to use in the fused moe layer. + + Args: + tp_size_ (int): tp_size passed into the FusedMoE constructor. + dp_size_ (int): dp_size passed into the FusedMoE constructor. + ep_size_ (int): ep_size passed into the FusedMoE constructor. + vllm_parallel_config (ParallelConfig): vllm's parallel config + object. + + Examples: + When there is no parallelism requested, i.e. tp_size_ = dp_size_ = 1, + we simply return the sizes unaltered and the ranks set to 0. + + Expert Parallelism is considered only when either dp_size_ or tp_size_ + is non trivial. + + When TP = 2, DP = 1 and EP = False, the configuration on different + devices, + - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // + legend : {size, rank} + - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0} + - Comment : Tensors are sharded across 2 devices. + + When TP = 1, DP = 2 and EP = False, the configuration on different + devices, + - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0} + - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0} + - Comment: There are 2 engine instances and the tensors are sharded + across 2 decvices. + + When TP = 2, DP = 2 and EP = False, the configuration on different + devices, + - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0} + - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0} + - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0} + - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0} + - Comment: There are 2 engine instances and the tensors are sharded + across 4 devices. + + When, TP = 2, DP = 1 and EP = True, the configuration on different + devices, + - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0} + - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1} + - Comment: The experts are split between the 2 devices. + + When, TP = 1, DP = 2 and EP = True, the configuration on different + devices, + - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0} + - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1} + - Comment: There are 2 engine instances and the experts are split + between the 2 devices. + + When TP = 2, DP = 2 and EP = True, the configuration on different + devices, + - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0} + - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1} + - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2} + - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3} + - Comment: There are 2 engine instances and the experts are split + between the 4 devices. + """ + + def flatten_tp_across_dp(dp_rank: int): + tp_rank = 0 if tp_size_ == 1 else get_tensor_model_parallel_rank() + # There are actually dp_size_ * tp_size_ devices. Update tp_size + # and tp_rank so we shard across all devices. + tp_size = dp_size_ * tp_size_ + tp_rank = dp_rank * tp_size_ + tp_rank + return tp_size, tp_rank + + use_ep = (dp_size_ * tp_size_ > 1 + and vllm_parallel_config.enable_expert_parallel) + + dp_size = dp_size_ + dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0 + tp_size, tp_rank = flatten_tp_across_dp(dp_rank) + + if not use_ep: + return FusedMoEParallelConfig(tp_size=tp_size, + tp_rank=tp_rank, + dp_size=dp_size, + dp_rank=dp_rank, + ep_size=1, + ep_rank=0, + use_ep=False) + # DP + EP / TP + EP / DP + TP + EP + assert use_ep + # In EP, each device owns a set of experts fully. There is no tensor + # parallel update tp_size, tp_rank, ep_size and ep_rank to reflect that. 
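# Worked example of how TP is flattened across DP and, once EP is enabled,
# re-labelled as EP: with tp_size_=2 and dp_size_=2 there are 4 devices,
# each device's flattened tp_rank becomes its ep_rank, and its local TP
# collapses to (1, 0), matching the docstring examples above.
tp_size_, dp_size_ = 2, 2
for dp_rank in range(dp_size_):
    for local_tp_rank in range(tp_size_):
        tp_size = dp_size_ * tp_size_                  # 4
        tp_rank = dp_rank * tp_size_ + local_tp_rank   # 0, 1, 2, 3
        ep_size, ep_rank = tp_size, tp_rank            # when use_ep is True
        print(f"TP=(1, 0)  DP=({dp_size_}, {dp_rank})  "
              f"EP=({ep_size}, {ep_rank})")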
+ ep_size = tp_size + ep_rank = tp_rank + return FusedMoEParallelConfig(tp_size=1, + tp_rank=0, + dp_size=dp_size, + dp_rank=dp_rank, + ep_size=ep_size, + ep_rank=ep_rank, + use_ep=True) + + +# Adapted from pplx-kernels tests/all_to_all_utils.py +@dataclass +class MoEConfig: + num_experts: int + experts_per_token: int + hidden_dim: int + + num_local_experts: int + moe_parallel_config: FusedMoEParallelConfig + + in_dtype: torch.dtype # The activation type. + + # TODO: add more quantization params, blocked, per-token, etc. + block_size: int = 128 + + @property + def tp_size(self): + return self.moe_parallel_config.tp_size + + @property + def dp_size(self): + return self.moe_parallel_config.dp_size + + @property + def ep_size(self): + return self.moe_parallel_config.ep_size + + @property + def tp_rank(self): + return self.moe_parallel_config.tp_rank + + @property + def dp_rank(self): + return self.moe_parallel_config.dp_rank + + @property + def ep_rank(self): + return self.moe_parallel_config.ep_rank + + @property + def use_ep(self): + return self.moe_parallel_config.use_ep + + @property + def use_pplx_kernels(self): + return self.moe_parallel_config.use_pplx_kernels + class FusedMoeWeightScaleSupported(Enum): TENSOR = "tensor" @@ -58,6 +245,14 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, params_dtype: torch.dtype, **extra_weight_attrs): raise NotImplementedError + def set_prepare_finalize( + self, + dp_size: int, + world_size: int, + prepare_finalize: FusedMoEPrepareAndFinalize, + ) -> bool: + return False + @abstractmethod def apply( self, @@ -80,12 +275,54 @@ def apply( raise NotImplementedError +class AllToAllCache: + + def __init__(self): + self._cache: WeakValueDictionary = WeakValueDictionary() + self._lock = threading.RLock() # Reentrant lock for thread safety + + def destroy(self): + with self._lock: + # TODO: can we do del self._cache? + for _, a2a in self._cache.items(): + a2a.destroy() + + def get_or_create(self, **kwargs): + assert has_pplx + import pplx_kernels as pplx + + # Create a hashable key from the kwargs + key = tuple(sorted((k, v) for k, v in kwargs.items())) + + with self._lock: + instance = self._cache.get(key) + if instance is None: + # TODO (varun): Add support to switch to intranode + # when all communications are within the same + # node. 
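# Minimal sketch of the keyed-cache pattern used for the all-to-all
# handles: kwargs become a hashable key and instances live in a
# WeakValueDictionary, so they are reclaimed once no layer holds them.
# ToyHandle stands in for the real communicator object.
import threading
from weakref import WeakValueDictionary


class ToyHandle:

    def __init__(self, **kwargs):
        self.kwargs = kwargs


_cache: WeakValueDictionary = WeakValueDictionary()
_lock = threading.RLock()


def get_or_create(**kwargs) -> ToyHandle:
    key = tuple(sorted(kwargs.items()))
    with _lock:
        handle = _cache.get(key)
        if handle is None:
            handle = ToyHandle(**kwargs)
            _cache[key] = handle
        return handle


h1 = get_or_create(world_size=4, hidden_dim=4096)
h2 = get_or_create(world_size=4, hidden_dim=4096)
assert h1 is h2  # same kwargs, same cached handle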
+ logger.debug("Create AllToAll %s", kwargs) + instance = pplx.AllToAll.internode(**kwargs) + self._cache[key] = instance + return instance + + +# Global singleton +_all_to_all_cache = AllToAllCache() + + +# Factory function as a cleaner interface +def get_all_to_all(**kwargs): + return _all_to_all_cache.get_or_create(**kwargs) + + @CustomOp.register("unquantized_fused_moe") class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" - def __init__(self): + def __init__(self, moe: MoEConfig): super().__init__() + self.fused_experts = fused_experts + self.moe = moe self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() if self.rocm_aiter_moe_enabled: @@ -193,6 +430,47 @@ def apply( activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) + def set_prepare_finalize( + self, + dp_size: int, + world_size: int, + prepare_finalize: FusedMoEPrepareAndFinalize, + ) -> bool: + assert self.fused_experts == fused_experts + + experts: Optional[FusedMoEPermuteExpertsUnpermute] = None + + if isinstance(prepare_finalize, + (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)): + logger.debug("BatchedTritonExperts %s", self.moe) + experts = BatchedTritonExperts( + max_num_tokens=MOE_DP_CHUNK_SIZE, + world_size=world_size, + dp_size=dp_size, + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + ) + else: + logger.debug("TritonExperts %s", self.moe) + experts = TritonExperts( + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + per_channel_quant=False, + ) + + self.fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + return True + def forward_cuda( self, layer: torch.nn.Module, @@ -221,9 +499,11 @@ def forward_cuda( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=torch.uint32 if self.moe.use_pplx_kernels else None) if self.rocm_aiter_moe_enabled: + assert expert_map is None return self.rocm_aiter_fused_experts( hidden_states=x, w1=layer.w13_weight, @@ -232,18 +512,19 @@ def forward_cuda( topk_ids=topk_ids, activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) - - return fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map) + else: + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) def forward_cpu( self, @@ -399,6 +680,45 @@ def determine_expert_map( return (local_num_experts, expert_map) +def _construct_prepare_finalize( + moe: MoEConfig, quant_config: Optional[QuantizationConfig] +) -> Optional[FusedMoEPrepareAndFinalize]: + max_num_tokens = MOE_DP_CHUNK_SIZE + world_size = moe.ep_size + dp_size = moe.ep_size // moe.dp_size # dp_size actually means TP. 
+ rank = moe.ep_rank + + if moe.use_pplx_kernels: + logger.debug("using PplxPrepareAndFinalize") + + all_to_all = get_all_to_all( + max_num_tokens=max_num_tokens, + num_experts=moe.num_experts, + experts_per_token=moe.experts_per_token, # topk + rank=rank, + world_size=world_size, + dp_size=dp_size, + hidden_dim=moe.hidden_dim, + hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize, + # For blocked per token: set to + # ceil_div(hidden_dim, block_size) * sizeof(float32) + # For per-token: set to sizeof(float32) + hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else + ((moe.hidden_dim + moe.block_size - 1) // + moe.block_size * torch.float32.itemsize))) + + return PplxPrepareAndFinalize( + all_to_all, + max_num_tokens=max_num_tokens, + world_size=world_size, + rank=rank, + dp_size=dp_size, + quant_dtype=moe.in_dtype, + ) + + return None + + class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. @@ -449,21 +769,16 @@ def __init__( params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype - # Note: here we guard against accessing the TP and DP groups when - # uninitialized (this happens when testing) - self.tp_size = (tp_size if tp_size is not None else - get_tensor_model_parallel_world_size()) - tp_rank = 0 if self.tp_size == 1 else get_tensor_model_parallel_rank() - self.dp_size = (dp_size - if dp_size is not None else get_dp_group().world_size) - self.dp_rank = (0 - if self.dp_size == 1 else get_dp_group().rank_in_group) - self.global_num_experts = num_experts - - # Use expert parallelism instead of tensor parallelism? vllm_config = get_current_vllm_config() - use_ep = (vllm_config.parallel_config.enable_expert_parallel - and self.tp_size * self.dp_size > 1) + self.moe_parallel_config: FusedMoEParallelConfig = ( + FusedMoEParallelConfig.make( + tp_size_=(tp_size if tp_size is not None else + get_tensor_model_parallel_world_size()), + dp_size_=(dp_size if dp_size is not None else + get_dp_group().world_size), + vllm_parallel_config=vllm_config.parallel_config)) + + self.global_num_experts = num_experts # For smuggling this layer into the fused moe custom op self.use_direct_call = self.dp_size == 1 @@ -474,28 +789,17 @@ def __init__( compilation_config.static_forward_context[prefix] = self self.layer_name = prefix - if use_ep: - # Set TP size to 1 to adjust for EP and adjust EP size and rank - # for DP attention. 
- self.ep_rank = tp_rank + self.tp_size * self.dp_rank - self.tp_rank = 0 - self.ep_size = self.tp_size * self.dp_size - self.tp_size = 1 - + # Determine expert maps + if self.use_ep: self.local_num_experts, self.expert_map = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts) else: - # Adjust TP size for DP attention - self.tp_rank = tp_rank + self.tp_size * self.dp_rank - self.ep_rank = 0 - self.tp_size = self.tp_size * self.dp_size - self.ep_size = 1 - self.local_num_experts = self.global_num_experts - self.expert_map = None + self.local_num_experts, self.expert_map = (self.global_num_experts, + None) + self.top_k = top_k - self.global_num_experts = num_experts assert intermediate_size % self.tp_size == 0 self.hidden_size = hidden_size @@ -520,14 +824,40 @@ def __init__( from vllm_hpu_extension.ops import DynamicFusedMOE self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts) + moe = MoEConfig( + num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + num_local_experts=self.local_num_experts, + moe_parallel_config=self.moe_parallel_config, + # TODO (bnell): this needs to be fixed for quantized types. + in_dtype=params_dtype, + ) + # Note: get_quant_method will look at the layer's local_num_experts # for heuristic purposes, so it must be initialized first. + quant_method: Optional[QuantizeMethodBase] = None + if quant_config is None: - self.quant_method: Optional[QuantizeMethodBase] = ( - UnquantizedFusedMoEMethod()) + quant_method = UnquantizedFusedMoEMethod(moe) + prepare_finalize = _construct_prepare_finalize(moe, quant_config) else: - self.quant_method = quant_config.get_quant_method(self, prefix) - assert self.quant_method is not None + quant_method = quant_config.get_quant_method(self, prefix) + # No pplx for quantized types yet. 
+ prepare_finalize = None + + assert quant_method is not None + assert isinstance(quant_method, FusedMoEMethodBase) + self.quant_method = quant_method + + if prepare_finalize is not None: + world_size = moe.ep_size + dp_size = int(moe.ep_size // moe.dp_size) + success = self.quant_method.set_prepare_finalize( + dp_size, world_size, prepare_finalize) + if not success: + logger.warning("DP+EP not supported for %s.", + type(self.quant_method)) moe_quant_params = { "num_experts": self.local_num_experts, @@ -546,6 +876,38 @@ def __init__( self.quant_method.create_weights(layer=self, **moe_quant_params) + @property + def tp_size(self): + return self.moe_parallel_config.tp_size + + @property + def dp_size(self): + return self.moe_parallel_config.dp_size + + @property + def ep_size(self): + return self.moe_parallel_config.ep_size + + @property + def tp_rank(self): + return self.moe_parallel_config.tp_rank + + @property + def dp_rank(self): + return self.moe_parallel_config.dp_rank + + @property + def ep_rank(self): + return self.moe_parallel_config.ep_rank + + @property + def use_ep(self): + return self.moe_parallel_config.use_ep + + @property + def use_pplx_kernels(self): + return self.moe_parallel_config.use_pplx_kernels + def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, loaded_weight: torch.Tensor, @@ -830,7 +1192,8 @@ def select_experts(hidden_states: torch.Tensor, num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None): + e_score_correction_bias: Optional[torch.Tensor] = None, + indices_type: Optional[torch.dtype] = None): from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk # DeekSeekv2 uses grouped_top_k @@ -846,21 +1209,52 @@ def select_experts(hidden_states: torch.Tensor, topk_group=topk_group, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias) + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) elif custom_routing_function is None: topk_weights, topk_ids, token_expert_indices = fused_topk( hidden_states=hidden_states, gating_output=router_logits, topk=top_k, - renormalize=renormalize) + renormalize=renormalize, + indices_type=indices_type, + ) else: topk_weights, topk_ids = custom_routing_function( hidden_states=hidden_states, gating_output=router_logits, topk=top_k, renormalize=renormalize) + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) return topk_weights, topk_ids + def must_reduce_shared_expert_outputs(self) -> bool: + """ + The shared_experts are typically computed using the RowParallelLinear + layer. The result of this function is typically used as + the reduce_results argument to the module. + When just tensor-parallel is used, it is not required to reduce + the shared_experts results immediately. Instead we reduce at the + once at the end of the MoE op. (Refer to DeepSeekV2MoE module) + With EP and the pplx kernels - this is no longer viable as all + GPU ranks in DP, produce the complete set of hidden_states. + Therefore it is required that we reduce the shared_experts output + early. + """ + return self.use_pplx_kernels + + def maybe_all_reduce_tensor_model_parallel( + self, final_hidden_states: torch.Tensor): + """ + The pplx combine kernel reduces across GPU ranks by default. 
+ """ + if self.use_pplx_kernels: + return final_hidden_states + else: + return tensor_model_parallel_all_reduce(final_hidden_states) + def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): if self.use_direct_call: @@ -869,9 +1263,62 @@ def forward(self, hidden_states: torch.Tensor, return torch.ops.vllm.moe_forward(hidden_states, router_logits, self.layer_name) + def forward_impl_chunked(self, full_hidden_states: torch.Tensor, + full_router_logits: torch.Tensor): + + full_final_hidden_states = torch.empty_like(full_hidden_states) + + def process_chunk(chunk_start, chunk_end, skip_result_store=False): + hidden_states = full_hidden_states[chunk_start:chunk_end, :] + router_logits = full_router_logits[chunk_start:chunk_end, :] + + # Matrix multiply. + final_hidden_states = self.quant_method.apply( + layer=self, + x=hidden_states, + router_logits=router_logits, + top_k=self.top_k, + renormalize=self.renormalize, + use_grouped_topk=self.use_grouped_topk, + global_num_experts=self.global_num_experts, + expert_map=self.expert_map, + topk_group=self.topk_group, + num_expert_group=self.num_expert_group, + custom_routing_function=self.custom_routing_function, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, + activation=self.activation, + ) + + if not skip_result_store: + full_final_hidden_states[chunk_start:chunk_end, :].copy_( + final_hidden_states) + + ctx = get_forward_context() + max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu + moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE + + num_tokens = full_hidden_states.size(0) + for chunk_start_ in range(0, max_tokens_across_dp, + moe_dp_chunk_size_per_rank): + chunk_start = chunk_start_ + chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank, + max_tokens_across_dp) + # clamp start and end + chunk_start = min(chunk_start, num_tokens - 1) + chunk_end = min(chunk_end, num_tokens) + + process_chunk(chunk_start, + chunk_end, + skip_result_store=chunk_start_ >= num_tokens) + + return full_final_hidden_states + def forward_impl(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): assert self.quant_method is not None + if self.moe_parallel_config.use_pplx_kernels: + return self.forward_impl_chunked(hidden_states, router_logits) if self.dp_size > 1: hidden_states, router_logits = get_ep_group().dispatch( diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py new file mode 100644 index 000000000000..7d3ddf8f14c4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -0,0 +1,364 @@ +# SPDX-License-Identifier: Apache-2.0 +from abc import ABC, abstractmethod +from typing import Optional + +import torch + +# +# This file defines a set of base classes used to make MoE kernels more modular. +# The goal is to be able to utilize different communication mechanisms with +# any fused MoE kernel without needing to have combinatoric implementations. +# +# The fused moe kernels are broken down into the following components: +# +# [Router] → [Quantize-Dispatch] → [Permute-Experts-Unpermute] → [Combine] +# +# Each component will be independent of the others except for +# [Quantize-Dispatch] and `[Combine] (see below). The components can then be +# mixed and matched with so that DP+EP can be supported easily for multiple +# MoE kernel implementations. +# +# The following main classes are defined: +# * FusedMoEPrepareAndFinalize - an abstract base class for preparation of MoE +# inputs (e.g. 
quantization, distribution) and finalization of Moe outputs. +# The prepare method must take care of any needed quantization and the +# finalize method must apply weights and do the final reduction of the output. +# * FusedMoEPermuteExpertsUnpermute - an abstract base class for the main fused +# MoE operation. One important feature to note is that this class does not +# apply topk weights or reduce the final output. +# * FusedMoEModularKernel - an interface class that combines a +# FusedMoEPrepareAndFinalize and a FusedMoEPermuteExpertsUnpermute to +# provide the standard fused MoE kernel interface. +# +# [Quantize-Prepare] and [Finalize] functionality are bundled into a single +# class `FusedMoEPrepareAndFinalize` since they could use collective +# communication mechanisms that need to be consistent. +# + + +def _moe_problem_size( + a1: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, +) -> tuple[int, int, int, int, int]: + """ + Extract the MoE problem size from the given tensor arguments: + - a: The hidden states, input to the MoE layer. + - w1: The first set of expert weights. + - w2: The second set of expert weights. + - topk_ids: The topk ids. + + Note: extracting the problem shape from the weight and activation tensors is + not obvious. It needs to be done this way specifically due to subtle issues + with particular kernels, e.g. the int4 kernels divide the trailing dimension + by two, so it's not "correct" to extract N or K from the trailing dimension + of w1 or w2. Similarly, some kernels transpose the weights, so this needs + to be kept in mind. + """ + assert w1.dim() == 3 and w2.dim() == 3 + E, N, _ = w1.size() + K = w2.size(1) + + if a1.dim() == 2: + # Make sure we are using the correct a1 (pre-permute). + assert topk_ids.size(0) == a1.size(0), \ + f"{topk_ids.size(0)} != {a1.size(0)}" + M = a1.size(0) + else: + assert a1.dim() == 3 + assert a1.size(0) == E, f"{a1.size(0)} == {E}" + M = a1.size(1) # This is max_num_tokens + + assert topk_ids.dim() == 2 + topk = topk_ids.size(1) + + return E, M, N, K, topk + + +class FusedMoEPrepareAndFinalize(ABC): + """ + An abstract base class for the [Quantize-Prepare] and [Finalize] steps + described above. + """ + + @abstractmethod + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform any quantization (and/or) dispatching needed + for this kernel. + - a1: The (unquantized) input to the MoE layer. + - a1_scale: Optional scales for a1 + - a2_scale: Optional scales for the second MoE gemm. Required to make + sure the quantization is consistent for both gemms. + - topk_ids: The topk ids. + - topk_weights: The topk weights. + - num_experts: The total number of experts in the global expert space. + - expert_map: A tensor mapping expert indices from the global expert + space to the local expert space of the expert parallel shard. + - apply_router_weight_on_input: When True, apply the weights to the + activations, before quantization + dispatching. + + Returns a tuple of: + - quantized + dispatched a. + - quantized + dispatched a1_scales. 
+ """ + raise NotImplementedError + + @abstractmethod + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> None: + """ + Perform any combine plus apply weights and perform a reduction on the + fused experts output. + - output: The output tensor, written in place. Must be (M, K) shape. + - fused_expert_output: The unweighted, unreduced output of the fused + experts, it will have (M, topk, K) shape. + - topk_weights: The weights to be applied to the fused_experts_output. + - topk_ids: The topk_ids. + - apply_router_weight_on_input: When False, apply the weights to + fused_expert_output. + """ + raise NotImplementedError + + +class FusedMoEPermuteExpertsUnpermute(ABC): + """ + An abstract base class for the [Permute-Experts-Unpermute] step described + above. + """ + + @abstractmethod + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + """ + Compute the number of elements for the temporary outputs of the two + gemms and activation in the fused expert function. Since the + gemms are independent, the workspace for the first gemm can be shared + with the workspace for the last gemm. + + Returns a tuple of: + - Number of workspace13 elements: must be large enough to hold the + result of either expert gemm. + - Number of workspace2 elements: must be large enough to hold the + result of the activation function. + - Workspace type: The dtype to use for the workspace tensors. + """ + raise NotImplementedError + + def activation(self, activation: str, output: torch.Tensor, + input: torch.Tensor) -> None: + assert output.size(-1) * 2 == input.size(-1) + if activation == "silu": + torch.ops._C.silu_and_mul(output, input) + elif activation == "gelu": + torch.ops._C.gelu_and_mul(output, input) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + @abstractmethod + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + """ + This function computes the intermediate result of a Mixture of Experts + (MoE) layer using two sets of weights, w1 and w2. + + Parameters: + - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE + layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - topk_ids (torch.Tensor): A map of row to expert id. + - activation (str): The activation function to apply after the first + MoE layer. + - global_num_experts (int): The total number of experts in the global + expert space. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for w2. + - w1_zp (Optional[torch.Tensor]): Optional zero points to be used for + w1. 
+ - w2_zp (Optional[torch.Tensor]): Optional zero points to be used for + w2. + - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be + used for a1. + - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2. + - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs + must be large enough to hold output of either MoE gemm. + - workspace2 (torch.Tensor): A scratch tensor used for the activation + function. + - expert_num_tokens: An optional tensor containing the number of tokens + assigned to each expert when using batched experts format input. + + Returns: + - torch.Tensor: The unweighted, unreduced output tensor + """ + raise NotImplementedError + + +class FusedMoEModularKernel(torch.nn.Module): + """ + This class combines a FusedMoEPrepareAndFinalize instance and + a FusedMoEPermuteExpertsUnpermute to provide an interface that + is compatible with the `fused_experts` function in fused_moe.py. + + It takes care of managing any required scratch space. + + Note: Instances of this class should only be used for a single model + layer due to any layer specific state that may be used by the component + objects. + """ + + def __init__( + self, + prepare_finalize: FusedMoEPrepareAndFinalize, + fused_experts: FusedMoEPermuteExpertsUnpermute, + ): + super().__init__() + self.prepare_finalize = prepare_finalize + self.fused_experts = fused_experts + + def forward( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + ) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets + of weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states: (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - topk_weights (torch.Tensor): The topk weights applied at the end of + the layer. + - topk_ids (torch.Tensor): A map of row to expert id. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - activation (str): The activation function to apply after the first + MoE layer. + - global_num_experts (int): The total number of experts in the global + expert space. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for w2. + - w1_zp (Optional[torch.Tensor]): Optional zero points to be used for + w1. + - w2_zp (Optional[torch.Tensor]): Optional zero points to be used for + w2. + - a1_scale (Optional[torch.Tensor]): Optional scale to be used for a1. + - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2. + - apply_router_weight_on_input (bool): When true, the topk weights are + applied directly on the inputs. This is only applicable when topk is + 1. 
+
+        Returns:
+        - torch.Tensor: The output tensor after applying the MoE layer.
+        """
+        a1 = hidden_states
+        E, M, N, K, top_k = _moe_problem_size(a1, w1, w2, topk_ids)
+
+        if global_num_experts == -1:
+            global_num_experts = E
+
+        output = a1 if inplace else torch.zeros_like(a1)
+
+        workspace13_shape, workspace2_shape, workspace_dtype = (
+            self.fused_experts.workspace_shapes(a1, M, N, K, top_k,
+                                                global_num_experts))
+
+        # We can reuse the memory between cache1 and cache3 because by the time
+        # we need cache3, we're done with cache1
+        workspace13 = torch.zeros(workspace13_shape,
+                                  device=a1.device,
+                                  dtype=workspace_dtype)
+        workspace2 = torch.zeros(workspace2_shape,
+                                 device=a1.device,
+                                 dtype=workspace_dtype)
+
+        a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare(
+            a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts,
+            expert_map, apply_router_weight_on_input)
+
+        fused_out = self.fused_experts.apply(
+            a1q,
+            w1,
+            w2,
+            topk_ids,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            w1_zp=w1_zp,
+            w2_zp=w2_zp,
+            a1q_scale=a1q_scale,
+            a2_scale=a2_scale,
+            workspace13=workspace13,
+            workspace2=workspace2,
+            expert_num_tokens=expert_num_tokens,
+        )
+
+        self.prepare_finalize.finalize(output, fused_out, topk_weights,
+                                       topk_ids, apply_router_weight_on_input)
+
+        return output
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index 90cb04084809..270e7cf1298a 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -3,6 +3,74 @@
 import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
+    moe_align_block_size)
+from vllm.model_executor.layers.fused_moe.utils import _fp8_perm
+
+
+def _moe_permute(
+    curr_hidden_states: torch.Tensor,
+    a1q_scale: Optional[torch.Tensor],
+    curr_topk_ids: torch.Tensor,
+    global_num_experts: int,
+    expert_map: Optional[torch.Tensor],
+    block_m: int,
+) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
+           Optional[torch.Tensor]]:
+    """
+    Determine the sorted_token_ids, expert_ids for the given problem size.
+    Permute the hidden states and scales according to `sorted_token_ids`.
+    """
+    top_k_num = curr_topk_ids.size(1)
+
+    tokens_in_chunk = curr_hidden_states.size(0)
+
+    sorted_token_ids, expert_ids, num_tokens_post_padded = (
+        moe_align_block_size(curr_topk_ids,
+                             block_m,
+                             global_num_experts,
+                             expert_map,
+                             pad_sorted_ids=True))
+
+    inv_perm: Optional[torch.Tensor] = None
+
+    num_tokens = top_k_num * tokens_in_chunk
+    sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
+    expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0)
+    inv_perm = torch.argsort(sorted_token_ids)[:num_tokens]
+
+    # Permute according to sorted token ids.
+ curr_hidden_states = _fp8_perm(curr_hidden_states, + sorted_token_ids // top_k_num) + + if a1q_scale is not None: + a1q_scale = a1q_scale[sorted_token_ids // top_k_num] + + return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids, + inv_perm) + + +def _moe_unpermute_and_reduce( + out: torch.Tensor, + curr_hidden: torch.Tensor, + inv_perm: Optional[torch.Tensor], + topk_weight: torch.Tensor, + apply_router_weight_on_input: bool, +) -> None: + """ + Unpermute the final result and apply topk_weights, then perform the final + reduction on the hidden states. + """ + M, topk = topk_weight.size() + K = curr_hidden.size(-1) + if inv_perm is not None: + curr_hidden = curr_hidden[inv_perm, ...] + curr_hidden = curr_hidden.view(-1, topk, K) + if not apply_router_weight_on_input: + curr_hidden.mul_(topk_weight.view(M, -1, 1)) + ops.moe_sum(curr_hidden, out) + def moe_permute( hidden_states: torch.Tensor, @@ -17,21 +85,21 @@ def moe_permute( fill_invalid_expert: int = -1 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - This function expands and permutes activation to gather uncontinuous tokens + This function expands and permutes activation to gather uncontinuous tokens for each expert. Parameters: - - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - hidden_states (torch.Tensor): The input tensor to the MoE layer. - topk_weights (torch.Tensor): topk expert route weight for each token. - topk_ids (torch.Tensor): topk expert route id for each token. - token_expert_indices (torch.Tensor): indice for expanded hidden. - topk (int): The number of top-k experts to select. - n_expert (int): The number of expert. - n_local_expert (int): The number of expert in current EP rank. - - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - from the global expert space to the local expert space of the expert + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert parallel shard. - align_block_size (Optional[int]): align group gemm block size for deepgemm - - fill_invalid_expert(int): fill expert id in m_indices for invalid expert + - fill_invalid_expert(int): fill expert id in m_indices for invalid expert to workaround DeepGemm unsupported -1 in m_indices Returns: - permuted_hidden_states (torch.Tensor): permuted activation. @@ -39,10 +107,10 @@ def moe_permute( of each expert for standard grouped gemm. if enable 'align_block_size' expert_first_token_offset will align up to 'align_block_size'. - src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute. - - m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records + - m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records the group which the j-th row of the LHS belong to.` """ - n_token, n_hidden = hidden_states.shape + n_token, n_hidden = hidden_states.size() assert (n_hidden * hidden_states.element_size() ) % 16 == 0, "permue kernel need hidden dim align to 16B" permuted_row_size = n_token * topk @@ -87,7 +155,7 @@ def moe_unpermute( n_local_expert: int, ) -> torch.Tensor: """ - This function expands and permutes activation to gathering uncontinuous + This function expands and permutes activation to gathering uncontinuous tokens for each expert. Parameters: - permuted_hidden_states (torch.Tensor): permuted activation. @@ -99,10 +167,10 @@ def moe_unpermute( - n_expert (int): The number of expert. - n_local_expert (int): The number of expert in current EP rank. 
Returns: - - hidden_states (torch.Tensor): The reduced and unpermuted activation - tensor. + - hidden_states (torch.Tensor): The reduced and unpermuted activation + tensor. """ - n_token, n_hidden = topk_weights.shape[0], permuted_hidden_states.shape[-1] + n_token, n_hidden = topk_weights.size(0), permuted_hidden_states.size(-1) assert (n_hidden * permuted_hidden_states.element_size() ) % 16 == 0, "unpermue kernel need hidden dim align to 16B" hidden_states = torch.empty((n_token, n_hidden), diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py new file mode 100644 index 000000000000..b1126b94e45a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import pplx_kernels as pplx +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) + + +# Note use: layer.get_all_to_all() to get an AllToAll instance +# The max_num_tokens, world_size and dp_size must be the same +# as the ones used to create the AllToAll. +class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + + def __init__(self, + a2a: pplx.AllToAll, + max_num_tokens: int, + world_size: int, + rank: int, + dp_size: int, + quant_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + super().__init__() + assert max_num_tokens > 0 + self.a2a = a2a + self.block_shape = block_shape + self.max_num_tokens = max_num_tokens + self.world_size = world_size + self.rank = rank + self.dp_size = dp_size + self.quant_dtype = quant_dtype + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + rank_topk_weights: torch.Tensor, + rank_topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + num_tokens = a1.size(0) # M + hidden_dim = a1.size(-1) # K + + assert rank_topk_ids.size(0) == num_tokens + # assert expert_map is None, "NYI" + + # Is this always going to be a1.device? + device = a1.device + + if apply_router_weight_on_input: + topk = rank_topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1") + a1 = a1 * rank_topk_weights.to(a1.dtype) + + per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( + a2_scale.numel() != 1 if a2_scale is not None else False) + + a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale, + self.quant_dtype, + per_act_token, + self.block_shape) + + # rem_experts need to be 0 for pplx to work properly. 
+ rem_experts = num_experts % self.world_size + assert rem_experts == 0 + num_local_experts = ((num_experts // self.world_size) + + (1 if self.rank < rem_experts else 0)) + + expert_num_tokens = torch.empty( + num_local_experts, + dtype=torch.int32, + device=device, + ) + + num_dp = self.world_size // self.dp_size + expert_x = torch.empty( + (num_local_experts, self.max_num_tokens * num_dp, hidden_dim), + dtype=a1q.dtype, + device=device, + ) + + expert_x_scale: Optional[torch.Tensor] = None + if a1q.dtype.itemsize == 1: + float32_size = torch.float32.itemsize + block_size = (self.block_shape[0] if self.block_shape is not None + else 1) * float32_size + expert_x_scale = torch.empty( + ( + num_experts, + expert_x.size(1), + (expert_x.size(2) + block_size - 1) // block_size, + ), + dtype=torch.float32, + device=device, + ) + + # This argument is optional, defaults to indices.size(0) + # There's not much point setting this unless it is != indices.size(0) + bound_m: Optional[torch.Tensor] = None + + self.a2a.dispatch( + out_expert_num_tokens=expert_num_tokens, + out_expert_x=expert_x, + out_expert_x_scale=expert_x_scale, + dp_x=a1q, + dp_x_scale=a1q_scale, + indices=rank_topk_ids, + bound_m=bound_m, + ) + + return expert_x, expert_x_scale, expert_num_tokens + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> None: + num_tokens = output.size(0) # M + # This argument is optional + # There's not much point setting this unless it is != topk_ids.size(0) + bound_m: Optional[torch.Tensor] = None + + assert topk_ids.size(0) == num_tokens, ( + f"{topk_ids.size(0)} == {num_tokens}") + assert output.size(0) <= self.max_num_tokens, ( + f"{output.size(0)} <= {self.max_num_tokens}") + assert output.size(1) == fused_expert_output.size(-1) + + # Set weights to 1 if we did them in dispatch. This is hacky. 
+ if apply_router_weight_on_input: + topk_weights = torch.ones_like(topk_weights) + + self.a2a.combine(out_tokens=output, + indices=topk_ids, + weights=topk_weights, + expert_y=fused_expert_output, + bound_m=bound_m) diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py new file mode 100644 index 000000000000..98f98b3bd20b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + _moe_unpermute_and_reduce) +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) + + +class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): + + def __init__( + self, + quant_dtype: Optional[torch.dtype] = None, + per_channel_quant: bool = False, + block_shape: Optional[list[int]] = None, + ): + super().__init__() + self.per_channel_quant = per_channel_quant + self.block_shape = block_shape + self.quant_dtype = quant_dtype + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + if apply_router_weight_on_input: + topk = topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, \ + "apply_router_weight_on_input is only implemented for topk=1" + a1.mul_(topk_weights.to(a1.dtype)) + + a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale, + self.quant_dtype, + self.per_channel_quant, + self.block_shape) + + return a1q, a1q_scale, None + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> None: + _moe_unpermute_and_reduce(output, fused_expert_output, None, + topk_weights, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py new file mode 100644 index 000000000000..2cfe373140bb --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape) +from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts + + +class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__(self, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + per_channel_quant: bool = False, + block_shape: Optional[list[int]] = None, + block_m: Optional[int] = None, + allow_deep_gemm: bool = False): + super().__init__() + self.triton_expert = TritonExperts(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a16=use_int8_w8a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + block_m=block_m) + 
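The rank arithmetic around the pplx dispatch above is easy to misread because the dp_size handed to the pplx kernels is really the TP degree inside each attention-DP group (see the "dp_size actually means TP" comment earlier in this patch). A small worked example, with every size assumed purely for illustration:

# Worked example (all sizes are assumptions) of the receive-buffer shape
# allocated in PplxPrepareAndFinalize.prepare() and filled by a2a.dispatch().
max_num_tokens = 256               # MOE_DP_CHUNK_SIZE (assumed value)
hidden_dim = 4096
num_experts = 64
ep_size = 8                        # moe.ep_size == world_size for the AllToAll
attn_dp_size = 4                   # moe.dp_size
dp_size = ep_size // attn_dp_size  # == 2; the "dp_size" the pplx kernels see

world_size = ep_size
num_local_experts = num_experts // world_size  # 8 experts owned per EP rank
num_dp = world_size // dp_size                 # 4 attention-DP groups

expert_x_shape = (num_local_experts, max_num_tokens * num_dp, hidden_dim)
print(expert_x_shape)                          # (8, 1024, 4096)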
self.deep_gemm_expert = DeepGemmExperts() + self.allow_deep_gemm = allow_deep_gemm + self.use_fp8_w8a8 = use_fp8_w8a8 + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + # Note: the deep gemm workspaces are strictly larger than the triton + # workspaces so we can be pessimistic here and allocate for DeepGemm + # even if we fall back to triton later, e.g. if expert maps are set. + if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K): + return self.deep_gemm_expert.workspace_shapes( + a, M, N, K, topk, num_experts) + else: + return self.triton_expert.workspace_shapes(a, M, N, K, topk, + num_experts) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + N = w1.size(1) + if (self.allow_deep_gemm and self.use_fp8_w8a8 and N > 512 + and _valid_deep_gemm(hidden_states, w1, w2, expert_map)): + return self.deep_gemm_expert.apply( + hidden_states, + w1, + w2, + topk_ids, + activation, + global_num_experts, + expert_map, + w1_scale, + w2_scale, + w1_zp, + w2_zp, + a1q_scale, + a2_scale, + workspace13, + workspace2, + expert_num_tokens, + ) + else: + return self.triton_expert.apply( + hidden_states, + w1, + w2, + topk_ids, + activation, + global_num_experts, + expert_map, + w1_scale, + w2_scale, + w1_zp, + w2_zp, + a1q_scale, + a2_scale, + workspace13, + workspace2, + expert_num_tokens, + ) diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 1acbba2056b0..d9d2520e18b3 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -7,6 +7,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) +from vllm.model_executor.layers.quantization.utils.int8_utils import ( + per_token_group_quant_int8, per_token_quant_int8) from vllm.utils import cdiv @@ -15,34 +17,81 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor: Shrink the given tensor and apply the given view to it. This is used to resize the intermediate fused_moe caches. """ - assert prod(v) <= x.numel() + assert prod( + v) <= x.numel(), f"{prod(v)} <= {x.numel()}" # CUDAGRAPH unfriendly? return x.flatten()[:prod(v)].view(*v) def _fp8_quantize( A: torch.Tensor, A_scale: Optional[torch.Tensor], - block_shape: Optional[list[int]], + per_act_token: bool, + block_shape: Optional[list[int]] = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Perform fp8 quantization on the inputs. If a block_shape is provided, the output will be blocked. 
""" if block_shape is None: - A, A_scale = ops.scaled_fp8_quant(A, A_scale) + A, A_scale = ops.scaled_fp8_quant( + A, A_scale, use_per_token_if_dynamic=per_act_token) else: assert len(block_shape) == 2 _, block_k = block_shape[0], block_shape[1] A, A_scale = per_token_group_quant_fp8(A, block_k) - assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + assert cdiv(A.size(-1), block_k) == A_scale.size(-1) + return A, A_scale +def _int8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + per_act_token: bool, + block_shape: Optional[list[int]] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Perform int8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + + # If weights are per-channel (per_channel_quant=True), then + # activations apply per-token quantization. Otherwise, assume + # activation tensor-wise fp8/int8 quantization, dynamic or static + if block_shape is None: + assert per_act_token, \ + "int8 quantization only supports block or channel-wise" + A, A_scale = per_token_quant_int8(A) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_int8(A, block_k) + assert cdiv(A.size(-1), block_k) == A_scale.size(-1) + + return A, A_scale + + +def moe_kernel_quantize_input( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + qtype: Optional[torch.dtype], + per_channel_quant: bool, + block_shape: Optional[list[int]] = None, +) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if qtype == torch.float8_e4m3fn: + return _fp8_quantize(A, A_scale, per_channel_quant, block_shape) + elif qtype == torch.int8: + return _int8_quantize(A, A_scale, per_channel_quant, block_shape) + else: + assert A_scale is None + return A, A_scale + + def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: """ A permutation routine that works on fp8 types. """ - if torch.is_floating_point(m) and torch.finfo(m.dtype).bits == 8: + if torch.is_floating_point(m) and m.dtype.itemsize == 1: return m.view(dtype=torch.uint8)[idx, ...].view(dtype=m.dtype) else: return m[idx, ...] diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 1ea65e96d750..bc6e6fcdd0a2 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader( ) -> LoaderFunction: """Create a weight loader for mamba v2. This ensures that the projections are correctly sharded so that they can be split into x, B, C. It also - ensures the the all the groups corresponding to a head shard is placed + ensures that all the groups corresponding to a head shard is placed together with it. 
""" diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index cfd398c07fb9..f4cdc3db1a0d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import functools import importlib.util from typing import Any, Callable, Optional @@ -9,6 +10,7 @@ from torch.nn.parameter import Parameter import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -434,6 +436,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): """ def __init__(self, quant_config: Fp8Config): + from vllm.model_executor.layers.fused_moe import fused_experts self.quant_config = quant_config self.block_quant = self.quant_config.weight_block_size is not None @@ -458,6 +461,11 @@ def __init__(self, quant_config: Fp8Config): logger.warning_once( "DeepGemm not supported on the current platform.") + self.fused_experts = functools.partial( + fused_experts, + block_shape=self.quant_config.weight_block_size, + allow_deep_gemm=self.allow_deep_gemm) + def create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -783,6 +791,31 @@ def process_weights_after_loading(self, layer: Module) -> None: del layer.w13_input_scale del layer.w2_input_scale + def set_prepare_finalize( + self, + dp_size: int, + world_size: int, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + ) -> bool: + from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts) + + if self.use_marlin or self.rocm_aiter_moe_enabled: + return False + + experts = TritonOrDeepGemmExperts( + use_fp8_w8a8=True, + block_shape=self.quant_config.weight_block_size, + allow_deep_gemm=self.allow_deep_gemm, + ) + + self.fused_experts = mk.FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + return True + def apply( self, layer: torch.nn.Module, @@ -801,10 +834,6 @@ def apply( apply_router_weight_on_input: bool = False, activation: str = "silu", ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe import fused_experts - from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - rocm_aiter_fused_experts) - topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -819,6 +848,8 @@ def apply( ) if self.rocm_aiter_moe_enabled: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 + rocm_aiter_fused_experts) return rocm_aiter_fused_experts( x, layer.w13_weight, @@ -835,8 +866,7 @@ def apply( a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, block_shape=self.quant_config.weight_block_size) - - if self.use_marlin: + elif self.use_marlin: assert activation == "silu", ( f"{activation} not supported for Marlin MoE.") assert not apply_router_weight_on_input, ( @@ -853,28 +883,26 @@ def apply( quant_type_id=scalar_types.float8_e4m3fn.id, global_num_experts=global_num_experts, expert_map=expert_map) - - return fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - use_fp8_w8a8=True, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, - 
w1_scale=(layer.w13_weight_scale_inv - if self.block_quant else layer.w13_weight_scale), - w2_scale=(layer.w2_weight_scale_inv - if self.block_quant else layer.w2_weight_scale), - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - block_shape=self.quant_config.weight_block_size, - allow_deep_gemm=self.allow_deep_gemm, - ) + else: + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + use_fp8_w8a8=True, + global_num_experts=global_num_experts, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + w1_scale=(layer.w13_weight_scale_inv + if self.block_quant else layer.w13_weight_scale), + w2_scale=(layer.w2_weight_scale_inv + if self.block_quant else layer.w2_weight_scale), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + ) class Fp8KVCacheMethod(BaseKVCacheMethod): diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index 9b60775df96f..7f9f3e643bfa 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -5,10 +5,11 @@ import torch.nn.functional as F from torch.nn.parameter import Parameter -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs @@ -55,10 +56,24 @@ def from_config(cls, config: dict[str, Any]) -> "TorchAOConfig": return cls(ao_config) def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["TorchAOLinearMethod"]: - if isinstance(layer, LinearBase): - return TorchAOLinearMethod(self) - return None + prefix: str) -> Optional["QuantizeMethodBase"]: + if not isinstance(layer, LinearBase): + return None + + from torchao.quantization import AOPerModuleConfig + + module_fqn = prefix + if isinstance(self.torchao_config, AOPerModuleConfig): + module_fqn_to_config = self.torchao_config.module_fqn_to_config + c = module_fqn_to_config.get( + module_fqn) or module_fqn_to_config.get("_default", None) + if c is not None: + current_torchao_config = TorchAOConfig(c) + return TorchAOLinearMethod(current_torchao_config) + else: + return UnquantizedLinearMethod() + + return TorchAOLinearMethod(self) def get_scaled_act_names(self) -> list[str]: return [] @@ -75,7 +90,7 @@ def torchao_quantize_param_data(param: torch.Tensor, """ from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ - assert isinstance(torchao_config, AOBaseConfig) + assert isinstance(torchao_config, AOBaseConfig), f"{torchao_config}" dummy_linear = torch.nn.Linear(param.shape[1], param.shape[0], bias=False) dummy_linear.weight = param quantize_(dummy_linear, torchao_config) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 57189bfafc06..47a7a06bb744 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -6,7 +6,8 @@ import itertools import math import os -from typing import Any, Callable, Dict, Generator, List, 
Optional, Tuple +from collections.abc import Generator +from typing import Any, Callable, Optional import numpy as np import torch @@ -49,21 +50,21 @@ def __init__(self, load_config: LoadConfig): super().__init__(load_config) # Save the module names without sharding. - self.unsharded_weights_modules: List[str] = [] + self.unsharded_weights_modules: list[str] = [] # Save the module names that are sharded by column. - self.column_sharded_weights_modules: List[str] = [] + self.column_sharded_weights_modules: list[str] = [] # Store all module names (from transformers) that support # BNB quantization. - self.target_modules: List[str] = [] + self.target_modules: list[str] = [] # mapping weight names from transformers to vllm. self.weight_mapper: Callable = lambda name: name def _get_weight_files( self, model_name_or_path: str, - allowed_patterns: List[str], + allowed_patterns: list[str], revision: Optional[str] = None, - ) -> Tuple[str, List[str], str]: + ) -> tuple[str, list[str], str]: """Retrieve weight files. Download the files if necessary. Return the weight files and the file pattern.""" @@ -95,7 +96,7 @@ def _get_weight_files( f"No model weights found in: `{model_name_or_path}`") def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str]) -> Tuple[List[str], bool]: + revision: Optional[str]) -> tuple[list[str], bool]: """Prepare weight files for the model.""" allowed_patterns = ["*.safetensors", "*.bin", "*.pt"] @@ -155,7 +156,7 @@ def _get_quantized_weights_iterator( revision: Optional[str], pre_quant: bool, load_8bit: bool, - ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, + ) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str, Any]]: """Get an iterator to the model weights with bitsandbytes quantization, as well as the quantization state dictionary.""" @@ -175,7 +176,7 @@ def _get_quantized_weights_iterator( hf_weights_files, use_safetensors = self._prepare_weights( model_name_or_path, revision) - quant_state_dict: Dict[str, Any] = {} + quant_state_dict: dict[str, Any] = {} if pre_quant: if load_8bit: @@ -257,7 +258,7 @@ def _quantized_4bit_generator(self, hf_weights_files, use_safetensors, # Closure to parse quant_state for each prequant weight def _parse_quant_state(param_name: str, - temp_state_dict: Dict) -> QuantState: + temp_state_dict: dict) -> QuantState: quant_state = {} for k in temp_state_dict: if param_name + "." 
in k: @@ -415,7 +416,7 @@ def _load_weights(self, model_config: ModelConfig, # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP - self.maybe_fused_weights_modules: Dict[str, List[int]] = {} + self.maybe_fused_weights_modules: dict[str, list[int]] = {} self._get_bnb_target_modules(model) for name, module in model.named_modules(): # Some modules like `ReplicatedLinear` should not have their weights @@ -480,7 +481,7 @@ def _load_weights(self, model_config: ModelConfig, torch.cuda.empty_cache() param_dict = dict(model.named_parameters()) - stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} + stacked_quant_state_dict: dict[str, dict[int, Any]] = {} # TODO: Change this lazy import to normal import # after the checks are updated to run on a new version from vllm.model_executor.models.utils import is_pp_missing_parameter diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index c8bc4aecaecf..21eb7d8a75fb 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -3,7 +3,8 @@ import glob import os import time -from typing import Generator, Iterable, List, Optional, Tuple, cast +from collections.abc import Generator, Iterable +from typing import Optional, cast import huggingface_hub import torch @@ -92,7 +93,7 @@ def _prepare_weights( revision: Optional[str], fall_back_to_pt: bool, allow_patterns_overrides: Optional[list[str]], - ) -> Tuple[str, List[str], bool]: + ) -> tuple[str, list[str], bool]: """Prepare weights for the model. If the model is not local, it will be downloaded.""" @@ -138,7 +139,7 @@ def _prepare_weights( else: hf_folder = model_name_or_path - hf_weights_files: List[str] = [] + hf_weights_files: list[str] = [] for pattern in allow_patterns: hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) if len(hf_weights_files) > 0: @@ -173,7 +174,7 @@ def _prepare_weights( def _get_weights_iterator( self, source: "Source" - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + ) -> Generator[tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( source.model_or_path, source.revision, source.fall_back_to_pt, @@ -238,7 +239,7 @@ def get_all_weights( self, model_config: ModelConfig, model: nn.Module, - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + ) -> Generator[tuple[str, torch.Tensor], None, None]: primary_weights = DefaultModelLoader.Source( model_config.model, model_config.revision, diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index ace1cd371286..2766c9787b83 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Dict, Generator, Tuple +from collections.abc import Generator import gguf import torch @@ -84,8 +84,8 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): return gguf_to_hf_name_map def _get_weights_iterator( - self, model_name_or_path: str, gguf_to_hf_name_map: Dict[str, str] - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str] + ) -> Generator[tuple[str, torch.Tensor], None, None]: return gguf_quant_weights_iterator(model_name_or_path, 
gguf_to_hf_name_map) diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index e4a48483764a..e65d16cae76c 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -5,7 +5,7 @@ import copy import importlib import os -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch import torch.nn as nn @@ -33,7 +33,7 @@ } # Models supported by Neuron. -_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = { +_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = { "LlamaForCausalLM": ("transformers_neuronx.llama.model", "LlamaForSampling", "LlamaForCausalLM"), "MistralForCausalLM": ("transformers_neuronx.mistral.model", @@ -146,7 +146,7 @@ def sample( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: batch_size, num_steps = logits.shape seq_ids = [ seq_id for sg in sampling_metadata.seq_groups @@ -188,7 +188,7 @@ def _get_model_architecture(config: PretrainedConfig) -> str: f"{list(_NEURON_SUPPORTED_MODELS.keys())}") -def _get_buckets(env: str, default_value: List[int]) -> List[int]: +def _get_buckets(env: str, default_value: list[int]) -> list[int]: env_value = os.getenv(env) if env_value is None: return default_value @@ -464,7 +464,7 @@ def get_neuron_eagle_speculation_model(model_config: ModelConfig, draft_model.eval() - token_tree: Dict[int, List[int]] = ast.literal_eval( + token_tree: dict[int, list[int]] = ast.literal_eval( speculation_config.speculative_token_tree) speculation_model = EagleSpeculativeDecoder(draft_model.model, diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index f879c99ac2ef..1c4f66061d1d 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -9,7 +9,7 @@ import multiprocessing import os import shutil -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch import torch.nn as nn @@ -46,7 +46,7 @@ } # Models supported by Neuronx distributed for inference. -_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str]] = { +_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str]] = { "LlamaForCausalLM": ("neuronx_distributed_inference.models.llama.modeling_llama", "NeuronLlamaForCausalLM"), @@ -365,7 +365,7 @@ def sample( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: batch_size, num_steps = logits.shape seq_ids = [ seq_id for sg in sampling_metadata.seq_groups diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 1fbb5ca56644..a695ba03bd1d 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -2,7 +2,8 @@ # ruff: noqa: SIM117 import glob import os -from typing import Generator, List, Optional, Tuple +from collections.abc import Generator +from typing import Optional import torch from torch import nn @@ -48,7 +49,7 @@ def __init__(self, load_config: LoadConfig): os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str]) -> List[str]: + revision: Optional[str]) -> list[str]: """Prepare weights for the model. 
If the model is not local, it will be downloaded.""" @@ -87,7 +88,7 @@ def _prepare_weights(self, model_name_or_path: str, def _get_weights_iterator( self, model_or_path: str, - revision: str) -> Generator[Tuple[str, torch.Tensor], None, None]: + revision: str) -> Generator[tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_weights_files = self._prepare_weights(model_or_path, revision) return runai_safetensors_weights_iterator( diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index 152a3d699726..913bda7e007a 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -3,7 +3,8 @@ import collections import glob import os -from typing import Any, Dict, Generator, List, Optional, Tuple +from collections.abc import Generator +from typing import Any, Optional import torch from torch import nn @@ -48,12 +49,12 @@ def __init__(self, @staticmethod def _filter_subtensors( - tensors: Dict[str, torch.Tensor], ) -> Dict[str, torch.Tensor]: + tensors: dict[str, torch.Tensor], ) -> dict[str, torch.Tensor]: """ Filter out all tensors that share the same memory or a subset of the memory of another tensor. """ - same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = ( + same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = ( collections.defaultdict(list)) for key, tensor in tensors.items(): if tensor.numel(): @@ -63,7 +64,7 @@ def _filter_subtensors( def get_end_ptr(tensor: torch.Tensor) -> int: return tensor.view(-1)[-1].data_ptr() + tensor.element_size() - result: Dict[str, torch.Tensor] = {} + result: dict[str, torch.Tensor] = {} for group in same_storage_groups.values(): for k, t in group: a, b = t.data_ptr(), get_end_ptr(t) @@ -160,7 +161,7 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: return model.eval() def iterate_over_files( - self, paths) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, paths) -> Generator[tuple[str, torch.Tensor], None, None]: if self.runai_model_streamer: yield from runai_safetensors_weights_iterator(paths, True) else: @@ -188,7 +189,7 @@ def save_model( part_idx = 0 total_size = 0 state_dict = ShardedStateLoader._filter_subtensors(model.state_dict()) - state_dict_part: Dict[str, torch.Tensor] = {} + state_dict_part: dict[str, torch.Tensor] = {} for key, tensor in state_dict.items(): param_size = tensor.nelement() * tensor.element_size() if max_size is not None and total_size + param_size > max_size: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 117251ccf05f..0ff35b3a6dca 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -6,9 +6,10 @@ import os import re import time +from collections.abc import Generator from dataclasses import dataclass from functools import partial -from typing import BinaryIO, Generator, Optional, Tuple, Type, Union +from typing import BinaryIO, Optional, Union import torch from torch import nn @@ -67,7 +68,7 @@ class TensorizerConfig: s3_access_key_id: Optional[str] = None s3_secret_access_key: Optional[str] = None s3_endpoint: Optional[str] = None - model_class: Optional[Type[torch.nn.Module]] = None + model_class: Optional[type[torch.nn.Module]] = None hf_config: Optional[PretrainedConfig] = None dtype: Optional[Union[str, torch.dtype]] = None _is_sharded: bool 
= False @@ -365,7 +366,7 @@ def deserialize(self): def tensorizer_weights_iterator( tensorizer_args: "TensorizerArgs" -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: logger.warning("Deserializing HuggingFace models is not optimized for " "loading on vLLM, as tensorizer is forced to load to CPU. " "Consider deserializing a vLLM model instead for faster " diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 7cf3940ab644..4107e741fd8f 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # ruff: noqa: SIM117 import copy -from typing import Generator, Tuple +from collections.abc import Generator import torch from torch import nn @@ -36,7 +36,7 @@ def _verify_config(self, model_config: ModelConfig, self.tensorizer_config.verify_with_parallel_config(parallel_config) def _get_weights_iterator( - self, ) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, ) -> Generator[tuple[str, torch.Tensor], None, None]: tensorizer_args = self.tensorizer_config._construct_tensorizer_args() return tensorizer_weights_iterator(tensorizer_args) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index ddc857aebdc8..68b1f1ad74d3 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -5,7 +5,7 @@ import warnings from contextlib import contextmanager from dataclasses import dataclass, field -from typing import Dict, List, Optional, Tuple, Type +from typing import Optional import torch import transformers @@ -124,7 +124,7 @@ def device_loading_context(module: torch.nn.Module, yield module return - original_device_states: Dict[str, torch.device] = {} + original_device_states: dict[str, torch.device] = {} # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): @@ -214,7 +214,7 @@ def resolve_transformers_arch(model_config: ModelConfig, def get_model_architecture( - model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: + model_config: ModelConfig) -> tuple[type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) # Special handling for quantized Mixtral. @@ -257,8 +257,8 @@ class ParamMapping: It creates a bidirectional mapping between packed parameters and their constituent parts. 
""" - packed_mapping: Dict[str, List[str]] - inverse_packed_mapping: Dict[str, Tuple[str, + packed_mapping: dict[str, list[str]] + inverse_packed_mapping: dict[str, tuple[str, int]] = field(default_factory=dict) def __post_init__(self): @@ -273,7 +273,7 @@ def __post_init__(self): ) def get_sub_modules(self, - module_name: str) -> Optional[Tuple[str, List[str]]]: + module_name: str) -> Optional[tuple[str, list[str]]]: for key, value in self.packed_mapping.items(): if module_name.endswith(key): return key, value @@ -281,7 +281,7 @@ def get_sub_modules(self, def configure_quant_config(quant_config: QuantizationConfig, - model_class: Type[nn.Module]): + model_class: type[nn.Module]): """ Pass packed_modules_mapping by reference to quant_config so that quant_config can properly match fused modules diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8f9d809022aa..a1cf43328bab 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -8,8 +8,9 @@ import tempfile import time from collections import defaultdict +from collections.abc import Generator from pathlib import Path -from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import filelock import gguf @@ -221,7 +222,7 @@ def get_sparse_attention_config( model_config: ModelConfig, load_config: LoadConfig, sparse_attention_config_filename: str = "sparse_attention_config.json", -) -> Dict[str, Any]: +) -> dict[str, Any]: model_name_or_path = model_config.model is_local = os.path.isdir(model_name_or_path) if not is_local: @@ -253,9 +254,9 @@ def get_sparse_attention_config( def download_weights_from_hf( model_name_or_path: str, cache_dir: Optional[str], - allow_patterns: List[str], + allow_patterns: list[str], revision: Optional[str] = None, - ignore_patterns: Optional[Union[str, List[str]]] = None, + ignore_patterns: Optional[Union[str, list[str]]] = None, ) -> str: """Download model weights from Hugging Face Hub. @@ -263,11 +264,11 @@ def download_weights_from_hf( model_name_or_path (str): The model name or path. cache_dir (Optional[str]): The cache directory to store the model weights. If None, will use HF defaults. - allow_patterns (List[str]): The allowed patterns for the + allow_patterns (list[str]): The allowed patterns for the weight files. Files matched by any of the patterns will be downloaded. revision (Optional[str]): The revision of the model. - ignore_patterns (Optional[Union[str, List[str]]]): The patterns to + ignore_patterns (Optional[Union[str, list[str]]]): The patterns to filter out the weight files. Files matched by any of the patterns will be ignored. @@ -347,9 +348,9 @@ def download_safetensors_index_file_from_hf( # Passing both of these to the weight loader functionality breaks. # So, we use the index_file to # look up which safetensors files should be used. -def filter_duplicate_safetensors_files(hf_weights_files: List[str], +def filter_duplicate_safetensors_files(hf_weights_files: list[str], hf_folder: str, - index_file: str) -> List[str]: + index_file: str) -> list[str]: # model.safetensors.index.json is a mapping from keys in the # torch state_dict to safetensors file holding that weight. 
index_file_name = os.path.join(hf_folder, index_file) @@ -372,7 +373,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str], def filter_files_not_needed_for_inference( - hf_weights_files: List[str]) -> List[str]: + hf_weights_files: list[str]) -> list[str]: """ Exclude files that are not needed for inference. @@ -408,9 +409,9 @@ def np_cache_weights_iterator( model_name_or_path: str, cache_dir: Optional[str], hf_folder: str, - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model np files. Will dump the model weights to numpy files if they are not already dumped. @@ -424,7 +425,7 @@ def np_cache_weights_iterator( # dumping the same model weights to numpy at the same time. with get_lock(model_name_or_path, cache_dir): if not os.path.exists(weight_names_file): - weight_names: List[str] = [] + weight_names: list[str] = [] for bin_file in tqdm( hf_weights_files, desc="Loading np_cache checkpoint shards", @@ -453,9 +454,9 @@ def np_cache_weights_iterator( def safetensors_weights_iterator( - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" for st_file in tqdm( hf_weights_files, @@ -470,9 +471,9 @@ def safetensors_weights_iterator( def runai_safetensors_weights_iterator( - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" with SafetensorsStreamer() as streamer: for st_file in tqdm( @@ -486,9 +487,9 @@ def runai_safetensors_weights_iterator( def fastsafetensors_weights_iterator( - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files using fastsafetensor library.""" if torch.distributed.is_initialized(): @@ -525,10 +526,10 @@ def fastsafetensors_weights_iterator( def pt_weights_iterator( - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, pt_load_map_location: Union[str, dict[str, str]] = "cpu", -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model bin/pt files.""" for bin_file in tqdm( hf_weights_files, @@ -544,7 +545,7 @@ def pt_weights_iterator( def get_gguf_extra_tensor_names( - gguf_file: str, gguf_to_hf_name_map: Dict[str, str]) -> List[str]: + gguf_file: str, gguf_to_hf_name_map: dict[str, str]) -> list[str]: reader = gguf.GGUFReader(gguf_file) expected_gguf_keys = set(gguf_to_hf_name_map.keys()) exact_gguf_keys = set([tensor.name for tensor in reader.tensors]) @@ -553,8 +554,8 @@ def get_gguf_extra_tensor_names( def gguf_quant_weights_iterator( - gguf_file: str, gguf_to_hf_name_map: Dict[str, str] -) -> Generator[Tuple[str, torch.Tensor], None, None]: + gguf_file: str, gguf_to_hf_name_map: dict[str, str] +) -> Generator[tuple[str, torch.Tensor], None, None]: """ Iterate over the quant weights in the model gguf files and convert them to torch tensors diff --git 
a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index c518efdb54f8..94a4328564bb 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Snowflake Arctic model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -458,8 +459,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -467,8 +468,8 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping: List[Tuple[str, str, int]] = [] - expert_params_mapping: List[Tuple[str, str, int]] = [] + mlp_params_mapping: list[tuple[str, str, int]] = [] + expert_params_mapping: list[tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): @@ -497,7 +498,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("ws", f"experts.{expert_id}.w3.weight", expert_id)) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() logger.info( "It will take ~10 minutes loading from the 16-bit weights. " diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 7c716efab8ef..f74e13888c48 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import torch import torch.nn as nn @@ -66,8 +66,8 @@ def __init__( # Identity layer self.post_layernorm = nn.Identity() - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -75,7 +75,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # NOTE: post_layernorm is not used in Aria @@ -326,8 +326,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -339,7 +339,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("experts.w2_weight", "experts.fc2.weight", 'w2'), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -528,7 +528,7 @@ def __init__( self.vocab_size, logit_scale) def _validate_image_sizes( - self, images: List[torch.Tensor]) -> 
List[torch.Tensor]: + self, images: list[torch.Tensor]) -> list[torch.Tensor]: if not all(img.shape == images[0].shape for img in images): raise ValueError("All images must be the same size") return images @@ -578,7 +578,7 @@ def _create_patch_attention_mask( def _process_image_input( self, image_input: AriaImagePixelInputs - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: assert self.vision_tower is not None pixel_values = image_input['pixel_values'] @@ -651,6 +651,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index d152287e8fa3..08d49d71eca1 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 Adapted from # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision -from typing import (Iterable, Literal, Mapping, Optional, Sequence, Set, Tuple, - TypedDict, Union, cast) +from collections.abc import Iterable, Mapping, Sequence +from typing import Literal, Optional, TypedDict, Union, cast import torch from torch import nn @@ -315,8 +315,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def dtype(self): return next(self.parameters()).dtype - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 444ed38d05c0..077e36176430 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -20,7 +20,8 @@ # limitations under the License. 
"""Inference-only BaiChuan model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -230,7 +231,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -320,15 +321,15 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -421,8 +422,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 87e1e102efd8..d6a705fb1859 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Bamba model.""" # Added by the IBM Team, 2024 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -355,8 +356,8 @@ def forward( hidden_states, _ = self.final_layernorm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -367,7 +368,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -495,7 +496,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size @@ -535,7 +536,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index bcfbe92c3a11..92bbe1bb67a3 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -19,7 +19,8 @@ # limitations under the License. 
"""PyTorch BART model.""" import math -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -859,14 +860,14 @@ def _rename_key(self, key: str): def _rename_stacked_param( self, name: str, - ) -> Tuple[str, Optional[str]]: + ) -> tuple[str, Optional[str]]: for key, mapping in self.stacked_params_mapping.items(): if key in name: name = name.replace(key, mapping["param_name"]) return name, mapping["shard_id"] return name, None - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): model_params_dict = dict(self.model.named_parameters()) top_params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 111b49ab8dd2..0c6593bbe3a1 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -349,8 +350,8 @@ def forward( token_type_ids=token_type_ids) return self.encoder(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -359,7 +360,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if self.pooler is None and "pooler" in name: continue @@ -424,7 +425,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) @@ -472,7 +473,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._pooler = CrossEncodingPooler(config, self.classifier, self.bert.pooler) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 002949abff52..af6deb3bf072 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -208,7 +209,7 @@ def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int): def forward( self, x: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: weights = self.layer(x.view(-1, x.shape[-1]))[0].softmax( dim=-1, dtype=torch.float32) top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) @@ -428,8 +429,8 @@ def forward( token_type_ids=token_type_ids) return self.encoder(positions, hidden_states) - def load_weights(self, weights: 
Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.hf_to_vllm_mapper.apply(weights) if self.config.hidden_act in ["silu", "geglu"]: @@ -442,7 +443,7 @@ def load_weights(self, weights: Iterable[Tuple[str, stacked_params_mapping = [] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "pooler" in name: continue @@ -567,7 +568,7 @@ def config_verify(self, vllm_config): } return config - def split_up_gate_proj(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def split_up_gate_proj(self, weights: Iterable[tuple[str, torch.Tensor]]): n = "mlp.up_gate_proj" for name, weight in weights: if n in name: @@ -578,14 +579,14 @@ def split_up_gate_proj(self, weights: Iterable[Tuple[str, torch.Tensor]]): yield name, weight def ignore_unnecessary_layers(self, - weights: Iterable[Tuple[str, torch.Tensor]]): + weights: Iterable[tuple[str, torch.Tensor]]): for name, weight in weights: if name.startswith("classifier"): continue yield name, weight - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.ignore_unnecessary_layers(weights) weights = self.split_up_gate_proj(weights) return super().load_weights(weights) @@ -664,7 +665,7 @@ def forward( token_type_ids=token_type_ids) @torch.inference_mode() - def jina_merge_lora_weights(self, weights: Iterable[Tuple[str, + def jina_merge_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # use for jina-embeddings-v3 # Merge Lora weights into a single weight tensor. @@ -707,7 +708,7 @@ def jina_merge_lora_weights(self, weights: Iterable[Tuple[str, return [(name, weight) for name, weight in weights.items()] - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.jina_merge_lora_weights(weights) return super().load_weights(weights) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index f3d488926d09..acbc5d04d7e3 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -296,8 +297,8 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.post_layernorm(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -305,7 +306,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index f44565bd2e01..2ff7e394a416 100644 --- a/vllm/model_executor/models/blip2.py +++ 
b/vllm/model_executor/models/blip2.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -186,7 +186,7 @@ def forward( self, hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.Tensor]: + ) -> tuple[torch.Tensor]: self_output = self.attention( hidden_states, encoder_hidden_states=encoder_hidden_states, @@ -712,7 +712,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 74d401b295ce..eb1085d6b40d 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -18,7 +18,8 @@ # limitations under the License. """Inference-only BLOOM model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -322,10 +323,10 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight": continue diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index ef8b033f3846..a4528ca26d01 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -2,7 +2,7 @@ from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import Any, Dict, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -229,7 +229,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 4096, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -292,7 +292,7 @@ def __init__( prefix=f"{prefix}.attn") def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: # reshape for layernorm q = q.reshape(-1, self.num_heads, self.head_dim) k = k.reshape(-1, self.num_kv_heads, self.head_dim) @@ -367,7 +367,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: if residual is None: residual = hidden_states @@ -438,7 +438,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: residual = hidden_states 
hidden_states = self.self_attn( @@ -773,7 +773,7 @@ def __init__(self, config: ChameleonVQVAEConfig): def encode( self, pixel_values: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: hidden_states = self.encoder(pixel_values) hidden_states = self.quant_conv(hidden_states) quant, emb_loss, indices = self.quantize(hidden_states) @@ -786,7 +786,7 @@ class ChameleonImageVocabularyMapping: A class for mapping discrete image tokens from VQGAN to BPE tokens. """ - def __init__(self, vocab_map: Dict[str, int]): + def __init__(self, vocab_map: dict[str, int]): self.vocab_map = vocab_map self.image_token_id = vocab_map.get("<image>") @@ -1052,8 +1052,8 @@ def compute_logits( return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1063,7 +1063,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 233e9ee0a258..4e95afe1a147 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -3,7 +3,8 @@ # https://github.com/THUDM/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM weights.""" import json -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -358,15 +359,15 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -440,7 +441,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 153054e5c028..e8f3ae2156e0 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -368,8 +369,8 @@ def device(self): # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, -
torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -377,7 +378,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 8f64e5d5c966..546b5f932877 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -21,7 +21,8 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -259,7 +260,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states, residual = self.input_layernorm(hidden_states, residual) @@ -404,8 +405,8 @@ def compute_logits( return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -415,7 +416,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading rotary embeddings since vLLM has its own diff --git a/vllm/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py index d073a7de6917..f1cc7e0f9e29 100644 --- a/vllm/model_executor/models/constant_size_cache.py +++ b/vllm/model_executor/models/constant_size_cache.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Any, Dict, List, Tuple +from typing import Any import torch @@ -16,7 +16,7 @@ class ConstantSizeCache(ABC): def __init__(self, max_batch_size: int): # Maps between the request id and a dict that maps between the seq_id # and its index inside the cache - self.cache_indices_mapping: Dict[str, Dict[int, int]] = {} + self.cache_indices_mapping: dict[str, dict[int, int]] = {} self.free_cache_indices = list(range(max_batch_size)) @property @@ -30,7 +30,7 @@ def _copy_cache(self, from_index: int, to_index: int): """Copy cache data from one index to another""" pass - def current_run_tensors(self, **kwargs) -> Tuple: + def current_run_tensors(self, **kwargs) -> tuple: """ Return the tensors for the current run's conv and ssm state. 
""" @@ -117,8 +117,8 @@ def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int, return self.cache_indices_mapping[cur_rid][seq_id] def _prepare_current_run_cache( - self, request_ids_to_seq_ids: Dict[str, list[int]], - finished_requests_ids: List[str]) -> List[int]: + self, request_ids_to_seq_ids: dict[str, list[int]], + finished_requests_ids: list[str]) -> list[int]: return [ self._assign_seq_id_to_cache_index(req_id, seq_id, finished_requests_ids) @@ -127,7 +127,7 @@ def _prepare_current_run_cache( ] def _release_finished_requests(self, - finished_seq_groups_req_ids: List[str]): + finished_seq_groups_req_ids: list[str]): for req_id in finished_seq_groups_req_ids: if req_id in self.cache_indices_mapping: for seq_id in self.cache_indices_mapping[req_id]: diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 9ec245cce189..e0b4712cdb47 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -79,7 +80,6 @@ def __init__( prefix=prefix, ) self.config = config - self.tp_size = get_tensor_model_parallel_world_size() self.d_model = config.d_model self.intermediate_size = (self.config.ffn_config.ffn_hidden_size // self.tp_size) @@ -415,14 +415,14 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: expert_params_mapping = [( "w13" if weight_name in ["w1", "v1"] else "w2", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index c6421143dd68..88d1ca9f7b83 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Deepseek model.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -184,7 +185,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -385,8 +386,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -397,7 +398,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -478,7 +479,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) \ No newline at end of file diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index b50175cf764f..6d7b52aba5f9 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -176,8 +177,8 @@ def compute_logits( return self.model.compute_logits(hidden_states, sampling_metadata, spec_step_idx) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), @@ -190,7 +191,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0366895ef02e..b78c193c1345 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -31,9 +32,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -143,7 +142,8 @@ def __init__( intermediate_size=intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, - reduce_results=False, + reduce_results=self.experts.must_reduce_shared_expert_outputs( + ), prefix=f"{prefix}.shared_experts", ) @@ -154,6 +154,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: shared_output = self.shared_experts(hidden_states) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) + if hidden_states.dtype != torch.float16: final_hidden_states = self.experts( hidden_states=hidden_states, @@ -171,9 +172,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # See DeepseekV2DecoderLayer for more details. final_hidden_states = final_hidden_states + shared_output \ * (1. / self.routed_scaling_factor) + if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) + final_hidden_states = ( + self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states)) return final_hidden_states.view(num_tokens, hidden_dim) @@ -198,7 +201,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -350,7 +353,7 @@ def __init__( q_lora_rank: Optional[int], kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -734,8 +737,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), @@ -751,7 +754,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 6d8f27530cee..164fa40ffebe 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -4,7 +4,7 @@ """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math from collections.abc import 
Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -45,7 +45,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, num_channels, height, width)` """ @@ -57,7 +57,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): class DeepseekVL2VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -394,8 +394,8 @@ def _init_vision_module( return model def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.vision_config.image_size expected_dims = (3, h, w) @@ -415,8 +415,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_images_spatial_crop( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: expected_dims = 2 def _validate_shape(d: torch.Tensor): @@ -640,8 +640,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 4ff1e785494f..726660796a6f 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -183,7 +184,7 @@ def compute_logits(self, hidden_states: torch.Tensor, return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # due to missing lm_head weights and its config being that of a # Llama model. Here's a compatible version with the same weights: diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 4a6490cd127a..4ffd06319684 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -24,7 +24,8 @@ # limitations under the License. 
"""Inference-only Exaone model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -102,7 +103,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -196,7 +197,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -282,7 +283,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -384,8 +385,8 @@ def forward( hidden_states, _ = self.ln_f(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -395,7 +396,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".c_fc_1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -535,8 +536,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # With tie_word_embeddings, we can skip lm_head.weight diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index 310aca999bc2..00dbbebb120e 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -16,7 +16,7 @@ # limitations under the License. 
"""Llama model for fairseq2 weights.""" -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch from torch.nn import Parameter @@ -44,8 +44,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): f"model.{self.tp_rank}.pt", ] - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: # fairseq2's serialization adds a wrapper to usual .pt state_dict's: # { "model_key": my_model_name, "my_model_name": state_dict } # which we first need to unpack @@ -102,7 +102,7 @@ def reshape_fairseq2_weights( name: str, loaded_weight: torch.Tensor, params: dict[str, Parameter], - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: """Reshape fairseq2's weights.""" def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index e7e03fc09972..376793594f8b 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -20,7 +20,8 @@ """PyTorch Falcon model.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -394,8 +395,8 @@ def forward( hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: total_num_heads = self.config.num_attention_heads if self.config.new_decoder_architecture: total_num_kv_heads = self.config.num_kv_heads @@ -405,7 +406,7 @@ def load_weights(self, weights: Iterable[Tuple[str, total_num_kv_heads = total_num_heads num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: @@ -498,8 +499,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index d1a36c3f481a..f8acc56706d2 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -3,7 +3,7 @@ import math from collections import OrderedDict from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -713,8 +713,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -723,7 +723,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -922,8 +922,8 @@ def _build_image_projection_layers(self, config: PretrainedConfig): 'Florence2 only supports COSINE as temporal embedding.') def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: size = self.processor_config["size"] h, w = size["height"], size["width"] @@ -944,12 +944,12 @@ def _validate_shape(d: torch.Tensor): return data def _parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + pixel_values: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + image_embeds: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) @@ -1096,7 +1096,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d6bd6155a447..fbad7f56d0ba 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -18,7 +18,7 @@ """ PyTorch Fuyu model.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict +from typing import Literal, Optional, TypedDict import torch import torch.nn as nn @@ -382,7 +382,7 @@ def compute_logits( self.language_model.lm_head, hidden_states, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + 
def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index c1cc0df11178..0f6d94e7518b 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,8 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" +from collections.abc import Iterable from functools import cache -from typing import Iterable, Optional, Set, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -231,7 +232,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -318,8 +319,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -329,7 +330,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: @@ -413,8 +414,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 7fb2e9948c06..b46716213c62 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -15,7 +15,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -218,7 +219,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -305,8 +306,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -316,7 +317,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -413,8 +414,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 4e0d4f84ca6b..3a88adcce0bd 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -14,7 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn.functional as F @@ -320,7 +321,7 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], **kwargs, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -412,8 +413,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -423,7 +424,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -521,8 +522,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 65c177f8c5ad..743542ec8dfa 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, Set, Tuple, TypedDict +from typing import Any, Literal, Optional, TypedDict import torch from torch import nn @@ -701,8 +701,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 290be968cb54..f351ce5a0681 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GLM-4-0414 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -60,7 +61,7 @@ def __init__(self, rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER) -> None: super().__init__() @@ -183,7 +184,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -293,8 +294,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index e3219333915e..470a7053e1b6 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-2 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -280,10 +281,10 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index def6b1544d8c..6a1d97bd7b69 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -19,7 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPTBigCode model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -243,10 +244,10 @@ def forward( hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ".attn.bias" in name: # Skip attention mask. 
@@ -327,8 +328,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."]), diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 3db96fb8e187..69fdd90cfbe8 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -17,7 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-J model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -228,8 +229,8 @@ def forward( hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -239,7 +240,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue @@ -331,7 +332,7 @@ def compute_logits( sampling_metadata, self.lm_head.bias) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) \ No newline at end of file diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 620ee66f57e7..401fa9f5cc8b 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -17,7 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GPT-NeoX model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -240,10 +241,10 @@ def forward( hidden_states = self.final_layer_norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ("attention.bias" in name or "attention.masked_bias" in name or "rotary_emb.inv_freq" in name): @@ -324,7 +325,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 0696a7245c22..eed0820a5779 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only IBM Granite model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -97,7 +98,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -230,7 +231,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -321,8 +322,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -332,7 +333,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -475,8 +476,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = [ "rotary_emb.inv_freq", # Models trained using ColossalAI may include these tensors in diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index b43b59da6d11..fd8fb48c50e3 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py 
@@ -21,9 +21,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Inference-only IBM Granite speeech model.""" +"""Inference-only IBM Granite speech model.""" import math -from typing import Iterable, Mapping, Optional, Set, Tuple, TypedDict, Union +from collections.abc import Iterable, Mapping +from typing import Optional, TypedDict, Union import torch import torch.nn.functional as F @@ -625,7 +626,7 @@ def _build_input_features_mask( audio_embed_sizes: torch.Tensor, ) -> torch.Tensor: """Calculate the input features mask, which will generally be used - to mask the the padded features for all entries in the batch except + to mask the padded features for all entries in the batch except for those with the most audio features. Args: @@ -763,8 +764,8 @@ def compute_logits( def load_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], - ) -> Set[str]: + weights: Iterable[tuple[str, torch.Tensor]], + ) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 7fff14cb9f12..f342dfff824f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GraniteMoe model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -305,8 +306,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): @@ -425,8 +426,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 706e648f1b4f..443b102c9968 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only GraniteMoeHybrid model.""" # Added by the IBM Team, 2025 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -381,10 +382,10 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() def _load(n, p): param = params_dict[n] @@ -538,7 +539,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> 
tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size @@ -578,7 +579,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 4e660cbf667b..817e6091d276 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -4,7 +4,8 @@ The architecture is the same as granitemoe but with the addition of shared experts. """ -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -208,8 +209,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): @@ -329,8 +330,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 6f56eb2d5e38..6d2d16d098d4 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Grok1 model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn.functional as F @@ -263,7 +264,7 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -340,7 +341,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, @@ -371,8 +372,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -390,7 +391,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and @@ -528,7 +529,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -547,8 +548,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = ["rotary_emb.inv_freq"] # Skip lm_head when tie_word_embeddings is True if self.config.tie_word_embeddings: diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index cb0379c10f3a..b8bdc7aa32b2 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -17,7 +17,8 @@ # limitations under the License. 
"""PyTorch Idefics2 model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -342,8 +343,8 @@ def forward( last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -351,7 +352,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 961954c2b584..fdb128ef5b54 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -17,7 +17,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import Dict, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -85,7 +85,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): def get_hf_processor( self, *, - size: Optional[Dict[str, int]] = None, + size: Optional[dict[str, int]] = None, **kwargs: object, ) -> Idefics3Processor: if size is not None: @@ -752,8 +752,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 7fea9647ead9..8f33a3e29c60 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, - Protocol, Type, Union, overload, runtime_checkable) +from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, + Union, overload, runtime_checkable) import torch from torch import Tensor @@ -102,7 +102,7 @@ class _SupportsMultiModalType(Protocol): @overload def supports_multimodal( - model: Type[object]) -> TypeIs[Type[SupportsMultiModal]]: + model: type[object]) -> TypeIs[type[SupportsMultiModal]]: ... @@ -112,8 +112,8 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: def supports_multimodal( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: if isinstance(model, type): return isinstance(model, _SupportsMultiModalType) @@ -134,9 +134,9 @@ class SupportsLoRA(Protocol): """ # The `embedding_module` and `embedding_padding_modules` # are empty by default. 
- embedding_modules: ClassVar[Dict[str, str]] = {} - embedding_padding_modules: ClassVar[List[str]] = [] - packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} + embedding_modules: ClassVar[dict[str, str]] = {} + embedding_padding_modules: ClassVar[list[str]] = [] + packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} # We can't use runtime_checkable with ClassVar for issubclass checks @@ -145,13 +145,13 @@ class SupportsLoRA(Protocol): class _SupportsLoRAType(Protocol): supports_lora: Literal[True] - packed_modules_mapping: Dict[str, List[str]] - embedding_modules: Dict[str, str] - embedding_padding_modules: List[str] + packed_modules_mapping: dict[str, list[str]] + embedding_modules: dict[str, str] + embedding_padding_modules: list[str] @overload -def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: +def supports_lora(model: type[object]) -> TypeIs[type[SupportsLoRA]]: ... @@ -161,8 +161,8 @@ def supports_lora(model: object) -> TypeIs[SupportsLoRA]: def supports_lora( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsLoRA]], TypeIs[SupportsLoRA]]: result = _supports_lora(model) if not result: @@ -191,7 +191,7 @@ def supports_lora( return result -def _supports_lora(model: Union[Type[object], object]) -> bool: +def _supports_lora(model: Union[type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsLoRAType) @@ -256,7 +256,7 @@ def forward( @overload -def supports_pp(model: Type[object]) -> TypeIs[Type[SupportsPP]]: +def supports_pp(model: type[object]) -> TypeIs[type[SupportsPP]]: ... @@ -266,8 +266,8 @@ def supports_pp(model: object) -> TypeIs[SupportsPP]: def supports_pp( - model: Union[Type[object], object], -) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: + model: Union[type[object], object], +) -> Union[bool, TypeIs[type[SupportsPP]], TypeIs[SupportsPP]]: supports_attributes = _supports_pp_attributes(model) supports_inspect = _supports_pp_inspect(model) @@ -298,14 +298,14 @@ def supports_pp( return supports_attributes and supports_inspect -def _supports_pp_attributes(model: Union[Type[object], object]) -> bool: +def _supports_pp_attributes(model: Union[type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsPPType) return isinstance(model, SupportsPP) -def _supports_pp_inspect(model: Union[Type[object], object]) -> bool: +def _supports_pp_inspect(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -336,13 +336,13 @@ def has_inner_state(model: object) -> TypeIs[HasInnerState]: @overload -def has_inner_state(model: Type[object]) -> TypeIs[Type[HasInnerState]]: +def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]: ... def has_inner_state( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[HasInnerState]], TypeIs[HasInnerState]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[HasInnerState]], TypeIs[HasInnerState]]: if isinstance(model, type): return isinstance(model, _HasInnerStateType) @@ -373,13 +373,13 @@ def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: @overload -def is_attention_free(model: Type[object]) -> TypeIs[Type[IsAttentionFree]]: +def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]: ... 
def is_attention_free( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[IsAttentionFree]], TypeIs[IsAttentionFree]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[IsAttentionFree]], TypeIs[IsAttentionFree]]: if isinstance(model, type): return isinstance(model, _IsAttentionFreeType) @@ -410,13 +410,13 @@ def is_hybrid(model: object) -> TypeIs[IsHybrid]: @overload -def is_hybrid(model: Type[object]) -> TypeIs[Type[IsHybrid]]: +def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]: ... def is_hybrid( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[IsHybrid]], TypeIs[IsHybrid]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[IsHybrid]], TypeIs[IsHybrid]]: if isinstance(model, type): return isinstance(model, _IsHybridType) @@ -439,13 +439,13 @@ def has_noops(model: object) -> TypeIs[HasNoOps]: @overload -def has_noops(model: Type[object]) -> TypeIs[Type[HasNoOps]]: +def has_noops(model: type[object]) -> TypeIs[type[HasNoOps]]: ... def has_noops( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[HasNoOps]], TypeIs[HasNoOps]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[HasNoOps]], TypeIs[HasNoOps]]: if isinstance(model, type): return isinstance(model, _HasNoOpsType) @@ -461,7 +461,7 @@ class SupportsCrossEncoding(Protocol): @overload def supports_cross_encoding( - model: Type[object]) -> TypeIs[Type[SupportsCrossEncoding]]: + model: type[object]) -> TypeIs[type[SupportsCrossEncoding]]: ... @@ -471,8 +471,8 @@ def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]: def _supports_cross_encoding( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: if isinstance(model, type): return isinstance(model, SupportsCrossEncoding) @@ -481,15 +481,15 @@ def _supports_cross_encoding( def supports_cross_encoding( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: return is_pooling_model(model) and _supports_cross_encoding(model) class SupportsQuant: """The interface required for all models that support quantization.""" - packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} + packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} quant_config: Optional[QuantizationConfig] = None def __new__(cls, *args, **kwargs) -> Self: @@ -525,7 +525,7 @@ class SupportsTranscription(Protocol): @overload def supports_transcription( - model: Type[object]) -> TypeIs[Type[SupportsTranscription]]: + model: type[object]) -> TypeIs[type[SupportsTranscription]]: ... @@ -535,8 +535,8 @@ def supports_transcription(model: object) -> TypeIs[SupportsTranscription]: def supports_transcription( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsTranscription]], TypeIs[SupportsTranscription]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsTranscription]], TypeIs[SupportsTranscription]]: if isinstance(model, type): return isinstance(model, SupportsTranscription) @@ -551,7 +551,7 @@ class SupportsV0Only(Protocol): @overload -def supports_v0_only(model: Type[object]) -> TypeIs[Type[SupportsV0Only]]: +def supports_v0_only(model: type[object]) -> TypeIs[type[SupportsV0Only]]: ... 
@@ -561,8 +561,8 @@ def supports_v0_only(model: object) -> TypeIs[SupportsV0Only]: def supports_v0_only( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsV0Only]], TypeIs[SupportsV0Only]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsV0Only]], TypeIs[SupportsV0Only]]: if isinstance(model, type): return isinstance(model, SupportsV0Only) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index f141dcf3cd4f..d325a6b67132 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, Optional, Protocol, Type, Union, overload, +from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload, runtime_checkable) import torch @@ -20,7 +20,7 @@ # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa -# which has T = List[torch.Tensor] +# which has T = list[torch.Tensor] T = TypeVar("T", default=torch.Tensor) T_co = TypeVar("T_co", default=torch.Tensor, covariant=True) @@ -48,12 +48,12 @@ def forward( ... -def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: +def _check_vllm_model_init(model: Union[type[object], object]) -> bool: model_init = model.__init__ return supports_kw(model_init, "vllm_config") -def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: +def _check_vllm_model_forward(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -75,7 +75,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: @overload -def is_vllm_model(model: Type[object]) -> TypeIs[Type[VllmModel]]: +def is_vllm_model(model: type[object]) -> TypeIs[type[VllmModel]]: ... @@ -85,8 +85,8 @@ def is_vllm_model(model: object) -> TypeIs[VllmModel]: def is_vllm_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModel]], TypeIs[VllmModel]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]: return _check_vllm_model_init(model) and _check_vllm_model_forward(model) @@ -105,7 +105,7 @@ def compute_logits( @overload def is_text_generation_model( - model: Type[object]) -> TypeIs[Type[VllmModelForTextGeneration]]: + model: type[object]) -> TypeIs[type[VllmModelForTextGeneration]]: ... @@ -116,8 +116,8 @@ def is_text_generation_model( def is_text_generation_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForTextGeneration]], + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModelForTextGeneration]], TypeIs[VllmModelForTextGeneration]]: if not is_vllm_model(model): return False @@ -142,7 +142,7 @@ def pooler( @overload -def is_pooling_model(model: Type[object]) -> TypeIs[Type[VllmModelForPooling]]: +def is_pooling_model(model: type[object]) -> TypeIs[type[VllmModelForPooling]]: ... 
@@ -152,8 +152,8 @@ def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: def is_pooling_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: if not is_vllm_model(model): return False diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index fdcef8b9be8d..d9d9002bd5ba 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -6,8 +6,9 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +from collections.abc import Iterable from functools import partial -from typing import Iterable, Optional, Set, Tuple +from typing import Optional import torch import torch.nn as nn @@ -461,10 +462,10 @@ def forward( return encoder_outputs - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index c3d7cbfcddbb..3f3e3966e838 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable from functools import partial -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Type, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -81,7 +82,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -225,7 +226,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -252,7 +253,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - layer_type: Type[InternLMDecoderLayer] = InternLMDecoderLayer): + layer_type: type[InternLMDecoderLayer] = InternLMDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -316,7 +317,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", - model_type: Type[InternLM2Model] = InternLM2Model): + model_type: type[InternLM2Model] = InternLM2Model): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -361,15 +362,15 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for 
name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -407,7 +408,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - model_type: Type[InternLM2Model] = InternLM2Model, + model_type: type[InternLM2Model] = InternLM2Model, ): super().__init__(vllm_config=vllm_config, prefix=prefix, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 69b0caab8f8e..6893d0239121 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -66,7 +66,7 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], visual_token_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 23b92ad2bbf6..66e78fcc4e80 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -8,7 +8,7 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, TypeVar, Union import torch import torch.nn as nn @@ -932,8 +932,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: # unused modules appear in OpenGVLab/InternVideo2_5_Chat_8B skip_prefixes = [ "action_embed", "temporal_embed", "track_embed", diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index e1e3f0f199c5..d6a1e0bb4845 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -21,7 +21,8 @@ """Inference-only Jais model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -333,10 +334,10 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 46335c2b3930..6f9fa60c9b05 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Jamba model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -442,7 +443,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return 
self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size conv_state_shape = ( @@ -464,8 +465,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -482,7 +483,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -583,7 +584,7 @@ def pooler( logits = self.score(hidden_states) return self._pooler(logits, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # TODO: The reward weights themselves have float32 accuracy data, we # would like to load them in fp32 to get that extra precision. super().load_weights(weights) diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 0629266860fd..b575f44765a8 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -43,10 +43,9 @@ import copy import math -from collections.abc import Mapping +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass -from typing import (Any, Iterable, List, Literal, Optional, Sequence, Tuple, - TypedDict, Union) +from typing import Any, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -120,7 +119,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class KimiVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] - pixel_values: Union[torch.Tensor, List[torch.Tensor]] + pixel_values: Union[torch.Tensor, list[torch.Tensor]] """ Shape:`(num_patches, num_channels, patch_size, patch_size)` """ @@ -447,7 +446,7 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata, **kwargs) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): config = self.config.text_config _KEYS_TO_MODIFY_MAPPING = { "language_model.lm_head": "lm_head", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c1593dcbe344..c15c0213b520 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -103,7 +104,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -285,7 +286,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -394,8 +395,8 @@ def forward( return hidden_states, aux_hidden_states return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -405,7 +406,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -582,8 +583,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -599,7 +600,7 @@ def maybe_remap_mistral( self, name: str, loaded_weight: torch.Tensor, - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: def permute(w: torch.Tensor, n_heads: int): attn_in = self.config.head_dim * n_heads diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 0fdc30f36f9b..40fdd84d8fb0 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -16,7 +16,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Any, Optional import torch from torch import nn @@ -25,8 +26,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -49,7 +49,7 @@ def custom_routing_function( gating_output: torch.Tensor, topk: int, renormalize: bool, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: router_scores, router_indices = fast_topk(gating_output, topk, dim=-1) # psuedo-standard is that the router scores are floats router_scores = torch.sigmoid(router_scores.float()) @@ -89,7 +89,7 @@ def __init__(self, quant_config=quant_config, bias=False, prefix=f"{prefix}.shared_expert", - reduce_results=False, # We need to do scatter before reduce + reduce_results=self.experts.must_reduce_shared_expert_outputs(), ) def forward(self, hidden_states): @@ -102,7 +102,8 @@ def forward(self, hidden_states): experts_out = routed_out + shared_out if self.tp_size > 1: - experts_out = tensor_model_parallel_all_reduce(experts_out) + experts_out = self.experts.maybe_all_reduce_tensor_model_parallel( + experts_out) return experts_out @@ -115,7 +116,7 @@ def __init__(self, num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -300,7 +301,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -335,9 +336,9 @@ def load_moe_expert_weights( self, name: str, loaded_weight: torch.Tensor, - params_dict: Dict[str, nn.Parameter], - loaded_params: Set[str], - expert_params_mapping: List[Tuple[str, str, int, str]], + params_dict: dict[str, nn.Parameter], + loaded_params: set[str], + expert_params_mapping: list[tuple[str, str, int, str]], fused: bool = True, ) -> bool: expert_param_loaded = False @@ -390,8 +391,8 @@ def load_moe_expert_weights( expert_param_loaded = True return expert_param_loaded - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -412,7 +413,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ckpt_up_proj_name="gate_up_proj", num_experts=1) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "experts.gate_up_proj" in name or "experts.down_proj" in name: fused_experts_params = True @@ -489,8 +490,8 @@ def _init_model(self, prefix=prefix, layer_type=layer_type) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> 
Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -506,7 +507,7 @@ def permute_qk_weight_for_rotary( self, name: str, loaded_weight: torch.Tensor, - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: def permute(w: torch.Tensor, n_heads: int): attn_in = self.config.head_dim * n_heads diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 4e51daa220e4..018ecc2a8c0f 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -92,8 +92,8 @@ def forward( hidden_states = hidden_states + residual return hidden_states, hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -103,7 +103,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -150,7 +150,7 @@ def forward( ) -> tuple[torch.Tensor, torch.Tensor]: return self.model(input_ids, positions, hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, skip_prefixes=None, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 9761c8389db2..2302d1352de6 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -56,7 +57,7 @@ def forward( embeds: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: residual = hidden_states embeds = self.input_layernorm(embeds) @@ -140,8 +141,8 @@ def forward( hidden_states, hidden_prenorm = self.norm(hidden_states, residual) return hidden_states, hidden_prenorm - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -151,7 +152,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if 'midlayer.' 
in name: name = name.replace('midlayer.', 'layers.0.') @@ -228,7 +229,7 @@ def combine_hidden_states( # combine multiple auxiliary hidden states returned by eagle3 return self.model.fc(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, skip_prefixes=None, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6287fdb3300c..95c1a0ca0b98 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -2,8 +2,8 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, - TypeVar, Union, cast) +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union, cast) import torch import torch.nn as nn @@ -751,8 +751,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c7e8d6991b25..e731f1bfdb9a 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, TypeVar, Union) +from collections.abc import Iterable, Mapping +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union) import torch import torch.nn as nn @@ -266,8 +267,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -450,7 +451,7 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaNextImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -577,7 +578,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index a5ff189cfdb5..9303ea121727 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -2,7 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -35,7 +35,7 @@ class LlavaNextVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, 
List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size, num_frames, num_channels, height, width)` @@ -300,8 +300,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.language_model.model.make_empty_intermediate_tensors) def _validate_video_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -326,7 +326,7 @@ def _parse_and_validate_video_input( A legal video input should have the following dimensions: { "pixel_values_videos" : - List[b, Tensor(nb_frames, nb_channels, height, width)] + list[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values_videos = kwargs.pop("pixel_values_videos", None) @@ -460,8 +460,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # This model doesn't support images for now diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5c2b388e403d..49f1ecb4be89 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -2,8 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, List, Literal, Optional, Protocol, Set, Tuple, - TypedDict, Union) +from typing import Final, Literal, Optional, Protocol, TypedDict, Union import torch import torch.nn as nn @@ -471,8 +470,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_image_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -530,8 +529,8 @@ def _parse_and_validate_image_input( raise AssertionError("This line should be unreachable.") def _validate_video_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -557,7 +556,7 @@ def _parse_and_validate_video_input( A legal video input should have the following dimensions: { "pixel_values_videos" : - List[b, Tensor(nb_frames, nb_channels, height, width)] + list[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values_videos = kwargs.pop("pixel_values_videos", None) @@ -706,7 +705,7 @@ def _merge_image_patch_embeddings(self, def _process_image_pixels( self, inputs: LlavaOnevisionImagePixelInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: assert self.vision_tower is not None pixel_values = inputs["pixel_values"] @@ -735,7 +734,7 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaOnevisionImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -948,7 +947,7 
@@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index af78ece66bbe..ce76a76b6574 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -30,7 +31,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class MambaDecoderLayer(nn.Module): @@ -153,10 +154,10 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") @@ -247,7 +248,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape = ( self.config.intermediate_size // world_size, @@ -265,7 +266,7 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 72daf34c4412..858a1633befa 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA2 model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -35,7 +36,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class Mamba2DecoderLayer(nn.Module): @@ -241,7 +242,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape, temporal_state_shape = None, None @@ -279,10 +280,10 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - 
loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 25839727898f..47d0ef9cc6bb 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Tuple import torch @@ -25,8 +24,8 @@ def at_layer_idx(self, layer_idx): class MambaCacheManager(ConstantSizeCache): def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, - num_mamba_layers: int, conv_state_shape: Tuple[int, int], - temporal_state_shape: Tuple[int, int]): + num_mamba_layers: int, conv_state_shape: tuple[int, int], + temporal_state_shape: tuple[int, int]): # Determine max batch size to set size of MambaCache max_batch_size = vllm_config.scheduler_config.max_num_seqs diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index a19d7da5654b..ac0b281f359c 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, List, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -96,13 +97,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: # checkpoint file has token_map tensor. self.token_map = None - def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]: + def forward(self, hidden_states: torch.Tensor) -> list[torch.Tensor]: return [block(hidden_states) for block in self.blocks] def compute_logits( - self, hidden_states: List[torch.Tensor], - sampling_metadata: SamplingMetadata) -> List[torch.Tensor]: - logits_lst: List[torch.Tensor] = [] + self, hidden_states: list[torch.Tensor], + sampling_metadata: SamplingMetadata) -> list[torch.Tensor]: + logits_lst: list[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): _logits = self.logits_processor(lm_head, hs, sampling_metadata) @@ -127,9 +128,9 @@ def compute_logits( def sample( self, - logits: List[torch.Tensor], + logits: list[torch.Tensor], sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: logits = torch.stack(logits, dim=0).float() logprobs = torch.log_softmax(logits, dim=-1) token_ids = logits.argmax(-1) # support only top-1 for now @@ -144,7 +145,7 @@ def sample( token_prob_list.append(probs[:, seq_group.sample_indices]) token_logprob_list.append(logprobs[:, seq_group.sample_indices]) - outputs: List[Optional[SamplerOutput]] = [] + outputs: list[Optional[SamplerOutput]] = [] for idx in range(len(sampling_metadata.seq_groups)): outputs.append( SamplerOutput( @@ -160,7 +161,7 @@ def generate_proposals( self, previous_hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: return self.sample( logits=self.compute_logits( hidden_states=self.forward(previous_hidden_states), @@ -169,10 +170,10 @@ def generate_proposals( sampling_metadata=sampling_metadata, ) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() weights_map = 
{} diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index b882aeebb08d..49ea64e029d6 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -24,7 +24,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiMo model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -87,8 +88,8 @@ def forward( hidden_states = hidden_states + residual return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), @@ -97,7 +98,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "mtp_layers" in name: continue diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index c2f1cf4112fe..adcfcaa6b1e6 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiMo-MTP model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -193,8 +194,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), @@ -204,7 +205,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 866dc3f466e7..d99ae81468a9 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -23,7 +23,8 @@ # limitations under the License. 
"""Inference-only MiniCPM model compatible with HuggingFace weights.""" import math -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -190,7 +191,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -329,7 +330,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -428,8 +429,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -446,7 +447,7 @@ def load_weights(self, weights: Iterable[Tuple[str, for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -582,8 +583,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 1b24c38cef1b..2a6867d12d99 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -23,7 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any, Dict, Optional +from typing import Any, Optional import torch from torch import nn @@ -58,7 +58,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index f42d48e919cd..ae5df0f9273f 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -23,8 +23,7 @@ # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -559,8 +558,8 @@ def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): self.audio_encoder_layer = -1 return model - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["tts"]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 300360f785ae..04cc7e35e345 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -26,8 +26,7 @@ from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import numpy as np import torch @@ -118,7 +117,7 @@ def __init__(self, num_heads: int, kv_dim: Optional[int] = None, norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - max_size: Tuple[int, int] = (70, 70), + max_size: tuple[int, int] = (70, 70), quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: super().__init__(num_queries, @@ -133,7 +132,7 @@ def __init__(self, self._set_2d_pos_cache(self.max_size) def _set_2d_pos_cache(self, - max_size: Tuple[int, int], + max_size: tuple[int, int], device: torch.types.Device = "cpu") -> None: pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim, max_size, @@ -203,7 +202,7 @@ def forward(self, x: torch.Tensor, return x -def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: +def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: version_float = getattr(config, "version", None) # The old configs do not include version number @@ -938,8 +937,8 @@ def compute_logits( ) -> Optional[torch.Tensor]: return self.llm.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 951f4e2304a1..0285402dadf7 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -3,7 +3,8 @@ import copy import math import re -from typing import Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.distributed @@ -127,7 +128,7 @@ def forward( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert residual is None, "RMSNorm does not support residual connection." 
return self._forward(x) @@ -178,7 +179,7 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device) query_cast = query.to(self.cache_dtype) @@ -708,11 +709,11 @@ def __init__( def forward(self, hidden_states: torch.Tensor, positions: torch.Tensor, - kv_caches: Union[List[Dict], Optional[torch.Tensor]], + kv_caches: Union[list[dict], Optional[torch.Tensor]], attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], is_warmup: bool = False, - **kwargs) -> Tuple[torch.Tensor, torch.Tensor]: + **kwargs) -> tuple[torch.Tensor, torch.Tensor]: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata @@ -1072,10 +1073,10 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() def which_layer(name: str) -> int: if "layers" in name: diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 4ac60f97bb5f..14c1250ca3b4 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping -from typing import Literal, Optional, Set, Tuple, TypedDict, Union, cast +from typing import Literal, Optional, TypedDict, Union, cast import torch import torch.nn as nn @@ -357,7 +357,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 42ec786f3a59..2b9cbf10440a 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -2,8 +2,8 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, - TypeVar, Union) +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union) import torch import torch.nn as nn @@ -589,8 +589,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1513c8dad097..1968bf9e68af 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Mixtral model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -314,8 +315,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -332,7 +333,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -479,7 +480,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 7c022a5b8f68..4de83d12be6a 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import numpy as np import torch @@ -397,8 +398,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -407,7 +408,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 0c1d61c01f91..713c9e8d203f 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -16,7 +16,7 @@ """PyTorch Mllama model.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import numpy as np import torch @@ -224,7 +224,7 @@ def apply( return mm_inputs - def _get_num_image_in_last_group(self, prompt_token_ids: List[int]) -> int: + def _get_num_image_in_last_group(self, prompt_token_ids: list[int]) -> int: num_images = 0 for token_id in prompt_token_ids[::-1]: if token_id == self.info.get_hf_config().image_token_index: @@ -370,8 +370,8 @@ def __init__( self, in_channels: int, out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]], + kernel_size: Union[int, tuple[int, int]], + stride: Union[int, tuple[int, int]], bias: bool = False, ) -> None: super().__init__() @@ 
-603,7 +603,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> Union[tuple, BaseModelOutput]: encoder_states = () for i, encoder_layer in enumerate(self.layers): @@ -878,7 +878,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], + kv_range_for_decode: Optional[list[tuple[int, int]]], cross_attention_states: Optional[torch.Tensor], ) -> torch.Tensor: q, k, v = self.qkv_proj(hidden_states, cross_attention_states) @@ -905,7 +905,7 @@ def _attention_with_mask( k: torch.Tensor, v: torch.Tensor, attention_mask: torch.Tensor, - kv_range_for_decode: List[Tuple[int, int]], + kv_range_for_decode: list[tuple[int, int]], ) -> torch.Tensor: kv_cache = self.attn.kv_cache[self.pipeline_parallel_rank] attn_metadata: AttentionMetadata = get_forward_context().attn_metadata @@ -1019,7 +1019,7 @@ def forward( hidden_states: torch.Tensor, cross_attention_states: torch.Tensor, cross_attention_mask: torch.Tensor, - kv_range_for_decode: Optional[List[Tuple[int, int]]], + kv_range_for_decode: Optional[list[tuple[int, int]]], full_text_row_masked_out_mask: torch.Tensor, ) -> torch.Tensor: residual = hidden_states @@ -1089,8 +1089,8 @@ def forward( positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, + kv_range_for_decode: Optional[list[tuple[int, int]]], + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1150,8 +1150,8 @@ def forward( positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, + kv_range_for_decode: Optional[list[tuple[int, int]]], + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1221,7 +1221,7 @@ def compute_logits( return logits def unpack_data(self, - image_data: Union[List[torch.Tensor], torch.Tensor], + image_data: Union[list[torch.Tensor], torch.Tensor], padding_value=0) -> torch.Tensor: if isinstance(image_data, torch.Tensor): # torch.Tensor assert isinstance( image_data[0], torch.Tensor), "Image data is not properly batched."
- # List[torch.Tensor] + # list[torch.Tensor] bsz = len(image_data) max_length = max(t.size(0) for t in image_data) trailing_dims = image_data[0].shape[1:] @@ -1248,24 +1248,24 @@ def unpack_data(self, def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by # MultiModalKwargs.batch, so pixel_values here can be: - # - List[torch.Tensor]: + # - list[torch.Tensor]: # with shape (num_image, num_tiles, 3, image_res, image_res) # - torch.Tensor: # with shape (bs, num_image, num_tiles, 3, image_res, image_res) - pixel_values: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + pixel_values: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + image_embeds: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) - aspect_ratio_ids: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + aspect_ratio_ids: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_ids", None) - aspect_ratio_mask: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + aspect_ratio_mask: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_mask", None) @@ -1293,10 +1293,10 @@ def _parse_and_validate_image_input(self, **kwargs: object): def _get_and_validate_encoder_lens( self, - encoder_seq_lens: List[int], - num_tiles: List[List[int]], + encoder_seq_lens: list[int], + num_tiles: list[list[int]], num_tokens_per_tile: int, - ) -> List[int]: + ) -> list[int]: # Get the actual number of encoder tokens for each sample. 
# Because attn_metadata.encoder_seq_lens only counts the last # group of images for each sample, which is used to cheat the @@ -1318,7 +1318,7 @@ def _get_and_validate_encoder_lens( def flat_encoder_result(self, cross_attention_states: torch.Tensor, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: List[int]): + actual_encoder_seq_lens: list[int]): cross_attention_states_flat = torch.zeros( sum(actual_encoder_seq_lens), @@ -1342,8 +1342,8 @@ def get_cross_attention_states( self, image_inputs: MllamaImagePixelInputs, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: List[int], - ) -> Tuple[torch.Tensor]: + actual_encoder_seq_lens: list[int], + ) -> tuple[torch.Tensor]: # NOTE: llama's reference implementation runs vision model on CPU pixel_values = image_inputs['data'] aspect_ratio_ids = image_inputs['aspect_ratio_ids'] @@ -1367,10 +1367,10 @@ def get_cross_attention_mask( self, input_ids: torch.Tensor, attn_metadata: AttentionMetadata, - num_tiles: List[List[int]], + num_tiles: list[list[int]], num_tokens_per_tile: int, dtype: torch.dtype, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: token_ids = input_ids.tolist() start = 0 batch_token_ids = [] @@ -1422,7 +1422,7 @@ def forward( input_ids: torch.Tensor, positions: torch.Tensor, **kwargs: object, - ) -> Union[Tuple, CausalLMOutputWithPast]: + ) -> Union[tuple, CausalLMOutputWithPast]: attn_metadata = get_forward_context().attn_metadata if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: @@ -1476,8 +1476,8 @@ def forward( return outputs - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1487,7 +1487,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - updated_params: Set[str] = set() + updated_params: set[str] = set() for name, loaded_weight in weights: if 'patch_embedding.weight' in name: name = name.replace('patch_embedding.weight', @@ -1538,7 +1538,7 @@ def get_mm_mapping(self) -> MultiModelKeys: tower_model="vision_model") -def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: +def skip_attention_mask(sparse_mask: list[list[int]]) -> bool: for mask in sparse_mask: # Skip text-only samples.
if len(mask) == 0: @@ -1556,10 +1556,10 @@ def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: def convert_sparse_cross_attention_mask_to_dense( - sparse_mask: List[List[List[int]]], - num_tiles: List[List[int]], - lengths: List[int], -) -> Tuple[np.ndarray, List[Tuple[int, int]]]: + sparse_mask: list[list[list[int]]], + num_tiles: list[list[int]], + lengths: list[int], +) -> tuple[np.ndarray, list[tuple[int, int]]]: total_length = sum(lengths) total_tiles = sum([sum(tiles) for tiles in num_tiles]) dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 741b9837398c..8c98492c0bed 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -18,7 +18,7 @@ import math from collections.abc import Iterable, Mapping from itertools import tee -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -582,7 +582,7 @@ def _get_prompt_updates( mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, - ) -> List[PromptUpdate]: + ) -> list[PromptUpdate]: assert ( mm_items.get_count("image", strict=False) == 0 or "aspect_ratios" in out_mm_kwargs @@ -778,26 +778,26 @@ def compute_logits( def separate_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], prefix: str, - ) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[ + ) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[ str, torch.Tensor]]]: weights1, weights2 = tee(weights, 2) - def get_prefix_weights() -> Iterable[Tuple[str, torch.Tensor]]: + def get_prefix_weights() -> Iterable[tuple[str, torch.Tensor]]: for name, data in weights1: if name.startswith(prefix): yield (name, data) - def get_other_weights() -> Iterable[Tuple[str, torch.Tensor]]: + def get_other_weights() -> Iterable[tuple[str, torch.Tensor]]: for name, data in weights2: if not name.startswith(prefix): yield (name, data) return get_prefix_weights(), get_other_weights() - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -806,7 +806,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), ] params_dict = dict(self.named_parameters()) - updated_params: Set[str] = set() + updated_params: set[str] = set() # language_model is an Llama4ForCausalLM instance. We load it's # using llama4's load_weights routine. 
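The hunks above all converge on the same PEP 585 style: Iterable comes from collections.abc, and the deprecated typing.Tuple/List/Set aliases become builtin tuple/list/set generics in the load_weights signatures. Below is a minimal, self-contained sketch of that signature pattern; the function name and body are hypothetical stand-ins for the per-model methods touched in this patch, not code from it.

from collections.abc import Iterable

import torch


def collect_loaded_param_names(
        weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    # Hypothetical illustration only: walk the (name, tensor) pairs the way
    # the per-model load_weights methods do, and report which names were seen.
    loaded_params: set[str] = set()
    for name, _tensor in weights:
        loaded_params.add(name)
    return loaded_params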
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 2920427f94f7..a7d7aa7d44ef 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Iterable, List, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -148,7 +148,7 @@ def generate_proposals( previous_hidden_states: torch.Tensor, num_predict_tokens: int, sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: if num_predict_tokens > self.max_speculative_tokens: raise ValueError(f"Max speculative tokens for model is " f"{self.max_speculative_tokens}, but " @@ -190,10 +190,10 @@ def generate_proposals( return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: name = name.replace("speculator.", "") param = params_dict.get(name) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 73effb207bce..86552aa05bf9 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -212,11 +213,11 @@ def __init__( eps=config.norm_eps, bias=config.norm_bias) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.hf_to_vllm_mapper.apply(weights) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: continue @@ -280,7 +281,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._pooler = CrossEncodingPooler(config, self.classifier, ModernBertPooler(config)) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 23814e6322d2..25e6f594069e 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -4,7 +4,7 @@ # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py from dataclasses import dataclass, field -from typing import List, Union +from typing import Union @dataclass @@ -46,17 +46,17 @@ class ModelKeys: @dataclass class MultiModelKeys(ModelKeys): - language_model: List[str] = field(default_factory=list) - connector: List[str] = field(default_factory=list) + language_model: list[str] = field(default_factory=list) + connector: list[str] = field(default_factory=list) # vision tower and audio tower - tower_model: List[str] = field(default_factory=list) - generator: List[str] = field(default_factory=list) + tower_model: list[str] = field(default_factory=list) + generator: list[str] = field(default_factory=list) @staticmethod - def 
from_string_field(language_model: Union[str, List[str]] = None, - connector: Union[str, List[str]] = None, - tower_model: Union[str, List[str]] = None, - generator: Union[str, List[str]] = None, + def from_string_field(language_model: Union[str, list[str]] = None, + connector: Union[str, list[str]] = None, + tower_model: Union[str, list[str]] = None, + generator: Union[str, list[str]] = None, **kwargs) -> 'MultiModelKeys': def to_list(value): diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 42bbb77a22c0..e215582a37ac 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -4,7 +4,7 @@ from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass from functools import cached_property, partial -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import numpy as np import torch @@ -90,7 +90,7 @@ class MolmoImageInputs(TypedDict): @dataclass class VisionBackboneConfig: - image_default_input_size: Tuple[int, int] = (336, 336) + image_default_input_size: tuple[int, int] = (336, 336) image_patch_size: int = 14 image_pos_patch_size: int = 14 image_emb_dim: int = 1024 @@ -267,7 +267,7 @@ def __init__( for _ in range(config.image_num_layers) ]) - def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + def forward(self, x: torch.Tensor) -> list[torch.Tensor]: hidden_states = [] for r in self.resblocks: x = r(x) @@ -334,7 +334,7 @@ def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor: def forward(self, x: torch.Tensor, - patch_num: Optional[int] = None) -> List[torch.Tensor]: + patch_num: Optional[int] = None) -> list[torch.Tensor]: """ : param x: (batch_size, num_patch, n_pixels) """ @@ -434,7 +434,7 @@ def __init__( ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -570,7 +570,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Self Attention if residual is None: residual = hidden_states @@ -596,7 +596,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Self Attention residual = hidden_states hidden_states = self.self_attn( @@ -740,15 +740,15 @@ def forward( # image_features: (batch_size, num_image, num_patch, d_model) return image_features - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("merged_linear", "gate_proj", 0), ("merged_linear", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -855,10 +855,10 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: 
Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: @@ -1530,7 +1530,7 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) weights = _get_weights_with_merged_embedding(weights) @@ -1548,8 +1548,8 @@ def get_mm_mapping(self) -> MultiModelKeys: def _get_weights_with_merged_embedding( - weights: Iterable[Tuple[str, torch.Tensor]] -) -> Iterable[Tuple[str, torch.Tensor]]: + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: embedding_weights = {} for name, weight in weights: if "wte.embedding" in name: diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index c367d90f847b..9f11d4a42273 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -42,9 +42,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import math +from collections.abc import Sequence from copy import deepcopy from functools import cached_property -from typing import List, Optional, Sequence, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -222,7 +223,7 @@ def __init__( self, out_dim: int, in_dim: int = 3, - patch_size: Union[int, Tuple[int, int]] = (14, 14), + patch_size: Union[int, tuple[int, int]] = (14, 14), pos_emb_height: int = 14, pos_emb_width: int = 14, ): @@ -526,7 +527,7 @@ def patch_merger( x: torch.Tensor, grid_hw: torch.Tensor, merge_kernel_size: list[int, int] = (2, 2), -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: d_model = x.size(-1) outputs = [] diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 77bd794058cd..6c396d778ae7 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -2,7 +2,8 @@ # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -265,10 +266,10 @@ def forward( hidden_states = self.norm_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: @@ -323,7 +324,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 5208c0796c8d..862c53535e8a 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Nemotron model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -69,7 +70,7 @@ def _cast_if_autocast_enabled(*args): class NemotronLayerNorm1P(nn.LayerNorm): def __init__(self, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True, @@ -133,7 +134,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -267,7 +268,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -441,8 +442,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -450,7 +451,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".qkv_proj", ".v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 988b994b7689..f4d5a77f2086 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only deci model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Type, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if self._is_no_op_attention: @@ -168,7 +169,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - layer_type: Type[DeciLMDecoderLayer] = DeciLMDecoderLayer, + layer_type: type[DeciLMDecoderLayer] = DeciLMDecoderLayer, ): super().__init__() @@ -260,8 +261,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -271,7 +272,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -428,8 +429,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 0781ca168f84..a36b62cd2284 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMo model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -209,7 +210,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Attention block. residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -338,8 +339,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -349,7 +350,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 422b53d86f11..a41a959cdb04 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -23,8 +23,9 @@ # limitations under the License. 
"""Inference-only OLMo2 model compatible with HuggingFace weights.""" +from collections.abc import Iterable from functools import partial -from typing import Iterable, Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -365,7 +366,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index e6925e125690..9a07f57fd999 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -102,7 +103,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 4096, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -307,8 +308,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -327,7 +328,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). @@ -439,8 +440,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=["rotary_emb.inv_freq"], diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index d258eddae25d..8376d62410d4 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only OPT model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -312,8 +313,8 @@ def forward( intermediate_tensors, inputs_embeds=inputs_embeds) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -321,7 +322,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -400,8 +401,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head.weight"] diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 8d9c000750d7..1ccd1fe1f741 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -5,7 +5,8 @@ # Copyright (c) OrionStar Inc. # LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE """Inference-only Orion-14B model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -72,7 +73,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -186,7 +187,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -259,8 +260,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -270,7 +271,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -341,8 +342,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=([ diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 
5204c751216f..e03705d48f3e 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -17,8 +17,8 @@ # limitations under the License. """ PyTorch Ovis model.""" import math -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -211,7 +211,7 @@ class OvisImagePatchInputs(TypedDict): `(batch_size * (num_patches + 1))` """ - patches_per_image: List[int] + patches_per_image: list[int] """ List of number of total patches for each image in the batch. This is used to restore the first two dimensions of `flat_data`. @@ -545,8 +545,8 @@ def compute_logits( logits = self.llm.compute_logits(hidden_states, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 8699ae52622d..427005e9b704 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -391,7 +391,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index eacf02433b57..d46b95fea5a8 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
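The hunks above and below all apply the same typing cleanup: `typing.Tuple`/`List`/`Dict`/`Set` are replaced by the builtin generics permitted since PEP 585 (Python 3.9+), and `Iterable`/`Mapping`/`Set` are imported from `collections.abc` instead of `typing`. A minimal before/after sketch of the convention, using a hypothetical function name:

from collections.abc import Iterable

import torch


# Old style (deprecated typing aliases):
#   def load_weights(..., weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: ...
# New style (builtin generics + collections.abc ABCs):
def load_weights_sketch(weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    loaded_params: set[str] = set()  # builtin `set`, not `typing.Set`
    for name, _tensor in weights:
        loaded_params.add(name)
    return loaded_params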
"""Inference-only persimmon model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -260,10 +261,10 @@ def forward( hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if is_pp_missing_parameter(name, self): continue @@ -336,7 +337,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index fc2b108bad97..330ad5c59448 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -36,7 +36,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """Inference-only Phi-1.5 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -248,8 +249,8 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -257,7 +258,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v") ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -348,7 +349,7 @@ def compute_logits( sampling_metadata, self.lm_head.bias) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 338e87b4285f..d00d7d886d67 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -230,8 +231,8 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], + Optional[tuple[torch.Tensor]]]: qkv, _ = self.query_key_value(hidden_states) qkv = qkv.view(qkv.shape[:-1] + @@ -352,10 +353,10 @@ def forward( hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: 
Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: continue @@ -454,8 +455,8 @@ def forward( output_hidden_states = output_hidden_states return output_hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head.weight"] diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a1442251b992..bb4d46be3f99 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -16,7 +16,7 @@ # limitations under the License. import re from collections.abc import Iterable, Mapping, Sequence -from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -94,7 +94,7 @@ def _init_img_processor(hf_config: PretrainedConfig, class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -113,7 +113,7 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. 
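For context on the `Phi3VImagePixelInputs` / `Phi3VImageEmbeddingInputs` TypedDicts retyped above, a minimal sketch of what such an input dict looks like; the class name and tensor sizes below are illustrative only (336 is the CLIP ViT-L/14-336 image size that the validator in the next hunk checks against):

from typing import Literal, TypedDict, Union

import torch


class ImagePixelInputsSketch(TypedDict):
    type: Literal["pixel_values"]
    data: Union[torch.Tensor, list[torch.Tensor]]


# Two images with (1 + num_patches) = 5 and 2 tiles respectively,
# each tile being a 3 x 336 x 336 pixel tensor.
example: ImagePixelInputsSketch = {
    "type": "pixel_values",
    "data": [torch.zeros(5, 3, 336, 336), torch.zeros(2, 3, 336, 336)],
}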
@@ -571,8 +571,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size expected_dims = (3, h, w) @@ -707,8 +707,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index e5ff9ceddef7..fd154940ea7f 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import numpy as np import torch @@ -392,7 +392,7 @@ def forward(self, pixel_values: torch.FloatTensor, class Phi4MMImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -417,7 +417,7 @@ class Phi4MMImagePixelInputs(TypedDict): class Phi4MMImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -426,7 +426,7 @@ class Phi4MMImageEmbeddingInputs(TypedDict): class Phi4MMAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_audios, 80, M)""" @@ -1031,7 +1031,7 @@ def _process_audio_input(self, audio_input: Phi4MMAudioInputs, return audio_embeds def _parse_and_validate_image_input(self, - **kwargs: object) -> Optional[Dict]: + **kwargs: object) -> Optional[dict]: input_image_embeds: NestedTensors = kwargs.get("input_image_embeds") if input_image_embeds is None: return None @@ -1238,7 +1238,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None: weights = ((name, data) for name, data in weights if "lora" not in name) diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 34a7a73d057a..98cef75069ae 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -6,7 +6,7 @@ #!/usr/bin/env python3 import abc import math -from typing import List, Literal, Optional +from typing import Literal, Optional import numpy as np import torch @@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module): if set to True, use GLULinear module, otherwise, used GLUPointWiseConv module. default to False. 
- attention_innner_dim: int, optional + attention_inner_dim: int, optional if equal to -1, attention dim for linears k/q/v is - equal to d_model. otherwise attention_innner_dim is used. + equal to d_model. otherwise attention_inner_dim is used. default -1. attention_glu_type: str, optional activation function for glu used in the multihead attention, @@ -148,7 +148,7 @@ def __init__( conv_glu_type="sigmoid", bias_in_glu=True, linear_glu_in_convm=False, - attention_innner_dim=-1, + attention_inner_dim=-1, attention_glu_type="swish", activation_checkpointing="", export=False, @@ -169,7 +169,7 @@ def __init__( n_head, d_model, dropout_rate, - attention_innner_dim, + attention_inner_dim, attention_glu_type, bias_in_glu, use_pt_scaled_dot_product_attention= @@ -746,7 +746,7 @@ class ConformerEncoder(TransformerEncoderBase): attention_group_size = attenion_heads = Multi-Query Attention """ - extra_multi_layer_output_idxs: List[int] + extra_multi_layer_output_idxs: list[int] def __init__( # pylint: disable-all self, diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index 4051763cec8c..f468fdbd5417 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -5,7 +5,7 @@ # but implemented by the Phi-Speech team #!/usr/bin/env python3 import math -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import torch.nn.functional as F @@ -1586,7 +1586,7 @@ def forward( memory: Optional[Tensor] = None, pos_emb: Optional[Tensor] = None, att_mask: Optional[Tensor] = None, - ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: + ) -> tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: """AttModule forward Args: diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 2dc55e4c352e..7f2e9fdf7c4e 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only PhiMoE model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -505,8 +506,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -521,7 +522,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -657,8 +658,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index c0b492dbfcb9..c664d2371e27 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -4,7 +4,7 @@ from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, fields from functools import cached_property -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -438,18 +438,18 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - def is_vision_encoder_weights(weight: Tuple[str, torch.Tensor]): + def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("vision_encoder") - def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]): + def is_vision_lang_adapter_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("vision_language_adapter") - def is_patch_merger(weight: Tuple[str, torch.Tensor]): + def is_patch_merger(weight: tuple[str, torch.Tensor]): return weight[0].startswith("patch_merger") - def is_pre_mm_projector_norm(weight: Tuple[str, torch.Tensor]): + def is_pre_mm_projector_norm(weight: tuple[str, torch.Tensor]): return weight[0].startswith("pre_mm_projector_norm") # Get references to parameters for direct loading @@ -566,7 +566,7 @@ def apply_rotary_emb_vit( xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) assert freqs_cis.dtype == torch.complex64 @@ -671,7 +671,7 @@ def forward( return x -def position_meshgrid(patch_embeds_list: List[torch.Tensor], ) -> torch.Tensor: +def position_meshgrid(patch_embeds_list: list[torch.Tensor], ) -> torch.Tensor: positions = torch.cat([ torch.stack( torch.meshgrid( @@ -733,7 +733,7 @@ def freqs_cis(self) -> torch.Tensor: def forward( self, - images: 
List[torch.Tensor], + images: list[torch.Tensor], ) -> torch.Tensor: """ Args: @@ -1023,7 +1023,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, position_embeddings: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) @@ -1249,8 +1249,8 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1260,7 +1260,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.transformer.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 790c48ccd216..55a65f8078a4 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only PLaMo2 model.""" import math -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -659,7 +660,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = (self.config.mamba_num_heads * self.config.hidden_size_per_head) @@ -682,7 +683,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index c10ef45440b1..40ac5e30a368 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -16,7 +16,7 @@ # limitations under the License. 
"""Inference-only IBM/NASA Prithvi Geospatial model.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Optional, Set, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -154,7 +154,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): "by PrithviGeospatialMAE.") def _parse_and_validate_multimodal_data( - self, **kwargs) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + self, **kwargs) -> tuple[torch.Tensor, Optional[torch.Tensor]]: pixel_values = kwargs.pop("pixel_values", None) if not isinstance(pixel_values, torch.Tensor): @@ -195,8 +195,8 @@ def pooler( ) -> Optional[PoolerOutput]: return PoolerOutput([PoolingSequenceGroupOutput(hidden_states)]) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_list = [] model_buffers = dict(self.named_buffers()) loaded_buffers = [] diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index e75294bc6cba..2fda87a4ff0f 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -6,7 +6,8 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" import json -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -76,7 +77,7 @@ def __init__( num_heads: int, max_position_embeddings: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -166,7 +167,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -284,15 +285,15 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 60f8a7cd7270..0d0d98c59dbc 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Any, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -108,7 +109,7 @@ def __init__( rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: Optional[dict[str, Any]] = None, @@ -245,7 +246,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -290,14 +291,14 @@ def __init__(self, # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) + assert config.max_window_layers == config.num_hidden_layers, ( + "Sliding window for some but all layers is not supported. " + "This model uses sliding window but `max_window_layers` = {} " + "is less than `num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( + config.max_window_layers, + config.num_hidden_layers, + )) self.config = config self.quant_config = quant_config @@ -367,8 +368,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -378,7 +379,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -490,8 +491,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -559,7 +560,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index d8e178f9cd47..d89b822dd873 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -21,10 +21,10 @@ # limitations under the License. 
"""Inference-only Qwen2.5-Omni model (thinker part).""" +from collections.abc import Iterable, Mapping, Sequence from copy import copy from functools import partial -from typing import (Any, Dict, Iterable, List, Mapping, Optional, Sequence, - Set, Tuple, Union) +from typing import Any, Optional, Union import torch import torch.nn as nn @@ -138,7 +138,7 @@ def get_hf_processor( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, List[float]]] = None, + fps: Optional[Union[float, list[float]]] = None, **kwargs: object, ) -> Qwen2_5OmniProcessor: if fps is not None: @@ -550,7 +550,7 @@ def _parse_and_validate_audio_input( def _parse_and_validate_image_input( self, - **kwargs: Dict[str, Any], + **kwargs: dict[str, Any], ) -> Optional[Qwen2_5_VLImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) @@ -589,7 +589,7 @@ def _parse_and_validate_image_input( def _parse_and_validate_video_input( self, - **kwargs: Dict[str, Any], + **kwargs: dict[str, Any], ) -> Optional[Qwen2_5_VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) video_embeds = kwargs.pop("video_embeds", None) @@ -627,7 +627,7 @@ def _parse_and_validate_video_input( def _process_audio_input( self, audio_input: Qwen2AudioInputs, - audio_hashes: List[str] = None, + audio_hashes: list[str] = None, cached_audio_features: torch.Tensor = None, ) -> torch.Tensor: @@ -676,7 +676,7 @@ def _process_image_input( def _process_video_input( self, video_input: Qwen2_5_VLVideoInputs, - video_hashes: List[str] = None, + video_hashes: list[str] = None, cached_video_embeds: torch.Tensor = None) -> torch.Tensor: if video_input["type"] == "video_embeds": return video_input["video_embeds"].type(self.visual.dtype) @@ -825,7 +825,7 @@ def get_multimodal_embeddings_v0( if audio_input is None and image_input is None and video_input is None: return None - multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + multimodal_embeddings: list[tuple[NestedTensors, str]] = [] if audio_input is not None: audio_embeds = self._process_audio_input(audio_input) @@ -891,8 +891,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=["talker.", "token2wav."], diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8728de95134d..5904ad1f1f24 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -24,9 +24,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping from functools import partial -from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from typing import Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -91,7 +91,7 @@ class Qwen2_5_VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all images' features. + - list[`torch.Tensor`]: A list of tensors holding all images' features. 
Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). @@ -137,7 +137,7 @@ class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all videos' features. + - list[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -709,8 +709,8 @@ def forward( hidden_states = hidden_states[reverse_indices, :] return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("attn.qkv.", "attn.q.", "q"), @@ -718,7 +718,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("attn.qkv.", "attn.v.", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -750,7 +750,7 @@ def get_hf_processor( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, List[float]]] = None, + fps: Optional[Union[float, list[float]]] = None, **kwargs: object, ) -> Qwen2_5_VLProcessor: if fps is not None: @@ -1116,8 +1116,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index f30bf08ab18b..3182a7532578 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -22,7 +22,7 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Optional, TypedDict, Union import torch import torch.nn as nn @@ -403,7 +403,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 14f9f8158940..7cf98dc7a4ea 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
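The `stacked_params_mapping` tables retyped in the `load_weights` hunks above (e.g. Qwen2, OLMoE) drive a simple renaming step: per-projection checkpoint weights are mapped onto vLLM's fused parameters plus a shard id. A simplified standalone sketch of that step (the real loops also handle experts, KV-scale names and pipeline-parallel missing params):

from typing import Optional, Union

stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]


def remap(checkpoint_name: str) -> tuple[str, Optional[Union[str, int]]]:
    # e.g. "layers.0.self_attn.q_proj.weight" is loaded into shard "q" of the
    # fused "layers.0.self_attn.qkv_proj.weight" parameter.
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name in checkpoint_name:
            return checkpoint_name.replace(weight_name, param_name), shard_id
    return checkpoint_name, None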
"""Inference-only Qwen2MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch import torch.nn.functional as F @@ -33,9 +34,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -129,7 +128,8 @@ def __init__( intermediate_size=config.shared_expert_intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, - reduce_results=False, + reduce_results=self.experts.must_reduce_shared_expert_outputs( + ), ) else: self.shared_expert = None @@ -156,7 +156,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if shared_output is not None: final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 final_hidden_states) return final_hidden_states.view(orig_shape) @@ -170,12 +170,12 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", - dual_chunk_attention_config: Optional[Dict[str, Any]] = None, + dual_chunk_attention_config: Optional[dict[str, Any]] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -390,8 +390,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -410,7 +410,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). @@ -533,8 +533,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 90f799e6734e..81dc38988c9d 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -5,7 +5,8 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. 
"""Inference-only Qwen2-RM model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -95,8 +96,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ac0a6de523df..0ff0836b0897 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,8 +25,7 @@ """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -102,7 +101,7 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all images' features. + - list[`torch.Tensor`]: A list of tensors holding all images' features. Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). @@ -142,7 +141,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all videos' features. + - list[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -662,8 +661,8 @@ def forward( return x - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -671,7 +670,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -1394,8 +1393,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 40e0ccc1bab6..dbe2be8a73d5 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen3 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -63,7 +64,7 @@ def __init__(self, rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER) -> None: super().__init__() @@ -201,7 +202,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -309,8 +310,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 51cfa5796187..aae5401721df 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen3MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -30,9 +31,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -137,7 +136,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits=router_logits) final_hidden_states = final_hidden_states if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 final_hidden_states) return final_hidden_states.view(orig_shape) @@ -151,7 +150,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, head_dim: Optional[int] = None, rms_norm_eps: float = 1e-06, @@ -375,8 +374,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -395,7 +394,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight 
in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). @@ -529,8 +528,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 199b885a5850..3701153bace5 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -9,10 +9,9 @@ import math import re import unicodedata -from collections.abc import Collection, Mapping, Sequence -from collections.abc import Set as AbstractSet +from collections.abc import Collection, Mapping, Sequence, Set from functools import lru_cache, partial -from typing import Callable, List, Literal, Optional, TypedDict, Union +from typing import Callable, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -395,7 +394,7 @@ class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore def tokenize( self, text: str, - allowed_special: Union[AbstractSet[str], str] = "all", + allowed_special: Union[Set[str], str] = "all", disallowed_special: Union[Collection[str], str] = (), **kwargs, ) -> list[Union[bytes, str]]: @@ -411,7 +410,7 @@ def tokenize( def _decode( self, - token_ids: Union[int, List[int]], + token_ids: Union[int, list[int]], skip_special_tokens: bool = False, errors: Optional[str] = None, **kwargs, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 06a0e6574630..c55f7ccd344f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -10,10 +10,10 @@ import sys import tempfile from abc import ABC, abstractmethod +from collections.abc import Set from dataclasses import dataclass, field from functools import lru_cache -from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type, - TypeVar, Union) +from typing import Callable, Optional, TypeVar, Union import cloudpickle import torch.nn as nn @@ -266,7 +266,7 @@ class _ModelInfo: supports_v0_only: bool @staticmethod - def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": return _ModelInfo( architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), @@ -290,7 +290,7 @@ def inspect_model_cls(self) -> _ModelInfo: raise NotImplementedError @abstractmethod - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: raise NotImplementedError @@ -301,10 +301,10 @@ class _RegisteredModel(_BaseRegisteredModel): """ interfaces: _ModelInfo - model_cls: Type[nn.Module] + model_cls: type[nn.Module] @staticmethod - def from_model_cls(model_cls: Type[nn.Module]): + def from_model_cls(model_cls: type[nn.Module]): return _RegisteredModel( interfaces=_ModelInfo.from_model_cls(model_cls), model_cls=model_cls, @@ -313,7 +313,7 @@ def from_model_cls(model_cls: Type[nn.Module]): def inspect_model_cls(self) -> _ModelInfo: return self.interfaces - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: return self.model_cls @@ -330,7 +330,7 @@ def inspect_model_cls(self) -> _ModelInfo: return _run_in_subprocess( lambda: _ModelInfo.from_model_cls(self.load_model_cls())) - def 
load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: mod = importlib.import_module(self.module_name) return getattr(mod, self.class_name) @@ -339,7 +339,7 @@ def load_model_cls(self) -> Type[nn.Module]: def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, -) -> Optional[Type[nn.Module]]: +) -> Optional[type[nn.Module]]: from vllm.platforms import current_platform current_platform.verify_model_arch(model_arch) try: @@ -366,15 +366,15 @@ def _try_inspect_model_cls( @dataclass class _ModelRegistry: # Keyed by model_arch - models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) + models: dict[str, _BaseRegisteredModel] = field(default_factory=dict) - def get_supported_archs(self) -> AbstractSet[str]: + def get_supported_archs(self) -> Set[str]: return self.models.keys() def register_model( self, model_arch: str, - model_cls: Union[Type[nn.Module], str], + model_cls: Union[type[nn.Module], str], ) -> None: """ Register an external model to be used in vLLM. @@ -413,7 +413,7 @@ def register_model( self.models[model_arch] = model - def _raise_for_unsupported(self, architectures: List[str]): + def _raise_for_unsupported(self, architectures: list[str]): all_supported_archs = self.get_supported_archs() if any(arch in all_supported_archs for arch in architectures): @@ -426,7 +426,7 @@ def _raise_for_unsupported(self, architectures: List[str]): f"Supported architectures: {all_supported_archs}") def _try_load_model_cls(self, - model_arch: str) -> Optional[Type[nn.Module]]: + model_arch: str) -> Optional[type[nn.Module]]: if model_arch not in self.models: return None @@ -440,8 +440,8 @@ def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: def _normalize_archs( self, - architectures: Union[str, List[str]], - ) -> List[str]: + architectures: Union[str, list[str]], + ) -> list[str]: if isinstance(architectures, str): architectures = [architectures] if not architectures: @@ -458,8 +458,8 @@ def _normalize_archs( def inspect_model_cls( self, - architectures: Union[str, List[str]], - ) -> Tuple[_ModelInfo, str]: + architectures: Union[str, list[str]], + ) -> tuple[_ModelInfo, str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -471,8 +471,8 @@ def inspect_model_cls( def resolve_model_cls( self, - architectures: Union[str, List[str]], - ) -> Tuple[Type[nn.Module], str]: + architectures: Union[str, list[str]], + ) -> tuple[type[nn.Module], str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -484,77 +484,77 @@ def resolve_model_cls( def is_text_generation_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_text_generation_model def is_pooling_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_pooling_model def is_cross_encoder_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_cross_encoding def is_multimodal_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_multimodal def is_pp_supported_model( self, - architectures: Union[str, List[str]], + 
architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_pp def model_has_inner_state( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.has_inner_state def is_attention_free_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_attention_free def is_hybrid_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_hybrid def is_noops_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.has_noops def is_transcription_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_transcription def is_v1_compatible( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return not model_cls.supports_v0_only diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index ebefe7689c97..9a4d0ab2dd4d 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import Iterable, Optional, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ def _build_model(self, prefix=prefix, embedding_class=RobertaEmbedding) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) # Separate weights in "roberta"-prefixed and all else (not in memory). # For use with models like FacebookAI/roberta-base. 
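As context for the `_ModelRegistry` retyping in the registry.py hunks above, `register_model` accepts either a model class or a string per its `Union[type[nn.Module], str]` signature; a hedged usage sketch for the class-based form (the plugin module and model class below are hypothetical, only `ModelRegistry` itself is vLLM API):

from vllm import ModelRegistry

from my_plugin.modeling import MyModelForCausalLM  # hypothetical out-of-tree model

ModelRegistry.register_model("MyModelForCausalLM", MyModelForCausalLM)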
@@ -187,7 +188,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.classifier = RobertaClassificationHead(config) self._pooler = CrossEncodingPooler(config, self.classifier) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): bert_weights, task_weights = roberta_task_weights_filter(weights) bert_weights = self.jina_to_vllm_mapper.apply(bert_weights) @@ -249,8 +250,8 @@ def create_position_ids_from_input_ids(input_ids, def roberta_task_weights_filter( - all_weights: Iterable[Tuple[str, torch.Tensor]] -) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str, + all_weights: Iterable[tuple[str, torch.Tensor]] +) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[str, torch.Tensor]]]: """ Separate task-specific weights that are applied on top diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 75fcf540b0b1..3b5334afa7af 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -3,7 +3,8 @@ within a vision language model.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -265,7 +266,7 @@ def __init__( def forward( self, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, None]: + ) -> tuple[torch.Tensor, None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -480,8 +481,8 @@ def forward( feature_sample_layers=feature_sample_layers, ) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -489,7 +490,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index e78c37b65f87..91f6c7753c68 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -8,7 +8,7 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, TypeVar, Union import torch import torch.nn as nn @@ -937,8 +937,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = [ "action_embed", "temporal_embed", "track_embed", "track_embed_decoder", "box_token", "cg_criterion", "cg_model", diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 17217dc9a247..31dec55026ba 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Optional +from typing import Optional from transformers import 
SmolVLMProcessor @@ -21,7 +21,7 @@ class SmolVLMProcessingInfo(Idefics3ProcessingInfo): def get_hf_processor( self, *, - max_image_size: Optional[Dict[str, int]] = None, + max_image_size: Optional[dict[str, int]] = None, **kwargs: object, ) -> SmolVLMProcessor: if max_image_size is not None: diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index f86aff7ba7ef..1c9f3c77c7a8 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -23,7 +23,8 @@ # limitations under the License. """Inference-only Solar model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -101,7 +102,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -236,7 +237,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -437,8 +438,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -448,7 +449,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 1cbda7267e4c..8c2ad6f19251 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -20,7 +20,8 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -180,7 +181,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -252,8 +253,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -263,7 +264,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, 
weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -335,8 +336,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # Models trained using ColossalAI may include these tensors in diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 6eebe4c4d614..5927afa91f49 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -19,7 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Starcoder2 model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -255,8 +256,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -265,7 +266,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -342,8 +343,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # Models trained using ColossalAI may include these tensors in diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 379e19e1beea..7d713d23c772 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -19,7 +19,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -50,14 +50,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): layer.mlp.gate_up_proj.bias = None layer.mlp.gate_up_proj.skip_bias_add = True - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ('gate_up_proj', 'gate_proj', 0), ('gate_up_proj', 'up_proj', 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() total_num_heads = self.config.n_head head_dim = self.config.hidden_size // total_num_heads for name, loaded_weight in weights: @@ -128,8 +128,8 @@ def _init_model(self, layer_type: type[nn.Module] = LlamaDecoderLayer): return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 7b946ad6aac7..a8f30b2f27bf 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -15,7 +15,8 @@ # limitations under the License. """Wrapper around `transformers` models""" import re -from typing import Iterable, Literal, Optional, Union +from collections.abc import Iterable +from typing import Literal, Optional, Union import torch from torch import nn diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 0bc5d218f8d0..c1a4dc1b33d7 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,7 +3,7 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -619,8 +619,8 @@ def compute_logits(self, hidden_states: torch.Tensor, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."]) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0458e3ce03b5..5cc501622891 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import itertools +from collections.abc import Iterable, Mapping from dataclasses import dataclass, field -from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, Union, overload) +from typing import Callable, Literal, Optional, Protocol, Union, overload import torch import torch.nn as nn @@ -58,8 +58,8 @@ def _map_name(self, key: str) -> Optional[str]: return key def apply( - self, weights: Iterable[Tuple[str, torch.Tensor]] - ) -> Iterable[Tuple[str, torch.Tensor]]: + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[tuple[str, 
torch.Tensor]]: return ((out_name, data) for name, data in weights if (out_name := self._map_name(name)) is not None) @@ -84,8 +84,8 @@ def __init__( self, module: nn.Module, *, - skip_prefixes: Optional[List[str]] = None, - ignore_unexpected_prefixes: Optional[List[str]] = None, + skip_prefixes: Optional[list[str]] = None, + ignore_unexpected_prefixes: Optional[list[str]] = None, ) -> None: super().__init__() @@ -95,8 +95,8 @@ def __init__( def _groupby_prefix( self, - weights: Iterable[Tuple[str, torch.Tensor]], - ) -> Iterable[Tuple[str, Iterable[Tuple[str, torch.Tensor]]]]: + weights: Iterable[tuple[str, torch.Tensor]], + ) -> Iterable[tuple[str, Iterable[tuple[str, torch.Tensor]]]]: weights_by_parts = ((weight_name.split(".", 1), weight_data) for weight_name, weight_data in weights) @@ -129,7 +129,7 @@ def _load_param( self, base_prefix: str, param: nn.Parameter, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], ) -> Iterable[str]: for weight_name, weight_data in weights: weight_qualname = self._get_qualname(base_prefix, weight_name) @@ -159,7 +159,7 @@ def _load_param( yield weight_qualname def _add_loadable_non_param_tensors(self, module: nn.Module, - child_params: Dict[str, torch.Tensor]): + child_params: dict[str, torch.Tensor]): """ Add tensor names that are not in the model params that may be in the safetensors, e.g., batch normalization stats. @@ -182,7 +182,7 @@ def _load_module( self, base_prefix: str, module: nn.Module, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], ) -> Iterable[str]: if isinstance(module, PPMissingLayer): return @@ -251,10 +251,10 @@ def _load_module( def load_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], *, mapper: Optional[WeightsMapper] = None, - ) -> Set[str]: + ) -> set[str]: if mapper is not None: weights = mapper.apply(weights) @@ -292,13 +292,13 @@ def flatten_bn(x: torch.Tensor) -> torch.Tensor: @overload -def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]: +def flatten_bn(x: list[torch.Tensor]) -> list[torch.Tensor]: ... @overload def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: Literal[True], ) -> torch.Tensor: @@ -307,18 +307,18 @@ def flatten_bn( @overload def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[List[torch.Tensor], torch.Tensor]: +) -> Union[list[torch.Tensor], torch.Tensor]: ... def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[List[torch.Tensor], torch.Tensor]: +) -> Union[list[torch.Tensor], torch.Tensor]: """ Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. @@ -442,7 +442,7 @@ def merge_multimodal_embeddings( input_ids: torch.Tensor, inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, - placeholder_token_id: Union[int, List[int]], + placeholder_token_id: Union[int, list[int]], ) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the @@ -596,7 +596,7 @@ def make_layers( num_hidden_layers: int, layer_fn: LayerFn, prefix: str, -) -> Tuple[int, int, torch.nn.ModuleList]: +) -> tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. 
""" @@ -614,10 +614,10 @@ def make_layers( # NOTE: don't use lru_cache here because it can prevent garbage collection -_model_to_pp_missing_layer_names: Dict[int, List[str]] = {} +_model_to_pp_missing_layer_names: dict[int, list[str]] = {} -def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]: +def get_pp_missing_layer_names(model: torch.nn.Module) -> list[str]: """Get the names of the missing layers in a pipeline parallel model.""" model_id = id(model) if model_id in _model_to_pp_missing_layer_names: @@ -645,7 +645,7 @@ def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool: for missing_layer_name in get_pp_missing_layer_names(model)) -def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): +def make_empty_intermediate_tensors_factory(keys: list[str], hidden_size: int): def make_empty_intermediate_tensors( batch_size: int, @@ -684,7 +684,7 @@ def extract_layer_index(layer_name: str) -> int: - "model.encoder.layers.0.sub.1" -> ValueError """ subnames = layer_name.split(".") - int_vals: List[int] = [] + int_vals: list[int] = [] for subname in subnames: try: int_vals.append(int(subname)) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 908cd7885aa8..c6e303d6024a 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -2,7 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import torch from torch import nn @@ -382,7 +382,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_positions.weight.copy_( sinusoids(*self.embed_positions.weight.shape)) - def forward(self, input_features: Union[torch.Tensor, List[torch.Tensor]]): + def forward(self, input_features: Union[torch.Tensor, list[torch.Tensor]]): hidden_states = [] for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) @@ -460,7 +460,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def forward( self, - input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], input_ids: Optional[torch.Tensor], positions: torch.Tensor, ) -> torch.Tensor: @@ -474,14 +474,14 @@ def forward( def get_encoder_outputs( self, - input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], ) -> Optional[torch.Tensor]: if input_features is None: return None return self.encoder(input_features) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), @@ -491,7 +491,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -722,8 +722,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: 
Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) # add fake zeros bias for k_proj to state_dict @@ -732,8 +732,8 @@ def load_weights(self, weights: Iterable[Tuple[str, def _create_fake_bias_for_k_proj( - weights: Iterable[Tuple[str, torch.Tensor]] -) -> Iterable[Tuple[str, torch.Tensor]]: + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: """ Create full zeros bias for k_proj weight in self-attn and x-attn layers. So that the bias for k_proj in qkv_proj can be initialized with zeros. diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index eddccbba5a2d..48e254bdd85b 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -6,8 +6,9 @@ architectures in a hybrid model optimized for efficient sequence modeling. The model alternates between state space model layers and attention-based layers. """ +from collections.abc import Iterable from itertools import cycle -from typing import Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -54,7 +55,7 @@ def __init__( self, input_dim: int, rank: int, - output_dim: Union[int, List[int]], + output_dim: Union[int, list[int]], quant_config: Optional[QuantizationConfig] = None, ): """Initialize the attention layer. @@ -279,7 +280,7 @@ def __init__( self, config: Zamba2Config, bare_block_idx: int, - num_hybrid_layers: Dict[int, int], + num_hybrid_layers: dict[int, int], quant_config: Optional[QuantizationConfig] = None, ) -> None: """Initialize the MLP layer. @@ -769,8 +770,8 @@ def forward( hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -779,7 +780,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for chkpt_weight_name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in chkpt_weight_name: @@ -914,9 +915,9 @@ def forward(self, return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers: Dict[str, + def copy_inputs_before_cuda_graphs(self, input_buffers: dict[str, torch.Tensor], - **kwargs) -> Dict[str, torch.Tensor]: + **kwargs) -> dict[str, torch.Tensor]: """Copy inputs before CUDA graph capture. Args: @@ -930,7 +931,7 @@ def copy_inputs_before_cuda_graphs(self, input_buffers: Dict[str, input_buffers, **kwargs) def get_seqlen_agnostic_capture_inputs( - self, batch_size: int) -> Dict[str, torch.Tensor]: + self, batch_size: int) -> dict[str, torch.Tensor]: """Get inputs for sequence-length-agnostic graph capture. Args: @@ -941,7 +942,7 @@ def get_seqlen_agnostic_capture_inputs( return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: """Calculate shapes for Mamba's convolutional and state caches. 
Returns: @@ -1001,7 +1002,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 61d8eb62ffaf..2335af843ed5 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -10,40 +10,43 @@ Union, cast, final) import numpy as np -import torch -import torch.types -from PIL.Image import Image -from transformers import BatchFeature from typing_extensions import NotRequired, TypeAlias from vllm.jsontree import JSONTree, json_map_leaves -from vllm.utils import full_groupby, is_list_of +from vllm.utils import LazyLoader, full_groupby, is_list_of if TYPE_CHECKING: + import torch + import torch.types + from PIL.Image import Image + from transformers.feature_extraction_utils import BatchFeature + from .hasher import MultiModalHashDict +else: + torch = LazyLoader("torch", globals(), "torch") _T = TypeVar("_T") -HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"] """ A {class}`transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. """ -HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor, - list[np.ndarray], list[torch.Tensor]] +HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor", + list[np.ndarray], list["torch.Tensor"]] """ A {class}`transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace `VideoProcessor`. """ -HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor] +HfAudioItem: TypeAlias = Union[list[float], np.ndarray, "torch.Tensor"] """ Represents a single audio item, which can be passed to a HuggingFace `AudioProcessor`. """ -ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor] +ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"] """ A {class}`transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. @@ -53,7 +56,7 @@ these are directly passed to the model without HF processing. """ -VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor] +VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"] """ A {class}`transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace `VideoProcessor`. @@ -64,7 +67,7 @@ """ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], - torch.Tensor] + "torch.Tensor"] """ Represents a single audio item, which can be passed to a HuggingFace `AudioProcessor`. @@ -132,7 +135,7 @@ class PlaceholderRange: length: int """The length of the placeholder.""" - is_embed: Optional[torch.Tensor] = None + is_embed: Optional["torch.Tensor"] = None """ A boolean mask of shape `(length,)` indicating which positions between `offset` and `offset + length` to assign embeddings to. 
@@ -158,8 +161,8 @@ def __eq__(self, other: object) -> bool: return nested_tensors_equal(self.is_embed, other.is_embed) -NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor, - tuple[torch.Tensor, ...]] +NestedTensors: TypeAlias = Union[list["NestedTensors"], list["torch.Tensor"], + "torch.Tensor", tuple["torch.Tensor", ...]] """ Uses a list instead of a tensor if the dimensions of each element do not match. """ @@ -261,7 +264,7 @@ def build_elems( """ Construct {class}`MultiModalFieldElem` instances to represent the provided data. - + This is the inverse of {meth}`reduce_data`. """ raise NotImplementedError @@ -422,7 +425,7 @@ def flat(modality: str, modality: The modality of the multi-modal item that uses this keyword argument. slices: For each multi-modal item, a slice (dim=0) or a tuple of - slices (dim>0) that is used to extract the data corresponding + slices (dim>0) that is used to extract the data corresponding to it. dim: The dimension to extract data, default to 0. @@ -465,7 +468,7 @@ def flat(modality: str, @staticmethod def flat_from_sizes(modality: str, - size_per_item: torch.Tensor, + size_per_item: "torch.Tensor", dim: int = 0): """ Defines a field where an element in the batch is obtained by @@ -602,7 +605,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod def from_hf_inputs( - hf_inputs: BatchFeature, + hf_inputs: "BatchFeature", config_by_key: Mapping[str, MultiModalFieldConfig], ): # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` @@ -792,7 +795,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: return self._items_by_modality[modality] -MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] +MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]] """ A dictionary containing placeholder ranges for each modality. """ @@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict): mm_hashes: Optional["MultiModalHashDict"] """The hashes of the multi-modal data.""" - mm_placeholders: MultiModalPlaceholderDict + mm_placeholders: "MultiModalPlaceholderDict" """ For each modality, information about the placeholder tokens in `prompt_token_ids`. 
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index f9588431c8ef..6e9ec9555802 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -8,11 +8,9 @@ import numpy as np import torch -from PIL.Image import Image -from transformers import BatchFeature from typing_extensions import TypeAlias, TypeGuard, assert_never -from vllm.utils import is_list_of +from vllm.utils import LazyLoader, is_list_of from .audio import AudioResampler from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, @@ -22,6 +20,11 @@ _T = TypeVar("_T") _I = TypeVar("_I") +if TYPE_CHECKING: + import PIL.Image as PILImage +else: + PILImage = LazyLoader("PILImage", globals(), "PIL.Image") + class ModalityDataItems(ABC, Generic[_T, _I]): """ @@ -131,6 +134,8 @@ def __init__( Mapping[str, MultiModalFieldConfig], ], ) -> None: + from transformers.feature_extraction_utils import BatchFeature + super().__init__(data, modality) missing_required_data_keys = required_fields - data.keys() @@ -200,7 +205,7 @@ def __init__(self, data: Sequence[HfImageItem]) -> None: def get_image_size(self, item_idx: int) -> ImageSize: image = self.get(item_idx) - if isinstance(image, Image): + if isinstance(image, PILImage.Image): return ImageSize(*image.size) if isinstance(image, (np.ndarray, torch.Tensor)): _, h, w = image.shape @@ -226,7 +231,7 @@ def get_num_frames(self, item_idx: int) -> int: def get_frame_size(self, item_idx: int) -> ImageSize: image = self.get(item_idx)[0] # Assume that the video isn't empty - if isinstance(image, Image): + if isinstance(image, PILImage.Image): return ImageSize(*image.size) if isinstance(image, (np.ndarray, torch.Tensor)): _, h, w = image.shape @@ -253,7 +258,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): def get_count(self, modality: str, *, strict: bool = True) -> int: """ Get the number of data items belonging to a modality. - + If `strict=False`, return `0` instead of raising {exc}`KeyError` even if the modality is not found. 
""" @@ -399,7 +404,7 @@ def _parse_image_data( if self._is_embeddings(data): return ImageEmbeddingItems(data) - if (isinstance(data, Image) + if (isinstance(data, PILImage.Image) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3): data_items = [data] @@ -420,7 +425,7 @@ def _parse_video_data( if self._is_embeddings(data): return VideoEmbeddingItems(data) - if (is_list_of(data, Image) + if (is_list_of(data, PILImage.Image) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 4): data_items = [data] diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 92f9e70b5234..320a26f37555 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -13,7 +13,6 @@ TypeVar, Union, cast) import torch -from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import assert_never from vllm.inputs import InputProcessingContext @@ -31,6 +30,10 @@ MultiModalDataParser) if TYPE_CHECKING: + from transformers.configuration_utils import PretrainedConfig + from transformers.feature_extraction_utils import BatchFeature + from transformers.processing_utils import ProcessorMixin + from .profiling import BaseDummyInputsBuilder logger = init_logger(__name__) @@ -1047,10 +1050,10 @@ def model_id(self) -> str: def get_tokenizer(self) -> AnyTokenizer: return self.ctx.tokenizer - def get_hf_config(self) -> PretrainedConfig: + def get_hf_config(self) -> "PretrainedConfig": return self.ctx.get_hf_config() - def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> "ProcessorMixin": """ Subclasses can override this method to handle specific kwargs from model config or user inputs. @@ -1165,7 +1168,7 @@ def _to_mm_items( @abstractmethod def _get_mm_fields_config( self, - hf_inputs: BatchFeature, + hf_inputs: "BatchFeature", hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: """Given the HF-processed data, output the metadata of each field.""" @@ -1222,7 +1225,7 @@ def _call_hf_processor( # This refers to the data to be passed to HF processor. mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], - ) -> BatchFeature: + ) -> "BatchFeature": """ Call the HF processor on the prompt text and associated multi-modal data. 
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 72e9b65d763c..3685fd4c3458 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import base64 +from abc import abstractmethod from functools import partial from io import BytesIO from pathlib import Path @@ -9,6 +10,8 @@ import numpy.typing as npt from PIL import Image +from vllm import envs + from .base import MediaIO from .image import ImageMediaIO @@ -48,10 +51,35 @@ def sample_frames_from_video(frames: npt.NDArray, class VideoLoader: @classmethod - def load_bytes(self, data: bytes, num_frames: int = -1) -> npt.NDArray: + @abstractmethod + def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray: raise NotImplementedError +class VideoLoaderRegistry: + + def __init__(self) -> None: + self.name2class: dict[str, type] = {} + + def register(self, name: str): + + def wrap(cls_to_register): + self.name2class[name] = cls_to_register + return cls_to_register + + return wrap + + @staticmethod + def load(cls_name: str) -> VideoLoader: + cls = VIDEO_LOADER_REGISTRY.name2class.get(cls_name) + assert cls is not None, f"VideoLoader class {cls_name} not found" + return cls() + + +VIDEO_LOADER_REGISTRY = VideoLoaderRegistry() + + +@VIDEO_LOADER_REGISTRY.register("opencv") class OpenCVVideoBackend(VideoLoader): def get_cv2_video_api(self): @@ -122,7 +150,8 @@ def __init__( self.image_io = image_io self.num_frames = num_frames - self.video_loader = OpenCVVideoBackend + video_loader_backend = envs.VLLM_VIDEO_LOADER_BACKEND + self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend) def load_bytes(self, data: bytes) -> npt.NDArray: return self.video_loader.load_bytes(data, self.num_frames) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 9163b97c51a0..bdee8b2f821d 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -158,6 +158,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: "currently not supported with CUDA Graphs.") vllm_config.model_config.enforce_eager = True compilation_config.use_cudagraph = False + compilation_config.use_inductor = False @classmethod def get_current_memory_usage(cls, diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 454167a0dc95..9dd5191da918 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import os from abc import abstractmethod from collections.abc import Sequence @@ -33,7 +35,7 @@ def vocab(self) -> dict[str, int]: return self.model_tokenizer.get_vocab() @abstractmethod - def is_reasoning_end(self, input_ids: list[int]) -> bool: + def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: """ Check if the reasoning content ends in the input_ids. @@ -106,7 +108,7 @@ class ReasoningParserManager: reasoning_parsers: dict[str, type] = {} @classmethod - def get_reasoning_parser(cls, name) -> type: + def get_reasoning_parser(cls, name: str | None) -> type[ReasoningParser]: """ Get reasoning parser by name which is registered by `register_module`. 
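The `vllm/multimodal/video.py` change above replaces the hard-coded OpenCV loader with a small registry: backends register under a name, `OpenCVVideoBackend` registers as `"opencv"`, and `VideoMediaIO` resolves the backend named by `envs.VLLM_VIDEO_LOADER_BACKEND`. A hedged usage sketch; the `"numpy_stub"` name and `StubVideoLoader` class are hypothetical and exist only to show the registration/lookup flow:

    import numpy as np
    import numpy.typing as npt

    from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader


    @VIDEO_LOADER_REGISTRY.register("numpy_stub")
    class StubVideoLoader(VideoLoader):

        @classmethod
        def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
            # A real backend would decode `data` here; the stub returns a
            # single black frame so the flow can be exercised end to end.
            return np.zeros((1, 224, 224, 3), dtype=np.uint8)


    # Mirrors what VideoMediaIO now does with VLLM_VIDEO_LOADER_BACKEND
    # (the OpenCV backend registered above stays the usual choice):
    # resolve the class by name and instantiate it.
    loader = VIDEO_LOADER_REGISTRY.load("numpy_stub")
    frames = loader.load_bytes(b"")  # placeholder bytes for the stub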
diff --git a/vllm/utils.py b/vllm/utils.py index 9a7da8067ba4..edfbb8c9481e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2350,6 +2350,24 @@ def split_zmq_path(path: str) -> Tuple[str, str, str]: return scheme, host, port +def make_zmq_path(scheme: str, host: str, port: Optional[int] = None) -> str: + """Make a ZMQ path from its parts. + + Args: + scheme: The ZMQ transport scheme (e.g. tcp, ipc, inproc). + host: The host - can be an IPv4 address, IPv6 address, or hostname. + port: Optional port number, only used for TCP sockets. + + Returns: + A properly formatted ZMQ path string. + """ + if not port: + return f"{scheme}://{host}" + if is_valid_ipv6_address(host): + return f"{scheme}://[{host}]:{port}" + return f"{scheme}://{host}:{port}" + + # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501 def make_zmq_socket( ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 69fc1ac69ab6..83e181116577 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -865,8 +865,10 @@ def forward( assert output is not None, "Output tensor must be provided." if attn_metadata is None: - # Profiling run. - return output + # The zero fill is required when used with DP + EP + # to ensure all ranks within a DP group compute the + # same expert outputs. + return output.fill_(0) num_actual_toks = attn_metadata.num_actual_tokens diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 3abb185c5b8f..7ce39110ac01 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -67,13 +67,13 @@ def __init__(self, runner, kv_cache_spec: AttentionSpec, max_model_len = self.runner.model_config.max_model_len assert max_model_len == 32768,\ "AITER MLA requires max_model_len=32768" - assert self.runner.block_size == 1, "AITER MLA" \ + assert self.kv_cache_spec.block_size == 1, "AITER MLA" \ "only supports block size 1." 
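For reference, the `make_zmq_path` helper added to `vllm/utils.py` above covers the three transport cases its docstring describes. Expected results are shown as illustrative comments:

    from vllm.utils import make_zmq_path

    make_zmq_path("ipc", "/tmp/engine.sock")   # "ipc:///tmp/engine.sock" (no port)
    make_zmq_path("tcp", "127.0.0.1", 5555)    # "tcp://127.0.0.1:5555"
    make_zmq_path("tcp", "::1", 5555)          # "tcp://[::1]:5555" (IPv6 hosts are bracketed)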
def _get_paged_kv_tensors( self, block_table: torch.Tensor, seq_lens: torch.Tensor) -> tuple[torch.Tensor, ...]: - page_size = self.runner.block_size + page_size = self.kv_cache_spec.block_size block_table_bounds = (seq_lens + page_size - 1) // page_size mask = (torch.arange(block_table.size(1), diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index c4922a716bc2..908bf1274125 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with PagedAttention and Triton prefix prefill.""" -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import torch @@ -12,10 +12,23 @@ from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import ( FlashAttentionMetadata, FlashAttentionMetadataBuilder) +from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.worker.block_table import BlockTable + +if TYPE_CHECKING: + from vllm.v1.worker.gpu_model_runner import GPUModelRunner logger = init_logger(__name__) +class TritonAttentionMetadataBuilder(FlashAttentionMetadataBuilder): + + def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec, + block_table: BlockTable): + super().__init__(runner, kv_cache_spec, block_table) + self.aot_schedule = False + + class TritonAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @@ -52,8 +65,8 @@ def use_cascade_attention(*args, **kwargs) -> bool: return False @staticmethod - def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]: - return FlashAttentionMetadataBuilder + def get_builder_cls() -> type["TritonAttentionMetadataBuilder"]: + return TritonAttentionMetadataBuilder class TritonAttentionImpl(AttentionImpl): diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 598fc871110e..da18ece7555a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -32,9 +32,16 @@ def create_empty(cls) -> "KVCacheBlocks": """Creates a new KVCacheBlocks instance with no blocks.""" return cls([]) - def get_block_ids(self) -> list[int]: - """Converts the KVCacheBlocks instance to a list of block IDs.""" - return [block.block_id for block in self.blocks] + def get_block_ids(self) -> list[list[int]]: + """ + Converts the KVCacheBlocks instance to block_ids. + + Returns: + list[list[int]]: A two-level list where + * the outer list corresponds to KV cache groups (only 1 group now) + * each inner list contains the block_ids of the blocks in that group + """ + return [[block.block_id for block in self.blocks]] def get_unhashed_block_ids(self) -> list[int]: """Get block_ids of unhashed blocks from KVCacheBlocks instance.""" @@ -300,9 +307,9 @@ def get_num_common_prefix_blocks( self, request: Request, num_running_requests: int, - ) -> int: + ) -> list[int]: """Calculate the number of common prefix blocks shared by all requests - in the RUNNING state. + in the RUNNING state for each kv cache group. The function determines this by selecting any request and iterating through its blocks. A block is considered a common prefix block if its @@ -332,11 +339,14 @@ def get_num_common_prefix_blocks( requests in the current step. Returns: - int: The number of common prefix blocks. + list[int]: The number of common prefix blocks for each kv cache + group. 
""" assert request.status == RequestStatus.RUNNING - return self.single_type_manager.get_num_common_prefix_blocks( - request.request_id, num_running_requests) + return [ + self.single_type_manager.get_num_common_prefix_blocks( + request.request_id, num_running_requests) + ] def free_block_hashes(self, request: Request) -> None: """Discard the block hashes for the request. @@ -354,10 +364,8 @@ def take_events(self) -> list[KVCacheEvent]: """ return self.block_pool.take_events() - def get_block_ids(self, request_id: str) -> list[int]: + def get_block_ids(self, request_id: str) -> list[list[int]]: """Get the block ids of a request.""" assert request_id in self.single_type_manager.req_to_blocks - return [ - block.block_id - for block in self.single_type_manager.req_to_blocks[request_id] - ] + return KVCacheBlocks(self.single_type_manager.req_to_blocks[request_id] + ).get_block_ids() diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 27c515835087..403b5401be75 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -577,14 +577,12 @@ def create_kv_cache_group_specs( """ kv_cache_groups = [] for layer_names_one_group in grouped_layer_names: - layer_spec = kv_cache_spec[layer_names_one_group[0]] - assert all( - kv_cache_spec[layer_name] == layer_spec - for layer_name in layer_names_one_group[1:]), ( - "All layers in the same KV cache group must share the same " - "KVCacheSpec.") + layer_specs = [ + kv_cache_spec[layer_name] for layer_name in layer_names_one_group + ] + merged_layer_spec = layer_specs[0].merge(layer_specs) kv_cache_groups.append( - KVCacheGroupSpec(layer_names_one_group, layer_spec)) + KVCacheGroupSpec(layer_names_one_group, merged_layer_spec)) return kv_cache_groups @@ -683,6 +681,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): head_size=spec.head_size, dtype=spec.dtype, use_mla=spec.use_mla, + sliding_window=spec.sliding_window, ) diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 24032498e50b..257234430983 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -26,7 +26,7 @@ class NewRequestData: mm_hashes: list[str] mm_positions: list[PlaceholderRange] sampling_params: SamplingParams - block_ids: list[int] + block_ids: list[list[int]] num_computed_tokens: int lora_request: Optional[LoRARequest] @@ -34,7 +34,7 @@ class NewRequestData: def from_request( cls, request: Request, - block_ids: list[int], + block_ids: list[list[int]], ) -> NewRequestData: return cls( req_id=request.request_id, @@ -85,7 +85,7 @@ class CachedRequestData: # request's block IDs instead of appending to the existing block IDs. resumed_from_preemption: bool new_token_ids: list[int] - new_block_ids: list[int] + new_block_ids: list[list[int]] num_computed_tokens: int @classmethod @@ -94,7 +94,7 @@ def from_request( request: Request, resumed_from_preemption: bool, new_token_ids: list[int], - new_block_ids: list[int], + new_block_ids: list[list[int]], ) -> CachedRequestData: return cls( req_id=request.request_id, @@ -131,9 +131,9 @@ class SchedulerOutput: # E.g., if a request has [0, 1], it could mean the vision encoder needs # to process that the request's 0-th and 1-th images in the current step. scheduled_encoder_inputs: dict[str, list[int]] - # Number of common prefix blocks for all requests. + # Number of common prefix blocks for all requests in each KV cache group. # This can be used for cascade attention. 
- num_common_prefix_blocks: int + num_common_prefix_blocks: list[int] # Request IDs that are finished in between the previous and the current # steps. This is used to notify the workers about the finished requests diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index f338e4ba1440..5ad05485e8f3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -173,7 +173,7 @@ def schedule(self) -> SchedulerOutput: # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, list[int]] = {} + req_to_new_block_ids: dict[str, list[list[int]]] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -477,7 +477,8 @@ def schedule(self) -> SchedulerOutput: # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. - num_common_prefix_blocks = 0 + num_common_prefix_blocks = [0] * len( + self.kv_cache_config.kv_cache_groups) if self.running: any_request = self.running[0] num_common_prefix_blocks = ( @@ -564,7 +565,7 @@ def _make_cached_request_data( request: Request, num_scheduled_tokens: int, num_scheduled_spec_tokens: int, - new_block_ids: list[int], + new_block_ids: list[list[int]], resumed_from_preemption: bool, ) -> CachedRequestData: # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -758,7 +759,8 @@ def update_from_output( # the outer lists can be of length > 1. new_logprobs = logprobs.slice(req_index, req_index + 1) - if new_token_ids and request.use_structured_output: + if new_token_ids and self.structured_output_manager.should_advance( + request): # NOTE: structured_output_request # should not be None if use_structured_output, we have # check above, so safe to ignore type warning @@ -767,11 +769,10 @@ def update_from_output( # Add newly generated spec token ids to the request. if spec_token_ids is not None: - if request.use_structured_output: + if self.structured_output_manager.should_advance(request): metadata = request.structured_output_request - assert metadata is not None and metadata.grammar is not None # Needs to happen after new_token_ids are accepted. - request.spec_token_ids = metadata.grammar.validate_tokens( + request.spec_token_ids = metadata.grammar.validate_tokens( # type: ignore[union-attr] spec_token_ids[req_index]) else: request.spec_token_ids = spec_token_ids[req_index] @@ -939,7 +940,9 @@ def _connector_finished( """ if self.connector is None: return False, None - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + assert len(self.kv_cache_config.kv_cache_groups + ) == 1, "KV connector only supports one KV cache group now" + block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] return self.connector.request_finished(request, block_ids) def _update_waiting_for_remote_kv(self, request: Request) -> bool: @@ -956,9 +959,10 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: """ if request.request_id not in self.finished_recving_kv_req_ids: return False - + assert len(self.kv_cache_config.kv_cache_groups + ) == 1, "KV connector only supports one KV cache group now" # Now that the blocks are ready, actually cache them. 
- block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] num_computed_tokens = len(block_ids) * self.block_size if num_computed_tokens == request.num_tokens: num_computed_tokens -= 1 diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 4fc0844cd1f4..2747fc7fabd1 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,8 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +import copy from dataclasses import dataclass +from typing import Optional import torch +from typing_extensions import Self from vllm.config import VllmConfig from vllm.logger import init_logger @@ -53,6 +56,16 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: """ raise NotImplementedError + @classmethod + def merge(cls, specs: list[Self]) -> Self: + """ + Merge a list of KVCacheSpec objects into a single KVCacheSpec object. + """ + assert all(spec.type_id == specs[0].type_id for spec in specs[1:]), ( + "All layers in the same KV cache group must share the same " + "type_id.") + return copy.deepcopy(specs[0]) + @dataclass class AttentionSpec(KVCacheSpec): @@ -71,6 +84,16 @@ def page_size_bytes(self) -> int: @dataclass class FullAttentionSpec(AttentionSpec): + sliding_window: Optional[int] = None + """ + When hybrid allocator is disabled and the model contains both full + attention layers and sliding window attention layers, sliding + window attention are regarded as full attention in KV cache manager + (blocks are allocated for all tokens), while computed as sliding window + attention in model runner. + In this case, we use FullAttentionSpec and record the sliding window size. + Default to None for not using sliding window attention. + """ @property def type_id(self) -> str: @@ -80,6 +103,25 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len return cdiv(max_model_len, self.block_size) * self.page_size_bytes + @classmethod + def merge(cls, specs: list[Self]) -> Self: + """ + Merge a list of FullAttentionSpec objects into a single + FullAttentionSpec object. + """ + merged_spec = super().merge(specs) + sliding_window = set(spec.sliding_window for spec in specs + if spec.sliding_window is not None) + if len(sliding_window) == 0: + merged_spec.sliding_window = None + elif len(sliding_window) == 1: + merged_spec.sliding_window = sliding_window.pop() + else: + raise ValueError( + "All sliding window layers in the same KV cache group " + "must have the same window size.") + return merged_spec + @dataclass class SlidingWindowSpec(AttentionSpec): diff --git a/vllm/v1/request.py b/vllm/v1/request.py index d2843b65ab59..d1cdd2c52750 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -72,7 +72,7 @@ def __init__( assert len(self.mm_inputs) == len(self.mm_hashes) # Read-only views - # Prevent directly appending to the these lists since + # Prevent directly appending to these lists since # they should also be updated simultaneously. 
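The `vllm/v1/kv_cache_interface.py` hunks above add a `merge()` classmethod and let `FullAttentionSpec` optionally record a `sliding_window` when sliding-window layers are folded into a full-attention KV cache group. A sketch of the merge semantics under stated assumptions; the constructor fields other than `sliding_window` are assumed from context and are not shown in this diff:

    import torch

    from vllm.v1.kv_cache_interface import FullAttentionSpec

    # Assumed AttentionSpec fields; only sliding_window is defined in this diff.
    common = dict(block_size=16, num_kv_heads=8, head_size=128,
                  dtype=torch.float16, use_mla=False)

    full = FullAttentionSpec(**common)                      # sliding_window=None
    swa = FullAttentionSpec(**common, sliding_window=4096)  # SWA layer kept in a full-attn group

    # No layer uses a window -> the merged spec keeps sliding_window=None.
    assert FullAttentionSpec.merge([full, full]).sliding_window is None
    # Exactly one distinct window size -> the merged spec records it.
    assert FullAttentionSpec.merge([full, swa]).sliding_window == 4096
    # Two different window sizes in one group raise ValueError instead.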
self.output_token_ids = ConstantList(self._output_token_ids) self.all_token_ids = ConstantList(self._all_token_ids) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5b84bc1f5ec3..012ddd093596 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -166,52 +166,12 @@ def propose( attn_metadata.max_query_len = 1 attn_metadata.query_start_loc = self.arange[:batch_size + 1] for _ in range(self.num_speculative_tokens - 1): - # Update the inputs. - # cast to int32 is crucial when eagle model is compiled. - # tensor.argmax() returns int64 by default. - input_ids = draft_token_ids_list[-1].int() - positions += 1 - - # NOTE(woosuk): We should handle the case where the draft model - # generates tokens beyond the max model length. Since it is complex - # to remove such requests from the batch, we keep them in the batch - # but adjust the position ids and slot mappings to avoid the - # out-of-range access during the model execution. The draft tokens - # generated with this adjustment should be ignored. - exceeds_max_model_len = positions >= self.max_model_len - # Mask out the position ids that exceed the max model length. - # Otherwise, we may get out-of-range error in RoPE. - clamped_positions = torch.where(exceeds_max_model_len, 0, - positions) - - # Increment the sequence lengths. - attn_metadata.max_seq_len += 1 - attn_metadata.seq_lens += 1 - # Consider max model length. - attn_metadata.max_seq_len = min(attn_metadata.max_seq_len, - self.max_model_len) - # For the requests that exceed the max model length, we set the - # sequence length to 1 to minimize their overheads in attention. - attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1) - - # Compute the slot mapping. - block_numbers = clamped_positions // self.block_size - block_ids = block_table.gather(dim=1, - index=block_numbers.view(-1, 1)) - block_ids = block_ids.view(-1) - attn_metadata.slot_mapping = (block_ids * self.block_size + - clamped_positions % self.block_size) - # Mask out the slot mappings that exceed the max model length. - # Otherwise, the KV cache will be inadvertently updated with the - # padding tokens. - attn_metadata.slot_mapping.masked_fill_(exceeds_max_model_len, - PADDING_SLOT_ID) - # copy inputs to buffer for cudagraph - self.input_ids[:batch_size] = input_ids - self.positions[:batch_size] = clamped_positions - self.hidden_states[:batch_size] = hidden_states + self.advance_speculative_state(draft_token_ids_list[-1], positions, + hidden_states, attn_metadata, + batch_size) + # copy inputs to buffer for cudagraph # Run the model. 
with set_forward_context(attn_metadata, self.vllm_config, @@ -233,6 +193,44 @@ def propose( draft_token_ids = torch.stack(draft_token_ids_list, dim=1) return draft_token_ids + def advance_speculative_state(self, draft_token_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + attn_metadata: FlashAttentionMetadata, + batch_size: int): + grid = lambda meta: (triton.cdiv(batch_size, meta['BLOCK_SIZE']), ) + attn_metadata.slot_mapping = torch.empty_like(positions) + advance_state_kernel[grid]( + # === Input tensors === + draft_token_ids, + positions, + hidden_states, + + # === Model input buffers to be updated === + self.input_ids[:batch_size], + self.positions[:batch_size], + self.hidden_states[:batch_size], + + # === Metadata tensors === + attn_metadata.seq_lens, + attn_metadata.block_table, + attn_metadata.slot_mapping, + + # === Scalar configuration === + self.max_model_len, + self.block_size, + + # === Execution control === + batch_size, + BLOCK_SIZE=1024, + PADDING_SLOT_ID=PADDING_SLOT_ID) + + # Increment the sequence lengths. + attn_metadata.max_seq_len += 1 + # Consider max model length. + attn_metadata.max_seq_len = min(attn_metadata.max_seq_len, + self.max_model_len) + @staticmethod def prepare_inputs( # [batch_size + 1] @@ -415,3 +413,82 @@ def prepare_input_kernel( index_start + offset, mask=offset < num_tokens, ) + + +@triton.jit +def advance_state_kernel( + draft_token_ids_ptr, + positions_ptr, + hidden_states_ptr, + + # === Model input buffers to be updated === + model_input_ids_ptr, + model_positions_ptr, + model_hidden_states_ptr, + + # === Metadata tensors === + seq_lens_ptr, + block_table_ptr, + slot_mapping_ptr, + + # === Scalar configuration === + model_max_len: int, + model_block_size: int, + + # === Execution control === + n_elements: int, + BLOCK_SIZE: tl.constexpr, + PADDING_SLOT_ID: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + draft_token_list_last = tl.load(draft_token_ids_ptr + offsets, mask=mask) + position = tl.load(positions_ptr + offsets, mask=mask) + seq_lens = tl.load(seq_lens_ptr + offsets, mask=mask) + hidden_states = tl.load(hidden_states_ptr + offsets, mask=mask) + + # Update the inputs. + # cast to int32 is crucial when eagle model is compiled. + # tensor.argmax() returns int64 by default. + input_id = draft_token_list_last.cast(tl.int32) + position = position + 1 + + # NOTE(woosuk): We should handle the case where the draft model + # generates tokens beyond the max model length. Since it is complex + # to remove such requests from the batch, we keep them in the batch + # but adjust the position ids and slot mappings to avoid the + # out-of-range access during the model execution. The draft tokens + # generated with this adjustment should be ignored. + exceeds_max_model_len = position >= model_max_len + # Mask out the position ids that exceed the max model length. + # Otherwise, we may get out-of-range error in RoPE. + clamped_position = tl.where(exceeds_max_model_len, 0, position) + + # For the requests that exceed the max model length, we set the + # sequence length to 1 to minimize their overheads in attention. 
+ seq_lens += 1 + seq_lens = tl.where(exceeds_max_model_len, 1, seq_lens) + + block_numbers = clamped_position // model_block_size + block_offsets = clamped_position % model_block_size + + # Gather from block_table[0, block_numbers] + block_ids = tl.load(block_table_ptr + block_numbers, mask=mask) + + # Compute slot mapping + slot_mapping = block_ids * model_block_size + block_offsets + + # Mask out the slot mappings that exceed the max model length. + # Otherwise, the KV cache will be inadvertently updated with the + # padding tokens. + slot_mapping = tl.where(exceeds_max_model_len, PADDING_SLOT_ID, + slot_mapping) + + tl.store(model_input_ids_ptr + offsets, input_id, mask=mask) + tl.store(positions_ptr + offsets, position, mask=mask) + tl.store(model_positions_ptr + offsets, clamped_position, mask=mask) + tl.store(seq_lens_ptr + offsets, seq_lens, mask=mask) + tl.store(slot_mapping_ptr + offsets, slot_mapping, mask=mask) + tl.store(model_hidden_states_ptr + offsets, hidden_states, mask=mask) diff --git a/vllm/v1/stats/__init__.py b/vllm/v1/stats/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py deleted file mode 100644 index 46818977dae5..000000000000 --- a/vllm/v1/stats/common.py +++ /dev/null @@ -1,453 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import time -from dataclasses import dataclass -from dataclasses import field as dataclass_field -from enum import IntEnum -from typing import ClassVar, Optional - -import msgspec -from msgspec import field as msgspec_field - -from vllm.sampling_params import SamplingParams - - -class RequestStatsUpdate( - msgspec.Struct, # type: ignore - array_like=True, - omit_defaults=True, - gc=False): - """ - An update to the request stats. - - This represents a stats update at a specific timestamp with metadata - associated with the update. - - NOTE: since there might be multiple processes generating updates at - different parts of the engine (e.g. input processor, scheduler, engine core, - etc.), we use the monotonic timestamp to record the update to compute any - intervals, and explicit wall-clock timestamp should be used for timestamps. - - WARNING: This assumes stats are generated in a single machine. If there are - potentially multiple machines, one should always generate the stats updates - on one single machine or use something else. - """ - - class Type(IntEnum): - """See `RequestStats` for the lifecycle of a request.""" - - # Request arrived at the engine frontend. - ARRIVED = 0 - # Input processed by the input processor. - INPUT_PROCESSED = 1 - # Queued on the engine core. - QUEUED = 2 - # Scheduled running prefill by the scheduler. - # A request could be running a new prefill on the prompt tokens or - # a resumed prefill on the original prefill tokens + generated output - # tokens before preemption. - PREFILLING = 3 - # Preempted by the scheduler. - PREEMPTED = 4 - # Output token is generated by the engine core. - DECODING = 5 - # Token detokenized by the detokenizer. - # We will record the timestamp for each output token, as well as the - # finish reason. - DETOKENIZED = 6 - # Request finishes (or aborts). 
- FINISHED = 7 - - """ - Valid state updates: - ARRIVED - │ - ├──────► INPUT_PROCESSED ──────► QUEUED ──────► PREFILLING ◄────┐ - │ │ │ │ │ - │ │ │ ▼ │ - │ │ │ -──► DECODING │ - │ │ │ | │ │ - │ │ │ | ▼ │ - │ │ │ └─ DETOKENIZED │ - │ │ │ │ │ - │ │ │ ▼ │ - │ ▼ ▼ PREEMPTED ◄──────┘ - │ │ │ │ - └──────────────┴───────────────────┴──────────────┴ - │ - ▼ - FINISHED (All could go to FINISHED) - """ - _VALID_TRANSITIONS: ClassVar[dict[Type, set[Type]]] = { - Type.ARRIVED: { - Type.INPUT_PROCESSED, - Type.FINISHED, - }, - Type.INPUT_PROCESSED: { - Type.QUEUED, - Type.FINISHED, - }, - Type.QUEUED: { - Type.PREFILLING, - Type.FINISHED, - }, - Type.PREFILLING: { - Type.DECODING, - Type.PREEMPTED, - Type.FINISHED, - }, - Type.DECODING: { - Type.DETOKENIZED, - Type.FINISHED, - }, - Type.DETOKENIZED: { - Type.DECODING, - Type.PREEMPTED, - Type.FINISHED, - }, - Type.PREEMPTED: {Type.PREFILLING, Type.FINISHED}, - Type.FINISHED: set(), - } - - request_id: str - - type: Type - - # Timestamp when the update is recorded. This is used to record time - # intervals between events rather than wall clock time. - monotonic_ts_s: float = msgspec_field( - default_factory=lambda: time.monotonic()) - - ############################################################ - # Metadata associated with the update. - ############################################################ - # For input_processed. Metadata needed for stats logging. - num_prompt_tokens: Optional[int] = None - sampling_params: Optional[SamplingParams] = None - - # For running. - # Number of tokens computed when scheduled to run. - num_computed_tokens: Optional[int] = None - # Number of cached tokens when scheduled to run. - num_cached_tokens: Optional[int] = None - - # For decoded. - # The number of new output tokens generated. - num_new_tokens: Optional[int] = None - - # For both detokenized and decoded. - # Finished reason. - finish_reason: Optional[str] = None - - # Non-optional fields for each update type. 
- _REQUIRED_FIELDS: ClassVar[dict[Type, list[str]]] = { - Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"], - Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"], - Type.DETOKENIZED: ["num_new_tokens"], - Type.FINISHED: ["finish_reason"], - } - - def __post_init__(self): - required_fields = self._REQUIRED_FIELDS.get(self.type, []) - for field in required_fields: - if getattr(self, field) is None: - raise ValueError( - f"Field {field} is required for update type {self.type}.") - - @staticmethod - def check_valid_update( - update: "RequestStatsUpdate", - last_update_type: Optional[Type], - last_updated_ts_s: Optional[float], - ): - if last_update_type is None: - assert update.type == RequestStatsUpdate.Type.ARRIVED - else: - valid_cur_update_types = RequestStatsUpdate._VALID_TRANSITIONS[ - last_update_type] - assert update.type in valid_cur_update_types, ( - f"Invalid update type: {update.type} for last_update_type: " - f"{last_update_type}.") - - if last_updated_ts_s is not None: - assert update.monotonic_ts_s >= last_updated_ts_s, ( - "Update timestamp must be monotonically increasing, but " - f"last_updated_ts_s={last_updated_ts_s} and " - f"update.monotonic_ts_s={update.monotonic_ts_s}.") - - -@dataclass -class RequestStats: - """Stats associated with a request (`Request`).""" - - ############################################################ - # Metadata - ############################################################ - request_id: str - sampling_params: Optional[SamplingParams] = None - num_prompt_tokens: Optional[int] = None - - ############################################################ - # Metrics and Stats - ############################################################ - # Timestamp when the request was last updated. - last_updated_ts_s: Optional[float] = None - - # Last update stats type. - last_update_type: Optional[RequestStatsUpdate.Type] = None - - # Timestamp when the request arrived at the llm engine. - arrival_ts_s: Optional[float] = None - - # Number of tokens cached. When part of the request prefix is cached, - # this will be set. - num_cached_tokens: int = 0 - - # Number of tokens computed. - num_computed_tokens: int = 0 - - # The timestamp when the request become waiting in the queue. - queued_ts_s: Optional[float] = None - - # When the input processor is completed. - input_processor_end_ts_s: Optional[float] = None - - # A sorted list of timestamps when the request was scheduled to prefill. - # This could be when: - # 1. the request is newly scheduled, so it's a new prefill. - # 2. the request was preempted and resumed. It is equivalent to running - # a prefill of the original prefill tokens + generated output tokens - # before preemption. - prefill_start_ts_s_lst: list[float] = dataclass_field(default_factory=list) - - # A list of timestamps when a token is decoded by the engine core. - decoding_ts_s_lst: list[float] = dataclass_field(default_factory=list) - - # A sorted list of timestamps for each output token. - output_token_ts_s_lst: list[float] = dataclass_field(default_factory=list) - - # First token's timestamp. - first_token_ts_s: Optional[float] = None - - # TODO(rickyx): we need model runner to surface these. - model_forward_duration_s: float = 0.0 - # Includes model forward, block/sync across workers, cpu-gpu sync time - # and sampling time. - model_execute_duration_s: float = 0.0 - - # A sorted list of timestamps when the request was preempted at the - # scheduler. 
- # TODO(rickyx): right now, we don't actually have a good high-level - # metric to measure the impact of preemption other than observation of - # large P99 TPOT. Ideally we could quantify the impact of preemption by - # measuring the number of tokens re-computed due to preemption. - preempted_ts_s_lst: list[float] = dataclass_field(default_factory=list) - - # Timestamp when the request was finished at the engine core. - finished_ts_s: Optional[float] = None - - # Finish reason. - finish_reason: Optional[str] = None - - ############################################################ - # Derived properties. - ############################################################ - @property - def prefill_ts_s(self) -> Optional[float]: - """The timestamp when the request started prefilling. - Since a request could be preempted in decoding and later resumed - to prefill the decoded tokens, we use the first prefill start timestamp. - """ - return (self.prefill_start_ts_s_lst[0] - if self.prefill_start_ts_s_lst else None) - - @property - def e2e_latency_s(self) -> Optional[float]: - if self.finished_ts_s is None or self.arrival_ts_s is None: - return None - assert self.finished_ts_s >= self.arrival_ts_s - return self.finished_ts_s - self.arrival_ts_s - - @property - def queue_duration_s(self) -> Optional[float]: - """How long the request was waiting to run.""" - if self.queued_ts_s is None or self.prefill_ts_s is None: - # Either not queued or not running yet. - return None - assert self.queued_ts_s <= self.prefill_ts_s - return self.prefill_ts_s - self.queued_ts_s - - @property - def inference_latency_s(self) -> Optional[float]: - """How long the request was running inference - (prefill and decode).""" - if self.finished_ts_s is None or self.prefill_ts_s is None: - return None - assert self.finished_ts_s >= self.prefill_ts_s - return self.finished_ts_s - self.prefill_ts_s - - @property - def first_token_latency_s(self) -> Optional[float]: - if self.first_token_ts_s is None or self.arrival_ts_s is None: - return None - assert self.first_token_ts_s >= self.arrival_ts_s - return self.first_token_ts_s - self.arrival_ts_s - - @property - def prefill_latency_s(self) -> Optional[float]: - if self.first_token_ts_s is None or self.prefill_ts_s is None: - return None - assert self.first_token_ts_s >= self.prefill_ts_s - return self.first_token_ts_s - self.prefill_ts_s - - @property - def decode_latency_s(self) -> Optional[float]: - if self.e2e_latency_s is None or self.first_token_latency_s is None: - return None - assert self.e2e_latency_s >= self.first_token_latency_s - return self.e2e_latency_s - self.first_token_latency_s - - @property - def output_token_latency_s_lst(self) -> list[float]: - if len(self.output_token_ts_s_lst) == 0: - return [] - latency_s_lst = [] - for i in range(1, len(self.output_token_ts_s_lst)): - assert (self.output_token_ts_s_lst[i] - >= self.output_token_ts_s_lst[i - 1]) - latency_s = (self.output_token_ts_s_lst[i] - - self.output_token_ts_s_lst[i - 1]) - latency_s_lst.append(latency_s) - return latency_s_lst - - @property - def num_output_tokens(self) -> int: - return len(self.output_token_ts_s_lst) - - @property - def is_finished(self) -> bool: - return self.finished_ts_s is not None - - def update_from(self, update: "RequestStatsUpdate"): - RequestStatsUpdate.check_valid_update(update, self.last_update_type, - self.last_updated_ts_s) - ts = update.monotonic_ts_s - self.last_updated_ts_s = ts - self.last_update_type = update.type - if update.type == RequestStatsUpdate.Type.ARRIVED: - 
self.arrival_ts_s = ts - elif update.type == RequestStatsUpdate.Type.INPUT_PROCESSED: - self.input_processor_end_ts_s = ts - self.sampling_params = update.sampling_params - self.num_prompt_tokens = update.num_prompt_tokens - elif update.type == RequestStatsUpdate.Type.QUEUED: - self.queued_ts_s = ts - elif update.type == RequestStatsUpdate.Type.PREFILLING: - self.prefill_start_ts_s_lst.append(ts) - self.num_cached_tokens = update.num_cached_tokens or 0 - self.num_computed_tokens = update.num_computed_tokens or 0 - elif update.type == RequestStatsUpdate.Type.PREEMPTED: - self._reset_for_preemption(ts) - elif update.type == RequestStatsUpdate.Type.DECODING: - self.decoding_ts_s_lst.append(ts) - elif update.type == RequestStatsUpdate.Type.DETOKENIZED: - self._record_detokenized_output( - ts, - update.num_new_tokens or 0, - ) - elif update.type == RequestStatsUpdate.Type.FINISHED: - self.finished_ts_s = ts - self.finish_reason = update.finish_reason - else: - raise ValueError(f"Unknown update type: {update.type}") - - def _record_detokenized_output( - self, - ts_s: float, - num_new_tokens: int, - ): - # Update if first output token is generated. - if len(self.output_token_ts_s_lst) == 0: - self.first_token_ts_s = ts_s - assert ( - self.prefill_ts_s is not None - ), "Request must be running before generating output tokens." - - # Some X new tokens were generated at the ts. - self.output_token_ts_s_lst.extend([ts_s] * num_new_tokens) - - def _reset_for_preemption(self, ts_s: float): - self.preempted_ts_s_lst.append(ts_s) - # Reset the computed tokens since it might restart the prefill. - self.num_computed_tokens = 0 - # Cached token count might also change when resumed. - self.num_cached_tokens = 0 - # These stats don't change since they happen before request running. - # - arrival_ts_s - # - input_processor_end_ts_s - # - sampling_params - # - num_prompt_tokens - # - first_token_ts_s - # - # These stats are accumulated over preemptions: - # - output_token_ts_s_lst - # - prefill_start_ts_s_lst (after preemption, it will prefill the - # original prefill tokens and any output tokens generated before - # preemption.) - - -@dataclass -class KVCacheStats: - # KV Cache Usage in % - gpu_cache_usage_sys: float = 0.0 - gpu_prefix_cache_hit_rate: float = 0.0 - - -@dataclass -class SchedulerStats: - """Stats associated with the scheduler.""" - - # Number of requests currently running. - num_running_reqs: int = 0 - # Number of requests currently waiting. - num_waiting_reqs: int = 0 - - kv_cache_stats: KVCacheStats = dataclass_field( - default_factory=KVCacheStats) - - -@dataclass -class EngineCoreProcessStats: - """Stats associated with the engine core process.""" - - # Number of requests currently in the input queue. None if the engine core - # is not running in multiprocess mode. - input_queue_size: Optional[int] = None - # Number of outputs currently in the output queue. None if the engine core - # is not running in multiprocess mode. - output_queue_size: Optional[int] = None - - -class EngineCoreStatsSnapshot( - msgspec.Struct, # type: ignore - array_like=True, - omit_defaults=True, - gc=False): - """ - A snapshot of the EngineCore's current stats over a period of time. - """ - - # Snapshot of the scheduler stats. - scheduler_stats: SchedulerStats = msgspec_field( - default_factory=SchedulerStats) - - # Per request stats updates. - requests_stats_updates: list[RequestStatsUpdate] = msgspec_field( - default_factory=list) - - # Engine core's queue stats. 
- engine_core_process_stats: EngineCoreProcessStats = msgspec_field( - default_factory=EngineCoreProcessStats) - - # TODO(rickyx): Add other components' stats, - # e.g. model runner/worker and etc. diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 3183edb7c94e..c701ab1d35a5 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -7,16 +7,23 @@ from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.reasoning import ReasoningParserManager +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.utils import LazyLoader from vllm.v1.structured_output.backend_guidance import GuidanceBackend from vllm.v1.structured_output.backend_types import (StructuredOutputBackend, StructuredOutputGrammar) +from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend if TYPE_CHECKING: import numpy as np import numpy.typing as npt import torch + from vllm.reasoning import ReasoningParser from vllm.v1.request import Request +else: + torch = LazyLoader("torch", globals(), "torch") logger = init_logger(__name__) @@ -26,9 +33,11 @@ class StructuredOutputManager: def __init__(self, vllm_config: VllmConfig): self.backend: Optional[StructuredOutputBackend] = None + self.reasoner: Optional[ReasoningParser] = None self.vllm_config = vllm_config self._grammar_bitmask: Optional[torch.Tensor] = None + self._full_mask = torch.tensor(-1, dtype=torch.int32) # The default max_workers if not specified is the number of CPUs * 5, # which is way too high since these tasks are CPU-bound, not I/O bound. @@ -36,24 +45,43 @@ def __init__(self, vllm_config: VllmConfig): # compilation, so we set it to half the number of CPUs. max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) + self.tokenizer = init_tokenizer_from_configs( + model_config=self.vllm_config.model_config, + scheduler_config=self.vllm_config.scheduler_config, + lora_config=self.vllm_config.lora_config, + ).get_lora_tokenizer(None) + reasoning_backend = vllm_config.decoding_config.reasoning_backend + if reasoning_backend: + reasoner_cls = ReasoningParserManager.get_reasoning_parser( + reasoning_backend) + self.reasoner = reasoner_cls(tokenizer=self.tokenizer) def grammar_init(self, request: Request) -> None: if request.structured_output_request is None: return + if TYPE_CHECKING: + assert request.sampling_params.guided_decoding is not None + # Initialize the backend the first time it is needed. # # NOTE: We only support a single backend. We do NOT support different # backends on a per-request basis in V1 (for now, anyway...). 
if self.backend is None: backend = request.sampling_params.guided_decoding.backend + vocab_size = self.vllm_config.model_config.get_vocab_size() if backend == "xgrammar": - from vllm.v1.structured_output.backend_xgrammar import ( - XgrammarBackend) - - self.backend = XgrammarBackend(self.vllm_config) + self.backend = XgrammarBackend( + self.vllm_config, + tokenizer=self.tokenizer, + vocab_size=vocab_size, + ) elif backend == "guidance": - self.backend = GuidanceBackend(self.vllm_config) + self.backend = GuidanceBackend( + self.vllm_config, + tokenizer=self.tokenizer, + vocab_size=vocab_size, + ) else: raise ValueError( f"Unsupported structured output backend: {backend}") @@ -87,14 +115,14 @@ def grammar_bitmask( if not structured_output_request_ids: return None + max_num_spec_tokens = 0 + if self.vllm_config.speculative_config is not None: + max_num_spec_tokens = \ + self.vllm_config.speculative_config.num_speculative_tokens + if self._grammar_bitmask is None: assert self.backend is not None max_batch_size = self.vllm_config.scheduler_config.max_num_seqs - if self.vllm_config.speculative_config is not None: - max_num_spec_tokens = self.vllm_config.\ - speculative_config.num_speculative_tokens - else: - max_num_spec_tokens = 0 # Allocate a bitmask for each token needing to be checked: # one for each speculative position, and one more for the @@ -103,6 +131,7 @@ def grammar_bitmask( self.backend.allocate_token_bitmask( max_batch_size * (1 + max_num_spec_tokens)) + bitmask_tensor = self._grammar_bitmask # Generate a batched bitmask for all structured output requests. # When speculative decoding is enabled, we need to include multiple # masks for each request, one for each possible bonus token position. @@ -110,16 +139,30 @@ def grammar_bitmask( cumulative_index = 0 ordered_seq = sorted(structured_output_request_ids.items(), key=lambda x: x[1]) + + # Note that for thinking support, we will need to + # reset the relevant part of the bitmask for subsequent + # requests here. + bitmask_tensor[:(len(ordered_seq) * (1 + max_num_spec_tokens))].fill_( + self._full_mask) + # NOTE: This outer loop can likely be parallelized to improve # performance of bitmask generation for large batches. for req_id, _ in ordered_seq: request = requests[req_id].structured_output_request - assert request is not None and request.grammar is not None + if TYPE_CHECKING: + assert request is not None + assert request.grammar is not None + + apply_bitmask = ( + request.reasoning_ended if self.reasoner is not None else True + ) # noqa: E501 + state_advancements = 0 req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None] for i, token in enumerate(req_tokens): - if not request.grammar.is_terminated(): - request.grammar.fill_bitmask(self._grammar_bitmask, + if apply_bitmask and not request.grammar.is_terminated(): + request.grammar.fill_bitmask(bitmask_tensor, cumulative_index) if token is not None: # In order to generate the correct bitmask for each @@ -132,15 +175,41 @@ def grammar_bitmask( if state_advancements > 0: request.grammar.rollback(state_advancements) - bitmask_tensor = self._grammar_bitmask - if cumulative_index < self._grammar_bitmask.shape[0]: - bitmask_tensor = self._grammar_bitmask[:cumulative_index] + if cumulative_index < bitmask_tensor.shape[0]: + bitmask_tensor = bitmask_tensor[:cumulative_index] # After finishing with the xgrammar operations, we convert to # np.ndarray, because that is much more efficient for serialization # and deserialization when sending this to the GPU workers.
return bitmask_tensor.numpy() + def should_advance(self, request: Request) -> bool: + if not request.use_structured_output: + return False + + # Determine whether we can advance the FSM. + # Supports thinking usage where we skip the reasoning components. + if TYPE_CHECKING: + assert request.structured_output_request is not None + assert request.structured_output_request.grammar is not None + # By default, we should always advance + # for cases that don't use thinking mode. + if self.reasoner is not None: + structured_req = request.structured_output_request + + if structured_req.reasoning_ended: + return True + + # Check if reasoning ends in *this* step + if self.reasoner.is_reasoning_end(request.all_token_ids): + # Reasoning just ended, so we shouldn't advance until + # the next pass + structured_req.reasoning_ended = True + + return False + else: + return True + def clear_backend(self) -> None: if self.backend is not None: self.backend.destroy() diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 0ab175e781e7..55c5f609095d 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import copy import json import os @@ -8,10 +10,8 @@ import torch -from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.utils import LazyLoader from vllm.v1.structured_output.backend_types import (StructuredOutputBackend, StructuredOutputGrammar, @@ -54,25 +54,17 @@ def process_for_additional_properties( return guide_json_obj +@dataclass class GuidanceBackend(StructuredOutputBackend): - def __init__(self, vllm_config: VllmConfig): - self.vllm_config = vllm_config - tokenizer_group = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) # type: ignore[arg-type] - self.vllm_config = vllm_config - self.vocab_size = vllm_config.model_config.get_vocab_size() - + def __post_init__(self): self.disable_any_whitespace = \ - vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.decoding_config.disable_any_whitespace self.disable_additional_properties = \ - vllm_config.decoding_config.disable_additional_properties + self.vllm_config.decoding_config.disable_additional_properties - tokenizer = tokenizer_group.get_lora_tokenizer(None) self.ll_tokenizer = llguidance_hf.from_tokenizer( - tokenizer, self.vocab_size) + self.tokenizer, self.vocab_size) def compile_grammar(self, request_type: StructuredOutputOptions, grammar_spec: str) -> StructuredOutputGrammar: diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index 33ca9f8cf484..09f6cdf73337 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -1,9 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import enum from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import torch -import torch + from vllm.config import VllmConfig + from vllm.transformers_utils.tokenizer import AnyTokenizer class StructuredOutputOptions(enum.Enum): @@ -85,9 +93,14 @@ def reset(self): """ +@dataclass class
StructuredOutputBackend(ABC): """Engine-level backend for structured output requests.""" + vllm_config: VllmConfig + tokenizer: AnyTokenizer + vocab_size: int + @abstractmethod def compile_grammar(self, request_type: StructuredOutputOptions, grammar_spec: str) -> StructuredOutputGrammar: @@ -104,7 +117,7 @@ def compile_grammar(self, request_type: StructuredOutputOptions, """ @abstractmethod - def allocate_token_bitmask(self, max_num_seqs: int): + def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor: """ Allocates a token bitmask for the specified maximum number of sequences. diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 2ce2be337ecb..f2570221da25 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any @@ -7,10 +9,8 @@ import torch import vllm.envs -from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils import LazyLoader from vllm.v1.structured_output.backend_types import (StructuredOutputBackend, @@ -28,61 +28,49 @@ logger = init_logger(__name__) +@dataclass class XgrammarBackend(StructuredOutputBackend): - def __init__(self, vllm_config: VllmConfig): - self.vllm_config = vllm_config - tokenizer_group = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) # type: ignore[arg-type] - + def __post_init__(self): self.disable_any_whitespace = \ - vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.decoding_config.disable_any_whitespace - self.num_speculative_tokens = 0 - if self.vllm_config.speculative_config is not None: - self.num_speculative_tokens = \ - self.vllm_config.speculative_config.num_speculative_tokens - - tokenizer = tokenizer_group.get_lora_tokenizer(None) - self.vocab_size = vllm_config.model_config.get_vocab_size() - if isinstance(tokenizer, MistralTokenizer): + if isinstance(self.tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly. # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 try: - if tokenizer.is_tekken: - encoded_vocab = tokenizer._vocab + if self.tokenizer.is_tekken: + encoded_vocab = self.tokenizer._vocab else: encoded_vocab = [ token for token, _ in sorted( - tokenizer.get_vocab().items(), + self.tokenizer.get_vocab().items(), key=lambda x: x[1], ) ] stop_token_ids = None - if hasattr( - tokenizer, + if (hasattr( + self.tokenizer, "eos_token_id", - ) and tokenizer.eos_token_id is not None: - stop_token_ids = [tokenizer.eos_token_id] + ) and self.tokenizer.eos_token_id is not None): + stop_token_ids = [self.tokenizer.eos_token_id] except AttributeError as e: raise ValueError( f"Cannot get the vocabulary of the tokenizer " - f"{type(tokenizer)}. The tokenizer should have a " + f"{type(self.tokenizer)}. 
The tokenizer should have a " "get_vocab method.") from e tokenizer_info = xgr.TokenizerInfo( # type: ignore encoded_vocab=encoded_vocab, # NOTE: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501 vocab_type=xgr.VocabType.RAW - if tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK, + if self.tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK, vocab_size=self.vocab_size, stop_token_ids=stop_token_ids, add_prefix_space=True, ) else: tokenizer_info = xgr.TokenizerInfo.from_huggingface( - tokenizer, + self.tokenizer, vocab_size=self.vocab_size, ) self.compiler = xgr.GrammarCompiler( @@ -92,6 +80,11 @@ def __init__(self, vllm_config: VllmConfig): cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024, ) + self.num_speculative_tokens = 0 + if self.vllm_config.speculative_config is not None: + self.num_speculative_tokens = \ + self.vllm_config.speculative_config.num_speculative_tokens + def compile_grammar(self, request_type: StructuredOutputOptions, grammar_spec: str) -> StructuredOutputGrammar: if request_type == StructuredOutputOptions.JSON: diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index 6ef472eb896c..c16320b9e74c 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -20,6 +20,7 @@ class StructuredOutputRequest: sampling_params: SamplingParams _grammar: Optional[Union[Future[StructuredOutputGrammar], StructuredOutputGrammar]] = None + reasoning_ended: bool = False def _check_grammar_completion(self) -> bool: # NOTE: We have to lazy import to gate circular imports diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 581d3d9bd11b..0c3341691509 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -4,6 +4,8 @@ import torch from vllm.logger import init_logger +from vllm.utils import cdiv +from vllm.v1.kv_cache_interface import KVCacheConfig logger = init_logger(__name__) @@ -96,3 +98,48 @@ def get_cpu_tensor(self) -> torch.Tensor: def get_numpy_array(self) -> np.ndarray: """Returns the numpy array of the block table.""" return self.block_table_np + + +class MultiGroupBlockTable: + """The BlockTables for each KV cache group.""" + + def __init__(self, max_num_reqs: int, max_model_len: int, + max_num_batched_tokens: int, pin_memory: bool, + device: torch.device, kv_cache_config: KVCacheConfig) -> None: + max_num_blocks_per_req = [ + cdiv(max_model_len, g.kv_cache_spec.block_size) + for g in kv_cache_config.kv_cache_groups + ] + self.block_tables = [ + BlockTable(max_num_reqs, max_num_blocks_per_req[i], + max_num_batched_tokens, pin_memory, device) + for i in range(len(kv_cache_config.kv_cache_groups)) + ] + + def append_row(self, block_ids: list[list[int]], row_idx: int) -> None: + for i, block_table in enumerate(self.block_tables): + block_table.append_row(block_ids[i], row_idx) + + def add_row(self, block_ids: list[list[int]], row_idx: int) -> None: + for i, block_table in enumerate(self.block_tables): + block_table.add_row(block_ids[i], row_idx) + + def move_row(self, src: int, tgt: int) -> None: + for block_table in self.block_tables: + block_table.move_row(src, tgt) + + def swap_row(self, src: int, tgt: int) -> None: + for block_table in self.block_tables: + block_table.swap_row(src, tgt) + + def commit(self, num_reqs: int) -> None: + for block_table in self.block_tables: + block_table.commit(num_reqs) + + def clear(self) -> None: + for block_table in 
self.block_tables: + block_table.clear() + + def __getitem__(self, idx: int) -> "BlockTable": + """Returns the BlockTable for the i-th KV cache group.""" + return self.block_tables[idx] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 871654fca366..570de9bddd29 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -11,10 +11,11 @@ from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values +from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import LogprobsTensors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import copy_slice -from vllm.v1.worker.block_table import BlockTable +from vllm.v1.worker.block_table import MultiGroupBlockTable _SAMPLING_EPS = 1e-5 @@ -29,7 +30,7 @@ class CachedRequestState: sampling_params: SamplingParams generator: Optional[torch.Generator] - block_ids: list[int] + block_ids: list[list[int]] num_computed_tokens: int output_token_ids: list[int] @@ -58,15 +59,14 @@ def __init__( self, max_num_reqs: int, max_model_len: int, - max_num_blocks_per_req: int, max_num_batched_tokens: int, device: torch.device, pin_memory: bool, vocab_size: int, + kv_cache_config: KVCacheConfig, ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len - self.max_num_blocks_per_req = max_num_blocks_per_req self.max_num_batched_tokens = max_num_batched_tokens self.device = device self.pin_memory = pin_memory @@ -99,12 +99,13 @@ def __init__( self.num_computed_tokens_cpu_tensor.numpy() # Block table. - self.block_table = BlockTable( + self.block_table = MultiGroupBlockTable( max_num_reqs=max_num_reqs, - max_num_blocks_per_req=max_num_blocks_per_req, + max_model_len=max_model_len, max_num_batched_tokens=max_num_batched_tokens, pin_memory=pin_memory, device=device, + kv_cache_config=kv_cache_config, ) # Sampling-related. 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1b16f273a6de..1b34a9fb0616 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -12,6 +12,8 @@ import torch.nn as nn from vllm.attention import AttentionType, get_attn_backend +from vllm.attention.backends.abstract import (AttentionBackend, + AttentionMetadataBuilder) from vllm.attention.layer import Attention from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import (CompilationLevel, VllmConfig, @@ -31,8 +33,8 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LayerBlockType, LazyLoader, cdiv, - check_use_alibi, is_pin_memory_available) + GiB_bytes, LazyLoader, cdiv, check_use_alibi, + is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -49,6 +51,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.utils import is_spec_decode_supported from vllm.v1.utils import bind_kv_cache +from vllm.v1.worker.block_table import BlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin @@ -100,59 +103,17 @@ def __init__( self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] - # NOTE(woosuk): sliding_window is None for models with interleaved - # attention. Use interleaved_sliding_window instead. - self.sliding_window = model_config.get_sliding_window() - self.interleaved_sliding_window = getattr( - model_config.hf_text_config, "interleaved_sliding_window", None) - self.window_size = (self.sliding_window - or self.interleaved_sliding_window) - self.is_multimodal_model = model_config.is_multimodal_model - self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len - self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_reqs = scheduler_config.max_num_seqs # Model-related. 
- self.num_attn_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) self.num_query_heads = model_config.get_num_attention_heads( parallel_config) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() self.attention_chunk_size = model_config.attention_chunk_size - self.attn_backend = get_attn_backend( - self.head_size, - self.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) - if self.attn_backend is None: - error_msg = ( - f"Error with get_att_backend: {self.head_size=}, " - f"{self.dtype=}, {self.kv_cache_dtype=}, {self.block_size=}, " - f"{self.model_config.is_attention_free=}, " - f"{self.model_config.use_mla=}") - logger.error(error_msg) - raise NotImplementedError( - "Non-Attention backend is not supported by V1 GPUModelRunner.") - - if self.vllm_config.compilation_config.full_cuda_graph: - attn_backend_name = self.attn_backend.__name__ - flash_attn_version = get_flash_attn_version() - if attn_backend_name != "FlashAttentionBackend" or \ - flash_attn_version != 3: - raise ValueError( - f"full_cuda_graph is only supported with " - f"FA3. Current attention backend is {attn_backend_name}, " - f"FlashAttention version is {flash_attn_version}.") - self.cascade_attn_enabled = not self.model_config.disable_cascade_attn # Multi-modal data support @@ -174,8 +135,10 @@ def __init__( # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] + self.attn_metadata_builders: list[AttentionMetadataBuilder] = [] + self.attn_backends: list[type[AttentionBackend]] = [] # self.kv_cache_config: KVCacheConfig - # self.attn_metadata_builder: type[AttentionMetadataBuilder] + # self.input_batch: InputBatch # Persistent batch. # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} @@ -200,16 +163,6 @@ def __init__( # Request states. self.requests: dict[str, CachedRequestState] = {} - # Persistent batch. - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_blocks_per_req=self.max_num_blocks_per_req, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=model_config.get_vocab_size(), - ) self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE @@ -304,6 +257,31 @@ def __init__( pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: + """ + Update the order of requests in the batch based on the attention + backend's needs. For example, some attention backends (namely MLA) may + want to separate requests based on if the attention computation will be + compute-bound or memory-bound. + + Args: + scheduler_output: The scheduler output. + + Returns: + True if the batch was reordered, False otherwise. + """ + batch_reordered = self.attn_metadata_builders[0].reorder_batch( + self.input_batch, scheduler_output) + + # For models with multiple KV cache groups, the groups should agree on + # the same order of requests. We ensure this by only allowing the first + # group to reorder the batch and asserting that all other groups do not + # reorder the batch. 
+ for i in range(1, len(self.kv_cache_config.kv_cache_groups)): + assert not self.attn_metadata_builders[i].reorder_batch( + self.input_batch, scheduler_output) + return batch_reordered + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. @@ -440,7 +418,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the block IDs. if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - req_state.block_ids.extend(req_data.new_block_ids) + for i in range(len(self.kv_cache_config.kv_cache_groups)): + req_state.block_ids[i].extend(req_data.new_block_ids[i]) else: # The request is resumed from preemption. # Replace the existing block IDs with the new ones. @@ -498,11 +477,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - # Some attention backends (namely MLA) may want to separate requests - # based on if the attention computation will be compute-bound or - # memory-bound. This gives them a hook to do that. - batch_reordered = self.attn_metadata_builder.reorder_batch( - self.input_batch, scheduler_output) + batch_reordered = self._may_reorder_batch(scheduler_output) if batch_changed or batch_reordered: self.input_batch.refresh_sampling_metadata() @@ -570,21 +545,29 @@ def _prepare_inputs( torch.from_numpy(token_indices), out=self.input_ids_cpu[:total_num_scheduled_tokens]) - # Calculate the slot mapping. - # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] - # where K is the max_num_blocks_per_req and the block size is 2. - # NOTE(woosuk): We can't simply use `token_indices // block_size` here - # because M (max_model_len) is not necessarily divisible by block_size. - block_table_indices = (req_indices * self.max_num_blocks_per_req + - positions_np // self.block_size) - block_table_cpu = self.input_batch.block_table.get_cpu_tensor() - block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() - block_offsets = positions_np % self.block_size - np.add(block_numbers * self.block_size, - block_offsets, - out=self.input_batch.block_table. - slot_mapping_np[:total_num_scheduled_tokens]) + # Calculate the slot mapping for each KV cache group. + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + block_size = kv_cache_group_spec.kv_cache_spec.block_size + block_table: BlockTable = self.input_batch.block_table[ + kv_cache_group_id] + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` + # here because M (max_model_len) is not necessarily divisible by + # block_size. + block_table_indices = ( + req_indices * block_table.max_num_blocks_per_req + + positions_np // block_size) + block_table_cpu = block_table.get_cpu_tensor() + block_numbers = block_table_cpu.flatten( + )[block_table_indices].numpy() + block_offsets = positions_np % block_size + np.add( + block_numbers * block_size, + block_offsets, + out=block_table.slot_mapping_np[:total_num_scheduled_tokens]) # Prepare the attention metadata. 
self.query_start_loc_np[0] = 0 @@ -626,10 +609,6 @@ def _prepare_inputs( attn_metadata: dict[str, FlashAttentionMetadata] = {} # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. - # NOTE(Chen): there is exactly one KV cache group that contains all - # attetnion layers in the model for now, so the current logic for - # getting attn_metadata is not related to kv_cache_group information. - # Will extend this part to support multiple KV cache groups later. for kv_cache_group_id, kv_cache_group_spec in enumerate( self.kv_cache_config.kv_cache_groups): @@ -638,15 +617,19 @@ def _prepare_inputs( if self.cascade_attn_enabled: common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, - scheduler_output.num_common_prefix_blocks, + scheduler_output. + num_common_prefix_blocks[kv_cache_group_id], + kv_cache_group_spec.kv_cache_spec, + self.attn_metadata_builders[kv_cache_group_id], ) - attn_metadata_i = self.attn_metadata_builder.build( - num_reqs=num_reqs, - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, - common_prefix_len=common_prefix_len, - common_attn_metadata=common_attn_metadata) + attn_metadata_i = ( + self.attn_metadata_builders[kv_cache_group_id].build( + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + common_prefix_len=common_prefix_len, + common_attn_metadata=common_attn_metadata)) for layer_name in kv_cache_group_spec.layer_names: attn_metadata[layer_name] = attn_metadata_i @@ -684,6 +667,8 @@ def _compute_cascade_attn_prefix_len( self, num_scheduled_tokens: np.ndarray, num_common_prefix_blocks: int, + kv_cache_spec: KVCacheSpec, + attn_metadata_builder: AttentionMetadataBuilder, ) -> int: """Compute the length of the common prefix for cascade attention. @@ -702,7 +687,7 @@ def _compute_cascade_attn_prefix_len( Returns: int: Length of common prefix in tokens. """ - common_prefix_len = num_common_prefix_blocks * self.block_size + common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size if common_prefix_len == 0: # Common case. return 0 @@ -751,15 +736,19 @@ def _compute_cascade_attn_prefix_len( common_prefix_len, self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) # common_prefix_len should be a multiple of the block size. 
- common_prefix_len = (common_prefix_len // self.block_size * - self.block_size) - use_cascade = self.attn_metadata_builder.use_cascade_attention( + common_prefix_len = (common_prefix_len // kv_cache_spec.block_size * + kv_cache_spec.block_size) + use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or + (isinstance(kv_cache_spec, FullAttentionSpec) + and kv_cache_spec.sliding_window is not None)) + assert isinstance(kv_cache_spec, AttentionSpec) + use_cascade = attn_metadata_builder.use_cascade_attention( common_prefix_len=common_prefix_len, query_lens=num_scheduled_tokens, num_query_heads=self.num_query_heads, - num_kv_heads=self.num_kv_heads, + num_kv_heads=kv_cache_spec.num_kv_heads, use_alibi=self.use_alibi, - use_sliding_window=self.window_size is not None, + use_sliding_window=use_sliding_window, num_sms=self.num_sms, ) return common_prefix_len if use_cascade else 0 @@ -1577,7 +1566,7 @@ def _dummy_run( dtype=np.int32) if skip_attn: - attn_metadata = None + attn_metadata: Optional[dict[str, FlashAttentionMetadata]] = None else: query_start_loc = self.query_start_loc[:num_reqs + 1] seq_lens = self.seq_lens[:num_reqs] @@ -1585,13 +1574,19 @@ def _dummy_run( common_attn_metadata = CommonAttentionMetadata( query_start_loc=query_start_loc, seq_lens=seq_lens) - attn_metadata = self.attn_metadata_builder.build( - num_reqs=num_tokens, - num_actual_tokens=num_tokens, - max_query_len=num_tokens, - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - ) + attn_metadata = {} + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + attn_metadata_i = ( + self.attn_metadata_builders[kv_cache_group_id].build( + num_reqs=num_tokens, + num_actual_tokens=num_tokens, + max_query_len=num_tokens, + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + )) + for layer_name in kv_cache_group_spec.layer_names: + attn_metadata[layer_name] = attn_metadata_i with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): @@ -1822,6 +1817,56 @@ def capture_model(self) -> None: logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) + def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize the attention backends and attention metadata builders. 
+ """ + assert len(self.attn_backends) == 0 and len( + self.attn_metadata_builders + ) == 0, "Attention backends are already initialized" + for i, kv_cache_group_spec in enumerate( + kv_cache_config.kv_cache_groups): + kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if not isinstance(kv_cache_spec, AttentionSpec): + raise NotImplementedError( + "Only AttentionSpec is supported for now.") + attn_backend_i = get_attn_backend( + kv_cache_spec.head_size, + self.dtype, + kv_cache_spec.dtype, + kv_cache_spec.block_size, + self.model_config.is_attention_free, + use_mla=kv_cache_spec.use_mla, + ) + if attn_backend_i is None: + error_msg = ( + f"Error with get_attn_backend: {kv_cache_spec.head_size=}, " + f"{self.dtype=}, {kv_cache_spec.dtype=}, " + f"{kv_cache_spec.block_size=}, " + f"{self.model_config.is_attention_free=}, " + f"{kv_cache_spec.use_mla=}") + logger.error(error_msg) + raise NotImplementedError( + "Non-Attention backend is not supported by V1 " + "GPUModelRunner.") + + if self.vllm_config.compilation_config.full_cuda_graph: + attn_backend_name = attn_backend_i.__name__ + flash_attn_version = get_flash_attn_version() + if attn_backend_name != "FlashAttentionBackend" or \ + flash_attn_version != 3: + raise ValueError( + f"full_cuda_graph is only supported with " + f"FA3. Current attention backend is " + f"{attn_backend_name}, FlashAttention version is " + f"{flash_attn_version}.") + + block_table_i = self.input_batch.block_table[i] + attn_metadata_builder_i = attn_backend_i.get_builder_cls()( + weakref.proxy(self), kv_cache_spec, block_table_i) + self.attn_backends.append(attn_backend_i) + self.attn_metadata_builders.append(attn_metadata_builder_i) + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize KV cache based on `kv_cache_config`. @@ -1829,15 +1874,21 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ - if len(kv_cache_config.kv_cache_groups) > 1: - raise NotImplementedError( - "Hybrid models with more than one KV cache type are not " - "supported yet.") self.kv_cache_config = kv_cache_config + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + kv_cache_config=kv_cache_config, + ) + self.initialize_attn_backend(kv_cache_config) kv_caches: dict[str, torch.Tensor] = {} - for kv_cache_group in kv_cache_config.kv_cache_groups: + for i, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups): kv_cache_spec = kv_cache_group.kv_cache_spec for layer_name in kv_cache_group.layer_names: tensor_config = kv_cache_config.tensors[layer_name] @@ -1852,7 +1903,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: # the min of all `num_blocks`. Verify it here. 
assert num_blocks >= kv_cache_config.num_blocks if isinstance(kv_cache_spec, AttentionSpec): - kv_cache_shape = self.attn_backend.get_kv_cache_shape( + kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype @@ -1872,11 +1923,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: if has_kv_transfer_group(): get_kv_transfer_group().register_kv_caches(kv_caches) - self.attn_metadata_builder = self.attn_backend.get_builder_cls()( - weakref.proxy(self), - kv_cache_config.kv_cache_groups[0].kv_cache_spec, - self.input_batch.block_table) - def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 5352b1c5a37c..d85701fa93df 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -341,7 +341,8 @@ def init_worker_distributed_environment( distributed_init_method, local_rank) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index b4daf5a34678..2da99696445e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -171,19 +171,10 @@ def __init__( self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} + # self.input_batch: InputBatch # Persistent batch. # Request states. self.requests: dict[str, CachedRequestState] = {} - # Persistent batch. - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_blocks_per_req=self.max_num_blocks_per_req, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=self.vocab_size, - ) # Cached torch/numpy tensor # The pytorch tensor and numpy array share the same buffer. @@ -199,7 +190,7 @@ def __init__( self.block_table_cpu = torch.zeros( (self.max_num_reqs, self.max_num_blocks_per_req), - dtype=self.input_batch.block_table.get_cpu_tensor().dtype, + dtype=torch.int32, device="cpu") self.query_start_loc_cpu = torch.zeros(self.max_num_tokens + 1, @@ -524,12 +515,12 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # NOTE(woosuk): We use torch.index_select instead of np.take here # because torch.index_select is much faster than np.take for large # tensors. - block_table_cpu = self.input_batch.block_table.get_cpu_tensor() + block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor() block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() block_offsets = positions_np % self.block_size np.add(block_numbers * self.block_size, block_offsets, - out=self.input_batch.block_table. + out=self.input_batch.block_table[0]. slot_mapping_np[:total_num_scheduled_tokens]) # Prepare the attention metadata. 
@@ -554,15 +545,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): self.position_ids = self.positions_cpu[: padded_total_num_scheduled_tokens].to( self.device) - self.input_batch.block_table.slot_mapping_cpu[ + self.input_batch.block_table[0].slot_mapping_cpu[ total_num_scheduled_tokens:] = _PAD_SLOT_ID slot_mapping = ( - self.input_batch.block_table. + self.input_batch.block_table[0]. slot_mapping_cpu[:padded_total_num_scheduled_tokens].to( self.device)) block_tables = self.block_table_cpu[:self.max_num_reqs] block_tables[:num_reqs, :self.max_num_blocks_per_req] = ( - self.input_batch.block_table.get_cpu_tensor()[:num_reqs]) + self.input_batch.block_table[0].get_cpu_tensor()[:num_reqs]) block_tables = block_tables.to(self.device) query_start_loc = self.query_start_loc_cpu[:self.max_num_reqs + 1].to( self.device) @@ -1263,6 +1254,18 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: "Hybrid models with more than one KV cache type are not " "supported yet.") + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + kv_cache_config=kv_cache_config, + ) + assert self.block_table_cpu.dtype == self.input_batch.block_table[ + 0].get_cpu_tensor().dtype + kv_caches: dict[str, torch.Tensor] = {} for kv_cache_group in kv_cache_config.kv_cache_groups: diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 9eea26d85249..25715407ceee 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -265,4 +265,5 @@ def init_tpu_worker_distributed_environment( backend="gloo", ) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 1436a404335a..a92cf1e5a3b3 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -390,7 +390,8 @@ def init_distributed_environment(self) -> None: ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) def get_cache_block_size_bytes(self) -> int: """Return the size in bytes of a single KV cache block. 
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 7898c645d66a..42882992f2da 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -416,7 +416,8 @@ def init_worker_distributed_environment( backend='hccl') ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) if torch.distributed.is_initialized(): torch_world_size = torch.distributed.get_world_size() @@ -442,7 +443,8 @@ def init_worker_distributed_environment( torch.distributed.all_reduce(dummy_tensor_hpu) assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 4bb9bea022f9..891ed66599dc 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -76,7 +76,8 @@ def init_device(self) -> None: ) ensure_model_parallel_initialized( self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size) + self.parallel_config.pipeline_parallel_size, + self.parallel_config.enable_expert_parallel) # Device initialization should happen after initializing the distributed # runtime. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 17f636765ff9..41546462e5c4 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -530,7 +530,8 @@ def init_worker_distributed_environment( init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 17f533525171..65085f80f97a 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -176,7 +176,8 @@ def init_worker_distributed_environment(self) -> None: ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) # global all_reduce needed for overall oneccl warm up torch.distributed.all_reduce(torch.zeros(1).xpu())