Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,24 @@ outputs = llm.generate(prompts, sampling_params)
outputs[0]["text"]
```

## Profiling

Enable PyTorch profiler to trace per-process performance when using tensor parallelism:

```python
llm = LLM(
"/YOUR/MODEL/PATH",
enable_profiling=True,
profiling_output_dir="./profiler_logs",
)

# Run inference...
# Then view traces with TensorBoard:
# tensorboard --logdir=./profiler_logs
```

The profiler traces are saved per rank (e.g., `rank0`, `rank1`) and can be analyzed using https://ui.perfetto.dev/ to identify performance bottlenecks in multi-process inference.

## Benchmark

See `bench.py` for benchmark.
Expand Down
6 changes: 3 additions & 3 deletions nanovllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

class LLMEngine:

def __init__(self, model, **kwargs):
def __init__(self, model, enable_profiling: bool = False, profiling_output_dir: str = "./profiler_logs", **kwargs):
config_fields = {field.name for field in fields(Config)}
config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
config = Config(model, **config_kwargs)
Expand All @@ -23,11 +23,11 @@ def __init__(self, model, **kwargs):
ctx = mp.get_context("spawn")
for i in range(1, config.tensor_parallel_size):
event = ctx.Event()
process = ctx.Process(target=ModelRunner, args=(config, i, event))
process = ctx.Process(target=ModelRunner, args=(config, i, event, enable_profiling, profiling_output_dir))
process.start()
self.ps.append(process)
self.events.append(event)
self.model_runner = ModelRunner(config, 0, self.events)
self.model_runner = ModelRunner(config, 0, self.events, enable_profiling, profiling_output_dir)
self.tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
config.eos = self.tokenizer.eos_token_id
self.scheduler = Scheduler(config)
Expand Down
65 changes: 64 additions & 1 deletion nanovllm/engine/model_runner.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
import pickle
from contextlib import nullcontext
import torch
import torch.distributed as dist
from multiprocessing.synchronize import Event
Expand All @@ -14,14 +16,17 @@

class ModelRunner:

def __init__(self, config: Config, rank: int, event: Event | list[Event]):
def __init__(self, config: Config, rank: int, event: Event | list[Event], enable_profiling: bool = False, profiling_output_dir: str = "./profiler_logs"):
self.config = config
hf_config = config.hf_config
self.block_size = config.kvcache_block_size
self.enforce_eager = config.enforce_eager
self.world_size = config.tensor_parallel_size
self.rank = rank
self.event = event
self.enable_profiling = enable_profiling
self.profiling_output_dir = profiling_output_dir
self.profiler = None

dist.init_process_group("nccl", "tcp://localhost:2333", world_size=self.world_size, rank=rank)
torch.cuda.set_device(rank)
Expand All @@ -38,6 +43,10 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event]):
torch.set_default_device("cpu")
torch.set_default_dtype(default_dtype)

# Setup profiler after model is ready
if self.enable_profiling:
self._setup_profiler()

if self.world_size > 1:
if rank == 0:
self.shm = SharedMemory(name="nanovllm", create=True, size=2**20)
Expand All @@ -47,7 +56,44 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event]):
self.shm = SharedMemory(name="nanovllm")
self.loop()

def _setup_profiler(self):
"""Setup PyTorch profiler for performance tracing."""
import torch.profiler
import os

os.makedirs(self.profiling_output_dir, exist_ok=True)
worker_name = f"rank{self.rank}"

profiler_schedule = torch.profiler.schedule(
wait=1,
warmup=1,
active=3,
repeat=1,
)

self.profiler = torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=profiler_schedule,
# 建议开启 record_shapes 和 profile_memory 以获得更详细的分析
record_shapes=True,
profile_memory=True,
with_stack=True,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
self.profiling_output_dir,
worker_name=worker_name,
),
)
# 添加标志位,防止重复启动
self.profiler_started = False
print(f"[Rank {self.rank}] Profiler enabled. Traces will be saved to: {self.profiling_output_dir}")

def exit(self):
# Stop profiler before exit
if self.profiler is not None:
self.profiler.stop()
if self.world_size > 1:
self.shm.close()
dist.barrier()
Expand Down Expand Up @@ -206,13 +252,30 @@ def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill
return self.model.compute_logits(graph_vars["outputs"][:bs])

def run(self, seqs: list[Sequence], is_prefill: bool) -> list[int]:
# 1. 懒加载启动:在第一次推理时启动 profiler
# 这样可以避免抓取初始化时的开销,只抓取实际推理过程
if self.profiler is not None and not self.profiler_started:
self.profiler.start()
self.profiler_started = True

# 2. 正常执行推理逻辑(移除 with 语句)
input_ids, positions = self.prepare_prefill(seqs) if is_prefill else self.prepare_decode(seqs)
temperatures = self.prepare_sample(seqs) if self.rank == 0 else None

logits = self.run_model(input_ids, positions, is_prefill)
token_ids = self.sampler(logits, temperatures).tolist() if self.rank == 0 else None

reset_context()

# 3. 推进 Schedule
# 根据 wait=1, warmup=1, active=3 的配置,
# 前 2 次调用 step() 不会记录,后 3 次会记录,之后自动停止记录
if self.profiler is not None:
self.profiler.step()

return token_ids


@torch.inference_mode()
def capture_cudagraph(self):
config = self.config
Expand Down