diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0fa59579ee76..75e6f614fa3f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1043,6 +1043,7 @@ def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs): engine_core: EngineCoreProc | None = None signal_callback: SignalCallback | None = None + exitcode = 0 try: vllm_config: VllmConfig = kwargs["vllm_config"] parallel_config: ParallelConfig = vllm_config.parallel_config @@ -1104,6 +1105,7 @@ def signal_handler(signum, frame): logger.debug("EngineCore exiting.") raise except Exception as e: + exitcode = 1 if engine_core is None: logger.exception("EngineCore failed to start.") else: @@ -1117,6 +1119,12 @@ def signal_handler(signum, frame): signal_callback.stop() if engine_core is not None: engine_core.shutdown() + # Use os._exit() to terminate this subprocess immediately + # after cleanup. This skips Python's threading._shutdown() + # which can hang indefinitely when third-party libraries + # (e.g., httpx from huggingface-hub >= 1.7) create + # non-daemon background threads that block interpreter exit. + os._exit(exitcode) def _init_data_parallel(self, vllm_config: VllmConfig): pass