diff --git a/provider/src/inference.rs b/provider/src/inference.rs
index 187b64f7..2a27b734 100644
--- a/provider/src/inference.rs
+++ b/provider/src/inference.rs
@@ -331,6 +331,13 @@ for _name in (
     /// OpenAI-compatible features (chat templates, tool calling, structured
     /// output) without starting an HTTP server.
     fn load_vllm_mlx(&self, py: Python<'_>) -> Result<()> {
+        // Lock sys.path before the model load so no extra packages can be injected.
+        // block_dangerous_modules is intentionally called AFTER _load_model completes:
+        // load_model() may download weights from HuggingFace on a cold start, which
+        // requires socket/urllib. The blocker is installed once the engine is cached
+        // and before the process begins serving inference requests.
+        Self::lock_python_path(py)?;
+
         let model = serde_json::to_string(&self.model_id).context("invalid model path")?;
         let cache_key = serde_json::to_string(&self.cache_key).context("invalid cache key")?;
         let code = format!(
@@ -357,6 +364,10 @@ except Exception as _e:
         let ccode = CString::new(code).context("invalid code string")?;
         py.run(ccode.as_c_str(), None, None)
             .context("failed to initialize vllm-mlx engine via server handler")?;
+
+        // Block dangerous modules now that the model is loaded. Any attempt by
+        // inference-time Python code to import socket, subprocess, etc. will fail.
+        Self::block_dangerous_modules(py)?;
         Ok(())
     }
 
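Neither helper is shown in this diff. For reviewers, here is a minimal sketch of what `lock_python_path` and `block_dangerous_modules` might look like, assuming pyo3 with anyhow-style error handling as in the surrounding code; the blocked-module list and the exact locking strategy are illustrative assumptions, not the actual implementation.

```rust
// Hypothetical sketches of the two helpers referenced in the diff above.
// Assumes pyo3 and anyhow, matching the surrounding code; the bodies are
// illustrative, not the project's actual implementation.
use std::ffi::CString;

use anyhow::{Context, Result};
use pyo3::prelude::*;

/// Sketch: freeze sys.path so later Python code cannot append
/// attacker-controlled package directories.
fn lock_python_path(py: Python<'_>) -> Result<()> {
    let code = r#"
import sys

class _FrozenPath(list):
    # Reject any mutation after the snapshot is taken.
    def _blocked(self, *a, **k):
        raise RuntimeError("sys.path is locked")
    append = insert = extend = remove = pop = clear = _blocked
    __setitem__ = __delitem__ = __iadd__ = _blocked

sys.path = _FrozenPath(sys.path)
"#;
    let ccode = CString::new(code).context("invalid code string")?;
    py.run(ccode.as_c_str(), None, None)
        .context("failed to lock sys.path")
}

/// Sketch: install a sys.meta_path blocker so that importing networking
/// or process-spawning modules fails after the model is loaded.
fn block_dangerous_modules(py: Python<'_>) -> Result<()> {
    let code = r#"
import sys

_BLOCKED = {"socket", "subprocess", "ctypes"}  # assumed list

class _Blocker:
    # Meta-path finders run before the regular import machinery,
    # so raising here prevents a blocked module from loading.
    def find_spec(self, fullname, path=None, target=None):
        if fullname.split(".")[0] in _BLOCKED:
            raise ImportError(f"import of {fullname!r} is blocked")
        return None

# Evict cached copies so `import socket` cannot be satisfied from
# sys.modules without consulting the finder. Code that already holds a
# reference to a blocked module keeps working; only new imports fail.
for _name in list(sys.modules):
    if _name.split(".")[0] in _BLOCKED:
        del sys.modules[_name]

sys.meta_path.insert(0, _Blocker())
"#;
    let ccode = CString::new(code).context("invalid code string")?;
    py.run(ccode.as_c_str(), None, None)
        .context("failed to block dangerous modules")
}
```

The ordering in the diff follows from this design: sys.path can be frozen up front because loading the engine needs no new path entries, while the import blocker must wait until the HuggingFace weight download, which itself needs socket/urllib, has finished.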