diff --git a/provider/src/inference.rs b/provider/src/inference.rs
index 187b64f7..2a27b734 100644
--- a/provider/src/inference.rs
+++ b/provider/src/inference.rs
@@ -331,6 +331,13 @@ for _name in (
     /// OpenAI-compatible features (chat templates, tool calling, structured
     /// output) without starting an HTTP server.
     fn load_vllm_mlx(&self, py: Python<'_>) -> Result<()> {
+        // Lock sys.path before the model load so no extra packages can be injected.
+        // block_dangerous_modules is intentionally called AFTER _load_model completes:
+        // load_model() may download weights from HuggingFace on a cold start, which
+        // requires socket/urllib. The blocker is installed once the engine is cached
+        // and before the process begins serving inference requests.
+        Self::lock_python_path(py)?;
+
         let model = serde_json::to_string(&self.model_id).context("invalid model path")?;
         let cache_key = serde_json::to_string(&self.cache_key).context("invalid cache key")?;
         let code = format!(
@@ -357,6 +364,10 @@ except Exception as _e:
         let ccode = CString::new(code).context("invalid code string")?;
         py.run(ccode.as_c_str(), None, None)
             .context("failed to initialize vllm-mlx engine via server handler")?;
+
+        // Block dangerous modules now that the model is loaded. Any attempt by
+        // inference-time Python code to import socket, subprocess, etc. will fail.
+        Self::block_dangerous_modules(py)?;
         Ok(())
     }
 
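Neither helper is shown in this diff. For reviewers, here is a minimal sketch of what `lock_python_path` and `block_dangerous_modules` might look like, assuming pyo3 with anyhow-style error handling as in the surrounding code; the blocked-module list and the exact locking strategy are illustrative assumptions, not the actual implementation.

```rust
// Hypothetical sketches of the two helpers referenced in the diff above.
// Assumes pyo3 and anyhow, matching the surrounding code; the bodies are
// illustrative, not the project's actual implementation.
use std::ffi::CString;

use anyhow::{Context, Result};
use pyo3::prelude::*;

/// Sketch: freeze sys.path so later Python code cannot append
/// attacker-controlled package directories.
fn lock_python_path(py: Python<'_>) -> Result<()> {
    let code = r#"
import sys

class _FrozenPath(list):
    # Reject any mutation after the snapshot is taken.
    def _blocked(self, *a, **k):
        raise RuntimeError("sys.path is locked")
    append = insert = extend = remove = pop = clear = _blocked
    __setitem__ = __delitem__ = __iadd__ = _blocked

sys.path = _FrozenPath(sys.path)
"#;
    let ccode = CString::new(code).context("invalid code string")?;
    py.run(ccode.as_c_str(), None, None)
        .context("failed to lock sys.path")
}

/// Sketch: install a sys.meta_path blocker so that importing networking
/// or process-spawning modules fails after the model is loaded.
fn block_dangerous_modules(py: Python<'_>) -> Result<()> {
    let code = r#"
import sys

_BLOCKED = {"socket", "subprocess", "ctypes"}  # assumed list

class _Blocker:
    # Meta-path finders run before the regular import machinery,
    # so raising here prevents a blocked module from loading.
    def find_spec(self, fullname, path=None, target=None):
        if fullname.split(".")[0] in _BLOCKED:
            raise ImportError(f"import of {fullname!r} is blocked")
        return None

# Evict cached copies so `import socket` cannot be satisfied from
# sys.modules without consulting the finder. Code that already holds a
# reference to a blocked module keeps working; only new imports fail.
for _name in list(sys.modules):
    if _name.split(".")[0] in _BLOCKED:
        del sys.modules[_name]

sys.meta_path.insert(0, _Blocker())
"#;
    let ccode = CString::new(code).context("invalid code string")?;
    py.run(ccode.as_c_str(), None, None)
        .context("failed to block dangerous modules")
}
```

The ordering in the diff follows from this design: sys.path can be frozen up front because loading the engine needs no new path entries, while the import blocker must wait until the HuggingFace weight download, which itself needs socket/urllib, has finished.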