diff --git a/CHANGELOG.md b/CHANGELOG.md
index 995919121c..cd99103896 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,7 +10,7 @@ All notable changes to this project will be documented in this file.
 
 ### Enhancements made
 
-- If ServerApp.ip is ipv6 use [::1] as local_url [#1495](https://github.com/jupyter-server/jupyter_server/pull/1495) ([@manics](https://github.com/manics))
+- If ServerApp.ip is ipv6 use \[::1\] as local_url [#1495](https://github.com/jupyter-server/jupyter_server/pull/1495) ([@manics](https://github.com/manics))
 - Don't hide .so,.dylib files by default [#1457](https://github.com/jupyter-server/jupyter_server/pull/1457) ([@nokados](https://github.com/nokados))
 - Add async start hook to ExtensionApp API [#1417](https://github.com/jupyter-server/jupyter_server/pull/1417) ([@Zsailer](https://github.com/Zsailer))
diff --git a/docs/source/api/jupyter_server.prometheus.rst b/docs/source/api/jupyter_server.prometheus.rst
index b7b01e2753..1f6d7abe25 100644
--- a/docs/source/api/jupyter_server.prometheus.rst
+++ b/docs/source/api/jupyter_server.prometheus.rst
@@ -16,6 +16,12 @@ Submodules
    :show-inheritance:
    :undoc-members:
 
+
+.. automodule:: jupyter_server.prometheus.server
+   :members:
+   :show-inheritance:
+   :undoc-members:
+
 Module contents
 ---------------
 
diff --git a/jupyter_server/prometheus/__init__.py b/jupyter_server/prometheus/__init__.py
index e69de29bb2..ca367effc6 100644
--- a/jupyter_server/prometheus/__init__.py
+++ b/jupyter_server/prometheus/__init__.py
@@ -0,0 +1,28 @@
+"""
+Prometheus metrics integration for Jupyter Server.
+
+This module provides Prometheus metrics collection and exposure for Jupyter Server.
+"""
+
+from .metrics import (
+    ACTIVE_DURATION,
+    KERNEL_CURRENTLY_RUNNING_TOTAL,
+    LAST_ACTIVITY,
+    SERVER_EXTENSION_INFO,
+    SERVER_INFO,
+    SERVER_STARTED,
+    TERMINAL_CURRENTLY_RUNNING_TOTAL,
+)
+from .server import PrometheusMetricsServer, start_metrics_server
+
+__all__ = [
+    "KERNEL_CURRENTLY_RUNNING_TOTAL",
+    "TERMINAL_CURRENTLY_RUNNING_TOTAL",
+    "SERVER_INFO",
+    "SERVER_EXTENSION_INFO",
+    "LAST_ACTIVITY",
+    "SERVER_STARTED",
+    "ACTIVE_DURATION",
+    "PrometheusMetricsServer",
+    "start_metrics_server",
+]
diff --git a/jupyter_server/prometheus/server.py b/jupyter_server/prometheus/server.py
new file mode 100644
index 0000000000..57bc9df324
--- /dev/null
+++ b/jupyter_server/prometheus/server.py
@@ -0,0 +1,265 @@
+"""
+Prometheus metrics server for Jupyter Server.
+
+This module provides functionality to start a separate Prometheus metrics server
+that exposes Jupyter-specific metrics on a dedicated port.
+
+Note on HTTP Request Metrics:
+The separate metrics server uses the same prometheus registry as the main server.
+HTTP request duration metrics (http_request_duration_seconds) are recorded by the
+main server's logging system when record_http_request_metrics=True. Since both
+servers share the same registry, these metrics will be available in the separate
+metrics server as well.
+
+The record_http_request_metrics parameter controls whether the main server records
+these metrics, and the separate metrics server will automatically reflect this
+setting since it uses the same underlying metrics collection.
+
+Authentication:
+The separate metrics server reuses the main server's authentication settings and
+handler infrastructure, ensuring consistent behavior.
+""" + +import asyncio +import socket +import threading +import time +import warnings +from typing import Optional + +import tornado.httpserver +import tornado.ioloop +import tornado.web + +from jupyter_server._version import __version__ +from jupyter_server.base.handlers import PrometheusMetricsHandler +from jupyter_server.prometheus.metrics import ( + ACTIVE_DURATION, + KERNEL_CURRENTLY_RUNNING_TOTAL, + LAST_ACTIVITY, + SERVER_EXTENSION_INFO, + SERVER_INFO, + SERVER_STARTED, + TERMINAL_CURRENTLY_RUNNING_TOTAL, +) + + +class PrometheusMetricsServer: + """A separate Tornado server for serving Prometheus metrics.""" + + def __init__(self, app): + """Initialize the metrics server.""" + self.app = app + self.port = None + self.server = None + self.ioloop = None + self.thread = None + self._running = False + + def initialize_metrics(self): + """Initialize Jupyter-specific metrics for this server instance.""" + # Set server version info + SERVER_INFO.info({"version": __version__}) + + # Set up extension info + for ext in self.app.extension_manager.extensions.values(): + SERVER_EXTENSION_INFO.labels( + name=ext.name, version=ext.version, enabled=str(ext.enabled).lower() + ).info({}) + + # Set server start time + started = self.app.web_app.settings["started"] + SERVER_STARTED.set(started.timestamp()) + + # Set up activity tracking + LAST_ACTIVITY.set_function(lambda: self.app.web_app.last_activity().timestamp()) + ACTIVE_DURATION.set_function( + lambda: ( + self.app.web_app.last_activity() - self.app.web_app.settings["started"] + ).total_seconds() + ) + + # Set up kernel and terminal metrics + self._setup_runtime_metrics() + + # Note: HTTP request metrics are recorded by the main server's logging system + # via the log_request function when record_http_request_metrics=True. + # The separate metrics server uses the same prometheus registry, so those + # metrics will be available here as well. 
+ + def _setup_runtime_metrics(self): + """Set up metrics that track runtime state.""" + + # Set up kernel count tracking + def update_kernel_metrics(): + try: + kernel_manager = self.app.kernel_manager + if hasattr(kernel_manager, "list_kernel_ids"): + kernel_ids = kernel_manager.list_kernel_ids() + # Reset all kernel type metrics to 0 + for kernel_type in set(KERNEL_CURRENTLY_RUNNING_TOTAL._metrics.keys()): + KERNEL_CURRENTLY_RUNNING_TOTAL.labels(type=kernel_type).set(0) + + # Count kernels by type + kernel_types: dict[str, int] = {} + for kid in kernel_ids: + try: + kernel = kernel_manager.get_kernel(kid) + if hasattr(kernel, "kernel_name"): + kernel_type = kernel.kernel_name + else: + kernel_type = "unknown" + kernel_types[kernel_type] = kernel_types.get(kernel_type, 0) + 1 + except Exception: + kernel_types["unknown"] = kernel_types.get("unknown", 0) + 1 + + # Update metrics + for kernel_type, count in kernel_types.items(): + KERNEL_CURRENTLY_RUNNING_TOTAL.labels(type=kernel_type).set(count) + except Exception as e: + self.app.log.debug(f"Error updating kernel metrics: {e}") + + # Set up terminal count tracking + def update_terminal_metrics(): + try: + terminal_manager = getattr(self.app, "terminal_manager", None) + if terminal_manager and hasattr(terminal_manager, "list"): + terminal_count = len(terminal_manager.list()) + TERMINAL_CURRENTLY_RUNNING_TOTAL.set(terminal_count) + else: + TERMINAL_CURRENTLY_RUNNING_TOTAL.set(0) + except Exception as e: + self.app.log.debug(f"Error updating terminal metrics: {e}") + + # Set up periodic updates + def periodic_update(): + update_kernel_metrics() + update_terminal_metrics() + + # Run initial update + periodic_update() + + # Store the periodic update function to be called from the metrics server thread + self._periodic_update = periodic_update + + def start(self, port: int = 9090) -> None: + """Start the metrics server on the specified port.""" + if self._running: + return + + # Initialize Jupyter metrics + self.initialize_metrics() + + # Create Tornado application with metrics handler + app = tornado.web.Application( + [ + (r"/metrics", PrometheusMetricsHandler), + ] + ) + + # Create HTTP server + self.server = tornado.httpserver.HTTPServer(app) + + # Try to bind to the specified port + try: + self.server.bind(port) + self.port = port + except OSError: + # If port is in use, try alternative ports + for alt_port in range(port + 1, port + 10): + try: + self.server.bind(alt_port) + self.port = alt_port + break + except OSError: + continue + else: + raise RuntimeError(f"Could not bind to any port starting from {port}") + + # Start the server in a separate thread + self.thread = threading.Thread(target=self._start_metrics_loop, daemon=True) + self.thread.start() + + # Wait for server to be ready + self._wait_for_server_ready() + self._running = True + + def _start_metrics_loop(self) -> None: + """Start the IOLoop in a separate thread.""" + try: + # Create a new IOLoop for this thread + self.ioloop = tornado.ioloop.IOLoop() + + # Set as current event loop for this thread + asyncio.set_event_loop(self.ioloop.asyncio_loop) + + # Start the server + self.server.start(1) # Single process + + # Set up periodic updates in this IOLoop + def periodic_update_wrapper(): + if hasattr(self, "_periodic_update"): + self._periodic_update() + # Schedule next update in 30 seconds + self.ioloop.call_later(30, periodic_update_wrapper) + + # Start periodic updates + self.ioloop.call_later(30, periodic_update_wrapper) + + # Start the IOLoop + self.ioloop.start() + 
except Exception as e: + # Log error but don't raise to avoid unhandled thread exceptions + print(f"Metrics server error: {e}") + + def _wait_for_server_ready(self, timeout: float = 5.0) -> None: + """Wait for the server to be ready to accept connections.""" + start_time = time.time() + while time.time() - start_time < timeout: + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(0.1) + s.connect(("localhost", self.port)) + return + except OSError: + time.sleep(0.1) + raise TimeoutError(f"Server not ready after {timeout} seconds") + + def stop(self) -> None: + """Stop the metrics server.""" + if not self._running: + return + + self._running = False + + # Stop the server + if self.server: + self.server.stop() + + # Stop the IOLoop + if self.ioloop: + try: + self.ioloop.add_callback(self.ioloop.stop) + except Exception: + pass + + # Wait for thread to finish + if self.thread and self.thread.is_alive(): + self.thread.join(timeout=2.0) + + # Clean up + self.server = None + self.ioloop = None + self.thread = None + self.port = None + + +def start_metrics_server(app, port: int = 9090) -> Optional[PrometheusMetricsServer]: + """Start a metrics server for the given app.""" + try: + server = PrometheusMetricsServer(app) + server.start(port) + return server + except Exception as e: + print(f"Failed to start metrics server: {e}") + return None diff --git a/jupyter_server/serverapp.py b/jupyter_server/serverapp.py index 1c70dd60ab..987ac08ca4 100644 --- a/jupyter_server/serverapp.py +++ b/jupyter_server/serverapp.py @@ -113,6 +113,8 @@ from jupyter_server.log import log_request from jupyter_server.prometheus.metrics import ( ACTIVE_DURATION, + HTTP_REQUEST_DURATION_SECONDS, + KERNEL_CURRENTLY_RUNNING_TOTAL, LAST_ACTIVITY, SERVER_EXTENSION_INFO, SERVER_INFO, @@ -301,7 +303,7 @@ def __init__( websocket_ping_interval=websocket_ping_interval, websocket_ping_timeout=websocket_ping_timeout, ) - handlers = self.init_handlers(default_services, settings) + handlers = self.init_handlers(default_services, settings, jupyter_app) undecorated_methods = [] for matcher, handler, *_ in handlers: @@ -480,7 +482,7 @@ def init_settings( settings["xsrf_cookie_kwargs"] = {"path": base_url} return settings - def init_handlers(self, default_services, settings): + def init_handlers(self, default_services, settings, jupyter_app): """Load the (URL pattern, handler) tuples for each component.""" # Order matters. The first handler to match the URL will handle the request. handlers = [] @@ -511,7 +513,13 @@ def init_handlers(self, default_services, settings): handlers.extend(settings["identity_provider"].get_handlers()) # register base handlers last - handlers.extend(load_handlers("jupyter_server.base.handlers")) + base_handlers = load_handlers("jupyter_server.base.handlers") + + # If a separate metrics server is running, exclude the /metrics handler from main server + if jupyter_app.metrics_port: + base_handlers = [h for h in base_handlers if h[0] != r"/metrics"] + + handlers.extend(base_handlers) if settings["default_url"] != settings["base_url"]: # set the URL that will be redirected from `/` @@ -2006,8 +2014,19 @@ def _default_terminals_enabled(self) -> bool: Set to False to disable recording the http_request_duration_seconds metric. """, + config=True, + ) + + metrics_port = Integer( + 9090, + help="Port to expose metrics server on alternate port (set to 0 to disable). 
When set, disables /metrics endpoint on main server.", + config=True, ) + @default("metrics_port") + def _metrics_port_default(self) -> int: + return int(os.getenv("JUPYTER_SERVER_METRICS_PORT", "9090")) + static_immutable_cache = List( Unicode(), help=""" @@ -2821,6 +2840,11 @@ def initialize( self.init_mime_overrides() self.init_shutdown_no_activity() self.init_metrics() + + # Start metrics server after webapp is initialized, so handlers can be properly excluded + if self.metrics_port: + self._start_metrics_server(self.metrics_port) + if new_httpserver: self.init_httpserver() @@ -2862,6 +2886,18 @@ def running_server_info(self, kernel_count: bool = True) -> str: info += _i18n("Jupyter Server {version} is running at:\n{url}").format( version=ServerApp.version, url=self.display_url ) + info += "\n" + # Show metrics URL - if metrics_port is set, the separate server is guaranteed to be running + if self.metrics_port: + info += _i18n("Metrics server is running at:\n{url}").format( + url=f"http://localhost:{self.metrics_server.port}/metrics" + ) + info += "\n" + else: + info += _i18n("Metrics are available at:\n{url}").format( + url=f"{self.connection_url.rstrip('/')}/metrics" + ) + info += "\n" if self.gateway_config.gateway_enabled: info += ( _i18n("\nKernels will be managed by the Gateway server running at:\n%s") @@ -3025,6 +3061,22 @@ def _prepare_browser_open(self) -> tuple[str, t.Optional[str]]: return assembled_url, open_file + def _start_metrics_server(self, port): + """Start a separate metrics server on the specified port using Jupyter's Prometheus integration.""" + from jupyter_server.prometheus.server import start_metrics_server + + try: + self.metrics_server = start_metrics_server(self, port) + # Check if the metrics server actually started (has a port) + if not hasattr(self.metrics_server, "port") or self.metrics_server.port is None: + raise RuntimeError("Metrics server failed to start - no port assigned") + + self.log.info(f"Metrics server is running on port {self.metrics_server.port}") + + except Exception as e: + self.log.error(f"Failed to start metrics server: {e}") + raise RuntimeError(f"Metrics server is required but failed to start: {e}") + def launch_browser(self) -> None: """Launch the browser.""" # Deferred import for environments that do not have @@ -3094,6 +3146,27 @@ def start_app(self) -> None: if self.identity_provider.token and self.identity_provider.token_generated: # log full URL with generated token, so there's a copy/pasteable link # with auth info. + + # Determine metrics URL based on whether separate metrics server is running + if ( + self.metrics_port + and hasattr(self.metrics_server, "port") + and self.metrics_server.port is not None + ): + # Separate metrics server is running + if self.authenticate_prometheus: + metrics_url = f"http://localhost:{self.metrics_server.port}/metrics?token={self.identity_provider.token}" + else: + metrics_url = f"http://localhost:{self.metrics_server.port}/metrics" + else: + # Metrics are served on main server + # Use the connection_url as base and append /metrics + base_url = self.connection_url.rstrip("/") + if self.authenticate_prometheus: + metrics_url = f"{base_url}/metrics?token={self.identity_provider.token}" + else: + metrics_url = f"{base_url}/metrics" + if self.sock: self.log.critical( "\n".join( @@ -3106,6 +3179,10 @@ def start_app(self) -> None: f"the instance via e.g.`ssh -L 8888:{self.sock} -N user@this_host` and then " f"open e.g. {self.connection_url} in a browser." 
), + _i18n( + "To access metrics, open this endpoint in a browser:", + ), + f" {metrics_url}", ] ) ) @@ -3115,6 +3192,10 @@ def start_app(self) -> None: "\n", _i18n("To access the server, copy and paste one of these URLs:"), " %s" % self.display_url, + _i18n( + "To access metrics, open this endpoint in a browser:", + ), + f" {metrics_url}", ] else: message = [ @@ -3127,6 +3208,10 @@ def start_app(self) -> None: "Or copy and paste one of these URLs:", ), " %s" % self.display_url, + _i18n( + "To access metrics, open this endpoint in a browser:", + ), + f" {metrics_url}", ] self.log.critical("\n".join(message)) @@ -3159,6 +3244,10 @@ async def _cleanup(self) -> None: if hasattr(self, "http_server"): # Stop a server if its set. self.http_server.stop() + if hasattr(self, "metrics_server"): + # Stop the metrics server if it's running + if hasattr(self.metrics_server, "stop"): + self.metrics_server.stop() def start_ioloop(self) -> None: """Start the IO Loop.""" diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index ae160e6707..7c9b24ff83 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -25,19 +25,16 @@ def _validate_keys(expect_defined: bool, model: dict[str, Any], keys: list[str]): - """ - Validate that the keys are defined (i.e. not None) or not (i.e. None) - """ - + """Validate that keys are defined or not defined as expected.""" if expect_defined: - errors = [key for key in keys if model[key] is None] + errors = {key: model[key] for key in keys if model[key] is None} if errors: raise web.HTTPError( 500, f"Keys unexpectedly None: {errors}", ) else: - errors = {key: model[key] for key in keys if model[key] is not None} # type: ignore[assignment] + errors = {key: model[key] for key in keys if model[key] is not None} if errors: raise web.HTTPError( 500, @@ -400,7 +397,7 @@ class NotebooksRedirectHandler(JupyterHandler): "PATCH", "POST", "DELETE", - ) # type:ignore[assignment] + ) @allow_unauthenticated def get(self, path): diff --git a/tests/conftest.py b/tests/conftest.py index 440bcf9c09..ecf25d4d74 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,13 @@ import os +import time +import warnings + +# Disable metrics server for all tests by default +os.environ["JUPYTER_SERVER_METRICS_PORT"] = "0" + +# Suppress deprecation warnings and thread exceptions for tests +warnings.filterwarnings("ignore", category=DeprecationWarning, message="make_current is deprecated") +warnings.filterwarnings("ignore", category=ResourceWarning) # isort: off # This must come before any Jupyter imports. 
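Taken together, the ServerApp changes introduce a `metrics_port` trait whose default comes from `JUPYTER_SERVER_METRICS_PORT` (which the conftest.py change above pins to `0` so the test suite keeps `/metrics` on the main server). A hedged configuration sketch using only names added in this diff; the values are illustrative:

```python
# jupyter_server_config.py (illustrative values)
c = get_config()  # noqa -- injected by Jupyter's config loader

# Serve Prometheus metrics from a dedicated port; init_handlers() then drops
# the /metrics handler from the main server.
c.ServerApp.metrics_port = 9090

# Or keep /metrics on the main, authenticated server and skip the standalone one:
# c.ServerApp.metrics_port = 0
```

The same choice can be made per environment, e.g. `JUPYTER_SERVER_METRICS_PORT=9091 jupyter server`, which `_metrics_port_default()` reads at startup.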
@@ -18,6 +27,14 @@ pytest_plugins = ["jupyter_server.pytest_plugin"] +@pytest.fixture(autouse=True) +def cleanup_metrics_threads(): + """Ensure metrics server threads are cleaned up between tests.""" + yield + # Give any remaining daemon threads time to clean up + time.sleep(0.1) + + def pytest_addoption(parser): parser.addoption( "--integration_tests", diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 0000000000..55c8085562 --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,194 @@ +"""Tests for Jupyter Server metrics functionality.""" + +import socket +import time +from unittest.mock import patch + +import pytest +import requests + +from jupyter_server.prometheus.server import PrometheusMetricsServer, start_metrics_server +from jupyter_server.serverapp import ServerApp + + +def find_available_port(start_port=9090, max_attempts=10): + """Find an available port starting from start_port.""" + for i in range(max_attempts): + port = start_port + i + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", port)) + return port + except OSError: + continue + raise RuntimeError(f"Could not find available port starting from {start_port}") + + +def wait_for_server(url, timeout=10, interval=0.1): + """Wait for a server to be ready to accept connections.""" + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(url, timeout=1) + return response + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): + time.sleep(interval) + raise TimeoutError(f"Server at {url} not ready after {timeout} seconds") + + +@pytest.fixture(autouse=True) +def cleanup_metrics_servers(): + """Ensure metrics servers are cleaned up after each test.""" + yield + # Give any remaining threads time to clean up + time.sleep(0.3) + + +@pytest.fixture +def metrics_server_app(): + """Create a server app with metrics enabled on a specific port.""" + # Use a unique port for this test + port = find_available_port(9090) + # Override the environment variable for this test + with patch.dict("os.environ", {"JUPYTER_SERVER_METRICS_PORT": str(port)}): + app = ServerApp() + # Set the metrics_port directly as a trait + app.metrics_port = port + app.initialize([]) + return app + + +@pytest.fixture +def standalone_metrics_server(): + """Create a standalone metrics server for testing.""" + port = find_available_port(9091) + server = PrometheusMetricsServer(port=port) + server.start() + # Wait for server to be ready + time.sleep(0.5) + yield server + server.stop() + + +def test_metrics_server_startup(standalone_metrics_server): + """Test that metrics server starts correctly.""" + assert standalone_metrics_server.port is not None + assert standalone_metrics_server.port > 0 + + # Test that metrics endpoint is accessible + response = wait_for_server(f"http://localhost:{standalone_metrics_server.port}/metrics") + assert response.status_code == 200 + assert "jupyter_server_info" in response.text + + +def test_metrics_server_with_authentication(): + """Test metrics server with authentication enabled.""" + port = find_available_port(9092) + + # Create a server app with authentication + with patch.dict("os.environ", {"JUPYTER_SERVER_METRICS_PORT": str(port)}): + app = ServerApp() + app.metrics_port = port + app.authenticate_prometheus = True + app.initialize([]) + + # Start the app + app.start_app() + + # Wait for both servers to be ready + time.sleep(1.0) + + try: + # Get the token + token = app.identity_provider.token 
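+        # The standalone metrics handler reuses the main app's identity provider
+        # and authenticate_prometheus setting, so the same token is expected to
+        # authorize requests on the separate metrics port.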
+ + # Test metrics endpoint with token + response = wait_for_server(f"http://localhost:{port}/metrics?token={token}", timeout=5) + assert response.status_code == 200 + assert "jupyter_server_info" in response.text + + # Test without token should fail + try: + response = requests.get(f"http://localhost:{port}/metrics", timeout=2) + assert response.status_code == 403 + except requests.exceptions.ConnectionError: + # Server might not be ready yet, which is also acceptable + pass + + finally: + app.stop() + + +def test_metrics_server_without_authentication(): + """Test metrics server without authentication.""" + port = find_available_port(9093) + + # Create a server app without authentication + with patch.dict("os.environ", {"JUPYTER_SERVER_METRICS_PORT": str(port)}): + app = ServerApp() + app.metrics_port = port + app.authenticate_prometheus = False + app.initialize([]) + + # Start the app + app.start_app() + + # Wait for both servers to be ready + time.sleep(1.0) + + try: + # Test metrics endpoint without token should work + response = wait_for_server(f"http://localhost:{port}/metrics", timeout=5) + assert response.status_code == 200 + assert "jupyter_server_info" in response.text + + finally: + app.stop() + + +def test_metrics_server_port_conflict(): + """Test that metrics server handles port conflicts gracefully.""" + # Use a port that's likely to be in use + port = 8888 # Default Jupyter port + + # Create a server app that should fail to start metrics server + with patch.dict("os.environ", {"JUPYTER_SERVER_METRICS_PORT": str(port)}): + app = ServerApp() + app.metrics_port = port + app.initialize([]) + + # Start the app - should not crash + app.start_app() + + try: + # The app should still be running even if metrics server failed + assert app.http_server is not None + + finally: + app.stop() + + +def test_metrics_server_disabled(): + """Test that metrics server is disabled when port is 0.""" + with patch.dict("os.environ", {"JUPYTER_SERVER_METRICS_PORT": "0"}): + app = ServerApp() + app.metrics_port = 0 + app.initialize([]) + + # Start the app + app.start_app() + + # Wait for server to be ready + time.sleep(0.5) + + try: + # Metrics should be available on main server + token = app.identity_provider.token + response = wait_for_server( + f"http://localhost:{app.port}/metrics?token={token}", timeout=5 + ) + assert response.status_code == 200 + assert "jupyter_server_info" in response.text + + finally: + app.stop()
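Beyond the automated tests, the same endpoint can be checked by hand. A sketch assuming a server launched with `metrics_port=9090` and `authenticate_prometheus=True`; the token placeholder must be replaced with the value the server logs at startup:

```python
import requests

METRICS_URL = "http://localhost:9090/metrics"  # assumed metrics_port
TOKEN = "<token printed by `jupyter server` at startup>"  # placeholder

resp = requests.get(METRICS_URL, params={"token": TOKEN}, timeout=5)
resp.raise_for_status()

# SERVER_INFO is registered in initialize_metrics(), so jupyter_server_info
# should be present whenever the standalone server is up.
assert "jupyter_server_info" in resp.text
```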