diff --git a/api/resources/openapi.yml b/api/resources/openapi.yml index 8522980c..a4b7a29d 100644 --- a/api/resources/openapi.yml +++ b/api/resources/openapi.yml @@ -1,7 +1,7 @@ openapi: 3.0.2 info: title: Name Resolver - version: 1.5.1 + version: 1.5.2 email: bizon@renci.org name: Chris Bizon x-id: https://github.com/cbizon diff --git a/api/server.py b/api/server.py index 3c3ec88a..8fc60a97 100755 --- a/api/server.py +++ b/api/server.py @@ -8,9 +8,13 @@ * Matching names are returned first, followed by non-matching names """ import json -import logging, warnings +import logging +import statistics +import time +import warnings import os import re +from collections import deque from typing import Dict, List, Union, Annotated, Optional from fastapi import Body, FastAPI, Query @@ -20,12 +24,16 @@ from starlette.middleware.cors import CORSMiddleware from .apidocs import get_app_info, construct_open_api_schema +from .solr import SolrClient -LOGGER = logging.getLogger(__name__) SOLR_HOST = os.getenv("SOLR_HOST", "localhost") SOLR_PORT = os.getenv("SOLR_PORT", "8983") +solr_client = SolrClient(SOLR_HOST, int(SOLR_PORT)) + app = FastAPI(**get_app_info()) +logger = logging.getLogger(__name__) +logging.basicConfig(level=os.getenv("LOGLEVEL", logging.INFO)) app.add_middleware( CORSMiddleware, @@ -35,6 +43,16 @@ allow_headers=["*"], ) +# We track (timestamp_s, duration_ms) for recent queries to compute both latency and rate stats. +# A large default covers ~100s at 500 qps, giving meaningful rate windows even under heavy load. +QUERY_LOG_SIZE = int(os.getenv("QUERY_LOG_SIZE", 50000)) +query_log: deque = deque(maxlen=QUERY_LOG_SIZE) + +# Queries slower than this threshold will be logged at WARNING level (see documentation/Performance.md). 
+SLOW_QUERY_THRESHOLD_MS = float(os.getenv("SLOW_QUERY_THRESHOLD_MS", "500")) +IDEAL_QUERY_THRESHOLD_MS = 100 # below this: "ideal" +VERY_SLOW_QUERY_THRESHOLD_MS = 1000 # at or above this: "very slow" + # ENDPOINT / # If someone tries accessing /, we should redirect them to the Swagger interface. @app.get("/", include_in_schema=False) @@ -50,53 +68,159 @@ async def docs_redirect(): description="This endpoint will return status information and a list of counts from the underlying Solr " "instance for this NameRes instance." ) -async def status_get() -> Dict: +async def status_get( + full: bool = Query( + False, + description=( + "When false (default), only the Solr cores endpoint is called, returning basic index stats " + "(numDocs, startTime, etc.) with jvm, os, and cache as null. " + "Set to true to also fetch JVM memory, OS metrics, and cache statistics — " + "useful for diagnostics but should not be used for frequent liveness probes." + ), + ) +) -> Dict: """ Return status and count information from the underyling Solr instance. """ - return await status() + return await status(full=full) -async def status() -> Dict: +async def status(full: bool = False) -> Dict: """ Return a dictionary containing status and count information for the underlying Solr instance. """ - query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/cores" - async with httpx.AsyncClient(timeout=None) as client: - response = await client.get(query_url, params={ - 'action': 'STATUS' - }) - if response.status_code >= 300: - LOGGER.error("Solr error on accessing /solr/admin/cores?action=STATUS: %s", response.text) - response.raise_for_status() - result = response.json() + solr = await solr_client.fetch_status(full=full) # Do we know the Babel version and version URL? It will be stored in an environmental variable if we do. 
babel_version = os.environ.get("BABEL_VERSION", "unknown") babel_version_url = os.environ.get("BABEL_VERSION_URL", "") - # We should have a status for name_lookup_shard1_replica_n1. - if 'status' in result and 'name_lookup_shard1_replica_n1' in result['status']: - core = result['status']['name_lookup_shard1_replica_n1'] + # Look up the BIOLINK_MODEL_TAG. + # Note: this should be a tag from the Biolink Model repo, e.g. "master" or "v4.3.6". + biolink_model_tag = os.environ.get("BIOLINK_MODEL_TAG", "master") + biolink_model_url = f"https://github.com/biolink/biolink-model/tree/{biolink_model_tag}" + biolink_model_download_url = f"https://raw.githubusercontent.com/biolink/biolink-model/{biolink_model_tag}/biolink-model.yaml" + + # Figure out the NameRes version. + nameres_version = "master" + app_info = get_app_info() + if 'version' in app_info and app_info['version']: + nameres_version = 'v' + app_info['version'] + # Unpack query_log into parallel lists for latency and rate computations. + log_snapshot = list(query_log) # snapshot to avoid mutation during computation + # Sort by timestamp: concurrent requests complete in a different order than they started, + # so insertion order does not reflect arrival order. + log_snapshot.sort(key=lambda x: x[0]) + timestamps = [ts for ts, _ in log_snapshot] + durations = [dur for _, dur in log_snapshot] + + # Latency percentiles. + if len(durations) >= 2: + qs = statistics.quantiles(durations, n=100) + p50, p95, p99 = qs[49], qs[94], qs[98] + else: + p50 = p95 = p99 = None + + # Inter-arrival times (gaps between consecutive query start timestamps, in ms). + # Requires >= 3 timestamps (>= 2 gaps) because statistics.quantiles needs at least 2 data points. 
+ inter_arrival_ms = None + if len(timestamps) >= 3: + gaps = [(timestamps[i] - timestamps[i - 1]) * 1000 for i in range(1, len(timestamps))] + gaps_sorted = sorted(gaps) + inter_arrival_ms = { + 'mean': round(sum(gaps) / len(gaps), 2), + 'median': round(statistics.median(gaps), 2), + 'min': round(gaps_sorted[0], 2), + 'max': round(gaps_sorted[-1], 2), + 'p95': round(statistics.quantiles(gaps, n=100)[94], 2), + } + + # Latency buckets: fraction of queries in each performance tier. + total = len(durations) + if total: + n_ideal = n_fine = n_slow = n_very_slow = 0 + for d in durations: + if d < IDEAL_QUERY_THRESHOLD_MS: + n_ideal += 1 + elif d < SLOW_QUERY_THRESHOLD_MS: + n_fine += 1 + elif d < VERY_SLOW_QUERY_THRESHOLD_MS: + n_slow += 1 + else: + n_very_slow += 1 + latency_buckets = { + 'slow_threshold_ms': SLOW_QUERY_THRESHOLD_MS, + 'ideal_pct': round(n_ideal / total, 4), + 'fine_pct': round(n_fine / total, 4), + 'slow_pct': round(n_slow / total, 4), + 'very_slow_pct': round(n_very_slow / total, 4), + } + else: + latency_buckets = None + + # Windowed query rates. Scan from newest to oldest, stopping at the largest window. 
+ now = time.time() + count_10s = count_60s = count_300s = 0 + for ts in reversed(timestamps): + age = now - ts + if age <= 300: + count_300s += 1 + if age <= 60: + count_60s += 1 + if age <= 10: + count_10s += 1 + else: + break + + history_span = (timestamps[-1] - timestamps[0]) if len(timestamps) >= 2 else 0 + time_since_last = (now - timestamps[-1]) if timestamps else None + + recent_queries = { + 'count': len(durations), + 'mean_time_ms': round(sum(durations) / len(durations), 2) if durations else -1, + 'p50_ms': p50, + 'p95_ms': p95, + 'p99_ms': p99, + 'latency_buckets': latency_buckets, + 'rate': { + 'history_span_seconds': round(history_span, 1), + 'time_since_last_query_seconds': round(time_since_last, 2) if time_since_last is not None else None, + 'queries_last_10s': count_10s, + 'queries_per_second_last_10s': round(count_10s / 10, 2), + 'queries_last_60s': count_60s, + 'queries_per_second_last_60s': round(count_60s / 60, 2), + 'queries_last_300s': count_300s, + 'queries_per_second_last_300s': round(count_300s / 300, 2), + 'inter_arrival_ms': inter_arrival_ms, + }, + } - index = {} - if 'index' in core: - index = core['index'] + biolink_model = { + 'tag': biolink_model_tag, + 'url': biolink_model_url, + 'download_url': biolink_model_download_url, + } + if solr['found']: + solr_dict = {k: v for k, v in solr.items() if k != 'found'} return { 'status': 'ok', 'message': 'Reporting results from primary core.', 'babel_version': babel_version, 'babel_version_url': babel_version_url, - 'startTime': core['startTime'], - 'numDocs': index.get('numDocs', ''), - 'maxDoc': index.get('maxDoc', ''), - 'deletedDocs': index.get('deletedDocs', ''), - 'version': index.get('version', ''), - 'segmentCount': index.get('segmentCount', ''), - 'lastModified': index.get('lastModified', ''), - 'size': index.get('size', ''), + 'biolink_model': biolink_model, + 'nameres_version': nameres_version, + 'recent_queries': recent_queries, + 'solr': solr_dict, } else: return { 'status': 
'error', - 'message': 'Expected core not found.' + 'message': 'Expected core not found.', + 'babel_version': babel_version, + 'babel_version_url': babel_version_url, + 'biolink_model': biolink_model, + 'nameres_version': nameres_version, + 'solr': { + 'jvm': solr['jvm'], + 'os': solr['os'], + }, } @@ -125,7 +249,7 @@ async def reverse_lookup_get( ) ) -> Dict[str, Dict]: """Returns a list of synonyms for a particular CURIE.""" - return await reverse_lookup(curies) + return await curie_lookup(curies) @app.get( @@ -135,14 +259,14 @@ async def reverse_lookup_get( response_model=Dict[str, Dict], tags=["lookup"], ) -async def lookup_names_get( +async def synonyms_get( preferred_curies: List[str]= Query( example=["MONDO:0005737", "MONDO:0009757"], description="A list of CURIEs to look up synonyms for." ) ) -> Dict[str, Dict]: """Returns a list of synonyms for a particular CURIE.""" - return await reverse_lookup(preferred_curies) + return await curie_lookup(preferred_curies) @app.post( @@ -159,7 +283,7 @@ async def lookup_names_post( }), ) -> Dict[str, Dict]: """Returns a list of synonyms for a particular CURIE.""" - return await reverse_lookup(request.curies) + return await curie_lookup(request.curies) @app.post( @@ -169,17 +293,18 @@ async def lookup_names_post( response_model=Dict[str, Dict], tags=["lookup"], ) -async def lookup_names_post( +async def synonyms_post( request: SynonymsRequest = Body(..., example={ "preferred_curies": ["MONDO:0005737", "MONDO:0009757"], }), ) -> Dict[str, Dict]: """Returns a list of synonyms for a particular CURIE.""" - return await reverse_lookup(request.preferred_curies) + return await curie_lookup(request.preferred_curies) -async def reverse_lookup(curies) -> Dict[str, Dict]: +async def curie_lookup(curies) -> Dict[str, Dict]: """Returns a list of synonyms for a particular CURIE.""" + time_start = time.perf_counter_ns() query = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/name_lookup/select" curie_filter = " OR ".join( f"curie:\"{curie}\"" @@ 
-199,6 +324,10 @@ async def reverse_lookup(curies) -> Dict[str, Dict]: } for doc in response_json["response"]["docs"]: output[doc["curie"]] = doc + time_end = time.perf_counter_ns() + + logger.info(f"CURIE Lookup on {len(curies)} CURIEs {json.dumps(curies)} took {(time_end - time_start)/1_000_000:.2f}ms") + return output class LookupResult(BaseModel): @@ -351,6 +480,8 @@ async def lookup(string: str, will be returned, rather than filtering to concepts that are both PhenotypicFeature and Disease. """ + time_start = time.perf_counter_ns() + # First, we strip and lowercase the query since all our indexes are case-insensitive. string_lc = string.strip().lower() @@ -459,16 +590,18 @@ async def lookup(string: str, "fields": "*, score", "params": inner_params, } - logging.debug(f"Query: {json.dumps(params, indent=2)}") + logger.debug(f"Query: {json.dumps(params, indent=2)}") + time_solr_start = time.perf_counter_ns() query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/name_lookup/select" async with httpx.AsyncClient(timeout=None) as client: response = await client.post(query_url, json=params) if response.status_code >= 300: - LOGGER.error("Solr REST error: %s", response.text) + logger.error("Solr REST error: %s", response.text) response.raise_for_status() response = response.json() - logging.debug(f"Solr response: {json.dumps(response, indent=2)}") + time_solr_end = time.perf_counter_ns() + logger.debug(f"Solr response: {json.dumps(response, indent=2)}") # Associate highlighting information with search results. 
highlighting_response = response.get("highlighting", {}) @@ -511,6 +644,22 @@ async def lookup(string: str, clique_identifier_count=doc.get("clique_identifier_count", 0), types=[f"biolink:{d}" for d in doc.get("types", [])])) + time_end = time.perf_counter_ns() + time_taken_ms = (time_end - time_start)/1_000_000 + solr_ms = (time_solr_end - time_solr_start)/1_000_000 + query_log.append((time_start / 1_000_000_000, time_taken_ms)) + log_msg = ( + f"Lookup query to Solr for {json.dumps(string)} " + f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, " + f"biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, " + f"only_taxa={only_taxa}) " + f"took {time_taken_ms:.2f}ms (with {solr_ms:.2f}ms waiting for Solr)" + ) + if time_taken_ms > SLOW_QUERY_THRESHOLD_MS: + logger.warning("SLOW QUERY: " + log_msg) + else: + logger.info(log_msg) + return outputs ## BULK ENDPOINT @@ -579,6 +728,7 @@ class NameResQuery(BaseModel): tags=["lookup"] ) async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]: + time_start = time.perf_counter_ns() result = {} for string in query.strings: result[string] = await lookup( @@ -591,6 +741,9 @@ async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]: query.only_prefixes, query.exclude_prefixes, query.only_taxa) + time_end = time.perf_counter_ns() + logger.info(f"Bulk lookup query for {len(query.strings)} strings ({query}) took {(time_end - time_start)/1_000_000:.2f}ms") + return result @@ -614,7 +767,7 @@ async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]: # these supresses such warnings. 
class SolrClient:
    """HTTP client for Solr admin APIs with static parsing utilities.

    Async ``fetch_*`` methods make HTTP calls and return raw JSON (or None
    on failure). Static ``parse_*`` methods accept raw JSON dicts and can
    be unit-tested directly without a running Solr instance.
    """

    # Primary core name we monitor. Kept on the class (rather than as a
    # module global) so the parsers are self-contained and testable.
    _CORE_NAME = "name_lookup_shard1_replica_n1"
    _logger = logging.getLogger(__name__)

    def __init__(self, host: str, port: int):
        """Remember the Solr base URL, e.g. ``http://localhost:8983/solr``."""
        self._base = f"http://{host}:{port}/solr"

    # ------------------------------------------------------------------ #
    # Static parsers — pure functions, independently unit-testable       #
    # ------------------------------------------------------------------ #

    @staticmethod
    def parse_jvm(sysinfo_data: dict) -> dict:
        """Parse JVM heap stats from a /admin/info/system response.

        Returns ``heap_used_bytes``, ``heap_max_bytes`` and
        ``heap_used_pct`` (a 0.0-1.0 fraction); each is None when the
        corresponding data is absent.
        """
        jvm_raw = sysinfo_data.get("jvm", {}).get("memory", {}).get("raw", {})
        heap_used = jvm_raw.get("used")
        heap_max = jvm_raw.get("max")
        # BUG FIX: a legitimate heap_used of 0 is falsy, so the previous
        # `if heap_used and heap_max` wrongly reported heap_used_pct as
        # None. Test for None explicitly; heap_max must still be truthy
        # so we never divide by zero.
        if heap_used is not None and heap_max:
            heap_used_pct = round(heap_used / heap_max, 4)
        else:
            heap_used_pct = None
        return {
            "heap_used_bytes": heap_used,
            "heap_max_bytes": heap_max,
            "heap_used_pct": heap_used_pct,
        }

    @staticmethod
    def parse_os(sysinfo_data: dict) -> dict:
        """Parse OS / memory stats from a /admin/info/system response.

        ``physical_memory_used_pct`` is only computed when both memory
        figures are present and mutually consistent (0 <= free <= total,
        total > 0); otherwise it is None.
        """
        system = sysinfo_data.get("system", {})
        free_mem = system.get("freePhysicalMemorySize")
        total_mem = system.get("totalPhysicalMemorySize")
        if free_mem is not None and total_mem and total_mem > 0 and 0 <= free_mem <= total_mem:
            physical_memory_used_pct = round((total_mem - free_mem) / total_mem, 4)
        else:
            physical_memory_used_pct = None
        return {
            "process_cpu_load": system.get("processCpuLoad"),
            "system_cpu_load": system.get("systemCpuLoad"),
            "free_physical_memory_bytes": free_mem,
            "total_physical_memory_bytes": total_mem,
            "physical_memory_used_pct": physical_memory_used_pct,
        }

    @staticmethod
    def parse_cache(mbeans_data: dict, name: str) -> dict | None:
        """Parse stats for one named cache from a /admin/mbeans?cat=CACHE response.

        Solr stores stats under fully-qualified keys such as
        ``CACHE.searcher.<name>.hitratio``, not bare ``hitratio``.
        Returns None when the cache is absent from the response.
        """
        # solr-mbeans is an alternating [category-string, {entries...}, ...]
        # list, so skip the string elements.
        for entry in mbeans_data.get("solr-mbeans", []):
            if isinstance(entry, dict) and name in entry:
                stats = entry[name].get("stats", {})
                p = f"CACHE.searcher.{name}."
                return {
                    "hitratio": stats.get(f"{p}hitratio"),
                    "evictions": stats.get(f"{p}evictions"),
                    "size": stats.get(f"{p}size"),
                    "lookups": stats.get(f"{p}lookups"),
                    "hits": stats.get(f"{p}hits"),
                }
        return None

    @staticmethod
    def parse_index(cores_data: dict) -> dict | None:
        """Parse core index info from a /admin/cores?action=STATUS response.

        Returns None when the expected core is not present.
        """
        core = cores_data.get("status", {}).get(SolrClient._CORE_NAME)
        if core is None:
            return None
        index = core.get("index", {})
        return {
            "startTime": core.get("startTime"),
            "numDocs": index.get("numDocs", ""),
            "maxDoc": index.get("maxDoc", ""),
            "deletedDocs": index.get("deletedDocs", ""),
            "version": index.get("version", ""),
            "segmentCount": index.get("segmentCount", ""),
            "lastModified": index.get("lastModified", ""),
            "size": index.get("size", ""),
        }

    # ------------------------------------------------------------------ #
    # Async HTTP fetchers                                                #
    # ------------------------------------------------------------------ #

    async def fetch_sysinfo(self, client: httpx.AsyncClient) -> dict | None:
        """Fetch /admin/info/system JSON; None (with a warning) on any failure."""
        try:
            r = await client.get(f"{self._base}/admin/info/system", params={"wt": "json"})
            r.raise_for_status()
            return r.json()
        except Exception as e:
            self._logger.warning("Could not fetch Solr system info: %s", e)
            return None

    async def fetch_mbeans(self, client: httpx.AsyncClient) -> dict | None:
        """Fetch cache MBeans JSON; None (with a warning) on any failure."""
        try:
            r = await client.get(
                f"{self._base}/name_lookup/admin/mbeans",
                params={"cat": "CACHE", "stats": "true", "wt": "json"},
            )
            r.raise_for_status()
            return r.json()
        except Exception as e:
            self._logger.warning("Could not fetch Solr cache MBeans: %s", e)
            return None

    async def fetch_cores(self, client: httpx.AsyncClient) -> dict:
        """Fetch /admin/cores?action=STATUS JSON.

        Unlike the other fetchers this raises (httpx.HTTPStatusError) on
        failure: without the cores endpoint the status report is useless.
        """
        r = await client.get(f"{self._base}/admin/cores", params={"action": "STATUS"})
        if r.status_code >= 300:
            self._logger.error("Solr error on /solr/admin/cores?action=STATUS: %s", r.text)
            r.raise_for_status()
        return r.json()

    # ------------------------------------------------------------------ #
    # High-level: fetch everything and return a parsed snapshot          #
    # ------------------------------------------------------------------ #

    async def fetch_status(self, full: bool = False) -> dict:
        """Fetch and parse Solr monitoring data.

        When ``full=False`` (default), only the cores endpoint is called,
        returning basic index stats with ``jvm``, ``os``, and ``cache`` as
        ``None``. Pass ``full=True`` to also fetch JVM, OS, and cache metrics
        (three concurrent requests instead of one).

        Returns a dict with a ``found`` flag plus parsed fields. Callers
        should check ``result["found"]`` before accessing index-level keys.

        Raises ``httpx.HTTPStatusError`` if the cores endpoint is unavailable.
        """
        async with httpx.AsyncClient(timeout=None) as client:
            if full:
                cores_data, sysinfo_data, mbeans_data = await asyncio.gather(
                    self.fetch_cores(client),
                    self.fetch_sysinfo(client),
                    self.fetch_mbeans(client),
                )
            else:
                cores_data = await self.fetch_cores(client)
                sysinfo_data = mbeans_data = None

        jvm_info = self.parse_jvm(sysinfo_data) if sysinfo_data else None
        os_info = self.parse_os(sysinfo_data) if sysinfo_data else None
        cache_info = {
            "filterCache": self.parse_cache(mbeans_data, "filterCache"),
            "queryResultCache": self.parse_cache(mbeans_data, "queryResultCache"),
        } if mbeans_data else None

        index_info = self.parse_index(cores_data)
        if index_info is None:
            return {"found": False, "jvm": jvm_info, "os": os_info}

        return {
            "found": True,
            "startTime": index_info["startTime"],
            "numDocs": index_info["numDocs"],
            "maxDoc": index_info["maxDoc"],
            "deletedDocs": index_info["deletedDocs"],
            "version": index_info["version"],
            "segmentCount": index_info["segmentCount"],
            "lastModified": index_info["lastModified"],
            "size": index_info["size"],
            "jvm": jvm_info,
            "os": os_info,
            "cache": cache_info,
        }
a/data-loading/README.md b/data-loading/README.md index 376c9e9f..ab5ac060 100644 --- a/data-loading/README.md +++ b/data-loading/README.md @@ -58,7 +58,7 @@ automatically. This is a seven-step process: Note the double-quotes: setup-and-load-solr.sh requires a glob pattern as its first argument, not a list of files to process! 5. Generate a backup of the Solr instance. The first command will create a directory at - `solrdata/data/name_lookup_shard1_repical_n1/data/snapshot.backup` -- you can track its progress by comparing the + `solrdata/data/name_lookup_shard1_replica_n1/data/snapshot.backup` -- you can track its progress by comparing the number of files in that directory to the number of files in `../data/index` (as I write this, it has 513 files). ```shell diff --git a/documentation/Performance.md b/documentation/Performance.md new file mode 100644 index 00000000..7a52a1cc --- /dev/null +++ b/documentation/Performance.md @@ -0,0 +1,194 @@ +# NameRes Performance Diagnostics + +This document explains how to diagnose why Solr may be slow or under strain, using the +observability built into NameRes. It covers what the metrics in `/status` mean, how to +read the log messages, and a decision tree for identifying CPU pressure, memory pressure, +high query rate, and other causes. + +--- + +## 1. `/status` response fields + +The response has two main diagnostic sections: `recent_queries` (Python frontend metrics) +and `solr` (everything from the Solr database itself). + +### Frontend query metrics (`recent_queries`) + +These are tracked by the Python process and reflect the full round-trip time seen by callers. + +#### Latency + +| Field | What it means | +|---|---| +| `mean_time_ms` | Average round-trip time (Python → Solr → Python) over the entries in `query_log`. Rising mean = sustained slowdown. | +| `p50_ms` / `p95_ms` / `p99_ms` | Latency percentiles over the same window. p50 rising = every query is slow. 
p99 spiking but p50 stable = occasional GC pauses or one-off expensive queries. | + +#### Rate (`recent_queries.rate`) + +Query start timestamps and durations are stored together in a single `query_log` deque +(up to `QUERY_LOG_SIZE` entries, default 50,000). The large size ensures rate estimates +stay meaningful even at high query rates (e.g., 500 qps fills 1,000 entries in 2 seconds, +but 50,000 entries covers 100 seconds). + +| Field | What it means | +|---|---| +| `history_span_seconds` | Time from the oldest to newest entry in the log. Shows how much history backs the rate estimates. | +| `time_since_last_query_seconds` | Seconds since the most recent query. Large values mean the service is idle and windowed rates are stale. | +| `queries_last_10s` / `queries_per_second_last_10s` | 10-second window. Use this to catch the onset of a spike before the 60s average catches up. | +| `queries_last_60s` / `queries_per_second_last_60s` | 1-minute average rate. Use this for current load. | +| `queries_last_300s` / `queries_per_second_last_300s` | 5-minute average rate. Use this to smooth over short bursts. | +| `inter_arrival_ms.mean` | Average gap between consecutive queries in ms. Equals 1000 / mean_qps; cross-checks the windowed rates. | +| `inter_arrival_ms.median` | Median gap. More robust than mean under burst traffic. | +| `inter_arrival_ms.min` | Tightest burst observed — how closely packed the busiest queries were. | +| `inter_arrival_ms.max` | Longest idle gap in the log window. | +| `inter_arrival_ms.p95` | 95% of queries arrive within this gap. | + +The key diagnostic use: **if Solr is slow AND the query rate is high**, the cause is likely +load rather than an internal Solr problem. If the rate is normal but Solr is slow, look at +the `solr` fields below. + +### Solr database metrics (`solr`) + +All fields under `solr` come from Solr admin endpoints fetched in parallel when `/status` +is called. 
They are `null` if the relevant Solr endpoint is unreachable (a warning is logged). + +#### Index health + +| Field | What it means | +|---|---| +| `solr.segmentCount` | Number of Lucene segments. Above ~20 means Solr does more per-segment work per query. Consider triggering an optimize (`POST /solr/name_lookup/update?optimize=true`). | +| `solr.numDocs` / `solr.maxDoc` | `maxDoc - numDocs` = soft-deleted docs not yet merged. High deleted count + high segment count amplifies query cost. | +| `solr.size` | Index size on disk. Unexpectedly small may indicate an incomplete data load. | + +#### JVM and OS (`solr.jvm`, `solr.os`) + +Fetched from Solr's `/solr/admin/info/system` endpoint. + +| Field | What it means | +|---|---| +| `solr.jvm.heap_used_pct` | Fraction of JVM heap in use (0.0–1.0). **>0.80 = memory pressure.** | +| `solr.jvm.heap_used_bytes` / `solr.jvm.heap_max_bytes` | Absolute heap figures. Max is set by `-Xmx` in Solr's JVM config. | +| `solr.os.process_cpu_load` | Solr process CPU (0.0–1.0). **>0.80 = CPU saturation.** | +| `solr.os.system_cpu_load` | Host-wide CPU. If higher than process load, other processes are competing. | +| `solr.os.free_physical_memory_bytes` | OS RAM available. If low, the OS may be swapping. | + +#### Cache statistics (`solr.cache`) + +Fetched from Solr's MBeans endpoint. Reports `filterCache` and `queryResultCache`. + +| Field | What it means | +|---|---| +| `hitratio` | Fraction of cache lookups that were hits. Should be >0.90. Below 0.50 = Solr is re-computing filters on nearly every query. | +| `evictions` | Rising count = cache too small for the working set (a symptom of memory pressure). | +| `size` | Current number of entries in the cache. If evictions are rising, the working set is larger than the cache. | +| `lookups` | Total cache lookups. | +| `hits` | Cache lookups that were served from cache (rather than recomputed). | + +Cache sizes are configured in Solr's `solrconfig.xml`. 
If evictions are high, increase +`` for the affected cache — or investigate whether requests use many distinct +filter combinations that defeat caching. + +--- + +## 2. Log messages + +Every call to `lookup()` emits a line at INFO (or WARNING if slow): + +``` +INFO: Lookup query to Solr for "diabetes" (autocomplete=False, highlighting=False, offset=0, + limit=10, biolink_types=['biolink:Disease'], only_prefixes=None, exclude_prefixes=None, + only_taxa=None) took 123.45ms (with 100.12ms waiting for Solr) +``` + +``` +WARNING: SLOW QUERY: Lookup query to Solr for "..." ... took 850.12ms (with 840.00ms waiting for Solr) +``` + +Key interpretation: +- **"waiting for Solr" ≈ total** → the bottleneck is inside Solr (JVM, index, caches). +- **"waiting for Solr" is small, total is high** → the bottleneck is Python result processing + (large result sets being deserialized or filtered). +- A WARNING is emitted when total time exceeds `SLOW_QUERY_THRESHOLD_MS` (default 500ms). + Set `LOGLEVEL=DEBUG` to also log the full Solr request and response JSON. + +--- + +## 3. Diagnostic decision tree + +``` +Solr seems slow or the service is unresponsive +│ +├─ Step 1: Check recent_queries.rate in /status +│ │ +│ ├─ queries_per_second_last_60s is unusually high (e.g. 10x normal) +│ │ → HIGH QUERY RATE is driving the load +│ │ Check: are requests batching correctly? (use /reverse_lookup or /bulk_lookup) +│ │ Check: is a client in a retry loop? (look for repeated identical queries in logs) +│ │ Fix: rate-limit upstream callers; scale horizontally +│ │ +│ └─ Rate is normal → the problem is internal to Solr → continue +│ +├─ Step 2: Check log messages for "waiting for Solr" vs total time +│ │ +│ ├─ "waiting for Solr" is small, total is high +│ │ → Python result-processing bottleneck +│ │ Check: is limit very large? 
High result counts = expensive deserialization +│ │ Check: Python process CPU (scale horizontally if saturated) +│ │ +│ └─ "waiting for Solr" is most of total → problem is INSIDE Solr → continue +│ +├─ Step 3: Check solr.jvm.heap_used_pct in /status +│ │ +│ ├─ >0.80 → MEMORY PRESSURE +│ │ │ +│ │ ├─ Check solr.cache.filterCache.evictions +│ │ │ ├─ Rising evictions → cache is too small for the working set +│ │ │ │ Fix: increase in solrconfig.xml for filterCache +│ │ │ └─ Evictions low but heap still high → data or fieldCache is large +│ │ │ Fix: increase JVM -Xmx (SOLR_JAVA_MEM in Solr's environment config) +│ │ │ or add more RAM to the host +│ │ │ +│ │ └─ Heap high AND evictions low → not a cache-size problem +│ │ Consider: Solr fieldCache warming on startup; large stored fields +│ │ +│ └─ <0.50 → NOT a memory issue → continue +│ +├─ Step 4: Check solr.os.process_cpu_load in /status +│ │ +│ ├─ >0.80 → CPU SATURATION +│ │ │ +│ │ ├─ Check solr.segmentCount in /status +│ │ │ ├─ >20 → run Solr optimize to merge segments +│ │ │ │ POST http://solr-host:8983/solr/name_lookup/update?optimize=true +│ │ │ └─ Low segmentCount → CPU is busy with query evaluation itself +│ │ │ +│ │ ├─ Check SLOW QUERY WARNINGs in logs +│ │ │ Are specific queries (many filters, wildcard-heavy) driving the load? +│ │ │ Fix: cache common filter combinations; avoid leading wildcards +│ │ │ +│ │ └─ Even load across all queries → scale horizontally (add Solr replicas) +│ │ +│ └─ Low CPU and low memory but slow queries → likely JVM GC pauses → continue +│ +└─ Step 5: Check p99 vs p50 in recent_queries + │ + ├─ p99 >> p50 (e.g. p50=50ms, p99=5000ms) → GC pause signature + │ Fix: tune JVM GC settings in Solr's JVM config: + │ -XX:+UseG1GC -XX:MaxGCPauseMillis=200 + │ Check Solr GC logs (solr-gc.log) for Full GC frequency and duration + │ + └─ p50 and p99 both high → sustained overload at all percentiles + → All of the above apply; start with memory (Step 3) +``` + +--- + +## 4. 
Environment variables + +| Variable | Default | Effect | +|---|---|---| +| `SOLR_HOST` | `localhost` | Solr hostname | +| `SOLR_PORT` | `8983` | Solr port | +| `QUERY_LOG_SIZE` | `50000` | How many `(timestamp, duration)` pairs to retain. Covers both latency percentiles and rate estimation. At 500 qps this covers ~100 seconds; lower to reduce memory on low-traffic instances. | +| `SLOW_QUERY_THRESHOLD_MS` | `500` | Queries slower than this are logged at WARNING level | +| `LOGLEVEL` | `INFO` | Set to `DEBUG` to log full Solr request/response JSON for every query | diff --git a/tests/test_solr.py b/tests/test_solr.py new file mode 100644 index 00000000..c9fe923f --- /dev/null +++ b/tests/test_solr.py @@ -0,0 +1,189 @@ +"""Unit tests for SolrClient static parsing methods. + +These tests exercise the pure parsing logic without any HTTP calls or a +running Solr instance. The mock payloads mirror the actual structure +returned by Solr's admin endpoints. +""" +from api.solr import SolrClient + + +# ------------------------------------------------------------------ # +# parse_cache # +# ------------------------------------------------------------------ # + +# Minimal mbeans payload matching real Solr structure: +# solr-mbeans is an alternating [category-string, {entries...}, ...] list. 
+_MBEANS = { + "solr-mbeans": [ + "CACHE", + { + "filterCache": { + "class": "org.apache.solr.search.CaffeineCache", + "description": "Caffeine Cache(maxSize=512, initialSize=512)", + "stats": { + "CACHE.searcher.filterCache.hitratio": 0.9, + "CACHE.searcher.filterCache.evictions": 10, + "CACHE.searcher.filterCache.size": 500, + "CACHE.searcher.filterCache.lookups": 1000, + "CACHE.searcher.filterCache.hits": 900, + }, + }, + "queryResultCache": { + "class": "org.apache.solr.search.CaffeineCache", + "description": "Caffeine Cache(maxSize=512, initialSize=512)", + "stats": { + "CACHE.searcher.queryResultCache.hitratio": 0.5756, + "CACHE.searcher.queryResultCache.evictions": 2310, + "CACHE.searcher.queryResultCache.size": 512, + "CACHE.searcher.queryResultCache.lookups": 6649, + "CACHE.searcher.queryResultCache.hits": 3827, + }, + }, + }, + ] +} + + +def test_parse_cache_queryResultCache(): + result = SolrClient.parse_cache(_MBEANS, "queryResultCache") + assert result == { + "hitratio": 0.5756, + "evictions": 2310, + "size": 512, + "lookups": 6649, + "hits": 3827, + } + + +def test_parse_cache_filterCache(): + result = SolrClient.parse_cache(_MBEANS, "filterCache") + assert result == { + "hitratio": 0.9, + "evictions": 10, + "size": 500, + "lookups": 1000, + "hits": 900, + } + + +def test_parse_cache_missing_name(): + assert SolrClient.parse_cache(_MBEANS, "perSegFilter") is None + + +def test_parse_cache_empty_mbeans(): + assert SolrClient.parse_cache({}, "queryResultCache") is None + + +def test_parse_cache_empty_stats(): + """Cache present but no stats entries — all values None, not an error.""" + data = {"solr-mbeans": ["CACHE", {"queryResultCache": {"stats": {}}}]} + result = SolrClient.parse_cache(data, "queryResultCache") + assert result == { + "hitratio": None, + "evictions": None, + "size": None, + "lookups": None, + "hits": None, + } + + +# ------------------------------------------------------------------ # +# parse_jvm # +# 
# ------------------------------------------------------------------ #

def test_parse_jvm():
    payload = {"jvm": {"memory": {"raw": {"used": 500_000_000, "max": 1_000_000_000}}}}
    parsed = SolrClient.parse_jvm(payload)
    # Heap figures pass straight through; the pct is used/max.
    assert parsed["heap_used_bytes"] == 500_000_000
    assert parsed["heap_max_bytes"] == 1_000_000_000
    assert parsed["heap_used_pct"] == 0.5


def test_parse_jvm_missing_data():
    """An empty payload degrades to all-None heap fields rather than raising."""
    parsed = SolrClient.parse_jvm({})
    for field in ("heap_used_bytes", "heap_max_bytes", "heap_used_pct"):
        assert parsed[field] is None


# ------------------------------------------------------------------ #
# parse_os                                                           #
# ------------------------------------------------------------------ #

def test_parse_os():
    payload = {
        "system": {
            "processCpuLoad": 0.25,
            "systemCpuLoad": 0.40,
            "freePhysicalMemorySize": 2_000_000_000,
            "totalPhysicalMemorySize": 8_000_000_000,
        }
    }
    parsed = SolrClient.parse_os(payload)
    expected = {
        "process_cpu_load": 0.25,
        "system_cpu_load": 0.40,
        "free_physical_memory_bytes": 2_000_000_000,
        "total_physical_memory_bytes": 8_000_000_000,
        # (8 GB total - 2 GB free) / 8 GB total
        "physical_memory_used_pct": 0.75,
    }
    assert {key: parsed[key] for key in expected} == expected


def test_parse_os_invalid_memory_free_greater_than_total():
    # Free memory exceeding total is nonsensical, so no pct is computed.
    payload = {
        "system": {
            "freePhysicalMemorySize": 9_000_000_000,
            "totalPhysicalMemorySize": 8_000_000_000,
        }
    }
    assert SolrClient.parse_os(payload)["physical_memory_used_pct"] is None


def test_parse_os_zero_total():
    # Guard against division by zero when total memory is reported as 0.
    payload = {"system": {"freePhysicalMemorySize": 0, "totalPhysicalMemorySize": 0}}
    assert SolrClient.parse_os(payload)["physical_memory_used_pct"] is None


def test_parse_os_missing_data():
    parsed = SolrClient.parse_os({})
    assert parsed["process_cpu_load"] is None
    assert parsed["physical_memory_used_pct"] is None


# ------------------------------------------------------------------ #
# parse_index                                                        #
# ------------------------------------------------------------------ #

# Mock CoreAdmin STATUS payload: one shard replica core with index stats.
_CORES = {
    "status": {
        "name_lookup_shard1_replica_n1": {
            "startTime": "2025-12-19T21:20:02.292Z",
            "index": {
                "numDocs": 89,
                "maxDoc": 89,
                "deletedDocs": 0,
                "version": 42,
                "segmentCount": 3,
                "lastModified": "2025-12-19T21:21:00.000Z",
                "size": "10 MB",
            },
        }
    }
}


def test_parse_index():
    parsed = SolrClient.parse_index(_CORES)
    expected = {
        "startTime": "2025-12-19T21:20:02.292Z",
        "numDocs": 89,
        "maxDoc": 89,
        "deletedDocs": 0,
        "size": "10 MB",
    }
    assert {key: parsed[key] for key in expected} == expected


def test_parse_index_core_not_found():
    # An empty status map means there is no core to report on.
    assert SolrClient.parse_index({"status": {}}) is None


def test_parse_index_empty():
    assert SolrClient.parse_index({}) is None


# ================================================================== #
# tests/test_status.py — /status endpoint tests against the test app #
# ================================================================== #
import logging

from api.server import app
from fastapi.testclient import TestClient

# Turn on debugging for tests.
logging.basicConfig(level=logging.DEBUG)


def test_status():
    response = TestClient(app).get("/status")
    status = response.json()

    assert status['status'] == 'ok'
    assert status['message'] != ''
    for field in ('babel_version', 'babel_version_url', 'biolink_model', 'nameres_version'):
        assert field in status
    assert 'tag' in status['biolink_model']

    solr = status['solr']
    assert solr['version'] != ''
    assert solr['size'] != ''
    assert solr['startTime']

    # We load a specific, known number of test documents into this index.
    assert solr['numDocs'] == 88
    assert solr['maxDoc'] == 88
    assert solr['deletedDocs'] == 0


def test_status_default_excludes_metrics():
    """Default /status omits the expensive metrics — only one Solr call is made."""
    solr = TestClient(app).get("/status").json()['solr']
    assert solr['numDocs'] is not None
    for metric in ('jvm', 'os', 'cache'):
        assert solr[metric] is None


def test_status_full_includes_metrics():
    """?full=true additionally fetches JVM, OS, and cache metrics."""
    solr = TestClient(app).get("/status?full=true").json()['solr']
    assert solr['jvm'] is not None
    assert solr['jvm']['heap_used_bytes'] is not None
    assert solr['os'] is not None
    # assert solr['cache'] is not None -- TODO: figure out why this doesn't work on our little Docker image.