Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 89 additions & 10 deletions api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import time
import os
import re
from collections import deque
from enum import Enum
from typing import Dict, List, Union, Annotated, Optional

Expand All @@ -35,6 +36,15 @@
allow_headers=["*"],
)

# Solr core name for this application.
SOLR_CORE_NAME = 'name_lookup_shard1_replica_n1'

# We track the time taken for each Solr query for the last 1000 queries so we can track performance via /status.
DEFAULT_RECENT_TIMES_COUNT = 1000
RECENT_TIMES_COUNT = int(os.getenv("RECENT_TIMES_COUNT", DEFAULT_RECENT_TIMES_COUNT))
recent_query_times = deque(maxlen=RECENT_TIMES_COUNT)
recent_solr_times = deque(maxlen=RECENT_TIMES_COUNT)

# ENDPOINT /
# If someone tries accessing /, we should redirect them to the Swagger interface.
@app.get("/", include_in_schema=False)
Expand All @@ -50,21 +60,74 @@ async def docs_redirect():
description="<p>This endpoint will return status information and a list of counts from the underlying Solr database instance for this NameRes instance.</p>"
"<p>You can find out more about this endpoint in the <a href=\"https://github.com/NCATSTranslator/NameResolution/blob/master/documentation/API.md#status\">API documentation</a>.</p>"
)
async def status_get() -> Dict:
async def status_get(metrics: bool = False) -> Dict:
""" Return status and count information from the underyling Solr instance. """
return await status()
return await status(metrics)


async def status() -> Dict:
async def status(include_metrics: bool = False) -> Dict:
""" Return a dictionary containing status and count information for the underlying Solr instance. """
query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/cores"
metrics_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/metrics"
async with httpx.AsyncClient(timeout=None) as client:
response = await client.get(query_url, params={
'action': 'STATUS'
})
if response.status_code >= 300:
logger.error("Solr error on accessing /solr/admin/cores?action=STATUS: %s", response.text)
response.raise_for_status()
if response.status_code >= 300:
logger.error("Solr error on accessing /solr/admin/cores?action=STATUS: %s", response.text)
response.raise_for_status()

# Fetch Solr query handler, cache, and JVM metrics for strain detection.
# A single call with group=core&group=jvm retrieves both in one round-trip.
# Only performed when the caller passes ?metrics=true, as it adds latency.
solr_metrics = {
"message": "Use /status?metrics=true to retrieve these metrics."
}
if include_metrics:
try:
metrics_resp = await client.get(metrics_url, params=[
('group', 'core'),
('group', 'jvm'),
('prefix', 'QUERY./select'),
('prefix', 'CACHE.core.queryResultCache'),
('prefix', 'memory.heap'),
('prefix', 'os.processCpuLoad'),
('wt', 'json'),
])
if metrics_resp.status_code < 300:
all_metrics = metrics_resp.json().get('metrics', {})

core_data = all_metrics.get(f'solr.core.{SOLR_CORE_NAME}', {})
qh = core_data.get('QUERY./select.requestTimes', {})
cache = core_data.get('CACHE.core.queryResultCache', {})
heap = all_metrics.get('solr.jvm', {}).get('memory.heap', {})
cpu = all_metrics.get('solr.jvm', {}).get('os.processCpuLoad', None)

solr_metrics = {
'query_handler': {
'requests': core_data.get('QUERY./select.requests'),
'errors': core_data.get('QUERY./select.errors'),
'timeouts': core_data.get('QUERY./select.timeouts'),
'mean_ms': qh.get('mean_ms'),
'p75_ms': qh.get('p75_ms'),
'p95_ms': qh.get('p95_ms'),
'p99_ms': qh.get('p99_ms'),
},
'cache': {
'hitratio': cache.get('hitratio'),
'evictions': cache.get('evictions'),
'size': cache.get('size'),
},
'jvm': {
'heap_used_mb': round(heap.get('used', 0) / 1_048_576, 1) if 'used' in heap else None,
'heap_max_mb': round(heap.get('max', 0) / 1_048_576, 1) if 'max' in heap else None,
'heap_used_pct': round(heap.get('used', 0) / heap['max'] * 100, 1) if heap.get('max') else None,
'cpu_load': cpu,
},
}
except Exception:
logger.warning("Failed to retrieve Solr metrics for /status", exc_info=True)

result = response.json()

# Do we know the Babel version and version URL? It will be stored in an environmental variable if we do.
Expand All @@ -83,9 +146,17 @@ async def status() -> Dict:
if 'version' in app_info and app_info['version']:
nameres_version = 'v' + app_info['version']

# We should have a status for name_lookup_shard1_replica_n1.
if 'status' in result and 'name_lookup_shard1_replica_n1' in result['status']:
core = result['status']['name_lookup_shard1_replica_n1']
# Prepare recent times for reporting.
recent_queries = {
'max': RECENT_TIMES_COUNT,
'count': len(recent_query_times),
'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else None,
'mean_solr_time_ms': sum(recent_solr_times) / len(recent_solr_times) if recent_solr_times else None,
}

# We should have a status for SOLR_CORE_NAME.
if 'status' in result and SOLR_CORE_NAME in result['status']:
core = result['status'][SOLR_CORE_NAME]

index = {}
if 'index' in core:
Expand All @@ -110,6 +181,8 @@ async def status() -> Dict:
'segmentCount': index.get('segmentCount', ''),
'lastModified': index.get('lastModified', ''),
'size': index.get('size', ''),
'recent_queries': recent_queries,
'solr_metrics': solr_metrics,
}
else:
return {
Expand All @@ -122,7 +195,9 @@ async def status() -> Dict:
'url': biolink_model_url,
'download_url': biolink_model_download_url,
},
'recent_queries': recent_queries,
'nameres_version': nameres_version,
'solr_metrics': solr_metrics,
}


Expand Down Expand Up @@ -605,9 +680,13 @@ async def lookup(string: str,
debug=debug_for_this_request))

time_end = time.time_ns()
time_taken_ms = (time_end - time_start)/1_000_000
time_taken_ms_solr = (time_solr_end - time_solr_start)/1_000_000
recent_query_times.append(time_taken_ms)
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment and variable name suggest you're tracking “Solr query time”, but the value appended is time_taken_ms (overall request time including non-Solr processing). This makes /status's recent_queries.mean_time_ms ambiguous/misleading. Either append time_taken_ms_solr (if you want Solr time) or rename the variables/keys to reflect total request timing (or expose both metrics separately).

Suggested change
recent_query_times.append(time_taken_ms)
recent_query_times.append(time_taken_ms_solr)

Copilot uses AI. Check for mistakes.
recent_solr_times.append(time_taken_ms_solr)
logger.info(f"Lookup query to Solr for {json.dumps(string)} " +
f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, only_taxa={only_taxa}): "
f"took {(time_end - time_start)/1_000_000:.2f}ms (with {(time_solr_end - time_solr_start)/1_000_000:.2f}ms waiting for Solr)"
f"took {time_taken_ms:.2f}ms (with {time_taken_ms_solr:.2f}ms waiting for Solr)"
)

return outputs
Expand Down
33 changes: 32 additions & 1 deletion documentation/API.md
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,37 @@ Solr database.
"version": 34838,
"segmentCount": 57,
"lastModified": "2025-09-24T19:09:56.524Z",
"size": "142.17 GB"
"size": "142.17 GB",
"recent_queries": {
"count": 1000,
"mean_time_ms": 42.3,
"mean_solr_time_ms": 38.1
},
"solr_metrics": {
"query_handler": {
"requests": 9842301,
"errors": 0,
"timeouts": 0,
"mean_ms": 41.2,
"p75_ms": 55.0,
"p95_ms": 120.3,
"p99_ms": 340.7
},
"cache": {
"hitratio": 0.91,
"evictions": 1240,
"size": 512
},
"jvm": {
"heap_used_mb": 4096.0,
"heap_max_mb": 8192.0,
"heap_used_pct": 50.0,
"cpu_load": 0.12
}
}
}
```

`recent_queries` tracks the last 1000 `/lookup` queries handled by this NameRes instance (configurable via the `RECENT_TIMES_COUNT` environment variable). `mean_time_ms` is the total end-to-end time; `mean_solr_time_ms` isolates the time spent waiting for Solr, which helps distinguish Solr-side strain from NameRes processing overhead. `max` reports the configured window size and `count` the number of queries recorded so far; `mean_time_ms` and `mean_solr_time_ms` are `null` if no queries have been handled since startup.

`solr_metrics` is only populated when the `?metrics=true` query parameter is passed, as fetching it requires an additional round-trip to Solr. It is populated directly from Solr's `/admin/metrics` API and provides native Solr health indicators: cumulative query handler statistics (useful for detecting errors or timeouts), queryResultCache hit ratio (a low ratio indicates memory pressure or cache thrashing), and JVM heap/CPU metrics. If the `metrics` parameter is omitted, or the Solr metrics API is unavailable, the field instead contains only a `message` placeholder explaining how to request the metrics.
3 changes: 2 additions & 1 deletion tests/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,4 +259,5 @@ def test_only_taxa_queries():
})
results_ftd_disease_with_only_taxon = response.json()
assert len(results_ftd_disease_with_only_taxon) == 1
assert results_ftd_disease_with_only_taxon[0]['curie'] == 'MONDO:0010857'
assert results_ftd_disease_with_only_taxon[0]['curie'] == 'MONDO:0010857'

50 changes: 50 additions & 0 deletions tests/test_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,53 @@ def test_status():
assert status['maxDoc'] == 89
assert status['deletedDocs'] == 0


def test_status_shape():
    """Verify /status returns expected fields, including recent_queries; by default
    solr_metrics is present but contains only a placeholder message."""
    client = TestClient(app)
    response = client.get("/status")
    assert response.status_code == 200
    data = response.json()

    assert data['status'] == 'ok'
    assert 'numDocs' in data

    # recent_queries should always be present; 'count' is an int (0 at startup)
    # and the two mean fields are None until at least one query has been recorded.
    rq = data['recent_queries']
    assert 'count' in rq
    assert 'mean_time_ms' in rq
    assert 'mean_solr_time_ms' in rq

    # Without ?metrics=true, solr_metrics is a placeholder dict whose only
    # content is a 'message' telling the caller how to request real metrics.
    assert 'solr_metrics' in data and 'message' in data['solr_metrics']


def test_status_metrics_param():
    """With ?metrics=true, solr_metrics is included and has the expected structure."""
    client = TestClient(app)
    response = client.get("/status", params={'metrics': 'true'})
    assert response.status_code == 200
    data = response.json()

    assert 'solr_metrics' in data
    # If Solr's metrics API is unavailable, solr_metrics falls back to a
    # placeholder dict containing only a 'message' key; when real metrics were
    # fetched, the response must contain the expected structure.
    if 'message' not in data['solr_metrics']:
        sm = data['solr_metrics']
        assert 'query_handler' in sm
        assert 'cache' in sm
        assert 'jvm' in sm
        assert 'requests' in sm['query_handler']
        assert 'hitratio' in sm['cache']
        assert 'heap_used_pct' in sm['jvm']


def test_status_recent_queries_populated():
    """A /lookup call should leave at least one recorded time in recent_queries."""
    test_client = TestClient(app)

    # Issue one lookup so the server's timing deques have something recorded.
    test_client.get("/lookup", params={'string': 'alzheimer'})

    status_body = test_client.get("/status").json()
    recent = status_body['recent_queries']

    assert recent['count'] >= 1
    assert recent['mean_time_ms'] is not None
    assert recent['mean_solr_time_ms'] is not None
Loading