Skip to content

Add Prometheus alerts test #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 3, 2025
42 changes: 42 additions & 0 deletions stackhpc_cloud_tests/monitoring/test_prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,45 @@ def test_prometheus_node_exporter_metrics(prom):
"""Check that expected node exporter metrics exist."""
metrics = prom.all_metrics()
assert "node_cpu_seconds_total" in metrics


def test_prometheus_alerts_inactive(prom):
    """Check that no unexpected Prometheus alerts are active.

    Requests the alerts endpoint of the Prometheus HTTP API through the
    session held by ``prom`` and fails if any returned alert is not covered
    by the list of alerts tolerated in an all-in-one (AIO) environment.

    :param prom: Prometheus client fixture. This test reads its ``url``,
        ``headers``, ``auth`` and ``_session`` attributes.
    :raises AssertionError: if the request is not OK, the API payload does
        not report success, or an alert outside the ignore list is present.
    """
    # https://prometheus.io/docs/prometheus/latest/querying/api/#alerts
    response = prom._session.get(
        "{0}/api/v1/alerts".format(prom.url),
        verify=prom._session.verify,
        headers=prom.headers,
        auth=prom.auth,
        cert=prom._session.cert,
    )
    assert response.ok, "Prometheus alerts request failed: {0!r}".format(response)

    payload = response.json()
    assert "status" in payload, "No 'status' in alerts response: {0!r}".format(payload)
    assert payload["status"] == "success", (
        "Prometheus alerts API reported status {0!r}".format(payload["status"])
    )
    assert "data" in payload, "No 'data' in alerts response: {0!r}".format(payload)
    # Treat a null/empty "alerts" value as "no alerts".
    active_alerts = payload["data"]["alerts"] or []

    # (MaxN) Allow for, and filter out, alerts we'd expect to see in an AIO environment.
    # TODO - find a way of configuring this for SCT running in other environments.
    aio_alerts_to_ignore = [
        # We know our volumes are small.
        {"alertname": "StorageFillingUp", "instance": "controller0"},
        # This is probably due to storage space...
        {"alertname": "ElasticsearchClusterYellow", "instance": "controller0"},
        # ...or because we're running in a single instance and it wants to be
        # clustered across multiple nodes.
        {"alertname": "ElasticsearchUnassignedShards", "instance": "controller0"},
        # It's a small AIO!
        {"alertname": "LowMemory", "instance": "controller0"},
        # It's only one node and expects three, see
        # https://github.com/stackhpc/stackhpc-kayobe-config/pull/1579
        {"alertname": "RabbitMQNodeDown"},
        # This is probably because Tempest runs before pytest so the container
        # has been recently stopped.
        {"alertname": "ContainerKilled", "name": "tempest"},
    ]

    def alert_is_ignored(labels, alerts_to_ignore):
        # An ignore entry matches when every one of its key/value pairs is
        # also present in the alert's labels (subset test on dict item views).
        return any(ignored.items() <= labels.items() for ignored in alerts_to_ignore)

    unexpected = [
        alert
        for alert in active_alerts
        if not alert_is_ignored(alert["labels"], aio_alerts_to_ignore)
    ]
    # Name the offending alerts so a failure is actionable straight from the log.
    assert not unexpected, "Unexpected active Prometheus alerts: {0}".format(
        [alert["labels"] for alert in unexpected]
    )