Skip to content

Commit 00eabad

Browse files
Merge branch 'development' into alex_sku
2 parents 92b4344 + 887656e commit 00eabad

6 files changed

Lines changed: 1215 additions & 982 deletions

File tree

nodescraper/plugins/inband/rdma/rdma_analyzer.py

Lines changed: 33 additions & 123 deletions
Original file line number | Diff line number | Diff line change
@@ -37,106 +37,12 @@ class RdmaAnalyzer(DataAnalyzer[RdmaDataModel, None]):
3737

3838
DATA_MODEL = RdmaDataModel
3939

40-
# Error fields checked from rdma statistic output (bnxt_re, mlx5, ionic, etc.)
41-
ERROR_FIELDS = [
42-
"recoverable_errors",
43-
"tx_roce_errors",
44-
"tx_roce_discards",
45-
"rx_roce_errors",
46-
"rx_roce_discards",
47-
"local_ack_timeout_err",
48-
"packet_seq_err",
49-
"max_retry_exceeded",
50-
"rnr_nak_retry_err",
51-
"implied_nak_seq_err",
52-
"unrecoverable_err",
53-
"bad_resp_err",
54-
"local_qp_op_err",
55-
"local_protection_err",
56-
"mem_mgmt_op_err",
57-
"req_remote_invalid_request",
58-
"req_remote_access_errors",
59-
"remote_op_err",
60-
"duplicate_request",
61-
"res_exceed_max",
62-
"resp_local_length_error",
63-
"res_exceeds_wqe",
64-
"res_opcode_err",
65-
"res_rx_invalid_rkey",
66-
"res_rx_domain_err",
67-
"res_rx_no_perm",
68-
"res_rx_range_err",
69-
"res_tx_invalid_rkey",
70-
"res_tx_domain_err",
71-
"res_tx_no_perm",
72-
"res_tx_range_err",
73-
"res_irrq_oflow",
74-
"res_unsup_opcode",
75-
"res_unaligned_atomic",
76-
"res_rem_inv_err",
77-
"res_mem_err",
78-
"res_srq_err",
79-
"res_cmp_err",
80-
"res_invalid_dup_rkey",
81-
"res_wqe_format_err",
82-
"res_cq_load_err",
83-
"res_srq_load_err",
84-
"res_tx_pci_err",
85-
"res_rx_pci_err",
86-
"out_of_buffer",
87-
"out_of_sequence",
88-
"req_cqe_error",
89-
"req_cqe_flush_error",
90-
"resp_cqe_error",
91-
"resp_cqe_flush_error",
92-
"resp_remote_access_errors",
93-
"req_rx_pkt_seq_err",
94-
"req_rx_rnr_retry_err",
95-
"req_rx_rmt_acc_err",
96-
"req_rx_rmt_req_err",
97-
"req_rx_oper_err",
98-
"req_rx_impl_nak_seq_err",
99-
"req_rx_cqe_err",
100-
"req_rx_cqe_flush",
101-
"req_rx_dup_response",
102-
"req_rx_inval_pkts",
103-
"req_tx_loc_acc_err",
104-
"req_tx_loc_oper_err",
105-
"req_tx_mem_mgmt_err",
106-
"req_tx_retry_excd_err",
107-
"req_tx_loc_sgl_inv_err",
108-
"resp_rx_dup_request",
109-
"resp_rx_outof_buf",
110-
"resp_rx_outouf_seq",
111-
"resp_rx_cqe_err",
112-
"resp_rx_cqe_flush",
113-
"resp_rx_loc_len_err",
114-
"resp_rx_inval_request",
115-
"resp_rx_loc_oper_err",
116-
"resp_rx_outof_atomic",
117-
"resp_tx_pkt_seq_err",
118-
"resp_tx_rmt_inval_req_err",
119-
"resp_tx_rmt_acc_err",
120-
"resp_tx_rmt_oper_err",
121-
"resp_tx_rnr_retry_err",
122-
"resp_tx_loc_sgl_inv_err",
123-
"resp_rx_s0_table_err",
124-
"resp_rx_ccl_cts_outouf_seq",
125-
"tx_rdma_ack_timeout",
126-
"tx_rdma_ccl_cts_ack_timeout",
127-
"rx_rdma_mtu_discard_pkts",
128-
]
129-
130-
CRITICAL_ERROR_FIELDS = [
131-
"unrecoverable_err",
132-
"res_tx_pci_err",
133-
"res_rx_pci_err",
134-
"res_mem_err",
135-
]
136-
13740
def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> TaskResult:
13841
"""Analyze RDMA statistics for non-zero error counters.
13942
43+
Error and critical counter names come from each vendor's statistics model
44+
(ionic / bnxt / mlx prefixes).
45+
14046
Args:
14147
data: RDMA data model with statistic_list (and optionally link_list).
14248
args: Unused (analyzer has no configurable args).
@@ -150,32 +56,36 @@ def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> Task
15056
return self.result
15157

15258
error_state = False
153-
for idx, stat in enumerate(data.statistic_list):
154-
errors_on_interface = [] # (error_field, value, is_critical)
155-
for error_field in self.ERROR_FIELDS:
156-
value = getattr(stat, error_field, None)
157-
if value is not None and value > 0:
158-
is_critical = error_field in self.CRITICAL_ERROR_FIELDS
159-
errors_on_interface.append((error_field, value, is_critical))
160-
if errors_on_interface:
161-
error_state = True
162-
interface_label = stat.ifname or "unknown"
163-
error_names = [e[0] for e in errors_on_interface]
164-
any_critical = any(e[2] for e in errors_on_interface)
165-
priority = EventPriority.CRITICAL if any_critical else EventPriority.ERROR
166-
errors_data = {field: value for field, value, _ in errors_on_interface}
167-
self._log_event(
168-
category=EventCategory.IO,
169-
description=f"RDMA error detected on {interface_label}: [{', '.join(error_names)}]",
170-
data={
171-
"interface": stat.ifname,
172-
"port": stat.port,
173-
"errors": errors_data,
174-
"statistic_index": idx,
175-
},
176-
priority=priority,
177-
console_log=True,
178-
)
59+
60+
for stat in data.statistic_list:
61+
if stat.vendor_statistics is None:
62+
continue
63+
64+
error_fields = stat.vendor_statistics.error_fields
65+
critical_fields = stat.vendor_statistics.critial_error_fields
66+
67+
for error_field in error_fields + critical_fields:
68+
error_value = getattr(stat.vendor_statistics, error_field, None)
69+
70+
if error_value is not None and error_value > 0:
71+
priority = (
72+
EventPriority.CRITICAL
73+
if error_field in critical_fields
74+
else EventPriority.ERROR
75+
)
76+
self._log_event(
77+
category=EventCategory.NETWORK,
78+
description=f"RDMA error detected: {error_field}",
79+
data={
80+
"interface": stat.ifname,
81+
"port": stat.port,
82+
"error_field": error_field,
83+
"error_count": error_value,
84+
},
85+
priority=priority,
86+
console_log=True,
87+
)
88+
error_state = True
17989

18090
if error_state:
18191
self.result.message = "RDMA errors detected in statistics"

nodescraper/plugins/inband/rdma/rdma_collector.py

Lines changed: 65 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,15 @@
3434
from nodescraper.models import TaskResult
3535
from nodescraper.utils import get_exception_traceback
3636

37-
from .rdmadata import RdmaDataModel, RdmaDevice, RdmaLink, RdmaLinkText, RdmaStatistics
37+
from .rdmadata import (
38+
VENDOR_PREFIX_MAP,
39+
RdmaDataModel,
40+
RdmaDevice,
41+
RdmaLink,
42+
RdmaLinkText,
43+
RdmaStatistics,
44+
RdmaVendorStatistics,
45+
)
3846

3947

4048
class RdmaCollector(InBandDataCollector[RdmaDataModel, None]):
@@ -172,7 +180,11 @@ def _parse_rdma_link_text(self, output: str) -> list[RdmaLinkText]:
172180
return links
173181

174182
def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]:
175-
"""Get RDMA statistics from 'rdma statistic -j'."""
183+
"""Get RDMA statistics from 'rdma statistic -j'.
184+
185+
Warns on unexpected or missing fields relative to the vendor-specific model
186+
for the interface prefix (ionic / bnxt / mlx).
187+
"""
176188
stat_data = self._run_rdma_command(self.CMD_STATISTIC)
177189
if stat_data is None:
178190
return None
@@ -190,15 +202,64 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]:
190202
priority=EventPriority.WARNING,
191203
)
192204
continue
193-
statistics.append(RdmaStatistics(**stat))
205+
206+
ifname = stat.get("ifname", "")
207+
vendor_stats: Optional[RdmaVendorStatistics] = None
208+
for prefix, vendor_cls in VENDOR_PREFIX_MAP.items():
209+
if ifname.startswith(prefix):
210+
vendor_fields = set(vendor_cls.model_fields.keys())
211+
stat_fields = set(stat.keys()) - {"ifname", "port"}
212+
213+
extra_fields = stat_fields - vendor_fields
214+
if extra_fields:
215+
self._log_event(
216+
category=EventCategory.NETWORK,
217+
description=f"Unexpected fields in RDMA statistic for {ifname}",
218+
data={
219+
"interface": ifname,
220+
"extra_fields": sorted(extra_fields),
221+
},
222+
priority=EventPriority.WARNING,
223+
)
224+
225+
missing_fields = vendor_fields - stat_fields
226+
if missing_fields:
227+
self._log_event(
228+
category=EventCategory.NETWORK,
229+
description=f"Missing fields in RDMA statistic for {ifname}",
230+
data={
231+
"interface": ifname,
232+
"missing_fields": sorted(missing_fields),
233+
},
234+
priority=EventPriority.WARNING,
235+
)
236+
237+
try:
238+
vendor_stats = vendor_cls(**stat)
239+
except ValidationError as ve:
240+
self._log_event(
241+
category=EventCategory.NETWORK,
242+
description=f"Failed to build vendor model for {ifname}",
243+
data={"exception": get_exception_traceback(ve)},
244+
priority=EventPriority.WARNING,
245+
)
246+
break
247+
248+
rdma_stat = RdmaStatistics(
249+
ifname=stat.get("ifname"),
250+
port=stat.get("port"),
251+
vendor_statistics=vendor_stats,
252+
)
253+
statistics.append(rdma_stat)
254+
return statistics
194255
except ValidationError as e:
195256
self._log_event(
196257
category=EventCategory.NETWORK,
197258
description="Failed to build RdmaStatistics model",
198259
data={"exception": get_exception_traceback(e)},
199260
priority=EventPriority.WARNING,
200261
)
201-
return statistics
262+
return None
202263

203264
def _get_rdma_link(self) -> Optional[list[RdmaLink]]:
204265
"""Get RDMA link data from 'rdma link -j'."""

0 commit comments

Comments
 (0)