@@ -37,106 +37,12 @@ class RdmaAnalyzer(DataAnalyzer[RdmaDataModel, None]):
3737
3838 DATA_MODEL = RdmaDataModel
3939
40- # Error fields checked from rdma statistic output (bnxt_re, mlx5, ionic, etc.)
41- ERROR_FIELDS = [
42- "recoverable_errors" ,
43- "tx_roce_errors" ,
44- "tx_roce_discards" ,
45- "rx_roce_errors" ,
46- "rx_roce_discards" ,
47- "local_ack_timeout_err" ,
48- "packet_seq_err" ,
49- "max_retry_exceeded" ,
50- "rnr_nak_retry_err" ,
51- "implied_nak_seq_err" ,
52- "unrecoverable_err" ,
53- "bad_resp_err" ,
54- "local_qp_op_err" ,
55- "local_protection_err" ,
56- "mem_mgmt_op_err" ,
57- "req_remote_invalid_request" ,
58- "req_remote_access_errors" ,
59- "remote_op_err" ,
60- "duplicate_request" ,
61- "res_exceed_max" ,
62- "resp_local_length_error" ,
63- "res_exceeds_wqe" ,
64- "res_opcode_err" ,
65- "res_rx_invalid_rkey" ,
66- "res_rx_domain_err" ,
67- "res_rx_no_perm" ,
68- "res_rx_range_err" ,
69- "res_tx_invalid_rkey" ,
70- "res_tx_domain_err" ,
71- "res_tx_no_perm" ,
72- "res_tx_range_err" ,
73- "res_irrq_oflow" ,
74- "res_unsup_opcode" ,
75- "res_unaligned_atomic" ,
76- "res_rem_inv_err" ,
77- "res_mem_err" ,
78- "res_srq_err" ,
79- "res_cmp_err" ,
80- "res_invalid_dup_rkey" ,
81- "res_wqe_format_err" ,
82- "res_cq_load_err" ,
83- "res_srq_load_err" ,
84- "res_tx_pci_err" ,
85- "res_rx_pci_err" ,
86- "out_of_buffer" ,
87- "out_of_sequence" ,
88- "req_cqe_error" ,
89- "req_cqe_flush_error" ,
90- "resp_cqe_error" ,
91- "resp_cqe_flush_error" ,
92- "resp_remote_access_errors" ,
93- "req_rx_pkt_seq_err" ,
94- "req_rx_rnr_retry_err" ,
95- "req_rx_rmt_acc_err" ,
96- "req_rx_rmt_req_err" ,
97- "req_rx_oper_err" ,
98- "req_rx_impl_nak_seq_err" ,
99- "req_rx_cqe_err" ,
100- "req_rx_cqe_flush" ,
101- "req_rx_dup_response" ,
102- "req_rx_inval_pkts" ,
103- "req_tx_loc_acc_err" ,
104- "req_tx_loc_oper_err" ,
105- "req_tx_mem_mgmt_err" ,
106- "req_tx_retry_excd_err" ,
107- "req_tx_loc_sgl_inv_err" ,
108- "resp_rx_dup_request" ,
109- "resp_rx_outof_buf" ,
110- "resp_rx_outouf_seq" ,
111- "resp_rx_cqe_err" ,
112- "resp_rx_cqe_flush" ,
113- "resp_rx_loc_len_err" ,
114- "resp_rx_inval_request" ,
115- "resp_rx_loc_oper_err" ,
116- "resp_rx_outof_atomic" ,
117- "resp_tx_pkt_seq_err" ,
118- "resp_tx_rmt_inval_req_err" ,
119- "resp_tx_rmt_acc_err" ,
120- "resp_tx_rmt_oper_err" ,
121- "resp_tx_rnr_retry_err" ,
122- "resp_tx_loc_sgl_inv_err" ,
123- "resp_rx_s0_table_err" ,
124- "resp_rx_ccl_cts_outouf_seq" ,
125- "tx_rdma_ack_timeout" ,
126- "tx_rdma_ccl_cts_ack_timeout" ,
127- "rx_rdma_mtu_discard_pkts" ,
128- ]
129-
130- CRITICAL_ERROR_FIELDS = [
131- "unrecoverable_err" ,
132- "res_tx_pci_err" ,
133- "res_rx_pci_err" ,
134- "res_mem_err" ,
135- ]
136-
13740 def analyze_data (self , data : RdmaDataModel , args : Optional [None ] = None ) -> TaskResult :
13841 """Analyze RDMA statistics for non-zero error counters.
13942
43+ Error and critical counter names come from each vendor's statistics model
44+ (ionic / bnxt / mlx prefixes).
45+
14046 Args:
14147 data: RDMA data model with statistic_list (and optionally link_list).
14248 args: Unused (analyzer has no configurable args).
@@ -150,32 +56,36 @@ def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> Task
15056 return self .result
15157
15258 error_state = False
153- for idx , stat in enumerate (data .statistic_list ):
154- errors_on_interface = [] # (error_field, value, is_critical)
155- for error_field in self .ERROR_FIELDS :
156- value = getattr (stat , error_field , None )
157- if value is not None and value > 0 :
158- is_critical = error_field in self .CRITICAL_ERROR_FIELDS
159- errors_on_interface .append ((error_field , value , is_critical ))
160- if errors_on_interface :
161- error_state = True
162- interface_label = stat .ifname or "unknown"
163- error_names = [e [0 ] for e in errors_on_interface ]
164- any_critical = any (e [2 ] for e in errors_on_interface )
165- priority = EventPriority .CRITICAL if any_critical else EventPriority .ERROR
166- errors_data = {field : value for field , value , _ in errors_on_interface }
167- self ._log_event (
168- category = EventCategory .IO ,
169- description = f"RDMA error detected on { interface_label } : [{ ', ' .join (error_names )} ]" ,
170- data = {
171- "interface" : stat .ifname ,
172- "port" : stat .port ,
173- "errors" : errors_data ,
174- "statistic_index" : idx ,
175- },
176- priority = priority ,
177- console_log = True ,
178- )
59+
60+ for stat in data .statistic_list :
61+ if stat .vendor_statistics is None :
62+ continue
63+
64+ error_fields = stat .vendor_statistics .error_fields
65+ critical_fields = stat .vendor_statistics .critial_error_fields
66+
67+ for error_field in error_fields + critical_fields :
68+ error_value = getattr (stat .vendor_statistics , error_field , None )
69+
70+ if error_value is not None and error_value > 0 :
71+ priority = (
72+ EventPriority .CRITICAL
73+ if error_field in critical_fields
74+ else EventPriority .ERROR
75+ )
76+ self ._log_event (
77+ category = EventCategory .NETWORK ,
78+ description = f"RDMA error detected: { error_field } " ,
79+ data = {
80+ "interface" : stat .ifname ,
81+ "port" : stat .port ,
82+ "error_field" : error_field ,
83+ "error_count" : error_value ,
84+ },
85+ priority = priority ,
86+ console_log = True ,
87+ )
88+ error_state = True
17989
18090 if error_state :
18191 self .result .message = "RDMA errors detected in statistics"
0 commit comments