Skip to content

Commit 8416b1a

Browse files
committed
Gdma error logging
1 parent 4e26b4e commit 8416b1a

File tree

2 files changed

+40
-1
lines changed

2 files changed

+40
-1
lines changed

vm/devices/net/gdma_defs/src/bnic.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ pub const CQE_TX_VF_DISABLED: u8 = 38;
268268
pub const CQE_TX_VPORT_IDX_OUT_OF_RANGE: u8 = 39;
269269
pub const CQE_TX_VPORT_DISABLED: u8 = 40;
270270
pub const CQE_TX_VLAN_TAGGING_VIOLATION: u8 = 41;
271+
/// TX completion type for a GDMA error. The net_mana `tx_poll` handler treats
/// this as the hardware's signal that it has disabled the queue.
pub const CQE_TX_GDMA_ERR: u8 = 42;
271272

272273
pub const MANA_CQE_COMPLETION: u8 = 1;
273274

vm/devices/net/net_mana/src/lib.rs

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use gdma_defs::Cqe;
1111
use gdma_defs::GDMA_EQE_COMPLETION;
1212
use gdma_defs::Sge;
1313
use gdma_defs::bnic::CQE_RX_OKAY;
14+
use gdma_defs::bnic::CQE_TX_GDMA_ERR;
1415
use gdma_defs::bnic::CQE_TX_OKAY;
1516
use gdma_defs::bnic::MANA_LONG_PKT_FMT;
1617
use gdma_defs::bnic::MANA_SHORT_PKT_FMT;
@@ -607,6 +608,7 @@ struct QueueStats {
607608
tx_packets: u64,
608609
tx_errors: u64,
609610
tx_dropped: u64,
611+
tx_stuck: u64,
610612

611613
rx_events: u64,
612614
rx_packets: u64,
@@ -622,6 +624,7 @@ impl Inspect for QueueStats {
622624
.counter("tx_packets", self.tx_packets)
623625
.counter("tx_errors", self.tx_errors)
624626
.counter("tx_dropped", self.tx_dropped)
627+
.counter("tx_stuck", self.tx_stuck)
625628
.counter("rx_events", self.rx_events)
626629
.counter("rx_packets", self.rx_packets)
627630
.counter("rx_errors", self.rx_errors)
@@ -718,6 +721,26 @@ impl<T: DeviceBacking> ManaQueue<T> {
718721
false
719722
}
720723
}
724+
725+
/// Emits rate-limited error traces describing a failed tx completion.
///
/// The first trace records the completion header's type and vendor error
/// code together with the data, SGL, and WQE offsets carried in `tx_oob`.
/// If `posted_tx` is non-empty, a second trace records the front packet's
/// id, WQE length, and bounced length (with padding).
///
/// NOTE(review): nothing visible here mutates `self`; `&self` would
/// presumably suffice -- confirm before changing the receiver.
fn trace_tx_wqe(&mut self, tx_oob: ManaTxCompOob) {
726+
tracelimit::error_ratelimited!(
727+
cqe_hdr_type = tx_oob.cqe_hdr.cqe_type(),
728+
cqe_hdr_vendor_err = tx_oob.cqe_hdr.vendor_err(),
729+
tx_oob_data_offset = tx_oob.tx_data_offset,
730+
tx_oob_sgl_offset = tx_oob.offsets.tx_sgl_offset(),
731+
tx_oob_wqe_offset = tx_oob.offsets.tx_wqe_offset(),
732+
"tx completion error"
733+
);
734+
735+
// Peek (do not pop) the front of `posted_tx`; presumably this is the
// packet the failed completion refers to -- verify against the caller.
if let Some(packet) = self.posted_tx.front() {
736+
tracelimit::error_ratelimited!(
737+
id = packet.id.0,
738+
wqe_len = packet.wqe_len,
739+
bounced_len_with_padding = packet.bounced_len_with_padding,
740+
"posted tx"
741+
);
742+
}
743+
}
721744
}
722745

723746
#[async_trait]
@@ -903,18 +926,33 @@ impl<T: DeviceBacking + Send> Queue for ManaQueue<T> {
903926

904927
fn tx_poll(&mut self, done: &mut [TxId]) -> anyhow::Result<usize> {
905928
let mut i = 0;
929+
let mut queue_stuck = false;
906930
while i < done.len() {
907931
let id = if let Some(cqe) = self.tx_cq.pop() {
908932
let tx_oob = ManaTxCompOob::read_from_prefix(&cqe.data[..]).unwrap().0; // TODO: zerocopy: use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
909933
match tx_oob.cqe_hdr.cqe_type() {
910934
CQE_TX_OKAY => {
911935
self.stats.tx_packets += 1;
912936
}
937+
CQE_TX_GDMA_ERR => {
938+
queue_stuck = true;
939+
}
913940
ty => {
914-
tracelimit::error_ratelimited!(ty, "tx completion error");
941+
let vendor_err = tx_oob.cqe_hdr.vendor_err();
942+
tracelimit::error_ratelimited!(ty, vendor_err, "tx completion error");
915943
self.stats.tx_errors += 1;
916944
}
917945
}
946+
if queue_stuck {
947+
// Hardware hit an error with the packet coming from the Guest.
948+
// CQE_TX_GDMA_ERR is how the Hardware indicates that it has disabled the queue.
949+
self.stats.tx_errors += 1;
950+
self.stats.tx_stuck += 1;
951+
self.trace_tx_wqe(tx_oob);
952+
// TODO: attempt to recover by reenabling the queue.
953+
// tracelimit::info_ratelimited!("recreated tx queue");
954+
break;
955+
}
918956
let packet = self.posted_tx.pop_front().unwrap();
919957
self.tx_wq.advance_head(packet.wqe_len);
920958
if packet.bounced_len_with_padding > 0 {

0 commit comments

Comments
 (0)