@@ -11,6 +11,7 @@ use gdma_defs::Cqe;
11
11
use gdma_defs:: GDMA_EQE_COMPLETION ;
12
12
use gdma_defs:: Sge ;
13
13
use gdma_defs:: bnic:: CQE_RX_OKAY ;
14
+ use gdma_defs:: bnic:: CQE_TX_GDMA_ERR ;
14
15
use gdma_defs:: bnic:: CQE_TX_OKAY ;
15
16
use gdma_defs:: bnic:: MANA_LONG_PKT_FMT ;
16
17
use gdma_defs:: bnic:: MANA_SHORT_PKT_FMT ;
@@ -607,6 +608,7 @@ struct QueueStats {
607
608
tx_packets : u64 ,
608
609
tx_errors : u64 ,
609
610
tx_dropped : u64 ,
611
+ tx_stuck : u64 ,
610
612
611
613
rx_events : u64 ,
612
614
rx_packets : u64 ,
@@ -622,6 +624,7 @@ impl Inspect for QueueStats {
622
624
. counter ( "tx_packets" , self . tx_packets )
623
625
. counter ( "tx_errors" , self . tx_errors )
624
626
. counter ( "tx_dropped" , self . tx_dropped )
627
+ . counter ( "tx_stuck" , self . tx_stuck )
625
628
. counter ( "rx_events" , self . rx_events )
626
629
. counter ( "rx_packets" , self . rx_packets )
627
630
. counter ( "rx_errors" , self . rx_errors )
@@ -718,6 +721,26 @@ impl<T: DeviceBacking> ManaQueue<T> {
718
721
false
719
722
}
720
723
}
724
+
725
+ fn trace_tx_wqe ( & mut self , tx_oob : ManaTxCompOob ) {
726
+ tracelimit:: error_ratelimited!(
727
+ cqe_hdr_type = tx_oob. cqe_hdr. cqe_type( ) ,
728
+ cqe_hdr_vendor_err = tx_oob. cqe_hdr. vendor_err( ) ,
729
+ tx_oob_data_offset = tx_oob. tx_data_offset,
730
+ tx_oob_sgl_offset = tx_oob. offsets. tx_sgl_offset( ) ,
731
+ tx_oob_wqe_offset = tx_oob. offsets. tx_wqe_offset( ) ,
732
+ "tx completion error"
733
+ ) ;
734
+
735
+ if let Some ( packet) = self . posted_tx . front ( ) {
736
+ tracelimit:: error_ratelimited!(
737
+ id = packet. id. 0 ,
738
+ wqe_len = packet. wqe_len,
739
+ bounced_len_with_padding = packet. bounced_len_with_padding,
740
+ "posted tx"
741
+ ) ;
742
+ }
743
+ }
721
744
}
722
745
723
746
#[ async_trait]
@@ -903,18 +926,33 @@ impl<T: DeviceBacking + Send> Queue for ManaQueue<T> {
903
926
904
927
fn tx_poll ( & mut self , done : & mut [ TxId ] ) -> anyhow:: Result < usize > {
905
928
let mut i = 0 ;
929
+ let mut queue_stuck = false ;
906
930
while i < done. len ( ) {
907
931
let id = if let Some ( cqe) = self . tx_cq . pop ( ) {
908
932
let tx_oob = ManaTxCompOob :: read_from_prefix ( & cqe. data [ ..] ) . unwrap ( ) . 0 ; // TODO: zerocopy: use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
909
933
match tx_oob. cqe_hdr . cqe_type ( ) {
910
934
CQE_TX_OKAY => {
911
935
self . stats . tx_packets += 1 ;
912
936
}
937
+ CQE_TX_GDMA_ERR => {
938
+ queue_stuck = true ;
939
+ }
913
940
ty => {
914
- tracelimit:: error_ratelimited!( ty, "tx completion error" ) ;
941
+ let vendor_err = tx_oob. cqe_hdr . vendor_err ( ) ;
942
+ tracelimit:: error_ratelimited!( ty, vendor_err, "tx completion error" ) ;
915
943
self . stats . tx_errors += 1 ;
916
944
}
917
945
}
946
+ if queue_stuck {
947
+ // Hardware hit an error with the packet coming from the Guest.
948
+ // CQE_TX_GDMA_ERR is how the Hardware indicates that it has disabled the queue.
949
+ self . stats . tx_errors += 1 ;
950
+ self . stats . tx_stuck += 1 ;
951
+ self . trace_tx_wqe ( tx_oob) ;
952
+ // TODO: attempt to recover by reenabling the queue.
953
+ // tracelimit::info_ratelimited!("recreated tx queue");
954
+ break ;
955
+ }
918
956
let packet = self . posted_tx . pop_front ( ) . unwrap ( ) ;
919
957
self . tx_wq . advance_head ( packet. wqe_len ) ;
920
958
if packet. bounced_len_with_padding > 0 {
0 commit comments