From 49266f50bc195b338e84782cf8fee73ef30339f6 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 22 Oct 2024 19:08:16 -0400 Subject: [PATCH 01/21] selftests/mm temporary fix of hmm infinite loop jira SECO-170 In Rocky9 if you run ./run_vmtests.sh -t hmm it will fail and cause an infinite loop on ASSERTs in FIXTURE_TEARDOWN() This temporary fix is based on the discussion here https://patchwork.kernel.org/project/linux-kselftest/patch/26017fe3-5ad7-6946-57db-e5ec48063ceb@suse.cz/#25046055 We will investigate further kselftest updates that will resolve the root causes of this. Signed-off-by: Jonathan Maple --- tools/testing/selftests/mm/hmm-tests.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index d2cfc9b494a0e..6f75c54564176 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -159,6 +159,10 @@ FIXTURE_TEARDOWN(hmm) { int ret = close(self->fd); + if (ret != 0) { + fprintf(stderr, "close returned (%d) fd is (%d)\n", ret, self->fd); + exit(1); + } ASSERT_EQ(ret, 0); self->fd = -1; } From 841e13e2504f9539fe50bcbc730b24595921f828 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Mon, 9 Jun 2025 15:48:12 -0400 Subject: [PATCH 02/21] net: mana: Add support for Multi Vports on Bare metal jira LE-3208 feature net_mana commit-author Haiyang Zhang commit 290e5d3c49f687c1567bde634dc33d57b0674919 To support Multi Vports on Bare metal, increase the device config response version. And, skip the register HW vport, and register filter steps, when the Bare metal hostmode is set. Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/1747671636-5810-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Paolo Abeni (cherry picked from commit 290e5d3c49f687c1567bde634dc33d57b0674919) Signed-off-by: Jonathan Maple Signed-off-by: Jonathan Maple --- drivers/net/ethernet/microsoft/mana/mana_en.c | 24 ++++++++++++------- include/net/mana/mana.h | 4 +++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index ae796e64d24ca..7f125c7289ebf 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -920,7 +920,7 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc) static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, u32 proto_minor_ver, u32 proto_micro_ver, - u16 *max_num_vports) + u16 *max_num_vports, u8 *bm_hostmode) { struct gdma_context *gc = ac->gdma_dev->gdma_context; struct mana_query_device_cfg_resp resp = {}; @@ -931,7 +931,7 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_DEV_CONFIG, sizeof(req), sizeof(resp)); - req.hdr.resp.msg_version = GDMA_MESSAGE_V2; + req.hdr.resp.msg_version = GDMA_MESSAGE_V3; req.proto_major_ver = proto_major_ver; req.proto_minor_ver = proto_minor_ver; @@ -955,11 +955,16 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, *max_num_vports = resp.max_num_vports; - if (resp.hdr.response.msg_version == GDMA_MESSAGE_V2) + if (resp.hdr.response.msg_version >= GDMA_MESSAGE_V2) gc->adapter_mtu = resp.adapter_mtu; else gc->adapter_mtu = ETH_FRAME_LEN; + if (resp.hdr.response.msg_version >= GDMA_MESSAGE_V3) + *bm_hostmode = resp.bm_hostmode; + else + *bm_hostmode = 0; + debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, 
&gc->adapter_mtu); return 0; @@ -2439,7 +2444,7 @@ static void mana_destroy_vport(struct mana_port_context *apc) mana_destroy_txq(apc); mana_uncfg_vport(apc); - if (gd->gdma_context->is_pf) + if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) mana_pf_deregister_hw_vport(apc); } @@ -2451,7 +2456,7 @@ static int mana_create_vport(struct mana_port_context *apc, apc->default_rxobj = INVALID_MANA_HANDLE; - if (gd->gdma_context->is_pf) { + if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) { err = mana_pf_register_hw_vport(apc); if (err) return err; @@ -2675,7 +2680,7 @@ int mana_alloc_queues(struct net_device *ndev) if (err) goto destroy_vport; - if (gd->gdma_context->is_pf) { + if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) { err = mana_pf_register_filter(apc); if (err) goto destroy_vport; @@ -2737,7 +2742,7 @@ static int mana_dealloc_queues(struct net_device *ndev) mana_chn_setxdp(apc, NULL); - if (gd->gdma_context->is_pf) + if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) mana_pf_deregister_filter(apc); /* No packet can be transmitted now since apc->port_is_up is false. @@ -2978,6 +2983,7 @@ int mana_probe(struct gdma_dev *gd, bool resuming) struct gdma_context *gc = gd->gdma_context; struct mana_context *ac = gd->driver_data; struct device *dev = gc->dev; + u8 bm_hostmode = 0; u16 num_ports = 0; int err; int i; @@ -3004,10 +3010,12 @@ int mana_probe(struct gdma_dev *gd, bool resuming) goto out; err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, - MANA_MICRO_VERSION, &num_ports); + MANA_MICRO_VERSION, &num_ports, &bm_hostmode); if (err) goto out; + ac->bm_hostmode = bm_hostmode; + if (!resuming) { ac->num_ports = num_ports; } else { diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index ceac201caa3a8..e74415c833791 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -405,6 +405,7 @@ struct mana_context { struct gdma_dev *gdma_dev; u16 num_ports; + u8 bm_hostmode; struct mana_eq *eqs; struct dentry *mana_eqs_debugfs; @@ -554,7 +555,8 @@ struct mana_query_device_cfg_resp { u64 pf_cap_flags4; u16 max_num_vports; - u16 reserved; + u8 bm_hostmode; /* response v3: Bare Metal Host Mode */ + u8 reserved; u32 max_num_eqs; /* response v2: */ From c98862f2c1ebc9da20eb00142c90095ecc087d39 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Mon, 9 Jun 2025 15:49:43 -0400 Subject: [PATCH 03/21] tools: hv: Enable debug logs for hv_kvp_daemon jira LE-3207 feature tools_hv commit-author Shradha Gupta commit a9c0b33ef2306327dd2db02c6274107065ff9307 Allow the KVP daemon to log the KVP updates triggered in the VM with a new debug flag(-d). When the daemon is started with this flag, it logs updates and debug information in syslog with loglevel LOG_DEBUG. This information comes in handy for debugging issues where the key-value pairs for certain pools show mismatch/incorrect values. The distro-vendors can further consume these changes and modify the respective service files to redirect the logs to specific files as needed. 
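A minimal user-space sketch of the gating pattern this adds (the "debug" flag name matches the patch; the helper and its arguments are hypothetical):

    #include <syslog.h>

    static int debug; /* set to 1 when the daemon is started with -d */

    /* Hypothetical helper: debug records go to syslog at LOG_DEBUG, so a
     * distro service file can route just this level to a dedicated file.
     */
    static void kvp_log_update(int pool, const char *key, const char *val)
    {
            if (debug)
                    syslog(LOG_DEBUG, "pool=%d key=%s val=%s", pool, key, val);
    }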
Signed-off-by: Shradha Gupta Reviewed-by: Naman Jain Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1744715978-8185-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1744715978-8185-1-git-send-email-shradhagupta@linux.microsoft.com> (cherry picked from commit a9c0b33ef2306327dd2db02c6274107065ff9307) Signed-off-by: Jonathan Maple Signed-off-by: Jonathan Maple --- tools/hv/hv_kvp_daemon.c | 64 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 1e6fd6ca513bd..0e0c997134ec6 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -77,6 +77,7 @@ enum { }; static int in_hand_shake; +static int debug; static char *os_name = ""; static char *os_major = ""; @@ -172,6 +173,20 @@ static void kvp_update_file(int pool) kvp_release_lock(pool); } +static void kvp_dump_initial_pools(int pool) +{ + int i; + + syslog(LOG_DEBUG, "===Start dumping the contents of pool %d ===\n", + pool); + + for (i = 0; i < kvp_file_info[pool].num_records; i++) + syslog(LOG_DEBUG, "pool: %d, %d/%d key=%s val=%s\n", + pool, i + 1, kvp_file_info[pool].num_records, + kvp_file_info[pool].records[i].key, + kvp_file_info[pool].records[i].value); +} + static void kvp_update_mem_state(int pool) { FILE *filep; @@ -259,6 +274,8 @@ static int kvp_file_init(void) return 1; kvp_file_info[i].num_records = 0; kvp_update_mem_state(i); + if (debug) + kvp_dump_initial_pools(i); } return 0; @@ -286,6 +303,9 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) * Found a match; just move the remaining * entries up. */ + if (debug) + syslog(LOG_DEBUG, "%s: deleting the KVP: pool=%d key=%s val=%s", + __func__, pool, record[i].key, record[i].value); if (i == (num_records - 1)) { kvp_file_info[pool].num_records--; kvp_update_file(pool); @@ -304,20 +324,36 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) kvp_update_file(pool); return 0; } + + if (debug) + syslog(LOG_DEBUG, "%s: could not delete KVP: pool=%d key=%s. Record not found", + __func__, pool, key); + return 1; } static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, const __u8 *value, int value_size) { - int i; - int num_records; struct kvp_record *record; + int num_records; int num_blocks; + int i; + + if (debug) + syslog(LOG_DEBUG, "%s: got a KVP: pool=%d key=%s val=%s", + __func__, pool, key, value); if ((key_size > HV_KVP_EXCHANGE_MAX_KEY_SIZE) || - (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE)) + (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE)) { + syslog(LOG_ERR, "%s: Too long key or value: key=%s, val=%s", + __func__, key, value); + + if (debug) + syslog(LOG_DEBUG, "%s: Too long key or value: pool=%d, key=%s, val=%s", + __func__, pool, key, value); return 1; + } /* * First update the in-memory state. 
@@ -337,6 +373,9 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, */ memcpy(record[i].value, value, value_size); kvp_update_file(pool); + if (debug) + syslog(LOG_DEBUG, "%s: updated: pool=%d key=%s val=%s", + __func__, pool, key, value); return 0; } @@ -348,8 +387,10 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, record = realloc(record, sizeof(struct kvp_record) * ENTRIES_PER_BLOCK * (num_blocks + 1)); - if (record == NULL) + if (!record) { + syslog(LOG_ERR, "%s: Memory alloc failure", __func__); return 1; + } kvp_file_info[pool].num_blocks++; } @@ -357,6 +398,11 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, memcpy(record[i].key, key, key_size); kvp_file_info[pool].records = record; kvp_file_info[pool].num_records++; + + if (debug) + syslog(LOG_DEBUG, "%s: added: pool=%d key=%s val=%s", + __func__, pool, key, value); + kvp_update_file(pool); return 0; } @@ -1355,6 +1401,7 @@ void print_usage(char *argv[]) fprintf(stderr, "Usage: %s [options]\n" "Options are:\n" " -n, --no-daemon stay in foreground, don't daemonize\n" + " -d, --debug Enable debug logs(syslog debug by default)\n" " -h, --help print this help\n", argv[0]); } @@ -1376,10 +1423,11 @@ int main(int argc, char *argv[]) static struct option long_options[] = { {"help", no_argument, 0, 'h' }, {"no-daemon", no_argument, 0, 'n' }, + {"debug", no_argument, 0, 'd' }, {0, 0, 0, 0 } }; - while ((opt = getopt_long(argc, argv, "hn", long_options, + while ((opt = getopt_long(argc, argv, "hnd", long_options, &long_index)) != -1) { switch (opt) { case 'n': @@ -1388,6 +1436,9 @@ int main(int argc, char *argv[]) case 'h': print_usage(argv); exit(0); + case 'd': + debug = 1; + break; default: print_usage(argv); exit(EXIT_FAILURE); @@ -1410,6 +1461,9 @@ int main(int argc, char *argv[]) */ kvp_get_domain_name(full_domain_name, sizeof(full_domain_name)); + if (debug) + syslog(LOG_INFO, "Logging debug info in syslog(debug)"); + if (kvp_file_init()) { syslog(LOG_ERR, "Failed to initialize the pools"); exit(EXIT_FAILURE); From 7c366db95d01f7de18553c4a1778c93a58e82baa Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 19 Aug 2025 10:28:05 +0000 Subject: [PATCH 04/21] RDMA/mana_ib: use the correct page size for mapping user-mode doorbell page jira LE-3813 commit-author Long Li commit 4a3b99bc04e501b816db78f70064e26a01257910 When mapping doorbell page from user-mode, the driver should use the system page size as this memory is allocated via mmap() from user-mode. 
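A hedged user-space sketch of the counterpart mapping call (fd and offset are hypothetical) makes the reasoning visible: the mapping length is dictated by the system page size, not by the device's doorbell page size.

    #include <sys/mman.h>
    #include <unistd.h>

    void *map_doorbell(int fd, off_t offset)
    {
            /* mmap() hands out whole system pages, so the kernel side
             * must map PAGE_SIZE here, not gc->db_page_size.
             */
            return mmap(NULL, sysconf(_SC_PAGESIZE), PROT_WRITE,
                        MAP_SHARED, fd, offset);
    }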
Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Long Li Link: https://patch.msgid.link/1725030993-16213-2-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky (cherry picked from commit 4a3b99bc04e501b816db78f70064e26a01257910) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/infiniband/hw/mana/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index d13abc954d2aa..148d74a8c3a87 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -511,13 +511,13 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) PAGE_SHIFT; prot = pgprot_writecombine(vma->vm_page_prot); - ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + ret = rdma_user_mmap_io(ibcontext, vma, pfn, PAGE_SIZE, prot, NULL); if (ret) ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); else - ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", - pfn, gc->db_page_size, ret); + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %lu, ret %d\n", + pfn, PAGE_SIZE, ret); return ret; } From 65884ec35b6d4bf15515bffdcdbfcd4f7f29895b Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 19 Aug 2025 10:28:47 +0000 Subject: [PATCH 05/21] RDMA/mana_ib: use the correct page table index based on hardware page size jira LE-3813 commit-author Long Li commit 9e517a8e9d9a303bf9bde35e5c5374795544c152 MANA hardware uses 4k page size. When calculating the page table index, it should use the hardware page size, not the system page size. Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Long Li Link: https://patch.msgid.link/1725030993-16213-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky (cherry picked from commit 9e517a8e9d9a303bf9bde35e5c5374795544c152) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/infiniband/hw/mana/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 148d74a8c3a87..67c2d43135a8a 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -383,7 +383,7 @@ static int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem create_req->length = umem->length; create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz); - create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->gdma_page_type = order_base_2(page_sz) - MANA_PAGE_SHIFT; create_req->page_count = num_pages_total; ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", From c804b028d51aece8e6e9a7c99286d4e124536834 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 13 Aug 2025 16:43:17 -0700 Subject: [PATCH 06/21] scsi: storvsc: Increase the timeouts to storvsc_timeout jira LE-3545 commit-author Dexuan Cui commit b2f966568faaad326de97481096d0f3dc0971c43 Currently storvsc_timeout is only used in storvsc_sdev_configure(), and 5s and 10s are used elsewhere. It turns out that rarely the 5s is not enough on Azure, so let's use storvsc_timeout everywhere. In case a timeout happens and storvsc_channel_init() returns an error, close the VMBus channel so that any host-to-guest messages in the channel's ringbuffer, which might come late, can be safely ignored. 
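The resulting pattern at each wait site is the same (kernel-style sketch, surrounding code elided):

    t = wait_for_completion_timeout(&request->wait_event,
                                    storvsc_timeout * HZ);
    if (t == 0)
            return -ETIMEDOUT;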
Add a "const" to storvsc_timeout. Cc: stable@kernel.org Signed-off-by: Dexuan Cui Link: https://lore.kernel.org/r/1749243459-10419-1-git-send-email-decui@microsoft.com Reviewed-by: Long Li Signed-off-by: Martin K. Petersen (cherry picked from commit b2f966568faaad326de97481096d0f3dc0971c43) Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- drivers/scsi/storvsc_drv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 9ede8e35b19ab..239f096a32288 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -356,7 +356,7 @@ MODULE_PARM_DESC(ring_avail_percent_lowater, /* * Timeout in seconds for all devices managed by this driver. */ -static int storvsc_timeout = 180; +static const int storvsc_timeout = 180; #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) static struct scsi_transport_template *fc_transport_template; @@ -762,7 +762,7 @@ static void handle_multichannel_storage(struct hv_device *device, int max_chns) return; } - t = wait_for_completion_timeout(&request->wait_event, 10*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) { dev_err(dev, "Failed to create sub-channel: timed out\n"); return; @@ -827,7 +827,7 @@ static int storvsc_execute_vstor_op(struct hv_device *device, if (ret != 0) return ret; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) return -ETIMEDOUT; @@ -1345,6 +1345,8 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size, return ret; ret = storvsc_channel_init(device, is_fc); + if (ret) + vmbus_close(device->channel); return ret; } @@ -1662,7 +1664,7 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) if (ret != 0) return FAILED; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) return TIMEOUT_ERROR; From 0e3aa278118d83d25a00001a60e036050b23f911 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:10:26 +0000 Subject: [PATCH 07/21] Drivers: hv: Allow vmbus_sendpacket_mpb_desc() to create multiple ranges jira LE-3554 commit-author Michael Kelley commit 380b75d3078626aadd0817de61f3143f5db6e393 vmbus_sendpacket_mpb_desc() is currently used only by the storvsc driver and is hardcoded to create a single GPA range. To allow it to also be used by the netvsc driver to create multiple GPA ranges, no longer hardcode as having a single GPA range. Allow the calling driver to specify the rangecount in the supplied descriptor. Update the storvsc driver to reflect this new approach. 
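With rangecount now caller-owned, a sender fills it in before calling the helper; a kernel-style sketch of the single-range storvsc case (variable names partly hypothetical, setup elided):

    payload->rangecount = 1;        /* no longer forced to 1 by the helper */
    payload->range.len = length;
    payload->range.offset = offset_in_hvpg;

    ret = vmbus_sendpacket_mpb_desc(channel, payload, payload_sz,
                                    packet, sizeof(*packet), request_id);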
Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-2-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 380b75d3078626aadd0817de61f3143f5db6e393) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/hv/channel.c | 6 +++--- drivers/scsi/storvsc_drv.c | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index fb8cd8469328e..4ffd5eaa78172 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1136,9 +1136,10 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); /* - * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet + * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. - * The buffer includes the vmbus descriptor. + * The desc argument must include space for the VMBus descriptor. The + * rangecount field must already be set. */ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *desc, @@ -1160,7 +1161,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->length8 = (u16)(packetlen_aligned >> 3); desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; - desc->rangecount = 1; bufferlist[0].iov_base = desc; bufferlist[0].iov_len = desc_size; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 239f096a32288..c12240753d376 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1815,6 +1815,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) return SCSI_MLQUEUE_DEVICE_BUSY; } + payload->rangecount = 1; payload->range.len = length; payload->range.offset = offset_in_hvpg; From 4a937fb31c65da0680ef4b8a2a59a1a88728c0d7 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:11:12 +0000 Subject: [PATCH 08/21] hv_netvsc: Use vmbus_sendpacket_mpb_desc() to send VMBus messages jira LE-3554 commit-author Michael Kelley commit 4f98616b855cb0e3b5917918bb07b44728eb96ea netvsc currently uses vmbus_sendpacket_pagebuffer() to send VMBus messages. This function creates a series of GPA ranges, each of which contains a single PFN. However, if the rndis header in the VMBus message crosses a page boundary, the netvsc protocol with the host requires that both PFNs for the rndis header must be in a single "GPA range" data structure, which isn't possible with vmbus_sendpacket_pagebuffer(). As the first step in fixing this, add a new function netvsc_build_mpb_array() to build a VMBus message with multiple GPA ranges, each of which may contain multiple PFNs. Use vmbus_sendpacket_mpb_desc() to send this VMBus message to the host. There's no functional change since higher levels of netvsc don't maintain or propagate knowledge of contiguous PFNs. Based on its input, netvsc_build_mpb_array() still produces a separate GPA range for each PFN and the behavior is the same as with vmbus_sendpacket_pagebuffer(). But the groundwork is laid for a subsequent patch to provide the necessary grouping. 
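To illustrate the layout netvsc_build_mpb_array() produces, here is a standalone sketch (simplified structs, made-up input) that packs variable-size range entries back to back, one PFN per 4K page touched:

    #include <stdio.h>

    #define HV_HYP_PAGE_SHIFT 12
    #define HVPFN_UP(x) (((x) + (1UL << HV_HYP_PAGE_SHIFT) - 1) >> HV_HYP_PAGE_SHIFT)

    struct pb { unsigned long pfn; unsigned long offset, len; };

    int main(void)
    {
            /* hypothetical input: an rndis header crossing a page, then data */
            struct pb pb[2] = { { 100, 4000, 200 }, { 200, 0, 1500 } };
            unsigned long out[32]; /* flattened desc: offset, len, pfns... */
            unsigned int n = 0;

            for (int i = 0; i < 2; i++) {
                    out[n++] = pb[i].offset;
                    out[n++] = pb[i].len;
                    for (unsigned long j = 0; j < HVPFN_UP(pb[i].offset + pb[i].len); j++)
                            out[n++] = pb[i].pfn + j; /* sequential PFNs */
            }
            printf("desc words used: %u (rangecount=2)\n", n);
            return 0;
    }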
Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-3-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 4f98616b855cb0e3b5917918bb07b44728eb96ea) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/net/hyperv/netvsc.c | 50 +++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 2b6ec979a62f2..1fc80bbe7941d 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1049,6 +1049,42 @@ static int netvsc_dma_map(struct hv_device *hv_dev, return 0; } +/* Build an "array" of mpb entries describing the data to be transferred + * over VMBus. After the desc header fields, each "array" entry is variable + * size, and each entry starts after the end of the previous entry. The + * "offset" and "len" fields for each entry imply the size of the entry. + * + * The pfns are in HV_HYP_PAGE_SIZE, because all communication with Hyper-V + * uses that granularity, even if the system page size of the guest is larger. + * Each entry in the input "pb" array must describe a contiguous range of + * guest physical memory so that the pfns are sequential if the range crosses + * a page boundary. The offset field must be < HV_HYP_PAGE_SIZE. + */ +static inline void netvsc_build_mpb_array(struct hv_page_buffer *pb, + u32 page_buffer_count, + struct vmbus_packet_mpb_array *desc, + u32 *desc_size) +{ + struct hv_mpb_array *mpb_entry = &desc->range; + int i, j; + + for (i = 0; i < page_buffer_count; i++) { + u32 offset = pb[i].offset; + u32 len = pb[i].len; + + mpb_entry->offset = offset; + mpb_entry->len = len; + + for (j = 0; j < HVPFN_UP(offset + len); j++) + mpb_entry->pfn_array[j] = pb[i].pfn + j; + + mpb_entry = (struct hv_mpb_array *)&mpb_entry->pfn_array[j]; + } + + desc->rangecount = page_buffer_count; + *desc_size = (char *)mpb_entry - (char *)desc; +} + static inline int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, @@ -1091,6 +1127,9 @@ static inline int netvsc_send_pkt( packet->dma_range = NULL; if (packet->page_buf_cnt) { + struct vmbus_channel_packet_page_buffer desc; + u32 desc_size; + if (packet->cp_partial) pb += packet->rmsg_pgcnt; @@ -1100,11 +1139,12 @@ static inline int netvsc_send_pkt( goto exit; } - ret = vmbus_sendpacket_pagebuffer(out_channel, - pb, packet->page_buf_cnt, - &nvmsg, sizeof(nvmsg), - req_id); - + netvsc_build_mpb_array(pb, packet->page_buf_cnt, + (struct vmbus_packet_mpb_array *)&desc, + &desc_size); + ret = vmbus_sendpacket_mpb_desc(out_channel, + (struct vmbus_packet_mpb_array *)&desc, + desc_size, &nvmsg, sizeof(nvmsg), req_id); if (ret) netvsc_dma_unmap(ndev_ctx->device_ctx, packet); } else { From a20a21f842c970d3533cfa943233e729fc5c74ed Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:11:37 +0000 Subject: [PATCH 09/21] hv_netvsc: Preserve contiguous PFN grouping in the page buffer array jira LE-3554 commit-author Michael Kelley commit 41a6328b2c55276f89ea3812069fd7521e348bbf Starting with commit dca5161f9bd0 ("hv_netvsc: Check status in SEND_RNDIS_PKT completion message") in the 6.3 kernel, the Linux driver for Hyper-V synthetic networking (netvsc) occasionally reports "nvsp_rndis_pkt_complete error status: 2".[1] This error indicates that Hyper-V has rejected a network packet transmit request from the guest, and the outgoing network packet is dropped. 
Higher level network protocols presumably recover and resend the packet, so there is no functional error, but performance is slightly impacted. Commit dca5161f9bd0 is not the cause of the error -- it only added reporting of an error that was already happening without any notice. The error has presumably been present since the netvsc driver was originally introduced into Linux.

The root cause of the problem is that the netvsc driver in Linux may send an incorrectly formatted VMBus message to Hyper-V when transmitting the network packet. The incorrect formatting occurs when the rndis header of the VMBus message crosses a page boundary due to how the Linux skb head memory is aligned. In such a case, two PFNs are required to describe the location of the rndis header, even though they are contiguous in guest physical address (GPA) space. Hyper-V requires that two rndis header PFNs be in a single "GPA range" data structure, but current netvsc code puts each PFN in its own GPA range, which Hyper-V rejects as an error.

The incorrect formatting occurs only for larger packets that netvsc must transmit via a VMBus "GPA Direct" message. There's no problem when netvsc transmits a smaller packet by copying it into a pre-allocated send buffer slot, because the pre-allocated slots don't have page crossing issues.

After commit 14ad6ed30a10 ("net: allow small head cache usage with large MAX_SKB_FRAGS values") in the 6.14-rc4 kernel, the error occurs much more frequently in VMs with 16 or more vCPUs. It may occur every few seconds, or even more frequently, in an ssh session that outputs a lot of text. Commit 14ad6ed30a10 subtly changes how skb head memory is allocated, making it much more likely that the rndis header will cross a page boundary when the vCPU count is 16 or more. The changes in commit 14ad6ed30a10 are perfectly valid -- they just had the side effect of making the netvsc bug more prominent.

Current code in init_page_array() creates a separate page buffer array entry for each PFN required to identify the data to be transmitted. Contiguous PFNs get separate entries in the page buffer array, and any information about contiguity is lost.

Fix the core issue by having init_page_array() construct the page buffer array to represent contiguous ranges rather than individual pages. When these ranges are subsequently passed to netvsc_build_mpb_array(), it can build GPA ranges that contain multiple PFNs, as required to avoid the error "nvsp_rndis_pkt_complete error status: 2". If instead the network packet is sent by copying into a pre-allocated send buffer slot, the copy proceeds using the contiguous ranges rather than individual pages, but the result of the copying is the same.

Also fix rndis_filter_send_request() to construct a contiguous range, since it has its own page buffer array.

This change has a side benefit in CoCo VMs in that netvsc_dma_map() calls dma_map_single() on each contiguous range instead of on each page. This results in fewer calls to dma_map_single() but on larger chunks of memory, which should reduce contention on the swiotlb.

Since the page buffer array now contains one entry for each contiguous range instead of for each individual page, the number of entries in the array can be reduced, saving 208 bytes of stack space in netvsc_xmit() when MAX_SKB_FRAGS has the default value of 17.
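To make the failure mode concrete, here is a small user-space sketch (the address and length are made up) showing how an rndis header placed near the end of a 4K page spans two contiguous PFNs, which Hyper-V requires to be described by one GPA range rather than two:

    #include <stdio.h>

    #define HV_HYP_PAGE_SHIFT 12
    #define HV_HYP_PAGE_SIZE  (1UL << HV_HYP_PAGE_SHIFT)

    int main(void)
    {
            unsigned long gpa = 0x12340f80; /* hypothetical header address */
            unsigned long len = 160;        /* hypothetical header + PPI size */

            unsigned long first_pfn = gpa >> HV_HYP_PAGE_SHIFT;
            unsigned long last_pfn = (gpa + len - 1) >> HV_HYP_PAGE_SHIFT;

            /* One range: offset within the first page, total length, and
             * the run of sequential PFNs backing it.
             */
            printf("offset=%lu len=%lu pfns=%lu..%lu\n",
                   gpa & (HV_HYP_PAGE_SIZE - 1), len, first_pfn, last_pfn);
            return 0;
    }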
[1] https://bugzilla.kernel.org/show_bug.cgi?id=217503 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217503 Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-4-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 41a6328b2c55276f89ea3812069fd7521e348bbf) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/net/hyperv/hyperv_net.h | 12 ++++++ drivers/net/hyperv/netvsc_drv.c | 63 ++++++++----------------------- drivers/net/hyperv/rndis_filter.c | 24 +++--------- 3 files changed, 32 insertions(+), 67 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 2e868d352228a..3871ecc961dc7 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -892,6 +892,18 @@ struct nvsp_message { sizeof(struct nvsp_message)) #define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor) +/* Maximum # of contiguous data ranges that can make up a trasmitted packet. + * Typically it's the max SKB fragments plus 2 for the rndis packet and the + * linear portion of the SKB. But if MAX_SKB_FRAGS is large, the value may + * need to be limited to MAX_PAGE_BUFFER_COUNT, which is the max # of entries + * in a GPA direct packet sent to netvsp over VMBus. + */ +#if MAX_SKB_FRAGS + 2 < MAX_PAGE_BUFFER_COUNT +#define MAX_DATA_RANGES (MAX_SKB_FRAGS + 2) +#else +#define MAX_DATA_RANGES MAX_PAGE_BUFFER_COUNT +#endif + /* Estimated requestor size: * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 23180f7b67b6a..f95cb14794ecc 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -325,43 +325,10 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, return txq; } -static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len, - struct hv_page_buffer *pb) -{ - int j = 0; - - hvpfn += offset >> HV_HYP_PAGE_SHIFT; - offset = offset & ~HV_HYP_PAGE_MASK; - - while (len > 0) { - unsigned long bytes; - - bytes = HV_HYP_PAGE_SIZE - offset; - if (bytes > len) - bytes = len; - pb[j].pfn = hvpfn; - pb[j].offset = offset; - pb[j].len = bytes; - - offset += bytes; - len -= bytes; - - if (offset == HV_HYP_PAGE_SIZE && len) { - hvpfn++; - offset = 0; - j++; - } - } - - return j + 1; -} - static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, struct hv_netvsc_packet *packet, struct hv_page_buffer *pb) { - u32 slots_used = 0; - char *data = skb->data; int frags = skb_shinfo(skb)->nr_frags; int i; @@ -370,28 +337,28 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, * 2. skb linear data * 3. 
skb fragment data */ - slots_used += fill_pg_buf(virt_to_hvpfn(hdr), - offset_in_hvpage(hdr), - len, - &pb[slots_used]); + pb[0].offset = offset_in_hvpage(hdr); + pb[0].len = len; + pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = slots_used; + packet->rmsg_pgcnt = 1; - slots_used += fill_pg_buf(virt_to_hvpfn(data), - offset_in_hvpage(data), - skb_headlen(skb), - &pb[slots_used]); + pb[1].offset = offset_in_hvpage(skb->data); + pb[1].len = skb_headlen(skb); + pb[1].pfn = virt_to_hvpfn(skb->data); for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; + struct hv_page_buffer *cur_pb = &pb[i + 2]; + u64 pfn = page_to_hvpfn(skb_frag_page(frag)); + u32 offset = skb_frag_off(frag); - slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)), - skb_frag_off(frag), - skb_frag_size(frag), - &pb[slots_used]); + cur_pb->offset = offset_in_hvpage(offset); + cur_pb->len = skb_frag_size(frag); + cur_pb->pfn = pfn + (offset >> HV_HYP_PAGE_SHIFT); } - return slots_used; + return frags + 2; } static int count_skb_frag_slots(struct sk_buff *skb) @@ -482,7 +449,7 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx) struct net_device *vf_netdev; u32 rndis_msg_size; u32 hash; - struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT]; + struct hv_page_buffer pb[MAX_DATA_RANGES]; /* If VF is present and up then redirect packets to it. * Skip the VF if it is marked down or has no carrier. diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index ecc2128ca9b72..e457f809fe311 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -225,8 +225,7 @@ static int rndis_filter_send_request(struct rndis_device *dev, struct rndis_request *req) { struct hv_netvsc_packet *packet; - struct hv_page_buffer page_buf[2]; - struct hv_page_buffer *pb = page_buf; + struct hv_page_buffer pb; int ret; /* Setup the packet to send it */ @@ -235,27 +234,14 @@ static int rndis_filter_send_request(struct rndis_device *dev, packet->total_data_buflen = req->request_msg.msg_len; packet->page_buf_cnt = 1; - pb[0].pfn = virt_to_phys(&req->request_msg) >> - HV_HYP_PAGE_SHIFT; - pb[0].len = req->request_msg.msg_len; - pb[0].offset = offset_in_hvpage(&req->request_msg); - - /* Add one page_buf when request_msg crossing page boundary */ - if (pb[0].offset + pb[0].len > HV_HYP_PAGE_SIZE) { - packet->page_buf_cnt++; - pb[0].len = HV_HYP_PAGE_SIZE - - pb[0].offset; - pb[1].pfn = virt_to_phys((void *)&req->request_msg - + pb[0].len) >> HV_HYP_PAGE_SHIFT; - pb[1].offset = 0; - pb[1].len = req->request_msg.msg_len - - pb[0].len; - } + pb.pfn = virt_to_phys(&req->request_msg) >> HV_HYP_PAGE_SHIFT; + pb.len = req->request_msg.msg_len; + pb.offset = offset_in_hvpage(&req->request_msg); trace_rndis_send(dev->ndev, 0, &req->request_msg); rcu_read_lock_bh(); - ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL, false); + ret = netvsc_send(dev->ndev, packet, NULL, &pb, NULL, false); rcu_read_unlock_bh(); return ret; From 382ea2f1e6814a0f1b2bcb651a74186430950254 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:12:34 +0000 Subject: [PATCH 10/21] hv_netvsc: Remove rmsg_pgcnt jira LE-3554 commit-author Michael Kelley commit 5bbc644bbf4e97a05bc0cb052189004588ff8a09 init_page_array() now always creates a single page buffer array entry for the rndis message, even if the rndis message crosses a page boundary. 
As such, the number of page buffer array entries used for the rndis message must no longer be tracked -- it is always just 1. Remove the rmsg_pgcnt field and use "1" where the value is needed. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-5-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 5bbc644bbf4e97a05bc0cb052189004588ff8a09) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/net/hyperv/hyperv_net.h | 1 - drivers/net/hyperv/netvsc.c | 7 +++---- drivers/net/hyperv/netvsc_drv.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 3871ecc961dc7..1b90359db64c9 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -157,7 +157,6 @@ struct hv_netvsc_packet { u8 cp_partial; /* partial copy into send buffer */ u8 rmsg_size; /* RNDIS header and PPI size */ - u8 rmsg_pgcnt; /* page count of RNDIS header and PPI */ u8 page_buf_cnt; u16 q_idx; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 1fc80bbe7941d..807465dd4c8e3 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -947,8 +947,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, + pend_size; int i; u32 padding = 0; - u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt : - packet->page_buf_cnt; + u32 page_count = packet->cp_partial ? 1 : packet->page_buf_cnt; u32 remain; /* Add padding */ @@ -1131,7 +1130,7 @@ static inline int netvsc_send_pkt( u32 desc_size; if (packet->cp_partial) - pb += packet->rmsg_pgcnt; + pb++; ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb); if (ret) { @@ -1293,7 +1292,7 @@ int netvsc_send(struct net_device *ndev, packet->send_buf_index = section_index; if (packet->cp_partial) { - packet->page_buf_cnt -= packet->rmsg_pgcnt; + packet->page_buf_cnt--; packet->total_data_buflen = msd_len + packet->rmsg_size; } else { packet->page_buf_cnt = 0; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index f95cb14794ecc..8ec497023224a 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -342,7 +342,6 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, pb[0].len = len; pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = 1; pb[1].offset = offset_in_hvpage(skb->data); pb[1].len = skb_headlen(skb); From 090e9800da97e33851a8629e3a45f5c745cabdcd Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:13:08 +0000 Subject: [PATCH 11/21] Drivers: hv: vmbus: Remove vmbus_sendpacket_pagebuffer() jira LE-3554 commit-author Michael Kelley commit 45a442fe369e6c4e0b4aa9f63b31c3f2f9e2090e With the netvsc driver changed to use vmbus_sendpacket_mpb_desc() instead of vmbus_sendpacket_pagebuffer(), the latter has no remaining callers. Remove it. 
Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-6-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 45a442fe369e6c4e0b4aa9f63b31c3f2f9e2090e) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/hv/channel.c | 59 ------------------------------------------ include/linux/hyperv.h | 7 ----- 2 files changed, 66 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 4ffd5eaa78172..35f26fa1ffe76 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1076,65 +1076,6 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, } EXPORT_SYMBOL(vmbus_sendpacket); -/* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. This interface allows you - * to control notifying the host. This will be useful for sending - * batched data. Also the sender can control the send flags - * explicitly. - */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - int i; - struct vmbus_channel_packet_page_buffer desc; - u32 descsize; - u32 packetlen; - u32 packetlen_aligned; - struct kvec bufferlist[3]; - u64 aligned_data = 0; - - if (pagecount > MAX_PAGE_BUFFER_COUNT) - return -EINVAL; - - /* - * Adjust the size down since vmbus_channel_packet_page_buffer is the - * largest size we support - */ - descsize = sizeof(struct vmbus_channel_packet_page_buffer) - - ((MAX_PAGE_BUFFER_COUNT - pagecount) * - sizeof(struct hv_page_buffer)); - packetlen = descsize + bufferlen; - packetlen_aligned = ALIGN(packetlen, sizeof(u64)); - - /* Setup the descriptor */ - desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ - desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ - desc.reserved = 0; - desc.rangecount = pagecount; - - for (i = 0; i < pagecount; i++) { - desc.range[i].len = pagebuffers[i].len; - desc.range[i].offset = pagebuffers[i].offset; - desc.range[i].pfn = pagebuffers[i].pfn; - } - - bufferlist[0].iov_base = &desc; - bufferlist[0].iov_len = descsize; - bufferlist[1].iov_base = buffer; - bufferlist[1].iov_len = bufferlen; - bufferlist[2].iov_base = &aligned_data; - bufferlist[2].iov_len = (packetlen_aligned - packetlen); - - return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); -} -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); - /* * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 550ad87c38b6c..ccb2ef2fa1e93 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1226,13 +1226,6 @@ extern int vmbus_sendpacket(struct vmbus_channel *channel, enum vmbus_packet_type type, u32 flags); -extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, - void *buffer, - u32 bufferlen, - u64 requestid); - extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *mpb, u32 desc_size, From ce7e9432dca9eb50cc975cb14ef07ff3b7dc7fd5 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 19:37:39 +0000 Subject: [PATCH 12/21] hv_netvsc: Use VF's tso_max_size value when data path is VF jira LE-3885 commit-author Shradha Gupta commit 685920920e3d5f68a8c50107b97747b0f8ce050f On Azure, increasing VF's gso/gro packet size to up-to GSO_MAX_SIZE is not possible without allowing the same for netvsc NIC (as the NICs are bonded together). For bonded NICs, the min of the max aggregated pkt size of the members is propagated in the stack. Therefore, we use netif_set_tso_max_size() to set max aggregated pkt size to VF's packet size for netvsc too, when the data path is switched over to the VF Tested on azure env with Accelerated Networking enabled and disabled. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller (cherry picked from commit 685920920e3d5f68a8c50107b97747b0f8ce050f) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/net/hyperv/hyperv_net.h | 2 ++ drivers/net/hyperv/netvsc_drv.c | 15 +++++++++++++++ drivers/net/hyperv/rndis_filter.c | 13 +++++++------ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 1b90359db64c9..83c51e1bae4bd 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -1176,6 +1176,8 @@ struct netvsc_device { u32 max_chn; u32 num_chn; + u32 netvsc_gso_max_size; + atomic_t open_chn; struct work_struct subchan_work; wait_queue_head_t subchan_open; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 8ec497023224a..2d23e955f8fb6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2427,6 +2427,21 @@ static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event) } else { netdev_info(ndev, "Data path switched %s VF: %s\n", vf_is_up ? "to" : "from", vf_netdev->name); + + /* In Azure, when accelerated networking in enabled, other NICs + * like MANA, MLX, are configured as a bonded nic with + * Netvsc(failover) NIC. For bonded NICs, the min of the max + * pkt aggregate size of the members is propagated in the stack. + * In order to allow these NICs (MANA/MLX) to use up to + * GSO_MAX_SIZE gso packet size, we need to allow Netvsc NIC to + * also support this in the guest. 
+ * This value is only increased for netvsc NIC when datapath is + * switched over to the VF + */ + if (vf_is_up) + netif_set_tso_max_size(ndev, vf_netdev->tso_max_size); + else + netif_set_tso_max_size(ndev, netvsc_dev->netvsc_gso_max_size); } return NOTIFY_OK; diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index e457f809fe311..98549cf1d4138 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1337,9 +1337,10 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, struct net_device_context *net_device_ctx = netdev_priv(net); struct ndis_offload hwcaps; struct ndis_offload_params offloads; - unsigned int gso_max_size = GSO_LEGACY_MAX_SIZE; int ret; + nvdev->netvsc_gso_max_size = GSO_LEGACY_MAX_SIZE; + /* Find HW offload capabilities */ ret = rndis_query_hwcaps(rndis_device, nvdev, &hwcaps); if (ret != 0) @@ -1371,8 +1372,8 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, offloads.lso_v2_ipv4 = NDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; net->hw_features |= NETIF_F_TSO; - if (hwcaps.lsov2.ip4_maxsz < gso_max_size) - gso_max_size = hwcaps.lsov2.ip4_maxsz; + if (hwcaps.lsov2.ip4_maxsz < nvdev->netvsc_gso_max_size) + nvdev->netvsc_gso_max_size = hwcaps.lsov2.ip4_maxsz; } if (hwcaps.csum.ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { @@ -1392,8 +1393,8 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, offloads.lso_v2_ipv6 = NDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; net->hw_features |= NETIF_F_TSO6; - if (hwcaps.lsov2.ip6_maxsz < gso_max_size) - gso_max_size = hwcaps.lsov2.ip6_maxsz; + if (hwcaps.lsov2.ip6_maxsz < nvdev->netvsc_gso_max_size) + nvdev->netvsc_gso_max_size = hwcaps.lsov2.ip6_maxsz; } if (hwcaps.csum.ip6_txcsum & NDIS_TXCSUM_CAP_UDP6) { @@ -1419,7 +1420,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, */ net->features &= ~NETVSC_SUPPORTED_HW_FEATURES | net->hw_features; - netif_set_tso_max_size(net, gso_max_size); + netif_set_tso_max_size(net, nvdev->netvsc_gso_max_size); ret = rndis_filter_set_offload_params(net, nvdev, &offloads); From 4cfa5f5a4f750cd8aa2567b7a6d269732943739c Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 19:23:36 +0000 Subject: [PATCH 13/21] net: mana: Allow tso_max_size to go up-to GSO_MAX_SIZE jira LE-3885 commit-author Shradha Gupta commit 27315836f4bcc8e4879d50dfc1fa6eb41e7952ef Allow the max aggregated pkt size to go up-to GSO_MAX_SIZE for MANA NIC. This patch only increases the max allowable gso/gro pkt size for MANA devices and does not change the defaults. 
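The driver-side change is a single call at probe time; it raises only the ceiling, and userspace can then lift the per-interface value, e.g. with iproute2 (interface name assumed):

    netif_set_tso_max_size(ndev, GSO_MAX_SIZE);  /* raise the cap only */

    $ ip link set dev eth0 gso_max_size 511616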
Following are the perf benefits by increasing the pkt aggregate size
from the legacy gso_max_size value (64K) to the newer one (up-to 511K).

IPv4 tests
for i in {1..10}; do netperf -t TCP_RR -H 10.0.0.5 -p50000 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done

 min   p90   p99   Throughput   gso_max_size
 93    171   194   6594.25
 97    154   180   7183.74
 95    165   189   6927.86
 96    165   188   6976.04
 93    154   185   7338.05      64K
 93    168   189   6938.03
 94    169   189   6784.93
 92    166   189   7117.56
 94    179   191   6678.44
 95    157   183   7277.81

 min   p90   p99   Throughput
 93    134   146   8448.75
 95    134   140   8396.54
 94    137   148   8204.12
 94    137   148   8244.41
 94    128   139   8666.52      80K
 94    141   153   8116.86
 94    138   149   8163.92
 92    135   142   8362.72
 92    134   142   8497.57
 93    136   148   8393.23

IPv6 tests
for i in {1..10}; do netperf -t TCP_RR -H fd00:9013:cadd::4 -p50000 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done

 min   p90   p99   Throughput   gso_max_size
 108   165   170   6673.2
 101   169   189   6451.69
 101   165   169   6737.65
 102   167   175   6614.64
 101   178   189   6247.13      64K
 107   163   169   6678.63
 106   176   187   6350.86
 100   164   169   6617.36
 102   163   170   6849.21
 102   168   175   6605.7

 min   p90   p99   Throughput
 108   155   166   7183
 110   154   163   7268.87
 109   152   159   7434.35
 107   145   157   7569.15
 107   149   164   7496.17      80K
 110   154   159   7245.85
 108   156   162   7266.24
 109   145   158   7526.66
 106   145   151   7785.75
 111   148   157   7246.65

Tested on azure env with Accelerated Networking enabled and disabled.

Signed-off-by: Shradha Gupta
Reviewed-by: Haiyang Zhang
Signed-off-by: David S. Miller
(cherry picked from commit 27315836f4bcc8e4879d50dfc1fa6eb41e7952ef)
Signed-off-by: Shreeya Patel
Signed-off-by: Jonathan Maple
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 7f125c7289ebf..cbaa23c614694 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -255,6 +255,9 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	if (skb_cow_head(skb, MANA_HEADROOM))
 		goto tx_drop_count;
 
+	if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
+		goto tx_drop_count;
+
 	txq = &apc->tx_qp[txq_idx].txq;
 	gdma_sq = txq->gdma_sq;
 	cq = &apc->tx_qp[txq_idx].tx_cq;
@@ -2869,6 +2872,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	ndev->dev_port = port_idx;
 	SET_NETDEV_DEV(ndev, gc->dev);
 
+	netif_set_tso_max_size(ndev, GSO_MAX_SIZE);
+
 	netif_carrier_off(ndev);
 
 	netdev_rss_key_fill(apc->hashkey, MANA_HASH_KEY_SIZE);

From 9d4aa4fcec69780bdf2550f150802eba90e2d863 Mon Sep 17 00:00:00 2001
From: Shreeya Patel
Date: Mon, 25 Aug 2025 19:26:07 +0000
Subject: [PATCH 14/21] net: mana: Add debug logs in MANA network driver

jira LE-3889
commit-author Erni Sri Satya Vennela
commit 47dfd7a72257e91171d56e220ea484a04df89847

Add more logs to assist in debugging and monitoring driver behaviour, making it easier to identify potential issues during development and testing.
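One detail worth noting in the hunks below: logs on per-packet paths are wrapped in net_ratelimit(), so a burst of DMA-mapping failures cannot flood the console (kernel-style sketch, context elided):

    if (net_ratelimit())
            netdev_err(apc->ndev, "Failed to map skb of size %u to DMA\n",
                       skb->len);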
Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Link: https://patch.msgid.link/1739842455-23899-1-git-send-email-ernis@linux.microsoft.com Signed-off-by: Jakub Kicinski (cherry picked from commit 47dfd7a72257e91171d56e220ea484a04df89847) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- .../net/ethernet/microsoft/mana/gdma_main.c | 50 +++++++++++++--- .../net/ethernet/microsoft/mana/hw_channel.c | 6 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 58 +++++++++++++++---- 3 files changed, 94 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 0df906f2cdf30..074e00987320f 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -667,8 +667,11 @@ int mana_gd_create_hwc_queue(struct gdma_dev *gd, gmi = &queue->mem_info; err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); - if (err) + if (err) { + dev_err(gc->dev, "GDMA queue type: %d, size: %u, gdma memory allocation err: %d\n", + spec->type, spec->queue_size, err); goto free_q; + } queue->head = 0; queue->tail = 0; @@ -689,6 +692,8 @@ int mana_gd_create_hwc_queue(struct gdma_dev *gd, *queue_ptr = queue; return 0; out: + dev_err(gc->dev, "Failed to create queue type %d of size %u, err: %d\n", + spec->type, spec->queue_size, err); mana_gd_free_memory(gmi); free_q: kfree(queue); @@ -771,7 +776,13 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, } gmi->dma_region_handle = resp.dma_region_handle; + dev_dbg(gc->dev, "Created DMA region handle 0x%llx\n", + gmi->dma_region_handle); out: + if (err) + dev_dbg(gc->dev, + "Failed to create DMA region of length: %u, page_type: %d, status: 0x%x, err: %d\n", + length, req->gdma_page_type, resp.hdr.status, err); kfree(req); return err; } @@ -794,8 +805,11 @@ int mana_gd_create_mana_eq(struct gdma_dev *gd, gmi = &queue->mem_info; err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); - if (err) + if (err) { + dev_err(gc->dev, "GDMA queue type: %d, size: %u, gdma memory allocation err: %d\n", + spec->type, spec->queue_size, err); goto free_q; + } err = mana_gd_create_dma_region(gd, gmi); if (err) @@ -816,6 +830,8 @@ int mana_gd_create_mana_eq(struct gdma_dev *gd, *queue_ptr = queue; return 0; out: + dev_err(gc->dev, "Failed to create queue type %d of size: %u, err: %d\n", + spec->type, spec->queue_size, err); mana_gd_free_memory(gmi); free_q: kfree(queue); @@ -842,8 +858,11 @@ int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, gmi = &queue->mem_info; err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); - if (err) + if (err) { + dev_err(gc->dev, "GDMA queue type: %d, size: %u, memory allocation err: %d\n", + spec->type, spec->queue_size, err); goto free_q; + } err = mana_gd_create_dma_region(gd, gmi); if (err) @@ -863,6 +882,8 @@ int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, *queue_ptr = queue; return 0; out: + dev_err(gc->dev, "Failed to create queue type %d of size: %u, err: %d\n", + spec->type, spec->queue_size, err); mana_gd_free_memory(gmi); free_q: kfree(queue); @@ -1158,8 +1179,11 @@ int mana_gd_post_and_ring(struct gdma_queue *queue, int err; err = mana_gd_post_work_request(queue, wqe_req, wqe_info); - if (err) + if (err) { + dev_err(gc->dev, "Failed to post work req from queue type %d of size %u (err=%d)\n", + queue->type, queue->queue_size, err); return err; + } mana_gd_wq_ring_doorbell(gc, queue); @@ -1436,8 +1460,10 @@ static int mana_gd_setup(struct pci_dev *pdev) 
mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base); err = mana_gd_setup_irqs(pdev); - if (err) + if (err) { + dev_err(gc->dev, "Failed to setup IRQs: %d\n", err); return err; + } err = mana_hwc_create_channel(gc); if (err) @@ -1455,12 +1481,14 @@ static int mana_gd_setup(struct pci_dev *pdev) if (err) goto destroy_hwc; + dev_dbg(&pdev->dev, "mana gdma setup successful\n"); return 0; destroy_hwc: mana_hwc_destroy_channel(gc); remove_irq: mana_gd_remove_irqs(pdev); + dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err); return err; } @@ -1471,6 +1499,7 @@ static void mana_gd_cleanup(struct pci_dev *pdev) mana_hwc_destroy_channel(gc); mana_gd_remove_irqs(pdev); + dev_dbg(&pdev->dev, "mana gdma cleanup successful\n"); } static bool mana_is_pf(unsigned short dev_id) @@ -1489,8 +1518,10 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) BUILD_BUG_ON(2 * MAX_PORTS_IN_MANA_DEV * GDMA_EQE_SIZE > EQ_SIZE); err = pci_enable_device(pdev); - if (err) + if (err) { + dev_err(&pdev->dev, "Failed to enable pci device (err=%d)\n", err); return -ENXIO; + } pci_set_master(pdev); @@ -1499,9 +1530,10 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto disable_dev; err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (err) + if (err) { + dev_err(&pdev->dev, "DMA set mask failed: %d\n", err); goto release_region; - + } dma_set_max_seg_size(&pdev->dev, UINT_MAX); err = -ENOMEM; @@ -1579,6 +1611,8 @@ static void mana_gd_remove(struct pci_dev *pdev) pci_release_regions(pdev); pci_disable_device(pdev); + + dev_dbg(&pdev->dev, "mana gdma remove successful\n"); } /* The 'state' parameter is not used. */ diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 48b899834d0aa..98bf4ecf2d950 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -439,7 +439,8 @@ static int mana_hwc_alloc_dma_buf(struct hw_channel_context *hwc, u16 q_depth, gmi = &dma_buf->mem_info; err = mana_gd_alloc_memory(gc, buf_size, gmi); if (err) { - dev_err(hwc->dev, "Failed to allocate DMA buffer: %d\n", err); + dev_err(hwc->dev, "Failed to allocate DMA buffer size: %u, err %d\n", + buf_size, err); goto out; } @@ -528,6 +529,9 @@ static int mana_hwc_create_wq(struct hw_channel_context *hwc, out: if (err) mana_hwc_destroy_wq(hwc, hwc_wq); + + dev_err(hwc->dev, "Failed to create HWC queue size= %u type= %d err= %d\n", + queue_size, q_type, err); return err; } diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index cbaa23c614694..91bba8fb2b7c3 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -51,10 +51,12 @@ static int mana_open(struct net_device *ndev) { struct mana_port_context *apc = netdev_priv(ndev); int err; - err = mana_alloc_queues(ndev); - if (err) + + if (err) { + netdev_err(ndev, "%s failed to allocate queues: %d\n", __func__, err); return err; + } apc->port_is_up = true; @@ -63,7 +65,7 @@ static int mana_open(struct net_device *ndev) netif_carrier_on(ndev); netif_tx_wake_all_queues(ndev); - + netdev_dbg(ndev, "%s successful\n", __func__); return 0; } @@ -175,6 +177,9 @@ static int mana_map_skb(struct sk_buff *skb, struct mana_port_context *apc, return 0; frag_err: + if (net_ratelimit()) + netdev_err(apc->ndev, "Failed to map skb of size %u to DMA\n", + skb->len); for (i = sg_i - 1; i >= hsg; i--) 
dma_unmap_page(dev, ash->dma_handle[i], ash->size[i], DMA_TO_DEVICE); @@ -685,6 +690,7 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_qu return 0; error: + netdev_err(mpc->ndev, "Failed to pre-allocate RX buffers for %d queues\n", num_queues); mana_pre_dealloc_rxbufs(mpc); return -ENOMEM; } @@ -1307,8 +1313,10 @@ static int mana_create_eq(struct mana_context *ac) for (i = 0; i < gc->max_num_queues; i++) { spec.eq.msix_index = (i + 1) % gc->num_msix_usable; err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq); - if (err) + if (err) { + dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err); goto out; + } mana_create_eq_debugfs(ac, i); } @@ -2078,6 +2086,8 @@ static int mana_create_txq(struct mana_port_context *apc, return 0; out: + netdev_err(net, "Failed to create %d TX queues, %d\n", + apc->num_queues, err); mana_destroy_txq(apc); return err; } @@ -2414,6 +2424,7 @@ static int mana_add_rx_queues(struct mana_port_context *apc, rxq = mana_create_rxq(apc, i, &ac->eqs[i], ndev); if (!rxq) { err = -ENOMEM; + netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err); goto out; } @@ -2660,12 +2671,18 @@ int mana_alloc_queues(struct net_device *ndev) int err; err = mana_create_vport(apc, ndev); - if (err) + if (err) { + netdev_err(ndev, "Failed to create vPort %u : %d\n", apc->port_idx, err); return err; + } err = netif_set_real_num_tx_queues(ndev, apc->num_queues); - if (err) + if (err) { + netdev_err(ndev, + "netif_set_real_num_tx_queues () failed for ndev with num_queues %u : %d\n", + apc->num_queues, err); goto destroy_vport; + } err = mana_add_rx_queues(apc, ndev); if (err) @@ -2674,14 +2691,20 @@ int mana_alloc_queues(struct net_device *ndev) apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE; err = netif_set_real_num_rx_queues(ndev, apc->num_queues); - if (err) + if (err) { + netdev_err(ndev, + "netif_set_real_num_rx_queues () failed for ndev with num_queues %u : %d\n", + apc->num_queues, err); goto destroy_vport; + } mana_rss_table_init(apc); err = mana_config_rss(apc, TRI_STATE_TRUE, true, true); - if (err) + if (err) { + netdev_err(ndev, "Failed to configure RSS table: %d\n", err); goto destroy_vport; + } if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) { err = mana_pf_register_filter(apc); @@ -2822,8 +2845,10 @@ int mana_detach(struct net_device *ndev, bool from_close) if (apc->port_st_save) { err = mana_dealloc_queues(ndev); - if (err) + if (err) { + netdev_err(ndev, "%s failed to deallocate queues: %d\n", __func__, err); return err; + } } if (!from_close) { @@ -2969,6 +2994,8 @@ static int add_adev(struct gdma_dev *gd) goto add_fail; gd->adev = adev; + dev_dbg(gd->gdma_context->dev, + "Auxiliary device added successfully\n"); return 0; add_fail: @@ -3011,8 +3038,10 @@ int mana_probe(struct gdma_dev *gd, bool resuming) } err = mana_create_eq(ac); - if (err) + if (err) { + dev_err(dev, "Failed to create EQs: %d\n", err); goto out; + } err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, MANA_MICRO_VERSION, &num_ports, &bm_hostmode); @@ -3070,8 +3099,14 @@ int mana_probe(struct gdma_dev *gd, bool resuming) err = add_adev(gd); out: - if (err) + if (err) { mana_remove(gd, false); + } else { + dev_dbg(dev, "gd=%p, id=%u, num_ports=%d, type=%u, instance=%u\n", + gd, gd->dev_id.as_uint32, ac->num_ports, + gd->dev_id.type, gd->dev_id.instance); + dev_dbg(dev, "%s succeeded\n", __func__); + } return err; } @@ -3133,6 +3168,7 @@ void mana_remove(struct gdma_dev *gd, bool suspending) gd->driver_data = NULL; 
gd->gdma_context = NULL; kfree(ac); + dev_dbg(dev, "%s succeeded\n", __func__); } struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index) From 1cb1d9269e44f6adf21e023c7c804af378d24a99 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Thu, 28 Aug 2025 10:20:41 +0000 Subject: [PATCH 15/21] net: mana: Change the function signature of mana_get_primary_netdev_rcu jira LE-3893 commit-author Long Li commit a8445cfec101c42e9d64cdb2dac13973b22c205c Change mana_get_primary_netdev_rcu() to mana_get_primary_netdev(), and return the ndev with refcount held. The caller is responsible for dropping the refcount. Also drop the check for IFF_SLAVE as it is not necessary if the upper device is present. Signed-off-by: Long Li Link: https://patch.msgid.link/1741821332-9392-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky (cherry picked from commit a8445cfec101c42e9d64cdb2dac13973b22c205c) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/infiniband/hw/mana/device.c | 7 +++--- drivers/infiniband/hw/mana/mana_ib.h | 1 + drivers/net/ethernet/microsoft/mana/mana_en.c | 22 ++++++++++++------- include/net/mana/mana.h | 4 +++- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 50437272b7b96..16442d53ec8e3 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -85,10 +85,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues; dev->ib_dev.dev.parent = mdev->gdma_context->dev; - rcu_read_lock(); /* required to get primary netdev */ - ndev = mana_get_primary_netdev_rcu(mc, 0); + ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); if (!ndev) { - rcu_read_unlock(); ret = -ENODEV; ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1"); goto free_ib_device; @@ -96,7 +94,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, ether_addr_copy(mac_addr, ndev->dev_addr); addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr); ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1); - rcu_read_unlock(); + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); if (ret) { ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); goto free_ib_device; diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index b53a5b4de908d..2638688f25057 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -64,6 +64,7 @@ struct mana_ib_dev { struct gdma_queue **eqs; struct xarray qp_table_wq; struct mana_ib_adapter_caps adapter_caps; + netdevice_tracker dev_tracker; }; struct mana_ib_wq { diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 91bba8fb2b7c3..a2c0ad4c62fb2 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -3171,21 +3171,27 @@ void mana_remove(struct gdma_dev *gd, bool suspending) dev_dbg(dev, "%s succeeded\n", __func__); } -struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index) +struct net_device *mana_get_primary_netdev(struct mana_context *ac, + u32 port_index, + netdevice_tracker *tracker) { struct net_device *ndev; - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "Taking primary netdev without holding the RCU read lock"); if (port_index >= ac->num_ports) 
		return NULL;

-	/* When mana is used in netvsc, the upper netdevice should be returned. */
-	if (ac->ports[port_index]->flags & IFF_SLAVE)
-		ndev = netdev_master_upper_dev_get_rcu(ac->ports[port_index]);
-	else
+	rcu_read_lock();
+
+	/* If mana is used in netvsc, the upper netdevice should be returned. */
+	ndev = netdev_master_upper_dev_get_rcu(ac->ports[port_index]);
+
+	/* If there is no upper device, use the parent Ethernet device */
+	if (!ndev)
 		ndev = ac->ports[port_index];
 
+	netdev_hold(ndev, tracker, GFP_ATOMIC);
+	rcu_read_unlock();
+
 	return ndev;
 }
-EXPORT_SYMBOL_NS(mana_get_primary_netdev_rcu, NET_MANA);
+EXPORT_SYMBOL_NS(mana_get_primary_netdev, NET_MANA);
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index e74415c833791..bf573c8a62819 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -826,5 +826,7 @@ int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
 		   u32 doorbell_pg_id);
 void mana_uncfg_vport(struct mana_port_context *apc);
 
-struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index);
+struct net_device *mana_get_primary_netdev(struct mana_context *ac,
+					   u32 port_index,
+					   netdevice_tracker *tracker);
 #endif /* _MANA_H */

From 73f058c61222d3f886afd4e42dd153c7eb2cfc11 Mon Sep 17 00:00:00 2001
From: Shreeya Patel
Date: Thu, 28 Aug 2025 10:25:45 +0000
Subject: [PATCH 16/21] RDMA/mana_ib: Handle net event for pointing to the
 current netdev

jira LE-3893
commit-author Long Li
commit bee35b7161aaaed9831e2f14876c374b9c566952
upstream-diff There were conflicts when applying this patch due to the
following missing commits :-
79bccd746132 ("RDMA/mana_ib: Add port statistics support")
df91c470d9e5 ("RDMA/mana_ib: create/destroy AH")

When running under Hyper-V, the master device to the RDMA device is
always bonded to this RDMA device. This is not user-configurable. The
master device can be unbound and rebound from the kernel. During those
events, the RDMA device should be set to the current netdev to reflect
the change of master device from those events.

Signed-off-by: Long Li
Link: https://patch.msgid.link/1741821332-9392-2-git-send-email-longli@linuxonhyperv.com
Signed-off-by: Leon Romanovsky
(cherry picked from commit bee35b7161aaaed9831e2f14876c374b9c566952)
Signed-off-by: Shreeya Patel
Signed-off-by: Jonathan Maple
---
 drivers/infiniband/hw/mana/device.c  | 47 ++++++++++++++++++++++++++--
 drivers/infiniband/hw/mana/mana_ib.h |  1 +
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index 16442d53ec8e3..181717fd4fa05 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -52,6 +52,38 @@ static const struct ib_device_ops mana_ib_dev_ops = {
 					   ib_ind_table),
 };
 
+static int mana_ib_netdev_event(struct notifier_block *this,
+				unsigned long event, void *ptr)
+{
+	struct mana_ib_dev *dev = container_of(this, struct mana_ib_dev, nb);
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+	struct gdma_context *gc = dev->gdma_dev->gdma_context;
+	struct mana_context *mc = gc->mana.driver_data;
+	struct net_device *ndev;
+
+	/* Only process events from our parent device */
+	if (event_dev != mc->ports[0])
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker);
+		/*
+		 * RDMA core will setup GID based on updated netdev.
+ * It's not possible to race with the core as rtnl lock is being + * held. + */ + ib_device_set_netdev(&dev->ib_dev, ndev, 1); + + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int mana_ib_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -109,17 +141,25 @@ static int mana_ib_probe(struct auxiliary_device *adev, } dev->gdma_dev = &mdev->gdma_context->mana_ib; + dev->nb.notifier_call = mana_ib_netdev_event; + ret = register_netdevice_notifier(&dev->nb); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d", + ret); + goto deregister_device; + } + ret = mana_ib_gd_query_adapter_caps(dev); if (ret) { ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", ret); - goto deregister_device; + goto deregister_net_notifier; } ret = mana_ib_create_eqs(dev); if (ret) { ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); - goto deregister_device; + goto deregister_net_notifier; } ret = mana_ib_gd_create_rnic_adapter(dev); @@ -148,6 +188,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, mana_ib_gd_destroy_rnic_adapter(dev); destroy_eqs: mana_ib_destroy_eqs(dev); +deregister_net_notifier: + unregister_netdevice_notifier(&dev->nb); deregister_device: mana_gd_deregister_device(dev->gdma_dev); free_ib_device: @@ -163,6 +205,7 @@ static void mana_ib_remove(struct auxiliary_device *adev) xa_destroy(&dev->qp_table_wq); mana_ib_gd_destroy_rnic_adapter(dev); mana_ib_destroy_eqs(dev); + unregister_netdevice_notifier(&dev->nb); mana_gd_deregister_device(dev->gdma_dev); ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 2638688f25057..bb9c6b1af24e1 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -65,6 +65,7 @@ struct mana_ib_dev { struct xarray qp_table_wq; struct mana_ib_adapter_caps adapter_caps; netdevice_tracker dev_tracker; + struct notifier_block nb; }; struct mana_ib_wq { From d4756c1959368178c01ff79125fa94a81143dca8 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Thu, 28 Aug 2025 10:39:32 +0000 Subject: [PATCH 17/21] net: mana: Support holes in device list reply msg jira LE-3903 commit-author Haiyang Zhang commit 2fc8a346625eb1abfe202062c7e6a13d76cde5ea According to GDMA protocol, holes (zeros) are allowed at the beginning or middle of the gdma_list_devices_resp message. The existing code cannot properly handle this, and may miss some devices in the list. To fix, scan the entire list until the num_of_devs are found, or until the end of the list. 
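For illustration, the scan described above reduces to the following loop
(a minimal standalone sketch, using the same GDMA_DEV_LIST_SIZE bound and
zero check that the hunk below introduces; resp is the
struct gdma_list_devices_resp returned by the device):

	int found_dev = 0;
	u32 i;

	/* Walk the fixed-size reply array, skipping zeroed (hole) entries,
	 * until all num_of_devs real devices have been seen.
	 */
	for (i = 0; i < GDMA_DEV_LIST_SIZE && found_dev < resp.num_of_devs; i++) {
		struct gdma_dev_id dev = resp.devs[i];

		if (dev.as_uint32 == 0)	/* hole: skip it */
			continue;

		found_dev++;
		/* ... handle/probe this device ... */
	}
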
Cc: stable@vger.kernel.org Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Signed-off-by: Haiyang Zhang Reviewed-by: Long Li Reviewed-by: Shradha Gupta Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/1741723974-1534-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Paolo Abeni (cherry picked from commit 2fc8a346625eb1abfe202062c7e6a13d76cde5ea) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 14 ++++++++++---- include/net/mana/gdma.h | 11 +++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 074e00987320f..0c5e302e75c98 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -135,9 +135,10 @@ static int mana_gd_detect_devices(struct pci_dev *pdev) struct gdma_list_devices_resp resp = {}; struct gdma_general_req req = {}; struct gdma_dev_id dev; - u32 i, max_num_devs; + int found_dev = 0; u16 dev_type; int err; + u32 i; mana_gd_init_req_hdr(&req.hdr, GDMA_LIST_DEVICES, sizeof(req), sizeof(resp)); @@ -149,12 +150,17 @@ static int mana_gd_detect_devices(struct pci_dev *pdev) return err ? err : -EPROTO; } - max_num_devs = min_t(u32, MAX_NUM_GDMA_DEVICES, resp.num_of_devs); - - for (i = 0; i < max_num_devs; i++) { + for (i = 0; i < GDMA_DEV_LIST_SIZE && + found_dev < resp.num_of_devs; i++) { dev = resp.devs[i]; dev_type = dev.type; + /* Skip empty devices */ + if (dev.as_uint32 == 0) + continue; + + found_dev++; + /* HWC is already detected in mana_hwc_create_channel(). */ if (dev_type == GDMA_DEVICE_HWC) continue; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 90f56656b5724..62e9d76738628 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -408,8 +408,6 @@ struct gdma_context { struct gdma_dev mana_ib; }; -#define MAX_NUM_GDMA_DEVICES 4 - static inline bool mana_gd_is_mana(struct gdma_dev *gd) { return gd->dev_id.type == GDMA_DEVICE_MANA; @@ -556,11 +554,15 @@ enum { #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5) +/* Driver can handle holes (zeros) in the device list */ +#define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ - GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT) + GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) #define GDMA_DRV_CAP_FLAGS2 0 @@ -621,11 +623,12 @@ struct gdma_query_max_resources_resp { }; /* HW DATA */ /* GDMA_LIST_DEVICES */ +#define GDMA_DEV_LIST_SIZE 64 struct gdma_list_devices_resp { struct gdma_resp_hdr hdr; u32 num_of_devs; u32 reserved; - struct gdma_dev_id devs[64]; + struct gdma_dev_id devs[GDMA_DEV_LIST_SIZE]; }; /* HW DATA */ /* GDMA_REGISTER_DEVICE */ From 234205412f34a24a392f0654f833f86f3423adbd Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Thu, 28 Aug 2025 10:41:54 +0000 Subject: [PATCH 18/21] net: mana: Switch to page pool for jumbo frames jira LE-3907 commit-author Haiyang Zhang commit fa37a8849634db2dd3545116873da8cf4b1e67c6 Frag allocators, such as netdev_alloc_frag(), were not designed to work for fragsz > PAGE_SIZE. So, switch to page pool for jumbo frames instead of using page frag allocators. 
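As a rough sketch of the allocation model this switches to (assuming a
struct page_pool_params pprm already filled in as in mana_create_page_pool(),
local struct page_pool *pool and struct page *page, and alloc_size being the
per-buffer RX allocation size; the order field is what lets one pool
allocation cover a whole jumbo buffer):

	pprm.order = get_order(alloc_size);	/* > 0 for jumbo MTUs */

	pool = page_pool_create(&pprm);
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	/* Hands back 2^order physically contiguous pages */
	page = page_pool_dev_alloc_pages(pool);
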
This driver is using page pool for smaller MTUs already. Cc: stable@vger.kernel.org Fixes: 80f6215b450e ("net: mana: Add support for jumbo frame") Signed-off-by: Haiyang Zhang Reviewed-by: Long Li Reviewed-by: Shradha Gupta Link: https://patch.msgid.link/1742920357-27263-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Jakub Kicinski (cherry picked from commit fa37a8849634db2dd3545116873da8cf4b1e67c6) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/net/ethernet/microsoft/mana/mana_en.c | 46 ++++--------------- 1 file changed, 9 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index a2c0ad4c62fb2..ee08f30c45348 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -655,30 +655,16 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_qu mpc->rxbpre_total = 0; for (i = 0; i < num_rxb; i++) { - if (mpc->rxbpre_alloc_size > PAGE_SIZE) { - va = netdev_alloc_frag(mpc->rxbpre_alloc_size); - if (!va) - goto error; - - page = virt_to_head_page(va); - /* Check if the frag falls back to single page */ - if (compound_order(page) < - get_order(mpc->rxbpre_alloc_size)) { - put_page(page); - goto error; - } - } else { - page = dev_alloc_page(); - if (!page) - goto error; + page = dev_alloc_pages(get_order(mpc->rxbpre_alloc_size)); + if (!page) + goto error; - va = page_to_virt(page); - } + va = page_to_virt(page); da = dma_map_single(dev, va + mpc->rxbpre_headroom, mpc->rxbpre_datasize, DMA_FROM_DEVICE); if (dma_mapping_error(dev, da)) { - put_page(virt_to_head_page(va)); + put_page(page); goto error; } @@ -1671,7 +1657,7 @@ static void mana_rx_skb(void *buf_va, bool from_pool, } static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev, - dma_addr_t *da, bool *from_pool, bool is_napi) + dma_addr_t *da, bool *from_pool) { struct page *page; void *va; @@ -1682,21 +1668,6 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev, if (rxq->xdp_save_va) { va = rxq->xdp_save_va; rxq->xdp_save_va = NULL; - } else if (rxq->alloc_size > PAGE_SIZE) { - if (is_napi) - va = napi_alloc_frag(rxq->alloc_size); - else - va = netdev_alloc_frag(rxq->alloc_size); - - if (!va) - return NULL; - - page = virt_to_head_page(va); - /* Check if the frag falls back to single page */ - if (compound_order(page) < get_order(rxq->alloc_size)) { - put_page(page); - return NULL; - } } else { page = page_pool_dev_alloc_pages(rxq->page_pool); if (!page) @@ -1729,7 +1700,7 @@ static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq, dma_addr_t da; void *va; - va = mana_get_rxfrag(rxq, dev, &da, &from_pool, true); + va = mana_get_rxfrag(rxq, dev, &da, &from_pool); if (!va) return; @@ -2165,7 +2136,7 @@ static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key, if (mpc->rxbufs_pre) va = mana_get_rxbuf_pre(rxq, &da); else - va = mana_get_rxfrag(rxq, dev, &da, &from_pool, false); + va = mana_get_rxfrag(rxq, dev, &da, &from_pool); if (!va) return -ENOMEM; @@ -2251,6 +2222,7 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc) pprm.nid = gc->numa_node; pprm.napi = &rxq->rx_cq.napi; pprm.netdev = rxq->ndev; + pprm.order = get_order(rxq->alloc_size); rxq->page_pool = page_pool_create(&pprm); From d78ae8f565700d0de313bfb65df758c317b3c594 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 3 Sep 2025 11:51:50 +0000 Subject: [PATCH 19/21] net: mana: Expose 
 additional hardware counters for drop and TC via ethtool.

jira LE-3915
commit-author Dipayaan Roy
commit c09ef59e17c6921c577d54bc8da4331b955d01a7

Add support for reporting additional hardware counters for drop and TC
using the ethtool -S interface. These counters include:

- Aggregate Rx/Tx drop counters
- Per-TC Rx/Tx packet counters
- Per-TC Rx/Tx byte counters
- Per-TC Rx/Tx pause frame counters

The counters are exposed using ethtool_ops->get_ethtool_stats and
ethtool_ops->get_strings. These counters are not available on all
hardware versions.

Signed-off-by: Dipayaan Roy
Reviewed-by: Subbaraya Sundeep
Reviewed-by: Haiyang Zhang
Link: https://patch.msgid.link/20250609100103.GA7102@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net
Signed-off-by: Jakub Kicinski
(cherry picked from commit c09ef59e17c6921c577d54bc8da4331b955d01a7)
Signed-off-by: Shreeya Patel
Signed-off-by: Jonathan Maple
---
 .../net/ethernet/microsoft/mana/hw_channel.c  |   6 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c |  87 +++++++++++-
 .../ethernet/microsoft/mana/mana_ethtool.c    |  76 +++++++++-
 include/net/mana/mana.h                       | 131 ++++++++++++++++++
 4 files changed, 292 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 98bf4ecf2d950..df8b52c991d86 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2021, Microsoft Corporation. */
 
 #include <net/mana/gdma.h>
+#include <net/mana/mana.h>
 #include <net/mana/hw_channel.h>
 
 static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id)
@@ -870,8 +871,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 	}
 
 	if (ctx->status_code && ctx->status_code != GDMA_STATUS_MORE_ENTRIES) {
-		dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n",
-			ctx->status_code);
+		if (req_msg->req.msg_type != MANA_QUERY_PHY_STAT)
+			dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n",
+				ctx->status_code);
 		err = -EPROTO;
 		goto out;
 	}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ee08f30c45348..cbecacf503422 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -768,8 +768,9 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
 
 	err = mana_gd_send_request(gc, in_len, in_buf, out_len, out_buf);
 	if (err || resp->status) {
-		dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
-			err, resp->status);
+		if (req->req.msg_type != MANA_QUERY_PHY_STAT)
+			dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
+				err, resp->status);
 		return err ?
err : -EPROTO; } @@ -2595,6 +2596,88 @@ void mana_query_gf_stats(struct mana_port_context *apc) apc->eth_stats.hc_tx_err_gdma = resp.tx_err_gdma; } +void mana_query_phy_stats(struct mana_port_context *apc) +{ + struct mana_query_phy_stat_resp resp = {}; + struct mana_query_phy_stat_req req = {}; + struct net_device *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_PHY_STAT, + sizeof(req), sizeof(resp)); + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) + return; + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_PHY_STAT, + sizeof(resp)); + if (err || resp.hdr.status) { + netdev_err(ndev, + "Failed to query PHY stats: %d, resp:0x%x\n", + err, resp.hdr.status); + return; + } + + /* Aggregate drop counters */ + apc->phy_stats.rx_pkt_drop_phy = resp.rx_pkt_drop_phy; + apc->phy_stats.tx_pkt_drop_phy = resp.tx_pkt_drop_phy; + + /* Per TC traffic Counters */ + apc->phy_stats.rx_pkt_tc0_phy = resp.rx_pkt_tc0_phy; + apc->phy_stats.tx_pkt_tc0_phy = resp.tx_pkt_tc0_phy; + apc->phy_stats.rx_pkt_tc1_phy = resp.rx_pkt_tc1_phy; + apc->phy_stats.tx_pkt_tc1_phy = resp.tx_pkt_tc1_phy; + apc->phy_stats.rx_pkt_tc2_phy = resp.rx_pkt_tc2_phy; + apc->phy_stats.tx_pkt_tc2_phy = resp.tx_pkt_tc2_phy; + apc->phy_stats.rx_pkt_tc3_phy = resp.rx_pkt_tc3_phy; + apc->phy_stats.tx_pkt_tc3_phy = resp.tx_pkt_tc3_phy; + apc->phy_stats.rx_pkt_tc4_phy = resp.rx_pkt_tc4_phy; + apc->phy_stats.tx_pkt_tc4_phy = resp.tx_pkt_tc4_phy; + apc->phy_stats.rx_pkt_tc5_phy = resp.rx_pkt_tc5_phy; + apc->phy_stats.tx_pkt_tc5_phy = resp.tx_pkt_tc5_phy; + apc->phy_stats.rx_pkt_tc6_phy = resp.rx_pkt_tc6_phy; + apc->phy_stats.tx_pkt_tc6_phy = resp.tx_pkt_tc6_phy; + apc->phy_stats.rx_pkt_tc7_phy = resp.rx_pkt_tc7_phy; + apc->phy_stats.tx_pkt_tc7_phy = resp.tx_pkt_tc7_phy; + + /* Per TC byte Counters */ + apc->phy_stats.rx_byte_tc0_phy = resp.rx_byte_tc0_phy; + apc->phy_stats.tx_byte_tc0_phy = resp.tx_byte_tc0_phy; + apc->phy_stats.rx_byte_tc1_phy = resp.rx_byte_tc1_phy; + apc->phy_stats.tx_byte_tc1_phy = resp.tx_byte_tc1_phy; + apc->phy_stats.rx_byte_tc2_phy = resp.rx_byte_tc2_phy; + apc->phy_stats.tx_byte_tc2_phy = resp.tx_byte_tc2_phy; + apc->phy_stats.rx_byte_tc3_phy = resp.rx_byte_tc3_phy; + apc->phy_stats.tx_byte_tc3_phy = resp.tx_byte_tc3_phy; + apc->phy_stats.rx_byte_tc4_phy = resp.rx_byte_tc4_phy; + apc->phy_stats.tx_byte_tc4_phy = resp.tx_byte_tc4_phy; + apc->phy_stats.rx_byte_tc5_phy = resp.rx_byte_tc5_phy; + apc->phy_stats.tx_byte_tc5_phy = resp.tx_byte_tc5_phy; + apc->phy_stats.rx_byte_tc6_phy = resp.rx_byte_tc6_phy; + apc->phy_stats.tx_byte_tc6_phy = resp.tx_byte_tc6_phy; + apc->phy_stats.rx_byte_tc7_phy = resp.rx_byte_tc7_phy; + apc->phy_stats.tx_byte_tc7_phy = resp.tx_byte_tc7_phy; + + /* Per TC pause Counters */ + apc->phy_stats.rx_pause_tc0_phy = resp.rx_pause_tc0_phy; + apc->phy_stats.tx_pause_tc0_phy = resp.tx_pause_tc0_phy; + apc->phy_stats.rx_pause_tc1_phy = resp.rx_pause_tc1_phy; + apc->phy_stats.tx_pause_tc1_phy = resp.tx_pause_tc1_phy; + apc->phy_stats.rx_pause_tc2_phy = resp.rx_pause_tc2_phy; + apc->phy_stats.tx_pause_tc2_phy = resp.tx_pause_tc2_phy; + apc->phy_stats.rx_pause_tc3_phy = resp.rx_pause_tc3_phy; + apc->phy_stats.tx_pause_tc3_phy = resp.tx_pause_tc3_phy; + apc->phy_stats.rx_pause_tc4_phy = resp.rx_pause_tc4_phy; + apc->phy_stats.tx_pause_tc4_phy = resp.tx_pause_tc4_phy; + apc->phy_stats.rx_pause_tc5_phy = resp.rx_pause_tc5_phy; + apc->phy_stats.tx_pause_tc5_phy = resp.tx_pause_tc5_phy; + apc->phy_stats.rx_pause_tc6_phy = resp.rx_pause_tc6_phy; 
+ apc->phy_stats.tx_pause_tc6_phy = resp.tx_pause_tc6_phy; + apc->phy_stats.rx_pause_tc7_phy = resp.rx_pause_tc7_phy; + apc->phy_stats.tx_pause_tc7_phy = resp.tx_pause_tc7_phy; +} + static int mana_init_port(struct net_device *ndev) { struct mana_port_context *apc = netdev_priv(ndev); diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index c419626073f5f..4fb3a04994a2d 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -7,10 +7,12 @@ #include -static const struct { +struct mana_stats_desc { char name[ETH_GSTRING_LEN]; u16 offset; -} mana_eth_stats[] = { +}; + +static const struct mana_stats_desc mana_eth_stats[] = { {"stop_queue", offsetof(struct mana_ethtool_stats, stop_queue)}, {"wake_queue", offsetof(struct mana_ethtool_stats, wake_queue)}, {"hc_rx_discards_no_wqe", offsetof(struct mana_ethtool_stats, @@ -75,6 +77,59 @@ static const struct { rx_cqe_unknown_type)}, }; +static const struct mana_stats_desc mana_phy_stats[] = { + { "hc_rx_pkt_drop_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_drop_phy) }, + { "hc_tx_pkt_drop_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_drop_phy) }, + { "hc_tc0_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc0_phy) }, + { "hc_tc0_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc0_phy) }, + { "hc_tc0_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc0_phy) }, + { "hc_tc0_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc0_phy) }, + { "hc_tc1_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc1_phy) }, + { "hc_tc1_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc1_phy) }, + { "hc_tc1_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc1_phy) }, + { "hc_tc1_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc1_phy) }, + { "hc_tc2_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc2_phy) }, + { "hc_tc2_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc2_phy) }, + { "hc_tc2_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc2_phy) }, + { "hc_tc2_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc2_phy) }, + { "hc_tc3_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc3_phy) }, + { "hc_tc3_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc3_phy) }, + { "hc_tc3_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc3_phy) }, + { "hc_tc3_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc3_phy) }, + { "hc_tc4_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc4_phy) }, + { "hc_tc4_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc4_phy) }, + { "hc_tc4_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc4_phy) }, + { "hc_tc4_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc4_phy) }, + { "hc_tc5_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc5_phy) }, + { "hc_tc5_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc5_phy) }, + { "hc_tc5_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc5_phy) }, + { "hc_tc5_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc5_phy) }, + { "hc_tc6_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc6_phy) }, + { "hc_tc6_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc6_phy) }, + { "hc_tc6_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc6_phy) }, + { 
"hc_tc6_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc6_phy) }, + { "hc_tc7_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc7_phy) }, + { "hc_tc7_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc7_phy) }, + { "hc_tc7_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc7_phy) }, + { "hc_tc7_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc7_phy) }, + { "hc_tc0_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc0_phy) }, + { "hc_tc0_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc0_phy) }, + { "hc_tc1_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc1_phy) }, + { "hc_tc1_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc1_phy) }, + { "hc_tc2_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc2_phy) }, + { "hc_tc2_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc2_phy) }, + { "hc_tc3_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc3_phy) }, + { "hc_tc3_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc3_phy) }, + { "hc_tc4_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc4_phy) }, + { "hc_tc4_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc4_phy) }, + { "hc_tc5_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc5_phy) }, + { "hc_tc5_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc5_phy) }, + { "hc_tc6_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc6_phy) }, + { "hc_tc6_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc6_phy) }, + { "hc_tc7_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc7_phy) }, + { "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) }, +}; + static int mana_get_sset_count(struct net_device *ndev, int stringset) { struct mana_port_context *apc = netdev_priv(ndev); @@ -83,8 +138,8 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset) if (stringset != ETH_SS_STATS) return -EINVAL; - return ARRAY_SIZE(mana_eth_stats) + num_queues * - (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT); + return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + + num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT); } static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data) @@ -99,6 +154,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data) for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++) ethtool_puts(&data, mana_eth_stats[i].name); + for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++) + ethtool_puts(&data, mana_phy_stats[i].name); + for (i = 0; i < num_queues; i++) { ethtool_sprintf(&data, "rx_%d_packets", i); ethtool_sprintf(&data, "rx_%d_bytes", i); @@ -128,6 +186,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev, struct mana_port_context *apc = netdev_priv(ndev); unsigned int num_queues = apc->num_queues; void *eth_stats = &apc->eth_stats; + void *phy_stats = &apc->phy_stats; struct mana_stats_rx *rx_stats; struct mana_stats_tx *tx_stats; unsigned int start; @@ -151,9 +210,18 @@ static void mana_get_ethtool_stats(struct net_device *ndev, /* we call mana function to update stats from GDMA */ mana_query_gf_stats(apc); + /* We call this mana function to get the phy stats from GDMA and includes + * aggregate tx/rx drop counters, Per-TC(Traffic Channel) tx/rx and pause + * counters. 
+ */ + mana_query_phy_stats(apc); + for (q = 0; q < ARRAY_SIZE(mana_eth_stats); q++) data[i++] = *(u64 *)(eth_stats + mana_eth_stats[q].offset); + for (q = 0; q < ARRAY_SIZE(mana_phy_stats); q++) + data[i++] = *(u64 *)(phy_stats + mana_phy_stats[q].offset); + for (q = 0; q < num_queues; q++) { rx_stats = &apc->rxqs[q]->stats; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index bf573c8a62819..a08659b5219fd 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -401,6 +401,65 @@ struct mana_ethtool_stats { u64 rx_cqe_unknown_type; }; +struct mana_ethtool_phy_stats { + /* Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; +}; + struct mana_context { struct gdma_dev *gdma_dev; @@ -471,6 +530,8 @@ struct mana_port_context { struct mana_ethtool_stats eth_stats; + struct mana_ethtool_phy_stats phy_stats; + /* Debugfs */ struct dentry *mana_port_debugfs; }; @@ -495,6 +556,7 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); void mana_query_gf_stats(struct mana_port_context *apc); +void mana_query_phy_stats(struct mana_port_context *apc); int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); @@ -521,6 +583,7 @@ enum mana_command_code { MANA_FENCE_RQ = 0x20006, MANA_CONFIG_VPORT_RX = 0x20007, MANA_QUERY_VPORT_CONFIG = 0x20008, + MANA_QUERY_PHY_STAT = 0x2000c, /* Privileged commands for the PF mode */ MANA_REGISTER_FILTER = 0x28000, @@ -683,6 +746,74 @@ struct mana_query_gf_stat_resp { u64 tx_err_gdma; }; /* HW DATA */ +/* Query phy stats */ +struct mana_query_phy_stat_req { + struct gdma_req_hdr hdr; + u64 req_stats; +}; /* HW DATA */ + +struct mana_query_phy_stat_resp { + struct gdma_resp_hdr hdr; + u64 reported_stats; + + /* Aggregate Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC(Traffic class) traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 
rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC(Traffic Class) pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; +}; /* HW DATA */ + /* Configure vPort Rx Steering */ struct mana_cfg_rx_steer_req_v2 { struct gdma_req_hdr hdr; From 5fbac25014399ee0c9fbe57d471cd70f36fff275 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 3 Sep 2025 11:54:27 +0000 Subject: [PATCH 20/21] net: mana: Add handler for hardware servicing events jira LE-3919 commit-author Haiyang Zhang commit 7768c5f417336fa58dbfef9bb7ecd7eeec6d8886 To collaborate with hardware servicing events, upon receiving the special EQE notification from the HW channel, remove the devices on this bus. Then, after a waiting period based on the device specs, rescan the parent bus to recover the devices. Signed-off-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Simon Horman Link: https://patch.msgid.link/1749834034-18498-1-git-send-email-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski (cherry picked from commit 7768c5f417336fa58dbfef9bb7ecd7eeec6d8886) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- .../net/ethernet/microsoft/mana/gdma_main.c | 75 +++++++++++++++++++ include/net/mana/gdma.h | 10 ++- 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 0c5e302e75c98..9d19df7cd82b3 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -351,11 +351,59 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit) head, arm_bit); } +#define MANA_SERVICE_PERIOD 10 + +struct mana_serv_work { + struct work_struct serv_work; + struct pci_dev *pdev; +}; + +static void mana_serv_func(struct work_struct *w) +{ + struct mana_serv_work *mns_wk; + struct pci_bus *bus, *parent; + struct pci_dev *pdev; + + mns_wk = container_of(w, struct mana_serv_work, serv_work); + pdev = mns_wk->pdev; + + pci_lock_rescan_remove(); + + if (!pdev) + goto out; + + bus = pdev->bus; + if (!bus) { + dev_err(&pdev->dev, "MANA service: no bus\n"); + goto out; + } + + parent = bus->parent; + if (!parent) { + dev_err(&pdev->dev, "MANA service: no parent bus\n"); + goto out; + } + + pci_stop_and_remove_bus_device(bus->self); + + msleep(MANA_SERVICE_PERIOD * 1000); + + pci_rescan_bus(parent); + +out: + pci_unlock_rescan_remove(); + + pci_dev_put(pdev); + kfree(mns_wk); + module_put(THIS_MODULE); +} + static void mana_gd_process_eqe(struct gdma_queue *eq) { u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE); struct gdma_context *gc = eq->gdma_dev->gdma_context; struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr; + struct mana_serv_work *mns_wk; union gdma_eqe_info eqe_info; enum gdma_eqe_type type; struct gdma_event event; @@ -399,6 +447,33 @@ static void 
mana_gd_process_eqe(struct gdma_queue *eq) eq->eq.callback(eq->eq.context, eq, &event); break; + case GDMA_EQE_HWC_FPGA_RECONFIG: + dev_info(gc->dev, "Recv MANA service type:%d\n", type); + + if (gc->in_service) { + dev_info(gc->dev, "Already in service\n"); + break; + } + + if (!try_module_get(THIS_MODULE)) { + dev_info(gc->dev, "Module is unloading\n"); + break; + } + + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC); + if (!mns_wk) { + module_put(THIS_MODULE); + break; + } + + dev_info(gc->dev, "Start MANA service type:%d\n", type); + gc->in_service = true; + mns_wk->pdev = to_pci_dev(gc->dev); + pci_dev_get(mns_wk->pdev); + INIT_WORK(&mns_wk->serv_work, mana_serv_func); + schedule_work(&mns_wk->serv_work); + break; + default: break; } diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 62e9d76738628..b602b2e55939c 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -58,7 +58,7 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, GDMA_EQE_HWC_INIT_DATA = 130, GDMA_EQE_HWC_INIT_DONE = 131, - GDMA_EQE_HWC_SOC_RECONFIG = 132, + GDMA_EQE_HWC_FPGA_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, GDMA_EQE_RNIC_QP_FATAL = 176, }; @@ -387,6 +387,8 @@ struct gdma_context { u32 test_event_eq_id; bool is_pf; + bool in_service; + phys_addr_t bar0_pa; void __iomem *bar0_va; void __iomem *shm_base; @@ -557,12 +559,16 @@ enum { /* Driver can handle holes (zeros) in the device list */ #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) +/* Driver can self reset on FPGA Reconfig EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ - GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE) #define GDMA_DRV_CAP_FLAGS2 0 From f6a810230c4cd59ec3a81836d7b843076461cbd6 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 3 Sep 2025 12:36:41 +0000 Subject: [PATCH 21/21] net: mana: Handle Reset Request from MANA NIC jira LE-3923 commit-author Haiyang Zhang commit fbe346ce9d626680a4dd0f079e17c7b5dd32ffad upstream-diff There were conflicts seen when applying this patch due to the following missing commits :- ca8ac489ca33 ("net: mana: Handle unsupported HWC commands") 505cc26bcae0 ("net: mana: Add support for auxiliary device servicing events") Upon receiving the Reset Request, pause the connection and clean up queues, wait for the specified period, then resume the NIC. In the cleanup phase, the HWC is no longer responding, so set hwc_timeout to zero to skip waiting on the response. 
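In outline, the reset path added below amounts to (condensed from the
mana_serv_reset() helper in this patch; MANA_SERVICE_PERIOD is the wait
period already defined in gdma_main.c):

	hwc->hwc_timeout = 0;			/* HWC is dead; don't wait on replies */
	mana_gd_suspend(pdev, PMSG_SUSPEND);	/* pause and clean up queues */
	msleep(MANA_SERVICE_PERIOD * 1000);	/* specified waiting period */
	mana_gd_resume(pdev);			/* bring the NIC back up */
	gc->in_service = false;			/* allow future service events */
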
Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/1751055983-29760-1-git-send-email-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski (cherry picked from commit fbe346ce9d626680a4dd0f079e17c7b5dd32ffad) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- .../net/ethernet/microsoft/mana/gdma_main.c | 127 ++++++++++++++---- .../net/ethernet/microsoft/mana/hw_channel.c | 4 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 37 +++-- include/net/mana/gdma.h | 10 ++ 4 files changed, 143 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 9d19df7cd82b3..d9f3c65eace2f 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -8,6 +8,7 @@ #include #include +#include #include struct dentry *mana_debugfs_root; @@ -64,6 +65,24 @@ static void mana_gd_init_registers(struct pci_dev *pdev) mana_gd_init_vf_regs(pdev); } +/* Suppress logging when we set timeout to zero */ +bool mana_need_log(struct gdma_context *gc, int err) +{ + struct hw_channel_context *hwc; + + if (err != -ETIMEDOUT) + return true; + + if (!gc) + return true; + + hwc = gc->hwc.driver_data; + if (hwc && hwc->hwc_timeout == 0) + return false; + + return true; +} + static int mana_gd_query_max_resources(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -267,8 +286,9 @@ static int mana_gd_disable_queue(struct gdma_queue *queue) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err, - resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err, + resp.hdr.status); return err ? 
err : -EPROTO; } @@ -353,25 +373,12 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit) #define MANA_SERVICE_PERIOD 10 -struct mana_serv_work { - struct work_struct serv_work; - struct pci_dev *pdev; -}; - -static void mana_serv_func(struct work_struct *w) +static void mana_serv_fpga(struct pci_dev *pdev) { - struct mana_serv_work *mns_wk; struct pci_bus *bus, *parent; - struct pci_dev *pdev; - - mns_wk = container_of(w, struct mana_serv_work, serv_work); - pdev = mns_wk->pdev; pci_lock_rescan_remove(); - if (!pdev) - goto out; - bus = pdev->bus; if (!bus) { dev_err(&pdev->dev, "MANA service: no bus\n"); @@ -392,7 +399,74 @@ static void mana_serv_func(struct work_struct *w) out: pci_unlock_rescan_remove(); +} + +static void mana_serv_reset(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct hw_channel_context *hwc; + + if (!gc) { + dev_err(&pdev->dev, "MANA service: no GC\n"); + return; + } + + hwc = gc->hwc.driver_data; + if (!hwc) { + dev_err(&pdev->dev, "MANA service: no HWC\n"); + goto out; + } + + /* HWC is not responding in this case, so don't wait */ + hwc->hwc_timeout = 0; + + dev_info(&pdev->dev, "MANA reset cycle start\n"); + mana_gd_suspend(pdev, PMSG_SUSPEND); + + msleep(MANA_SERVICE_PERIOD * 1000); + + mana_gd_resume(pdev); + + dev_info(&pdev->dev, "MANA reset cycle completed\n"); + +out: + gc->in_service = false; +} + +struct mana_serv_work { + struct work_struct serv_work; + struct pci_dev *pdev; + enum gdma_eqe_type type; +}; + +static void mana_serv_func(struct work_struct *w) +{ + struct mana_serv_work *mns_wk; + struct pci_dev *pdev; + + mns_wk = container_of(w, struct mana_serv_work, serv_work); + pdev = mns_wk->pdev; + + if (!pdev) + goto out; + + switch (mns_wk->type) { + case GDMA_EQE_HWC_FPGA_RECONFIG: + mana_serv_fpga(pdev); + break; + + case GDMA_EQE_HWC_RESET_REQUEST: + mana_serv_reset(pdev); + break; + + default: + dev_err(&pdev->dev, "MANA service: unknown type %d\n", + mns_wk->type); + break; + } + +out: pci_dev_put(pdev); kfree(mns_wk); module_put(THIS_MODULE); @@ -448,6 +522,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) break; case GDMA_EQE_HWC_FPGA_RECONFIG: + case GDMA_EQE_HWC_RESET_REQUEST: dev_info(gc->dev, "Recv MANA service type:%d\n", type); if (gc->in_service) { @@ -469,6 +544,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) dev_info(gc->dev, "Start MANA service type:%d\n", type); gc->in_service = true; mns_wk->pdev = to_pci_dev(gc->dev); + mns_wk->type = type; pci_dev_get(mns_wk->pdev); INIT_WORK(&mns_wk->serv_work, mana_serv_func); schedule_work(&mns_wk->serv_work); @@ -615,7 +691,8 @@ int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err) { - dev_err(dev, "test_eq failed: %d\n", err); + if (mana_need_log(gc, err)) + dev_err(dev, "test_eq failed: %d\n", err); goto out; } @@ -650,7 +727,7 @@ static void mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets, if (flush_evenets) { err = mana_gd_test_eq(gc, queue); - if (err) + if (err && mana_need_log(gc, err)) dev_warn(gc->dev, "Failed to flush EQ: %d\n", err); } @@ -796,8 +873,9 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n", - err, resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to destroy DMA region: %d, 
0x%x\n", + err, resp.hdr.status); return -EPROTO; } @@ -1096,8 +1174,9 @@ int mana_gd_deregister_device(struct gdma_dev *gd) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n", - err, resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n", + err, resp.hdr.status); if (!err) err = -EPROTO; } @@ -1697,7 +1776,7 @@ static void mana_gd_remove(struct pci_dev *pdev) } /* The 'state' parameter is not used. */ -static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) +int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -1712,7 +1791,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) * fail -- if this happens, it's safer to just report an error than try to undo * what has been done. */ -static int mana_gd_resume(struct pci_dev *pdev) +int mana_gd_resume(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); int err; diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index df8b52c991d86..afab02b0f1685 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -860,7 +860,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, if (!wait_for_completion_timeout(&ctx->comp_event, (msecs_to_jiffies(hwc->hwc_timeout)))) { - dev_err(hwc->dev, "HWC: Request timed out!\n"); + if (hwc->hwc_timeout != 0) + dev_err(hwc->dev, "HWC: Request timed out!\n"); + err = -ETIMEDOUT; goto out; } diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index cbecacf503422..acf1342536463 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -45,6 +45,15 @@ static const struct file_operations mana_dbg_q_fops = { .read = mana_dbg_q_read, }; +static bool mana_en_need_log(struct mana_port_context *apc, int err) +{ + if (apc && apc->ac && apc->ac->gdma_dev && + apc->ac->gdma_dev->gdma_context) + return mana_need_log(apc->ac->gdma_dev->gdma_context, err); + else + return true; +} + /* Microsoft Azure Network Adapter (MANA) functions */ static int mana_open(struct net_device *ndev) @@ -768,7 +777,8 @@ static int mana_send_request(struct mana_context *ac, void *in_buf, err = mana_gd_send_request(gc, in_len, in_buf, out_len, out_buf); if (err || resp->status) { - if (req->req.msg_type != MANA_QUERY_PHY_STAT) + if (req->req.msg_type != MANA_QUERY_PHY_STAT && + mana_need_log(gc, err)) dev_err(dev, "Failed to send mana message: %d, 0x%x\n", err, resp->status); return err ? 
err : -EPROTO; @@ -845,8 +855,10 @@ static void mana_pf_deregister_hw_vport(struct mana_port_context *apc) err = mana_send_request(apc->ac, &req, sizeof(req), &resp, sizeof(resp)); if (err) { - netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n", - err); + if (mana_en_need_log(apc, err)) + netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n", + err); + return; } @@ -901,8 +913,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc) err = mana_send_request(apc->ac, &req, sizeof(req), &resp, sizeof(resp)); if (err) { - netdev_err(apc->ndev, "Failed to unregister filter: %d\n", - err); + if (mana_en_need_log(apc, err)) + netdev_err(apc->ndev, "Failed to unregister filter: %d\n", + err); + return; } @@ -1132,7 +1146,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, err = mana_send_request(apc->ac, req, req_buf_size, &resp, sizeof(resp)); if (err) { - netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + if (mana_en_need_log(apc, err)) + netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; } @@ -1227,7 +1243,9 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, err = mana_send_request(apc->ac, &req, sizeof(req), &resp, sizeof(resp)); if (err) { - netdev_err(ndev, "Failed to destroy WQ object: %d\n", err); + if (mana_en_need_log(apc, err)) + netdev_err(ndev, "Failed to destroy WQ object: %d\n", err); + return; } @@ -2872,11 +2890,10 @@ static int mana_dealloc_queues(struct net_device *ndev) apc->rss_state = TRI_STATE_FALSE; err = mana_config_rss(apc, TRI_STATE_FALSE, false, false); - if (err) { + if (err && mana_en_need_log(apc, err)) netdev_err(ndev, "Failed to disable vPort: %d\n", err); - return err; - } + /* Even in err case, still need to cleanup the vPort */ mana_destroy_vport(apc); return 0; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index b602b2e55939c..af5596bf46878 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -60,6 +60,7 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_DONE = 131, GDMA_EQE_HWC_FPGA_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, + GDMA_EQE_HWC_RESET_REQUEST = 135, GDMA_EQE_RNIC_QP_FATAL = 176, }; @@ -559,6 +560,9 @@ enum { /* Driver can handle holes (zeros) in the device list */ #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) +/* Driver can self reset on EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14) + /* Driver can self reset on FPGA Reconfig EQE notification */ #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) @@ -568,6 +572,7 @@ enum { GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE) #define GDMA_DRV_CAP_FLAGS2 0 @@ -892,4 +897,9 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); void mana_register_debugfs(void); void mana_unregister_debugfs(void); +int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state); +int mana_gd_resume(struct pci_dev *pdev); + +bool mana_need_log(struct gdma_context *gc, int err); + #endif /* _GDMA_H */
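
For reference, a minimal sketch of how the mana_need_log() helper exported
above is meant to be used at a request call site (the req/resp variables
here are placeholders; real callers follow the pattern of the hunks in this
patch):

	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
	if (err || resp.hdr.status) {
		/* Stay quiet for the expected -ETIMEDOUT while hwc_timeout == 0 */
		if (mana_need_log(gc, err))
			dev_err(gc->dev, "Failed request: %d, 0x%x\n",
				err, resp.hdr.status);
		return err ? err : -EPROTO;
	}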