From 4ab93d8b40a9198fa144cd7218df6937e8fdaf02 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Mon, 16 Dec 2024 22:09:43 +0000 Subject: [PATCH 1/2] Use netdev refcount tracking in a couple places Netdev refcounting was added a couple years back in 5.17 [1], but onload only uses the untracked versions. This patch adds support to the ones that are low-hanging fruit. For sfc, kernel compatibility for netdevice_tracker is already provided by its kernel_compat.h. Tracking in tc_encap_actions.c is already upstream [2] so this is backporting from there. For rx_common.c I just sent a patch upstream [3] and backported it here. Outside sfc I added the same support to ci/driver/kernel_compat.h. [1] https://lore.kernel.org/netdev/20211205042217.982127-4-eric.dumazet@gmail.com/ [2] https://github.com/torvalds/linux/commit/7e5e7d800011ad#diff-9ffb1b01a8a11d0d7b6976a10eede3bebdcfaf256ef26d0d95714fe8aa03c89fR156 [3] https://lore.kernel.org/netdev/20241217224717.1711626-1-zhuyifei@google.com/T/ Signed-off-by: YiFei Zhu --- .../drivers/net/ethernet/sfc/net_driver.h | 2 ++ .../drivers/net/ethernet/sfc/rx_common.c | 6 ++--- .../net/ethernet/sfc/tc_encap_actions.c | 10 ++++--- .../net/ethernet/sfc/tc_encap_actions.h | 2 ++ src/driver/linux_resource/kernel_compat.sh | 3 +++ src/driver/linux_resource/nondl_resource.c | 5 ++-- src/driver/linux_resource/resource_driver.c | 8 +++--- src/include/ci/driver/kernel_compat.h | 26 +++++++++++++++++++ src/include/ci/efhw/efhw_types.h | 2 ++ src/include/ci/efrm/nondl.h | 2 ++ 10 files changed, 53 insertions(+), 13 deletions(-) diff --git a/src/driver/linux_net/drivers/net/ethernet/sfc/net_driver.h b/src/driver/linux_net/drivers/net/ethernet/sfc/net_driver.h index 48bbd2d03..eeacad487 100644 --- a/src/driver/linux_net/drivers/net/ethernet/sfc/net_driver.h +++ b/src/driver/linux_net/drivers/net/ethernet/sfc/net_driver.h @@ -1291,6 +1291,7 @@ struct efx_arfs_rule { /** * struct efx_async_filter_insertion - Request to asynchronously insert a filter * @net_dev: Reference to the netdevice + * @net_dev_tracker: reference tracker entry for @net_dev * @spec: The filter to insert * @work: Workitem for this request * @rxq_index: Identifies the channel for which this request was made @@ -1298,6 +1299,7 @@ struct efx_arfs_rule { */ struct efx_async_filter_insertion { struct net_device *net_dev; + netdevice_tracker net_dev_tracker; struct efx_filter_spec spec; struct work_struct work; u16 rxq_index; diff --git a/src/driver/linux_net/drivers/net/ethernet/sfc/rx_common.c b/src/driver/linux_net/drivers/net/ethernet/sfc/rx_common.c index 37d6a394c..2e5c73764 100644 --- a/src/driver/linux_net/drivers/net/ethernet/sfc/rx_common.c +++ b/src/driver/linux_net/drivers/net/ethernet/sfc/rx_common.c @@ -1845,7 +1845,7 @@ static void efx_filter_rfs_work(struct work_struct *data) /* Release references */ clear_bit(slot_idx, &efx->rps_slot_map); - dev_put(req->net_dev); + netdev_put(req->net_dev, &req->net_dev_tracker); return; } @@ -1908,7 +1908,8 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, } /* Queue the request */ - dev_hold(req->net_dev = net_dev); + req->net_dev = net_dev; + netdev_hold(req->net_dev, &req->net_dev_tracker, GFP_ATOMIC); INIT_WORK(&req->work, efx_filter_rfs_work); req->rxq_index = rxq_index; req->flow_id = flow_id; @@ -1965,4 +1966,3 @@ bool __efx_filter_rfs_expire(struct efx_channel *channel, unsigned int quota) } #endif /* CONFIG_RFS_ACCEL */ - diff --git a/src/driver/linux_net/drivers/net/ethernet/sfc/tc_encap_actions.c b/src/driver/linux_net/drivers/net/ethernet/sfc/tc_encap_actions.c index d81e31e7d..a188dbf5d 100644 --- a/src/driver/linux_net/drivers/net/ethernet/sfc/tc_encap_actions.c +++ b/src/driver/linux_net/drivers/net/ethernet/sfc/tc_encap_actions.c @@ -209,7 +209,8 @@ static int efx_bind_neigh(struct efx_nic *efx, EFX_TC_ERR_MSG(efx, extack, "Failed to lookup route for encap"); goto out_free; } - dev_hold(neigh->egdev = dst->dev); + netdev_hold(neigh->egdev = dst->dev, &neigh->dev_tracker, + GFP_KERNEL_ACCOUNT); neigh->ttl = ip6_dst_hoplimit(dst); n = dst_neigh_lookup(dst, &flow6.daddr); dst_release(dst); @@ -223,7 +224,8 @@ static int efx_bind_neigh(struct efx_nic *efx, EFX_TC_ERR_MSG(efx, extack, "Failed to lookup route for encap"); goto out_free; } - dev_hold(neigh->egdev = rt->dst.dev); + netdev_hold(neigh->egdev = rt->dst.dev, &neigh->dev_tracker, + GFP_KERNEL_ACCOUNT); neigh->ttl = ip4_dst_hoplimit(&rt->dst); n = dst_neigh_lookup(&rt->dst, &flow4.daddr); ip_rt_put(rt); @@ -233,7 +235,7 @@ static int efx_bind_neigh(struct efx_nic *efx, if (!n) { rc = -ENETUNREACH; EFX_TC_ERR_MSG(efx, extack, "Failed to lookup neighbour for encap"); - dev_put(neigh->egdev); + netdev_put(neigh->egdev, &neigh->dev_tracker); goto out_free; } refcount_set(&neigh->ref, 1); @@ -273,7 +275,7 @@ static void efx_free_neigh(struct efx_neigh_binder *neigh) rhashtable_remove_fast(&efx->tc->neigh_ht, &neigh->linkage, efx_neigh_ht_params); synchronize_rcu(); - dev_put(neigh->egdev); + netdev_put(neigh->egdev, &neigh->dev_tracker); put_net(neigh->net); kfree(neigh); } diff --git a/src/driver/linux_net/drivers/net/ethernet/sfc/tc_encap_actions.h b/src/driver/linux_net/drivers/net/ethernet/sfc/tc_encap_actions.h index 71d0dc18f..af79703e3 100644 --- a/src/driver/linux_net/drivers/net/ethernet/sfc/tc_encap_actions.h +++ b/src/driver/linux_net/drivers/net/ethernet/sfc/tc_encap_actions.h @@ -26,6 +26,7 @@ * @ttl: Time To Live associated with the route used * @dying: set when egdev is going away, to skip further updates * @egdev: egress device from the route lookup. Holds a reference + * @dev_tracker: reference tracker entry for @egdev * @ref: counts encap actions referencing this entry * @used: jiffies of last time traffic hit any encap action using this. * When counter reads update this, a new neighbour event is sent to @@ -53,6 +54,7 @@ struct efx_neigh_binder { u8 ttl; bool dying; struct net_device *egdev; + netdevice_tracker dev_tracker; refcount_t ref; unsigned long used; struct list_head users; diff --git a/src/driver/linux_resource/kernel_compat.sh b/src/driver/linux_resource/kernel_compat.sh index d277510b1..3167c61b0 100755 --- a/src/driver/linux_resource/kernel_compat.sh +++ b/src/driver/linux_resource/kernel_compat.sh @@ -178,6 +178,9 @@ EFRM_HAVE_FOLLOW_PTE_VMA symtype follow_pte include/linux/mm.h int(struct vm_are EFRM_HAVE_LINUX_TPH_H file include/linux/pci-tph.h +EFX_NEED_NETDEV_HOLD nsymbol netdev_hold include/linux/netdevice.h +EFX_HAVE_DEV_HOLD_TRACK symbol dev_hold_track include/linux/netdevice.h + # TODO move onload-related stuff from net kernel_compat " | grep -E -v -e '^#' -e '^$' | sed 's/[ \t][ \t]*/:/g' } diff --git a/src/driver/linux_resource/nondl_resource.c b/src/driver/linux_resource/nondl_resource.c index c7ec99d82..0b3d2156d 100644 --- a/src/driver/linux_resource/nondl_resource.c +++ b/src/driver/linux_resource/nondl_resource.c @@ -8,6 +8,7 @@ #include #include #include +#include /* Maximum number of VIs we will try to create for each device. */ #define MAX_VIS 128 @@ -132,7 +133,7 @@ int efrm_nondl_register_netdev(struct net_device *netdev, INIT_LIST_HEAD(&device->node); - dev_hold(netdev); + netdev_hold(netdev, &device->netdev_tracker, GFP_KERNEL); device->netdev = netdev; device->n_vis = n_vis; device->is_up = 1; @@ -153,7 +154,7 @@ static void efrm_nondl_cleanup_netdev(struct efrm_nondl_device *device) BUG_ON(device->driver); list_del(&device->node); - dev_put(device->netdev); + netdev_put(device->netdev, &device->netdev_tracker); kfree(device); } diff --git a/src/driver/linux_resource/resource_driver.c b/src/driver/linux_resource/resource_driver.c index db1a2ba09..020907506 100644 --- a/src/driver/linux_resource/resource_driver.c +++ b/src/driver/linux_resource/resource_driver.c @@ -206,7 +206,7 @@ linux_efrm_nic_ctor(struct linux_efhw_nic *lnic, struct device *dev, /* Tie the lifetime of the kernel's state to that of our own. */ if( dev ) get_device(dev); - dev_hold(net_dev); + netdev_hold(net_dev, &nic->net_dev_tracker, GFP_KERNEL); rc = efhw_nic_ctor(nic, res_dim, dev_type, net_dev, dev); if (rc < 0) @@ -236,7 +236,7 @@ linux_efrm_nic_ctor(struct linux_efhw_nic *lnic, struct device *dev, fail1: if( dev ) put_device(dev); - dev_put(net_dev); + netdev_put(net_dev, &nic->net_dev_tracker); return rc; } @@ -259,7 +259,7 @@ linux_efrm_nic_reclaim(struct linux_efhw_nic *lnic, /* Replace the net & pci devs */ get_device(dev); - dev_hold(net_dev); + netdev_hold(net_dev, &nic->net_dev_tracker, GFP_KERNEL); spin_lock_bh(&nic->pci_dev_lock); old_dev = nic->dev; nic->dev = dev; @@ -506,7 +506,7 @@ efrm_nic_do_unplug(struct efhw_nic* nic, bool hard) spin_unlock_bh(&nic->pci_dev_lock); EFRM_ASSERT(net_dev != NULL); - dev_put(net_dev); + netdev_put(net_dev, &nic->net_dev_tracker); put_device(dev); return 0; diff --git a/src/include/ci/driver/kernel_compat.h b/src/include/ci/driver/kernel_compat.h index a1d7c91a1..3b8e4defa 100644 --- a/src/include/ci/driver/kernel_compat.h +++ b/src/include/ci/driver/kernel_compat.h @@ -546,4 +546,30 @@ static inline int efrm_follow_pfn(struct vm_area_struct *vma, } #endif +#ifdef EFX_NEED_NETDEV_HOLD +#ifdef EFX_HAVE_DEV_HOLD_TRACK +/* Commit d62607c3fe45 ("net: rename reference+tracking helpers") + * renamed these. + */ +#define netdev_hold(_n, _t, _g) dev_hold_track(_n, _t, _g) +#define netdev_put(_n, _t) dev_put_track(_n, _t) +#else +/* This was introduced in the same commit that adds dev_hold_track */ +typedef struct {} netdevice_tracker; + +static inline void netdev_hold(struct net_device *dev, + netdevice_tracker *tracker __always_unused, + gfp_t gfp __always_unused) +{ + dev_hold(dev); +} + +static inline void netdev_put(struct net_device *dev, + netdevice_tracker *tracker __always_unused) +{ + dev_put(dev); +} +#endif +#endif + #endif /* DRIVER_LINUX_RESOURCE_KERNEL_COMPAT_H */ diff --git a/src/include/ci/efhw/efhw_types.h b/src/include/ci/efhw/efhw_types.h index d7260ae6a..f9cd40952 100644 --- a/src/include/ci/efhw/efhw_types.h +++ b/src/include/ci/efhw/efhw_types.h @@ -40,6 +40,7 @@ #ifndef __CI_EFHW_EFAB_TYPES_H__ #define __CI_EFHW_EFAB_TYPES_H__ +#include #include #include #include @@ -569,6 +570,7 @@ struct efhw_nic { int index; struct net_device *net_dev; /*!< Network device */ + netdevice_tracker net_dev_tracker; struct device *dev; /*!< HW device */ struct pci_dev *pci_dev; /*!< PCI device */ spinlock_t pci_dev_lock; /*!< Protects access to dev & net_dev */ diff --git a/src/include/ci/efrm/nondl.h b/src/include/ci/efrm/nondl.h index f1fd2927c..164be2707 100644 --- a/src/include/ci/efrm/nondl.h +++ b/src/include/ci/efrm/nondl.h @@ -8,6 +8,7 @@ #define __EFRM_NONDL_H__ #include +#include /* Non-driverlink network device. @@ -43,6 +44,7 @@ struct efrm_nondl_device { /* Network device currently associated with this non-driverlink * device. */ struct net_device *netdev; + netdevice_tracker netdev_tracker; /* Number of VIs we would like to create on this device. */ unsigned int n_vis; From c632bb4db5becc7e60128a6d9e2a85f20a7c1353 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Mon, 16 Dec 2024 22:57:19 +0000 Subject: [PATCH 2/2] Unregister efrm-nondl on shutdown / reboot On shutdown / reboot, if netdevs don't have a refcount of 1, the kernel loops with message: unregister_netdevice: waiting for eth0 to become free. Usage count = 3 unregister_netdevice: waiting for eth0 to become free. Usage count = 3 unregister_netdevice: waiting for eth0 to become free. Usage count = 3 [...] With netdev refcount tracking, the traces are visible: unregister_netdevice: waiting for eth0 to become free. Usage count = 3 ref_tracker: eth%d@ff3cf768c8344548 has 1/2 users at efrm_nic_add+0x29b/0x460 [sfc_resource] efrm_nondl_add_device+0x124/0x1c0 [sfc_resource] efrm_nondl_try_add_device+0x27/0xf0 [sfc_resource] efrm_nondl_register_netdev+0x106/0x160 [sfc_resource] nondl_register_store+0x174/0x1e0 [sfc_resource] kernfs_fop_write_iter+0x128/0x1c0 vfs_write+0x308/0x420 ksys_write+0x5f/0xe0 do_syscall_64+0x7b/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e ref_tracker: eth%d@ff3cf768c8344548 has 1/2 users at efrm_nondl_register_netdev+0xa9/0x160 [sfc_resource] nondl_register_store+0x174/0x1e0 [sfc_resource] kernfs_fop_write_iter+0x128/0x1c0 vfs_write+0x308/0x420 ksys_write+0x5f/0xe0 do_syscall_64+0x7b/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e This patch makes sfc_resource register a reboot notifier that calls efrm_nondl_unregister & efrm_nondl_shutdown that handles each of the two refcounts. Considering that the only other caller of those functions is the cleanup_sfc_resource module exit function, I don't think shutdown should be concerned about module unload racing under normal circumstances. In any case, efrm_nondl_unregister_driver is made to exit early before the assert, in the event that the race does end up happening. Signed-off-by: YiFei Zhu --- src/driver/linux_resource/nondl_resource.c | 4 ++++ src/driver/linux_resource/resource_driver.c | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/driver/linux_resource/nondl_resource.c b/src/driver/linux_resource/nondl_resource.c index 0b3d2156d..f485f4482 100644 --- a/src/driver/linux_resource/nondl_resource.c +++ b/src/driver/linux_resource/nondl_resource.c @@ -95,6 +95,9 @@ void efrm_nondl_unregister_driver(struct efrm_nondl_driver *driver) rtnl_lock(); + if (!nondl_driver) + goto out; + EFRM_ASSERT(nondl_driver == driver); list_for_each_entry_safe_reverse(device, device_n, &driver->devices, @@ -105,6 +108,7 @@ void efrm_nondl_unregister_driver(struct efrm_nondl_driver *driver) nondl_driver = NULL; +out: rtnl_unlock(); } EXPORT_SYMBOL(efrm_nondl_unregister_driver); diff --git a/src/driver/linux_resource/resource_driver.c b/src/driver/linux_resource/resource_driver.c index 020907506..3f17c8402 100644 --- a/src/driver/linux_resource/resource_driver.c +++ b/src/driver/linux_resource/resource_driver.c @@ -69,6 +69,7 @@ #include "sfcaffinity.h" #include #include +#include #include "debugfs.h" MODULE_AUTHOR("Solarflare Communications"); @@ -631,6 +632,21 @@ static void efrm_nic_del_all(void) efrm_nic_del(linux_efhw_nic(nic)); } +static int sfc_resource_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + /* Due to refcounting reasons in netdev, these must + * be called for the shutdown to happen without delays + */ + efrm_nondl_unregister(); + efrm_nondl_shutdown(); + + return NOTIFY_DONE; +} + +static struct notifier_block sfc_resource_shutdown_nb = { + .notifier_call = sfc_resource_shutdown_notify, +}; /**************************************************************************** * @@ -683,6 +699,8 @@ static int init_sfc_resource(void) efrm_install_sysfs_entries(); efrm_nondl_register(); + register_reboot_notifier(&sfc_resource_shutdown_nb); + return 0; failed_notifier: @@ -705,6 +723,8 @@ static int init_sfc_resource(void) ****************************************************************************/ static void cleanup_sfc_resource(void) { + unregister_reboot_notifier(&sfc_resource_shutdown_nb); + efrm_nondl_unregister(); efrm_remove_sysfs_entries(); efrm_nondl_shutdown();