// SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 Protocol independent device support routines. * * Derived from the non IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Florian la Roche <rzsfl@rz.uni-sb.de> * Alan Cox <gw4pts@gw4pts.ampr.org> * David Hinds <dahinds@users.sourceforge.net> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Adam Sulmicki <adam@cfar.umd.edu> * Pekka Riikonen <priikone@poesidon.pspt.fi> * * Changes: * D.J. Barrow : Fixed bug where dev->refcnt gets set * to 2 if register_netdev gets called * before net_dev_init & also removed a * few lines of code in the process. * Alan Cox : device private ioctl copies fields back. * Alan Cox : Transmit queue code does relevant * stunts to keep the queue safe. * Alan Cox : Fixed double lock. * Alan Cox : Fixed promisc NULL pointer trap * ???????? : Support the full private ioctl range * Alan Cox : Moved ioctl permission check into * drivers * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI * Alan Cox : 100 backlog just doesn't cut it when * you start doing multicast video 8) * Alan Cox : Rewrote net_bh and list manager. * Alan Cox : Fix ETH_P_ALL echoback lengths. * Alan Cox : Took out transmit every packet pass * Saved a few bytes in the ioctl handler * Alan Cox : Network driver sets packet type before * calling netif_rx. Saves a function * call a packet. * Alan Cox : Hashed net_bh() * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR * Alan Cox : Device lock protection. * Alan Cox : Fixed nasty side effect of device close * changes. * Rudi Cilibrasi : Pass the right thing to * set_mac_address() * Dave Miller : 32bit quantity for the device lock to * make it work out on a Sparc. * Bjorn Ekwall : Added KERNELD hack. * Alan Cox : Cleaned up the backlog initialise. * Craig Metz : SIOCGIFCONF fix if space for under * 1 device. 
* Thomas Bogendoerfer : Return ENODEV for dev_open, if there * is no device open function. * Andi Kleen : Fix error reporting for SIOCGIFCONF * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF * Cyrus Durgin : Cleaned for KMOD * Adam Sulmicki : Bug Fix : Network Device Unload * A network device unload needs to purge * the backlog queue. * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait * indefinitely on dev->refcnt * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback
*/
int netdev_name_node_alt_create(struct net_device *dev, constchar *name)
{ struct netdev_name_node *name_node; struct net *net = dev_net(dev);
name_node = netdev_name_node_lookup(net, name); if (name_node) return -EEXIST;
name_node = netdev_name_node_alloc(dev, name); if (!name_node) return -ENOMEM;
netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */
list_add_tail_rcu(&name_node->list, &dev->name_node->list);
int netdev_name_node_alt_destroy(struct net_device *dev, constchar *name)
{ struct netdev_name_node *name_node; struct net *net = dev_net(dev);
name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; /* lookup might have found our primary name or a name belonging * to another device.
*/ if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL;
/* We reserved the ifindex, this can't fail */
WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
dev_base_seq_inc(net);
}
/* Device list removal * caller must respect a RCU grace period before freeing/reusing dev
*/ staticvoid unlist_netdevice(struct net_device *dev)
{ struct netdev_name_node *name_node; struct net *net = dev_net(dev);
/*
 * Per-CPU instance of struct page_pool_bh named system_page_pool.
 *
 * Page_pool has a lockless array/stack to alloc/recycle pages.
 * PP consumers must pay attention to run APIs in the appropriate context
 * (e.g. NAPI context).
 *
 * Only the bh_lock member is given an explicit initialiser here, via
 * INIT_LOCAL_LOCK().
 */
DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
staticinlineunsignedshort netdev_lock_pos(unsignedshort dev_type)
{ int i;
for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ return ARRAY_SIZE(netdev_lock_type) - 1;
}
/*
 * Give @lock the lockdep class and name that correspond to @dev_type.
 * The slot in netdev_xmit_lock_key[] / netdev_lock_name[] is chosen by
 * netdev_lock_pos().
 */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}
staticinlinevoid netdev_set_addr_lockdep_class(struct net_device *dev)
{ int i;
/******************************************************************************* * * Protocol management and registration routines *
*******************************************************************************/
/* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. * * BEWARE!!! Protocol handlers, mangling input packets, * MUST BE last in hash buckets and checking protocol handlers * MUST start from promiscuous ptype_all chain in net_bh. * It is true now, do not change it. * Explanation follows: if protocol handler, mangling packet, will * be the first on list, it is not able to sense, that packet * is cloned and should be copied-on-write, so that it will * change it and subsequent readers will get broken packet. * --ANK (980803)
*/
staticinlinestruct list_head *ptype_head(conststruct packet_type *pt)
{ if (pt->type == htons(ETH_P_ALL)) { if (!pt->af_packet_net && !pt->dev) return NULL;
/**
 * dev_add_pack - add packet handler
 * @pt: packet type declaration
 *
 * Add a protocol handler to the networking stack. The passed &packet_type
 * is linked into kernel lists and may not be freed until it has been
 * removed from the kernel lists.
 *
 * This call does not sleep, therefore it cannot
 * guarantee that all CPUs that are in the middle of receiving packets
 * will see the new packet type (until the next received packet).
*/
/**
 * __dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists by the time this function returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPUs have gone
 * through a quiescent state.
 *
 * NOTE(review): only the "not found" warning and the release of ptype_lock
 * are visible here; the matching lock acquisition and the list walk that
 * would use @head and @pt1 should be confirmed against the full function.
 */
void __dev_remove_pack(struct packet_type *pt)
{ struct list_head *head = ptype_head(pt); struct packet_type *pt1;
/* Reached when @pt was not matched - TODO confirm against the elided walk. */
pr_warn("dev_remove_pack: %p not found\n", pt);
out:
spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
/** * dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return.
*/ void dev_remove_pack(struct packet_type *pt)
{
__dev_remove_pack(pt);
/**
 * dev_get_iflink - get 'iflink' value of an interface
 * @dev: targeted interface
 *
 * Indicates the ifindex the interface is linked to.
 * Physical interfaces have the same 'ifindex' and 'iflink' values.
*/
int dev_get_iflink(conststruct net_device *dev)
{ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev);
/** * dev_fill_metadata_dst - Retrieve tunnel egress information. * @dev: targeted interface * @skb: The packet. * * For better visibility of tunnel traffic OVS needs to retrieve * egress tunnel information for a packet. Following API allows * user to get this info.
*/ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{ struct ip_tunnel_info *info;
if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) return -EINVAL;
info = skb_tunnel_info_unclone(skb); if (!info) return -ENOMEM; if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) return -EINVAL;
/* must be called under rcu_read_lock(), as we dont take a reference */ staticstruct napi_struct *napi_by_id(unsignedint napi_id)
{ unsignedint hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi;
hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) if (napi->napi_id == napi_id) return napi;
return NULL;
}
/* must be called under rcu_read_lock(), as we dont take a reference */ staticstruct napi_struct *
netdev_napi_by_id(struct net *net, unsignedint napi_id)
{ struct napi_struct *napi;
napi = napi_by_id(napi_id); if (!napi) return NULL;
if (WARN_ON_ONCE(!napi->dev)) return NULL; if (!net_eq(net, dev_net(napi->dev))) return NULL;
return napi;
}
/** * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it * @net: the applicable net namespace * @napi_id: ID of a NAPI of a target device * * Find a NAPI instance with @napi_id. Lock its device. * The device must be in %NETREG_REGISTERED state for lookup to succeed. * netdev_unlock() must be called to release it. * * Return: pointer to NAPI, its device with lock held, NULL if not found.
*/ struct napi_struct *
netdev_napi_by_id_lock(struct net *net, unsignedint napi_id)
{ struct napi_struct *napi; struct net_device *dev;
rcu_read_lock();
napi = netdev_napi_by_id(net, napi_id); if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
rcu_read_unlock(); return NULL;
}
dev = napi->dev;
dev_hold(dev);
rcu_read_unlock();
dev = __netdev_put_lock(dev, net); if (!dev) return NULL;
rcu_read_lock();
napi = netdev_napi_by_id(net, napi_id); if (napi && napi->dev != dev)
napi = NULL;
rcu_read_unlock();
if (!napi)
netdev_unlock(dev); return napi;
}
/** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks.
*/
/** * dev_get_by_name_rcu - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock.
*/
/**
 * netdev_get_by_name() - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 * @tracker: tracking object for the acquired reference
 * @gfp: allocation flags for the tracker
 *
 * Find an interface by name. This can be called from any
 * context and does its own locking. The returned handle has
 * the usage count incremented and the caller must use netdev_put() to
 * release it when it is no longer needed. %NULL is returned if no
 * matching device is found.
 */
struct net_device *netdev_get_by_name(struct net *net, const char *name,
				      netdevice_tracker *tracker, gfp_t gfp)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, name);
	/* The tracker is only set up when a device was actually found. */
	if (dev)
		netdev_tracker_alloc(dev, tracker, gfp);
	return dev;
}
EXPORT_SYMBOL(netdev_get_by_name);
/** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold the RTNL semaphore.
*/
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{ struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex);
hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev;
return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);
/** * dev_get_by_index_rcu - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock.
*/
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{ struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex);
hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev;
/* Deprecated for new users, call netdev_get_by_index() instead */ struct net_device *dev_get_by_index(struct net *net, int ifindex)
{ struct net_device *dev;
/**
 * netdev_get_by_index() - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 * @tracker: tracking object for the acquired reference
 * @gfp: allocation flags for the tracker
 *
 * Look a device up by index through dev_get_by_index() and, when one is
 * found, register the acquired reference with @tracker. The returned
 * pointer stays safe until the user calls netdev_put() to indicate they
 * have finished with it.
 *
 * Return: the device, or NULL if it is not found.
 */
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
				       netdevice_tracker *tracker, gfp_t gfp)
{
	struct net_device *found = dev_get_by_index(net, ifindex);

	if (!found)
		return NULL;

	netdev_tracker_alloc(found, tracker, gfp);
	return found;
}
EXPORT_SYMBOL(netdev_get_by_index);
/** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct * * Search for an interface by NAPI ID. Returns %NULL if the device * is not found or a pointer to the device. The device has not had * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock.
*/ struct net_device *dev_get_by_napi_id(unsignedint napi_id)
{ struct napi_struct *napi;
WARN_ON_ONCE(!rcu_read_lock_held());
if (!napi_id_valid(napi_id)) return NULL;
napi = napi_by_id(napi_id);
return napi ? napi->dev : NULL;
}
/*
 * Swap a held reference on @dev for its instance lock.
 *
 * The reference is dropped on every path. The lock is kept only when the
 * device has not progressed past NETREG_REGISTERED, is not moving between
 * namespaces and still belongs to @net; otherwise it is released again
 * and NULL is returned.
 *
 * This helper is intended for locking a net_device after it has been
 * looked up using a lockless lookup helper. Lock prevents the instance
 * from going away.
 */
struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
{
	netdev_lock(dev);

	if (dev->reg_state <= NETREG_REGISTERED && !dev->moving_ns &&
	    net_eq(dev_net(dev), net)) {
		/* Still usable: keep the lock, give back the reference. */
		dev_put(dev);
		return dev;
	}

	netdev_unlock(dev);
	dev_put(dev);
	return NULL;
}
/**
 * netdev_get_by_index_lock() - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Look the device up with dev_get_by_index() and hand the result to
 * __netdev_put_lock(). If a valid device with @ifindex is found it is
 * returned with netdev->lock held; netdev_unlock() must be called to
 * release it.
 *
 * Return: pointer to a device with lock held, NULL if not found.
 */
struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
{
	struct net_device *held = dev_get_by_index(net, ifindex);

	return held ? __netdev_put_lock(held, net) : NULL;
}
/*
 * Same lookup as netdev_get_by_index_lock(), but the device found by
 * dev_get_by_index() is passed to __netdev_put_lock_ops_compat().
 * Returns NULL when no device with @ifindex exists in @net.
 */
struct net_device *
netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
{
	struct net_device *held = dev_get_by_index(net, ifindex);

	return held ? __netdev_put_lock_ops_compat(held, net) : NULL;
}
struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev, unsignedlong *index)
{ if (dev)
netdev_unlock(dev);
do {
rcu_read_lock();
dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); if (!dev) {
rcu_read_unlock(); return NULL;
}
dev_hold(dev);
rcu_read_unlock();
dev = __netdev_put_lock(dev, net); if (dev) return dev;
(*index)++;
} while (true);
}
struct net_device *
netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, unsignedlong *index)
{ if (dev)
netdev_unlock_ops_compat(dev);
do {
rcu_read_lock();
dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); if (!dev) {
rcu_read_unlock(); return NULL;
}
dev_hold(dev);
rcu_read_unlock();
dev = __netdev_put_lock_ops_compat(dev, net); if (dev) return dev;
do {
seq = read_seqbegin(&netdev_rename_lock);
strscpy(name, dev->name, IFNAMSIZ);
} while (read_seqretry(&netdev_rename_lock, seq));
}
/**
 * netdev_get_name - get a netdevice name, knowing its ifindex.
 * @net: network namespace
 * @name: a pointer to the buffer where the name will be stored.
 * @ifindex: the ifindex of the interface to get the name from.
 *
 * The lookup and the copy both happen inside one RCU read-side section.
 *
 * Return: 0 when the name was copied, -ENODEV when no device has @ifindex.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	int err = -ENODEV;

	rcu_read_lock();

	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev) {
		netdev_copy_name(dev, name);
		err = 0;
	}

	rcu_read_unlock();

	return err;
}
staticbool dev_addr_cmp(struct net_device *dev, unsignedshort type, constchar *ha)
{ return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
}
/** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking *
*/
/** * dev_getbyhwaddr() - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold * rtnl_lock. * * Context: rtnl_lock() must be held. * Return: pointer to the net_device, or NULL if not found
*/ struct net_device *dev_getbyhwaddr(struct net *net, unsignedshort type, constchar *ha)
{ struct net_device *dev;
ASSERT_RTNL();
for_each_netdev(net, dev) if (dev_addr_cmp(dev, type, ha)) return dev;
rcu_read_lock();
for_each_netdev_rcu(net, dev) if (dev->type == type) {
dev_hold(dev);
ret = dev; break;
}
rcu_read_unlock(); return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/** * netdev_get_by_flags_rcu - find any device with given flags * @net: the applicable net namespace * @tracker: tracking object for the acquired reference * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. * * Context: rcu_read_lock() must be held. * Returns: NULL if a device is not found or a pointer to the device.
*/ struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, unsignedshort if_flags, unsignedshort mask)
{ struct net_device *dev;
/** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to * allow sysfs to work. We also disallow any kind of * whitespace.
*/ bool dev_valid_name(constchar *name)
{ if (*name == '\0') returnfalse; if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) returnfalse; if (!strcmp(name, ".") || !strcmp(name, "..")) returnfalse;
while (*name) { if (*name == '/' || *name == ':' || isspace(*name)) returnfalse;
name++;
} returntrue;
}
EXPORT_SYMBOL(dev_valid_name);
/** * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code.
*/
staticint __dev_alloc_name(struct net *net, constchar *name, char *res)
{ int i = 0; constchar *p; constint max_netdevices = 8*PAGE_SIZE; unsignedlong *inuse; struct net_device *d; char buf[IFNAMSIZ];
/* Verify the string as this thing may have come from the user. * There must be one "%d" and no other "%" characters.
*/
p = strchr(name, '%'); if (!p || p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL;
/* Use one page as a bit array of possible slots */
inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); if (!inuse) return -ENOMEM;
netdev_for_each_altname(d, name_node) { if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue;
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ))
__set_bit(i, inuse);
} if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue;
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ))
__set_bit(i, inuse);
}
i = find_first_zero_bit(inuse, max_netdevices);
bitmap_free(inuse); if (i == max_netdevices) return -ENFILE;
/* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
strscpy(buf, name, IFNAMSIZ);
snprintf(res, IFNAMSIZ, buf, i); return i;
}
/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ staticint dev_prep_valid_name(struct net *net, struct net_device *dev, constchar *want_name, char *out_name, int dup_errno)
{ if (!dev_valid_name(want_name)) return -EINVAL;
if (strchr(want_name, '%')) return __dev_alloc_name(net, want_name, out_name);
if (netdev_name_in_use(net, want_name)) return -dup_errno; if (out_name != want_name)
strscpy(out_name, want_name, IFNAMSIZ); return 0;
}
/** * dev_alloc_name - allocate a name for a device * @dev: device * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code.
*/
/** * dev_get_alias - get ifalias of a device * @dev: device * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go * away, e.g. rcu read lock or own a reference count to device.
*/ int dev_get_alias(conststruct net_device *dev, char *name, size_t len)
{ conststruct dev_ifalias *alias; int ret = 0;
rcu_read_lock();
alias = rcu_dereference(dev->ifalias); if (alias)
ret = snprintf(name, len, "%s", alias->ifalias);
rcu_read_unlock();
return ret;
}
/**
 * netdev_features_change - device changes features
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed features. Sends the
 * NETDEV_FEAT_CHANGE event for @dev through call_netdevice_notifiers().
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
/**
 * __netdev_notify_peers - notify network peers about existence of @dev,
 * to be called when rtnl lock is already held.
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Two events are sent, in this order: NETDEV_NOTIFY_PEERS, then
 * NETDEV_RESEND_IGMP.
 */
void __netdev_notify_peers(struct net_device *dev)
{
	ASSERT_RTNL();

	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
EXPORT_SYMBOL(__netdev_notify_peers);
/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Takes the rtnl lock around __netdev_notify_peers().
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();

	__netdev_notify_peers(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
/* Forward declaration: thread function handed to kthread_run() below. */
static int napi_threaded_poll(void *data);
staticint napi_kthread_create(struct napi_struct *n)
{ int err = 0;
/* Create and wake up the kthread once to put it in * TASK_INTERRUPTIBLE mode to avoid the blocked task * warning and work with loadavg.
*/
n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
n->dev->name, n->napi_id); if (IS_ERR(n->thread)) {
err = PTR_ERR(n->thread);
pr_err("kthread_run failed with err %d\n", err);
n->thread = NULL;
}
if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ if (dev->dev.parent)
pm_runtime_resume(dev->dev.parent); if (!netif_device_present(dev)) return -ENODEV;
}
/* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device
*/
netpoll_poll_disable(dev);
ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
ret = notifier_to_errno(ret); if (ret) return ret;
set_bit(__LINK_STATE_START, &dev->state);
netdev_ops_assert_locked(dev);
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
list_for_each_entry(dev, head, close_list) { /* Temporarily disable netpoll until the interface is down */
netpoll_poll_disable(dev);
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
clear_bit(__LINK_STATE_START, &dev->state);
/* Synchronize to scheduled poll. We cannot touch poll list, it
 * can be even on different cpu. So just clear netif_running().
 *
 * dev->stop() will invoke napi_disable() on all of its
 * napi_struct instances on this device.
*/
smp_mb__after_atomic(); /* Commit netif_running(). */
}
/* Remove the devices that don't need to be closed */
list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP))
list_del_init(&dev->close_list);
/**
 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 * @dev: device
 *
 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
 * called under RTNL. This is needed if Generic XDP is installed on
 * the device.
 *
 * The bit is cleared from the wanted features and the feature set is
 * recomputed; a warning is logged if GRO_HW is still enabled afterwards.
 */
static void dev_disable_gro_hw(struct net_device *dev)
{
	dev->wanted_features &= ~NETIF_F_GRO_HW;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_GRO_HW))
		netdev_WARN(dev, "failed to disable GRO_HW!\n");
}
/** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list.
*/
int register_netdevice_notifier(struct notifier_block *nb)
{ struct net *net; int err;
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
/* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock;
for_each_net(net) {
__rtnl_net_lock(net);
err = call_netdevice_register_net_notifiers(nb, net);
__rtnl_net_unlock(net); if (err) goto rollback;
}
/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering, unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
*/
int unregister_netdevice_notifier(struct notifier_block *nb)
{ struct net *net; int err;
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
rtnl_lock();
err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock;
/** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list.
*/
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{ int err;
/** * unregister_netdevice_notifier_net - unregister a per-netns * network notifier block * @net: network namespace * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code.
*/
int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{ int err;
/* netns might be being dismantled. */
rcu_read_lock();
net = dev_net_rcu(dev);
net_passive_inc(net);
rcu_read_unlock();
rtnl_net_lock(net);
#ifdef CONFIG_NET_NS /* dev might have been moved to another netns. */ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
rtnl_net_unlock(net);
net_passive_dec(net);
again = true;
} #endif
} while (again);
}
/*
 * Release the per-netns RTNL lock of the namespace @dev currently belongs
 * to, then call net_passive_dec() on that namespace.
 */
static void rtnl_net_dev_unlock(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	rtnl_net_unlock(net);
	net_passive_dec(net);
}
int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn)
{ int err;
/** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain().
*/
int call_netdevice_notifiers_info(unsignedlong val, struct netdev_notifier_info *info)
{ struct net *net = dev_net(info->dev); int ret;
ASSERT_RTNL();
/* Run per-netns notifier block chain first, then run the global one. * Hopefully, one day, the global one is going to be removed after * all notifier block registrators get converted to be per-netns.
*/
ret = raw_notifier_call_chain(&net->netdev_chain, val, info); if (ret & NOTIFY_STOP_MASK) return ret; return raw_notifier_call_chain(&netdev_chain, val, info);
}
/**
 * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 *                                        and roll back on error
 * @val_up: value passed unmodified to notifier function
 * @val_down: value passed unmodified to the notifier function when
 *            recovering from an error on @val_up
 * @info: notifier information data
 *
 * Call all per-netns network notifier blocks, but not notifier blocks on
 * the global notifier chain. Parameters and return value are as for
 * raw_notifier_call_chain_robust().
*/
/** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain().
*/
/** * call_netdevice_notifiers_mtu - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * @arg: additional u32 argument passed to the notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain().
*/ staticint call_netdevice_notifiers_mtu(unsignedlong val, struct net_device *dev, u32 arg)
{ struct netdev_notifier_info_ext info = {
.info.dev = dev,
.ext.mtu = arg,
};
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * Injects @skb, typically from the start_xmit function of one device,
 * into the receive queue of @dev.  The buffer is first prepared by
 * __dev_forward_skb(); only if that step returns 0 is the buffer handed
 * to netif_rx_internal().
 *
 * The receiving device may be in another namespace, so all information
 * in the skb that could impact namespace isolation has to be cleared.
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc = __dev_forward_skb(dev, skb);

	/* A non-zero preparation result is reported to the caller as is. */
	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
/** * dev_nit_active_rcu - return true if any network interface taps are in use * * The caller must hold the RCU lock * * @dev: network device to check for the presence of taps
*/ bool dev_nit_active_rcu(conststruct net_device *dev)
{ /* Callers may hold either RCU or RCU BH lock */
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
/* need to clone skb, done only once */
skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out_unlock;
net_timestamp_set(skb2);
/* skb->nh should be correctly * set by sender, so that the second statement is * just protection against buggy protocols.
*/
skb_reset_mac_header(skb2);
if (skb_network_header(skb2) < skb2->data ||
skb_network_header(skb2) > skb_tail_pointer(skb2)) {
net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
ntohs(skb2->protocol),
dev->name);
skb_reset_network_header(skb2);
}
if (ptype_list != &dev->ptype_all) {
ptype_list = &dev->ptype_all; goto again;
}
out_unlock: if (pt_prev) { if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); else
kfree_skb(skb2);
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * * If real_num_tx_queues is changed the tc mappings may no longer be * valid. To resolve this verify the tc mapping remains valid and if * not NULL the mapping. With no priorities mapping to this * offset/count pair it will no longer be used. In the worst case TC0 * is invalid nothing can be done so disable priority mappings. If is * expected that drivers will fix this mapping if they can before * calling netif_set_real_num_tx_queues.
*/ staticvoid netif_setup_tc(struct net_device *dev, unsignedint txq)
{ int i; struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
/* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) {
netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
dev->num_tc = 0; return;
}
/* Invalidated prio to tc mappings set to TC0 */ for (i = 1; i < TC_BITMASK + 1; i++) { int q = netdev_get_prio_tc_map(dev, i);
tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) {
netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
i, q);
netdev_set_prio_tc_map(dev, i, 0);
}
}
}
int netdev_txq_to_tc(struct net_device *dev, unsignedint txq)
{ if (dev->num_tc) { struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i;
/* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i;
}
/* didn't find it, just return -1 to indicate no match */ return -1;
}
staticstruct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
u16 index, bool is_rxqs_map)
{ struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; int i, pos;
for (pos = 0; map && pos < map->len; pos++) { if (map->queues[pos] != index) continue; return map;
}
/* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map;
alloc_len = map->alloc_len * 2;
}
/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's * map
*/ if (is_rxqs_map)
new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); else
new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
cpu_to_node(attr_index)); if (!new_map) return NULL;
for (i = 0; i < pos; i++)
new_map->queues[i] = map->queues[i];
new_map->alloc_len = alloc_len;
new_map->len = pos;
return new_map;
}
/*
 * Copy xps maps at a given index
 *
 * For attribute @index (a CPU or rx-queue id), carry the per-traffic-class
 * map pointers over from @dev_maps to @new_dev_maps.  The slot for traffic
 * class @tc is left untouched in @new_dev_maps when @skip_tc is true.
 *
 * Both source and destination slots are computed from dev_maps->num_tc,
 * so the two tables must use the same num_tc layout.
 * NOTE(review): callers are expected to guarantee this; confirm at the
 * call sites.
 */
static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
			      struct xps_dev_maps *new_dev_maps, int index,
			      int tc, bool skip_tc)
{
	int i, tci = index * dev_maps->num_tc;
	struct xps_map *map;

	/* copy maps belonging to foreign traffic classes */
	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
		if (i == tc && skip_tc)
			continue;

		/* fill in the new device map from the old device map */
		map = xmap_dereference(dev_maps->attr_map[tci]);
		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
	}
}
/* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, constunsignedlong *mask,
u16 index, enum xps_map_type type)
{ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; constunsignedlong *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; unsignedint nr_ids;
WARN_ON_ONCE(index >= dev->num_tx_queues);
if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc; if (num_tc < 0) return -EINVAL;
/* If queue belongs to subordinate dev use its map */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
if (maps_sz < L1_CACHE_BYTES)
maps_sz = L1_CACHE_BYTES;
/* The old dev_maps could be larger or smaller than the one we're * setting up now, as dev->num_tc or nr_ids could have been updated in * between. We could try to be smart, but let's be safe instead and only * copy foreign traffic classes if the two map sizes match.
*/ if (dev_maps &&
dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
copy = true;
/* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
j < nr_ids;) { if (!new_dev_maps) {
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) {
mutex_unlock(&xps_map_mutex); return -ENOMEM;
}
if (!dev_maps) { /* Increment static keys at most once per type */
static_key_slow_inc_cpuslocked(&xps_needed); if (type == XPS_RXQS)
static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
}
for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc &&
netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue;
while (txq-- != &dev->_tx[0]) { if (txq->sb_dev == sb_dev)
txq->sb_dev = NULL;
}
}
EXPORT_SYMBOL(netdev_unbind_sb_channel);
int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev,
u8 tc, u16 count, u16 offset)
{ /* Make certain the sb_dev and dev are already configured */ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) return -EINVAL;
/* We cannot hand out queues we don't have */ if ((offset + count) > dev->real_num_tx_queues) return -EINVAL;
/* Record the mapping */
sb_dev->tc_to_txq[tc].count = count;
sb_dev->tc_to_txq[tc].offset = offset;
/* Provide a way for Tx queue to find the tc_to_txq map or * XPS map for itself.
*/ while (count--)
netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
/*
 * netdev_set_sb_channel - mark @dev as representing a subordinate channel
 * @dev: device to configure
 * @channel: channel number; 0 puts the device back into "native" mode
 *
 * The channel is recorded as a negative value in dev->num_tc.
 *
 * Returns 0 on success, -ENODEV if @dev is a multiqueue device and
 * -EINVAL if @channel is larger than S16_MAX.
 */
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
	int err = 0;

	if (netif_is_multiqueue(dev)) {
		/* Do not use a multiqueue device to represent a subordinate
		 * channel
		 */
		err = -ENODEV;
	} else if (channel > S16_MAX) {
		/* We allow channels 1 - 32767 to be used for subordinate
		 * channels.  Channel 0 is meant to be "native" mode and used
		 * only to represent the main root device.  We allow writing 0
		 * to reset the device back to normal mode after being used as
		 * a subordinate channel.
		 */
		err = -EINVAL;
	} else {
		dev->num_tc = -channel;
	}

	return err;
}
EXPORT_SYMBOL(netdev_set_sb_channel);
/* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
*/ int netif_set_real_num_tx_queues(struct net_device *dev, unsignedint txq)
{ bool disabling; int rc;
disabling = txq < dev->real_num_tx_queues;
if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL;
if (dev->reg_state == NETREG_REGISTERED ||
dev->reg_state == NETREG_UNREGISTERING) {
netdev_ops_assert_locked(dev);
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
txq); if (rc) return rc;
/** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device * @rxq: Actual number of RX queues * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a * negative error code. If called before registration, it always * succeeds.
*/ int netif_set_real_num_rx_queues(struct net_device *dev, unsignedint rxq)
{ int rc;
if (rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL;
if (dev->reg_state == NETREG_REGISTERED) {
netdev_ops_assert_locked(dev);
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
rxq); if (rc) return rc;
}
/** * netif_set_real_num_queues - set actual number of RX and TX queues used * @dev: Network device * @txq: Actual number of TX queues * @rxq: Actual number of RX queues * * Set the real number of both TX and RX queues. * Does nothing if the number of queues is already correct.
*/ int netif_set_real_num_queues(struct net_device *dev, unsignedint txq, unsignedint rxq)
{ unsignedint old_rxq = dev->real_num_rx_queues; int err;
/** * netif_set_tso_max_size() - set the max size of TSO frames supported * @dev: netdev to update * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. * Unless explicitly set the stack will assume the value of * %GSO_LEGACY_MAX_SIZE.
*/ void netif_set_tso_max_size(struct net_device *dev, unsignedint size)
{
dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size))
netif_set_gso_max_size(dev, size); if (size < READ_ONCE(dev->gso_ipv4_max_size))
netif_set_gso_ipv4_max_size(dev, size);
}
EXPORT_SYMBOL(netif_set_tso_max_size);
/** * netif_set_tso_max_segs() - set the max number of segs supported for TSO * @dev: netdev to update * @segs: max number of TCP segments * * Set the limit on the number of TCP segments the device can generate from * a single TSO super-frame. * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
*/ void netif_set_tso_max_segs(struct net_device *dev, unsignedint segs)
{
dev->tso_max_segs = segs; if (segs < READ_ONCE(dev->gso_max_segs))
netif_set_gso_max_segs(dev, segs);
}
EXPORT_SYMBOL(netif_set_tso_max_segs);
/** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update * @from: netdev from which to copy the limits
*/ void netif_inherit_tso_max(struct net_device *to, conststruct net_device *from)
{
netif_set_tso_max_size(to, from->tso_max_size);
netif_set_tso_max_segs(to, from->tso_max_segs);
}
EXPORT_SYMBOL(netif_inherit_tso_max);
/** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or * divided by 2 if there are more.
*/ int netif_get_num_default_rss_queues(void)
{
cpumask_var_t cpus; int cpu, count = 0;
if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) return 1;
/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Clear the __LINK_STATE_PRESENT bit of @dev.  When the bit was set
 * before and the device is running, all of its TX queues are stopped.
 * Calling this on a device that is already marked absent does nothing.
 */
void netif_device_detach(struct net_device *dev)
{
	/* Only the present -> absent transition triggers any work. */
	if (!test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state))
		return;

	if (!netif_running(dev))
		return;

	netif_tx_stop_all_queues(dev);
}
EXPORT_SYMBOL(netif_device_detach);
/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Set the __LINK_STATE_PRESENT bit of @dev.  When the bit was clear
 * before and the device is running, all of its TX queues are woken and
 * netdev_watchdog_up() is called.  Calling this on a device that is
 * already marked present does nothing.
 */
void netif_device_attach(struct net_device *dev)
{
	/* Only the absent -> present transition triggers any work. */
	if (test_and_set_bit(__LINK_STATE_PRESENT, &dev->state))
		return;

	if (!netif_running(dev))
		return;

	netif_tx_wake_all_queues(dev);
	netdev_watchdog_up(dev);
}
EXPORT_SYMBOL(netif_device_attach);
/*
 * Returns a Tx hash based on the given packet descriptor and on the number
 * of Tx queues, which is used as the distribution range.
*/ static u16 skb_tx_hash(conststruct net_device *dev, conststruct net_device *sb_dev, struct sk_buff *skb)
{
u32 hash;
u16 qoffset = 0;
u16 qcount = dev->real_num_tx_queues;
if (dev->num_tc) {
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
if (dev) { if (dev->dev.parent)
name = dev_driver_string(dev->dev.parent); else
name = netdev_name(dev);
}
skb_dump(KERN_WARNING, skb, false);
WARN(1, "%s: caps=(%pNF, %pNF)\n",
name, dev ? &dev->features : &null_features,
skb->sk ? &skb->sk->sk_route_caps : &null_features);
}
/* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path.
*/ int skb_checksum_help(struct sk_buff *skb)
{
__wsum csum; int ret = 0, offset;
if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed;
if (unlikely(skb_is_gso(skb))) {
skb_warn_bad_offload(skb); return -EINVAL;
}
if (!skb_frags_readable(skb)) { return -EFAULT;
}
/* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong.
*/ if (skb_has_shared_frag(skb)) {
ret = __skb_linearize(skb); if (ret) goto out;
}
#ifdef CONFIG_NET_CRC32C int skb_crc32c_csum_help(struct sk_buff *skb)
{
u32 crc; int ret = 0, offset, start;
if (skb->ip_summed != CHECKSUM_PARTIAL) goto out;
if (unlikely(skb_is_gso(skb))) goto out;
/* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong.
*/ if (unlikely(skb_has_shared_frag(skb))) {
ret = __skb_linearize(skb); if (ret) goto out;
}
start = skb_checksum_start_offset(skb);
offset = start + offsetof(struct sctphdr, checksum); if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
ret = -EINVAL; goto out;
}
ret = skb_ensure_writable(skb, offset + sizeof(__le32)); if (ret) goto out;
/* XXX: check that highmem exists at all on the given machine. */ staticint illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{ #ifdef CONFIG_HIGHMEM int i;
if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag);
/* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev.
*/ #if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb,
netdev_features_t features,
__be16 type)
{ if (eth_p_mpls(type))
features &= skb->dev->mpls_features;
type = skb_network_protocol(skb, NULL);
features = net_mpls_features(skb, features, type);
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
} if (illegal_highdma(skb->dev, skb))
features &= ~NETIF_F_SG;
if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK;
if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) return features & ~NETIF_F_GSO_MASK;
if (!skb_shinfo(skb)->gso_type) {
skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK;
}
/* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now * and we can pull them back in after we have partially * segmented the frame.
*/ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
features &= ~dev->gso_partial_features;
/* Make sure to clear the IPv4 ID mangling feature if the * IPv4 header has the potential to be fragmented.
*/ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ?
inner_ip_hdr(skb) : ip_hdr(skb);
if (!(iph->frag_off & htons(IP_DF)))
features &= ~NETIF_F_TSO_MANGLEID;
}
/* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, * so neither does TSO that depends on it.
*/ if (features & NETIF_F_IPV6_CSUM &&
(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 ||
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
skb_transport_header_was_set(skb) &&
skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
!ipv6_has_hopopt_jumbo(skb))
features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
if (skb_is_gso(skb))
features = gso_features_check(skb, dev, features);
/* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard * features for the netdev
*/ if (skb->encapsulation)
features &= dev->hw_enc_features;
if (skb_vlan_tagged(skb))
features = netdev_intersect_features(features,
dev->vlan_features |
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
if (dev->netdev_ops->ndo_features_check)
features &= dev->netdev_ops->ndo_features_check(skb, dev,
features); else
features &= dflt_features_check(skb, dev, features);
/* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here.
*/ if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation)
skb_set_inner_transport_header(skb,
skb_checksum_start_offset(skb)); else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb)); if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb;
}
}
for (; skb != NULL; skb = next) {
next = skb->next;
skb_mark_not_on_list(skb);
/* in case skb won't be segmented, point to itself */
skb->prev = skb;
skb = validate_xmit_skb(skb, dev, again); if (!skb) continue;
if (!head)
head = skb; else
tail->next = skb; /* If skb was segmented, skb->prev points to * the last segment. If not, it still contains skb.
*/
tail = skb->prev;
} return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
/* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments
*/ if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
u16 gso_segs = shinfo->gso_segs; unsignedint hdr_len;
/* mac layer + network layer */ if (!skb->encapsulation)
hdr_len = skb_transport_offset(skb); else
hdr_len = skb_inner_transport_offset(skb);
/* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { conststruct tcphdr *th; struct tcphdr _tcphdr;
if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
qdisc_run_begin(q)) { /* Retest nolock_qdisc_is_empty() within the protection * of q->seqlock to protect from racing with requeuing.
*/ if (unlikely(!nolock_qdisc_is_empty(q))) {
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
__qdisc_run(q);
qdisc_run_end(q);
goto no_lock_out;
}
qdisc_bstats_cpu_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
!nolock_qdisc_is_empty(q))
__qdisc_run(q);
qdisc_run_end(q); return NET_XMIT_SUCCESS;
}
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
qdisc_run(q);
no_lock_out: if (unlikely(to_free))
kfree_skb_list_reason(to_free,
tcf_get_drop_reason(to_free)); return rc;
}
if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); return NET_XMIT_DROP;
} /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit * and then other tasks will only enqueue packets. The packets will be * sent after the qdisc owner is scheduled again. To prevent this * scenario the task always serialize on the lock.
*/
contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); if (unlikely(contended))
spin_lock(&q->busylock);
spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
__qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
} elseif ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly.
*/
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
}
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
WRITE_ONCE(q->owner, smp_processor_id());
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
qdisc_run_end(q);
}
}
spin_unlock(root_lock); if (unlikely(to_free))
kfree_skb_list_reason(to_free,
tcf_get_drop_reason(to_free)); if (unlikely(contended))
spin_unlock(&q->busylock); return rc;
}
if (static_branch_unlikely(&tcx_needed_key)) {
sch_ret = tcx_run(entry, skb, true); if (sch_ret != TC_ACT_UNSPEC) goto ingress_verdict;
}
sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
ingress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* skb_mac_header check was done by BPF, so we can safely * push the L2 header back before redirecting to another * netdev.
*/
__skb_push(skb, skb->mac_len); if (skb_do_redirect(skb) == -EAGAIN) {
__skb_pull(skb, skb->mac_len);
*another = true; break;
}
*ret = NET_RX_SUCCESS;
bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_RX_DROP;
bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP:
consume_skb(skb);
fallthrough; case TC_ACT_CONSUMED:
*ret = NET_RX_SUCCESS;
bpf_net_ctx_clear(bpf_net_ctx); return NULL;
}
bpf_net_ctx_clear(bpf_net_ctx);
/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller.
*/ if (static_branch_unlikely(&tcx_needed_key)) {
sch_ret = tcx_run(entry, skb, false); if (sch_ret != TC_ACT_UNSPEC) goto egress_verdict;
}
sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
egress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_XMIT_DROP;
bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP:
consume_skb(skb);
fallthrough; case TC_ACT_CONSUMED:
*ret = NET_XMIT_SUCCESS;
bpf_net_ctx_clear(bpf_net_ctx); return NULL;
}
bpf_net_ctx_clear(bpf_net_ctx);
/**
 * __dev_queue_xmit() - transmit a buffer
 * @skb:	buffer to transmit
 * @sb_dev:	subordinate device used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * When calling this method, interrupts MUST be enabled. This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method. (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * Return:
 * * 0				- buffer successfully transmitted
 * * positive qdisc return code	- NET_XMIT_DROP etc.
 * * negative errno		- other errors
*/ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{ struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false;
if (netdev_xmit_txqueue_skipped())
txq = netdev_tx_queue_mapping(dev, skb);
} #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache.
*/ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb); else
skb_dst_force(skb);
if (!txq)
txq = netdev_core_pick_tx(dev, skb, sb_dev);
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb); if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq); goto out;
}
/* The device has no queue. Common case for software devices: * loopback, all the sorts of tunnels...
* Really, it is unlikely that netif_tx_lock protection is necessary * here. (f.e. loopback and IP tunnels are clean ignoring statistics * counters.) * However, it is possible, that they rely on protection * made by us here.
* Check this and shot the lock. It is not prone from deadlocks. *Either shot noqueue qdisc, it is even simpler 8)
*/ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */
/* Other cpus might concurrently change txq->xmit_lock_owner * to -1 or to their cpu id, but not to our id.
*/ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert;
skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out;
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
dev_xmit_recursion_inc();
skb = dev_hard_start_xmit(skb, dev, txq, &rc);
dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq); goto out;
}
}
HARD_TX_UNLOCK(dev, txq);
net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
dev->name);
} else { /* Recursion is detected! It is possible, * unfortunately
*/
recursion_alert:
net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
dev->name);
}
}
/*
 * Default backlog weight and the bias values applied on the receive and
 * transmit sides.  NOTE(review): how these are combined is not visible
 * here; presumably they are adjusted through sysctl. Confirm before
 * relying on that.
 */
int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
/* Called with irq disabled */ staticinlinevoid ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi)
{ struct task_struct *thread;
lockdep_assert_irqs_disabled();
if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/netif_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL.
*/
thread = READ_ONCE(napi->thread); if (thread) { if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) goto use_local_napi;
use_local_napi:
DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ.
*/ if (!sd->in_net_rx_action)
raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RFS_ACCEL /** * rps_flow_is_active - check whether the flow is recently active. * @rflow: Specific flow to check activity. * @flow_table: per-queue flowtable that @rflow belongs to. * @cpu: CPU saved in @rflow. * * If the CPU has processed many packets since the flow's last activity * (beyond 10 times the table size), the flow is considered stale. * * Return: true if flow was recently active.
*/ staticbool rps_flow_is_active(struct rps_dev_flow *rflow, struct rps_dev_flow_table *flow_table, unsignedint cpu)
{ unsignedint flow_last_active; unsignedint sd_input_head;
/* Should we steer this flow to a different hardware queue? */ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
!(dev->features & NETIF_F_NTUPLE)) goto out;
rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); if (rxq_index == skb_get_rx_queue(skb)) goto out;
if (old_rflow->filter == rc)
WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out: #endif
head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
rps_input_queue_tail_save(&rflow->last_qtail, head);
}
WRITE_ONCE(rflow->cpu, next_cpu); return rflow;
}
/* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry.
*/ staticint get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp)
{ conststruct rps_sock_flow_table *sock_flow_table; struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1;
u32 tcpu;
u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n",
dev->name, index, dev->real_num_rx_queues); goto done;
}
rxqueue += index;
}
/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
/* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
*/
ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps;
next_cpu = ident & net_hotdata.rps_cpu_mask;
/* OK, now we know there is a match, * we can look at the local (per receive queue) flow table
*/
rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
tcpu = rflow->cpu;
/* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. * This guarantees that all previous packets for the flow * have been dequeued, thus preserving in order delivery.
*/ if (unlikely(tcpu != next_cpu) &&
(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu; goto done;
}
}
try_rps:
if (map) {
tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) {
cpu = tcpu; goto done;
}
}
done: return cpu;
}
#ifdef CONFIG_RFS_ACCEL
/** * rps_may_expire_flow - check whether an RFS hardware filter may be removed * @dev: Device on which the filter was set * @rxq_index: RX queue index * @flow_id: Flow ID passed to ndo_rx_flow_steer() * @filter_id: Filter ID returned by ndo_rx_flow_steer() * * Drivers that implement ndo_rx_flow_steer() should periodically call * this function for each installed filter and remove the filters for * which it returns %true.
*/ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
u32 flow_id, u16 filter_id)
{ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true;
/* * After we queued a packet into sd->input_pkt_queue, * we need to make sure this queue is serviced soon. * * - If this is another cpu queue, link it to our rps_ipi_list, * and make sure we will process rps_ipi_list from net_rx_action(). * * - If this is our own queue, NAPI schedule our backlog. * Note that this also raises NET_RX_SOFTIRQ.
*/ staticvoid napi_schedule_rps(struct softnet_data *sd)
{ struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
#ifdef CONFIG_RPS if (sd != mysd) { if (use_backlog_threads()) {
__napi_schedule_irqoff(&sd->backlog); return;
}
/* If not called from net_rx_action() or napi_threaded_poll() * we have to raise NET_RX_SOFTIRQ.
*/ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
__raise_softirq_irqoff(NET_RX_SOFTIRQ); return;
} #endif/* CONFIG_RPS */
__napi_schedule_irqoff(&mysd->backlog);
}
if (likely(fl->buckets[old_flow]))
fl->buckets[old_flow]--;
if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { /* Pairs with READ_ONCE() in softnet_seq_show() */
WRITE_ONCE(fl->count, fl->count + 1);
rcu_read_unlock(); returntrue;
}
}
rcu_read_unlock(); #endif returnfalse;
}
/* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue).
*/ staticint enqueue_to_backlog(struct sk_buff *skb, int cpu, unsignedint *qtail)
{ enum skb_drop_reason reason; struct softnet_data *sd; unsignedlong flags; unsignedint qlen; int max_backlog;
u32 tail;
reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) goto bad_dev;
qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
max_backlog = READ_ONCE(net_hotdata.max_backlog); if (unlikely(qlen > max_backlog)) goto cpu_backlog_drop;
backlog_lock_irq_save(sd, &flags);
qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock.
*/ if (!__test_and_set_bit(NAPI_STATE_SCHED,
&sd->backlog.state))
napi_schedule_rps(sd);
}
__skb_queue_tail(&sd->input_pkt_queue, skb);
tail = rps_input_queue_tail_incr(sd);
backlog_unlock_irq_restore(sd, &flags);
/* save the tail outside of the critical section */
rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS;
}
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n",
dev->name, index, dev->real_num_rx_queues);
/* The XDP program wants to see the packet starting at the MAC * header.
*/
mac_len = skb->data - skb_mac_header(skb);
hard_start = skb->data - skb_headroom(skb);
/* SKB "head" area always have tailroom for skb_shared_info */
frame_sz = (void *)skb_end_pointer(skb) - hard_start;
frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
/* check if bpf_xdp_adjust_head was used */
off = xdp->data - orig_data; if (off) { if (off > 0)
__skb_pull(skb, off); elseif (off < 0)
__skb_push(skb, -off);
/* check if bpf_xdp_adjust_tail was used */
off = xdp->data_end - orig_data_end; if (off != 0) {
skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
skb->len += off; /* positive on grow, negative on shrink */
}
/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
*/ if (xdp_buff_has_frags(xdp))
skb->data_len = skb_shinfo(skb)->xdp_frags_size; else
skb->data_len = 0;
/* check if XDP changed eth hdr such SKB needs update */
eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) ||
(orig_host != ether_addr_equal_64bits(eth->h_dest,
skb->dev->dev_addr)) ||
(orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
__skb_push(skb, ETH_HLEN);
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, skb->dev);
}
/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull * before calling us again on redirect path. We do not call do_redirect * as we leave that up to the caller. * * Caller is responsible for managing lifetime of skb (i.e. calling * kfree_skb in response to actions it cannot handle/XDP_DROP).
*/ switch (act) { case XDP_REDIRECT: case XDP_TX:
__skb_push(skb, mac_len); break; case XDP_PASS:
metalen = xdp->data - xdp->data_meta; if (metalen)
skb_metadata_set(skb, metalen); break;
}
/* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here.
*/
hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
troom = skb->tail + skb->data_len - skb->end;
err = pskb_expand_head(skb,
hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
troom > 0 ? troom + 128 : 0, GFP_ATOMIC); if (err) return err;
/* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing.
*/ if (skb_is_redirected(skb)) return XDP_PASS;
/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM * bytes. This is the guarantee that also native XDP provides, * thus we need to do it here as well.
*/
mac_len = skb->data - skb_mac_header(skb);
__skb_push(skb, mac_len);
if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop;
}
__skb_pull(*pskb, mac_len);
act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default:
bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
fallthrough; case XDP_ABORTED:
trace_xdp_exception((*pskb)->dev, xdp_prog, act);
fallthrough; case XDP_DROP:
do_drop:
kfree_skb(*pskb); break;
}
return act;
}
/* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. This also means * that XDP packets are able to starve other packets going through a qdisc, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue.
*/ void generic_xdp_tx(struct sk_buff *skb, conststruct bpf_prog *xdp_prog)
{ struct net_device *dev = skb->dev; struct netdev_queue *txq; bool free_skb = true; int cpu, rc;
txq = netdev_core_pick_tx(dev, skb, NULL);
cpu = smp_processor_id();
HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_frozen_or_drv_stopped(txq)) {
rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc))
free_skb = false;
}
HARD_TX_UNLOCK(dev, txq); if (free_skb) {
trace_xdp_exception(dev, xdp_prog, XDP_TX);
dev_core_stats_tx_dropped_inc(dev);
kfree_skb(skb);
}
}
ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
} return ret;
}
/**
 *	__netif_rx - netif_rx() variant that leaves bottom halves alone
 *	@skb: buffer to post
 *
 *	Does the same job as netif_rx() but never disables bottom halves,
 *	so it may only be invoked from interrupt context (either hard or
 *	soft interrupt).
 *
 *	Return: the value reported by netif_rx_internal() for @skb.
 */
int __netif_rx(struct sk_buff *skb)
{
	int rc;

	trace_netif_rx_entry(skb);
	rc = netif_rx_internal(skb);
	trace_netif_rx_exit(rc);

	return rc;
}
EXPORT_SYMBOL(__netif_rx);
/**
 *	netif_rx - post buffer to the network code
 *	@skb: buffer to post
 *
 *	Receives a packet from a device driver and queues it for the upper
 *	(protocol) levels to process via the backlog NAPI device. It always
 *	succeeds. The buffer may be dropped during processing for congestion
 *	control or by the protocol layers.
 *	Modern NIC drivers should use NAPI and GRO instead.
 *
 *	This function can be used from interrupt and from process context.
 *	A caller in process context must not disable interrupts before
 *	invoking it; bottom halves are disabled here when neither a hard nor
 *	a soft interrupt is in progress.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
{
	/* Sampled once so that the enable below mirrors the disable */
	bool in_irq_ctx = hardirq_count() | softirq_count();
	int rc;

	if (!in_irq_ctx)
		local_bh_disable();

	trace_netif_rx_entry(skb);
	rc = netif_rx_internal(skb);
	trace_netif_rx_exit(rc);

	if (!in_irq_ctx)
		local_bh_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx);
/* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
if (!(q->flags & TCQ_F_NOLOCK)) {
root_lock = qdisc_lock(q);
spin_lock(root_lock);
} elseif (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
&q->state))) { /* There is a synchronize_net() between * STATE_DEACTIVATED flag being set and * qdisc_reset()/some_qdisc_is_busy() in * dev_deactivate(), so we can safely bail out * early here to avoid data race between * qdisc_deactivate() and some_qdisc_is_busy() * for lockless qdisc.
*/
clear_bit(__QDISC_STATE_SCHED, &q->state); continue;
}
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q); if (root_lock)
spin_unlock(root_lock);
}
rcu_read_unlock();
}
xfrm_dev_backlog(sd);
}
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
/** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check * * Check if a receive handler is already registered for a given device. * Return true if there one. * * The caller must hold the rtnl_mutex.
*/ bool netdev_is_rx_handler_busy(struct net_device *dev)
{
ASSERT_RTNL(); return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 *
 *	Return: 0 on success, -EBUSY if a handler is already registered,
 *	-EINVAL if the device has IFF_NO_RX_HANDLER set.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	if (netdev_is_rx_handler_busy(dev))
		return -EBUSY;

	if (dev->priv_flags & IFF_NO_RX_HANDLER)
		return -EINVAL;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device: the handler pointer is
 *	cleared first, then synchronize_net() is called, and only afterwards
 *	is the handler data pointer cleared.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well. That is why rx_handler_data is only cleared after
	 * synchronize_net() below has returned.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
/* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs.
*/ staticbool skb_pfmemalloc_protocol(struct sk_buff *skb)
{ switch (skb->protocol) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): returntrue; default: returnfalse;
}
}
staticinlineint nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev)
{ if (nf_hook_ingress_active(skb)) { int ingress_retval;
skb_reset_network_header(skb); #if !defined(CONFIG_DEBUG_NET) /* We plan to no longer reset the transport header here. * Give some time to fuzzers and dev build to catch bugs * in network stacks.
*/ if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb); #endif
skb_reset_mac_len(skb);
pt_prev = NULL;
another_round:
skb->skb_iif = skb->dev->ifindex;
__this_cpu_inc(softnet_data.processed);
if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2;
if (skb_vlan_tag_present(skb)) { if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
} if (vlan_do_receive(&skb)) goto another_round; elseif (unlikely(!skb)) goto out;
}
rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
} switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED:
ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT:
deliver_exact = true; break; case RX_HANDLER_PASS: break; default:
BUG();
}
}
if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device.
*/
skb->pkt_type = PACKET_OTHERHOST;
} elseif (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0.
*/
__vlan_hwaccel_clear_tag(skb);
skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; if (vlan_do_receive(&skb)) /* After stripping off 802.1P header with vlan 0 * vlan dev is found for inner header.
*/ goto another_round; elseif (unlikely(!skb)) goto out; else /* We have stripped outer 802.1P vlan 0 header. * But could not find vlan dev. * check again for vlan id to set OTHERHOST.
*/ goto check_vlan_id;
} /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point
*/
__vlan_hwaccel_clear_tag(skb);
}
type = skb->protocol;
/* deliver only exact match when indicated */ if (likely(!deliver_exact)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
/* orig_dev and skb->dev could belong to different netns; * Even in such case we need to traverse only the list * coming from skb->dev, as the ptype owner (packet socket) * will use dev_net(skb->dev) to do namespace filtering.
*/
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&dev_net_rcu(skb->dev)->ptype_specific);
}
if (pt_prev) {
*ppt_prev = pt_prev;
} else {
drop: if (!deliver_exact)
dev_core_stats_rx_dropped_inc(skb->dev); else
dev_core_stats_rx_nohandler_inc(skb->dev);
kfree_skb_reason(skb, drop_reason); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out: /* The invariant here is that if *ppt_prev is not NULL * then skb should also be non-NULL. * * Apparently *ppt_prev assignment above holds this invariant due to * skb dereferencing near it.
*/
*pskb = skb; return ret;
}
ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev)
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
skb->dev, pt_prev, orig_dev); return ret;
}
/** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped
*/ int netif_receive_skb_core(struct sk_buff *skb)
{ int ret;
rcu_read_lock();
ret = __netif_receive_skb_one_core(skb, false);
rcu_read_unlock();
if (!pt_prev) return; if (list_empty(head)) return; if (pt_prev->list_func != NULL)
INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
ip_list_rcv, head, pt_prev, orig_dev); else
list_for_each_entry_safe(skb, next, head, list) {
skb_list_del_init(skb);
pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
}
/* Run every skb on @head through __netif_receive_skb_core() and batch the
 * survivors into sublists that share the same final packet_type and
 * original device, handing each sublist to
 * __netif_receive_skb_list_ptype().
 */
static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
{
	/* Fast-path assumptions:
	 * - There is no RX handler.
	 * - Only one packet_type matches.
	 * If either of these fails, we will end up doing some per-packet
	 * processing in-line, then handling the 'last ptype' for the whole
	 * sublist.  This can't cause out-of-order delivery to any single ptype,
	 * because the 'last ptype' must be constant across the sublist, and all
	 * other ptypes are handled per-packet.
	 */
	/* Current (common) ptype of sublist */
	struct packet_type *pt_curr = NULL;
	/* Current (common) orig_dev of sublist */
	struct net_device *od_curr = NULL;
	struct sk_buff *skb, *next;
	LIST_HEAD(sublist);

	list_for_each_entry_safe(skb, next, head, list) {
		/* Sampled before the core call, which takes &skb and may
		 * therefore replace it.
		 */
		struct net_device *orig_dev = skb->dev;
		struct packet_type *pt_prev = NULL;

		skb_list_del_init(skb);
		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
		/* No final ptype left: nothing to batch for this skb */
		if (!pt_prev)
			continue;
		if (pt_curr != pt_prev || od_curr != orig_dev) {
			/* dispatch old sublist */
			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
			/* start new sublist */
			INIT_LIST_HEAD(&sublist);
			pt_curr = pt_prev;
			od_curr = orig_dev;
		}
		list_add_tail(&skb->list, &sublist);
	}

	/* dispatch final sublist */
	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
}
staticint __netif_receive_skb(struct sk_buff *skb)
{ int ret;
if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsignedint noreclaim_flag;
/* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites.
*/
noreclaim_flag = memalloc_noreclaim_save();
ret = __netif_receive_skb_one_core(skb, true);
memalloc_noreclaim_restore(noreclaim_flag);
} else
ret = __netif_receive_skb_one_core(skb, false);
rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) {
list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) { /* Will be handled, remove from list */
skb_list_del_init(skb);
enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
}
}
} #endif
__netif_receive_skb_list(head);
rcu_read_unlock();
}
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for receive data processing. It always succeeds;
 *	the buffer may still be dropped during processing for congestion
 *	control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);
	trace_netif_receive_skb_exit(rc);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
/**
 *	netif_receive_skb_list - process many receive buffers from network
 *	@head: list of skbs to process.
 *
 *	The return value of netif_receive_skb() is normally ignored and
 *	would not be meaningful for a whole list, so this function returns
 *	void. An empty @head is a no-op.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 */
void netif_receive_skb_list(struct list_head *head)
{
	struct sk_buff *pkt;

	if (list_empty(head))
		return;

	/* Only walk the list when the entry tracepoint is enabled */
	if (trace_netif_receive_skb_list_entry_enabled())
		list_for_each_entry(pkt, head, list)
			trace_netif_receive_skb_list_entry(pkt);

	netif_receive_skb_list_internal(head);
	trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);
/* Network device is going away, flush any packets still pending */ staticvoid flush_backlog(struct work_struct *work)
{ struct sk_buff *skb, *tmp; struct sk_buff_head list; struct softnet_data *sd;
/* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue
*/
do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
!skb_queue_empty_lockless(&sd->process_queue);
backlog_unlock_irq_enable(sd);
return do_flush; #endif /* without RPS we can't safely check input_pkt_queue: during a * concurrent remote skb_queue_splice() we can detect as empty both * input_pkt_queue and process_queue even if the latter could end-up * containing a lot of packets.
*/ returntrue;
}
/* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of * them.
*/
for_each_cpu(cpu, &ptr->flush_cpus)
flush_work(&ptr->w[cpu]);
cpus_read_unlock();
if (ptr != flush_backlogs_fallback)
kfree(ptr); else
mutex_unlock(&flush_backlogs_mutex);
}
if (cpu_online(remsd->cpu))
smp_call_function_single_async(remsd->cpu, &remsd->csd);
remsd = next;
} #endif
}
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!use_backlog_threads() && remsd) {
		/* Detach the pending list while irqs are still off */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		net_rps_send_ipi(remsd);
	} else
#endif
		local_irq_enable();
}
/* NAPI poll handler of the per-CPU backlog device.
 *
 * Drains sd->process_queue through __netif_receive_skb(), refilling it
 * from sd->input_pkt_queue until that queue is empty or @quota skbs have
 * been handled.
 *
 * Returns the number of skbs processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
	bool again = true;
	int work = 0;

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
	while (again) {
		struct sk_buff *skb;

		local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			/* The lock only covers the dequeue, not delivery */
			local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
			rcu_read_lock();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			if (++work >= quota) {
				rps_input_queue_head_add(sd, work);
				return work;
			}

			/* Re-take the lock before the next dequeue */
			local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
		}
		local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);

		backlog_lock_irq_disable(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state &= NAPIF_STATE_THREADED;
			again = false;
		} else {
			local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
			local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
		}
		backlog_unlock_irq_enable(sd);
	}

	if (work)
		rps_input_queue_head_add(sd, work);
	return work;
}
/** * __napi_schedule - schedule for receive * @n: entry to schedule * * The entry's receive function will be scheduled to run. * Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/ void __napi_schedule(struct napi_struct *n)
{ unsignedlong flags;
/** * napi_schedule_prep - check if napi can be scheduled * @n: napi context * * Test if NAPI routine is already running, and if not mark * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable.
*/ bool napi_schedule_prep(struct napi_struct *n)
{ unsignedlongnew, val = READ_ONCE(n->state);
do { if (unlikely(val & NAPIF_STATE_DISABLE)) returnfalse; new = val | NAPIF_STATE_SCHED;
/* Sets STATE_MISSED bit if STATE_SCHED was already set * This was suggested by Alexander Duyck, as compiler * emits better code than : * if (val & NAPIF_STATE_SCHED) * new |= NAPIF_STATE_MISSED;
*/ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
NAPIF_STATE_MISSED;
} while (!try_cmpxchg(&n->state, &val, new));
/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Variant of __napi_schedule() assuming hard irqs are masked.
 *
 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
 * because the interrupt disabled assumption might not be true
 * due to force-threaded interrupts and spinlock substitution.
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
		__napi_schedule(n);
		return;
	}

	____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
/* Mark a NAPI poll round as finished.
 * @n: napi context that was being polled
 * @work_done: amount of work done in this round; non-zero reloads the
 *	       deferred-hard-irq counter
 *
 * Returns false when the instance is owned by netpoll/busy polling, when
 * a missed schedule forces another poll, or when hard irqs are being
 * deferred with a non-zero GRO flush timeout; true otherwise.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags, val, new, timeout = 0;
	bool ret = true;

	/*
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case its running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
	 */
	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
				 NAPIF_STATE_IN_BUSY_POLL)))
		return false;

	if (work_done) {
		if (n->gro.bitmask)
			timeout = napi_get_gro_flush_timeout(n);
		n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
	}
	if (n->defer_hard_irqs_count > 0) {
		n->defer_hard_irqs_count--;
		timeout = napi_get_gro_flush_timeout(n);
		if (timeout)
			ret = false;
	}

	/*
	 * When the NAPI instance uses a timeout and keeps postponing
	 * it, we need to bound somehow the time packets are kept in
	 * the GRO layer.
	 */
	gro_flush_normal(&n->gro, !!timeout);

	if (unlikely(!list_empty(&n->poll_list))) {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		list_del_init(&n->poll_list);
		local_irq_restore(flags);
	}
	WRITE_ONCE(n->list_owner, -1);

	val = READ_ONCE(n->state);
	do {
		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
			      NAPIF_STATE_SCHED_THREADED |
			      NAPIF_STATE_PREFER_BUSY_POLL);

		/* If STATE_MISSED was set, leave STATE_SCHED set,
		 * because we will call napi->poll() one more time.
		 * This C code was suggested by Alexander Duyck to help gcc.
		 */
		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
						    NAPIF_STATE_SCHED;
	} while (!try_cmpxchg(&n->state, &val, new));

	/* val is the state seen before the update: a missed schedule
	 * means the instance has to be polled again right away.
	 */
	if (unlikely(val & NAPIF_STATE_MISSED)) {
		__napi_schedule(n);
		return false;
	}

	if (timeout)
		hrtimer_start(&n->timer, ns_to_ktime(timeout),
			      HRTIMER_MODE_REL_PINNED);
	return ret;
}
EXPORT_SYMBOL(napi_complete_done);
/* Busy polling means there is a high chance device driver hard irq * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was * set in napi_schedule_prep(). * Since we are about to call napi->poll() once more, we can safely * clear NAPI_STATE_MISSED. * * Note: x86 could use a single "lock and ..." instruction * to perform these two clear_bit()
*/
clear_bit(NAPI_STATE_MISSED, &napi->state);
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
/* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU.
*/
trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock); if (rc == budget)
__busy_poll_stop(napi, skip_schedule);
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
}
rcu_read_lock();
napi = napi_by_id(napi_id); if (napi) { /* If irq_suspend_timeout is set to 0 between the call to * napi_suspend_irqs and now, the original value still * determines the safety timeout as intended and napi_watchdog * will resume irq processing.
*/ if (napi_get_irq_suspend_timeout(napi)) {
local_bh_disable();
napi_schedule(napi);
local_bh_enable();
}
}
rcu_read_unlock();
}
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return;
spin_lock_irqsave(&napi_hash_lock, flags);
/* 0..NR_CPUS range is reserved for sender_cpu use */ do { if (unlikely(!napi_id_valid(++napi_gen_id)))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
__napi_hash_add_with_id(napi, napi_gen_id);
spin_unlock_irqrestore(&napi_hash_lock, flags);
}
/* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi
*/ staticvoid napi_hash_del(struct napi_struct *napi)
{ unsignedlong flags;
napi = container_of(timer, struct napi_struct, timer);
/* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/ if (!napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
}
/* Wait until the napi STATE_THREADED is unset. */ while (true) {
val = READ_ONCE(napi->state);
/* If napi kthread own this napi or the napi is idle, * STATE_THREADED can be unset here.
*/ if ((val & NAPIF_STATE_SCHED_THREADED) ||
!(val & NAPIF_STATE_SCHED)) { new = val & (~NAPIF_STATE_THREADED);
} else {
msleep(20); continue;
}
if (try_cmpxchg(&napi->state, &val, new)) break;
}
/* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by * the kthread.
*/ while (true) { if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) break;
/* Switch a single NAPI instance into or out of threaded mode.
 *
 * A kthread is created on demand when threaded mode is requested and the
 * instance has none yet. The requested mode is also recorded in
 * napi->config when a config is attached.
 *
 * Returns 0 on success or the error from napi_kthread_create().
 */
int napi_set_threaded(struct napi_struct *napi,
		      enum netdev_napi_threaded threaded)
{
	if (threaded && !napi->thread) {
		int rc = napi_kthread_create(napi);

		if (rc)
			return rc;
	}

	if (napi->config)
		napi->config->threaded = threaded;

	/* A mode change may not take effect right away when this napi is
	 * actively being polled: the switch between threaded mode and
	 * softirq mode then happens in the next round of napi_schedule().
	 * This should not cause hiccups/stalls to the live traffic.
	 */
	if (!threaded && napi->thread) {
		napi_stop_kthread(napi);
		return 0;
	}

	/* Make sure kthread is created before THREADED bit is set. */
	smp_mb__before_atomic();
	assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);

	return 0;
}
/* Apply a threaded-NAPI setting to a device and to every NAPI it owns.
 *
 * If a kthread cannot be created for one of the listed NAPIs, the whole
 * device falls back to NETDEV_NAPI_THREADED_DISABLED and the creation
 * error is returned after the fallback has been applied.
 */
int netif_set_threaded(struct net_device *dev,
		       enum netdev_napi_threaded threaded)
{
	struct napi_struct *napi;
	int ret = 0;
	int idx;

	netdev_assert_locked_or_invisible(dev);

	if (threaded) {
		/* Pre-create every missing kthread before flipping any state. */
		list_for_each_entry(napi, &dev->napi_list, dev_list) {
			if (napi->thread)
				continue;

			ret = napi_kthread_create(napi);
			if (!ret)
				continue;

			threaded = NETDEV_NAPI_THREADED_DISABLED;
			break;
		}
	}

	WRITE_ONCE(dev->threaded, threaded);

	/* The error should not occur as the kthreads are already created. */
	list_for_each_entry(napi, &dev->napi_list, dev_list)
		WARN_ON_ONCE(napi_set_threaded(napi, threaded));

	/* Override the config for all NAPIs even if currently not listed */
	for (idx = 0; idx < dev->num_napi_configs; idx++)
		dev->napi_config[idx].threaded = threaded;

	return ret;
}
/**
 * netif_threaded_enable() - enable threaded NAPIs
 * @dev: net_device instance
 *
 * Request threaded mode for the NAPI instances of the device. This may be
 * useful for devices where multiple NAPI instances get scheduled by a single
 * interrupt. Threaded NAPI allows moving the NAPI processing to cores other
 * than the core where IRQ is mapped.
 *
 * A failure of netif_set_threaded() is reported with a one-time warning;
 * nothing is returned to the caller.
 *
 * This function should be called before @dev is registered.
 */
void netif_threaded_enable(struct net_device *dev)
{
	WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED));
}
EXPORT_SYMBOL(netif_threaded_enable);
/** * netif_queue_set_napi - Associate queue with the napi * @dev: device to which NAPI and queue belong * @queue_index: Index of queue * @type: queue type as RX or TX * @napi: NAPI context, pass NULL to clear previously set NAPI * * Set queue with its corresponding napi context. This should be done after * registering the NAPI handler for the queue-vector and the queues have been * mapped to the corresponding interrupt vector.
*/ void netif_queue_set_napi(struct net_device *dev, unsignedint queue_index, enum netdev_queue_type type, struct napi_struct *napi)
{ struct netdev_rx_queue *rxq; struct netdev_queue *txq;
if (WARN_ON_ONCE(napi && !napi->dev)) return;
netdev_ops_assert_locked_or_invisible(dev);
if (n->dev->irq_affinity_auto &&
test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
irq_set_affinity(n->irq, &n->config->affinity_mask);
/* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us.
*/ if (n->config->napi_id) {
napi_hash_add_with_id(n, n->config->napi_id);
} else {
napi_hash_add(n);
n->config->napi_id = n->napi_id;
}
/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will * inherit an existing ID try to insert it at the right position.
*/ staticvoid
netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
{ unsignedint new_id, pos_id; struct list_head *higher; struct napi_struct *pos;
new_id = UINT_MAX; if (napi->config && napi->config->napi_id)
new_id = napi->config->napi_id;
if (pos_id <= new_id) break;
higher = &pos->dev_list;
}
list_add_rcu(&napi->dev_list, higher); /* adds after higher */
}
/* Double check that napi_get_frags() allocates skbs with * skb->head being backed by slab, not a page fragment. * This is to make sure bug fixed in 3226b158e67c * ("net: avoid 32 x truesize under-estimation for tiny skbs") * does not accidentally come back.
*/ staticvoid napi_get_frags_check(struct napi_struct *napi)
{ struct sk_buff *skb;
/* default settings from sysfs are applied to all NAPIs. any per-NAPI * configuration will be loaded in napi_enable
*/
napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable().
*/ if (napi_get_threaded_config(dev, napi)) if (napi_kthread_create(napi))
dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
netif_napi_set_irq_locked(napi, -1);
}
EXPORT_SYMBOL(netif_napi_add_weight_locked);
val = READ_ONCE(n->state); do { while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
usleep_range(20, 200);
val = READ_ONCE(n->state);
}
new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
} while (!try_cmpxchg(&n->state, &val, new));
hrtimer_cancel(&n->timer);
if (n->config)
napi_save_config(n); else
napi_hash_del(n);
/**
 * napi_disable() - prevent NAPI from scheduling
 * @n: NAPI context
 *
 * Stop NAPI from being scheduled on this context.
 * Waits till any outstanding processing completes.
 * Takes netdev_lock() for associated net_device; callers that already
 * hold it use napi_disable_locked() instead.
 */
void napi_disable(struct napi_struct *n)
{
	struct net_device *dev = n->dev;

	netdev_lock(dev);
	napi_disable_locked(n);
	netdev_unlock(dev);
}
EXPORT_SYMBOL(napi_disable);
/* Enable scheduling of a NAPI instance.
 *
 * Restores the saved per-NAPI config (or hashes the NAPI when it has no
 * config), then atomically clears SCHED and NPSVC, setting THREADED when
 * both the device and this NAPI are set up for threaded polling.
 */
void napi_enable_locked(struct napi_struct *n)
{
	unsigned long cur, next;

	cur = READ_ONCE(n->state);

	if (n->config)
		napi_restore_config(n);
	else
		napi_hash_add(n);

	for (;;) {
		/* SCHED must still be set when enabling. */
		BUG_ON(!test_bit(NAPI_STATE_SCHED, &cur));

		next = cur & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
		if (n->dev->threaded && n->thread)
			next |= NAPIF_STATE_THREADED;

		/* On failure try_cmpxchg() refreshes 'cur'; just retry. */
		if (try_cmpxchg(&n->state, &cur, next))
			break;
	}
}
EXPORT_SYMBOL(napi_enable_locked);
/**
 * napi_enable() - enable NAPI scheduling
 * @n: NAPI context
 *
 * Enable scheduling of a NAPI instance.
 * Must be paired with napi_disable().
 * Takes netdev_lock() for associated net_device; callers that already
 * hold it use napi_enable_locked() instead.
 */
void napi_enable(struct napi_struct *n)
{
	struct net_device *dev = n->dev;

	netdev_lock(dev);
	napi_enable_locked(n);
	netdev_unlock(dev);
}
EXPORT_SYMBOL(napi_enable);
/* Tear down a NAPI instance that was previously added to its device.
 *
 * Must be called in process context, with the device's netdev lock held.
 * Does nothing if the NAPI is not currently listed, so calling it a second
 * time for the same NAPI is a no-op.
 */
void __netif_napi_del_locked(struct napi_struct *napi)
{
	netdev_assert_locked(napi->dev);

	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
		return;

	/* Make sure NAPI is disabled (or was never enabled). */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));

	if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
		irq_set_affinity_notifier(napi->irq, NULL);

	/* Detach from the persistent per-NAPI config slot. */
	if (napi->config) {
		napi->index = -1;
		napi->config = NULL;
	}

	/* Unlink from dev->napi_list; the entry was inserted with
	 * list_add_rcu(), so it has to be removed with the RCU variant.
	 */
	list_del_rcu(&napi->dev_list);
	napi_free_frags(napi);

	gro_cleanup(&napi->gro);

	if (napi->thread) {
		kthread_stop(napi->thread);
		napi->thread = NULL;
	}
}
EXPORT_SYMBOL(__netif_napi_del_locked);
staticint __napi_poll(struct napi_struct *n, bool *repoll)
{ int work, weight;
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid * accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0; if (napi_is_scheduled(n)) {
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
xdp_do_check_flushed(n);
}
if (unlikely(work > weight))
netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
n->poll, work, weight);
if (likely(work < weight)) return work;
/* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can * move the instance around on the list at-will.
*/ if (unlikely(napi_disable_pending(n))) {
napi_complete(n); return work;
}
/* The NAPI context has more processing work, but busy-polling * is preferred. Exit early.
*/ if (napi_prefer_busy_poll(n)) { if (napi_complete_done(n, work)) { /* If timeout is not set, we need to make sure * that the NAPI is re-scheduled.
*/
napi_schedule(n);
} return work;
}
/* Flush too old packets. If HZ < 1000, flush all packets */
gro_flush_normal(&n->gro, HZ >= 1000);
/* Some drivers may have called napi_schedule * prior to exhausting their budget.
*/ if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog"); return work;
}
if (do_repoll) { #ifdefined(CONFIG_DEBUG_NET) if (unlikely(!napi_is_scheduled(n)))
pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
n->dev->name, n->poll); #endif
list_add_tail(&n->poll_list, repoll);
}
netpoll_poll_unlock(have);
while (!kthread_should_stop()) { /* Testing SCHED_THREADED bit here to make sure the current * kthread owns this napi and could poll on this napi. * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable().
*/ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING); return 0;
}
if (list_empty(&list)) { if (list_empty(&repoll)) {
sd->in_net_rx_action = false;
barrier(); /* We need to check if ____napi_schedule() * had refilled poll_list while * sd->in_net_rx_action was true.
*/ if (!list_empty(&sd->poll_list)) goto start; if (!sd_has_rps_ipi_waiting(sd)) goto end;
} break;
}
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);
/* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ.
*/ if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) { /* Pairs with READ_ONCE() in softnet_seq_show() */
WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break;
}
}
/** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks only immediate upper device, * not through a complete stack of devices. The caller must hold the RTNL lock.
*/ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev)
{ struct netdev_nested_priv priv = {
.data = (void *)upper_dev,
};
/** * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks the entire upper device chain. * The caller must hold rcu lock.
*/
/** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock.
*/ bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
/** * netdev_master_upper_dev_get - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RTNL lock.
*/ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{ struct netdev_adjacent *upper;
ASSERT_RTNL();
if (list_empty(&dev->adj_list.upper)) return NULL;
/** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to a lower device and return true in case * it is. The caller must hold the RTNL lock.
*/ staticbool netdev_has_any_lower_dev(struct net_device *dev)
{
ASSERT_RTNL();
/** * netdev_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock.
*/ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter)
{ struct netdev_adjacent *upper;
/** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must either hold the * RTNL lock or use its own locking that guarantees that the neighbour lower * list will remain unchanged.
*/ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter)
{ struct netdev_adjacent *lower;
/** * netdev_lower_get_next_private_rcu - Get the next ->private from the * lower neighbour list, RCU * variant * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold RCU read lock.
*/ void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter)
{ struct netdev_adjacent *lower;
/** * netdev_lower_get_next - Get the next device from the lower neighbour * list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower * list will remain unchanged.
*/ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{ struct netdev_adjacent *lower;
/** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant * @dev: device * * Gets the first netdev_adjacent->private from the dev's lower neighbour * list. The caller must hold RCU read lock.
*/ void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{ struct netdev_adjacent *lower;
/** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RCU read lock.
*/ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{ struct netdev_adjacent *upper;
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj;
}
/* Ensure that master link is always the first item in list. */ if (master) {
ret = sysfs_create_link(&(dev->dev.kobj),
&(adj_dev->dev.kobj), "master"); if (ret) goto remove_symlinks;
if (adj->master)
sysfs_remove_link(&(dev->dev.kobj), "master");
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
netdev_put(adj_dev, &adj->dev_tracker);
kfree_rcu(adj, rcu);
}
/** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. * On success the reference counts are adjusted and the function * returns zero.
*/ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack)
{ struct netdev_nested_priv priv = {
.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
.data = NULL,
};
/** * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices * might be linked as well. The caller must hold the RTNL lock. * On a failure a negative errno code is returned. On success the reference * counts are adjusted and the function returns zero.
*/ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack)
{ struct netdev_nested_priv priv = {
.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
.data = NULL,
};
/** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device * @upper_dev: new upper device * * Removes a link to device which is upper to this one. The caller must hold * the RTNL lock.
*/ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev)
{ struct netdev_nested_priv priv = {
.flags = NESTED_SYNC_TODO,
.data = NULL,
};
/* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost.
*/
netdev_hw_stats64_add(stats, &report_delta.stats);
if (p_stats)
*p_stats = *stats;
*p_used = report_delta.used;
/** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device * @skb: The packet * @all_slaves: assume all the slaves are active * * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. * %NULL is returned if no slave is found.
*/
if (!ops->ndo_sk_get_lower_dev) return NULL; return ops->ndo_sk_get_lower_dev(dev, sk);
}
/** * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket * @dev: device * @sk: the socket * * %NULL is returned if no lower device is found.
*/
dev_change_rx_flags(dev, IFF_PROMISC);
} if (notify) { /* The ops lock is only required to ensure consistent locking * for `NETDEV_CHANGE` notifiers. This function is sometimes * called without the lock, even for devices that are ops * locked, such as in `dev_uc_sync_multiple` when using * bonding or teaming.
*/
netdev_ops_assert_locked(dev);
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
} return 0;
}
/* Adjust the promiscuity count of @dev by @inc.
 *
 * Returns the result of __dev_set_promiscuity(); on success the RX mode
 * is reprogrammed when the device flags actually changed.
 */
int netif_set_promiscuity(struct net_device *dev, int inc)
{
	const unsigned int flags_before = dev->flags;
	int rc;

	rc = __dev_set_promiscuity(dev, inc, true);
	if (rc >= 0 && dev->flags != flags_before)
		dev_set_rx_mode(dev);

	return rc;
}
int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
{ unsignedint old_flags = dev->flags, old_gflags = dev->gflags; unsignedint allmulti, flags;
/* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast * filtering it is put in promiscuous mode while unicast addresses * are present.
*/ void __dev_set_rx_mode(struct net_device *dev)
{ conststruct net_device_ops *ops = dev->netdev_ops;
/* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return;
if (!netif_device_present(dev)) return;
if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe.
*/ if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
__dev_set_promiscuity(dev, 1, false);
dev->uc_promisc = true;
} elseif (netdev_uc_empty(dev) && dev->uc_promisc) {
__dev_set_promiscuity(dev, -1, false);
dev->uc_promisc = false;
}
}
if (ops->ndo_set_rx_mode)
ops->ndo_set_rx_mode(dev);
}
/** * netif_get_flags() - get flags reported to userspace * @dev: device * * Get the combination of flag bits exported through APIs to userspace.
*/ unsignedint netif_get_flags(conststruct net_device *dev)
{ unsignedint flags;
if (netif_running(dev)) { if (netif_oper_up(dev))
flags |= IFF_RUNNING; if (netif_carrier_ok(dev))
flags |= IFF_LOWER_UP; if (netif_dormant(dev))
flags |= IFF_DORMANT;
}
return flags;
}
EXPORT_SYMBOL(netif_get_flags);
int __dev_change_flags(struct net_device *dev, unsignedint flags, struct netlink_ext_ack *extack)
{ unsignedint old_flags = dev->flags; int ret;
/* * Load in the correct multicast list now the flags have changed.
*/
if ((old_flags ^ flags) & IFF_MULTICAST)
dev_change_rx_flags(dev, IFF_MULTICAST);
dev_set_rx_mode(dev);
/* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it.
*/
ret = 0; if ((old_flags ^ flags) & IFF_UP) { if (old_flags & IFF_UP)
__dev_close(dev); else
ret = __dev_open(dev, extack);
}
if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1;
old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags)
dev_set_rx_mode(dev);
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI * is important. Some (broken) drivers set IFF_PROMISC, when * IFF_ALLMULTI is requested not asking us and not reporting.
*/ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
int __netif_set_mtu(struct net_device *dev, int new_mtu)
{ conststruct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu);
/* Pairs with all the lockless reads of dev->mtu in the stack */
WRITE_ONCE(dev->mtu, new_mtu); return 0;
}
EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL");
int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack)
{ /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL;
}
/** * netif_set_mtu_ext() - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. * * Return: 0 on success, -errno on failure.
*/ int netif_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack)
{ int err, orig_mtu;
netdev_ops_assert_locked(dev);
if (new_mtu == dev->mtu) return 0;
err = dev_validate_mtu(dev, new_mtu, extack); if (err) return err;
if (!err) {
err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
orig_mtu);
err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes.
*/
__netif_set_mtu(dev, orig_mtu);
call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
new_mtu);
}
} return err;
}
int netif_set_mtu(struct net_device *dev, int new_mtu)
{ struct netlink_ext_ack extack; int err;
int netif_change_tx_queue_len(struct net_device *dev, unsignedlong new_len)
{ unsignedint orig_len = dev->tx_queue_len; int res;
if (new_len != (unsignedint)new_len) return -ERANGE;
if (new_len != orig_len) {
WRITE_ONCE(dev->tx_queue_len, new_len);
res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
res = notifier_to_errno(res); if (res) goto err_rollback;
res = dev_qdisc_change_tx_queue_len(dev); if (res) goto err_rollback;
}
int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack)
{ conststruct net_device_ops *ops = dev->netdev_ops; int err;
if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV;
err = netif_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
err = ops->ndo_set_mac_address(dev, ss); if (err) return err;
}
dev->addr_assign_type = NET_ADDR_SET;
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len); return 0;
}
DECLARE_RWSEM(dev_addr_sem);
/* "sa" is a true struct sockaddr with limited "sa_data" member. */ int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0;
down_read(&dev_addr_sem);
rcu_read_lock();
dev = dev_get_by_name_rcu(net, dev_name); if (!dev) {
ret = -ENODEV; goto unlock;
} if (!dev->addr_len)
memset(sa->sa_data, 0, size); else
memcpy(sa->sa_data, dev->dev_addr,
min_t(size_t, size, dev->addr_len));
sa->sa_family = dev->type;
if (!ops->ndo_change_carrier) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier);
}
/** * dev_get_phys_port_id - Get device physical port ID * @dev: device * @ppid: port ID * * Get device physical port ID
*/ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid)
{ conststruct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_get_phys_port_id) return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid);
}
/**
 *	dev_get_phys_port_name - Get device physical port name
 *	@dev: device
 *	@name: port name
 *	@len: limit of bytes to copy to name
 *
 *	Get device physical port name. The driver callback is tried first;
 *	when it is absent or answers -EOPNOTSUPP the devlink compat helper
 *	is used instead.
 */
int dev_get_phys_port_name(struct net_device *dev,
			   char *name, size_t len)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_phys_port_name) {
		int rc = ops->ndo_get_phys_port_name(dev, name, len);

		/* Any answer other than "not supported" is final. */
		if (rc != -EOPNOTSUPP)
			return rc;
	}

	return devlink_compat_phys_port_name_get(dev, name, len);
}
/** * netif_get_port_parent_id() - Get the device's port parent identifier * @dev: network device * @ppid: pointer to a storage for the port's parent identifier * @recurse: allow/disallow recursion to lower devices * * Get the device's port parent identifier. * * Return: 0 on success, -errno on failure.
*/ int netif_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse)
{ conststruct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; struct net_device *lower_dev; struct list_head *iter; int err;
if (ops->ndo_get_port_parent_id) {
err = ops->ndo_get_port_parent_id(dev, ppid); if (err != -EOPNOTSUPP) return err;
}
/* Set or clear the protocol-down state of @dev.
 *
 * Returns -EOPNOTSUPP when the device does not allow changing proto_down,
 * -ENODEV when the device is not present, and 0 after the carrier and
 * dev->proto_down have been updated.
 */
int netif_change_proto_down(struct net_device *dev, bool proto_down)
{
	if (!dev->change_proto_down)
		return -EOPNOTSUPP;

	if (!netif_device_present(dev))
		return -ENODEV;

	/* The carrier follows the inverse of the requested state. */
	if (!proto_down)
		netif_carrier_on(dev);
	else
		netif_carrier_off(dev);

	WRITE_ONCE(dev->proto_down, proto_down);

	return 0;
}
/** * netdev_change_proto_down_reason_locked - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value
*/ void netdev_change_proto_down_reason_locked(struct net_device *dev, unsignedlong mask, u32 value)
{
u32 proto_down_reason; int b;
for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
count++; return count;
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
/* Drivers assume refcnt is already incremented (i.e, prog pointer is * "moved" into driver), so they don't increment it on their own, but * they do decrement refcnt when program is detached or replaced. * Given net_device also owns link/prog, we need to bump refcnt here * to prevent drivers from underflowing it.
*/ if (prog)
bpf_prog_inc(prog);
err = bpf_op(dev, &xdp); if (err) { if (prog)
bpf_prog_put(prog); return err;
}
if (mode != XDP_MODE_HW)
bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
/* either link or prog attachment, never both */ if (link && (new_prog || old_prog)) return -EINVAL; /* link supports only XDP mode flags */ if (link && (flags & ~XDP_FLAGS_MODES)) {
NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL;
} /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ if (num_modes > 1) {
NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL;
} /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
NL_SET_ERR_MSG(extack, "More than one program loaded, unset mode is ambiguous"); return -EINVAL;
} /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); return -EINVAL;
}
mode = dev_xdp_mode(dev, flags); /* can't replace attached link */ if (dev_xdp_link(dev, mode)) {
NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); return -EBUSY;
}
/* don't allow if an upper device already has a program */
netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (dev_xdp_prog_count(upper) > 0) {
NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); return -EEXIST;
}
}
cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) {
NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); return -EBUSY;
} if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST;
}
/* put effective new program into new_prog */ if (link)
new_prog = link->link.prog;
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY;
} if (!offload && dev_xdp_prog(dev, other_mode)) {
NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST;
} if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); return -EINVAL;
} if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL;
} if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); return -EINVAL;
} if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL;
} if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); return -EINVAL;
}
}
/* don't call drivers if the effective program didn't change */ if (new_prog != cur_prog) {
bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) {
NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); return -EOPNOTSUPP;
}
/* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached
*/ if (xdp_link->dev) {
netdev_lock_ops(xdp_link->dev);
WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
netdev_unlock_ops(xdp_link->dev);
xdp_link->dev = NULL;
}
fd = bpf_link_settle(&link_primer); /* link itself doesn't hold dev's refcnt to not complicate shutdown */
dev_put(dev); return fd;
unlock:
rtnl_unlock();
out_put_dev:
dev_put(dev); return err;
}
/**
 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 *	@dev: device
 *	@extack: netlink extended ack
 *	@fd: new program fd or negative value to clear
 *	@expected_fd: old program fd that userspace expects to replace or clear
 *	@flags: xdp-related flags
 *
 *	Set or clear a bpf program for a device. The caller must hold the
 *	RTNL lock.
 *
 *	The reference taken on the new program is dropped again when the
 *	attachment fails; the reference taken on the expected old program
 *	is always dropped before returning.
 *
 *	Return: 0 on success, -errno on failure.
 */
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
		      int fd, int expected_fd, u32 flags)
{
	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
	int err;

	ASSERT_RTNL();

	if (fd >= 0) {
		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
						 mode != XDP_MODE_SKB);
		if (IS_ERR(new_prog))
			return PTR_ERR(new_prog);
	}

	if (expected_fd >= 0) {
		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
						 mode != XDP_MODE_SKB);
		if (IS_ERR(old_prog)) {
			err = PTR_ERR(old_prog);
			/* Nothing to put for old_prog on the way out. */
			old_prog = NULL;
			goto err_out;
		}
	}

	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);

err_out:
	if (err && new_prog)
		bpf_prog_put(new_prog);
	if (old_prog)
		bpf_prog_put(old_prog);
	return err;
}
u32 dev_get_min_mp_channel_count(conststruct net_device *dev)
{ int i;
netdev_ops_assert_locked(dev);
for (i = dev->real_num_rx_queues - 1; i >= 0; i--) if (dev->_rx[i].mp_params.mp_priv) /* The channel count is the idx plus 1. */ return i + 1;
return 0;
}
/** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace * @ifindex: requested ifindex, pass %0 to get one allocated * * Allocate an ifindex for a new device. Caller must either use the ifindex * to store the device (via list_netdevice()) or call dev_index_release() * to give the index up. * * Return: a suitable unique value for a new device interface number or -errno.
*/ staticint dev_index_reserve(struct net *net, u32 ifindex)
{ int err;
if (ifindex > INT_MAX) {
DEBUG_NET_WARN_ON_ONCE(1); return -EINVAL;
}
/* Give back an ifindex that was reserved but never used for a device.
 * Warns if the slot held an entry.
 */
static void dev_index_release(struct net *net, int ifindex)
{
	/* Expect only unused indexes, unlist_netdevice() removes the used */
	WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}
/*
 * netdev_fix_features - resolve dependency conflicts in a feature set
 * @dev: device the set is meant for; used for log messages and for its
 *       gso_partial_features mask
 * @features: requested feature set
 *
 * Drops every feature bit whose prerequisite is absent from @features or
 * that conflicts with another requested bit. Most drops are logged. The
 * checks run in a fixed order, since later checks look at the outcome of
 * earlier ones.
 *
 * Return: the adjusted feature set.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
					     netdev_features_t features)
{
	const netdev_features_t ip_csum = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;

	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) && (features & ip_csum)) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~ip_csum;
	}

	/* TSO requires that SG is present as well. */
	if (!(features & NETIF_F_SG) && (features & NETIF_F_ALL_TSO)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	/* IPv4 TSO needs either generic or IPv4 checksum offload. */
	if ((features & NETIF_F_TSO) &&
	    !(features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~(NETIF_F_TSO | NETIF_F_TSO_ECN);
	}

	/* IPv6 TSO needs either generic or IPv6 checksum offload. */
	if ((features & NETIF_F_TSO6) &&
	    !(features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM))) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
	if (!(features & NETIF_F_TSO) && (features & NETIF_F_TSO_MANGLEID))
		features &= ~NETIF_F_TSO_MANGLEID;

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if (!(features & NETIF_F_SG) && (features & NETIF_F_GSO)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* GSO partial features require GSO partial be set */
	if (!(features & NETIF_F_GSO_PARTIAL) &&
	    (features & dev->gso_partial_features)) {
		netdev_dbg(dev, "Dropping partially supported GSO features since no GSO partial.\n");
		features &= ~dev->gso_partial_features;
	}

	/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
	 * successfully merged by hardware must also have the
	 * checksum verified by hardware. If the user does not
	 * want to enable RXCSUM, logically, we should disable GRO_HW.
	 */
	if (!(features & NETIF_F_RXCSUM) && (features & NETIF_F_GRO_HW)) {
		netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
		features &= ~NETIF_F_GRO_HW;
	}

	/* LRO/HW-GRO features cannot be combined with RX-FCS */
	if (features & NETIF_F_RXFCS) {
		if (features & NETIF_F_LRO) {
			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_LRO;
		}

		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_GRO_HW;
		}
	}

	/* When both HW-GRO and LRO are requested, LRO is the one dropped. */
	if ((features & NETIF_F_LRO) && (features & NETIF_F_GRO_HW)) {
		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
		features &= ~NETIF_F_LRO;
	}

	if ((features & NETIF_F_HW_TLS_TX) &&
	    !netdev_has_ip_or_hw_csum(features)) {
		netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
		features &= ~NETIF_F_HW_TLS_TX;
	}

	if (!(features & NETIF_F_RXCSUM) && (features & NETIF_F_HW_TLS_RX)) {
		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
		features &= ~NETIF_F_HW_TLS_RX;
	}

	if ((features & NETIF_F_GSO_UDP_L4) &&
	    !netdev_has_ip_or_hw_csum(features)) {
		netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
		features &= ~NETIF_F_GSO_UDP_L4;
	}

	return features;
}
int __netdev_update_features(struct net_device *dev)
{ struct net_device *upper, *lower;
netdev_features_t features; struct list_head *iter; int err = -1;
ASSERT_RTNL();
netdev_ops_assert_locked(dev);
features = netdev_get_wanted_features(dev);
if (dev->netdev_ops->ndo_fix_features)
features = dev->netdev_ops->ndo_fix_features(dev, features);
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
/* some features can't be enabled if they're off on an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
if (dev->netdev_ops->ndo_set_features)
err = dev->netdev_ops->ndo_set_features(dev, features); else
err = 0;
if (unlikely(err < 0)) {
netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n",
err, &features, &dev->features); /* return non-0 since some features might have changed and * it's better to fire a spurious notification than miss it
*/ return -1;
}
sync_lower: /* some features must be disabled on lower devices when disabled * on an upper device (think: bonding master or bridge)
*/
netdev_for_each_lower_dev(dev, lower, iter)
netdev_sync_lower_features(dev, lower, features);
if (!err) {
netdev_features_t diff = features ^ dev->features;
if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { /* udp_tunnel_{get,drop}_rx_info both need * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the * device, or they won't do anything. * Thus we need to update dev->features * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info.
*/
udp_tunnel_nic_lock(dev); if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
dev->features = features;
udp_tunnel_get_rx_info(dev);
} else {
udp_tunnel_drop_rx_info(dev);
}
udp_tunnel_nic_unlock(dev);
}
/**
 * netdev_update_features - recalculate device features
 * @dev: the device to check
 *
 * Re-evaluate the dev->features set and, when __netdev_update_features()
 * returns non-zero, send a features-change notification. Call this
 * after driver or hardware dependent conditions that influence the
 * features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (!changed)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
/**
 * netdev_change_features - recalculate device features
 * @dev: the device to check
 *
 * Re-evaluate the dev->features set and always send a notification,
 * whether or not the set changed. Use this instead of
 * netdev_update_features() when dev->vlan_features might also have
 * changed, so the change can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately unused: the notification is unconditional. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
/** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from * @dev: the device to transfer operstate to * * Transfer operational state from root to device. This is normally * called when a stacking relationship exists between the root * device and the device(a leaf device).
*/ void netif_stacked_transfer_operstate(conststruct net_device *rootdev, struct net_device *dev)
{ if (rootdev->operstate == IF_OPER_DORMANT)
netif_dormant_on(dev); else
netif_dormant_off(dev);
if (rootdev->operstate == IF_OPER_TESTING)
netif_testing_on(dev); else
netif_testing_off(dev);
if (netif_carrier_ok(rootdev))
netif_carrier_on(dev); else
netif_carrier_off(dev);
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
/* Drivers implementing ndo_get_peer_dev must support tstat * accounting, so that skb_do_redirect() can bump the dev's * RX stats upon network namespace switch.
*/ if (dev->netdev_ops->ndo_get_peer_dev &&
dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) return -EOPNOTSUPP;
switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return 0; case NETDEV_PCPU_STAT_LSTATS:
v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); break; case NETDEV_PCPU_STAT_TSTATS:
v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); break; case NETDEV_PCPU_STAT_DSTATS:
v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); break; default: return -EINVAL;
}
return v ? 0 : -ENOMEM;
}
/*
 * Free the per-CPU statistics area selected by dev->pcpu_stat_type.
 *
 * NETDEV_PCPU_STAT_NONE, and any type not listed in the switch, frees
 * nothing.
 */
static void netdev_do_free_pcpu_stats(struct net_device *dev)
{
	switch (dev->pcpu_stat_type) {
	case NETDEV_PCPU_STAT_NONE:
		return;
	case NETDEV_PCPU_STAT_LSTATS:
		free_percpu(dev->lstats);
		break;
	case NETDEV_PCPU_STAT_TSTATS:
		free_percpu(dev->tstats);
		break;
	case NETDEV_PCPU_STAT_DSTATS:
		free_percpu(dev->dstats);
		break;
	}
}
/** * register_netdevice() - register a network device * @dev: device to register * * Take a prepared network device structure and make it externally accessible. * A %NETDEV_REGISTER message is sent to the netdev notifier chain. * Callers must hold the rtnl lock - you may want register_netdev() * instead of this.
*/ int register_netdevice(struct net_device *dev)
{ int ret; struct net *net = dev_net(dev);
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
ret = ethtool_check_ops(dev->ethtool_ops); if (ret) return ret;
/* rss ctx ID 0 is reserved for the default context, start from 1 */
xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
mutex_init(&dev->ethtool->rss_lock);
ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out;
ret = -ENOMEM;
dev->name_node = netdev_name_node_head_alloc(dev); if (!dev->name_node) goto out;
/* Init, if this function is available */ if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0)
ret = -EIO; goto err_free_name;
}
}
if (((dev->hw_features | dev->features) &
NETIF_F_HW_VLAN_CTAG_FILTER) &&
(!dev->netdev_ops->ndo_vlan_rx_add_vid ||
!dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
ret = -EINVAL; goto err_uninit;
}
ret = netdev_do_alloc_pcpu_stats(dev); if (ret) goto err_uninit;
ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) goto err_free_pcpu;
dev->ifindex = ret;
/* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO).
*/
dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
dev->features |= NETIF_F_SOFT_FEATURES;
if (dev->udp_tunnel_nic_info) {
dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
}
if (!(dev->flags & IFF_LOOPBACK))
dev->hw_features |= NETIF_F_NOCACHE_COPY;
/* If IPv4 TCP segmentation offload is supported we should also * allow the device to enable segmenting the frame with the option * of ignoring a static IP ID value. This doesn't enable the * feature itself but allows the user to enable it later.
*/ if (dev->hw_features & NETIF_F_TSO)
dev->hw_features |= NETIF_F_TSO_MANGLEID; if (dev->vlan_features & NETIF_F_TSO)
dev->vlan_features |= NETIF_F_TSO_MANGLEID; if (dev->mpls_features & NETIF_F_TSO)
dev->mpls_features |= NETIF_F_TSO_MANGLEID; if (dev->hw_enc_features & NETIF_F_TSO)
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
/* Make NETIF_F_SG inheritable to tunnel devices.
*/
dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
/* Make NETIF_F_SG inheritable to MPLS.
*/
dev->mpls_features |= NETIF_F_SG;
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret); if (ret) goto err_ifindex_release;
ret = netdev_register_kobject(dev);
netdev_lock(dev);
WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
netdev_unlock(dev);
/* If the device has permanent device address, driver should * set dev_addr and also addr_assign_type should be set to * NET_ADDR_PERM (default value).
*/ if (dev->addr_assign_type == NET_ADDR_PERM)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
netdev_lock_ops(dev);
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
netdev_unlock_ops(dev);
ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */
dev->needs_free_netdev = false;
unregister_netdevice_queue(dev, NULL); goto out;
} /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications.
*/ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
/* Initialize the core of a dummy net device.
 * These are the setup steps dummy netdevs need and which normal netdevs
 * get by going through register_netdevice().
 */
static void init_dummy_netdev(struct net_device *dev)
{
	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Note: we don't allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' don't need to change
	 * its refcount.
	 */
}
/**
 * register_netdev - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain.
 *
 * This wraps register_netdevice(): it takes the rtnl lock of the device's
 * namespace around the call, and expands the device name if a format
 * string was passed to alloc_netdev.
 *
 * Return: 0 on success; -EINTR if taking the lock was interrupted;
 * otherwise the negative errno from register_netdevice() (failure to set
 * up the device, or a duplicate name).
 */
int register_netdev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	int ret = -EINTR;

	if (!rtnl_net_lock_killable(net)) {
		ret = register_netdevice(dev);
		rtnl_net_unlock(net);
	}

	return ret;
}
EXPORT_SYMBOL(register_netdev);
int netdev_refcnt_read(conststruct net_device *dev)
{ #ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0;
int netdev_unregister_timeout_secs __read_mostly = 10;
#define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs_any - wait until all references are gone. * @list: list of net_devices to wait on * * This is called when unregistering network devices. * * Any protocol or device that holds a reference should register * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly * call dev_put.
*/ staticstruct net_device *netdev_wait_allrefs_any(struct list_head *list)
{ unsignedlong rebroadcast_time, warning_time; struct net_device *dev; int wait = 0;
rebroadcast_time = warning_time = jiffies;
list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev;
while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
list_for_each_entry(dev, list, todo_list) if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
&dev->state)) { /* We must not have linkwatch events * pending on unregister. If this * happens, we simply run the queue * unscheduled, resulting in a noop * for this device.
*/
linkwatch_run_queue(); break;
}
list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev;
if (time_after(jiffies, warning_time +
READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
list_for_each_entry(dev, list, todo_list) {
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
dev->name, netdev_refcnt_read(dev));
ref_tracker_dir_print(&dev->refcnt_tracker, 10);
}
warning_time = jiffies;
}
}
}
/* The sequence is: * * rtnl_lock(); * ... * register_netdevice(x1); * register_netdevice(x2); * ... * unregister_netdevice(y1); * unregister_netdevice(y2); * ... * rtnl_unlock(); * free_netdev(y1); * free_netdev(y2); * * We are invoked by rtnl_unlock(). * This allows us to deal with problems: * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. * * We must not return until all unregister events added during * the interval the lock was held have been completed.
*/ void netdev_run_todo(void)
{ struct net_device *dev, *tmp; struct list_head list; int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list;
/* ndo_get_stats64 implementation for dtstats-based accounting. * * Populate @s from dev->stats and dev->dstats. This is used internally by the * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
*/ staticvoid dev_get_dstats64(conststruct net_device *dev, struct rtnl_link_stats64 *s)
{
netdev_stats_to_stats64(s, &dev->stats);
dev_fetch_dstats(s, dev->dstats);
}
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields * at the end for newer counters.
*/ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, conststruct net_device_stats *netdev_stats)
{
size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); const atomic_long_t *src = (atomic_long_t *)netdev_stats;
u64 *dst = (u64 *)stats64;
BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++)
dst[i] = (unsignedlong)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */
memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64));
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
/** * dev_get_stats - get network device statistics * @dev: device to get statistics from * @storage: place to store stats * * Get network statistics from device. Return @storage. * The device driver may provide its own method by setting * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; * otherwise the internal statistics structure is used.
*/ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage)
{ conststruct net_device_ops *ops = dev->netdev_ops; conststruct net_device_core_stats __percpu *p;
/* * IPv{4,6} and udp tunnels share common stat helpers and use * different stat type (NETDEV_PCPU_STAT_TSTATS vs * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
*/
BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
offsetof(struct pcpu_dstats, rx_bytes));
BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
offsetof(struct pcpu_dstats, rx_packets));
BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
offsetof(struct pcpu_dstats, tx_bytes));
BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
offsetof(struct pcpu_dstats, tx_packets));
/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
p = READ_ONCE(dev->core_stats); if (p) { conststruct net_device_core_stats *core_stats; int i;
/** * dev_fetch_sw_netstats - get per-cpu network device statistics * @s: place to store stats * @netstats: per-cpu network stats to read from * * Read per-cpu network statistics and populate the related fields in @s.
*/ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, conststruct pcpu_sw_netstats __percpu *netstats)
{ int cpu;
/**
 * dev_get_tstats64 - ndo_get_stats64 implementation
 * @dev: device to get statistics from
 * @s: place to store stats
 *
 * Fill @s from dev->stats and dev->tstats. Suitable for use as an
 * ndo_get_stats64() callback.
 */
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
{
	/* Device-wide counters first ... */
	netdev_stats_to_stats64(s, &dev->stats);

	/* ... then the per-CPU tstats. */
	dev_fetch_sw_netstats(s, dev->tstats);
}
EXPORT_SYMBOL_GPL(dev_get_tstats64);
/**
 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
 * @dev: netdev to enable the IRQ coalescing on
 *
 * Installs a conservative default for SW IRQ coalescing: a GRO flush
 * timeout of 20000 and a defer-hard-irqs count of 1. Nothing is set when
 * CONFIG_PREEMPT_RT is enabled. Users can override the defaults through
 * sysfs attributes.
 *
 * Warns if @dev is already in the NETREG_REGISTERED state.
 */
void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
{
	WARN_ON(dev->reg_state == NETREG_REGISTERED);

	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	netdev_set_gro_flush_timeout(dev, 20000);
	netdev_set_defer_hard_irqs(dev, 1);
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
/** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @name_assign_type: origin of device name * @setup: callback to initialize device * @txqs: the number of TX subqueues to allocate * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device.
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, constchar *name, unsignedchar name_assign_type, void (*setup)(struct net_device *), unsignedint txqs, unsignedint rxqs)
{ struct net_device *dev;
size_t napi_config_sz; unsignedint maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); return NULL;
}
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL;
}
maxqs = max(txqs, rxqs);
dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) return NULL;
netdev_lock(dev);
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
__netif_napi_del_locked(p);
netdev_unlock(dev);
synchronize_net();
}
kvfree(dev->napi_config);
}
/**
 * free_netdev - free network device
 * @dev: device
 *
 * Last stage of destroying an allocated device interface. The reference
 * to the device object is released; if it was the last reference the
 * device is freed. Must be called in process context.
 */
void free_netdev(struct net_device *dev)
{
	might_sleep();

	if (dev->reg_state != NETREG_UNREGISTERING) {
		/* will free via device release */
		put_device(&dev->dev);
		return;
	}

	/* When called immediately after register_netdevice() failed the unwind
	 * handling may still be dismantling the device. Handle that case by
	 * deferring the free.
	 */
	ASSERT_RTNL();
	dev->needs_free_netdev = true;
}
EXPORT_SYMBOL(free_netdev);
/**
 * alloc_netdev_dummy - Allocate and initialize a dummy net device.
 * @sizeof_priv: size of private data to allocate space for
 *
 * The device is allocated through alloc_netdev() with the "dummy#" name
 * pattern, NET_NAME_UNKNOWN and init_dummy_netdev() as the setup callback.
 *
 * Return: the allocated net_device on success, NULL otherwise
 */
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
	struct net_device *dummy;

	dummy = alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
			     init_dummy_netdev);

	return dummy;
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 *
 * The expedited RCU grace period is used when called from netns cleanup
 * or while the rtnl lock is held; the regular one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!from_cleanup_net() && !rtnl_is_locked())
		synchronize_rcu();
	else
		synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
/** * unregister_netdevice_queue - remove device from the kernel * @dev: device * @head: list * * This function shuts down a device interface and removes it * from the kernel tables. * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this.
*/
for (i = 0; i < dev->real_num_rx_queues; i++) { struct netdev_rx_queue *rxq = &dev->_rx[i]; struct pp_memory_provider_params *p = &rxq->mp_params;
if (p->mp_ops && p->mp_ops->uninstall)
p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq);
}
}
/*
 * Close every device on @close_head, then unlock each one and take it off
 * the list.
 *
 * devices must be UP and netdev_lock()'d
 */
static void netif_close_many_and_unlock(struct list_head *close_head)
{
	struct net_device *dev, *tmp;

	netif_close_many(close_head, false);

	/* ... now unlock them */
	list_for_each_entry_safe(dev, tmp, close_head, close_list) {
		netdev_unlock(dev);
		list_del_init(&dev->close_list);
	}
}
/*
 * With CONFIG_LOCKDEP, close and unlock the devices on @close_head once
 * the current task's lockdep depth exceeds half of MAX_LOCK_DEPTH.
 * Without CONFIG_LOCKDEP this is a no-op.
 */
static void netif_close_many_and_unlock_cond(struct list_head *close_head)
{
#ifdef CONFIG_LOCKDEP
	/* We can only track up to MAX_LOCK_DEPTH locks per task.
	 *
	 * Reserve half the available slots for additional locks possibly
	 * taken by notifiers and (soft)irqs.
	 */
	unsigned int limit = MAX_LOCK_DEPTH / 2;

	if (lockdep_depth(current) > limit)
		netif_close_many_and_unlock(close_head);
#endif
}
list_for_each_entry_safe(dev, tmp, head, unreg_list) { /* Some devices call without registering * for initialization unwind. Remove those * devices and proceed with the remaining.
*/ if (dev->reg_state == NETREG_UNINITIALIZED) {
pr_debug("unregister_netdevice: device %s/%p never was registered\n",
dev->name, dev);
/* If device is running, close it first. Start with ops locked... */
list_for_each_entry(dev, head, unreg_list) { if (!(dev->flags & IFF_UP)) continue; if (netdev_need_ops_lock(dev)) {
list_add_tail(&dev->close_list, &close_head);
netdev_lock(dev);
}
netif_close_many_and_unlock_cond(&close_head);
}
netif_close_many_and_unlock(&close_head); /* ... now go over the rest. */
list_for_each_entry(dev, head, unreg_list) { if (!netdev_need_ops_lock(dev))
list_add_tail(&dev->close_list, &close_head);
}
netif_close_many(&close_head, true);
list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */
unlist_netdevice(dev);
netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
netdev_unlock(dev);
}
flush_all_backlogs();
/* Notify protocols, that we are about to destroy * this device. They should clean all the things.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
/**
 * unregister_netdevice_many - unregister many devices
 * @head: list of devices
 *
 * Thin wrapper around unregister_netdevice_many_notify() with a zero
 * portid and no netlink header.
 *
 * Note: As most callers use a stack allocated list_head,
 * we force a list_del() to make sure stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	/* no portid, no nlmsghdr */
	unregister_netdevice_many_notify(head, 0, NULL);
}
EXPORT_SYMBOL(unregister_netdevice_many);
/**
 * unregister_netdev - remove device from the kernel
 * @dev: device
 *
 * Shut down a device interface and remove it from the kernel tables.
 *
 * This simply wraps unregister_netdevice() in the rtnl lock taken via
 * rtnl_net_dev_lock(). In general this is the function to use, rather
 * than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_net_dev_lock(dev);

	unregister_netdevice(dev);

	rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);
int __dev_change_net_namespace(struct net_device *dev, struct net *net, constchar *pat, int new_ifindex, struct netlink_ext_ack *extack)
{ struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); char new_name[IFNAMSIZ] = {}; int err, new_nsid;
ASSERT_RTNL();
/* Don't allow namespace local devices to be moved. */
err = -EINVAL; if (dev->netns_immutable) {
NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out;
}
/* Ensure the device has been registered */ if (dev->reg_state != NETREG_REGISTERED) {
NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out;
}
/* Get out if there is nothing to do */
err = 0; if (net_eq(net_old, net)) goto out;
/* Pick the destination device name, and ensure * we can use it in the destination network namespace.
*/
err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) {
NL_SET_ERR_MSG(extack, "An interface with the same name exists in the target netns"); goto out;
}
err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); if (err < 0) {
NL_SET_ERR_MSG_FMT(extack, "Unable to use '%s' for the new interface name in the target netns",
pat); goto out;
}
} /* Check that none of the altnames conflicts. */
err = -EEXIST;
netdev_for_each_altname(dev, name_node) { if (netdev_name_in_use(net, name_node->name)) {
NL_SET_ERR_MSG_FMT(extack, "An interface with the altname %s exists in the target netns",
name_node->name); goto out;
}
}
/* Check that new_ifindex isn't used yet. */ if (new_ifindex) {
err = dev_index_reserve(net, new_ifindex); if (err < 0) {
NL_SET_ERR_MSG_FMT(extack, "The ifindex %d is not available in the target netns",
new_ifindex); goto out;
}
} else { /* If there is an ifindex conflict assign a new one */
err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY)
err = dev_index_reserve(net, 0); if (err < 0) {
NL_SET_ERR_MSG(extack, "Unable to allocate a new ifindex in the target netns"); goto out;
}
new_ifindex = err;
}
/* * And now a mini version of register_netdevice unregister_netdevice.
*/
netdev_lock_ops(dev); /* If device is running close it first. */
netif_close(dev); /* And unlink it from device chain */
unlist_netdevice(dev);
if (!netdev_need_ops_lock(dev))
netdev_lock(dev);
dev->moving_ns = true;
netdev_unlock(dev);
/* Notify protocols, that we are about to destroy * this device. They should clean all the things. * * Note that dev->reg_state stays at NETREG_REGISTERED. * This is wanted because this way 8021q and macvlan know * the device is just moving and can keep their slaves up.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
if (new_name[0]) { /* Rename the netdev to prepared name */
write_seqlock_bh(&netdev_rename_lock);
strscpy(dev->name, new_name, IFNAMSIZ);
write_sequnlock_bh(&netdev_rename_lock);
}
/* Send a netdev-add uevent to the new namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
netdev_adjacent_add_links(dev);
/* Adapt owner in case owning user namespace of target network * namespace is different from the original one.
*/
err = netdev_change_owner(dev, net_old, net);
WARN_ON(err);
netdev_lock(dev);
dev->moving_ns = false; if (!netdev_need_ops_lock(dev))
netdev_unlock(dev);
/* Add the device back in the hashes */
list_netdevice(dev); /* Notify protocols, that a new device appeared. */
call_netdevice_notifiers(NETDEV_REGISTER, dev);
netdev_unlock_ops(dev);
/* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications.
*/
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
/* Find end of our completion_queue. */
list_skb = &sd->completion_queue; while (*list_skb)
list_skb = &(*list_skb)->next; /* Append completion queue from offline CPU. */
*list_skb = oldsd->completion_queue;
oldsd->completion_queue = NULL;
/* Append output queue from offline CPU. */ if (oldsd->output_queue) {
*sd->output_queue_tailp = oldsd->output_queue;
sd->output_queue_tailp = oldsd->output_queue_tailp;
oldsd->output_queue = NULL;
oldsd->output_queue_tailp = &oldsd->output_queue;
} /* Append NAPI poll list from offline CPU, with one exception : * process_backlog() must be called by cpu owning percpu backlog. * We properly handle process_queue & input_pkt_queue later.
*/ while (!list_empty(&oldsd->poll_list)) { struct napi_struct *napi = list_first_entry(&oldsd->poll_list, struct napi_struct,
poll_list);
if (!use_backlog_threads()) { #ifdef CONFIG_RPS
remsd = oldsd->rps_ipi_list;
oldsd->rps_ipi_list = NULL; #endif /* send out pending IPI's on offline CPU */
net_rps_send_ipi(remsd);
}
/* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx(skb);
rps_input_queue_head_incr(oldsd);
} while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
rps_input_queue_head_incr(oldsd);
}
return 0;
}
/** * netdev_increment_features - increment feature set by one * @all: current feature set * @one: new feature set * @mask: mask feature set * * Computes a new feature set after adding a device with feature set * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set.
*/
netdev_features_t netdev_increment_features(netdev_features_t all,
netdev_features_t one, netdev_features_t mask)
{ if (mask & NETIF_F_HW_CSUM)
mask |= NETIF_F_CSUM_MASK;
mask |= NETIF_F_VLAN_CHALLENGED;
all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
all &= one | ~NETIF_F_ALL_FOR_ALL;
/* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_HW_CSUM)
all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++)
INIT_HLIST_HEAD(&hash[i]);
return hash;
}
/* Initialize per network namespace state */ staticint __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));
INIT_LIST_HEAD(&net->dev_base_head);
net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) goto err_name;
net->dev_index_head = netdev_create_hash(); if (net->dev_index_head == NULL) goto err_idx;
/*
 * Move every device of @net that is allowed to migrate into init_net.
 *
 * Devices with netns_immutable set are skipped, as are devices whose
 * rtnl_link_ops does not set netns_refund. A failed move is fatal (BUG()).
 * ASSERT_RTNL() is invoked on entry.
 */
static void __net_exit default_device_exit_net(struct net *net)
{
	struct netdev_name_node *name_node, *tmp;
	struct net_device *dev, *aux;

	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	ASSERT_RTNL();

	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->netns_immutable)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		/* If that name is taken in init_net, pass the literal
		 * "dev%d" pattern as the fallback instead.
		 */
		if (netdev_name_in_use(&init_net, fb_name))
			snprintf(fb_name, IFNAMSIZ, "dev%%d");

		/* Destroy alternative names already in use in init_net. */
		netdev_for_each_altname_safe(dev, name_node, tmp)
			if (netdev_name_in_use(&init_net, name_node->name))
				__netdev_name_node_alt_destroy(name_node);

		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
}
/*
 * NOTE(review): this definition is damaged in this copy of the file.
 * "staticvoid" looks like a fused "static void", and everything after the
 * local declarations (the statements of the body and the closing brace) is
 * absent.  Restore from a pristine copy of the file before building.
 */
staticvoid __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev; struct net *net;
	LIST_HEAD(dev_kill_list);
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 */

/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 *
 *	NOTE(review): this copy of net_dev_init() is damaged.  "staticint"
 *	looks like a fused "static int"; the loop that owns the
 *	net_page_pool_create(i) call and the stray "}" below has lost its
 *	header; and the end of the function (the "out" label that every
 *	failure path jumps to, the return and the closing brace) is absent.
 *	Restore from a pristine copy of the file before building.
 */
staticint __init net_dev_init(void)
/* rc starts as -ENOMEM; no failure path visible here changes it. */
{ int i, rc = -ENOMEM;
	/* Must only run while dev_boot_phase is still set; it is cleared below. */
	BUG_ON(!dev_boot_phase);
	net_dev_struct_check();
	if (dev_proc_init()) goto out;
	if (netdev_kobject_init()) goto out;
	/* Start every ptype_base bucket as an empty list. */
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);
	if (register_pernet_subsys(&netdev_net_ops)) goto out;
	/*
	 *	Initialise the packet receive queues.
	 */
	flush_backlogs_fallback = flush_backlogs_alloc(); if (!flush_backlogs_fallback) goto out;
	/*
	 * NOTE(review): the header of the loop that supplies "i" here, and
	 * whatever else its body contained, is missing; only this call and
	 * the loop's closing brace remain.
	 */
	if (net_page_pool_create(i)) goto out;
	} if (use_backlog_threads())
		smpboot_register_percpu_thread(&backlog_threads);
	dev_boot_phase = 0;
	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device
	 * that appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops)) goto out;
	if (register_pernet_device(&default_device_ops)) goto out;
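/*
 * NOTE(review): the text below is a web-page disclaimer that was picked up
 * when this file was copied; it is not part of the source.  Translated from
 * German:
 *
 * The information on this website has been carefully compiled to the best of
 * our knowledge.  However, neither the completeness, nor the correctness, nor
 * the quality of the information provided is guaranteed.
 * Remark:
 * The coloured syntax highlighting and the measurement are still experimental.
 */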