xref: /kernel/linux/linux-6.6/net/core/dev.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 *      NET3    Protocol independent device support routines.
4 *
5 *	Derived from the non IP parts of dev.c 1.0.19
6 *              Authors:	Ross Biro
7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
9 *
10 *	Additional Authors:
11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
13 *		David Hinds <dahinds@users.sourceforge.net>
14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15 *		Adam Sulmicki <adam@cfar.umd.edu>
16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
17 *
18 *	Changes:
19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
20 *                                      to 2 if register_netdev gets called
21 *                                      before net_dev_init & also removed a
22 *                                      few lines of code in the process.
23 *		Alan Cox	:	device private ioctl copies fields back.
24 *		Alan Cox	:	Transmit queue code does relevant
25 *					stunts to keep the queue safe.
26 *		Alan Cox	:	Fixed double lock.
27 *		Alan Cox	:	Fixed promisc NULL pointer trap
28 *		????????	:	Support the full private ioctl range
29 *		Alan Cox	:	Moved ioctl permission check into
30 *					drivers
31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
32 *		Alan Cox	:	100 backlog just doesn't cut it when
33 *					you start doing multicast video 8)
34 *		Alan Cox	:	Rewrote net_bh and list manager.
35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
36 *		Alan Cox	:	Took out transmit every packet pass
37 *					Saved a few bytes in the ioctl handler
38 *		Alan Cox	:	Network driver sets packet type before
39 *					calling netif_rx. Saves a function
40 *					call a packet.
41 *		Alan Cox	:	Hashed net_bh()
42 *		Richard Kooijman:	Timestamp fixes.
43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
44 *		Alan Cox	:	Device lock protection.
45 *              Alan Cox        :       Fixed nasty side effect of device close
46 *					changes.
47 *		Rudi Cilibrasi	:	Pass the right thing to
48 *					set_mac_address()
49 *		Dave Miller	:	32bit quantity for the device lock to
50 *					make it work out on a Sparc.
51 *		Bjorn Ekwall	:	Added KERNELD hack.
52 *		Alan Cox	:	Cleaned up the backlog initialise.
53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
54 *					1 device.
55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
56 *					is no device open function.
57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
59 *		Cyrus Durgin	:	Cleaned for KMOD
60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
61 *					A network device unload needs to purge
62 *					the backlog queue.
63 *	Paul Rusty Russell	:	SIOCSIFNAME
64 *              Pekka Riikonen  :	Netdev boot-time settings code
65 *              Andrew Morton   :       Make unregister_netdevice wait
66 *                                      indefinitely on dev->refcnt
67 *              J Hadi Salim    :       - Backlog queue sampling
68 *				        - netif_rx() feedback
69 */
70
71#include <linux/uaccess.h>
72#include <linux/bitmap.h>
73#include <linux/capability.h>
74#include <linux/cpu.h>
75#include <linux/types.h>
76#include <linux/kernel.h>
77#include <linux/hash.h>
78#include <linux/slab.h>
79#include <linux/sched.h>
80#include <linux/sched/mm.h>
81#include <linux/mutex.h>
82#include <linux/rwsem.h>
83#include <linux/string.h>
84#include <linux/mm.h>
85#include <linux/socket.h>
86#include <linux/sockios.h>
87#include <linux/errno.h>
88#include <linux/interrupt.h>
89#include <linux/if_ether.h>
90#include <linux/netdevice.h>
91#include <linux/etherdevice.h>
92#include <linux/ethtool.h>
93#include <linux/skbuff.h>
94#include <linux/kthread.h>
95#include <linux/bpf.h>
96#include <linux/bpf_trace.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <net/busy_poll.h>
100#include <linux/rtnetlink.h>
101#include <linux/stat.h>
102#include <net/dsa.h>
103#include <net/dst.h>
104#include <net/dst_metadata.h>
105#include <net/gro.h>
106#include <net/pkt_sched.h>
107#include <net/pkt_cls.h>
108#include <net/checksum.h>
109#include <net/xfrm.h>
110#include <net/tcx.h>
111#include <linux/highmem.h>
112#include <linux/init.h>
113#include <linux/module.h>
114#include <linux/netpoll.h>
115#include <linux/rcupdate.h>
116#include <linux/delay.h>
117#include <net/iw_handler.h>
118#include <asm/current.h>
119#include <linux/audit.h>
120#include <linux/dmaengine.h>
121#include <linux/err.h>
122#include <linux/ctype.h>
123#include <linux/if_arp.h>
124#include <linux/if_vlan.h>
125#include <linux/ip.h>
126#include <net/ip.h>
127#include <net/mpls.h>
128#include <linux/ipv6.h>
129#include <linux/in.h>
130#include <linux/jhash.h>
131#include <linux/random.h>
132#include <trace/events/napi.h>
133#include <trace/events/net.h>
134#include <trace/events/skb.h>
135#include <trace/events/qdisc.h>
136#include <trace/events/xdp.h>
137#include <linux/inetdevice.h>
138#include <linux/cpu_rmap.h>
139#include <linux/static_key.h>
140#include <linux/hashtable.h>
141#include <linux/vmalloc.h>
142#include <linux/if_macvlan.h>
143#include <linux/errqueue.h>
144#include <linux/hrtimer.h>
145#include <linux/netfilter_netdev.h>
146#include <linux/crash_dump.h>
147#include <linux/sctp.h>
148#include <net/udp_tunnel.h>
149#include <linux/net_namespace.h>
150#include <linux/indirect_call_wrapper.h>
151#include <net/devlink.h>
152#include <linux/pm_runtime.h>
153#include <linux/prandom.h>
154#include <linux/once_lite.h>
155#include <net/netdev_rx_queue.h>
156
157#include "dev.h"
158#include "net-sysfs.h"
159
160static DEFINE_SPINLOCK(ptype_lock);
161struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
162struct list_head ptype_all __read_mostly;	/* Taps */
163
164static int netif_rx_internal(struct sk_buff *skb);
165static int call_netdevice_notifiers_extack(unsigned long val,
166					   struct net_device *dev,
167					   struct netlink_ext_ack *extack);
168static struct napi_struct *napi_by_id(unsigned int napi_id);
169
170/*
171 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
172 * semaphore.
173 *
174 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
175 *
176 * Writers must hold the rtnl semaphore while they loop through the
177 * dev_base_head list, and hold dev_base_lock for writing when they do the
178 * actual updates.  This allows pure readers to access the list even
179 * while a writer is preparing to update it.
180 *
181 * To put it another way, dev_base_lock is held for writing only to
182 * protect against pure readers; the rtnl semaphore provides the
183 * protection against other writers.
184 *
185 * See, for example usages, register_netdevice() and
186 * unregister_netdevice(), which must be called with the rtnl
187 * semaphore held.
188 */
189DEFINE_RWLOCK(dev_base_lock);
190EXPORT_SYMBOL(dev_base_lock);
191
192static DEFINE_MUTEX(ifalias_mutex);
193
194/* protects napi_hash addition/deletion and napi_gen_id */
195static DEFINE_SPINLOCK(napi_hash_lock);
196
197static unsigned int napi_gen_id = NR_CPUS;
198static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
199
200static DECLARE_RWSEM(devnet_rename_sem);
201
202static inline void dev_base_seq_inc(struct net *net)
203{
204	while (++net->dev_base_seq == 0)
205		;
206}
207
208static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
209{
210	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
211
212	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
213}
214
215static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
216{
217	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
218}
219
220static inline void rps_lock_irqsave(struct softnet_data *sd,
221				    unsigned long *flags)
222{
223	if (IS_ENABLED(CONFIG_RPS))
224		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
225	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
226		local_irq_save(*flags);
227}
228
229static inline void rps_lock_irq_disable(struct softnet_data *sd)
230{
231	if (IS_ENABLED(CONFIG_RPS))
232		spin_lock_irq(&sd->input_pkt_queue.lock);
233	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
234		local_irq_disable();
235}
236
237static inline void rps_unlock_irq_restore(struct softnet_data *sd,
238					  unsigned long *flags)
239{
240	if (IS_ENABLED(CONFIG_RPS))
241		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
242	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
243		local_irq_restore(*flags);
244}
245
246static inline void rps_unlock_irq_enable(struct softnet_data *sd)
247{
248	if (IS_ENABLED(CONFIG_RPS))
249		spin_unlock_irq(&sd->input_pkt_queue.lock);
250	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
251		local_irq_enable();
252}
253
254static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
255						       const char *name)
256{
257	struct netdev_name_node *name_node;
258
259	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
260	if (!name_node)
261		return NULL;
262	INIT_HLIST_NODE(&name_node->hlist);
263	name_node->dev = dev;
264	name_node->name = name;
265	return name_node;
266}
267
268static struct netdev_name_node *
269netdev_name_node_head_alloc(struct net_device *dev)
270{
271	struct netdev_name_node *name_node;
272
273	name_node = netdev_name_node_alloc(dev, dev->name);
274	if (!name_node)
275		return NULL;
276	INIT_LIST_HEAD(&name_node->list);
277	return name_node;
278}
279
280static void netdev_name_node_free(struct netdev_name_node *name_node)
281{
282	kfree(name_node);
283}
284
285static void netdev_name_node_add(struct net *net,
286				 struct netdev_name_node *name_node)
287{
288	hlist_add_head_rcu(&name_node->hlist,
289			   dev_name_hash(net, name_node->name));
290}
291
292static void netdev_name_node_del(struct netdev_name_node *name_node)
293{
294	hlist_del_rcu(&name_node->hlist);
295}
296
297static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
298							const char *name)
299{
300	struct hlist_head *head = dev_name_hash(net, name);
301	struct netdev_name_node *name_node;
302
303	hlist_for_each_entry(name_node, head, hlist)
304		if (!strcmp(name_node->name, name))
305			return name_node;
306	return NULL;
307}
308
309static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
310							    const char *name)
311{
312	struct hlist_head *head = dev_name_hash(net, name);
313	struct netdev_name_node *name_node;
314
315	hlist_for_each_entry_rcu(name_node, head, hlist)
316		if (!strcmp(name_node->name, name))
317			return name_node;
318	return NULL;
319}
320
321bool netdev_name_in_use(struct net *net, const char *name)
322{
323	return netdev_name_node_lookup(net, name);
324}
325EXPORT_SYMBOL(netdev_name_in_use);
326
327int netdev_name_node_alt_create(struct net_device *dev, const char *name)
328{
329	struct netdev_name_node *name_node;
330	struct net *net = dev_net(dev);
331
332	name_node = netdev_name_node_lookup(net, name);
333	if (name_node)
334		return -EEXIST;
335	name_node = netdev_name_node_alloc(dev, name);
336	if (!name_node)
337		return -ENOMEM;
338	netdev_name_node_add(net, name_node);
339	/* The node that holds dev->name acts as a head of per-device list. */
340	list_add_tail(&name_node->list, &dev->name_node->list);
341
342	return 0;
343}
344
345static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
346{
347	list_del(&name_node->list);
348	kfree(name_node->name);
349	netdev_name_node_free(name_node);
350}
351
352int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
353{
354	struct netdev_name_node *name_node;
355	struct net *net = dev_net(dev);
356
357	name_node = netdev_name_node_lookup(net, name);
358	if (!name_node)
359		return -ENOENT;
360	/* lookup might have found our primary name or a name belonging
361	 * to another device.
362	 */
363	if (name_node == dev->name_node || name_node->dev != dev)
364		return -EINVAL;
365
366	netdev_name_node_del(name_node);
367	synchronize_rcu();
368	__netdev_name_node_alt_destroy(name_node);
369
370	return 0;
371}
372
373static void netdev_name_node_alt_flush(struct net_device *dev)
374{
375	struct netdev_name_node *name_node, *tmp;
376
377	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
378		__netdev_name_node_alt_destroy(name_node);
379}
380
381/* Device list insertion */
382static void list_netdevice(struct net_device *dev)
383{
384	struct netdev_name_node *name_node;
385	struct net *net = dev_net(dev);
386
387	ASSERT_RTNL();
388
389	write_lock(&dev_base_lock);
390	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
391	netdev_name_node_add(net, dev->name_node);
392	hlist_add_head_rcu(&dev->index_hlist,
393			   dev_index_hash(net, dev->ifindex));
394	write_unlock(&dev_base_lock);
395
396	netdev_for_each_altname(dev, name_node)
397		netdev_name_node_add(net, name_node);
398
399	/* We reserved the ifindex, this can't fail */
400	WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
401
402	dev_base_seq_inc(net);
403}
404
405/* Device list removal
406 * caller must respect a RCU grace period before freeing/reusing dev
407 */
408static void unlist_netdevice(struct net_device *dev, bool lock)
409{
410	struct netdev_name_node *name_node;
411	struct net *net = dev_net(dev);
412
413	ASSERT_RTNL();
414
415	xa_erase(&net->dev_by_index, dev->ifindex);
416
417	netdev_for_each_altname(dev, name_node)
418		netdev_name_node_del(name_node);
419
420	/* Unlink dev from the device chain */
421	if (lock)
422		write_lock(&dev_base_lock);
423	list_del_rcu(&dev->dev_list);
424	netdev_name_node_del(dev->name_node);
425	hlist_del_rcu(&dev->index_hlist);
426	if (lock)
427		write_unlock(&dev_base_lock);
428
429	dev_base_seq_inc(dev_net(dev));
430}
431
432/*
433 *	Our notifier list
434 */
435
436static RAW_NOTIFIER_HEAD(netdev_chain);
437
438/*
439 *	Device drivers call our routines to queue packets here. We empty the
440 *	queue in the local softnet handler.
441 */
442
443DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
444EXPORT_PER_CPU_SYMBOL(softnet_data);
445
446#ifdef CONFIG_LOCKDEP
447/*
448 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
449 * according to dev->type
450 */
451static const unsigned short netdev_lock_type[] = {
452	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
453	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
454	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
455	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
456	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
457	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
458	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
459	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
460	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
461	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
462	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
463	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
464	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
465	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
466	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
467
468static const char *const netdev_lock_name[] = {
469	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
470	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
471	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
472	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
473	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
474	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
475	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
476	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
477	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
478	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
479	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
480	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
481	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
482	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
483	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
484
485static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
486static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
487
488static inline unsigned short netdev_lock_pos(unsigned short dev_type)
489{
490	int i;
491
492	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
493		if (netdev_lock_type[i] == dev_type)
494			return i;
495	/* the last key is used by default */
496	return ARRAY_SIZE(netdev_lock_type) - 1;
497}
498
499static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
500						 unsigned short dev_type)
501{
502	int i;
503
504	i = netdev_lock_pos(dev_type);
505	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
506				   netdev_lock_name[i]);
507}
508
509static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
510{
511	int i;
512
513	i = netdev_lock_pos(dev->type);
514	lockdep_set_class_and_name(&dev->addr_list_lock,
515				   &netdev_addr_lock_key[i],
516				   netdev_lock_name[i]);
517}
518#else
519static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
520						 unsigned short dev_type)
521{
522}
523
524static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
525{
526}
527#endif
528
529/*******************************************************************************
530 *
531 *		Protocol management and registration routines
532 *
533 *******************************************************************************/
534
535
536/*
537 *	Add a protocol ID to the list. Now that the input handler is
538 *	smarter we can dispense with all the messy stuff that used to be
539 *	here.
540 *
541 *	BEWARE!!! Protocol handlers, mangling input packets,
542 *	MUST BE last in hash buckets and checking protocol handlers
543 *	MUST start from promiscuous ptype_all chain in net_bh.
544 *	It is true now, do not change it.
545 *	Explanation follows: if protocol handler, mangling packet, will
546 *	be the first on list, it is not able to sense, that packet
547 *	is cloned and should be copied-on-write, so that it will
548 *	change it and subsequent readers will get broken packet.
549 *							--ANK (980803)
550 */
551
552static inline struct list_head *ptype_head(const struct packet_type *pt)
553{
554	if (pt->type == htons(ETH_P_ALL))
555		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
556	else
557		return pt->dev ? &pt->dev->ptype_specific :
558				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
559}
560
561/**
562 *	dev_add_pack - add packet handler
563 *	@pt: packet type declaration
564 *
565 *	Add a protocol handler to the networking stack. The passed &packet_type
566 *	is linked into kernel lists and may not be freed until it has been
567 *	removed from the kernel lists.
568 *
569 *	This call does not sleep therefore it can not
570 *	guarantee all CPU's that are in middle of receiving packets
571 *	will see the new packet type (until the next received packet).
572 */
573
574void dev_add_pack(struct packet_type *pt)
575{
576	struct list_head *head = ptype_head(pt);
577
578	spin_lock(&ptype_lock);
579	list_add_rcu(&pt->list, head);
580	spin_unlock(&ptype_lock);
581}
582EXPORT_SYMBOL(dev_add_pack);
583
584/**
585 *	__dev_remove_pack	 - remove packet handler
586 *	@pt: packet type declaration
587 *
588 *	Remove a protocol handler that was previously added to the kernel
589 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
590 *	from the kernel lists and can be freed or reused once this function
591 *	returns.
592 *
593 *      The packet type might still be in use by receivers
594 *	and must not be freed until after all the CPU's have gone
595 *	through a quiescent state.
596 */
597void __dev_remove_pack(struct packet_type *pt)
598{
599	struct list_head *head = ptype_head(pt);
600	struct packet_type *pt1;
601
602	spin_lock(&ptype_lock);
603
604	list_for_each_entry(pt1, head, list) {
605		if (pt == pt1) {
606			list_del_rcu(&pt->list);
607			goto out;
608		}
609	}
610
611	pr_warn("dev_remove_pack: %p not found\n", pt);
612out:
613	spin_unlock(&ptype_lock);
614}
615EXPORT_SYMBOL(__dev_remove_pack);
616
617/**
618 *	dev_remove_pack	 - remove packet handler
619 *	@pt: packet type declaration
620 *
621 *	Remove a protocol handler that was previously added to the kernel
622 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
623 *	from the kernel lists and can be freed or reused once this function
624 *	returns.
625 *
626 *	This call sleeps to guarantee that no CPU is looking at the packet
627 *	type after return.
628 */
629void dev_remove_pack(struct packet_type *pt)
630{
631	__dev_remove_pack(pt);
632
633	synchronize_net();
634}
635EXPORT_SYMBOL(dev_remove_pack);
636
637
638/*******************************************************************************
639 *
640 *			    Device Interface Subroutines
641 *
642 *******************************************************************************/
643
644/**
645 *	dev_get_iflink	- get 'iflink' value of a interface
646 *	@dev: targeted interface
647 *
648 *	Indicates the ifindex the interface is linked to.
649 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
650 */
651
652int dev_get_iflink(const struct net_device *dev)
653{
654	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
655		return dev->netdev_ops->ndo_get_iflink(dev);
656
657	return dev->ifindex;
658}
659EXPORT_SYMBOL(dev_get_iflink);
660
661/**
662 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
663 *	@dev: targeted interface
664 *	@skb: The packet.
665 *
666 *	For better visibility of tunnel traffic OVS needs to retrieve
667 *	egress tunnel information for a packet. Following API allows
668 *	user to get this info.
669 */
670int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
671{
672	struct ip_tunnel_info *info;
673
674	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
675		return -EINVAL;
676
677	info = skb_tunnel_info_unclone(skb);
678	if (!info)
679		return -ENOMEM;
680	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
681		return -EINVAL;
682
683	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
684}
685EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
686
687static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
688{
689	int k = stack->num_paths++;
690
691	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
692		return NULL;
693
694	return &stack->path[k];
695}
696
697int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
698			  struct net_device_path_stack *stack)
699{
700	const struct net_device *last_dev;
701	struct net_device_path_ctx ctx = {
702		.dev	= dev,
703	};
704	struct net_device_path *path;
705	int ret = 0;
706
707	memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
708	stack->num_paths = 0;
709	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
710		last_dev = ctx.dev;
711		path = dev_fwd_path(stack);
712		if (!path)
713			return -1;
714
715		memset(path, 0, sizeof(struct net_device_path));
716		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
717		if (ret < 0)
718			return -1;
719
720		if (WARN_ON_ONCE(last_dev == ctx.dev))
721			return -1;
722	}
723
724	if (!ctx.dev)
725		return ret;
726
727	path = dev_fwd_path(stack);
728	if (!path)
729		return -1;
730	path->type = DEV_PATH_ETHERNET;
731	path->dev = ctx.dev;
732
733	return ret;
734}
735EXPORT_SYMBOL_GPL(dev_fill_forward_path);
736
737/**
738 *	__dev_get_by_name	- find a device by its name
739 *	@net: the applicable net namespace
740 *	@name: name to find
741 *
742 *	Find an interface by name. Must be called under RTNL semaphore
743 *	or @dev_base_lock. If the name is found a pointer to the device
744 *	is returned. If the name is not found then %NULL is returned. The
745 *	reference counters are not incremented so the caller must be
746 *	careful with locks.
747 */
748
749struct net_device *__dev_get_by_name(struct net *net, const char *name)
750{
751	struct netdev_name_node *node_name;
752
753	node_name = netdev_name_node_lookup(net, name);
754	return node_name ? node_name->dev : NULL;
755}
756EXPORT_SYMBOL(__dev_get_by_name);
757
758/**
759 * dev_get_by_name_rcu	- find a device by its name
760 * @net: the applicable net namespace
761 * @name: name to find
762 *
763 * Find an interface by name.
764 * If the name is found a pointer to the device is returned.
765 * If the name is not found then %NULL is returned.
766 * The reference counters are not incremented so the caller must be
767 * careful with locks. The caller must hold RCU lock.
768 */
769
770struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
771{
772	struct netdev_name_node *node_name;
773
774	node_name = netdev_name_node_lookup_rcu(net, name);
775	return node_name ? node_name->dev : NULL;
776}
777EXPORT_SYMBOL(dev_get_by_name_rcu);
778
779/* Deprecated for new users, call netdev_get_by_name() instead */
780struct net_device *dev_get_by_name(struct net *net, const char *name)
781{
782	struct net_device *dev;
783
784	rcu_read_lock();
785	dev = dev_get_by_name_rcu(net, name);
786	dev_hold(dev);
787	rcu_read_unlock();
788	return dev;
789}
790EXPORT_SYMBOL(dev_get_by_name);
791
792/**
793 *	netdev_get_by_name() - find a device by its name
794 *	@net: the applicable net namespace
795 *	@name: name to find
796 *	@tracker: tracking object for the acquired reference
797 *	@gfp: allocation flags for the tracker
798 *
799 *	Find an interface by name. This can be called from any
800 *	context and does its own locking. The returned handle has
801 *	the usage count incremented and the caller must use netdev_put() to
802 *	release it when it is no longer needed. %NULL is returned if no
803 *	matching device is found.
804 */
805struct net_device *netdev_get_by_name(struct net *net, const char *name,
806				      netdevice_tracker *tracker, gfp_t gfp)
807{
808	struct net_device *dev;
809
810	dev = dev_get_by_name(net, name);
811	if (dev)
812		netdev_tracker_alloc(dev, tracker, gfp);
813	return dev;
814}
815EXPORT_SYMBOL(netdev_get_by_name);
816
817/**
818 *	__dev_get_by_index - find a device by its ifindex
819 *	@net: the applicable net namespace
820 *	@ifindex: index of device
821 *
822 *	Search for an interface by index. Returns %NULL if the device
823 *	is not found or a pointer to the device. The device has not
824 *	had its reference counter increased so the caller must be careful
825 *	about locking. The caller must hold either the RTNL semaphore
826 *	or @dev_base_lock.
827 */
828
829struct net_device *__dev_get_by_index(struct net *net, int ifindex)
830{
831	struct net_device *dev;
832	struct hlist_head *head = dev_index_hash(net, ifindex);
833
834	hlist_for_each_entry(dev, head, index_hlist)
835		if (dev->ifindex == ifindex)
836			return dev;
837
838	return NULL;
839}
840EXPORT_SYMBOL(__dev_get_by_index);
841
842/**
843 *	dev_get_by_index_rcu - find a device by its ifindex
844 *	@net: the applicable net namespace
845 *	@ifindex: index of device
846 *
847 *	Search for an interface by index. Returns %NULL if the device
848 *	is not found or a pointer to the device. The device has not
849 *	had its reference counter increased so the caller must be careful
850 *	about locking. The caller must hold RCU lock.
851 */
852
853struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
854{
855	struct net_device *dev;
856	struct hlist_head *head = dev_index_hash(net, ifindex);
857
858	hlist_for_each_entry_rcu(dev, head, index_hlist)
859		if (dev->ifindex == ifindex)
860			return dev;
861
862	return NULL;
863}
864EXPORT_SYMBOL(dev_get_by_index_rcu);
865
866/* Deprecated for new users, call netdev_get_by_index() instead */
867struct net_device *dev_get_by_index(struct net *net, int ifindex)
868{
869	struct net_device *dev;
870
871	rcu_read_lock();
872	dev = dev_get_by_index_rcu(net, ifindex);
873	dev_hold(dev);
874	rcu_read_unlock();
875	return dev;
876}
877EXPORT_SYMBOL(dev_get_by_index);
878
879/**
880 *	netdev_get_by_index() - find a device by its ifindex
881 *	@net: the applicable net namespace
882 *	@ifindex: index of device
883 *	@tracker: tracking object for the acquired reference
884 *	@gfp: allocation flags for the tracker
885 *
886 *	Search for an interface by index. Returns NULL if the device
887 *	is not found or a pointer to the device. The device returned has
888 *	had a reference added and the pointer is safe until the user calls
889 *	netdev_put() to indicate they have finished with it.
890 */
891struct net_device *netdev_get_by_index(struct net *net, int ifindex,
892				       netdevice_tracker *tracker, gfp_t gfp)
893{
894	struct net_device *dev;
895
896	dev = dev_get_by_index(net, ifindex);
897	if (dev)
898		netdev_tracker_alloc(dev, tracker, gfp);
899	return dev;
900}
901EXPORT_SYMBOL(netdev_get_by_index);
902
903/**
904 *	dev_get_by_napi_id - find a device by napi_id
905 *	@napi_id: ID of the NAPI struct
906 *
907 *	Search for an interface by NAPI ID. Returns %NULL if the device
908 *	is not found or a pointer to the device. The device has not had
909 *	its reference counter increased so the caller must be careful
910 *	about locking. The caller must hold RCU lock.
911 */
912
913struct net_device *dev_get_by_napi_id(unsigned int napi_id)
914{
915	struct napi_struct *napi;
916
917	WARN_ON_ONCE(!rcu_read_lock_held());
918
919	if (napi_id < MIN_NAPI_ID)
920		return NULL;
921
922	napi = napi_by_id(napi_id);
923
924	return napi ? napi->dev : NULL;
925}
926EXPORT_SYMBOL(dev_get_by_napi_id);
927
928/**
929 *	netdev_get_name - get a netdevice name, knowing its ifindex.
930 *	@net: network namespace
931 *	@name: a pointer to the buffer where the name will be stored.
932 *	@ifindex: the ifindex of the interface to get the name from.
933 */
934int netdev_get_name(struct net *net, char *name, int ifindex)
935{
936	struct net_device *dev;
937	int ret;
938
939	down_read(&devnet_rename_sem);
940	rcu_read_lock();
941
942	dev = dev_get_by_index_rcu(net, ifindex);
943	if (!dev) {
944		ret = -ENODEV;
945		goto out;
946	}
947
948	strcpy(name, dev->name);
949
950	ret = 0;
951out:
952	rcu_read_unlock();
953	up_read(&devnet_rename_sem);
954	return ret;
955}
956
957/**
958 *	dev_getbyhwaddr_rcu - find a device by its hardware address
959 *	@net: the applicable net namespace
960 *	@type: media type of device
961 *	@ha: hardware address
962 *
963 *	Search for an interface by MAC address. Returns NULL if the device
964 *	is not found or a pointer to the device.
965 *	The caller must hold RCU or RTNL.
966 *	The returned device has not had its ref count increased
967 *	and the caller must therefore be careful about locking
968 *
969 */
970
971struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
972				       const char *ha)
973{
974	struct net_device *dev;
975
976	for_each_netdev_rcu(net, dev)
977		if (dev->type == type &&
978		    !memcmp(dev->dev_addr, ha, dev->addr_len))
979			return dev;
980
981	return NULL;
982}
983EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
984
985struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
986{
987	struct net_device *dev, *ret = NULL;
988
989	rcu_read_lock();
990	for_each_netdev_rcu(net, dev)
991		if (dev->type == type) {
992			dev_hold(dev);
993			ret = dev;
994			break;
995		}
996	rcu_read_unlock();
997	return ret;
998}
999EXPORT_SYMBOL(dev_getfirstbyhwtype);
1000
1001/**
1002 *	__dev_get_by_flags - find any device with given flags
1003 *	@net: the applicable net namespace
1004 *	@if_flags: IFF_* values
1005 *	@mask: bitmask of bits in if_flags to check
1006 *
1007 *	Search for any interface with the given flags. Returns NULL if a device
1008 *	is not found or a pointer to the device. Must be called inside
1009 *	rtnl_lock(), and result refcount is unchanged.
1010 */
1011
1012struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1013				      unsigned short mask)
1014{
1015	struct net_device *dev, *ret;
1016
1017	ASSERT_RTNL();
1018
1019	ret = NULL;
1020	for_each_netdev(net, dev) {
1021		if (((dev->flags ^ if_flags) & mask) == 0) {
1022			ret = dev;
1023			break;
1024		}
1025	}
1026	return ret;
1027}
1028EXPORT_SYMBOL(__dev_get_by_flags);
1029
1030/**
1031 *	dev_valid_name - check if name is okay for network device
1032 *	@name: name string
1033 *
1034 *	Network device names need to be valid file names to
1035 *	allow sysfs to work.  We also disallow any kind of
1036 *	whitespace.
1037 */
1038bool dev_valid_name(const char *name)
1039{
1040	if (*name == '\0')
1041		return false;
1042	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1043		return false;
1044	if (!strcmp(name, ".") || !strcmp(name, ".."))
1045		return false;
1046
1047	while (*name) {
1048		if (*name == '/' || *name == ':' || isspace(*name))
1049			return false;
1050		name++;
1051	}
1052	return true;
1053}
1054EXPORT_SYMBOL(dev_valid_name);
1055
1056/**
1057 *	__dev_alloc_name - allocate a name for a device
1058 *	@net: network namespace to allocate the device name in
1059 *	@name: name format string
1060 *	@buf:  scratch buffer and result name string
1061 *
1062 *	Passed a format string - eg "lt%d" it will try and find a suitable
1063 *	id. It scans list of devices to build up a free map, then chooses
1064 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1065 *	while allocating the name and adding the device in order to avoid
1066 *	duplicates.
1067 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1068 *	Returns the number of the unit assigned or a negative errno code.
1069 */
1070
1071static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1072{
1073	int i = 0;
1074	const char *p;
1075	const int max_netdevices = 8*PAGE_SIZE;
1076	unsigned long *inuse;
1077	struct net_device *d;
1078
1079	if (!dev_valid_name(name))
1080		return -EINVAL;
1081
1082	p = strchr(name, '%');
1083	if (p) {
1084		/*
1085		 * Verify the string as this thing may have come from
1086		 * the user.  There must be either one "%d" and no other "%"
1087		 * characters.
1088		 */
1089		if (p[1] != 'd' || strchr(p + 2, '%'))
1090			return -EINVAL;
1091
1092		/* Use one page as a bit array of possible slots */
1093		inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
1094		if (!inuse)
1095			return -ENOMEM;
1096
1097		for_each_netdev(net, d) {
1098			struct netdev_name_node *name_node;
1099
1100			netdev_for_each_altname(d, name_node) {
1101				if (!sscanf(name_node->name, name, &i))
1102					continue;
1103				if (i < 0 || i >= max_netdevices)
1104					continue;
1105
1106				/*  avoid cases where sscanf is not exact inverse of printf */
1107				snprintf(buf, IFNAMSIZ, name, i);
1108				if (!strncmp(buf, name_node->name, IFNAMSIZ))
1109					__set_bit(i, inuse);
1110			}
1111			if (!sscanf(d->name, name, &i))
1112				continue;
1113			if (i < 0 || i >= max_netdevices)
1114				continue;
1115
1116			/*  avoid cases where sscanf is not exact inverse of printf */
1117			snprintf(buf, IFNAMSIZ, name, i);
1118			if (!strncmp(buf, d->name, IFNAMSIZ))
1119				__set_bit(i, inuse);
1120		}
1121
1122		i = find_first_zero_bit(inuse, max_netdevices);
1123		bitmap_free(inuse);
1124	}
1125
1126	snprintf(buf, IFNAMSIZ, name, i);
1127	if (!netdev_name_in_use(net, buf))
1128		return i;
1129
1130	/* It is possible to run out of possible slots
1131	 * when the name is long and there isn't enough space left
1132	 * for the digits, or if all bits are used.
1133	 */
1134	return -ENFILE;
1135}
1136
1137static int dev_prep_valid_name(struct net *net, struct net_device *dev,
1138			       const char *want_name, char *out_name)
1139{
1140	int ret;
1141
1142	if (!dev_valid_name(want_name))
1143		return -EINVAL;
1144
1145	if (strchr(want_name, '%')) {
1146		ret = __dev_alloc_name(net, want_name, out_name);
1147		return ret < 0 ? ret : 0;
1148	} else if (netdev_name_in_use(net, want_name)) {
1149		return -EEXIST;
1150	} else if (out_name != want_name) {
1151		strscpy(out_name, want_name, IFNAMSIZ);
1152	}
1153
1154	return 0;
1155}
1156
1157static int dev_alloc_name_ns(struct net *net,
1158			     struct net_device *dev,
1159			     const char *name)
1160{
1161	char buf[IFNAMSIZ];
1162	int ret;
1163
1164	BUG_ON(!net);
1165	ret = __dev_alloc_name(net, name, buf);
1166	if (ret >= 0)
1167		strscpy(dev->name, buf, IFNAMSIZ);
1168	return ret;
1169}
1170
1171/**
1172 *	dev_alloc_name - allocate a name for a device
1173 *	@dev: device
1174 *	@name: name format string
1175 *
1176 *	Passed a format string - eg "lt%d" it will try and find a suitable
1177 *	id. It scans list of devices to build up a free map, then chooses
1178 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1179 *	while allocating the name and adding the device in order to avoid
1180 *	duplicates.
1181 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1182 *	Returns the number of the unit assigned or a negative errno code.
1183 */
1184
1185int dev_alloc_name(struct net_device *dev, const char *name)
1186{
1187	return dev_alloc_name_ns(dev_net(dev), dev, name);
1188}
1189EXPORT_SYMBOL(dev_alloc_name);
1190
1191static int dev_get_valid_name(struct net *net, struct net_device *dev,
1192			      const char *name)
1193{
1194	char buf[IFNAMSIZ];
1195	int ret;
1196
1197	ret = dev_prep_valid_name(net, dev, name, buf);
1198	if (ret >= 0)
1199		strscpy(dev->name, buf, IFNAMSIZ);
1200	return ret;
1201}
1202
1203/**
1204 *	dev_change_name - change name of a device
1205 *	@dev: device
1206 *	@newname: name (or format string) must be at least IFNAMSIZ
1207 *
1208 *	Change name of a device, can pass format strings "eth%d".
1209 *	for wildcarding.
1210 */
1211int dev_change_name(struct net_device *dev, const char *newname)
1212{
1213	unsigned char old_assign_type;
1214	char oldname[IFNAMSIZ];
1215	int err = 0;
1216	int ret;
1217	struct net *net;
1218
1219	ASSERT_RTNL();
1220	BUG_ON(!dev_net(dev));
1221
1222	net = dev_net(dev);
1223
1224	down_write(&devnet_rename_sem);
1225
1226	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1227		up_write(&devnet_rename_sem);
1228		return 0;
1229	}
1230
1231	memcpy(oldname, dev->name, IFNAMSIZ);
1232
1233	err = dev_get_valid_name(net, dev, newname);
1234	if (err < 0) {
1235		up_write(&devnet_rename_sem);
1236		return err;
1237	}
1238
1239	if (oldname[0] && !strchr(oldname, '%'))
1240		netdev_info(dev, "renamed from %s%s\n", oldname,
1241			    dev->flags & IFF_UP ? " (while UP)" : "");
1242
1243	old_assign_type = dev->name_assign_type;
1244	dev->name_assign_type = NET_NAME_RENAMED;
1245
1246rollback:
1247	ret = device_rename(&dev->dev, dev->name);
1248	if (ret) {
1249		memcpy(dev->name, oldname, IFNAMSIZ);
1250		dev->name_assign_type = old_assign_type;
1251		up_write(&devnet_rename_sem);
1252		return ret;
1253	}
1254
1255	up_write(&devnet_rename_sem);
1256
1257	netdev_adjacent_rename_links(dev, oldname);
1258
1259	write_lock(&dev_base_lock);
1260	netdev_name_node_del(dev->name_node);
1261	write_unlock(&dev_base_lock);
1262
1263	synchronize_rcu();
1264
1265	write_lock(&dev_base_lock);
1266	netdev_name_node_add(net, dev->name_node);
1267	write_unlock(&dev_base_lock);
1268
1269	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1270	ret = notifier_to_errno(ret);
1271
1272	if (ret) {
1273		/* err >= 0 after dev_alloc_name() or stores the first errno */
1274		if (err >= 0) {
1275			err = ret;
1276			down_write(&devnet_rename_sem);
1277			memcpy(dev->name, oldname, IFNAMSIZ);
1278			memcpy(oldname, newname, IFNAMSIZ);
1279			dev->name_assign_type = old_assign_type;
1280			old_assign_type = NET_NAME_RENAMED;
1281			goto rollback;
1282		} else {
1283			netdev_err(dev, "name change rollback failed: %d\n",
1284				   ret);
1285		}
1286	}
1287
1288	return err;
1289}
1290
1291/**
1292 *	dev_set_alias - change ifalias of a device
1293 *	@dev: device
1294 *	@alias: name up to IFALIASZ
1295 *	@len: limit of bytes to copy from info
1296 *
1297 *	Set ifalias for a device,
1298 */
1299int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1300{
1301	struct dev_ifalias *new_alias = NULL;
1302
1303	if (len >= IFALIASZ)
1304		return -EINVAL;
1305
1306	if (len) {
1307		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1308		if (!new_alias)
1309			return -ENOMEM;
1310
1311		memcpy(new_alias->ifalias, alias, len);
1312		new_alias->ifalias[len] = 0;
1313	}
1314
1315	mutex_lock(&ifalias_mutex);
1316	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1317					mutex_is_locked(&ifalias_mutex));
1318	mutex_unlock(&ifalias_mutex);
1319
1320	if (new_alias)
1321		kfree_rcu(new_alias, rcuhead);
1322
1323	return len;
1324}
1325EXPORT_SYMBOL(dev_set_alias);
1326
1327/**
1328 *	dev_get_alias - get ifalias of a device
1329 *	@dev: device
1330 *	@name: buffer to store name of ifalias
1331 *	@len: size of buffer
1332 *
1333 *	get ifalias for a device.  Caller must make sure dev cannot go
1334 *	away,  e.g. rcu read lock or own a reference count to device.
1335 */
1336int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1337{
1338	const struct dev_ifalias *alias;
1339	int ret = 0;
1340
1341	rcu_read_lock();
1342	alias = rcu_dereference(dev->ifalias);
1343	if (alias)
1344		ret = snprintf(name, len, "%s", alias->ifalias);
1345	rcu_read_unlock();
1346
1347	return ret;
1348}
1349
1350/**
1351 *	netdev_features_change - device changes features
1352 *	@dev: device to cause notification
1353 *
1354 *	Called to indicate a device has changed features.
1355 */
1356void netdev_features_change(struct net_device *dev)
1357{
1358	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1359}
1360EXPORT_SYMBOL(netdev_features_change);
1361
1362/**
1363 *	netdev_state_change - device changes state
1364 *	@dev: device to cause notification
1365 *
1366 *	Called to indicate a device has changed state. This function calls
1367 *	the notifier chains for netdev_chain and sends a NEWLINK message
1368 *	to the routing socket.
1369 */
1370void netdev_state_change(struct net_device *dev)
1371{
1372	if (dev->flags & IFF_UP) {
1373		struct netdev_notifier_change_info change_info = {
1374			.info.dev = dev,
1375		};
1376
1377		call_netdevice_notifiers_info(NETDEV_CHANGE,
1378					      &change_info.info);
1379		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1380	}
1381}
1382EXPORT_SYMBOL(netdev_state_change);
1383
1384/**
1385 * __netdev_notify_peers - notify network peers about existence of @dev,
1386 * to be called when rtnl lock is already held.
1387 * @dev: network device
1388 *
1389 * Generate traffic such that interested network peers are aware of
1390 * @dev, such as by generating a gratuitous ARP. This may be used when
1391 * a device wants to inform the rest of the network about some sort of
1392 * reconfiguration such as a failover event or virtual machine
1393 * migration.
1394 */
1395void __netdev_notify_peers(struct net_device *dev)
1396{
1397	ASSERT_RTNL();
1398	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1399	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1400}
1401EXPORT_SYMBOL(__netdev_notify_peers);
1402
1403/**
1404 * netdev_notify_peers - notify network peers about existence of @dev
1405 * @dev: network device
1406 *
1407 * Generate traffic such that interested network peers are aware of
1408 * @dev, such as by generating a gratuitous ARP. This may be used when
1409 * a device wants to inform the rest of the network about some sort of
1410 * reconfiguration such as a failover event or virtual machine
1411 * migration.
1412 */
1413void netdev_notify_peers(struct net_device *dev)
1414{
1415	rtnl_lock();
1416	__netdev_notify_peers(dev);
1417	rtnl_unlock();
1418}
1419EXPORT_SYMBOL(netdev_notify_peers);
1420
1421static int napi_threaded_poll(void *data);
1422
1423static int napi_kthread_create(struct napi_struct *n)
1424{
1425	int err = 0;
1426
1427	/* Create and wake up the kthread once to put it in
1428	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
1429	 * warning and work with loadavg.
1430	 */
1431	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1432				n->dev->name, n->napi_id);
1433	if (IS_ERR(n->thread)) {
1434		err = PTR_ERR(n->thread);
1435		pr_err("kthread_run failed with err %d\n", err);
1436		n->thread = NULL;
1437	}
1438
1439	return err;
1440}
1441
1442static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1443{
1444	const struct net_device_ops *ops = dev->netdev_ops;
1445	int ret;
1446
1447	ASSERT_RTNL();
1448	dev_addr_check(dev);
1449
1450	if (!netif_device_present(dev)) {
1451		/* may be detached because parent is runtime-suspended */
1452		if (dev->dev.parent)
1453			pm_runtime_resume(dev->dev.parent);
1454		if (!netif_device_present(dev))
1455			return -ENODEV;
1456	}
1457
1458	/* Block netpoll from trying to do any rx path servicing.
1459	 * If we don't do this there is a chance ndo_poll_controller
1460	 * or ndo_poll may be running while we open the device
1461	 */
1462	netpoll_poll_disable(dev);
1463
1464	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1465	ret = notifier_to_errno(ret);
1466	if (ret)
1467		return ret;
1468
1469	set_bit(__LINK_STATE_START, &dev->state);
1470
1471	if (ops->ndo_validate_addr)
1472		ret = ops->ndo_validate_addr(dev);
1473
1474	if (!ret && ops->ndo_open)
1475		ret = ops->ndo_open(dev);
1476
1477	netpoll_poll_enable(dev);
1478
1479	if (ret)
1480		clear_bit(__LINK_STATE_START, &dev->state);
1481	else {
1482		dev->flags |= IFF_UP;
1483		dev_set_rx_mode(dev);
1484		dev_activate(dev);
1485		add_device_randomness(dev->dev_addr, dev->addr_len);
1486	}
1487
1488	return ret;
1489}
1490
1491/**
1492 *	dev_open	- prepare an interface for use.
1493 *	@dev: device to open
1494 *	@extack: netlink extended ack
1495 *
1496 *	Takes a device from down to up state. The device's private open
1497 *	function is invoked and then the multicast lists are loaded. Finally
1498 *	the device is moved into the up state and a %NETDEV_UP message is
1499 *	sent to the netdev notifier chain.
1500 *
1501 *	Calling this function on an active interface is a nop. On a failure
1502 *	a negative errno code is returned.
1503 */
1504int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1505{
1506	int ret;
1507
1508	if (dev->flags & IFF_UP)
1509		return 0;
1510
1511	ret = __dev_open(dev, extack);
1512	if (ret < 0)
1513		return ret;
1514
1515	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1516	call_netdevice_notifiers(NETDEV_UP, dev);
1517
1518	return ret;
1519}
1520EXPORT_SYMBOL(dev_open);
1521
1522static void __dev_close_many(struct list_head *head)
1523{
1524	struct net_device *dev;
1525
1526	ASSERT_RTNL();
1527	might_sleep();
1528
1529	list_for_each_entry(dev, head, close_list) {
1530		/* Temporarily disable netpoll until the interface is down */
1531		netpoll_poll_disable(dev);
1532
1533		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1534
1535		clear_bit(__LINK_STATE_START, &dev->state);
1536
1537		/* Synchronize to scheduled poll. We cannot touch poll list, it
1538		 * can be even on different cpu. So just clear netif_running().
1539		 *
1540		 * dev->stop() will invoke napi_disable() on all of it's
1541		 * napi_struct instances on this device.
1542		 */
1543		smp_mb__after_atomic(); /* Commit netif_running(). */
1544	}
1545
1546	dev_deactivate_many(head);
1547
1548	list_for_each_entry(dev, head, close_list) {
1549		const struct net_device_ops *ops = dev->netdev_ops;
1550
1551		/*
1552		 *	Call the device specific close. This cannot fail.
1553		 *	Only if device is UP
1554		 *
1555		 *	We allow it to be called even after a DETACH hot-plug
1556		 *	event.
1557		 */
1558		if (ops->ndo_stop)
1559			ops->ndo_stop(dev);
1560
1561		dev->flags &= ~IFF_UP;
1562		netpoll_poll_enable(dev);
1563	}
1564}
1565
1566static void __dev_close(struct net_device *dev)
1567{
1568	LIST_HEAD(single);
1569
1570	list_add(&dev->close_list, &single);
1571	__dev_close_many(&single);
1572	list_del(&single);
1573}
1574
1575void dev_close_many(struct list_head *head, bool unlink)
1576{
1577	struct net_device *dev, *tmp;
1578
1579	/* Remove the devices that don't need to be closed */
1580	list_for_each_entry_safe(dev, tmp, head, close_list)
1581		if (!(dev->flags & IFF_UP))
1582			list_del_init(&dev->close_list);
1583
1584	__dev_close_many(head);
1585
1586	list_for_each_entry_safe(dev, tmp, head, close_list) {
1587		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1588		call_netdevice_notifiers(NETDEV_DOWN, dev);
1589		if (unlink)
1590			list_del_init(&dev->close_list);
1591	}
1592}
1593EXPORT_SYMBOL(dev_close_many);
1594
1595/**
1596 *	dev_close - shutdown an interface.
1597 *	@dev: device to shutdown
1598 *
1599 *	This function moves an active device into down state. A
1600 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1601 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1602 *	chain.
1603 */
1604void dev_close(struct net_device *dev)
1605{
1606	if (dev->flags & IFF_UP) {
1607		LIST_HEAD(single);
1608
1609		list_add(&dev->close_list, &single);
1610		dev_close_many(&single, true);
1611		list_del(&single);
1612	}
1613}
1614EXPORT_SYMBOL(dev_close);
1615
1616
1617/**
1618 *	dev_disable_lro - disable Large Receive Offload on a device
1619 *	@dev: device
1620 *
1621 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1622 *	called under RTNL.  This is needed if received packets may be
1623 *	forwarded to another interface.
1624 */
1625void dev_disable_lro(struct net_device *dev)
1626{
1627	struct net_device *lower_dev;
1628	struct list_head *iter;
1629
1630	dev->wanted_features &= ~NETIF_F_LRO;
1631	netdev_update_features(dev);
1632
1633	if (unlikely(dev->features & NETIF_F_LRO))
1634		netdev_WARN(dev, "failed to disable LRO!\n");
1635
1636	netdev_for_each_lower_dev(dev, lower_dev, iter)
1637		dev_disable_lro(lower_dev);
1638}
1639EXPORT_SYMBOL(dev_disable_lro);
1640
1641/**
1642 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1643 *	@dev: device
1644 *
1645 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1646 *	called under RTNL.  This is needed if Generic XDP is installed on
1647 *	the device.
1648 */
1649static void dev_disable_gro_hw(struct net_device *dev)
1650{
1651	dev->wanted_features &= ~NETIF_F_GRO_HW;
1652	netdev_update_features(dev);
1653
1654	if (unlikely(dev->features & NETIF_F_GRO_HW))
1655		netdev_WARN(dev, "failed to disable GRO_HW!\n");
1656}
1657
1658const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1659{
1660#define N(val) 						\
1661	case NETDEV_##val:				\
1662		return "NETDEV_" __stringify(val);
1663	switch (cmd) {
1664	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1665	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1666	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1667	N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1668	N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1669	N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1670	N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1671	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1672	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1673	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1674	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1675	N(XDP_FEAT_CHANGE)
1676	}
1677#undef N
1678	return "UNKNOWN_NETDEV_EVENT";
1679}
1680EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1681
1682static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1683				   struct net_device *dev)
1684{
1685	struct netdev_notifier_info info = {
1686		.dev = dev,
1687	};
1688
1689	return nb->notifier_call(nb, val, &info);
1690}
1691
1692static int call_netdevice_register_notifiers(struct notifier_block *nb,
1693					     struct net_device *dev)
1694{
1695	int err;
1696
1697	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1698	err = notifier_to_errno(err);
1699	if (err)
1700		return err;
1701
1702	if (!(dev->flags & IFF_UP))
1703		return 0;
1704
1705	call_netdevice_notifier(nb, NETDEV_UP, dev);
1706	return 0;
1707}
1708
1709static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1710						struct net_device *dev)
1711{
1712	if (dev->flags & IFF_UP) {
1713		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1714					dev);
1715		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1716	}
1717	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1718}
1719
1720static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1721						 struct net *net)
1722{
1723	struct net_device *dev;
1724	int err;
1725
1726	for_each_netdev(net, dev) {
1727		err = call_netdevice_register_notifiers(nb, dev);
1728		if (err)
1729			goto rollback;
1730	}
1731	return 0;
1732
1733rollback:
1734	for_each_netdev_continue_reverse(net, dev)
1735		call_netdevice_unregister_notifiers(nb, dev);
1736	return err;
1737}
1738
1739static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1740						    struct net *net)
1741{
1742	struct net_device *dev;
1743
1744	for_each_netdev(net, dev)
1745		call_netdevice_unregister_notifiers(nb, dev);
1746}
1747
1748static int dev_boot_phase = 1;
1749
1750/**
1751 * register_netdevice_notifier - register a network notifier block
1752 * @nb: notifier
1753 *
1754 * Register a notifier to be called when network device events occur.
1755 * The notifier passed is linked into the kernel structures and must
1756 * not be reused until it has been unregistered. A negative errno code
1757 * is returned on a failure.
1758 *
1759 * When registered all registration and up events are replayed
1760 * to the new notifier to allow device to have a race free
1761 * view of the network device list.
1762 */
1763
1764int register_netdevice_notifier(struct notifier_block *nb)
1765{
1766	struct net *net;
1767	int err;
1768
1769	/* Close race with setup_net() and cleanup_net() */
1770	down_write(&pernet_ops_rwsem);
1771	rtnl_lock();
1772	err = raw_notifier_chain_register(&netdev_chain, nb);
1773	if (err)
1774		goto unlock;
1775	if (dev_boot_phase)
1776		goto unlock;
1777	for_each_net(net) {
1778		err = call_netdevice_register_net_notifiers(nb, net);
1779		if (err)
1780			goto rollback;
1781	}
1782
1783unlock:
1784	rtnl_unlock();
1785	up_write(&pernet_ops_rwsem);
1786	return err;
1787
1788rollback:
1789	for_each_net_continue_reverse(net)
1790		call_netdevice_unregister_net_notifiers(nb, net);
1791
1792	raw_notifier_chain_unregister(&netdev_chain, nb);
1793	goto unlock;
1794}
1795EXPORT_SYMBOL(register_netdevice_notifier);
1796
1797/**
1798 * unregister_netdevice_notifier - unregister a network notifier block
1799 * @nb: notifier
1800 *
1801 * Unregister a notifier previously registered by
1802 * register_netdevice_notifier(). The notifier is unlinked into the
1803 * kernel structures and may then be reused. A negative errno code
1804 * is returned on a failure.
1805 *
1806 * After unregistering unregister and down device events are synthesized
1807 * for all devices on the device list to the removed notifier to remove
1808 * the need for special case cleanup code.
1809 */
1810
1811int unregister_netdevice_notifier(struct notifier_block *nb)
1812{
1813	struct net *net;
1814	int err;
1815
1816	/* Close race with setup_net() and cleanup_net() */
1817	down_write(&pernet_ops_rwsem);
1818	rtnl_lock();
1819	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1820	if (err)
1821		goto unlock;
1822
1823	for_each_net(net)
1824		call_netdevice_unregister_net_notifiers(nb, net);
1825
1826unlock:
1827	rtnl_unlock();
1828	up_write(&pernet_ops_rwsem);
1829	return err;
1830}
1831EXPORT_SYMBOL(unregister_netdevice_notifier);
1832
1833static int __register_netdevice_notifier_net(struct net *net,
1834					     struct notifier_block *nb,
1835					     bool ignore_call_fail)
1836{
1837	int err;
1838
1839	err = raw_notifier_chain_register(&net->netdev_chain, nb);
1840	if (err)
1841		return err;
1842	if (dev_boot_phase)
1843		return 0;
1844
1845	err = call_netdevice_register_net_notifiers(nb, net);
1846	if (err && !ignore_call_fail)
1847		goto chain_unregister;
1848
1849	return 0;
1850
1851chain_unregister:
1852	raw_notifier_chain_unregister(&net->netdev_chain, nb);
1853	return err;
1854}
1855
1856static int __unregister_netdevice_notifier_net(struct net *net,
1857					       struct notifier_block *nb)
1858{
1859	int err;
1860
1861	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1862	if (err)
1863		return err;
1864
1865	call_netdevice_unregister_net_notifiers(nb, net);
1866	return 0;
1867}
1868
1869/**
1870 * register_netdevice_notifier_net - register a per-netns network notifier block
1871 * @net: network namespace
1872 * @nb: notifier
1873 *
1874 * Register a notifier to be called when network device events occur.
1875 * The notifier passed is linked into the kernel structures and must
1876 * not be reused until it has been unregistered. A negative errno code
1877 * is returned on a failure.
1878 *
1879 * When registered all registration and up events are replayed
1880 * to the new notifier to allow device to have a race free
1881 * view of the network device list.
1882 */
1883
1884int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1885{
1886	int err;
1887
1888	rtnl_lock();
1889	err = __register_netdevice_notifier_net(net, nb, false);
1890	rtnl_unlock();
1891	return err;
1892}
1893EXPORT_SYMBOL(register_netdevice_notifier_net);
1894
1895/**
1896 * unregister_netdevice_notifier_net - unregister a per-netns
1897 *                                     network notifier block
1898 * @net: network namespace
1899 * @nb: notifier
1900 *
1901 * Unregister a notifier previously registered by
1902 * register_netdevice_notifier_net(). The notifier is unlinked from the
1903 * kernel structures and may then be reused. A negative errno code
1904 * is returned on a failure.
1905 *
1906 * After unregistering unregister and down device events are synthesized
1907 * for all devices on the device list to the removed notifier to remove
1908 * the need for special case cleanup code.
1909 */
1910
1911int unregister_netdevice_notifier_net(struct net *net,
1912				      struct notifier_block *nb)
1913{
1914	int err;
1915
1916	rtnl_lock();
1917	err = __unregister_netdevice_notifier_net(net, nb);
1918	rtnl_unlock();
1919	return err;
1920}
1921EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1922
1923static void __move_netdevice_notifier_net(struct net *src_net,
1924					  struct net *dst_net,
1925					  struct notifier_block *nb)
1926{
1927	__unregister_netdevice_notifier_net(src_net, nb);
1928	__register_netdevice_notifier_net(dst_net, nb, true);
1929}
1930
1931int register_netdevice_notifier_dev_net(struct net_device *dev,
1932					struct notifier_block *nb,
1933					struct netdev_net_notifier *nn)
1934{
1935	int err;
1936
1937	rtnl_lock();
1938	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1939	if (!err) {
1940		nn->nb = nb;
1941		list_add(&nn->list, &dev->net_notifier_list);
1942	}
1943	rtnl_unlock();
1944	return err;
1945}
1946EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1947
1948int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1949					  struct notifier_block *nb,
1950					  struct netdev_net_notifier *nn)
1951{
1952	int err;
1953
1954	rtnl_lock();
1955	list_del(&nn->list);
1956	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1957	rtnl_unlock();
1958	return err;
1959}
1960EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1961
1962static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1963					     struct net *net)
1964{
1965	struct netdev_net_notifier *nn;
1966
1967	list_for_each_entry(nn, &dev->net_notifier_list, list)
1968		__move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
1969}
1970
1971/**
1972 *	call_netdevice_notifiers_info - call all network notifier blocks
1973 *	@val: value passed unmodified to notifier function
1974 *	@info: notifier information data
1975 *
1976 *	Call all network notifier blocks.  Parameters and return value
1977 *	are as for raw_notifier_call_chain().
1978 */
1979
1980int call_netdevice_notifiers_info(unsigned long val,
1981				  struct netdev_notifier_info *info)
1982{
1983	struct net *net = dev_net(info->dev);
1984	int ret;
1985
1986	ASSERT_RTNL();
1987
1988	/* Run per-netns notifier block chain first, then run the global one.
1989	 * Hopefully, one day, the global one is going to be removed after
1990	 * all notifier block registrators get converted to be per-netns.
1991	 */
1992	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1993	if (ret & NOTIFY_STOP_MASK)
1994		return ret;
1995	return raw_notifier_call_chain(&netdev_chain, val, info);
1996}
1997
1998/**
1999 *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
2000 *	                                       for and rollback on error
2001 *	@val_up: value passed unmodified to notifier function
2002 *	@val_down: value passed unmodified to the notifier function when
2003 *	           recovering from an error on @val_up
2004 *	@info: notifier information data
2005 *
2006 *	Call all per-netns network notifier blocks, but not notifier blocks on
2007 *	the global notifier chain. Parameters and return value are as for
2008 *	raw_notifier_call_chain_robust().
2009 */
2010
2011static int
2012call_netdevice_notifiers_info_robust(unsigned long val_up,
2013				     unsigned long val_down,
2014				     struct netdev_notifier_info *info)
2015{
2016	struct net *net = dev_net(info->dev);
2017
2018	ASSERT_RTNL();
2019
2020	return raw_notifier_call_chain_robust(&net->netdev_chain,
2021					      val_up, val_down, info);
2022}
2023
2024static int call_netdevice_notifiers_extack(unsigned long val,
2025					   struct net_device *dev,
2026					   struct netlink_ext_ack *extack)
2027{
2028	struct netdev_notifier_info info = {
2029		.dev = dev,
2030		.extack = extack,
2031	};
2032
2033	return call_netdevice_notifiers_info(val, &info);
2034}
2035
2036/**
2037 *	call_netdevice_notifiers - call all network notifier blocks
2038 *      @val: value passed unmodified to notifier function
2039 *      @dev: net_device pointer passed unmodified to notifier function
2040 *
2041 *	Call all network notifier blocks.  Parameters and return value
2042 *	are as for raw_notifier_call_chain().
2043 */
2044
2045int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2046{
2047	return call_netdevice_notifiers_extack(val, dev, NULL);
2048}
2049EXPORT_SYMBOL(call_netdevice_notifiers);
2050
2051/**
2052 *	call_netdevice_notifiers_mtu - call all network notifier blocks
2053 *	@val: value passed unmodified to notifier function
2054 *	@dev: net_device pointer passed unmodified to notifier function
2055 *	@arg: additional u32 argument passed to the notifier function
2056 *
2057 *	Call all network notifier blocks.  Parameters and return value
2058 *	are as for raw_notifier_call_chain().
2059 */
2060static int call_netdevice_notifiers_mtu(unsigned long val,
2061					struct net_device *dev, u32 arg)
2062{
2063	struct netdev_notifier_info_ext info = {
2064		.info.dev = dev,
2065		.ext.mtu = arg,
2066	};
2067
2068	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2069
2070	return call_netdevice_notifiers_info(val, &info.info);
2071}
2072
2073#ifdef CONFIG_NET_INGRESS
2074static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2075
2076void net_inc_ingress_queue(void)
2077{
2078	static_branch_inc(&ingress_needed_key);
2079}
2080EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2081
2082void net_dec_ingress_queue(void)
2083{
2084	static_branch_dec(&ingress_needed_key);
2085}
2086EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2087#endif
2088
2089#ifdef CONFIG_NET_EGRESS
2090static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2091
2092void net_inc_egress_queue(void)
2093{
2094	static_branch_inc(&egress_needed_key);
2095}
2096EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2097
2098void net_dec_egress_queue(void)
2099{
2100	static_branch_dec(&egress_needed_key);
2101}
2102EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2103#endif
2104
2105DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2106EXPORT_SYMBOL(netstamp_needed_key);
2107#ifdef CONFIG_JUMP_LABEL
2108static atomic_t netstamp_needed_deferred;
2109static atomic_t netstamp_wanted;
2110static void netstamp_clear(struct work_struct *work)
2111{
2112	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2113	int wanted;
2114
2115	wanted = atomic_add_return(deferred, &netstamp_wanted);
2116	if (wanted > 0)
2117		static_branch_enable(&netstamp_needed_key);
2118	else
2119		static_branch_disable(&netstamp_needed_key);
2120}
2121static DECLARE_WORK(netstamp_work, netstamp_clear);
2122#endif
2123
2124void net_enable_timestamp(void)
2125{
2126#ifdef CONFIG_JUMP_LABEL
2127	int wanted = atomic_read(&netstamp_wanted);
2128
2129	while (wanted > 0) {
2130		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2131			return;
2132	}
2133	atomic_inc(&netstamp_needed_deferred);
2134	schedule_work(&netstamp_work);
2135#else
2136	static_branch_inc(&netstamp_needed_key);
2137#endif
2138}
2139EXPORT_SYMBOL(net_enable_timestamp);
2140
2141void net_disable_timestamp(void)
2142{
2143#ifdef CONFIG_JUMP_LABEL
2144	int wanted = atomic_read(&netstamp_wanted);
2145
2146	while (wanted > 1) {
2147		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2148			return;
2149	}
2150	atomic_dec(&netstamp_needed_deferred);
2151	schedule_work(&netstamp_work);
2152#else
2153	static_branch_dec(&netstamp_needed_key);
2154#endif
2155}
2156EXPORT_SYMBOL(net_disable_timestamp);
2157
2158static inline void net_timestamp_set(struct sk_buff *skb)
2159{
2160	skb->tstamp = 0;
2161	skb->mono_delivery_time = 0;
2162	if (static_branch_unlikely(&netstamp_needed_key))
2163		skb->tstamp = ktime_get_real();
2164}
2165
2166#define net_timestamp_check(COND, SKB)				\
2167	if (static_branch_unlikely(&netstamp_needed_key)) {	\
2168		if ((COND) && !(SKB)->tstamp)			\
2169			(SKB)->tstamp = ktime_get_real();	\
2170	}							\
2171
2172bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2173{
2174	return __is_skb_forwardable(dev, skb, true);
2175}
2176EXPORT_SYMBOL_GPL(is_skb_forwardable);
2177
2178static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2179			      bool check_mtu)
2180{
2181	int ret = ____dev_forward_skb(dev, skb, check_mtu);
2182
2183	if (likely(!ret)) {
2184		skb->protocol = eth_type_trans(skb, dev);
2185		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2186	}
2187
2188	return ret;
2189}
2190
2191int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2192{
2193	return __dev_forward_skb2(dev, skb, true);
2194}
2195EXPORT_SYMBOL_GPL(__dev_forward_skb);
2196
2197/**
2198 * dev_forward_skb - loopback an skb to another netif
2199 *
2200 * @dev: destination network device
2201 * @skb: buffer to forward
2202 *
2203 * return values:
2204 *	NET_RX_SUCCESS	(no congestion)
2205 *	NET_RX_DROP     (packet was dropped, but freed)
2206 *
2207 * dev_forward_skb can be used for injecting an skb from the
2208 * start_xmit function of one device into the receive queue
2209 * of another device.
2210 *
2211 * The receiving device may be in another namespace, so
2212 * we have to clear all information in the skb that could
2213 * impact namespace isolation.
2214 */
2215int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2216{
2217	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2218}
2219EXPORT_SYMBOL_GPL(dev_forward_skb);
2220
2221int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2222{
2223	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2224}
2225
2226static inline int deliver_skb(struct sk_buff *skb,
2227			      struct packet_type *pt_prev,
2228			      struct net_device *orig_dev)
2229{
2230	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2231		return -ENOMEM;
2232	refcount_inc(&skb->users);
2233	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2234}
2235
2236static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2237					  struct packet_type **pt,
2238					  struct net_device *orig_dev,
2239					  __be16 type,
2240					  struct list_head *ptype_list)
2241{
2242	struct packet_type *ptype, *pt_prev = *pt;
2243
2244	list_for_each_entry_rcu(ptype, ptype_list, list) {
2245		if (ptype->type != type)
2246			continue;
2247		if (pt_prev)
2248			deliver_skb(skb, pt_prev, orig_dev);
2249		pt_prev = ptype;
2250	}
2251	*pt = pt_prev;
2252}
2253
2254static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2255{
2256	if (!ptype->af_packet_priv || !skb->sk)
2257		return false;
2258
2259	if (ptype->id_match)
2260		return ptype->id_match(ptype, skb->sk);
2261	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2262		return true;
2263
2264	return false;
2265}
2266
2267/**
2268 * dev_nit_active - return true if any network interface taps are in use
2269 *
2270 * @dev: network device to check for the presence of taps
2271 */
2272bool dev_nit_active(struct net_device *dev)
2273{
2274	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2275}
2276EXPORT_SYMBOL_GPL(dev_nit_active);
2277
2278/*
2279 *	Support routine. Sends outgoing frames to any network
2280 *	taps currently in use.
2281 */
2282
2283void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2284{
2285	struct packet_type *ptype;
2286	struct sk_buff *skb2 = NULL;
2287	struct packet_type *pt_prev = NULL;
2288	struct list_head *ptype_list = &ptype_all;
2289
2290	rcu_read_lock();
2291again:
2292	list_for_each_entry_rcu(ptype, ptype_list, list) {
2293		if (READ_ONCE(ptype->ignore_outgoing))
2294			continue;
2295
2296		/* Never send packets back to the socket
2297		 * they originated from - MvS (miquels@drinkel.ow.org)
2298		 */
2299		if (skb_loop_sk(ptype, skb))
2300			continue;
2301
2302		if (pt_prev) {
2303			deliver_skb(skb2, pt_prev, skb->dev);
2304			pt_prev = ptype;
2305			continue;
2306		}
2307
2308		/* need to clone skb, done only once */
2309		skb2 = skb_clone(skb, GFP_ATOMIC);
2310		if (!skb2)
2311			goto out_unlock;
2312
2313		net_timestamp_set(skb2);
2314
2315		/* skb->nh should be correctly
2316		 * set by sender, so that the second statement is
2317		 * just protection against buggy protocols.
2318		 */
2319		skb_reset_mac_header(skb2);
2320
2321		if (skb_network_header(skb2) < skb2->data ||
2322		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2323			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2324					     ntohs(skb2->protocol),
2325					     dev->name);
2326			skb_reset_network_header(skb2);
2327		}
2328
2329		skb2->transport_header = skb2->network_header;
2330		skb2->pkt_type = PACKET_OUTGOING;
2331		pt_prev = ptype;
2332	}
2333
2334	if (ptype_list == &ptype_all) {
2335		ptype_list = &dev->ptype_all;
2336		goto again;
2337	}
2338out_unlock:
2339	if (pt_prev) {
2340		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2341			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2342		else
2343			kfree_skb(skb2);
2344	}
2345	rcu_read_unlock();
2346}
2347EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2348
2349/**
2350 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2351 * @dev: Network device
2352 * @txq: number of queues available
2353 *
2354 * If real_num_tx_queues is changed the tc mappings may no longer be
2355 * valid. To resolve this verify the tc mapping remains valid and if
2356 * not NULL the mapping. With no priorities mapping to this
2357 * offset/count pair it will no longer be used. In the worst case TC0
2358 * is invalid nothing can be done so disable priority mappings. If is
2359 * expected that drivers will fix this mapping if they can before
2360 * calling netif_set_real_num_tx_queues.
2361 */
2362static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2363{
2364	int i;
2365	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2366
2367	/* If TC0 is invalidated disable TC mapping */
2368	if (tc->offset + tc->count > txq) {
2369		netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2370		dev->num_tc = 0;
2371		return;
2372	}
2373
2374	/* Invalidated prio to tc mappings set to TC0 */
2375	for (i = 1; i < TC_BITMASK + 1; i++) {
2376		int q = netdev_get_prio_tc_map(dev, i);
2377
2378		tc = &dev->tc_to_txq[q];
2379		if (tc->offset + tc->count > txq) {
2380			netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2381				    i, q);
2382			netdev_set_prio_tc_map(dev, i, 0);
2383		}
2384	}
2385}
2386
2387int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2388{
2389	if (dev->num_tc) {
2390		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2391		int i;
2392
2393		/* walk through the TCs and see if it falls into any of them */
2394		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2395			if ((txq - tc->offset) < tc->count)
2396				return i;
2397		}
2398
2399		/* didn't find it, just return -1 to indicate no match */
2400		return -1;
2401	}
2402
2403	return 0;
2404}
2405EXPORT_SYMBOL(netdev_txq_to_tc);
2406
2407#ifdef CONFIG_XPS
2408static struct static_key xps_needed __read_mostly;
2409static struct static_key xps_rxqs_needed __read_mostly;
2410static DEFINE_MUTEX(xps_map_mutex);
2411#define xmap_dereference(P)		\
2412	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2413
2414static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2415			     struct xps_dev_maps *old_maps, int tci, u16 index)
2416{
2417	struct xps_map *map = NULL;
2418	int pos;
2419
2420	map = xmap_dereference(dev_maps->attr_map[tci]);
2421	if (!map)
2422		return false;
2423
2424	for (pos = map->len; pos--;) {
2425		if (map->queues[pos] != index)
2426			continue;
2427
2428		if (map->len > 1) {
2429			map->queues[pos] = map->queues[--map->len];
2430			break;
2431		}
2432
2433		if (old_maps)
2434			RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2435		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2436		kfree_rcu(map, rcu);
2437		return false;
2438	}
2439
2440	return true;
2441}
2442
2443static bool remove_xps_queue_cpu(struct net_device *dev,
2444				 struct xps_dev_maps *dev_maps,
2445				 int cpu, u16 offset, u16 count)
2446{
2447	int num_tc = dev_maps->num_tc;
2448	bool active = false;
2449	int tci;
2450
2451	for (tci = cpu * num_tc; num_tc--; tci++) {
2452		int i, j;
2453
2454		for (i = count, j = offset; i--; j++) {
2455			if (!remove_xps_queue(dev_maps, NULL, tci, j))
2456				break;
2457		}
2458
2459		active |= i < 0;
2460	}
2461
2462	return active;
2463}
2464
2465static void reset_xps_maps(struct net_device *dev,
2466			   struct xps_dev_maps *dev_maps,
2467			   enum xps_map_type type)
2468{
2469	static_key_slow_dec_cpuslocked(&xps_needed);
2470	if (type == XPS_RXQS)
2471		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2472
2473	RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2474
2475	kfree_rcu(dev_maps, rcu);
2476}
2477
2478static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2479			   u16 offset, u16 count)
2480{
2481	struct xps_dev_maps *dev_maps;
2482	bool active = false;
2483	int i, j;
2484
2485	dev_maps = xmap_dereference(dev->xps_maps[type]);
2486	if (!dev_maps)
2487		return;
2488
2489	for (j = 0; j < dev_maps->nr_ids; j++)
2490		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2491	if (!active)
2492		reset_xps_maps(dev, dev_maps, type);
2493
2494	if (type == XPS_CPUS) {
2495		for (i = offset + (count - 1); count--; i--)
2496			netdev_queue_numa_node_write(
2497				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2498	}
2499}
2500
2501static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2502				   u16 count)
2503{
2504	if (!static_key_false(&xps_needed))
2505		return;
2506
2507	cpus_read_lock();
2508	mutex_lock(&xps_map_mutex);
2509
2510	if (static_key_false(&xps_rxqs_needed))
2511		clean_xps_maps(dev, XPS_RXQS, offset, count);
2512
2513	clean_xps_maps(dev, XPS_CPUS, offset, count);
2514
2515	mutex_unlock(&xps_map_mutex);
2516	cpus_read_unlock();
2517}
2518
2519static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2520{
2521	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2522}
2523
2524static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2525				      u16 index, bool is_rxqs_map)
2526{
2527	struct xps_map *new_map;
2528	int alloc_len = XPS_MIN_MAP_ALLOC;
2529	int i, pos;
2530
2531	for (pos = 0; map && pos < map->len; pos++) {
2532		if (map->queues[pos] != index)
2533			continue;
2534		return map;
2535	}
2536
2537	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
2538	if (map) {
2539		if (pos < map->alloc_len)
2540			return map;
2541
2542		alloc_len = map->alloc_len * 2;
2543	}
2544
2545	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2546	 *  map
2547	 */
2548	if (is_rxqs_map)
2549		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2550	else
2551		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2552				       cpu_to_node(attr_index));
2553	if (!new_map)
2554		return NULL;
2555
2556	for (i = 0; i < pos; i++)
2557		new_map->queues[i] = map->queues[i];
2558	new_map->alloc_len = alloc_len;
2559	new_map->len = pos;
2560
2561	return new_map;
2562}
2563
2564/* Copy xps maps at a given index */
2565static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2566			      struct xps_dev_maps *new_dev_maps, int index,
2567			      int tc, bool skip_tc)
2568{
2569	int i, tci = index * dev_maps->num_tc;
2570	struct xps_map *map;
2571
2572	/* copy maps belonging to foreign traffic classes */
2573	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2574		if (i == tc && skip_tc)
2575			continue;
2576
2577		/* fill in the new device map from the old device map */
2578		map = xmap_dereference(dev_maps->attr_map[tci]);
2579		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2580	}
2581}
2582
2583/* Must be called under cpus_read_lock */
2584int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2585			  u16 index, enum xps_map_type type)
2586{
2587	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2588	const unsigned long *online_mask = NULL;
2589	bool active = false, copy = false;
2590	int i, j, tci, numa_node_id = -2;
2591	int maps_sz, num_tc = 1, tc = 0;
2592	struct xps_map *map, *new_map;
2593	unsigned int nr_ids;
2594
2595	WARN_ON_ONCE(index >= dev->num_tx_queues);
2596
2597	if (dev->num_tc) {
2598		/* Do not allow XPS on subordinate device directly */
2599		num_tc = dev->num_tc;
2600		if (num_tc < 0)
2601			return -EINVAL;
2602
2603		/* If queue belongs to subordinate dev use its map */
2604		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2605
2606		tc = netdev_txq_to_tc(dev, index);
2607		if (tc < 0)
2608			return -EINVAL;
2609	}
2610
2611	mutex_lock(&xps_map_mutex);
2612
2613	dev_maps = xmap_dereference(dev->xps_maps[type]);
2614	if (type == XPS_RXQS) {
2615		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2616		nr_ids = dev->num_rx_queues;
2617	} else {
2618		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2619		if (num_possible_cpus() > 1)
2620			online_mask = cpumask_bits(cpu_online_mask);
2621		nr_ids = nr_cpu_ids;
2622	}
2623
2624	if (maps_sz < L1_CACHE_BYTES)
2625		maps_sz = L1_CACHE_BYTES;
2626
2627	/* The old dev_maps could be larger or smaller than the one we're
2628	 * setting up now, as dev->num_tc or nr_ids could have been updated in
2629	 * between. We could try to be smart, but let's be safe instead and only
2630	 * copy foreign traffic classes if the two map sizes match.
2631	 */
2632	if (dev_maps &&
2633	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2634		copy = true;
2635
2636	/* allocate memory for queue storage */
2637	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2638	     j < nr_ids;) {
2639		if (!new_dev_maps) {
2640			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2641			if (!new_dev_maps) {
2642				mutex_unlock(&xps_map_mutex);
2643				return -ENOMEM;
2644			}
2645
2646			new_dev_maps->nr_ids = nr_ids;
2647			new_dev_maps->num_tc = num_tc;
2648		}
2649
2650		tci = j * num_tc + tc;
2651		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2652
2653		map = expand_xps_map(map, j, index, type == XPS_RXQS);
2654		if (!map)
2655			goto error;
2656
2657		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2658	}
2659
2660	if (!new_dev_maps)
2661		goto out_no_new_maps;
2662
2663	if (!dev_maps) {
2664		/* Increment static keys at most once per type */
2665		static_key_slow_inc_cpuslocked(&xps_needed);
2666		if (type == XPS_RXQS)
2667			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2668	}
2669
2670	for (j = 0; j < nr_ids; j++) {
2671		bool skip_tc = false;
2672
2673		tci = j * num_tc + tc;
2674		if (netif_attr_test_mask(j, mask, nr_ids) &&
2675		    netif_attr_test_online(j, online_mask, nr_ids)) {
2676			/* add tx-queue to CPU/rx-queue maps */
2677			int pos = 0;
2678
2679			skip_tc = true;
2680
2681			map = xmap_dereference(new_dev_maps->attr_map[tci]);
2682			while ((pos < map->len) && (map->queues[pos] != index))
2683				pos++;
2684
2685			if (pos == map->len)
2686				map->queues[map->len++] = index;
2687#ifdef CONFIG_NUMA
2688			if (type == XPS_CPUS) {
2689				if (numa_node_id == -2)
2690					numa_node_id = cpu_to_node(j);
2691				else if (numa_node_id != cpu_to_node(j))
2692					numa_node_id = -1;
2693			}
2694#endif
2695		}
2696
2697		if (copy)
2698			xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2699					  skip_tc);
2700	}
2701
2702	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2703
2704	/* Cleanup old maps */
2705	if (!dev_maps)
2706		goto out_no_old_maps;
2707
2708	for (j = 0; j < dev_maps->nr_ids; j++) {
2709		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2710			map = xmap_dereference(dev_maps->attr_map[tci]);
2711			if (!map)
2712				continue;
2713
2714			if (copy) {
2715				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2716				if (map == new_map)
2717					continue;
2718			}
2719
2720			RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2721			kfree_rcu(map, rcu);
2722		}
2723	}
2724
2725	old_dev_maps = dev_maps;
2726
2727out_no_old_maps:
2728	dev_maps = new_dev_maps;
2729	active = true;
2730
2731out_no_new_maps:
2732	if (type == XPS_CPUS)
2733		/* update Tx queue numa node */
2734		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2735					     (numa_node_id >= 0) ?
2736					     numa_node_id : NUMA_NO_NODE);
2737
2738	if (!dev_maps)
2739		goto out_no_maps;
2740
2741	/* removes tx-queue from unused CPUs/rx-queues */
2742	for (j = 0; j < dev_maps->nr_ids; j++) {
2743		tci = j * dev_maps->num_tc;
2744
2745		for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2746			if (i == tc &&
2747			    netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2748			    netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2749				continue;
2750
2751			active |= remove_xps_queue(dev_maps,
2752						   copy ? old_dev_maps : NULL,
2753						   tci, index);
2754		}
2755	}
2756
2757	if (old_dev_maps)
2758		kfree_rcu(old_dev_maps, rcu);
2759
2760	/* free map if not active */
2761	if (!active)
2762		reset_xps_maps(dev, dev_maps, type);
2763
2764out_no_maps:
2765	mutex_unlock(&xps_map_mutex);
2766
2767	return 0;
2768error:
2769	/* remove any maps that we added */
2770	for (j = 0; j < nr_ids; j++) {
2771		for (i = num_tc, tci = j * num_tc; i--; tci++) {
2772			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2773			map = copy ?
2774			      xmap_dereference(dev_maps->attr_map[tci]) :
2775			      NULL;
2776			if (new_map && new_map != map)
2777				kfree(new_map);
2778		}
2779	}
2780
2781	mutex_unlock(&xps_map_mutex);
2782
2783	kfree(new_dev_maps);
2784	return -ENOMEM;
2785}
2786EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2787
2788int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2789			u16 index)
2790{
2791	int ret;
2792
2793	cpus_read_lock();
2794	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2795	cpus_read_unlock();
2796
2797	return ret;
2798}
2799EXPORT_SYMBOL(netif_set_xps_queue);
2800
2801#endif
2802static void netdev_unbind_all_sb_channels(struct net_device *dev)
2803{
2804	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2805
2806	/* Unbind any subordinate channels */
2807	while (txq-- != &dev->_tx[0]) {
2808		if (txq->sb_dev)
2809			netdev_unbind_sb_channel(dev, txq->sb_dev);
2810	}
2811}
2812
2813void netdev_reset_tc(struct net_device *dev)
2814{
2815#ifdef CONFIG_XPS
2816	netif_reset_xps_queues_gt(dev, 0);
2817#endif
2818	netdev_unbind_all_sb_channels(dev);
2819
2820	/* Reset TC configuration of device */
2821	dev->num_tc = 0;
2822	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2823	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2824}
2825EXPORT_SYMBOL(netdev_reset_tc);
2826
2827int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2828{
2829	if (tc >= dev->num_tc)
2830		return -EINVAL;
2831
2832#ifdef CONFIG_XPS
2833	netif_reset_xps_queues(dev, offset, count);
2834#endif
2835	dev->tc_to_txq[tc].count = count;
2836	dev->tc_to_txq[tc].offset = offset;
2837	return 0;
2838}
2839EXPORT_SYMBOL(netdev_set_tc_queue);
2840
2841int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2842{
2843	if (num_tc > TC_MAX_QUEUE)
2844		return -EINVAL;
2845
2846#ifdef CONFIG_XPS
2847	netif_reset_xps_queues_gt(dev, 0);
2848#endif
2849	netdev_unbind_all_sb_channels(dev);
2850
2851	dev->num_tc = num_tc;
2852	return 0;
2853}
2854EXPORT_SYMBOL(netdev_set_num_tc);
2855
2856void netdev_unbind_sb_channel(struct net_device *dev,
2857			      struct net_device *sb_dev)
2858{
2859	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2860
2861#ifdef CONFIG_XPS
2862	netif_reset_xps_queues_gt(sb_dev, 0);
2863#endif
2864	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2865	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2866
2867	while (txq-- != &dev->_tx[0]) {
2868		if (txq->sb_dev == sb_dev)
2869			txq->sb_dev = NULL;
2870	}
2871}
2872EXPORT_SYMBOL(netdev_unbind_sb_channel);
2873
2874int netdev_bind_sb_channel_queue(struct net_device *dev,
2875				 struct net_device *sb_dev,
2876				 u8 tc, u16 count, u16 offset)
2877{
2878	/* Make certain the sb_dev and dev are already configured */
2879	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2880		return -EINVAL;
2881
2882	/* We cannot hand out queues we don't have */
2883	if ((offset + count) > dev->real_num_tx_queues)
2884		return -EINVAL;
2885
2886	/* Record the mapping */
2887	sb_dev->tc_to_txq[tc].count = count;
2888	sb_dev->tc_to_txq[tc].offset = offset;
2889
2890	/* Provide a way for Tx queue to find the tc_to_txq map or
2891	 * XPS map for itself.
2892	 */
2893	while (count--)
2894		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2895
2896	return 0;
2897}
2898EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2899
2900int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2901{
2902	/* Do not use a multiqueue device to represent a subordinate channel */
2903	if (netif_is_multiqueue(dev))
2904		return -ENODEV;
2905
2906	/* We allow channels 1 - 32767 to be used for subordinate channels.
2907	 * Channel 0 is meant to be "native" mode and used only to represent
2908	 * the main root device. We allow writing 0 to reset the device back
2909	 * to normal mode after being used as a subordinate channel.
2910	 */
2911	if (channel > S16_MAX)
2912		return -EINVAL;
2913
2914	dev->num_tc = -channel;
2915
2916	return 0;
2917}
2918EXPORT_SYMBOL(netdev_set_sb_channel);
2919
2920/*
2921 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2922 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2923 */
2924int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2925{
2926	bool disabling;
2927	int rc;
2928
2929	disabling = txq < dev->real_num_tx_queues;
2930
2931	if (txq < 1 || txq > dev->num_tx_queues)
2932		return -EINVAL;
2933
2934	if (dev->reg_state == NETREG_REGISTERED ||
2935	    dev->reg_state == NETREG_UNREGISTERING) {
2936		ASSERT_RTNL();
2937
2938		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2939						  txq);
2940		if (rc)
2941			return rc;
2942
2943		if (dev->num_tc)
2944			netif_setup_tc(dev, txq);
2945
2946		dev_qdisc_change_real_num_tx(dev, txq);
2947
2948		dev->real_num_tx_queues = txq;
2949
2950		if (disabling) {
2951			synchronize_net();
2952			qdisc_reset_all_tx_gt(dev, txq);
2953#ifdef CONFIG_XPS
2954			netif_reset_xps_queues_gt(dev, txq);
2955#endif
2956		}
2957	} else {
2958		dev->real_num_tx_queues = txq;
2959	}
2960
2961	return 0;
2962}
2963EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2964
2965#ifdef CONFIG_SYSFS
2966/**
2967 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2968 *	@dev: Network device
2969 *	@rxq: Actual number of RX queues
2970 *
2971 *	This must be called either with the rtnl_lock held or before
2972 *	registration of the net device.  Returns 0 on success, or a
2973 *	negative error code.  If called before registration, it always
2974 *	succeeds.
2975 */
2976int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2977{
2978	int rc;
2979
2980	if (rxq < 1 || rxq > dev->num_rx_queues)
2981		return -EINVAL;
2982
2983	if (dev->reg_state == NETREG_REGISTERED) {
2984		ASSERT_RTNL();
2985
2986		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2987						  rxq);
2988		if (rc)
2989			return rc;
2990	}
2991
2992	dev->real_num_rx_queues = rxq;
2993	return 0;
2994}
2995EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2996#endif
2997
2998/**
2999 *	netif_set_real_num_queues - set actual number of RX and TX queues used
3000 *	@dev: Network device
3001 *	@txq: Actual number of TX queues
3002 *	@rxq: Actual number of RX queues
3003 *
3004 *	Set the real number of both TX and RX queues.
3005 *	Does nothing if the number of queues is already correct.
3006 */
3007int netif_set_real_num_queues(struct net_device *dev,
3008			      unsigned int txq, unsigned int rxq)
3009{
3010	unsigned int old_rxq = dev->real_num_rx_queues;
3011	int err;
3012
3013	if (txq < 1 || txq > dev->num_tx_queues ||
3014	    rxq < 1 || rxq > dev->num_rx_queues)
3015		return -EINVAL;
3016
3017	/* Start from increases, so the error path only does decreases -
3018	 * decreases can't fail.
3019	 */
3020	if (rxq > dev->real_num_rx_queues) {
3021		err = netif_set_real_num_rx_queues(dev, rxq);
3022		if (err)
3023			return err;
3024	}
3025	if (txq > dev->real_num_tx_queues) {
3026		err = netif_set_real_num_tx_queues(dev, txq);
3027		if (err)
3028			goto undo_rx;
3029	}
3030	if (rxq < dev->real_num_rx_queues)
3031		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
3032	if (txq < dev->real_num_tx_queues)
3033		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
3034
3035	return 0;
3036undo_rx:
3037	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
3038	return err;
3039}
3040EXPORT_SYMBOL(netif_set_real_num_queues);
3041
3042/**
3043 * netif_set_tso_max_size() - set the max size of TSO frames supported
3044 * @dev:	netdev to update
3045 * @size:	max skb->len of a TSO frame
3046 *
3047 * Set the limit on the size of TSO super-frames the device can handle.
3048 * Unless explicitly set the stack will assume the value of
3049 * %GSO_LEGACY_MAX_SIZE.
3050 */
3051void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3052{
3053	dev->tso_max_size = min(GSO_MAX_SIZE, size);
3054	if (size < READ_ONCE(dev->gso_max_size))
3055		netif_set_gso_max_size(dev, size);
3056	if (size < READ_ONCE(dev->gso_ipv4_max_size))
3057		netif_set_gso_ipv4_max_size(dev, size);
3058}
3059EXPORT_SYMBOL(netif_set_tso_max_size);
3060
3061/**
3062 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3063 * @dev:	netdev to update
3064 * @segs:	max number of TCP segments
3065 *
3066 * Set the limit on the number of TCP segments the device can generate from
3067 * a single TSO super-frame.
3068 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3069 */
3070void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3071{
3072	dev->tso_max_segs = segs;
3073	if (segs < READ_ONCE(dev->gso_max_segs))
3074		netif_set_gso_max_segs(dev, segs);
3075}
3076EXPORT_SYMBOL(netif_set_tso_max_segs);
3077
3078/**
3079 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3080 * @to:		netdev to update
3081 * @from:	netdev from which to copy the limits
3082 */
3083void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3084{
3085	netif_set_tso_max_size(to, from->tso_max_size);
3086	netif_set_tso_max_segs(to, from->tso_max_segs);
3087}
3088EXPORT_SYMBOL(netif_inherit_tso_max);
3089
3090/**
3091 * netif_get_num_default_rss_queues - default number of RSS queues
3092 *
3093 * Default value is the number of physical cores if there are only 1 or 2, or
3094 * divided by 2 if there are more.
3095 */
3096int netif_get_num_default_rss_queues(void)
3097{
3098	cpumask_var_t cpus;
3099	int cpu, count = 0;
3100
3101	if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3102		return 1;
3103
3104	cpumask_copy(cpus, cpu_online_mask);
3105	for_each_cpu(cpu, cpus) {
3106		++count;
3107		cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3108	}
3109	free_cpumask_var(cpus);
3110
3111	return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3112}
3113EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3114
3115static void __netif_reschedule(struct Qdisc *q)
3116{
3117	struct softnet_data *sd;
3118	unsigned long flags;
3119
3120	local_irq_save(flags);
3121	sd = this_cpu_ptr(&softnet_data);
3122	q->next_sched = NULL;
3123	*sd->output_queue_tailp = q;
3124	sd->output_queue_tailp = &q->next_sched;
3125	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3126	local_irq_restore(flags);
3127}
3128
3129void __netif_schedule(struct Qdisc *q)
3130{
3131	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3132		__netif_reschedule(q);
3133}
3134EXPORT_SYMBOL(__netif_schedule);
3135
3136struct dev_kfree_skb_cb {
3137	enum skb_drop_reason reason;
3138};
3139
3140static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3141{
3142	return (struct dev_kfree_skb_cb *)skb->cb;
3143}
3144
3145void netif_schedule_queue(struct netdev_queue *txq)
3146{
3147	rcu_read_lock();
3148	if (!netif_xmit_stopped(txq)) {
3149		struct Qdisc *q = rcu_dereference(txq->qdisc);
3150
3151		__netif_schedule(q);
3152	}
3153	rcu_read_unlock();
3154}
3155EXPORT_SYMBOL(netif_schedule_queue);
3156
3157void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3158{
3159	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3160		struct Qdisc *q;
3161
3162		rcu_read_lock();
3163		q = rcu_dereference(dev_queue->qdisc);
3164		__netif_schedule(q);
3165		rcu_read_unlock();
3166	}
3167}
3168EXPORT_SYMBOL(netif_tx_wake_queue);
3169
3170void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3171{
3172	unsigned long flags;
3173
3174	if (unlikely(!skb))
3175		return;
3176
3177	if (likely(refcount_read(&skb->users) == 1)) {
3178		smp_rmb();
3179		refcount_set(&skb->users, 0);
3180	} else if (likely(!refcount_dec_and_test(&skb->users))) {
3181		return;
3182	}
3183	get_kfree_skb_cb(skb)->reason = reason;
3184	local_irq_save(flags);
3185	skb->next = __this_cpu_read(softnet_data.completion_queue);
3186	__this_cpu_write(softnet_data.completion_queue, skb);
3187	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3188	local_irq_restore(flags);
3189}
3190EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3191
3192void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3193{
3194	if (in_hardirq() || irqs_disabled())
3195		dev_kfree_skb_irq_reason(skb, reason);
3196	else
3197		kfree_skb_reason(skb, reason);
3198}
3199EXPORT_SYMBOL(dev_kfree_skb_any_reason);
3200
3201
3202/**
3203 * netif_device_detach - mark device as removed
3204 * @dev: network device
3205 *
3206 * Mark device as removed from system and therefore no longer available.
3207 */
3208void netif_device_detach(struct net_device *dev)
3209{
3210	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3211	    netif_running(dev)) {
3212		netif_tx_stop_all_queues(dev);
3213	}
3214}
3215EXPORT_SYMBOL(netif_device_detach);
3216
3217/**
3218 * netif_device_attach - mark device as attached
3219 * @dev: network device
3220 *
3221 * Mark device as attached from system and restart if needed.
3222 */
3223void netif_device_attach(struct net_device *dev)
3224{
3225	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3226	    netif_running(dev)) {
3227		netif_tx_wake_all_queues(dev);
3228		__netdev_watchdog_up(dev);
3229	}
3230}
3231EXPORT_SYMBOL(netif_device_attach);
3232
3233/*
3234 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
3235 * to be used as a distribution range.
3236 */
3237static u16 skb_tx_hash(const struct net_device *dev,
3238		       const struct net_device *sb_dev,
3239		       struct sk_buff *skb)
3240{
3241	u32 hash;
3242	u16 qoffset = 0;
3243	u16 qcount = dev->real_num_tx_queues;
3244
3245	if (dev->num_tc) {
3246		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3247
3248		qoffset = sb_dev->tc_to_txq[tc].offset;
3249		qcount = sb_dev->tc_to_txq[tc].count;
3250		if (unlikely(!qcount)) {
3251			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3252					     sb_dev->name, qoffset, tc);
3253			qoffset = 0;
3254			qcount = dev->real_num_tx_queues;
3255		}
3256	}
3257
3258	if (skb_rx_queue_recorded(skb)) {
3259		DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3260		hash = skb_get_rx_queue(skb);
3261		if (hash >= qoffset)
3262			hash -= qoffset;
3263		while (unlikely(hash >= qcount))
3264			hash -= qcount;
3265		return hash + qoffset;
3266	}
3267
3268	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3269}
3270
3271void skb_warn_bad_offload(const struct sk_buff *skb)
3272{
3273	static const netdev_features_t null_features;
3274	struct net_device *dev = skb->dev;
3275	const char *name = "";
3276
3277	if (!net_ratelimit())
3278		return;
3279
3280	if (dev) {
3281		if (dev->dev.parent)
3282			name = dev_driver_string(dev->dev.parent);
3283		else
3284			name = netdev_name(dev);
3285	}
3286	skb_dump(KERN_WARNING, skb, false);
3287	WARN(1, "%s: caps=(%pNF, %pNF)\n",
3288	     name, dev ? &dev->features : &null_features,
3289	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
3290}
3291
3292/*
3293 * Invalidate hardware checksum when packet is to be mangled, and
3294 * complete checksum manually on outgoing path.
3295 */
3296int skb_checksum_help(struct sk_buff *skb)
3297{
3298	__wsum csum;
3299	int ret = 0, offset;
3300
3301	if (skb->ip_summed == CHECKSUM_COMPLETE)
3302		goto out_set_summed;
3303
3304	if (unlikely(skb_is_gso(skb))) {
3305		skb_warn_bad_offload(skb);
3306		return -EINVAL;
3307	}
3308
3309	/* Before computing a checksum, we should make sure no frag could
3310	 * be modified by an external entity : checksum could be wrong.
3311	 */
3312	if (skb_has_shared_frag(skb)) {
3313		ret = __skb_linearize(skb);
3314		if (ret)
3315			goto out;
3316	}
3317
3318	offset = skb_checksum_start_offset(skb);
3319	ret = -EINVAL;
3320	if (unlikely(offset >= skb_headlen(skb))) {
3321		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3322		WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3323			  offset, skb_headlen(skb));
3324		goto out;
3325	}
3326	csum = skb_checksum(skb, offset, skb->len - offset, 0);
3327
3328	offset += skb->csum_offset;
3329	if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3330		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3331		WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3332			  offset + sizeof(__sum16), skb_headlen(skb));
3333		goto out;
3334	}
3335	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3336	if (ret)
3337		goto out;
3338
3339	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3340out_set_summed:
3341	skb->ip_summed = CHECKSUM_NONE;
3342out:
3343	return ret;
3344}
3345EXPORT_SYMBOL(skb_checksum_help);
3346
3347int skb_crc32c_csum_help(struct sk_buff *skb)
3348{
3349	__le32 crc32c_csum;
3350	int ret = 0, offset, start;
3351
3352	if (skb->ip_summed != CHECKSUM_PARTIAL)
3353		goto out;
3354
3355	if (unlikely(skb_is_gso(skb)))
3356		goto out;
3357
3358	/* Before computing a checksum, we should make sure no frag could
3359	 * be modified by an external entity : checksum could be wrong.
3360	 */
3361	if (unlikely(skb_has_shared_frag(skb))) {
3362		ret = __skb_linearize(skb);
3363		if (ret)
3364			goto out;
3365	}
3366	start = skb_checksum_start_offset(skb);
3367	offset = start + offsetof(struct sctphdr, checksum);
3368	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3369		ret = -EINVAL;
3370		goto out;
3371	}
3372
3373	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3374	if (ret)
3375		goto out;
3376
3377	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3378						  skb->len - start, ~(__u32)0,
3379						  crc32c_csum_stub));
3380	*(__le32 *)(skb->data + offset) = crc32c_csum;
3381	skb_reset_csum_not_inet(skb);
3382out:
3383	return ret;
3384}
3385
3386__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3387{
3388	__be16 type = skb->protocol;
3389
3390	/* Tunnel gso handlers can set protocol to ethernet. */
3391	if (type == htons(ETH_P_TEB)) {
3392		struct ethhdr *eth;
3393
3394		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3395			return 0;
3396
3397		eth = (struct ethhdr *)skb->data;
3398		type = eth->h_proto;
3399	}
3400
3401	return vlan_get_protocol_and_depth(skb, type, depth);
3402}
3403
3404
3405/* Take action when hardware reception checksum errors are detected. */
3406#ifdef CONFIG_BUG
3407static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3408{
3409	netdev_err(dev, "hw csum failure\n");
3410	skb_dump(KERN_ERR, skb, true);
3411	dump_stack();
3412}
3413
3414void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3415{
3416	DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3417}
3418EXPORT_SYMBOL(netdev_rx_csum_fault);
3419#endif
3420
3421/* XXX: check that highmem exists at all on the given machine. */
3422static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3423{
3424#ifdef CONFIG_HIGHMEM
3425	int i;
3426
3427	if (!(dev->features & NETIF_F_HIGHDMA)) {
3428		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3429			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3430
3431			if (PageHighMem(skb_frag_page(frag)))
3432				return 1;
3433		}
3434	}
3435#endif
3436	return 0;
3437}
3438
3439/* If MPLS offload request, verify we are testing hardware MPLS features
3440 * instead of standard features for the netdev.
3441 */
3442#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3443static netdev_features_t net_mpls_features(struct sk_buff *skb,
3444					   netdev_features_t features,
3445					   __be16 type)
3446{
3447	if (eth_p_mpls(type))
3448		features &= skb->dev->mpls_features;
3449
3450	return features;
3451}
3452#else
3453static netdev_features_t net_mpls_features(struct sk_buff *skb,
3454					   netdev_features_t features,
3455					   __be16 type)
3456{
3457	return features;
3458}
3459#endif
3460
3461static netdev_features_t harmonize_features(struct sk_buff *skb,
3462	netdev_features_t features)
3463{
3464	__be16 type;
3465
3466	type = skb_network_protocol(skb, NULL);
3467	features = net_mpls_features(skb, features, type);
3468
3469	if (skb->ip_summed != CHECKSUM_NONE &&
3470	    !can_checksum_protocol(features, type)) {
3471		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3472	}
3473	if (illegal_highdma(skb->dev, skb))
3474		features &= ~NETIF_F_SG;
3475
3476	return features;
3477}
3478
3479netdev_features_t passthru_features_check(struct sk_buff *skb,
3480					  struct net_device *dev,
3481					  netdev_features_t features)
3482{
3483	return features;
3484}
3485EXPORT_SYMBOL(passthru_features_check);
3486
3487static netdev_features_t dflt_features_check(struct sk_buff *skb,
3488					     struct net_device *dev,
3489					     netdev_features_t features)
3490{
3491	return vlan_features_check(skb, features);
3492}
3493
3494static netdev_features_t gso_features_check(const struct sk_buff *skb,
3495					    struct net_device *dev,
3496					    netdev_features_t features)
3497{
3498	u16 gso_segs = skb_shinfo(skb)->gso_segs;
3499
3500	if (gso_segs > READ_ONCE(dev->gso_max_segs))
3501		return features & ~NETIF_F_GSO_MASK;
3502
3503	if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
3504		return features & ~NETIF_F_GSO_MASK;
3505
3506	if (!skb_shinfo(skb)->gso_type) {
3507		skb_warn_bad_offload(skb);
3508		return features & ~NETIF_F_GSO_MASK;
3509	}
3510
3511	/* Support for GSO partial features requires software
3512	 * intervention before we can actually process the packets
3513	 * so we need to strip support for any partial features now
3514	 * and we can pull them back in after we have partially
3515	 * segmented the frame.
3516	 */
3517	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3518		features &= ~dev->gso_partial_features;
3519
3520	/* Make sure to clear the IPv4 ID mangling feature if the
3521	 * IPv4 header has the potential to be fragmented.
3522	 */
3523	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3524		struct iphdr *iph = skb->encapsulation ?
3525				    inner_ip_hdr(skb) : ip_hdr(skb);
3526
3527		if (!(iph->frag_off & htons(IP_DF)))
3528			features &= ~NETIF_F_TSO_MANGLEID;
3529	}
3530
3531	return features;
3532}
3533
3534netdev_features_t netif_skb_features(struct sk_buff *skb)
3535{
3536	struct net_device *dev = skb->dev;
3537	netdev_features_t features = dev->features;
3538
3539	if (skb_is_gso(skb))
3540		features = gso_features_check(skb, dev, features);
3541
3542	/* If encapsulation offload request, verify we are testing
3543	 * hardware encapsulation features instead of standard
3544	 * features for the netdev
3545	 */
3546	if (skb->encapsulation)
3547		features &= dev->hw_enc_features;
3548
3549	if (skb_vlan_tagged(skb))
3550		features = netdev_intersect_features(features,
3551						     dev->vlan_features |
3552						     NETIF_F_HW_VLAN_CTAG_TX |
3553						     NETIF_F_HW_VLAN_STAG_TX);
3554
3555	if (dev->netdev_ops->ndo_features_check)
3556		features &= dev->netdev_ops->ndo_features_check(skb, dev,
3557								features);
3558	else
3559		features &= dflt_features_check(skb, dev, features);
3560
3561	return harmonize_features(skb, features);
3562}
3563EXPORT_SYMBOL(netif_skb_features);
3564
3565static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3566		    struct netdev_queue *txq, bool more)
3567{
3568	unsigned int len;
3569	int rc;
3570
3571	if (dev_nit_active(dev))
3572		dev_queue_xmit_nit(skb, dev);
3573
3574	len = skb->len;
3575	trace_net_dev_start_xmit(skb, dev);
3576	rc = netdev_start_xmit(skb, dev, txq, more);
3577	trace_net_dev_xmit(skb, rc, dev, len);
3578
3579	return rc;
3580}
3581
3582struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3583				    struct netdev_queue *txq, int *ret)
3584{
3585	struct sk_buff *skb = first;
3586	int rc = NETDEV_TX_OK;
3587
3588	while (skb) {
3589		struct sk_buff *next = skb->next;
3590
3591		skb_mark_not_on_list(skb);
3592		rc = xmit_one(skb, dev, txq, next != NULL);
3593		if (unlikely(!dev_xmit_complete(rc))) {
3594			skb->next = next;
3595			goto out;
3596		}
3597
3598		skb = next;
3599		if (netif_tx_queue_stopped(txq) && skb) {
3600			rc = NETDEV_TX_BUSY;
3601			break;
3602		}
3603	}
3604
3605out:
3606	*ret = rc;
3607	return skb;
3608}
3609
3610static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3611					  netdev_features_t features)
3612{
3613	if (skb_vlan_tag_present(skb) &&
3614	    !vlan_hw_offload_capable(features, skb->vlan_proto))
3615		skb = __vlan_hwaccel_push_inside(skb);
3616	return skb;
3617}
3618
3619int skb_csum_hwoffload_help(struct sk_buff *skb,
3620			    const netdev_features_t features)
3621{
3622	if (unlikely(skb_csum_is_sctp(skb)))
3623		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3624			skb_crc32c_csum_help(skb);
3625
3626	if (features & NETIF_F_HW_CSUM)
3627		return 0;
3628
3629	if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3630		switch (skb->csum_offset) {
3631		case offsetof(struct tcphdr, check):
3632		case offsetof(struct udphdr, check):
3633			return 0;
3634		}
3635	}
3636
3637	return skb_checksum_help(skb);
3638}
3639EXPORT_SYMBOL(skb_csum_hwoffload_help);
3640
3641static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3642{
3643	netdev_features_t features;
3644
3645	features = netif_skb_features(skb);
3646	skb = validate_xmit_vlan(skb, features);
3647	if (unlikely(!skb))
3648		goto out_null;
3649
3650	skb = sk_validate_xmit_skb(skb, dev);
3651	if (unlikely(!skb))
3652		goto out_null;
3653
3654	if (netif_needs_gso(skb, features)) {
3655		struct sk_buff *segs;
3656
3657		segs = skb_gso_segment(skb, features);
3658		if (IS_ERR(segs)) {
3659			goto out_kfree_skb;
3660		} else if (segs) {
3661			consume_skb(skb);
3662			skb = segs;
3663		}
3664	} else {
3665		if (skb_needs_linearize(skb, features) &&
3666		    __skb_linearize(skb))
3667			goto out_kfree_skb;
3668
3669		/* If packet is not checksummed and device does not
3670		 * support checksumming for this protocol, complete
3671		 * checksumming here.
3672		 */
3673		if (skb->ip_summed == CHECKSUM_PARTIAL) {
3674			if (skb->encapsulation)
3675				skb_set_inner_transport_header(skb,
3676							       skb_checksum_start_offset(skb));
3677			else
3678				skb_set_transport_header(skb,
3679							 skb_checksum_start_offset(skb));
3680			if (skb_csum_hwoffload_help(skb, features))
3681				goto out_kfree_skb;
3682		}
3683	}
3684
3685	skb = validate_xmit_xfrm(skb, features, again);
3686
3687	return skb;
3688
3689out_kfree_skb:
3690	kfree_skb(skb);
3691out_null:
3692	dev_core_stats_tx_dropped_inc(dev);
3693	return NULL;
3694}
3695
3696struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3697{
3698	struct sk_buff *next, *head = NULL, *tail;
3699
3700	for (; skb != NULL; skb = next) {
3701		next = skb->next;
3702		skb_mark_not_on_list(skb);
3703
3704		/* in case skb wont be segmented, point to itself */
3705		skb->prev = skb;
3706
3707		skb = validate_xmit_skb(skb, dev, again);
3708		if (!skb)
3709			continue;
3710
3711		if (!head)
3712			head = skb;
3713		else
3714			tail->next = skb;
3715		/* If skb was segmented, skb->prev points to
3716		 * the last segment. If not, it still contains skb.
3717		 */
3718		tail = skb->prev;
3719	}
3720	return head;
3721}
3722EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3723
3724static void qdisc_pkt_len_init(struct sk_buff *skb)
3725{
3726	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3727
3728	qdisc_skb_cb(skb)->pkt_len = skb->len;
3729
3730	/* To get more precise estimation of bytes sent on wire,
3731	 * we add to pkt_len the headers size of all segments
3732	 */
3733	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3734		u16 gso_segs = shinfo->gso_segs;
3735		unsigned int hdr_len;
3736
3737		/* mac layer + network layer */
3738		hdr_len = skb_transport_offset(skb);
3739
3740		/* + transport layer */
3741		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3742			const struct tcphdr *th;
3743			struct tcphdr _tcphdr;
3744
3745			th = skb_header_pointer(skb, hdr_len,
3746						sizeof(_tcphdr), &_tcphdr);
3747			if (likely(th))
3748				hdr_len += __tcp_hdrlen(th);
3749		} else {
3750			struct udphdr _udphdr;
3751
3752			if (skb_header_pointer(skb, hdr_len,
3753					       sizeof(_udphdr), &_udphdr))
3754				hdr_len += sizeof(struct udphdr);
3755		}
3756
3757		if (shinfo->gso_type & SKB_GSO_DODGY)
3758			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3759						shinfo->gso_size);
3760
3761		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3762	}
3763}
3764
3765static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
3766			     struct sk_buff **to_free,
3767			     struct netdev_queue *txq)
3768{
3769	int rc;
3770
3771	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
3772	if (rc == NET_XMIT_SUCCESS)
3773		trace_qdisc_enqueue(q, txq, skb);
3774	return rc;
3775}
3776
3777static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3778				 struct net_device *dev,
3779				 struct netdev_queue *txq)
3780{
3781	spinlock_t *root_lock = qdisc_lock(q);
3782	struct sk_buff *to_free = NULL;
3783	bool contended;
3784	int rc;
3785
3786	qdisc_calculate_pkt_len(skb, q);
3787
3788	if (q->flags & TCQ_F_NOLOCK) {
3789		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
3790		    qdisc_run_begin(q)) {
3791			/* Retest nolock_qdisc_is_empty() within the protection
3792			 * of q->seqlock to protect from racing with requeuing.
3793			 */
3794			if (unlikely(!nolock_qdisc_is_empty(q))) {
3795				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3796				__qdisc_run(q);
3797				qdisc_run_end(q);
3798
3799				goto no_lock_out;
3800			}
3801
3802			qdisc_bstats_cpu_update(q, skb);
3803			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
3804			    !nolock_qdisc_is_empty(q))
3805				__qdisc_run(q);
3806
3807			qdisc_run_end(q);
3808			return NET_XMIT_SUCCESS;
3809		}
3810
3811		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3812		qdisc_run(q);
3813
3814no_lock_out:
3815		if (unlikely(to_free))
3816			kfree_skb_list_reason(to_free,
3817					      SKB_DROP_REASON_QDISC_DROP);
3818		return rc;
3819	}
3820
3821	/*
3822	 * Heuristic to force contended enqueues to serialize on a
3823	 * separate lock before trying to get qdisc main lock.
3824	 * This permits qdisc->running owner to get the lock more
3825	 * often and dequeue packets faster.
3826	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
3827	 * and then other tasks will only enqueue packets. The packets will be
3828	 * sent after the qdisc owner is scheduled again. To prevent this
3829	 * scenario the task always serialize on the lock.
3830	 */
3831	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
3832	if (unlikely(contended))
3833		spin_lock(&q->busylock);
3834
3835	spin_lock(root_lock);
3836	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3837		__qdisc_drop(skb, &to_free);
3838		rc = NET_XMIT_DROP;
3839	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3840		   qdisc_run_begin(q)) {
3841		/*
3842		 * This is a work-conserving queue; there are no old skbs
3843		 * waiting to be sent out; and the qdisc is not running -
3844		 * xmit the skb directly.
3845		 */
3846
3847		qdisc_bstats_update(q, skb);
3848
3849		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3850			if (unlikely(contended)) {
3851				spin_unlock(&q->busylock);
3852				contended = false;
3853			}
3854			__qdisc_run(q);
3855		}
3856
3857		qdisc_run_end(q);
3858		rc = NET_XMIT_SUCCESS;
3859	} else {
3860		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3861		if (qdisc_run_begin(q)) {
3862			if (unlikely(contended)) {
3863				spin_unlock(&q->busylock);
3864				contended = false;
3865			}
3866			__qdisc_run(q);
3867			qdisc_run_end(q);
3868		}
3869	}
3870	spin_unlock(root_lock);
3871	if (unlikely(to_free))
3872		kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
3873	if (unlikely(contended))
3874		spin_unlock(&q->busylock);
3875	return rc;
3876}
3877
3878#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3879static void skb_update_prio(struct sk_buff *skb)
3880{
3881	const struct netprio_map *map;
3882	const struct sock *sk;
3883	unsigned int prioidx;
3884
3885	if (skb->priority)
3886		return;
3887	map = rcu_dereference_bh(skb->dev->priomap);
3888	if (!map)
3889		return;
3890	sk = skb_to_full_sk(skb);
3891	if (!sk)
3892		return;
3893
3894	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3895
3896	if (prioidx < map->priomap_len)
3897		skb->priority = map->priomap[prioidx];
3898}
3899#else
3900#define skb_update_prio(skb)
3901#endif
3902
3903/**
3904 *	dev_loopback_xmit - loop back @skb
3905 *	@net: network namespace this loopback is happening in
3906 *	@sk:  sk needed to be a netfilter okfn
3907 *	@skb: buffer to transmit
3908 */
3909int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3910{
3911	skb_reset_mac_header(skb);
3912	__skb_pull(skb, skb_network_offset(skb));
3913	skb->pkt_type = PACKET_LOOPBACK;
3914	if (skb->ip_summed == CHECKSUM_NONE)
3915		skb->ip_summed = CHECKSUM_UNNECESSARY;
3916	DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
3917	skb_dst_force(skb);
3918	netif_rx(skb);
3919	return 0;
3920}
3921EXPORT_SYMBOL(dev_loopback_xmit);
3922
3923#ifdef CONFIG_NET_EGRESS
3924static struct netdev_queue *
3925netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
3926{
3927	int qm = skb_get_queue_mapping(skb);
3928
3929	return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
3930}
3931
3932static bool netdev_xmit_txqueue_skipped(void)
3933{
3934	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
3935}
3936
3937void netdev_xmit_skip_txqueue(bool skip)
3938{
3939	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
3940}
3941EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
3942#endif /* CONFIG_NET_EGRESS */
3943
3944#ifdef CONFIG_NET_XGRESS
3945static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
3946{
3947	int ret = TC_ACT_UNSPEC;
3948#ifdef CONFIG_NET_CLS_ACT
3949	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
3950	struct tcf_result res;
3951
3952	if (!miniq)
3953		return ret;
3954
3955	tc_skb_cb(skb)->mru = 0;
3956	tc_skb_cb(skb)->post_ct = false;
3957
3958	mini_qdisc_bstats_cpu_update(miniq, skb);
3959	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
3960	/* Only tcf related quirks below. */
3961	switch (ret) {
3962	case TC_ACT_SHOT:
3963		mini_qdisc_qstats_cpu_drop(miniq);
3964		break;
3965	case TC_ACT_OK:
3966	case TC_ACT_RECLASSIFY:
3967		skb->tc_index = TC_H_MIN(res.classid);
3968		break;
3969	}
3970#endif /* CONFIG_NET_CLS_ACT */
3971	return ret;
3972}
3973
3974static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
3975
3976void tcx_inc(void)
3977{
3978	static_branch_inc(&tcx_needed_key);
3979}
3980
3981void tcx_dec(void)
3982{
3983	static_branch_dec(&tcx_needed_key);
3984}
3985
3986static __always_inline enum tcx_action_base
3987tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
3988	const bool needs_mac)
3989{
3990	const struct bpf_mprog_fp *fp;
3991	const struct bpf_prog *prog;
3992	int ret = TCX_NEXT;
3993
3994	if (needs_mac)
3995		__skb_push(skb, skb->mac_len);
3996	bpf_mprog_foreach_prog(entry, fp, prog) {
3997		bpf_compute_data_pointers(skb);
3998		ret = bpf_prog_run(prog, skb);
3999		if (ret != TCX_NEXT)
4000			break;
4001	}
4002	if (needs_mac)
4003		__skb_pull(skb, skb->mac_len);
4004	return tcx_action_code(skb, ret);
4005}
4006
4007static __always_inline struct sk_buff *
4008sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4009		   struct net_device *orig_dev, bool *another)
4010{
4011	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
4012	int sch_ret;
4013
4014	if (!entry)
4015		return skb;
4016	if (*pt_prev) {
4017		*ret = deliver_skb(skb, *pt_prev, orig_dev);
4018		*pt_prev = NULL;
4019	}
4020
4021	qdisc_skb_cb(skb)->pkt_len = skb->len;
4022	tcx_set_ingress(skb, true);
4023
4024	if (static_branch_unlikely(&tcx_needed_key)) {
4025		sch_ret = tcx_run(entry, skb, true);
4026		if (sch_ret != TC_ACT_UNSPEC)
4027			goto ingress_verdict;
4028	}
4029	sch_ret = tc_run(tcx_entry(entry), skb);
4030ingress_verdict:
4031	switch (sch_ret) {
4032	case TC_ACT_REDIRECT:
4033		/* skb_mac_header check was done by BPF, so we can safely
4034		 * push the L2 header back before redirecting to another
4035		 * netdev.
4036		 */
4037		__skb_push(skb, skb->mac_len);
4038		if (skb_do_redirect(skb) == -EAGAIN) {
4039			__skb_pull(skb, skb->mac_len);
4040			*another = true;
4041			break;
4042		}
4043		*ret = NET_RX_SUCCESS;
4044		return NULL;
4045	case TC_ACT_SHOT:
4046		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
4047		*ret = NET_RX_DROP;
4048		return NULL;
4049	/* used by tc_run */
4050	case TC_ACT_STOLEN:
4051	case TC_ACT_QUEUED:
4052	case TC_ACT_TRAP:
4053		consume_skb(skb);
4054		fallthrough;
4055	case TC_ACT_CONSUMED:
4056		*ret = NET_RX_SUCCESS;
4057		return NULL;
4058	}
4059
4060	return skb;
4061}
4062
4063static __always_inline struct sk_buff *
4064sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4065{
4066	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
4067	int sch_ret;
4068
4069	if (!entry)
4070		return skb;
4071
4072	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
4073	 * already set by the caller.
4074	 */
4075	if (static_branch_unlikely(&tcx_needed_key)) {
4076		sch_ret = tcx_run(entry, skb, false);
4077		if (sch_ret != TC_ACT_UNSPEC)
4078			goto egress_verdict;
4079	}
4080	sch_ret = tc_run(tcx_entry(entry), skb);
4081egress_verdict:
4082	switch (sch_ret) {
4083	case TC_ACT_REDIRECT:
4084		/* No need to push/pop skb's mac_header here on egress! */
4085		skb_do_redirect(skb);
4086		*ret = NET_XMIT_SUCCESS;
4087		return NULL;
4088	case TC_ACT_SHOT:
4089		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
4090		*ret = NET_XMIT_DROP;
4091		return NULL;
4092	/* used by tc_run */
4093	case TC_ACT_STOLEN:
4094	case TC_ACT_QUEUED:
4095	case TC_ACT_TRAP:
4096		consume_skb(skb);
4097		fallthrough;
4098	case TC_ACT_CONSUMED:
4099		*ret = NET_XMIT_SUCCESS;
4100		return NULL;
4101	}
4102
4103	return skb;
4104}
4105#else
4106static __always_inline struct sk_buff *
4107sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4108		   struct net_device *orig_dev, bool *another)
4109{
4110	return skb;
4111}
4112
4113static __always_inline struct sk_buff *
4114sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4115{
4116	return skb;
4117}
4118#endif /* CONFIG_NET_XGRESS */
4119
4120#ifdef CONFIG_XPS
4121static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
4122			       struct xps_dev_maps *dev_maps, unsigned int tci)
4123{
4124	int tc = netdev_get_prio_tc_map(dev, skb->priority);
4125	struct xps_map *map;
4126	int queue_index = -1;
4127
4128	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
4129		return queue_index;
4130
4131	tci *= dev_maps->num_tc;
4132	tci += tc;
4133
4134	map = rcu_dereference(dev_maps->attr_map[tci]);
4135	if (map) {
4136		if (map->len == 1)
4137			queue_index = map->queues[0];
4138		else
4139			queue_index = map->queues[reciprocal_scale(
4140						skb_get_hash(skb), map->len)];
4141		if (unlikely(queue_index >= dev->real_num_tx_queues))
4142			queue_index = -1;
4143	}
4144	return queue_index;
4145}
4146#endif
4147
4148static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
4149			 struct sk_buff *skb)
4150{
4151#ifdef CONFIG_XPS
4152	struct xps_dev_maps *dev_maps;
4153	struct sock *sk = skb->sk;
4154	int queue_index = -1;
4155
4156	if (!static_key_false(&xps_needed))
4157		return -1;
4158
4159	rcu_read_lock();
4160	if (!static_key_false(&xps_rxqs_needed))
4161		goto get_cpus_map;
4162
4163	dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
4164	if (dev_maps) {
4165		int tci = sk_rx_queue_get(sk);
4166
4167		if (tci >= 0)
4168			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4169							  tci);
4170	}
4171
4172get_cpus_map:
4173	if (queue_index < 0) {
4174		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
4175		if (dev_maps) {
4176			unsigned int tci = skb->sender_cpu - 1;
4177
4178			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4179							  tci);
4180		}
4181	}
4182	rcu_read_unlock();
4183
4184	return queue_index;
4185#else
4186	return -1;
4187#endif
4188}
4189
4190u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4191		     struct net_device *sb_dev)
4192{
4193	return 0;
4194}
4195EXPORT_SYMBOL(dev_pick_tx_zero);
4196
4197u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4198		       struct net_device *sb_dev)
4199{
4200	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4201}
4202EXPORT_SYMBOL(dev_pick_tx_cpu_id);
4203
4204u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4205		     struct net_device *sb_dev)
4206{
4207	struct sock *sk = skb->sk;
4208	int queue_index = sk_tx_queue_get(sk);
4209
4210	sb_dev = sb_dev ? : dev;
4211
4212	if (queue_index < 0 || skb->ooo_okay ||
4213	    queue_index >= dev->real_num_tx_queues) {
4214		int new_index = get_xps_queue(dev, sb_dev, skb);
4215
4216		if (new_index < 0)
4217			new_index = skb_tx_hash(dev, sb_dev, skb);
4218
4219		if (queue_index != new_index && sk &&
4220		    sk_fullsock(sk) &&
4221		    rcu_access_pointer(sk->sk_dst_cache))
4222			sk_tx_queue_set(sk, new_index);
4223
4224		queue_index = new_index;
4225	}
4226
4227	return queue_index;
4228}
4229EXPORT_SYMBOL(netdev_pick_tx);
4230
4231struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4232					 struct sk_buff *skb,
4233					 struct net_device *sb_dev)
4234{
4235	int queue_index = 0;
4236
4237#ifdef CONFIG_XPS
4238	u32 sender_cpu = skb->sender_cpu - 1;
4239
4240	if (sender_cpu >= (u32)NR_CPUS)
4241		skb->sender_cpu = raw_smp_processor_id() + 1;
4242#endif
4243
4244	if (dev->real_num_tx_queues != 1) {
4245		const struct net_device_ops *ops = dev->netdev_ops;
4246
4247		if (ops->ndo_select_queue)
4248			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4249		else
4250			queue_index = netdev_pick_tx(dev, skb, sb_dev);
4251
4252		queue_index = netdev_cap_txqueue(dev, queue_index);
4253	}
4254
4255	skb_set_queue_mapping(skb, queue_index);
4256	return netdev_get_tx_queue(dev, queue_index);
4257}
4258
4259/**
4260 * __dev_queue_xmit() - transmit a buffer
4261 * @skb:	buffer to transmit
4262 * @sb_dev:	suboordinate device used for L2 forwarding offload
4263 *
4264 * Queue a buffer for transmission to a network device. The caller must
4265 * have set the device and priority and built the buffer before calling
4266 * this function. The function can be called from an interrupt.
4267 *
4268 * When calling this method, interrupts MUST be enabled. This is because
4269 * the BH enable code must have IRQs enabled so that it will not deadlock.
4270 *
4271 * Regardless of the return value, the skb is consumed, so it is currently
4272 * difficult to retry a send to this method. (You can bump the ref count
4273 * before sending to hold a reference for retry if you are careful.)
4274 *
4275 * Return:
4276 * * 0				- buffer successfully transmitted
4277 * * positive qdisc return code	- NET_XMIT_DROP etc.
4278 * * negative errno		- other errors
4279 */
4280int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4281{
4282	struct net_device *dev = skb->dev;
4283	struct netdev_queue *txq = NULL;
4284	struct Qdisc *q;
4285	int rc = -ENOMEM;
4286	bool again = false;
4287
4288	skb_reset_mac_header(skb);
4289	skb_assert_len(skb);
4290
4291	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4292		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4293
4294	/* Disable soft irqs for various locks below. Also
4295	 * stops preemption for RCU.
4296	 */
4297	rcu_read_lock_bh();
4298
4299	skb_update_prio(skb);
4300
4301	qdisc_pkt_len_init(skb);
4302	tcx_set_ingress(skb, false);
4303#ifdef CONFIG_NET_EGRESS
4304	if (static_branch_unlikely(&egress_needed_key)) {
4305		if (nf_hook_egress_active()) {
4306			skb = nf_hook_egress(skb, &rc, dev);
4307			if (!skb)
4308				goto out;
4309		}
4310
4311		netdev_xmit_skip_txqueue(false);
4312
4313		nf_skip_egress(skb, true);
4314		skb = sch_handle_egress(skb, &rc, dev);
4315		if (!skb)
4316			goto out;
4317		nf_skip_egress(skb, false);
4318
4319		if (netdev_xmit_txqueue_skipped())
4320			txq = netdev_tx_queue_mapping(dev, skb);
4321	}
4322#endif
4323	/* If device/qdisc don't need skb->dst, release it right now while
4324	 * its hot in this cpu cache.
4325	 */
4326	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4327		skb_dst_drop(skb);
4328	else
4329		skb_dst_force(skb);
4330
4331	if (!txq)
4332		txq = netdev_core_pick_tx(dev, skb, sb_dev);
4333
4334	q = rcu_dereference_bh(txq->qdisc);
4335
4336	trace_net_dev_queue(skb);
4337	if (q->enqueue) {
4338		rc = __dev_xmit_skb(skb, q, dev, txq);
4339		goto out;
4340	}
4341
4342	/* The device has no queue. Common case for software devices:
4343	 * loopback, all the sorts of tunnels...
4344
4345	 * Really, it is unlikely that netif_tx_lock protection is necessary
4346	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
4347	 * counters.)
4348	 * However, it is possible, that they rely on protection
4349	 * made by us here.
4350
4351	 * Check this and shot the lock. It is not prone from deadlocks.
4352	 *Either shot noqueue qdisc, it is even simpler 8)
4353	 */
4354	if (dev->flags & IFF_UP) {
4355		int cpu = smp_processor_id(); /* ok because BHs are off */
4356
4357		/* Other cpus might concurrently change txq->xmit_lock_owner
4358		 * to -1 or to their cpu id, but not to our id.
4359		 */
4360		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4361			if (dev_xmit_recursion())
4362				goto recursion_alert;
4363
4364			skb = validate_xmit_skb(skb, dev, &again);
4365			if (!skb)
4366				goto out;
4367
4368			HARD_TX_LOCK(dev, txq, cpu);
4369
4370			if (!netif_xmit_stopped(txq)) {
4371				dev_xmit_recursion_inc();
4372				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4373				dev_xmit_recursion_dec();
4374				if (dev_xmit_complete(rc)) {
4375					HARD_TX_UNLOCK(dev, txq);
4376					goto out;
4377				}
4378			}
4379			HARD_TX_UNLOCK(dev, txq);
4380			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4381					     dev->name);
4382		} else {
4383			/* Recursion is detected! It is possible,
4384			 * unfortunately
4385			 */
4386recursion_alert:
4387			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4388					     dev->name);
4389		}
4390	}
4391
4392	rc = -ENETDOWN;
4393	rcu_read_unlock_bh();
4394
4395	dev_core_stats_tx_dropped_inc(dev);
4396	kfree_skb_list(skb);
4397	return rc;
4398out:
4399	rcu_read_unlock_bh();
4400	return rc;
4401}
4402EXPORT_SYMBOL(__dev_queue_xmit);
4403
4404int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4405{
4406	struct net_device *dev = skb->dev;
4407	struct sk_buff *orig_skb = skb;
4408	struct netdev_queue *txq;
4409	int ret = NETDEV_TX_BUSY;
4410	bool again = false;
4411
4412	if (unlikely(!netif_running(dev) ||
4413		     !netif_carrier_ok(dev)))
4414		goto drop;
4415
4416	skb = validate_xmit_skb_list(skb, dev, &again);
4417	if (skb != orig_skb)
4418		goto drop;
4419
4420	skb_set_queue_mapping(skb, queue_id);
4421	txq = skb_get_tx_queue(dev, skb);
4422
4423	local_bh_disable();
4424
4425	dev_xmit_recursion_inc();
4426	HARD_TX_LOCK(dev, txq, smp_processor_id());
4427	if (!netif_xmit_frozen_or_drv_stopped(txq))
4428		ret = netdev_start_xmit(skb, dev, txq, false);
4429	HARD_TX_UNLOCK(dev, txq);
4430	dev_xmit_recursion_dec();
4431
4432	local_bh_enable();
4433	return ret;
4434drop:
4435	dev_core_stats_tx_dropped_inc(dev);
4436	kfree_skb_list(skb);
4437	return NET_XMIT_DROP;
4438}
4439EXPORT_SYMBOL(__dev_direct_xmit);
4440
4441/*************************************************************************
4442 *			Receiver routines
4443 *************************************************************************/
4444
4445int netdev_max_backlog __read_mostly = 1000;
4446EXPORT_SYMBOL(netdev_max_backlog);
4447
4448int netdev_tstamp_prequeue __read_mostly = 1;
4449unsigned int sysctl_skb_defer_max __read_mostly = 64;
4450int netdev_budget __read_mostly = 300;
4451/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
4452unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4453int weight_p __read_mostly = 64;           /* old backlog weight */
4454int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4455int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4456int dev_rx_weight __read_mostly = 64;
4457int dev_tx_weight __read_mostly = 64;
4458
4459/* Called with irq disabled */
4460static inline void ____napi_schedule(struct softnet_data *sd,
4461				     struct napi_struct *napi)
4462{
4463	struct task_struct *thread;
4464
4465	lockdep_assert_irqs_disabled();
4466
4467	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4468		/* Paired with smp_mb__before_atomic() in
4469		 * napi_enable()/dev_set_threaded().
4470		 * Use READ_ONCE() to guarantee a complete
4471		 * read on napi->thread. Only call
4472		 * wake_up_process() when it's not NULL.
4473		 */
4474		thread = READ_ONCE(napi->thread);
4475		if (thread) {
4476			/* Avoid doing set_bit() if the thread is in
4477			 * INTERRUPTIBLE state, cause napi_thread_wait()
4478			 * makes sure to proceed with napi polling
4479			 * if the thread is explicitly woken from here.
4480			 */
4481			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
4482				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4483			wake_up_process(thread);
4484			return;
4485		}
4486	}
4487
4488	list_add_tail(&napi->poll_list, &sd->poll_list);
4489	WRITE_ONCE(napi->list_owner, smp_processor_id());
4490	/* If not called from net_rx_action()
4491	 * we have to raise NET_RX_SOFTIRQ.
4492	 */
4493	if (!sd->in_net_rx_action)
4494		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4495}
4496
4497#ifdef CONFIG_RPS
4498
4499/* One global table that all flow-based protocols share. */
4500struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4501EXPORT_SYMBOL(rps_sock_flow_table);
4502u32 rps_cpu_mask __read_mostly;
4503EXPORT_SYMBOL(rps_cpu_mask);
4504
4505struct static_key_false rps_needed __read_mostly;
4506EXPORT_SYMBOL(rps_needed);
4507struct static_key_false rfs_needed __read_mostly;
4508EXPORT_SYMBOL(rfs_needed);
4509
4510static struct rps_dev_flow *
4511set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4512	    struct rps_dev_flow *rflow, u16 next_cpu)
4513{
4514	if (next_cpu < nr_cpu_ids) {
4515#ifdef CONFIG_RFS_ACCEL
4516		struct netdev_rx_queue *rxqueue;
4517		struct rps_dev_flow_table *flow_table;
4518		struct rps_dev_flow *old_rflow;
4519		u32 flow_id;
4520		u16 rxq_index;
4521		int rc;
4522
4523		/* Should we steer this flow to a different hardware queue? */
4524		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4525		    !(dev->features & NETIF_F_NTUPLE))
4526			goto out;
4527		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4528		if (rxq_index == skb_get_rx_queue(skb))
4529			goto out;
4530
4531		rxqueue = dev->_rx + rxq_index;
4532		flow_table = rcu_dereference(rxqueue->rps_flow_table);
4533		if (!flow_table)
4534			goto out;
4535		flow_id = skb_get_hash(skb) & flow_table->mask;
4536		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4537							rxq_index, flow_id);
4538		if (rc < 0)
4539			goto out;
4540		old_rflow = rflow;
4541		rflow = &flow_table->flows[flow_id];
4542		rflow->filter = rc;
4543		if (old_rflow->filter == rflow->filter)
4544			old_rflow->filter = RPS_NO_FILTER;
4545	out:
4546#endif
4547		rflow->last_qtail =
4548			per_cpu(softnet_data, next_cpu).input_queue_head;
4549	}
4550
4551	rflow->cpu = next_cpu;
4552	return rflow;
4553}
4554
4555/*
4556 * get_rps_cpu is called from netif_receive_skb and returns the target
4557 * CPU from the RPS map of the receiving queue for a given skb.
4558 * rcu_read_lock must be held on entry.
4559 */
4560static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4561		       struct rps_dev_flow **rflowp)
4562{
4563	const struct rps_sock_flow_table *sock_flow_table;
4564	struct netdev_rx_queue *rxqueue = dev->_rx;
4565	struct rps_dev_flow_table *flow_table;
4566	struct rps_map *map;
4567	int cpu = -1;
4568	u32 tcpu;
4569	u32 hash;
4570
4571	if (skb_rx_queue_recorded(skb)) {
4572		u16 index = skb_get_rx_queue(skb);
4573
4574		if (unlikely(index >= dev->real_num_rx_queues)) {
4575			WARN_ONCE(dev->real_num_rx_queues > 1,
4576				  "%s received packet on queue %u, but number "
4577				  "of RX queues is %u\n",
4578				  dev->name, index, dev->real_num_rx_queues);
4579			goto done;
4580		}
4581		rxqueue += index;
4582	}
4583
4584	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4585
4586	flow_table = rcu_dereference(rxqueue->rps_flow_table);
4587	map = rcu_dereference(rxqueue->rps_map);
4588	if (!flow_table && !map)
4589		goto done;
4590
4591	skb_reset_network_header(skb);
4592	hash = skb_get_hash(skb);
4593	if (!hash)
4594		goto done;
4595
4596	sock_flow_table = rcu_dereference(rps_sock_flow_table);
4597	if (flow_table && sock_flow_table) {
4598		struct rps_dev_flow *rflow;
4599		u32 next_cpu;
4600		u32 ident;
4601
4602		/* First check into global flow table if there is a match.
4603		 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4604		 */
4605		ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4606		if ((ident ^ hash) & ~rps_cpu_mask)
4607			goto try_rps;
4608
4609		next_cpu = ident & rps_cpu_mask;
4610
4611		/* OK, now we know there is a match,
4612		 * we can look at the local (per receive queue) flow table
4613		 */
4614		rflow = &flow_table->flows[hash & flow_table->mask];
4615		tcpu = rflow->cpu;
4616
4617		/*
4618		 * If the desired CPU (where last recvmsg was done) is
4619		 * different from current CPU (one in the rx-queue flow
4620		 * table entry), switch if one of the following holds:
4621		 *   - Current CPU is unset (>= nr_cpu_ids).
4622		 *   - Current CPU is offline.
4623		 *   - The current CPU's queue tail has advanced beyond the
4624		 *     last packet that was enqueued using this table entry.
4625		 *     This guarantees that all previous packets for the flow
4626		 *     have been dequeued, thus preserving in order delivery.
4627		 */
4628		if (unlikely(tcpu != next_cpu) &&
4629		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4630		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4631		      rflow->last_qtail)) >= 0)) {
4632			tcpu = next_cpu;
4633			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4634		}
4635
4636		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4637			*rflowp = rflow;
4638			cpu = tcpu;
4639			goto done;
4640		}
4641	}
4642
4643try_rps:
4644
4645	if (map) {
4646		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4647		if (cpu_online(tcpu)) {
4648			cpu = tcpu;
4649			goto done;
4650		}
4651	}
4652
4653done:
4654	return cpu;
4655}
4656
4657#ifdef CONFIG_RFS_ACCEL
4658
4659/**
4660 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4661 * @dev: Device on which the filter was set
4662 * @rxq_index: RX queue index
4663 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4664 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4665 *
4666 * Drivers that implement ndo_rx_flow_steer() should periodically call
4667 * this function for each installed filter and remove the filters for
4668 * which it returns %true.
4669 */
4670bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4671			 u32 flow_id, u16 filter_id)
4672{
4673	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4674	struct rps_dev_flow_table *flow_table;
4675	struct rps_dev_flow *rflow;
4676	bool expire = true;
4677	unsigned int cpu;
4678
4679	rcu_read_lock();
4680	flow_table = rcu_dereference(rxqueue->rps_flow_table);
4681	if (flow_table && flow_id <= flow_table->mask) {
4682		rflow = &flow_table->flows[flow_id];
4683		cpu = READ_ONCE(rflow->cpu);
4684		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4685		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4686			   rflow->last_qtail) <
4687		     (int)(10 * flow_table->mask)))
4688			expire = false;
4689	}
4690	rcu_read_unlock();
4691	return expire;
4692}
4693EXPORT_SYMBOL(rps_may_expire_flow);
4694
4695#endif /* CONFIG_RFS_ACCEL */
4696
4697/* Called from hardirq (IPI) context */
4698static void rps_trigger_softirq(void *data)
4699{
4700	struct softnet_data *sd = data;
4701
4702	____napi_schedule(sd, &sd->backlog);
4703	sd->received_rps++;
4704}
4705
4706#endif /* CONFIG_RPS */
4707
4708/* Called from hardirq (IPI) context */
4709static void trigger_rx_softirq(void *data)
4710{
4711	struct softnet_data *sd = data;
4712
4713	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4714	smp_store_release(&sd->defer_ipi_scheduled, 0);
4715}
4716
4717/*
4718 * After we queued a packet into sd->input_pkt_queue,
4719 * we need to make sure this queue is serviced soon.
4720 *
4721 * - If this is another cpu queue, link it to our rps_ipi_list,
4722 *   and make sure we will process rps_ipi_list from net_rx_action().
4723 *
4724 * - If this is our own queue, NAPI schedule our backlog.
4725 *   Note that this also raises NET_RX_SOFTIRQ.
4726 */
4727static void napi_schedule_rps(struct softnet_data *sd)
4728{
4729	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4730
4731#ifdef CONFIG_RPS
4732	if (sd != mysd) {
4733		sd->rps_ipi_next = mysd->rps_ipi_list;
4734		mysd->rps_ipi_list = sd;
4735
4736		/* If not called from net_rx_action() or napi_threaded_poll()
4737		 * we have to raise NET_RX_SOFTIRQ.
4738		 */
4739		if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
4740			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4741		return;
4742	}
4743#endif /* CONFIG_RPS */
4744	__napi_schedule_irqoff(&mysd->backlog);
4745}
4746
4747#ifdef CONFIG_NET_FLOW_LIMIT
4748int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4749#endif
4750
4751static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4752{
4753#ifdef CONFIG_NET_FLOW_LIMIT
4754	struct sd_flow_limit *fl;
4755	struct softnet_data *sd;
4756	unsigned int old_flow, new_flow;
4757
4758	if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
4759		return false;
4760
4761	sd = this_cpu_ptr(&softnet_data);
4762
4763	rcu_read_lock();
4764	fl = rcu_dereference(sd->flow_limit);
4765	if (fl) {
4766		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4767		old_flow = fl->history[fl->history_head];
4768		fl->history[fl->history_head] = new_flow;
4769
4770		fl->history_head++;
4771		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4772
4773		if (likely(fl->buckets[old_flow]))
4774			fl->buckets[old_flow]--;
4775
4776		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4777			fl->count++;
4778			rcu_read_unlock();
4779			return true;
4780		}
4781	}
4782	rcu_read_unlock();
4783#endif
4784	return false;
4785}
4786
4787/*
4788 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4789 * queue (may be a remote CPU queue).
4790 */
4791static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4792			      unsigned int *qtail)
4793{
4794	enum skb_drop_reason reason;
4795	struct softnet_data *sd;
4796	unsigned long flags;
4797	unsigned int qlen;
4798
4799	reason = SKB_DROP_REASON_NOT_SPECIFIED;
4800	sd = &per_cpu(softnet_data, cpu);
4801
4802	rps_lock_irqsave(sd, &flags);
4803	if (!netif_running(skb->dev))
4804		goto drop;
4805	qlen = skb_queue_len(&sd->input_pkt_queue);
4806	if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
4807		if (qlen) {
4808enqueue:
4809			__skb_queue_tail(&sd->input_pkt_queue, skb);
4810			input_queue_tail_incr_save(sd, qtail);
4811			rps_unlock_irq_restore(sd, &flags);
4812			return NET_RX_SUCCESS;
4813		}
4814
4815		/* Schedule NAPI for backlog device
4816		 * We can use non atomic operation since we own the queue lock
4817		 */
4818		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
4819			napi_schedule_rps(sd);
4820		goto enqueue;
4821	}
4822	reason = SKB_DROP_REASON_CPU_BACKLOG;
4823
4824drop:
4825	sd->dropped++;
4826	rps_unlock_irq_restore(sd, &flags);
4827
4828	dev_core_stats_rx_dropped_inc(skb->dev);
4829	kfree_skb_reason(skb, reason);
4830	return NET_RX_DROP;
4831}
4832
4833static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4834{
4835	struct net_device *dev = skb->dev;
4836	struct netdev_rx_queue *rxqueue;
4837
4838	rxqueue = dev->_rx;
4839
4840	if (skb_rx_queue_recorded(skb)) {
4841		u16 index = skb_get_rx_queue(skb);
4842
4843		if (unlikely(index >= dev->real_num_rx_queues)) {
4844			WARN_ONCE(dev->real_num_rx_queues > 1,
4845				  "%s received packet on queue %u, but number "
4846				  "of RX queues is %u\n",
4847				  dev->name, index, dev->real_num_rx_queues);
4848
4849			return rxqueue; /* Return first rxqueue */
4850		}
4851		rxqueue += index;
4852	}
4853	return rxqueue;
4854}
4855
4856u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
4857			     struct bpf_prog *xdp_prog)
4858{
4859	void *orig_data, *orig_data_end, *hard_start;
4860	struct netdev_rx_queue *rxqueue;
4861	bool orig_bcast, orig_host;
4862	u32 mac_len, frame_sz;
4863	__be16 orig_eth_type;
4864	struct ethhdr *eth;
4865	u32 metalen, act;
4866	int off;
4867
4868	/* The XDP program wants to see the packet starting at the MAC
4869	 * header.
4870	 */
4871	mac_len = skb->data - skb_mac_header(skb);
4872	hard_start = skb->data - skb_headroom(skb);
4873
4874	/* SKB "head" area always have tailroom for skb_shared_info */
4875	frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4876	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4877
4878	rxqueue = netif_get_rxqueue(skb);
4879	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4880	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4881			 skb_headlen(skb) + mac_len, true);
4882
4883	orig_data_end = xdp->data_end;
4884	orig_data = xdp->data;
4885	eth = (struct ethhdr *)xdp->data;
4886	orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
4887	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4888	orig_eth_type = eth->h_proto;
4889
4890	act = bpf_prog_run_xdp(xdp_prog, xdp);
4891
4892	/* check if bpf_xdp_adjust_head was used */
4893	off = xdp->data - orig_data;
4894	if (off) {
4895		if (off > 0)
4896			__skb_pull(skb, off);
4897		else if (off < 0)
4898			__skb_push(skb, -off);
4899
4900		skb->mac_header += off;
4901		skb_reset_network_header(skb);
4902	}
4903
4904	/* check if bpf_xdp_adjust_tail was used */
4905	off = xdp->data_end - orig_data_end;
4906	if (off != 0) {
4907		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4908		skb->len += off; /* positive on grow, negative on shrink */
4909	}
4910
4911	/* check if XDP changed eth hdr such SKB needs update */
4912	eth = (struct ethhdr *)xdp->data;
4913	if ((orig_eth_type != eth->h_proto) ||
4914	    (orig_host != ether_addr_equal_64bits(eth->h_dest,
4915						  skb->dev->dev_addr)) ||
4916	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4917		__skb_push(skb, ETH_HLEN);
4918		skb->pkt_type = PACKET_HOST;
4919		skb->protocol = eth_type_trans(skb, skb->dev);
4920	}
4921
4922	/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
4923	 * before calling us again on redirect path. We do not call do_redirect
4924	 * as we leave that up to the caller.
4925	 *
4926	 * Caller is responsible for managing lifetime of skb (i.e. calling
4927	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
4928	 */
4929	switch (act) {
4930	case XDP_REDIRECT:
4931	case XDP_TX:
4932		__skb_push(skb, mac_len);
4933		break;
4934	case XDP_PASS:
4935		metalen = xdp->data - xdp->data_meta;
4936		if (metalen)
4937			skb_metadata_set(skb, metalen);
4938		break;
4939	}
4940
4941	return act;
4942}
4943
4944static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4945				     struct xdp_buff *xdp,
4946				     struct bpf_prog *xdp_prog)
4947{
4948	u32 act = XDP_DROP;
4949
4950	/* Reinjected packets coming from act_mirred or similar should
4951	 * not get XDP generic processing.
4952	 */
4953	if (skb_is_redirected(skb))
4954		return XDP_PASS;
4955
4956	/* XDP packets must be linear and must have sufficient headroom
4957	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4958	 * native XDP provides, thus we need to do it here as well.
4959	 */
4960	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4961	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4962		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4963		int troom = skb->tail + skb->data_len - skb->end;
4964
4965		/* In case we have to go down the path and also linearize,
4966		 * then lets do the pskb_expand_head() work just once here.
4967		 */
4968		if (pskb_expand_head(skb,
4969				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4970				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4971			goto do_drop;
4972		if (skb_linearize(skb))
4973			goto do_drop;
4974	}
4975
4976	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
4977	switch (act) {
4978	case XDP_REDIRECT:
4979	case XDP_TX:
4980	case XDP_PASS:
4981		break;
4982	default:
4983		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
4984		fallthrough;
4985	case XDP_ABORTED:
4986		trace_xdp_exception(skb->dev, xdp_prog, act);
4987		fallthrough;
4988	case XDP_DROP:
4989	do_drop:
4990		kfree_skb(skb);
4991		break;
4992	}
4993
4994	return act;
4995}
4996
4997/* When doing generic XDP we have to bypass the qdisc layer and the
4998 * network taps in order to match in-driver-XDP behavior. This also means
4999 * that XDP packets are able to starve other packets going through a qdisc,
5000 * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
5001 * queues, so they do not have this starvation issue.
5002 */
5003void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
5004{
5005	struct net_device *dev = skb->dev;
5006	struct netdev_queue *txq;
5007	bool free_skb = true;
5008	int cpu, rc;
5009
5010	txq = netdev_core_pick_tx(dev, skb, NULL);
5011	cpu = smp_processor_id();
5012	HARD_TX_LOCK(dev, txq, cpu);
5013	if (!netif_xmit_frozen_or_drv_stopped(txq)) {
5014		rc = netdev_start_xmit(skb, dev, txq, 0);
5015		if (dev_xmit_complete(rc))
5016			free_skb = false;
5017	}
5018	HARD_TX_UNLOCK(dev, txq);
5019	if (free_skb) {
5020		trace_xdp_exception(dev, xdp_prog, XDP_TX);
5021		dev_core_stats_tx_dropped_inc(dev);
5022		kfree_skb(skb);
5023	}
5024}
5025
5026static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
5027
5028int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
5029{
5030	if (xdp_prog) {
5031		struct xdp_buff xdp;
5032		u32 act;
5033		int err;
5034
5035		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
5036		if (act != XDP_PASS) {
5037			switch (act) {
5038			case XDP_REDIRECT:
5039				err = xdp_do_generic_redirect(skb->dev, skb,
5040							      &xdp, xdp_prog);
5041				if (err)
5042					goto out_redir;
5043				break;
5044			case XDP_TX:
5045				generic_xdp_tx(skb, xdp_prog);
5046				break;
5047			}
5048			return XDP_DROP;
5049		}
5050	}
5051	return XDP_PASS;
5052out_redir:
5053	kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
5054	return XDP_DROP;
5055}
5056EXPORT_SYMBOL_GPL(do_xdp_generic);
5057
5058static int netif_rx_internal(struct sk_buff *skb)
5059{
5060	int ret;
5061
5062	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5063
5064	trace_netif_rx(skb);
5065
5066#ifdef CONFIG_RPS
5067	if (static_branch_unlikely(&rps_needed)) {
5068		struct rps_dev_flow voidflow, *rflow = &voidflow;
5069		int cpu;
5070
5071		rcu_read_lock();
5072
5073		cpu = get_rps_cpu(skb->dev, skb, &rflow);
5074		if (cpu < 0)
5075			cpu = smp_processor_id();
5076
5077		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5078
5079		rcu_read_unlock();
5080	} else
5081#endif
5082	{
5083		unsigned int qtail;
5084
5085		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
5086	}
5087	return ret;
5088}
5089
5090/**
5091 *	__netif_rx	-	Slightly optimized version of netif_rx
5092 *	@skb: buffer to post
5093 *
5094 *	This behaves as netif_rx except that it does not disable bottom halves.
5095 *	As a result this function may only be invoked from the interrupt context
5096 *	(either hard or soft interrupt).
5097 */
5098int __netif_rx(struct sk_buff *skb)
5099{
5100	int ret;
5101
5102	lockdep_assert_once(hardirq_count() | softirq_count());
5103
5104	trace_netif_rx_entry(skb);
5105	ret = netif_rx_internal(skb);
5106	trace_netif_rx_exit(ret);
5107	return ret;
5108}
5109EXPORT_SYMBOL(__netif_rx);
5110
5111/**
5112 *	netif_rx	-	post buffer to the network code
5113 *	@skb: buffer to post
5114 *
5115 *	This function receives a packet from a device driver and queues it for
5116 *	the upper (protocol) levels to process via the backlog NAPI device. It
5117 *	always succeeds. The buffer may be dropped during processing for
5118 *	congestion control or by the protocol layers.
5119 *	The network buffer is passed via the backlog NAPI device. Modern NIC
5120 *	driver should use NAPI and GRO.
5121 *	This function can used from interrupt and from process context. The
5122 *	caller from process context must not disable interrupts before invoking
5123 *	this function.
5124 *
5125 *	return values:
5126 *	NET_RX_SUCCESS	(no congestion)
5127 *	NET_RX_DROP     (packet was dropped)
5128 *
5129 */
5130int netif_rx(struct sk_buff *skb)
5131{
5132	bool need_bh_off = !(hardirq_count() | softirq_count());
5133	int ret;
5134
5135	if (need_bh_off)
5136		local_bh_disable();
5137	trace_netif_rx_entry(skb);
5138	ret = netif_rx_internal(skb);
5139	trace_netif_rx_exit(ret);
5140	if (need_bh_off)
5141		local_bh_enable();
5142	return ret;
5143}
5144EXPORT_SYMBOL(netif_rx);
5145
5146static __latent_entropy void net_tx_action(struct softirq_action *h)
5147{
5148	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5149
5150	if (sd->completion_queue) {
5151		struct sk_buff *clist;
5152
5153		local_irq_disable();
5154		clist = sd->completion_queue;
5155		sd->completion_queue = NULL;
5156		local_irq_enable();
5157
5158		while (clist) {
5159			struct sk_buff *skb = clist;
5160
5161			clist = clist->next;
5162
5163			WARN_ON(refcount_read(&skb->users));
5164			if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
5165				trace_consume_skb(skb, net_tx_action);
5166			else
5167				trace_kfree_skb(skb, net_tx_action,
5168						get_kfree_skb_cb(skb)->reason);
5169
5170			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5171				__kfree_skb(skb);
5172			else
5173				__napi_kfree_skb(skb,
5174						 get_kfree_skb_cb(skb)->reason);
5175		}
5176	}
5177
5178	if (sd->output_queue) {
5179		struct Qdisc *head;
5180
5181		local_irq_disable();
5182		head = sd->output_queue;
5183		sd->output_queue = NULL;
5184		sd->output_queue_tailp = &sd->output_queue;
5185		local_irq_enable();
5186
5187		rcu_read_lock();
5188
5189		while (head) {
5190			struct Qdisc *q = head;
5191			spinlock_t *root_lock = NULL;
5192
5193			head = head->next_sched;
5194
5195			/* We need to make sure head->next_sched is read
5196			 * before clearing __QDISC_STATE_SCHED
5197			 */
5198			smp_mb__before_atomic();
5199
5200			if (!(q->flags & TCQ_F_NOLOCK)) {
5201				root_lock = qdisc_lock(q);
5202				spin_lock(root_lock);
5203			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5204						     &q->state))) {
5205				/* There is a synchronize_net() between
5206				 * STATE_DEACTIVATED flag being set and
5207				 * qdisc_reset()/some_qdisc_is_busy() in
5208				 * dev_deactivate(), so we can safely bail out
5209				 * early here to avoid data race between
5210				 * qdisc_deactivate() and some_qdisc_is_busy()
5211				 * for lockless qdisc.
5212				 */
5213				clear_bit(__QDISC_STATE_SCHED, &q->state);
5214				continue;
5215			}
5216
5217			clear_bit(__QDISC_STATE_SCHED, &q->state);
5218			qdisc_run(q);
5219			if (root_lock)
5220				spin_unlock(root_lock);
5221		}
5222
5223		rcu_read_unlock();
5224	}
5225
5226	xfrm_dev_backlog(sd);
5227}
5228
5229#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5230/* This hook is defined here for ATM LANE */
5231int (*br_fdb_test_addr_hook)(struct net_device *dev,
5232			     unsigned char *addr) __read_mostly;
5233EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5234#endif
5235
5236/**
5237 *	netdev_is_rx_handler_busy - check if receive handler is registered
5238 *	@dev: device to check
5239 *
5240 *	Check if a receive handler is already registered for a given device.
5241 *	Return true if there one.
5242 *
5243 *	The caller must hold the rtnl_mutex.
5244 */
5245bool netdev_is_rx_handler_busy(struct net_device *dev)
5246{
5247	ASSERT_RTNL();
5248	return dev && rtnl_dereference(dev->rx_handler);
5249}
5250EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5251
5252/**
5253 *	netdev_rx_handler_register - register receive handler
5254 *	@dev: device to register a handler for
5255 *	@rx_handler: receive handler to register
5256 *	@rx_handler_data: data pointer that is used by rx handler
5257 *
5258 *	Register a receive handler for a device. This handler will then be
5259 *	called from __netif_receive_skb. A negative errno code is returned
5260 *	on a failure.
5261 *
5262 *	The caller must hold the rtnl_mutex.
5263 *
5264 *	For a general description of rx_handler, see enum rx_handler_result.
5265 */
5266int netdev_rx_handler_register(struct net_device *dev,
5267			       rx_handler_func_t *rx_handler,
5268			       void *rx_handler_data)
5269{
5270	if (netdev_is_rx_handler_busy(dev))
5271		return -EBUSY;
5272
5273	if (dev->priv_flags & IFF_NO_RX_HANDLER)
5274		return -EINVAL;
5275
5276	/* Note: rx_handler_data must be set before rx_handler */
5277	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5278	rcu_assign_pointer(dev->rx_handler, rx_handler);
5279
5280	return 0;
5281}
5282EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
5283
5284/**
5285 *	netdev_rx_handler_unregister - unregister receive handler
5286 *	@dev: device to unregister a handler from
5287 *
5288 *	Unregister a receive handler from a device.
5289 *
5290 *	The caller must hold the rtnl_mutex.
5291 */
5292void netdev_rx_handler_unregister(struct net_device *dev)
5293{
5294
5295	ASSERT_RTNL();
5296	RCU_INIT_POINTER(dev->rx_handler, NULL);
5297	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
5298	 * section has a guarantee to see a non NULL rx_handler_data
5299	 * as well.
5300	 */
5301	synchronize_net();
5302	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5303}
5304EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5305
5306/*
5307 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5308 * the special handling of PFMEMALLOC skbs.
5309 */
5310static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5311{
5312	switch (skb->protocol) {
5313	case htons(ETH_P_ARP):
5314	case htons(ETH_P_IP):
5315	case htons(ETH_P_IPV6):
5316	case htons(ETH_P_8021Q):
5317	case htons(ETH_P_8021AD):
5318		return true;
5319	default:
5320		return false;
5321	}
5322}
5323
5324static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5325			     int *ret, struct net_device *orig_dev)
5326{
5327	if (nf_hook_ingress_active(skb)) {
5328		int ingress_retval;
5329
5330		if (*pt_prev) {
5331			*ret = deliver_skb(skb, *pt_prev, orig_dev);
5332			*pt_prev = NULL;
5333		}
5334
5335		rcu_read_lock();
5336		ingress_retval = nf_hook_ingress(skb);
5337		rcu_read_unlock();
5338		return ingress_retval;
5339	}
5340	return 0;
5341}
5342
5343static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5344				    struct packet_type **ppt_prev)
5345{
5346	struct packet_type *ptype, *pt_prev;
5347	rx_handler_func_t *rx_handler;
5348	struct sk_buff *skb = *pskb;
5349	struct net_device *orig_dev;
5350	bool deliver_exact = false;
5351	int ret = NET_RX_DROP;
5352	__be16 type;
5353
5354	net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
5355
5356	trace_netif_receive_skb(skb);
5357
5358	orig_dev = skb->dev;
5359
5360	skb_reset_network_header(skb);
5361	if (!skb_transport_header_was_set(skb))
5362		skb_reset_transport_header(skb);
5363	skb_reset_mac_len(skb);
5364
5365	pt_prev = NULL;
5366
5367another_round:
5368	skb->skb_iif = skb->dev->ifindex;
5369
5370	__this_cpu_inc(softnet_data.processed);
5371
5372	if (static_branch_unlikely(&generic_xdp_needed_key)) {
5373		int ret2;
5374
5375		migrate_disable();
5376		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5377		migrate_enable();
5378
5379		if (ret2 != XDP_PASS) {
5380			ret = NET_RX_DROP;
5381			goto out;
5382		}
5383	}
5384
5385	if (eth_type_vlan(skb->protocol)) {
5386		skb = skb_vlan_untag(skb);
5387		if (unlikely(!skb))
5388			goto out;
5389	}
5390
5391	if (skb_skip_tc_classify(skb))
5392		goto skip_classify;
5393
5394	if (pfmemalloc)
5395		goto skip_taps;
5396
5397	list_for_each_entry_rcu(ptype, &ptype_all, list) {
5398		if (pt_prev)
5399			ret = deliver_skb(skb, pt_prev, orig_dev);
5400		pt_prev = ptype;
5401	}
5402
5403	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5404		if (pt_prev)
5405			ret = deliver_skb(skb, pt_prev, orig_dev);
5406		pt_prev = ptype;
5407	}
5408
5409skip_taps:
5410#ifdef CONFIG_NET_INGRESS
5411	if (static_branch_unlikely(&ingress_needed_key)) {
5412		bool another = false;
5413
5414		nf_skip_egress(skb, true);
5415		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5416					 &another);
5417		if (another)
5418			goto another_round;
5419		if (!skb)
5420			goto out;
5421
5422		nf_skip_egress(skb, false);
5423		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5424			goto out;
5425	}
5426#endif
5427	skb_reset_redirect(skb);
5428skip_classify:
5429	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5430		goto drop;
5431
5432	if (skb_vlan_tag_present(skb)) {
5433		if (pt_prev) {
5434			ret = deliver_skb(skb, pt_prev, orig_dev);
5435			pt_prev = NULL;
5436		}
5437		if (vlan_do_receive(&skb))
5438			goto another_round;
5439		else if (unlikely(!skb))
5440			goto out;
5441	}
5442
5443	rx_handler = rcu_dereference(skb->dev->rx_handler);
5444	if (rx_handler) {
5445		if (pt_prev) {
5446			ret = deliver_skb(skb, pt_prev, orig_dev);
5447			pt_prev = NULL;
5448		}
5449		switch (rx_handler(&skb)) {
5450		case RX_HANDLER_CONSUMED:
5451			ret = NET_RX_SUCCESS;
5452			goto out;
5453		case RX_HANDLER_ANOTHER:
5454			goto another_round;
5455		case RX_HANDLER_EXACT:
5456			deliver_exact = true;
5457			break;
5458		case RX_HANDLER_PASS:
5459			break;
5460		default:
5461			BUG();
5462		}
5463	}
5464
5465	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5466check_vlan_id:
5467		if (skb_vlan_tag_get_id(skb)) {
5468			/* Vlan id is non 0 and vlan_do_receive() above couldn't
5469			 * find vlan device.
5470			 */
5471			skb->pkt_type = PACKET_OTHERHOST;
5472		} else if (eth_type_vlan(skb->protocol)) {
5473			/* Outer header is 802.1P with vlan 0, inner header is
5474			 * 802.1Q or 802.1AD and vlan_do_receive() above could
5475			 * not find vlan dev for vlan id 0.
5476			 */
5477			__vlan_hwaccel_clear_tag(skb);
5478			skb = skb_vlan_untag(skb);
5479			if (unlikely(!skb))
5480				goto out;
5481			if (vlan_do_receive(&skb))
5482				/* After stripping off 802.1P header with vlan 0
5483				 * vlan dev is found for inner header.
5484				 */
5485				goto another_round;
5486			else if (unlikely(!skb))
5487				goto out;
5488			else
5489				/* We have stripped outer 802.1P vlan 0 header.
5490				 * But could not find vlan dev.
5491				 * check again for vlan id to set OTHERHOST.
5492				 */
5493				goto check_vlan_id;
5494		}
5495		/* Note: we might in the future use prio bits
5496		 * and set skb->priority like in vlan_do_receive()
5497		 * For the time being, just ignore Priority Code Point
5498		 */
5499		__vlan_hwaccel_clear_tag(skb);
5500	}
5501
5502	type = skb->protocol;
5503
5504	/* deliver only exact match when indicated */
5505	if (likely(!deliver_exact)) {
5506		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5507				       &ptype_base[ntohs(type) &
5508						   PTYPE_HASH_MASK]);
5509	}
5510
5511	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5512			       &orig_dev->ptype_specific);
5513
5514	if (unlikely(skb->dev != orig_dev)) {
5515		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5516				       &skb->dev->ptype_specific);
5517	}
5518
5519	if (pt_prev) {
5520		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5521			goto drop;
5522		*ppt_prev = pt_prev;
5523	} else {
5524drop:
5525		if (!deliver_exact)
5526			dev_core_stats_rx_dropped_inc(skb->dev);
5527		else
5528			dev_core_stats_rx_nohandler_inc(skb->dev);
5529		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
5530		/* Jamal, now you will not able to escape explaining
5531		 * me how you were going to use this. :-)
5532		 */
5533		ret = NET_RX_DROP;
5534	}
5535
5536out:
5537	/* The invariant here is that if *ppt_prev is not NULL
5538	 * then skb should also be non-NULL.
5539	 *
5540	 * Apparently *ppt_prev assignment above holds this invariant due to
5541	 * skb dereferencing near it.
5542	 */
5543	*pskb = skb;
5544	return ret;
5545}
5546
5547static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5548{
5549	struct net_device *orig_dev = skb->dev;
5550	struct packet_type *pt_prev = NULL;
5551	int ret;
5552
5553	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5554	if (pt_prev)
5555		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5556					 skb->dev, pt_prev, orig_dev);
5557	return ret;
5558}
5559
5560/**
5561 *	netif_receive_skb_core - special purpose version of netif_receive_skb
5562 *	@skb: buffer to process
5563 *
5564 *	More direct receive version of netif_receive_skb().  It should
5565 *	only be used by callers that have a need to skip RPS and Generic XDP.
5566 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5567 *
5568 *	This function may only be called from softirq context and interrupts
5569 *	should be enabled.
5570 *
5571 *	Return values (usually ignored):
5572 *	NET_RX_SUCCESS: no congestion
5573 *	NET_RX_DROP: packet was dropped
5574 */
5575int netif_receive_skb_core(struct sk_buff *skb)
5576{
5577	int ret;
5578
5579	rcu_read_lock();
5580	ret = __netif_receive_skb_one_core(skb, false);
5581	rcu_read_unlock();
5582
5583	return ret;
5584}
5585EXPORT_SYMBOL(netif_receive_skb_core);
5586
5587static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5588						  struct packet_type *pt_prev,
5589						  struct net_device *orig_dev)
5590{
5591	struct sk_buff *skb, *next;
5592
5593	if (!pt_prev)
5594		return;
5595	if (list_empty(head))
5596		return;
5597	if (pt_prev->list_func != NULL)
5598		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5599				   ip_list_rcv, head, pt_prev, orig_dev);
5600	else
5601		list_for_each_entry_safe(skb, next, head, list) {
5602			skb_list_del_init(skb);
5603			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5604		}
5605}
5606
5607static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5608{
5609	/* Fast-path assumptions:
5610	 * - There is no RX handler.
5611	 * - Only one packet_type matches.
5612	 * If either of these fails, we will end up doing some per-packet
5613	 * processing in-line, then handling the 'last ptype' for the whole
5614	 * sublist.  This can't cause out-of-order delivery to any single ptype,
5615	 * because the 'last ptype' must be constant across the sublist, and all
5616	 * other ptypes are handled per-packet.
5617	 */
5618	/* Current (common) ptype of sublist */
5619	struct packet_type *pt_curr = NULL;
5620	/* Current (common) orig_dev of sublist */
5621	struct net_device *od_curr = NULL;
5622	struct list_head sublist;
5623	struct sk_buff *skb, *next;
5624
5625	INIT_LIST_HEAD(&sublist);
5626	list_for_each_entry_safe(skb, next, head, list) {
5627		struct net_device *orig_dev = skb->dev;
5628		struct packet_type *pt_prev = NULL;
5629
5630		skb_list_del_init(skb);
5631		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5632		if (!pt_prev)
5633			continue;
5634		if (pt_curr != pt_prev || od_curr != orig_dev) {
5635			/* dispatch old sublist */
5636			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5637			/* start new sublist */
5638			INIT_LIST_HEAD(&sublist);
5639			pt_curr = pt_prev;
5640			od_curr = orig_dev;
5641		}
5642		list_add_tail(&skb->list, &sublist);
5643	}
5644
5645	/* dispatch final sublist */
5646	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5647}
5648
5649static int __netif_receive_skb(struct sk_buff *skb)
5650{
5651	int ret;
5652
5653	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5654		unsigned int noreclaim_flag;
5655
5656		/*
5657		 * PFMEMALLOC skbs are special, they should
5658		 * - be delivered to SOCK_MEMALLOC sockets only
5659		 * - stay away from userspace
5660		 * - have bounded memory usage
5661		 *
5662		 * Use PF_MEMALLOC as this saves us from propagating the allocation
5663		 * context down to all allocation sites.
5664		 */
5665		noreclaim_flag = memalloc_noreclaim_save();
5666		ret = __netif_receive_skb_one_core(skb, true);
5667		memalloc_noreclaim_restore(noreclaim_flag);
5668	} else
5669		ret = __netif_receive_skb_one_core(skb, false);
5670
5671	return ret;
5672}
5673
5674static void __netif_receive_skb_list(struct list_head *head)
5675{
5676	unsigned long noreclaim_flag = 0;
5677	struct sk_buff *skb, *next;
5678	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5679
5680	list_for_each_entry_safe(skb, next, head, list) {
5681		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5682			struct list_head sublist;
5683
5684			/* Handle the previous sublist */
5685			list_cut_before(&sublist, head, &skb->list);
5686			if (!list_empty(&sublist))
5687				__netif_receive_skb_list_core(&sublist, pfmemalloc);
5688			pfmemalloc = !pfmemalloc;
5689			/* See comments in __netif_receive_skb */
5690			if (pfmemalloc)
5691				noreclaim_flag = memalloc_noreclaim_save();
5692			else
5693				memalloc_noreclaim_restore(noreclaim_flag);
5694		}
5695	}
5696	/* Handle the remaining sublist */
5697	if (!list_empty(head))
5698		__netif_receive_skb_list_core(head, pfmemalloc);
5699	/* Restore pflags */
5700	if (pfmemalloc)
5701		memalloc_noreclaim_restore(noreclaim_flag);
5702}
5703
5704static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5705{
5706	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5707	struct bpf_prog *new = xdp->prog;
5708	int ret = 0;
5709
5710	switch (xdp->command) {
5711	case XDP_SETUP_PROG:
5712		rcu_assign_pointer(dev->xdp_prog, new);
5713		if (old)
5714			bpf_prog_put(old);
5715
5716		if (old && !new) {
5717			static_branch_dec(&generic_xdp_needed_key);
5718		} else if (new && !old) {
5719			static_branch_inc(&generic_xdp_needed_key);
5720			dev_disable_lro(dev);
5721			dev_disable_gro_hw(dev);
5722		}
5723		break;
5724
5725	default:
5726		ret = -EINVAL;
5727		break;
5728	}
5729
5730	return ret;
5731}
5732
5733static int netif_receive_skb_internal(struct sk_buff *skb)
5734{
5735	int ret;
5736
5737	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5738
5739	if (skb_defer_rx_timestamp(skb))
5740		return NET_RX_SUCCESS;
5741
5742	rcu_read_lock();
5743#ifdef CONFIG_RPS
5744	if (static_branch_unlikely(&rps_needed)) {
5745		struct rps_dev_flow voidflow, *rflow = &voidflow;
5746		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5747
5748		if (cpu >= 0) {
5749			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5750			rcu_read_unlock();
5751			return ret;
5752		}
5753	}
5754#endif
5755	ret = __netif_receive_skb(skb);
5756	rcu_read_unlock();
5757	return ret;
5758}
5759
5760void netif_receive_skb_list_internal(struct list_head *head)
5761{
5762	struct sk_buff *skb, *next;
5763	struct list_head sublist;
5764
5765	INIT_LIST_HEAD(&sublist);
5766	list_for_each_entry_safe(skb, next, head, list) {
5767		net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5768		skb_list_del_init(skb);
5769		if (!skb_defer_rx_timestamp(skb))
5770			list_add_tail(&skb->list, &sublist);
5771	}
5772	list_splice_init(&sublist, head);
5773
5774	rcu_read_lock();
5775#ifdef CONFIG_RPS
5776	if (static_branch_unlikely(&rps_needed)) {
5777		list_for_each_entry_safe(skb, next, head, list) {
5778			struct rps_dev_flow voidflow, *rflow = &voidflow;
5779			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5780
5781			if (cpu >= 0) {
5782				/* Will be handled, remove from list */
5783				skb_list_del_init(skb);
5784				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5785			}
5786		}
5787	}
5788#endif
5789	__netif_receive_skb_list(head);
5790	rcu_read_unlock();
5791}
5792
5793/**
5794 *	netif_receive_skb - process receive buffer from network
5795 *	@skb: buffer to process
5796 *
5797 *	netif_receive_skb() is the main receive data processing function.
5798 *	It always succeeds. The buffer may be dropped during processing
5799 *	for congestion control or by the protocol layers.
5800 *
5801 *	This function may only be called from softirq context and interrupts
5802 *	should be enabled.
5803 *
5804 *	Return values (usually ignored):
5805 *	NET_RX_SUCCESS: no congestion
5806 *	NET_RX_DROP: packet was dropped
5807 */
5808int netif_receive_skb(struct sk_buff *skb)
5809{
5810	int ret;
5811
5812	trace_netif_receive_skb_entry(skb);
5813
5814	ret = netif_receive_skb_internal(skb);
5815	trace_netif_receive_skb_exit(ret);
5816
5817	return ret;
5818}
5819EXPORT_SYMBOL(netif_receive_skb);
5820
5821/**
5822 *	netif_receive_skb_list - process many receive buffers from network
5823 *	@head: list of skbs to process.
5824 *
5825 *	Since return value of netif_receive_skb() is normally ignored, and
5826 *	wouldn't be meaningful for a list, this function returns void.
5827 *
5828 *	This function may only be called from softirq context and interrupts
5829 *	should be enabled.
5830 */
5831void netif_receive_skb_list(struct list_head *head)
5832{
5833	struct sk_buff *skb;
5834
5835	if (list_empty(head))
5836		return;
5837	if (trace_netif_receive_skb_list_entry_enabled()) {
5838		list_for_each_entry(skb, head, list)
5839			trace_netif_receive_skb_list_entry(skb);
5840	}
5841	netif_receive_skb_list_internal(head);
5842	trace_netif_receive_skb_list_exit(0);
5843}
5844EXPORT_SYMBOL(netif_receive_skb_list);
5845
5846static DEFINE_PER_CPU(struct work_struct, flush_works);
5847
5848/* Network device is going away, flush any packets still pending */
5849static void flush_backlog(struct work_struct *work)
5850{
5851	struct sk_buff *skb, *tmp;
5852	struct softnet_data *sd;
5853
5854	local_bh_disable();
5855	sd = this_cpu_ptr(&softnet_data);
5856
5857	rps_lock_irq_disable(sd);
5858	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5859		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5860			__skb_unlink(skb, &sd->input_pkt_queue);
5861			dev_kfree_skb_irq(skb);
5862			input_queue_head_incr(sd);
5863		}
5864	}
5865	rps_unlock_irq_enable(sd);
5866
5867	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5868		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5869			__skb_unlink(skb, &sd->process_queue);
5870			kfree_skb(skb);
5871			input_queue_head_incr(sd);
5872		}
5873	}
5874	local_bh_enable();
5875}
5876
5877static bool flush_required(int cpu)
5878{
5879#if IS_ENABLED(CONFIG_RPS)
5880	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5881	bool do_flush;
5882
5883	rps_lock_irq_disable(sd);
5884
5885	/* as insertion into process_queue happens with the rps lock held,
5886	 * process_queue access may race only with dequeue
5887	 */
5888	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5889		   !skb_queue_empty_lockless(&sd->process_queue);
5890	rps_unlock_irq_enable(sd);
5891
5892	return do_flush;
5893#endif
5894	/* without RPS we can't safely check input_pkt_queue: during a
5895	 * concurrent remote skb_queue_splice() we can detect as empty both
5896	 * input_pkt_queue and process_queue even if the latter could end-up
5897	 * containing a lot of packets.
5898	 */
5899	return true;
5900}
5901
5902static void flush_all_backlogs(void)
5903{
5904	static cpumask_t flush_cpus;
5905	unsigned int cpu;
5906
5907	/* since we are under rtnl lock protection we can use static data
5908	 * for the cpumask and avoid allocating on stack the possibly
5909	 * large mask
5910	 */
5911	ASSERT_RTNL();
5912
5913	cpus_read_lock();
5914
5915	cpumask_clear(&flush_cpus);
5916	for_each_online_cpu(cpu) {
5917		if (flush_required(cpu)) {
5918			queue_work_on(cpu, system_highpri_wq,
5919				      per_cpu_ptr(&flush_works, cpu));
5920			cpumask_set_cpu(cpu, &flush_cpus);
5921		}
5922	}
5923
5924	/* we can have in flight packet[s] on the cpus we are not flushing,
5925	 * synchronize_net() in unregister_netdevice_many() will take care of
5926	 * them
5927	 */
5928	for_each_cpu(cpu, &flush_cpus)
5929		flush_work(per_cpu_ptr(&flush_works, cpu));
5930
5931	cpus_read_unlock();
5932}
5933
5934static void net_rps_send_ipi(struct softnet_data *remsd)
5935{
5936#ifdef CONFIG_RPS
5937	while (remsd) {
5938		struct softnet_data *next = remsd->rps_ipi_next;
5939
5940		if (cpu_online(remsd->cpu))
5941			smp_call_function_single_async(remsd->cpu, &remsd->csd);
5942		remsd = next;
5943	}
5944#endif
5945}
5946
5947/*
5948 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
5949 * Note: called with local irq disabled, but exits with local irq enabled.
5950 */
5951static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5952{
5953#ifdef CONFIG_RPS
5954	struct softnet_data *remsd = sd->rps_ipi_list;
5955
5956	if (remsd) {
5957		sd->rps_ipi_list = NULL;
5958
5959		local_irq_enable();
5960
5961		/* Send pending IPI's to kick RPS processing on remote cpus. */
5962		net_rps_send_ipi(remsd);
5963	} else
5964#endif
5965		local_irq_enable();
5966}
5967
5968static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5969{
5970#ifdef CONFIG_RPS
5971	return sd->rps_ipi_list != NULL;
5972#else
5973	return false;
5974#endif
5975}
5976
5977static int process_backlog(struct napi_struct *napi, int quota)
5978{
5979	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5980	bool again = true;
5981	int work = 0;
5982
5983	/* Check if we have pending ipi, its better to send them now,
5984	 * not waiting net_rx_action() end.
5985	 */
5986	if (sd_has_rps_ipi_waiting(sd)) {
5987		local_irq_disable();
5988		net_rps_action_and_irq_enable(sd);
5989	}
5990
5991	napi->weight = READ_ONCE(dev_rx_weight);
5992	while (again) {
5993		struct sk_buff *skb;
5994
5995		while ((skb = __skb_dequeue(&sd->process_queue))) {
5996			rcu_read_lock();
5997			__netif_receive_skb(skb);
5998			rcu_read_unlock();
5999			input_queue_head_incr(sd);
6000			if (++work >= quota)
6001				return work;
6002
6003		}
6004
6005		rps_lock_irq_disable(sd);
6006		if (skb_queue_empty(&sd->input_pkt_queue)) {
6007			/*
6008			 * Inline a custom version of __napi_complete().
6009			 * only current cpu owns and manipulates this napi,
6010			 * and NAPI_STATE_SCHED is the only possible flag set
6011			 * on backlog.
6012			 * We can use a plain write instead of clear_bit(),
6013			 * and we dont need an smp_mb() memory barrier.
6014			 */
6015			napi->state = 0;
6016			again = false;
6017		} else {
6018			skb_queue_splice_tail_init(&sd->input_pkt_queue,
6019						   &sd->process_queue);
6020		}
6021		rps_unlock_irq_enable(sd);
6022	}
6023
6024	return work;
6025}
6026
6027/**
6028 * __napi_schedule - schedule for receive
6029 * @n: entry to schedule
6030 *
6031 * The entry's receive function will be scheduled to run.
6032 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6033 */
6034void __napi_schedule(struct napi_struct *n)
6035{
6036	unsigned long flags;
6037
6038	local_irq_save(flags);
6039	____napi_schedule(this_cpu_ptr(&softnet_data), n);
6040	local_irq_restore(flags);
6041}
6042EXPORT_SYMBOL(__napi_schedule);
6043
6044/**
6045 *	napi_schedule_prep - check if napi can be scheduled
6046 *	@n: napi context
6047 *
6048 * Test if NAPI routine is already running, and if not mark
6049 * it as running.  This is used as a condition variable to
6050 * insure only one NAPI poll instance runs.  We also make
6051 * sure there is no pending NAPI disable.
6052 */
6053bool napi_schedule_prep(struct napi_struct *n)
6054{
6055	unsigned long new, val = READ_ONCE(n->state);
6056
6057	do {
6058		if (unlikely(val & NAPIF_STATE_DISABLE))
6059			return false;
6060		new = val | NAPIF_STATE_SCHED;
6061
6062		/* Sets STATE_MISSED bit if STATE_SCHED was already set
6063		 * This was suggested by Alexander Duyck, as compiler
6064		 * emits better code than :
6065		 * if (val & NAPIF_STATE_SCHED)
6066		 *     new |= NAPIF_STATE_MISSED;
6067		 */
6068		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6069						   NAPIF_STATE_MISSED;
6070	} while (!try_cmpxchg(&n->state, &val, new));
6071
6072	return !(val & NAPIF_STATE_SCHED);
6073}
6074EXPORT_SYMBOL(napi_schedule_prep);
6075
6076/**
6077 * __napi_schedule_irqoff - schedule for receive
6078 * @n: entry to schedule
6079 *
6080 * Variant of __napi_schedule() assuming hard irqs are masked.
6081 *
6082 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6083 * because the interrupt disabled assumption might not be true
6084 * due to force-threaded interrupts and spinlock substitution.
6085 */
6086void __napi_schedule_irqoff(struct napi_struct *n)
6087{
6088	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6089		____napi_schedule(this_cpu_ptr(&softnet_data), n);
6090	else
6091		__napi_schedule(n);
6092}
6093EXPORT_SYMBOL(__napi_schedule_irqoff);
6094
6095bool napi_complete_done(struct napi_struct *n, int work_done)
6096{
6097	unsigned long flags, val, new, timeout = 0;
6098	bool ret = true;
6099
6100	/*
6101	 * 1) Don't let napi dequeue from the cpu poll list
6102	 *    just in case its running on a different cpu.
6103	 * 2) If we are busy polling, do nothing here, we have
6104	 *    the guarantee we will be called later.
6105	 */
6106	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6107				 NAPIF_STATE_IN_BUSY_POLL)))
6108		return false;
6109
6110	if (work_done) {
6111		if (n->gro_bitmask)
6112			timeout = READ_ONCE(n->dev->gro_flush_timeout);
6113		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6114	}
6115	if (n->defer_hard_irqs_count > 0) {
6116		n->defer_hard_irqs_count--;
6117		timeout = READ_ONCE(n->dev->gro_flush_timeout);
6118		if (timeout)
6119			ret = false;
6120	}
6121	if (n->gro_bitmask) {
6122		/* When the NAPI instance uses a timeout and keeps postponing
6123		 * it, we need to bound somehow the time packets are kept in
6124		 * the GRO layer
6125		 */
6126		napi_gro_flush(n, !!timeout);
6127	}
6128
6129	gro_normal_list(n);
6130
6131	if (unlikely(!list_empty(&n->poll_list))) {
6132		/* If n->poll_list is not empty, we need to mask irqs */
6133		local_irq_save(flags);
6134		list_del_init(&n->poll_list);
6135		local_irq_restore(flags);
6136	}
6137	WRITE_ONCE(n->list_owner, -1);
6138
6139	val = READ_ONCE(n->state);
6140	do {
6141		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6142
6143		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6144			      NAPIF_STATE_SCHED_THREADED |
6145			      NAPIF_STATE_PREFER_BUSY_POLL);
6146
6147		/* If STATE_MISSED was set, leave STATE_SCHED set,
6148		 * because we will call napi->poll() one more time.
6149		 * This C code was suggested by Alexander Duyck to help gcc.
6150		 */
6151		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6152						    NAPIF_STATE_SCHED;
6153	} while (!try_cmpxchg(&n->state, &val, new));
6154
6155	if (unlikely(val & NAPIF_STATE_MISSED)) {
6156		__napi_schedule(n);
6157		return false;
6158	}
6159
6160	if (timeout)
6161		hrtimer_start(&n->timer, ns_to_ktime(timeout),
6162			      HRTIMER_MODE_REL_PINNED);
6163	return ret;
6164}
6165EXPORT_SYMBOL(napi_complete_done);
6166
6167/* must be called under rcu_read_lock(), as we dont take a reference */
6168static struct napi_struct *napi_by_id(unsigned int napi_id)
6169{
6170	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6171	struct napi_struct *napi;
6172
6173	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6174		if (napi->napi_id == napi_id)
6175			return napi;
6176
6177	return NULL;
6178}
6179
6180#if defined(CONFIG_NET_RX_BUSY_POLL)
6181
6182static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6183{
6184	if (!skip_schedule) {
6185		gro_normal_list(napi);
6186		__napi_schedule(napi);
6187		return;
6188	}
6189
6190	if (napi->gro_bitmask) {
6191		/* flush too old packets
6192		 * If HZ < 1000, flush all packets.
6193		 */
6194		napi_gro_flush(napi, HZ >= 1000);
6195	}
6196
6197	gro_normal_list(napi);
6198	clear_bit(NAPI_STATE_SCHED, &napi->state);
6199}
6200
6201static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
6202			   u16 budget)
6203{
6204	bool skip_schedule = false;
6205	unsigned long timeout;
6206	int rc;
6207
6208	/* Busy polling means there is a high chance device driver hard irq
6209	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6210	 * set in napi_schedule_prep().
6211	 * Since we are about to call napi->poll() once more, we can safely
6212	 * clear NAPI_STATE_MISSED.
6213	 *
6214	 * Note: x86 could use a single "lock and ..." instruction
6215	 * to perform these two clear_bit()
6216	 */
6217	clear_bit(NAPI_STATE_MISSED, &napi->state);
6218	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6219
6220	local_bh_disable();
6221
6222	if (prefer_busy_poll) {
6223		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
6224		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
6225		if (napi->defer_hard_irqs_count && timeout) {
6226			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6227			skip_schedule = true;
6228		}
6229	}
6230
6231	/* All we really want here is to re-enable device interrupts.
6232	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6233	 */
6234	rc = napi->poll(napi, budget);
6235	/* We can't gro_normal_list() here, because napi->poll() might have
6236	 * rearmed the napi (napi_complete_done()) in which case it could
6237	 * already be running on another CPU.
6238	 */
6239	trace_napi_poll(napi, rc, budget);
6240	netpoll_poll_unlock(have_poll_lock);
6241	if (rc == budget)
6242		__busy_poll_stop(napi, skip_schedule);
6243	local_bh_enable();
6244}
6245
6246void napi_busy_loop(unsigned int napi_id,
6247		    bool (*loop_end)(void *, unsigned long),
6248		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6249{
6250	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6251	int (*napi_poll)(struct napi_struct *napi, int budget);
6252	void *have_poll_lock = NULL;
6253	struct napi_struct *napi;
6254
6255restart:
6256	napi_poll = NULL;
6257
6258	rcu_read_lock();
6259
6260	napi = napi_by_id(napi_id);
6261	if (!napi)
6262		goto out;
6263
6264	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6265		preempt_disable();
6266	for (;;) {
6267		int work = 0;
6268
6269		local_bh_disable();
6270		if (!napi_poll) {
6271			unsigned long val = READ_ONCE(napi->state);
6272
6273			/* If multiple threads are competing for this napi,
6274			 * we avoid dirtying napi->state as much as we can.
6275			 */
6276			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6277				   NAPIF_STATE_IN_BUSY_POLL)) {
6278				if (prefer_busy_poll)
6279					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6280				goto count;
6281			}
6282			if (cmpxchg(&napi->state, val,
6283				    val | NAPIF_STATE_IN_BUSY_POLL |
6284					  NAPIF_STATE_SCHED) != val) {
6285				if (prefer_busy_poll)
6286					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6287				goto count;
6288			}
6289			have_poll_lock = netpoll_poll_lock(napi);
6290			napi_poll = napi->poll;
6291		}
6292		work = napi_poll(napi, budget);
6293		trace_napi_poll(napi, work, budget);
6294		gro_normal_list(napi);
6295count:
6296		if (work > 0)
6297			__NET_ADD_STATS(dev_net(napi->dev),
6298					LINUX_MIB_BUSYPOLLRXPACKETS, work);
6299		local_bh_enable();
6300
6301		if (!loop_end || loop_end(loop_end_arg, start_time))
6302			break;
6303
6304		if (unlikely(need_resched())) {
6305			if (napi_poll)
6306				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
6307			if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6308				preempt_enable();
6309			rcu_read_unlock();
6310			cond_resched();
6311			if (loop_end(loop_end_arg, start_time))
6312				return;
6313			goto restart;
6314		}
6315		cpu_relax();
6316	}
6317	if (napi_poll)
6318		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
6319	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6320		preempt_enable();
6321out:
6322	rcu_read_unlock();
6323}
6324EXPORT_SYMBOL(napi_busy_loop);
6325
6326#endif /* CONFIG_NET_RX_BUSY_POLL */
6327
6328static void napi_hash_add(struct napi_struct *napi)
6329{
6330	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6331		return;
6332
6333	spin_lock(&napi_hash_lock);
6334
6335	/* 0..NR_CPUS range is reserved for sender_cpu use */
6336	do {
6337		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6338			napi_gen_id = MIN_NAPI_ID;
6339	} while (napi_by_id(napi_gen_id));
6340	napi->napi_id = napi_gen_id;
6341
6342	hlist_add_head_rcu(&napi->napi_hash_node,
6343			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6344
6345	spin_unlock(&napi_hash_lock);
6346}
6347
6348/* Warning : caller is responsible to make sure rcu grace period
6349 * is respected before freeing memory containing @napi
6350 */
6351static void napi_hash_del(struct napi_struct *napi)
6352{
6353	spin_lock(&napi_hash_lock);
6354
6355	hlist_del_init_rcu(&napi->napi_hash_node);
6356
6357	spin_unlock(&napi_hash_lock);
6358}
6359
6360static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6361{
6362	struct napi_struct *napi;
6363
6364	napi = container_of(timer, struct napi_struct, timer);
6365
6366	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
6367	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6368	 */
6369	if (!napi_disable_pending(napi) &&
6370	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6371		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6372		__napi_schedule_irqoff(napi);
6373	}
6374
6375	return HRTIMER_NORESTART;
6376}
6377
6378static void init_gro_hash(struct napi_struct *napi)
6379{
6380	int i;
6381
6382	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6383		INIT_LIST_HEAD(&napi->gro_hash[i].list);
6384		napi->gro_hash[i].count = 0;
6385	}
6386	napi->gro_bitmask = 0;
6387}
6388
6389int dev_set_threaded(struct net_device *dev, bool threaded)
6390{
6391	struct napi_struct *napi;
6392	int err = 0;
6393
6394	if (dev->threaded == threaded)
6395		return 0;
6396
6397	if (threaded) {
6398		list_for_each_entry(napi, &dev->napi_list, dev_list) {
6399			if (!napi->thread) {
6400				err = napi_kthread_create(napi);
6401				if (err) {
6402					threaded = false;
6403					break;
6404				}
6405			}
6406		}
6407	}
6408
6409	dev->threaded = threaded;
6410
6411	/* Make sure kthread is created before THREADED bit
6412	 * is set.
6413	 */
6414	smp_mb__before_atomic();
6415
6416	/* Setting/unsetting threaded mode on a napi might not immediately
6417	 * take effect, if the current napi instance is actively being
6418	 * polled. In this case, the switch between threaded mode and
6419	 * softirq mode will happen in the next round of napi_schedule().
6420	 * This should not cause hiccups/stalls to the live traffic.
6421	 */
6422	list_for_each_entry(napi, &dev->napi_list, dev_list)
6423		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
6424
6425	return err;
6426}
6427EXPORT_SYMBOL(dev_set_threaded);
6428
6429void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
6430			   int (*poll)(struct napi_struct *, int), int weight)
6431{
6432	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6433		return;
6434
6435	INIT_LIST_HEAD(&napi->poll_list);
6436	INIT_HLIST_NODE(&napi->napi_hash_node);
6437	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6438	napi->timer.function = napi_watchdog;
6439	init_gro_hash(napi);
6440	napi->skb = NULL;
6441	INIT_LIST_HEAD(&napi->rx_list);
6442	napi->rx_count = 0;
6443	napi->poll = poll;
6444	if (weight > NAPI_POLL_WEIGHT)
6445		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6446				weight);
6447	napi->weight = weight;
6448	napi->dev = dev;
6449#ifdef CONFIG_NETPOLL
6450	napi->poll_owner = -1;
6451#endif
6452	napi->list_owner = -1;
6453	set_bit(NAPI_STATE_SCHED, &napi->state);
6454	set_bit(NAPI_STATE_NPSVC, &napi->state);
6455	list_add_rcu(&napi->dev_list, &dev->napi_list);
6456	napi_hash_add(napi);
6457	napi_get_frags_check(napi);
6458	/* Create kthread for this napi if dev->threaded is set.
6459	 * Clear dev->threaded if kthread creation failed so that
6460	 * threaded mode will not be enabled in napi_enable().
6461	 */
6462	if (dev->threaded && napi_kthread_create(napi))
6463		dev->threaded = 0;
6464}
6465EXPORT_SYMBOL(netif_napi_add_weight);
6466
6467void napi_disable(struct napi_struct *n)
6468{
6469	unsigned long val, new;
6470
6471	might_sleep();
6472	set_bit(NAPI_STATE_DISABLE, &n->state);
6473
6474	val = READ_ONCE(n->state);
6475	do {
6476		while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
6477			usleep_range(20, 200);
6478			val = READ_ONCE(n->state);
6479		}
6480
6481		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
6482		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
6483	} while (!try_cmpxchg(&n->state, &val, new));
6484
6485	hrtimer_cancel(&n->timer);
6486
6487	clear_bit(NAPI_STATE_DISABLE, &n->state);
6488}
6489EXPORT_SYMBOL(napi_disable);
6490
6491/**
6492 *	napi_enable - enable NAPI scheduling
6493 *	@n: NAPI context
6494 *
6495 * Resume NAPI from being scheduled on this context.
6496 * Must be paired with napi_disable.
6497 */
6498void napi_enable(struct napi_struct *n)
6499{
6500	unsigned long new, val = READ_ONCE(n->state);
6501
6502	do {
6503		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
6504
6505		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
6506		if (n->dev->threaded && n->thread)
6507			new |= NAPIF_STATE_THREADED;
6508	} while (!try_cmpxchg(&n->state, &val, new));
6509}
6510EXPORT_SYMBOL(napi_enable);
6511
6512static void flush_gro_hash(struct napi_struct *napi)
6513{
6514	int i;
6515
6516	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6517		struct sk_buff *skb, *n;
6518
6519		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6520			kfree_skb(skb);
6521		napi->gro_hash[i].count = 0;
6522	}
6523}
6524
6525/* Must be called in process context */
6526void __netif_napi_del(struct napi_struct *napi)
6527{
6528	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6529		return;
6530
6531	napi_hash_del(napi);
6532	list_del_rcu(&napi->dev_list);
6533	napi_free_frags(napi);
6534
6535	flush_gro_hash(napi);
6536	napi->gro_bitmask = 0;
6537
6538	if (napi->thread) {
6539		kthread_stop(napi->thread);
6540		napi->thread = NULL;
6541	}
6542}
6543EXPORT_SYMBOL(__netif_napi_del);
6544
6545static int __napi_poll(struct napi_struct *n, bool *repoll)
6546{
6547	int work, weight;
6548
6549	weight = n->weight;
6550
6551	/* This NAPI_STATE_SCHED test is for avoiding a race
6552	 * with netpoll's poll_napi().  Only the entity which
6553	 * obtains the lock and sees NAPI_STATE_SCHED set will
6554	 * actually make the ->poll() call.  Therefore we avoid
6555	 * accidentally calling ->poll() when NAPI is not scheduled.
6556	 */
6557	work = 0;
6558	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6559		work = n->poll(n, weight);
6560		trace_napi_poll(n, work, weight);
6561	}
6562
6563	if (unlikely(work > weight))
6564		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6565				n->poll, work, weight);
6566
6567	if (likely(work < weight))
6568		return work;
6569
6570	/* Drivers must not modify the NAPI state if they
6571	 * consume the entire weight.  In such cases this code
6572	 * still "owns" the NAPI instance and therefore can
6573	 * move the instance around on the list at-will.
6574	 */
6575	if (unlikely(napi_disable_pending(n))) {
6576		napi_complete(n);
6577		return work;
6578	}
6579
6580	/* The NAPI context has more processing work, but busy-polling
6581	 * is preferred. Exit early.
6582	 */
6583	if (napi_prefer_busy_poll(n)) {
6584		if (napi_complete_done(n, work)) {
6585			/* If timeout is not set, we need to make sure
6586			 * that the NAPI is re-scheduled.
6587			 */
6588			napi_schedule(n);
6589		}
6590		return work;
6591	}
6592
6593	if (n->gro_bitmask) {
6594		/* flush too old packets
6595		 * If HZ < 1000, flush all packets.
6596		 */
6597		napi_gro_flush(n, HZ >= 1000);
6598	}
6599
6600	gro_normal_list(n);
6601
6602	/* Some drivers may have called napi_schedule
6603	 * prior to exhausting their budget.
6604	 */
6605	if (unlikely(!list_empty(&n->poll_list))) {
6606		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6607			     n->dev ? n->dev->name : "backlog");
6608		return work;
6609	}
6610
6611	*repoll = true;
6612
6613	return work;
6614}
6615
6616static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6617{
6618	bool do_repoll = false;
6619	void *have;
6620	int work;
6621
6622	list_del_init(&n->poll_list);
6623
6624	have = netpoll_poll_lock(n);
6625
6626	work = __napi_poll(n, &do_repoll);
6627
6628	if (do_repoll)
6629		list_add_tail(&n->poll_list, repoll);
6630
6631	netpoll_poll_unlock(have);
6632
6633	return work;
6634}
6635
6636static int napi_thread_wait(struct napi_struct *napi)
6637{
6638	bool woken = false;
6639
6640	set_current_state(TASK_INTERRUPTIBLE);
6641
6642	while (!kthread_should_stop()) {
6643		/* Testing SCHED_THREADED bit here to make sure the current
6644		 * kthread owns this napi and could poll on this napi.
6645		 * Testing SCHED bit is not enough because SCHED bit might be
6646		 * set by some other busy poll thread or by napi_disable().
6647		 */
6648		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
6649			WARN_ON(!list_empty(&napi->poll_list));
6650			__set_current_state(TASK_RUNNING);
6651			return 0;
6652		}
6653
6654		schedule();
6655		/* woken being true indicates this thread owns this napi. */
6656		woken = true;
6657		set_current_state(TASK_INTERRUPTIBLE);
6658	}
6659	__set_current_state(TASK_RUNNING);
6660
6661	return -1;
6662}
6663
6664static void skb_defer_free_flush(struct softnet_data *sd)
6665{
6666	struct sk_buff *skb, *next;
6667
6668	/* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
6669	if (!READ_ONCE(sd->defer_list))
6670		return;
6671
6672	spin_lock(&sd->defer_lock);
6673	skb = sd->defer_list;
6674	sd->defer_list = NULL;
6675	sd->defer_count = 0;
6676	spin_unlock(&sd->defer_lock);
6677
6678	while (skb != NULL) {
6679		next = skb->next;
6680		napi_consume_skb(skb, 1);
6681		skb = next;
6682	}
6683}
6684
6685static int napi_threaded_poll(void *data)
6686{
6687	struct napi_struct *napi = data;
6688	struct softnet_data *sd;
6689	void *have;
6690
6691	while (!napi_thread_wait(napi)) {
6692		unsigned long last_qs = jiffies;
6693
6694		for (;;) {
6695			bool repoll = false;
6696
6697			local_bh_disable();
6698			sd = this_cpu_ptr(&softnet_data);
6699			sd->in_napi_threaded_poll = true;
6700
6701			have = netpoll_poll_lock(napi);
6702			__napi_poll(napi, &repoll);
6703			netpoll_poll_unlock(have);
6704
6705			sd->in_napi_threaded_poll = false;
6706			barrier();
6707
6708			if (sd_has_rps_ipi_waiting(sd)) {
6709				local_irq_disable();
6710				net_rps_action_and_irq_enable(sd);
6711			}
6712			skb_defer_free_flush(sd);
6713			local_bh_enable();
6714
6715			if (!repoll)
6716				break;
6717
6718			rcu_softirq_qs_periodic(last_qs);
6719			cond_resched();
6720		}
6721	}
6722	return 0;
6723}
6724
6725static __latent_entropy void net_rx_action(struct softirq_action *h)
6726{
6727	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6728	unsigned long time_limit = jiffies +
6729		usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
6730	int budget = READ_ONCE(netdev_budget);
6731	LIST_HEAD(list);
6732	LIST_HEAD(repoll);
6733
6734start:
6735	sd->in_net_rx_action = true;
6736	local_irq_disable();
6737	list_splice_init(&sd->poll_list, &list);
6738	local_irq_enable();
6739
6740	for (;;) {
6741		struct napi_struct *n;
6742
6743		skb_defer_free_flush(sd);
6744
6745		if (list_empty(&list)) {
6746			if (list_empty(&repoll)) {
6747				sd->in_net_rx_action = false;
6748				barrier();
6749				/* We need to check if ____napi_schedule()
6750				 * had refilled poll_list while
6751				 * sd->in_net_rx_action was true.
6752				 */
6753				if (!list_empty(&sd->poll_list))
6754					goto start;
6755				if (!sd_has_rps_ipi_waiting(sd))
6756					goto end;
6757			}
6758			break;
6759		}
6760
6761		n = list_first_entry(&list, struct napi_struct, poll_list);
6762		budget -= napi_poll(n, &repoll);
6763
6764		/* If softirq window is exhausted then punt.
6765		 * Allow this to run for 2 jiffies since which will allow
6766		 * an average latency of 1.5/HZ.
6767		 */
6768		if (unlikely(budget <= 0 ||
6769			     time_after_eq(jiffies, time_limit))) {
6770			sd->time_squeeze++;
6771			break;
6772		}
6773	}
6774
6775	local_irq_disable();
6776
6777	list_splice_tail_init(&sd->poll_list, &list);
6778	list_splice_tail(&repoll, &list);
6779	list_splice(&list, &sd->poll_list);
6780	if (!list_empty(&sd->poll_list))
6781		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
6782	else
6783		sd->in_net_rx_action = false;
6784
6785	net_rps_action_and_irq_enable(sd);
6786end:;
6787}
6788
6789struct netdev_adjacent {
6790	struct net_device *dev;
6791	netdevice_tracker dev_tracker;
6792
6793	/* upper master flag, there can only be one master device per list */
6794	bool master;
6795
6796	/* lookup ignore flag */
6797	bool ignore;
6798
6799	/* counter for the number of times this device was added to us */
6800	u16 ref_nr;
6801
6802	/* private field for the users */
6803	void *private;
6804
6805	struct list_head list;
6806	struct rcu_head rcu;
6807};
6808
6809static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6810						 struct list_head *adj_list)
6811{
6812	struct netdev_adjacent *adj;
6813
6814	list_for_each_entry(adj, adj_list, list) {
6815		if (adj->dev == adj_dev)
6816			return adj;
6817	}
6818	return NULL;
6819}
6820
6821static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6822				    struct netdev_nested_priv *priv)
6823{
6824	struct net_device *dev = (struct net_device *)priv->data;
6825
6826	return upper_dev == dev;
6827}
6828
6829/**
6830 * netdev_has_upper_dev - Check if device is linked to an upper device
6831 * @dev: device
6832 * @upper_dev: upper device to check
6833 *
6834 * Find out if a device is linked to specified upper device and return true
6835 * in case it is. Note that this checks only immediate upper device,
6836 * not through a complete stack of devices. The caller must hold the RTNL lock.
6837 */
6838bool netdev_has_upper_dev(struct net_device *dev,
6839			  struct net_device *upper_dev)
6840{
6841	struct netdev_nested_priv priv = {
6842		.data = (void *)upper_dev,
6843	};
6844
6845	ASSERT_RTNL();
6846
6847	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6848					     &priv);
6849}
6850EXPORT_SYMBOL(netdev_has_upper_dev);
6851
6852/**
6853 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6854 * @dev: device
6855 * @upper_dev: upper device to check
6856 *
6857 * Find out if a device is linked to specified upper device and return true
6858 * in case it is. Note that this checks the entire upper device chain.
6859 * The caller must hold rcu lock.
6860 */
6861
6862bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6863				  struct net_device *upper_dev)
6864{
6865	struct netdev_nested_priv priv = {
6866		.data = (void *)upper_dev,
6867	};
6868
6869	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6870					       &priv);
6871}
6872EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6873
6874/**
6875 * netdev_has_any_upper_dev - Check if device is linked to some device
6876 * @dev: device
6877 *
6878 * Find out if a device is linked to an upper device and return true in case
6879 * it is. The caller must hold the RTNL lock.
6880 */
6881bool netdev_has_any_upper_dev(struct net_device *dev)
6882{
6883	ASSERT_RTNL();
6884
6885	return !list_empty(&dev->adj_list.upper);
6886}
6887EXPORT_SYMBOL(netdev_has_any_upper_dev);
6888
6889/**
6890 * netdev_master_upper_dev_get - Get master upper device
6891 * @dev: device
6892 *
6893 * Find a master upper device and return pointer to it or NULL in case
6894 * it's not there. The caller must hold the RTNL lock.
6895 */
6896struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6897{
6898	struct netdev_adjacent *upper;
6899
6900	ASSERT_RTNL();
6901
6902	if (list_empty(&dev->adj_list.upper))
6903		return NULL;
6904
6905	upper = list_first_entry(&dev->adj_list.upper,
6906				 struct netdev_adjacent, list);
6907	if (likely(upper->master))
6908		return upper->dev;
6909	return NULL;
6910}
6911EXPORT_SYMBOL(netdev_master_upper_dev_get);
6912
6913static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6914{
6915	struct netdev_adjacent *upper;
6916
6917	ASSERT_RTNL();
6918
6919	if (list_empty(&dev->adj_list.upper))
6920		return NULL;
6921
6922	upper = list_first_entry(&dev->adj_list.upper,
6923				 struct netdev_adjacent, list);
6924	if (likely(upper->master) && !upper->ignore)
6925		return upper->dev;
6926	return NULL;
6927}
6928
6929/**
6930 * netdev_has_any_lower_dev - Check if device is linked to some device
6931 * @dev: device
6932 *
6933 * Find out if a device is linked to a lower device and return true in case
6934 * it is. The caller must hold the RTNL lock.
6935 */
6936static bool netdev_has_any_lower_dev(struct net_device *dev)
6937{
6938	ASSERT_RTNL();
6939
6940	return !list_empty(&dev->adj_list.lower);
6941}
6942
6943void *netdev_adjacent_get_private(struct list_head *adj_list)
6944{
6945	struct netdev_adjacent *adj;
6946
6947	adj = list_entry(adj_list, struct netdev_adjacent, list);
6948
6949	return adj->private;
6950}
6951EXPORT_SYMBOL(netdev_adjacent_get_private);
6952
6953/**
6954 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6955 * @dev: device
6956 * @iter: list_head ** of the current position
6957 *
6958 * Gets the next device from the dev's upper list, starting from iter
6959 * position. The caller must hold RCU read lock.
6960 */
6961struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6962						 struct list_head **iter)
6963{
6964	struct netdev_adjacent *upper;
6965
6966	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6967
6968	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6969
6970	if (&upper->list == &dev->adj_list.upper)
6971		return NULL;
6972
6973	*iter = &upper->list;
6974
6975	return upper->dev;
6976}
6977EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6978
6979static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
6980						  struct list_head **iter,
6981						  bool *ignore)
6982{
6983	struct netdev_adjacent *upper;
6984
6985	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
6986
6987	if (&upper->list == &dev->adj_list.upper)
6988		return NULL;
6989
6990	*iter = &upper->list;
6991	*ignore = upper->ignore;
6992
6993	return upper->dev;
6994}
6995
6996static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6997						    struct list_head **iter)
6998{
6999	struct netdev_adjacent *upper;
7000
7001	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7002
7003	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7004
7005	if (&upper->list == &dev->adj_list.upper)
7006		return NULL;
7007
7008	*iter = &upper->list;
7009
7010	return upper->dev;
7011}
7012
7013static int __netdev_walk_all_upper_dev(struct net_device *dev,
7014				       int (*fn)(struct net_device *dev,
7015					 struct netdev_nested_priv *priv),
7016				       struct netdev_nested_priv *priv)
7017{
7018	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7019	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7020	int ret, cur = 0;
7021	bool ignore;
7022
7023	now = dev;
7024	iter = &dev->adj_list.upper;
7025
7026	while (1) {
7027		if (now != dev) {
7028			ret = fn(now, priv);
7029			if (ret)
7030				return ret;
7031		}
7032
7033		next = NULL;
7034		while (1) {
7035			udev = __netdev_next_upper_dev(now, &iter, &ignore);
7036			if (!udev)
7037				break;
7038			if (ignore)
7039				continue;
7040
7041			next = udev;
7042			niter = &udev->adj_list.upper;
7043			dev_stack[cur] = now;
7044			iter_stack[cur++] = iter;
7045			break;
7046		}
7047
7048		if (!next) {
7049			if (!cur)
7050				return 0;
7051			next = dev_stack[--cur];
7052			niter = iter_stack[cur];
7053		}
7054
7055		now = next;
7056		iter = niter;
7057	}
7058
7059	return 0;
7060}
7061
7062int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7063				  int (*fn)(struct net_device *dev,
7064					    struct netdev_nested_priv *priv),
7065				  struct netdev_nested_priv *priv)
7066{
7067	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7068	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7069	int ret, cur = 0;
7070
7071	now = dev;
7072	iter = &dev->adj_list.upper;
7073
7074	while (1) {
7075		if (now != dev) {
7076			ret = fn(now, priv);
7077			if (ret)
7078				return ret;
7079		}
7080
7081		next = NULL;
7082		while (1) {
7083			udev = netdev_next_upper_dev_rcu(now, &iter);
7084			if (!udev)
7085				break;
7086
7087			next = udev;
7088			niter = &udev->adj_list.upper;
7089			dev_stack[cur] = now;
7090			iter_stack[cur++] = iter;
7091			break;
7092		}
7093
7094		if (!next) {
7095			if (!cur)
7096				return 0;
7097			next = dev_stack[--cur];
7098			niter = iter_stack[cur];
7099		}
7100
7101		now = next;
7102		iter = niter;
7103	}
7104
7105	return 0;
7106}
7107EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
7108
7109static bool __netdev_has_upper_dev(struct net_device *dev,
7110				   struct net_device *upper_dev)
7111{
7112	struct netdev_nested_priv priv = {
7113		.flags = 0,
7114		.data = (void *)upper_dev,
7115	};
7116
7117	ASSERT_RTNL();
7118
7119	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7120					   &priv);
7121}
7122
7123/**
7124 * netdev_lower_get_next_private - Get the next ->private from the
7125 *				   lower neighbour list
7126 * @dev: device
7127 * @iter: list_head ** of the current position
7128 *
7129 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7130 * list, starting from iter position. The caller must hold either hold the
7131 * RTNL lock or its own locking that guarantees that the neighbour lower
7132 * list will remain unchanged.
7133 */
7134void *netdev_lower_get_next_private(struct net_device *dev,
7135				    struct list_head **iter)
7136{
7137	struct netdev_adjacent *lower;
7138
7139	lower = list_entry(*iter, struct netdev_adjacent, list);
7140
7141	if (&lower->list == &dev->adj_list.lower)
7142		return NULL;
7143
7144	*iter = lower->list.next;
7145
7146	return lower->private;
7147}
7148EXPORT_SYMBOL(netdev_lower_get_next_private);
7149
7150/**
7151 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7152 *				       lower neighbour list, RCU
7153 *				       variant
7154 * @dev: device
7155 * @iter: list_head ** of the current position
7156 *
7157 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7158 * list, starting from iter position. The caller must hold RCU read lock.
7159 */
7160void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7161					struct list_head **iter)
7162{
7163	struct netdev_adjacent *lower;
7164
7165	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
7166
7167	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7168
7169	if (&lower->list == &dev->adj_list.lower)
7170		return NULL;
7171
7172	*iter = &lower->list;
7173
7174	return lower->private;
7175}
7176EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7177
7178/**
7179 * netdev_lower_get_next - Get the next device from the lower neighbour
7180 *                         list
7181 * @dev: device
7182 * @iter: list_head ** of the current position
7183 *
7184 * Gets the next netdev_adjacent from the dev's lower neighbour
7185 * list, starting from iter position. The caller must hold RTNL lock or
7186 * its own locking that guarantees that the neighbour lower
7187 * list will remain unchanged.
7188 */
7189void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7190{
7191	struct netdev_adjacent *lower;
7192
7193	lower = list_entry(*iter, struct netdev_adjacent, list);
7194
7195	if (&lower->list == &dev->adj_list.lower)
7196		return NULL;
7197
7198	*iter = lower->list.next;
7199
7200	return lower->dev;
7201}
7202EXPORT_SYMBOL(netdev_lower_get_next);
7203
7204static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7205						struct list_head **iter)
7206{
7207	struct netdev_adjacent *lower;
7208
7209	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7210
7211	if (&lower->list == &dev->adj_list.lower)
7212		return NULL;
7213
7214	*iter = &lower->list;
7215
7216	return lower->dev;
7217}
7218
7219static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7220						  struct list_head **iter,
7221						  bool *ignore)
7222{
7223	struct netdev_adjacent *lower;
7224
7225	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7226
7227	if (&lower->list == &dev->adj_list.lower)
7228		return NULL;
7229
7230	*iter = &lower->list;
7231	*ignore = lower->ignore;
7232
7233	return lower->dev;
7234}
7235
7236int netdev_walk_all_lower_dev(struct net_device *dev,
7237			      int (*fn)(struct net_device *dev,
7238					struct netdev_nested_priv *priv),
7239			      struct netdev_nested_priv *priv)
7240{
7241	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7242	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7243	int ret, cur = 0;
7244
7245	now = dev;
7246	iter = &dev->adj_list.lower;
7247
7248	while (1) {
7249		if (now != dev) {
7250			ret = fn(now, priv);
7251			if (ret)
7252				return ret;
7253		}
7254
7255		next = NULL;
7256		while (1) {
7257			ldev = netdev_next_lower_dev(now, &iter);
7258			if (!ldev)
7259				break;
7260
7261			next = ldev;
7262			niter = &ldev->adj_list.lower;
7263			dev_stack[cur] = now;
7264			iter_stack[cur++] = iter;
7265			break;
7266		}
7267
7268		if (!next) {
7269			if (!cur)
7270				return 0;
7271			next = dev_stack[--cur];
7272			niter = iter_stack[cur];
7273		}
7274
7275		now = next;
7276		iter = niter;
7277	}
7278
7279	return 0;
7280}
7281EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7282
7283static int __netdev_walk_all_lower_dev(struct net_device *dev,
7284				       int (*fn)(struct net_device *dev,
7285					 struct netdev_nested_priv *priv),
7286				       struct netdev_nested_priv *priv)
7287{
7288	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7289	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7290	int ret, cur = 0;
7291	bool ignore;
7292
7293	now = dev;
7294	iter = &dev->adj_list.lower;
7295
7296	while (1) {
7297		if (now != dev) {
7298			ret = fn(now, priv);
7299			if (ret)
7300				return ret;
7301		}
7302
7303		next = NULL;
7304		while (1) {
7305			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7306			if (!ldev)
7307				break;
7308			if (ignore)
7309				continue;
7310
7311			next = ldev;
7312			niter = &ldev->adj_list.lower;
7313			dev_stack[cur] = now;
7314			iter_stack[cur++] = iter;
7315			break;
7316		}
7317
7318		if (!next) {
7319			if (!cur)
7320				return 0;
7321			next = dev_stack[--cur];
7322			niter = iter_stack[cur];
7323		}
7324
7325		now = next;
7326		iter = niter;
7327	}
7328
7329	return 0;
7330}
7331
7332struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7333					     struct list_head **iter)
7334{
7335	struct netdev_adjacent *lower;
7336
7337	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7338	if (&lower->list == &dev->adj_list.lower)
7339		return NULL;
7340
7341	*iter = &lower->list;
7342
7343	return lower->dev;
7344}
7345EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7346
7347static u8 __netdev_upper_depth(struct net_device *dev)
7348{
7349	struct net_device *udev;
7350	struct list_head *iter;
7351	u8 max_depth = 0;
7352	bool ignore;
7353
7354	for (iter = &dev->adj_list.upper,
7355	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7356	     udev;
7357	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7358		if (ignore)
7359			continue;
7360		if (max_depth < udev->upper_level)
7361			max_depth = udev->upper_level;
7362	}
7363
7364	return max_depth;
7365}
7366
7367static u8 __netdev_lower_depth(struct net_device *dev)
7368{
7369	struct net_device *ldev;
7370	struct list_head *iter;
7371	u8 max_depth = 0;
7372	bool ignore;
7373
7374	for (iter = &dev->adj_list.lower,
7375	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7376	     ldev;
7377	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7378		if (ignore)
7379			continue;
7380		if (max_depth < ldev->lower_level)
7381			max_depth = ldev->lower_level;
7382	}
7383
7384	return max_depth;
7385}
7386
7387static int __netdev_update_upper_level(struct net_device *dev,
7388				       struct netdev_nested_priv *__unused)
7389{
7390	dev->upper_level = __netdev_upper_depth(dev) + 1;
7391	return 0;
7392}
7393
7394#ifdef CONFIG_LOCKDEP
7395static LIST_HEAD(net_unlink_list);
7396
7397static void net_unlink_todo(struct net_device *dev)
7398{
7399	if (list_empty(&dev->unlink_list))
7400		list_add_tail(&dev->unlink_list, &net_unlink_list);
7401}
7402#endif
7403
7404static int __netdev_update_lower_level(struct net_device *dev,
7405				       struct netdev_nested_priv *priv)
7406{
7407	dev->lower_level = __netdev_lower_depth(dev) + 1;
7408
7409#ifdef CONFIG_LOCKDEP
7410	if (!priv)
7411		return 0;
7412
7413	if (priv->flags & NESTED_SYNC_IMM)
7414		dev->nested_level = dev->lower_level - 1;
7415	if (priv->flags & NESTED_SYNC_TODO)
7416		net_unlink_todo(dev);
7417#endif
7418	return 0;
7419}
7420
7421int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7422				  int (*fn)(struct net_device *dev,
7423					    struct netdev_nested_priv *priv),
7424				  struct netdev_nested_priv *priv)
7425{
7426	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7427	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7428	int ret, cur = 0;
7429
7430	now = dev;
7431	iter = &dev->adj_list.lower;
7432
7433	while (1) {
7434		if (now != dev) {
7435			ret = fn(now, priv);
7436			if (ret)
7437				return ret;
7438		}
7439
7440		next = NULL;
7441		while (1) {
7442			ldev = netdev_next_lower_dev_rcu(now, &iter);
7443			if (!ldev)
7444				break;
7445
7446			next = ldev;
7447			niter = &ldev->adj_list.lower;
7448			dev_stack[cur] = now;
7449			iter_stack[cur++] = iter;
7450			break;
7451		}
7452
7453		if (!next) {
7454			if (!cur)
7455				return 0;
7456			next = dev_stack[--cur];
7457			niter = iter_stack[cur];
7458		}
7459
7460		now = next;
7461		iter = niter;
7462	}
7463
7464	return 0;
7465}
7466EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7467
7468/**
7469 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7470 *				       lower neighbour list, RCU
7471 *				       variant
7472 * @dev: device
7473 *
7474 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7475 * list. The caller must hold RCU read lock.
7476 */
7477void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7478{
7479	struct netdev_adjacent *lower;
7480
7481	lower = list_first_or_null_rcu(&dev->adj_list.lower,
7482			struct netdev_adjacent, list);
7483	if (lower)
7484		return lower->private;
7485	return NULL;
7486}
7487EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7488
7489/**
7490 * netdev_master_upper_dev_get_rcu - Get master upper device
7491 * @dev: device
7492 *
7493 * Find a master upper device and return pointer to it or NULL in case
7494 * it's not there. The caller must hold the RCU read lock.
7495 */
7496struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7497{
7498	struct netdev_adjacent *upper;
7499
7500	upper = list_first_or_null_rcu(&dev->adj_list.upper,
7501				       struct netdev_adjacent, list);
7502	if (upper && likely(upper->master))
7503		return upper->dev;
7504	return NULL;
7505}
7506EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7507
7508static int netdev_adjacent_sysfs_add(struct net_device *dev,
7509			      struct net_device *adj_dev,
7510			      struct list_head *dev_list)
7511{
7512	char linkname[IFNAMSIZ+7];
7513
7514	sprintf(linkname, dev_list == &dev->adj_list.upper ?
7515		"upper_%s" : "lower_%s", adj_dev->name);
7516	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7517				 linkname);
7518}
7519static void netdev_adjacent_sysfs_del(struct net_device *dev,
7520			       char *name,
7521			       struct list_head *dev_list)
7522{
7523	char linkname[IFNAMSIZ+7];
7524
7525	sprintf(linkname, dev_list == &dev->adj_list.upper ?
7526		"upper_%s" : "lower_%s", name);
7527	sysfs_remove_link(&(dev->dev.kobj), linkname);
7528}
7529
7530static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7531						 struct net_device *adj_dev,
7532						 struct list_head *dev_list)
7533{
7534	return (dev_list == &dev->adj_list.upper ||
7535		dev_list == &dev->adj_list.lower) &&
7536		net_eq(dev_net(dev), dev_net(adj_dev));
7537}
7538
7539static int __netdev_adjacent_dev_insert(struct net_device *dev,
7540					struct net_device *adj_dev,
7541					struct list_head *dev_list,
7542					void *private, bool master)
7543{
7544	struct netdev_adjacent *adj;
7545	int ret;
7546
7547	adj = __netdev_find_adj(adj_dev, dev_list);
7548
7549	if (adj) {
7550		adj->ref_nr += 1;
7551		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7552			 dev->name, adj_dev->name, adj->ref_nr);
7553
7554		return 0;
7555	}
7556
7557	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7558	if (!adj)
7559		return -ENOMEM;
7560
7561	adj->dev = adj_dev;
7562	adj->master = master;
7563	adj->ref_nr = 1;
7564	adj->private = private;
7565	adj->ignore = false;
7566	netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
7567
7568	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7569		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7570
7571	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7572		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7573		if (ret)
7574			goto free_adj;
7575	}
7576
7577	/* Ensure that master link is always the first item in list. */
7578	if (master) {
7579		ret = sysfs_create_link(&(dev->dev.kobj),
7580					&(adj_dev->dev.kobj), "master");
7581		if (ret)
7582			goto remove_symlinks;
7583
7584		list_add_rcu(&adj->list, dev_list);
7585	} else {
7586		list_add_tail_rcu(&adj->list, dev_list);
7587	}
7588
7589	return 0;
7590
7591remove_symlinks:
7592	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7593		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7594free_adj:
7595	netdev_put(adj_dev, &adj->dev_tracker);
7596	kfree(adj);
7597
7598	return ret;
7599}
7600
7601static void __netdev_adjacent_dev_remove(struct net_device *dev,
7602					 struct net_device *adj_dev,
7603					 u16 ref_nr,
7604					 struct list_head *dev_list)
7605{
7606	struct netdev_adjacent *adj;
7607
7608	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7609		 dev->name, adj_dev->name, ref_nr);
7610
7611	adj = __netdev_find_adj(adj_dev, dev_list);
7612
7613	if (!adj) {
7614		pr_err("Adjacency does not exist for device %s from %s\n",
7615		       dev->name, adj_dev->name);
7616		WARN_ON(1);
7617		return;
7618	}
7619
7620	if (adj->ref_nr > ref_nr) {
7621		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7622			 dev->name, adj_dev->name, ref_nr,
7623			 adj->ref_nr - ref_nr);
7624		adj->ref_nr -= ref_nr;
7625		return;
7626	}
7627
7628	if (adj->master)
7629		sysfs_remove_link(&(dev->dev.kobj), "master");
7630
7631	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7632		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7633
7634	list_del_rcu(&adj->list);
7635	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7636		 adj_dev->name, dev->name, adj_dev->name);
7637	netdev_put(adj_dev, &adj->dev_tracker);
7638	kfree_rcu(adj, rcu);
7639}
7640
7641static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7642					    struct net_device *upper_dev,
7643					    struct list_head *up_list,
7644					    struct list_head *down_list,
7645					    void *private, bool master)
7646{
7647	int ret;
7648
7649	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7650					   private, master);
7651	if (ret)
7652		return ret;
7653
7654	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7655					   private, false);
7656	if (ret) {
7657		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7658		return ret;
7659	}
7660
7661	return 0;
7662}
7663
7664static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7665					       struct net_device *upper_dev,
7666					       u16 ref_nr,
7667					       struct list_head *up_list,
7668					       struct list_head *down_list)
7669{
7670	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7671	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7672}
7673
7674static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7675						struct net_device *upper_dev,
7676						void *private, bool master)
7677{
7678	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7679						&dev->adj_list.upper,
7680						&upper_dev->adj_list.lower,
7681						private, master);
7682}
7683
7684static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7685						   struct net_device *upper_dev)
7686{
7687	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7688					   &dev->adj_list.upper,
7689					   &upper_dev->adj_list.lower);
7690}
7691
7692static int __netdev_upper_dev_link(struct net_device *dev,
7693				   struct net_device *upper_dev, bool master,
7694				   void *upper_priv, void *upper_info,
7695				   struct netdev_nested_priv *priv,
7696				   struct netlink_ext_ack *extack)
7697{
7698	struct netdev_notifier_changeupper_info changeupper_info = {
7699		.info = {
7700			.dev = dev,
7701			.extack = extack,
7702		},
7703		.upper_dev = upper_dev,
7704		.master = master,
7705		.linking = true,
7706		.upper_info = upper_info,
7707	};
7708	struct net_device *master_dev;
7709	int ret = 0;
7710
7711	ASSERT_RTNL();
7712
7713	if (dev == upper_dev)
7714		return -EBUSY;
7715
7716	/* To prevent loops, check if dev is not upper device to upper_dev. */
7717	if (__netdev_has_upper_dev(upper_dev, dev))
7718		return -EBUSY;
7719
7720	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7721		return -EMLINK;
7722
7723	if (!master) {
7724		if (__netdev_has_upper_dev(dev, upper_dev))
7725			return -EEXIST;
7726	} else {
7727		master_dev = __netdev_master_upper_dev_get(dev);
7728		if (master_dev)
7729			return master_dev == upper_dev ? -EEXIST : -EBUSY;
7730	}
7731
7732	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7733					    &changeupper_info.info);
7734	ret = notifier_to_errno(ret);
7735	if (ret)
7736		return ret;
7737
7738	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7739						   master);
7740	if (ret)
7741		return ret;
7742
7743	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7744					    &changeupper_info.info);
7745	ret = notifier_to_errno(ret);
7746	if (ret)
7747		goto rollback;
7748
7749	__netdev_update_upper_level(dev, NULL);
7750	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7751
7752	__netdev_update_lower_level(upper_dev, priv);
7753	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7754				    priv);
7755
7756	return 0;
7757
7758rollback:
7759	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7760
7761	return ret;
7762}
7763
7764/**
7765 * netdev_upper_dev_link - Add a link to the upper device
7766 * @dev: device
7767 * @upper_dev: new upper device
7768 * @extack: netlink extended ack
7769 *
7770 * Adds a link to device which is upper to this one. The caller must hold
7771 * the RTNL lock. On a failure a negative errno code is returned.
7772 * On success the reference counts are adjusted and the function
7773 * returns zero.
7774 */
7775int netdev_upper_dev_link(struct net_device *dev,
7776			  struct net_device *upper_dev,
7777			  struct netlink_ext_ack *extack)
7778{
7779	struct netdev_nested_priv priv = {
7780		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7781		.data = NULL,
7782	};
7783
7784	return __netdev_upper_dev_link(dev, upper_dev, false,
7785				       NULL, NULL, &priv, extack);
7786}
7787EXPORT_SYMBOL(netdev_upper_dev_link);
7788
7789/**
7790 * netdev_master_upper_dev_link - Add a master link to the upper device
7791 * @dev: device
7792 * @upper_dev: new upper device
7793 * @upper_priv: upper device private
7794 * @upper_info: upper info to be passed down via notifier
7795 * @extack: netlink extended ack
7796 *
7797 * Adds a link to device which is upper to this one. In this case, only
7798 * one master upper device can be linked, although other non-master devices
7799 * might be linked as well. The caller must hold the RTNL lock.
7800 * On a failure a negative errno code is returned. On success the reference
7801 * counts are adjusted and the function returns zero.
7802 */
7803int netdev_master_upper_dev_link(struct net_device *dev,
7804				 struct net_device *upper_dev,
7805				 void *upper_priv, void *upper_info,
7806				 struct netlink_ext_ack *extack)
7807{
7808	struct netdev_nested_priv priv = {
7809		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7810		.data = NULL,
7811	};
7812
7813	return __netdev_upper_dev_link(dev, upper_dev, true,
7814				       upper_priv, upper_info, &priv, extack);
7815}
7816EXPORT_SYMBOL(netdev_master_upper_dev_link);
7817
7818static void __netdev_upper_dev_unlink(struct net_device *dev,
7819				      struct net_device *upper_dev,
7820				      struct netdev_nested_priv *priv)
7821{
7822	struct netdev_notifier_changeupper_info changeupper_info = {
7823		.info = {
7824			.dev = dev,
7825		},
7826		.upper_dev = upper_dev,
7827		.linking = false,
7828	};
7829
7830	ASSERT_RTNL();
7831
7832	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7833
7834	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7835				      &changeupper_info.info);
7836
7837	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7838
7839	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7840				      &changeupper_info.info);
7841
7842	__netdev_update_upper_level(dev, NULL);
7843	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7844
7845	__netdev_update_lower_level(upper_dev, priv);
7846	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7847				    priv);
7848}
7849
7850/**
7851 * netdev_upper_dev_unlink - Removes a link to upper device
7852 * @dev: device
7853 * @upper_dev: new upper device
7854 *
7855 * Removes a link to device which is upper to this one. The caller must hold
7856 * the RTNL lock.
7857 */
7858void netdev_upper_dev_unlink(struct net_device *dev,
7859			     struct net_device *upper_dev)
7860{
7861	struct netdev_nested_priv priv = {
7862		.flags = NESTED_SYNC_TODO,
7863		.data = NULL,
7864	};
7865
7866	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
7867}
7868EXPORT_SYMBOL(netdev_upper_dev_unlink);
7869
7870static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7871				      struct net_device *lower_dev,
7872				      bool val)
7873{
7874	struct netdev_adjacent *adj;
7875
7876	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7877	if (adj)
7878		adj->ignore = val;
7879
7880	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7881	if (adj)
7882		adj->ignore = val;
7883}
7884
7885static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7886					struct net_device *lower_dev)
7887{
7888	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7889}
7890
7891static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7892				       struct net_device *lower_dev)
7893{
7894	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7895}
7896
7897int netdev_adjacent_change_prepare(struct net_device *old_dev,
7898				   struct net_device *new_dev,
7899				   struct net_device *dev,
7900				   struct netlink_ext_ack *extack)
7901{
7902	struct netdev_nested_priv priv = {
7903		.flags = 0,
7904		.data = NULL,
7905	};
7906	int err;
7907
7908	if (!new_dev)
7909		return 0;
7910
7911	if (old_dev && new_dev != old_dev)
7912		netdev_adjacent_dev_disable(dev, old_dev);
7913	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
7914				      extack);
7915	if (err) {
7916		if (old_dev && new_dev != old_dev)
7917			netdev_adjacent_dev_enable(dev, old_dev);
7918		return err;
7919	}
7920
7921	return 0;
7922}
7923EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7924
7925void netdev_adjacent_change_commit(struct net_device *old_dev,
7926				   struct net_device *new_dev,
7927				   struct net_device *dev)
7928{
7929	struct netdev_nested_priv priv = {
7930		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7931		.data = NULL,
7932	};
7933
7934	if (!new_dev || !old_dev)
7935		return;
7936
7937	if (new_dev == old_dev)
7938		return;
7939
7940	netdev_adjacent_dev_enable(dev, old_dev);
7941	__netdev_upper_dev_unlink(old_dev, dev, &priv);
7942}
7943EXPORT_SYMBOL(netdev_adjacent_change_commit);
7944
7945void netdev_adjacent_change_abort(struct net_device *old_dev,
7946				  struct net_device *new_dev,
7947				  struct net_device *dev)
7948{
7949	struct netdev_nested_priv priv = {
7950		.flags = 0,
7951		.data = NULL,
7952	};
7953
7954	if (!new_dev)
7955		return;
7956
7957	if (old_dev && new_dev != old_dev)
7958		netdev_adjacent_dev_enable(dev, old_dev);
7959
7960	__netdev_upper_dev_unlink(new_dev, dev, &priv);
7961}
7962EXPORT_SYMBOL(netdev_adjacent_change_abort);
7963
7964/**
7965 * netdev_bonding_info_change - Dispatch event about slave change
7966 * @dev: device
7967 * @bonding_info: info to dispatch
7968 *
7969 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7970 * The caller must hold the RTNL lock.
7971 */
7972void netdev_bonding_info_change(struct net_device *dev,
7973				struct netdev_bonding_info *bonding_info)
7974{
7975	struct netdev_notifier_bonding_info info = {
7976		.info.dev = dev,
7977	};
7978
7979	memcpy(&info.bonding_info, bonding_info,
7980	       sizeof(struct netdev_bonding_info));
7981	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7982				      &info.info);
7983}
7984EXPORT_SYMBOL(netdev_bonding_info_change);
7985
7986static int netdev_offload_xstats_enable_l3(struct net_device *dev,
7987					   struct netlink_ext_ack *extack)
7988{
7989	struct netdev_notifier_offload_xstats_info info = {
7990		.info.dev = dev,
7991		.info.extack = extack,
7992		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
7993	};
7994	int err;
7995	int rc;
7996
7997	dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
7998					 GFP_KERNEL);
7999	if (!dev->offload_xstats_l3)
8000		return -ENOMEM;
8001
8002	rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
8003						  NETDEV_OFFLOAD_XSTATS_DISABLE,
8004						  &info.info);
8005	err = notifier_to_errno(rc);
8006	if (err)
8007		goto free_stats;
8008
8009	return 0;
8010
8011free_stats:
8012	kfree(dev->offload_xstats_l3);
8013	dev->offload_xstats_l3 = NULL;
8014	return err;
8015}
8016
8017int netdev_offload_xstats_enable(struct net_device *dev,
8018				 enum netdev_offload_xstats_type type,
8019				 struct netlink_ext_ack *extack)
8020{
8021	ASSERT_RTNL();
8022
8023	if (netdev_offload_xstats_enabled(dev, type))
8024		return -EALREADY;
8025
8026	switch (type) {
8027	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8028		return netdev_offload_xstats_enable_l3(dev, extack);
8029	}
8030
8031	WARN_ON(1);
8032	return -EINVAL;
8033}
8034EXPORT_SYMBOL(netdev_offload_xstats_enable);
8035
8036static void netdev_offload_xstats_disable_l3(struct net_device *dev)
8037{
8038	struct netdev_notifier_offload_xstats_info info = {
8039		.info.dev = dev,
8040		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8041	};
8042
8043	call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
8044				      &info.info);
8045	kfree(dev->offload_xstats_l3);
8046	dev->offload_xstats_l3 = NULL;
8047}
8048
8049int netdev_offload_xstats_disable(struct net_device *dev,
8050				  enum netdev_offload_xstats_type type)
8051{
8052	ASSERT_RTNL();
8053
8054	if (!netdev_offload_xstats_enabled(dev, type))
8055		return -EALREADY;
8056
8057	switch (type) {
8058	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8059		netdev_offload_xstats_disable_l3(dev);
8060		return 0;
8061	}
8062
8063	WARN_ON(1);
8064	return -EINVAL;
8065}
8066EXPORT_SYMBOL(netdev_offload_xstats_disable);
8067
8068static void netdev_offload_xstats_disable_all(struct net_device *dev)
8069{
8070	netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
8071}
8072
8073static struct rtnl_hw_stats64 *
8074netdev_offload_xstats_get_ptr(const struct net_device *dev,
8075			      enum netdev_offload_xstats_type type)
8076{
8077	switch (type) {
8078	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8079		return dev->offload_xstats_l3;
8080	}
8081
8082	WARN_ON(1);
8083	return NULL;
8084}
8085
8086bool netdev_offload_xstats_enabled(const struct net_device *dev,
8087				   enum netdev_offload_xstats_type type)
8088{
8089	ASSERT_RTNL();
8090
8091	return netdev_offload_xstats_get_ptr(dev, type);
8092}
8093EXPORT_SYMBOL(netdev_offload_xstats_enabled);
8094
8095struct netdev_notifier_offload_xstats_ru {
8096	bool used;
8097};
8098
8099struct netdev_notifier_offload_xstats_rd {
8100	struct rtnl_hw_stats64 stats;
8101	bool used;
8102};
8103
8104static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
8105				  const struct rtnl_hw_stats64 *src)
8106{
8107	dest->rx_packets	  += src->rx_packets;
8108	dest->tx_packets	  += src->tx_packets;
8109	dest->rx_bytes		  += src->rx_bytes;
8110	dest->tx_bytes		  += src->tx_bytes;
8111	dest->rx_errors		  += src->rx_errors;
8112	dest->tx_errors		  += src->tx_errors;
8113	dest->rx_dropped	  += src->rx_dropped;
8114	dest->tx_dropped	  += src->tx_dropped;
8115	dest->multicast		  += src->multicast;
8116}
8117
8118static int netdev_offload_xstats_get_used(struct net_device *dev,
8119					  enum netdev_offload_xstats_type type,
8120					  bool *p_used,
8121					  struct netlink_ext_ack *extack)
8122{
8123	struct netdev_notifier_offload_xstats_ru report_used = {};
8124	struct netdev_notifier_offload_xstats_info info = {
8125		.info.dev = dev,
8126		.info.extack = extack,
8127		.type = type,
8128		.report_used = &report_used,
8129	};
8130	int rc;
8131
8132	WARN_ON(!netdev_offload_xstats_enabled(dev, type));
8133	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
8134					   &info.info);
8135	*p_used = report_used.used;
8136	return notifier_to_errno(rc);
8137}
8138
8139static int netdev_offload_xstats_get_stats(struct net_device *dev,
8140					   enum netdev_offload_xstats_type type,
8141					   struct rtnl_hw_stats64 *p_stats,
8142					   bool *p_used,
8143					   struct netlink_ext_ack *extack)
8144{
8145	struct netdev_notifier_offload_xstats_rd report_delta = {};
8146	struct netdev_notifier_offload_xstats_info info = {
8147		.info.dev = dev,
8148		.info.extack = extack,
8149		.type = type,
8150		.report_delta = &report_delta,
8151	};
8152	struct rtnl_hw_stats64 *stats;
8153	int rc;
8154
8155	stats = netdev_offload_xstats_get_ptr(dev, type);
8156	if (WARN_ON(!stats))
8157		return -EINVAL;
8158
8159	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
8160					   &info.info);
8161
8162	/* Cache whatever we got, even if there was an error, otherwise the
8163	 * successful stats retrievals would get lost.
8164	 */
8165	netdev_hw_stats64_add(stats, &report_delta.stats);
8166
8167	if (p_stats)
8168		*p_stats = *stats;
8169	*p_used = report_delta.used;
8170
8171	return notifier_to_errno(rc);
8172}
8173
8174int netdev_offload_xstats_get(struct net_device *dev,
8175			      enum netdev_offload_xstats_type type,
8176			      struct rtnl_hw_stats64 *p_stats, bool *p_used,
8177			      struct netlink_ext_ack *extack)
8178{
8179	ASSERT_RTNL();
8180
8181	if (p_stats)
8182		return netdev_offload_xstats_get_stats(dev, type, p_stats,
8183						       p_used, extack);
8184	else
8185		return netdev_offload_xstats_get_used(dev, type, p_used,
8186						      extack);
8187}
8188EXPORT_SYMBOL(netdev_offload_xstats_get);
8189
8190void
8191netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
8192				   const struct rtnl_hw_stats64 *stats)
8193{
8194	report_delta->used = true;
8195	netdev_hw_stats64_add(&report_delta->stats, stats);
8196}
8197EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
8198
8199void
8200netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
8201{
8202	report_used->used = true;
8203}
8204EXPORT_SYMBOL(netdev_offload_xstats_report_used);
8205
8206void netdev_offload_xstats_push_delta(struct net_device *dev,
8207				      enum netdev_offload_xstats_type type,
8208				      const struct rtnl_hw_stats64 *p_stats)
8209{
8210	struct rtnl_hw_stats64 *stats;
8211
8212	ASSERT_RTNL();
8213
8214	stats = netdev_offload_xstats_get_ptr(dev, type);
8215	if (WARN_ON(!stats))
8216		return;
8217
8218	netdev_hw_stats64_add(stats, p_stats);
8219}
8220EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
8221
8222/**
8223 * netdev_get_xmit_slave - Get the xmit slave of master device
8224 * @dev: device
8225 * @skb: The packet
8226 * @all_slaves: assume all the slaves are active
8227 *
8228 * The reference counters are not incremented so the caller must be
8229 * careful with locks. The caller must hold RCU lock.
8230 * %NULL is returned if no slave is found.
8231 */
8232
8233struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8234					 struct sk_buff *skb,
8235					 bool all_slaves)
8236{
8237	const struct net_device_ops *ops = dev->netdev_ops;
8238
8239	if (!ops->ndo_get_xmit_slave)
8240		return NULL;
8241	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8242}
8243EXPORT_SYMBOL(netdev_get_xmit_slave);
8244
8245static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8246						  struct sock *sk)
8247{
8248	const struct net_device_ops *ops = dev->netdev_ops;
8249
8250	if (!ops->ndo_sk_get_lower_dev)
8251		return NULL;
8252	return ops->ndo_sk_get_lower_dev(dev, sk);
8253}
8254
8255/**
8256 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8257 * @dev: device
8258 * @sk: the socket
8259 *
8260 * %NULL is returned if no lower device is found.
8261 */
8262
8263struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8264					    struct sock *sk)
8265{
8266	struct net_device *lower;
8267
8268	lower = netdev_sk_get_lower_dev(dev, sk);
8269	while (lower) {
8270		dev = lower;
8271		lower = netdev_sk_get_lower_dev(dev, sk);
8272	}
8273
8274	return dev;
8275}
8276EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8277
8278static void netdev_adjacent_add_links(struct net_device *dev)
8279{
8280	struct netdev_adjacent *iter;
8281
8282	struct net *net = dev_net(dev);
8283
8284	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8285		if (!net_eq(net, dev_net(iter->dev)))
8286			continue;
8287		netdev_adjacent_sysfs_add(iter->dev, dev,
8288					  &iter->dev->adj_list.lower);
8289		netdev_adjacent_sysfs_add(dev, iter->dev,
8290					  &dev->adj_list.upper);
8291	}
8292
8293	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8294		if (!net_eq(net, dev_net(iter->dev)))
8295			continue;
8296		netdev_adjacent_sysfs_add(iter->dev, dev,
8297					  &iter->dev->adj_list.upper);
8298		netdev_adjacent_sysfs_add(dev, iter->dev,
8299					  &dev->adj_list.lower);
8300	}
8301}
8302
8303static void netdev_adjacent_del_links(struct net_device *dev)
8304{
8305	struct netdev_adjacent *iter;
8306
8307	struct net *net = dev_net(dev);
8308
8309	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8310		if (!net_eq(net, dev_net(iter->dev)))
8311			continue;
8312		netdev_adjacent_sysfs_del(iter->dev, dev->name,
8313					  &iter->dev->adj_list.lower);
8314		netdev_adjacent_sysfs_del(dev, iter->dev->name,
8315					  &dev->adj_list.upper);
8316	}
8317
8318	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8319		if (!net_eq(net, dev_net(iter->dev)))
8320			continue;
8321		netdev_adjacent_sysfs_del(iter->dev, dev->name,
8322					  &iter->dev->adj_list.upper);
8323		netdev_adjacent_sysfs_del(dev, iter->dev->name,
8324					  &dev->adj_list.lower);
8325	}
8326}
8327
8328void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8329{
8330	struct netdev_adjacent *iter;
8331
8332	struct net *net = dev_net(dev);
8333
8334	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8335		if (!net_eq(net, dev_net(iter->dev)))
8336			continue;
8337		netdev_adjacent_sysfs_del(iter->dev, oldname,
8338					  &iter->dev->adj_list.lower);
8339		netdev_adjacent_sysfs_add(iter->dev, dev,
8340					  &iter->dev->adj_list.lower);
8341	}
8342
8343	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8344		if (!net_eq(net, dev_net(iter->dev)))
8345			continue;
8346		netdev_adjacent_sysfs_del(iter->dev, oldname,
8347					  &iter->dev->adj_list.upper);
8348		netdev_adjacent_sysfs_add(iter->dev, dev,
8349					  &iter->dev->adj_list.upper);
8350	}
8351}
8352
8353void *netdev_lower_dev_get_private(struct net_device *dev,
8354				   struct net_device *lower_dev)
8355{
8356	struct netdev_adjacent *lower;
8357
8358	if (!lower_dev)
8359		return NULL;
8360	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8361	if (!lower)
8362		return NULL;
8363
8364	return lower->private;
8365}
8366EXPORT_SYMBOL(netdev_lower_dev_get_private);
8367
8368
8369/**
8370 * netdev_lower_state_changed - Dispatch event about lower device state change
8371 * @lower_dev: device
8372 * @lower_state_info: state to dispatch
8373 *
8374 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8375 * The caller must hold the RTNL lock.
8376 */
8377void netdev_lower_state_changed(struct net_device *lower_dev,
8378				void *lower_state_info)
8379{
8380	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8381		.info.dev = lower_dev,
8382	};
8383
8384	ASSERT_RTNL();
8385	changelowerstate_info.lower_state_info = lower_state_info;
8386	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8387				      &changelowerstate_info.info);
8388}
8389EXPORT_SYMBOL(netdev_lower_state_changed);
8390
8391static void dev_change_rx_flags(struct net_device *dev, int flags)
8392{
8393	const struct net_device_ops *ops = dev->netdev_ops;
8394
8395	if (ops->ndo_change_rx_flags)
8396		ops->ndo_change_rx_flags(dev, flags);
8397}
8398
8399static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8400{
8401	unsigned int old_flags = dev->flags;
8402	kuid_t uid;
8403	kgid_t gid;
8404
8405	ASSERT_RTNL();
8406
8407	dev->flags |= IFF_PROMISC;
8408	dev->promiscuity += inc;
8409	if (dev->promiscuity == 0) {
8410		/*
8411		 * Avoid overflow.
8412		 * If inc causes overflow, untouch promisc and return error.
8413		 */
8414		if (inc < 0)
8415			dev->flags &= ~IFF_PROMISC;
8416		else {
8417			dev->promiscuity -= inc;
8418			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
8419			return -EOVERFLOW;
8420		}
8421	}
8422	if (dev->flags != old_flags) {
8423		netdev_info(dev, "%s promiscuous mode\n",
8424			    dev->flags & IFF_PROMISC ? "entered" : "left");
8425		if (audit_enabled) {
8426			current_uid_gid(&uid, &gid);
8427			audit_log(audit_context(), GFP_ATOMIC,
8428				  AUDIT_ANOM_PROMISCUOUS,
8429				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8430				  dev->name, (dev->flags & IFF_PROMISC),
8431				  (old_flags & IFF_PROMISC),
8432				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
8433				  from_kuid(&init_user_ns, uid),
8434				  from_kgid(&init_user_ns, gid),
8435				  audit_get_sessionid(current));
8436		}
8437
8438		dev_change_rx_flags(dev, IFF_PROMISC);
8439	}
8440	if (notify)
8441		__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
8442	return 0;
8443}
8444
8445/**
8446 *	dev_set_promiscuity	- update promiscuity count on a device
8447 *	@dev: device
8448 *	@inc: modifier
8449 *
8450 *	Add or remove promiscuity from a device. While the count in the device
8451 *	remains above zero the interface remains promiscuous. Once it hits zero
8452 *	the device reverts back to normal filtering operation. A negative inc
8453 *	value is used to drop promiscuity on the device.
8454 *	Return 0 if successful or a negative errno code on error.
8455 */
8456int dev_set_promiscuity(struct net_device *dev, int inc)
8457{
8458	unsigned int old_flags = dev->flags;
8459	int err;
8460
8461	err = __dev_set_promiscuity(dev, inc, true);
8462	if (err < 0)
8463		return err;
8464	if (dev->flags != old_flags)
8465		dev_set_rx_mode(dev);
8466	return err;
8467}
8468EXPORT_SYMBOL(dev_set_promiscuity);
8469
8470static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8471{
8472	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8473
8474	ASSERT_RTNL();
8475
8476	dev->flags |= IFF_ALLMULTI;
8477	dev->allmulti += inc;
8478	if (dev->allmulti == 0) {
8479		/*
8480		 * Avoid overflow.
8481		 * If inc causes overflow, untouch allmulti and return error.
8482		 */
8483		if (inc < 0)
8484			dev->flags &= ~IFF_ALLMULTI;
8485		else {
8486			dev->allmulti -= inc;
8487			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
8488			return -EOVERFLOW;
8489		}
8490	}
8491	if (dev->flags ^ old_flags) {
8492		netdev_info(dev, "%s allmulticast mode\n",
8493			    dev->flags & IFF_ALLMULTI ? "entered" : "left");
8494		dev_change_rx_flags(dev, IFF_ALLMULTI);
8495		dev_set_rx_mode(dev);
8496		if (notify)
8497			__dev_notify_flags(dev, old_flags,
8498					   dev->gflags ^ old_gflags, 0, NULL);
8499	}
8500	return 0;
8501}
8502
8503/**
8504 *	dev_set_allmulti	- update allmulti count on a device
8505 *	@dev: device
8506 *	@inc: modifier
8507 *
8508 *	Add or remove reception of all multicast frames to a device. While the
8509 *	count in the device remains above zero the interface remains listening
8510 *	to all interfaces. Once it hits zero the device reverts back to normal
8511 *	filtering operation. A negative @inc value is used to drop the counter
8512 *	when releasing a resource needing all multicasts.
8513 *	Return 0 if successful or a negative errno code on error.
8514 */
8515
8516int dev_set_allmulti(struct net_device *dev, int inc)
8517{
8518	return __dev_set_allmulti(dev, inc, true);
8519}
8520EXPORT_SYMBOL(dev_set_allmulti);
8521
8522/*
8523 *	Upload unicast and multicast address lists to device and
8524 *	configure RX filtering. When the device doesn't support unicast
8525 *	filtering it is put in promiscuous mode while unicast addresses
8526 *	are present.
8527 */
8528void __dev_set_rx_mode(struct net_device *dev)
8529{
8530	const struct net_device_ops *ops = dev->netdev_ops;
8531
8532	/* dev_open will call this function so the list will stay sane. */
8533	if (!(dev->flags&IFF_UP))
8534		return;
8535
8536	if (!netif_device_present(dev))
8537		return;
8538
8539	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8540		/* Unicast addresses changes may only happen under the rtnl,
8541		 * therefore calling __dev_set_promiscuity here is safe.
8542		 */
8543		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8544			__dev_set_promiscuity(dev, 1, false);
8545			dev->uc_promisc = true;
8546		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8547			__dev_set_promiscuity(dev, -1, false);
8548			dev->uc_promisc = false;
8549		}
8550	}
8551
8552	if (ops->ndo_set_rx_mode)
8553		ops->ndo_set_rx_mode(dev);
8554}
8555
8556void dev_set_rx_mode(struct net_device *dev)
8557{
8558	netif_addr_lock_bh(dev);
8559	__dev_set_rx_mode(dev);
8560	netif_addr_unlock_bh(dev);
8561}
8562
8563/**
8564 *	dev_get_flags - get flags reported to userspace
8565 *	@dev: device
8566 *
8567 *	Get the combination of flag bits exported through APIs to userspace.
8568 */
8569unsigned int dev_get_flags(const struct net_device *dev)
8570{
8571	unsigned int flags;
8572
8573	flags = (dev->flags & ~(IFF_PROMISC |
8574				IFF_ALLMULTI |
8575				IFF_RUNNING |
8576				IFF_LOWER_UP |
8577				IFF_DORMANT)) |
8578		(dev->gflags & (IFF_PROMISC |
8579				IFF_ALLMULTI));
8580
8581	if (netif_running(dev)) {
8582		if (netif_oper_up(dev))
8583			flags |= IFF_RUNNING;
8584		if (netif_carrier_ok(dev))
8585			flags |= IFF_LOWER_UP;
8586		if (netif_dormant(dev))
8587			flags |= IFF_DORMANT;
8588	}
8589
8590	return flags;
8591}
8592EXPORT_SYMBOL(dev_get_flags);
8593
8594int __dev_change_flags(struct net_device *dev, unsigned int flags,
8595		       struct netlink_ext_ack *extack)
8596{
8597	unsigned int old_flags = dev->flags;
8598	int ret;
8599
8600	ASSERT_RTNL();
8601
8602	/*
8603	 *	Set the flags on our device.
8604	 */
8605
8606	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8607			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8608			       IFF_AUTOMEDIA)) |
8609		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8610				    IFF_ALLMULTI));
8611
8612	/*
8613	 *	Load in the correct multicast list now the flags have changed.
8614	 */
8615
8616	if ((old_flags ^ flags) & IFF_MULTICAST)
8617		dev_change_rx_flags(dev, IFF_MULTICAST);
8618
8619	dev_set_rx_mode(dev);
8620
8621	/*
8622	 *	Have we downed the interface. We handle IFF_UP ourselves
8623	 *	according to user attempts to set it, rather than blindly
8624	 *	setting it.
8625	 */
8626
8627	ret = 0;
8628	if ((old_flags ^ flags) & IFF_UP) {
8629		if (old_flags & IFF_UP)
8630			__dev_close(dev);
8631		else
8632			ret = __dev_open(dev, extack);
8633	}
8634
8635	if ((flags ^ dev->gflags) & IFF_PROMISC) {
8636		int inc = (flags & IFF_PROMISC) ? 1 : -1;
8637		unsigned int old_flags = dev->flags;
8638
8639		dev->gflags ^= IFF_PROMISC;
8640
8641		if (__dev_set_promiscuity(dev, inc, false) >= 0)
8642			if (dev->flags != old_flags)
8643				dev_set_rx_mode(dev);
8644	}
8645
8646	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8647	 * is important. Some (broken) drivers set IFF_PROMISC, when
8648	 * IFF_ALLMULTI is requested not asking us and not reporting.
8649	 */
8650	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8651		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8652
8653		dev->gflags ^= IFF_ALLMULTI;
8654		__dev_set_allmulti(dev, inc, false);
8655	}
8656
8657	return ret;
8658}
8659
8660void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8661			unsigned int gchanges, u32 portid,
8662			const struct nlmsghdr *nlh)
8663{
8664	unsigned int changes = dev->flags ^ old_flags;
8665
8666	if (gchanges)
8667		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
8668
8669	if (changes & IFF_UP) {
8670		if (dev->flags & IFF_UP)
8671			call_netdevice_notifiers(NETDEV_UP, dev);
8672		else
8673			call_netdevice_notifiers(NETDEV_DOWN, dev);
8674	}
8675
8676	if (dev->flags & IFF_UP &&
8677	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8678		struct netdev_notifier_change_info change_info = {
8679			.info = {
8680				.dev = dev,
8681			},
8682			.flags_changed = changes,
8683		};
8684
8685		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8686	}
8687}
8688
8689/**
8690 *	dev_change_flags - change device settings
8691 *	@dev: device
8692 *	@flags: device state flags
8693 *	@extack: netlink extended ack
8694 *
8695 *	Change settings on device based state flags. The flags are
8696 *	in the userspace exported format.
8697 */
8698int dev_change_flags(struct net_device *dev, unsigned int flags,
8699		     struct netlink_ext_ack *extack)
8700{
8701	int ret;
8702	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8703
8704	ret = __dev_change_flags(dev, flags, extack);
8705	if (ret < 0)
8706		return ret;
8707
8708	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8709	__dev_notify_flags(dev, old_flags, changes, 0, NULL);
8710	return ret;
8711}
8712EXPORT_SYMBOL(dev_change_flags);
8713
8714int __dev_set_mtu(struct net_device *dev, int new_mtu)
8715{
8716	const struct net_device_ops *ops = dev->netdev_ops;
8717
8718	if (ops->ndo_change_mtu)
8719		return ops->ndo_change_mtu(dev, new_mtu);
8720
8721	/* Pairs with all the lockless reads of dev->mtu in the stack */
8722	WRITE_ONCE(dev->mtu, new_mtu);
8723	return 0;
8724}
8725EXPORT_SYMBOL(__dev_set_mtu);
8726
8727int dev_validate_mtu(struct net_device *dev, int new_mtu,
8728		     struct netlink_ext_ack *extack)
8729{
8730	/* MTU must be positive, and in range */
8731	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8732		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8733		return -EINVAL;
8734	}
8735
8736	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8737		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8738		return -EINVAL;
8739	}
8740	return 0;
8741}
8742
8743/**
8744 *	dev_set_mtu_ext - Change maximum transfer unit
8745 *	@dev: device
8746 *	@new_mtu: new transfer unit
8747 *	@extack: netlink extended ack
8748 *
8749 *	Change the maximum transfer size of the network device.
8750 */
8751int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8752		    struct netlink_ext_ack *extack)
8753{
8754	int err, orig_mtu;
8755
8756	if (new_mtu == dev->mtu)
8757		return 0;
8758
8759	err = dev_validate_mtu(dev, new_mtu, extack);
8760	if (err)
8761		return err;
8762
8763	if (!netif_device_present(dev))
8764		return -ENODEV;
8765
8766	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8767	err = notifier_to_errno(err);
8768	if (err)
8769		return err;
8770
8771	orig_mtu = dev->mtu;
8772	err = __dev_set_mtu(dev, new_mtu);
8773
8774	if (!err) {
8775		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8776						   orig_mtu);
8777		err = notifier_to_errno(err);
8778		if (err) {
8779			/* setting mtu back and notifying everyone again,
8780			 * so that they have a chance to revert changes.
8781			 */
8782			__dev_set_mtu(dev, orig_mtu);
8783			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8784						     new_mtu);
8785		}
8786	}
8787	return err;
8788}
8789
8790int dev_set_mtu(struct net_device *dev, int new_mtu)
8791{
8792	struct netlink_ext_ack extack;
8793	int err;
8794
8795	memset(&extack, 0, sizeof(extack));
8796	err = dev_set_mtu_ext(dev, new_mtu, &extack);
8797	if (err && extack._msg)
8798		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8799	return err;
8800}
8801EXPORT_SYMBOL(dev_set_mtu);
8802
8803/**
8804 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
8805 *	@dev: device
8806 *	@new_len: new tx queue length
8807 */
8808int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8809{
8810	unsigned int orig_len = dev->tx_queue_len;
8811	int res;
8812
8813	if (new_len != (unsigned int)new_len)
8814		return -ERANGE;
8815
8816	if (new_len != orig_len) {
8817		dev->tx_queue_len = new_len;
8818		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8819		res = notifier_to_errno(res);
8820		if (res)
8821			goto err_rollback;
8822		res = dev_qdisc_change_tx_queue_len(dev);
8823		if (res)
8824			goto err_rollback;
8825	}
8826
8827	return 0;
8828
8829err_rollback:
8830	netdev_err(dev, "refused to change device tx_queue_len\n");
8831	dev->tx_queue_len = orig_len;
8832	return res;
8833}
8834
8835/**
8836 *	dev_set_group - Change group this device belongs to
8837 *	@dev: device
8838 *	@new_group: group this device should belong to
8839 */
8840void dev_set_group(struct net_device *dev, int new_group)
8841{
8842	dev->group = new_group;
8843}
8844
8845/**
8846 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8847 *	@dev: device
8848 *	@addr: new address
8849 *	@extack: netlink extended ack
8850 */
8851int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8852			      struct netlink_ext_ack *extack)
8853{
8854	struct netdev_notifier_pre_changeaddr_info info = {
8855		.info.dev = dev,
8856		.info.extack = extack,
8857		.dev_addr = addr,
8858	};
8859	int rc;
8860
8861	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8862	return notifier_to_errno(rc);
8863}
8864EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8865
8866/**
8867 *	dev_set_mac_address - Change Media Access Control Address
8868 *	@dev: device
8869 *	@sa: new address
8870 *	@extack: netlink extended ack
8871 *
8872 *	Change the hardware (MAC) address of the device
8873 */
8874int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8875			struct netlink_ext_ack *extack)
8876{
8877	const struct net_device_ops *ops = dev->netdev_ops;
8878	int err;
8879
8880	if (!ops->ndo_set_mac_address)
8881		return -EOPNOTSUPP;
8882	if (sa->sa_family != dev->type)
8883		return -EINVAL;
8884	if (!netif_device_present(dev))
8885		return -ENODEV;
8886	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8887	if (err)
8888		return err;
8889	if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
8890		err = ops->ndo_set_mac_address(dev, sa);
8891		if (err)
8892			return err;
8893	}
8894	dev->addr_assign_type = NET_ADDR_SET;
8895	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8896	add_device_randomness(dev->dev_addr, dev->addr_len);
8897	return 0;
8898}
8899EXPORT_SYMBOL(dev_set_mac_address);
8900
8901static DECLARE_RWSEM(dev_addr_sem);
8902
8903int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8904			     struct netlink_ext_ack *extack)
8905{
8906	int ret;
8907
8908	down_write(&dev_addr_sem);
8909	ret = dev_set_mac_address(dev, sa, extack);
8910	up_write(&dev_addr_sem);
8911	return ret;
8912}
8913EXPORT_SYMBOL(dev_set_mac_address_user);
8914
8915int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8916{
8917	size_t size = sizeof(sa->sa_data_min);
8918	struct net_device *dev;
8919	int ret = 0;
8920
8921	down_read(&dev_addr_sem);
8922	rcu_read_lock();
8923
8924	dev = dev_get_by_name_rcu(net, dev_name);
8925	if (!dev) {
8926		ret = -ENODEV;
8927		goto unlock;
8928	}
8929	if (!dev->addr_len)
8930		memset(sa->sa_data, 0, size);
8931	else
8932		memcpy(sa->sa_data, dev->dev_addr,
8933		       min_t(size_t, size, dev->addr_len));
8934	sa->sa_family = dev->type;
8935
8936unlock:
8937	rcu_read_unlock();
8938	up_read(&dev_addr_sem);
8939	return ret;
8940}
8941EXPORT_SYMBOL(dev_get_mac_address);
8942
8943/**
8944 *	dev_change_carrier - Change device carrier
8945 *	@dev: device
8946 *	@new_carrier: new value
8947 *
8948 *	Change device carrier
8949 */
8950int dev_change_carrier(struct net_device *dev, bool new_carrier)
8951{
8952	const struct net_device_ops *ops = dev->netdev_ops;
8953
8954	if (!ops->ndo_change_carrier)
8955		return -EOPNOTSUPP;
8956	if (!netif_device_present(dev))
8957		return -ENODEV;
8958	return ops->ndo_change_carrier(dev, new_carrier);
8959}
8960
8961/**
8962 *	dev_get_phys_port_id - Get device physical port ID
8963 *	@dev: device
8964 *	@ppid: port ID
8965 *
8966 *	Get device physical port ID
8967 */
8968int dev_get_phys_port_id(struct net_device *dev,
8969			 struct netdev_phys_item_id *ppid)
8970{
8971	const struct net_device_ops *ops = dev->netdev_ops;
8972
8973	if (!ops->ndo_get_phys_port_id)
8974		return -EOPNOTSUPP;
8975	return ops->ndo_get_phys_port_id(dev, ppid);
8976}
8977
8978/**
8979 *	dev_get_phys_port_name - Get device physical port name
8980 *	@dev: device
8981 *	@name: port name
8982 *	@len: limit of bytes to copy to name
8983 *
8984 *	Get device physical port name
8985 */
8986int dev_get_phys_port_name(struct net_device *dev,
8987			   char *name, size_t len)
8988{
8989	const struct net_device_ops *ops = dev->netdev_ops;
8990	int err;
8991
8992	if (ops->ndo_get_phys_port_name) {
8993		err = ops->ndo_get_phys_port_name(dev, name, len);
8994		if (err != -EOPNOTSUPP)
8995			return err;
8996	}
8997	return devlink_compat_phys_port_name_get(dev, name, len);
8998}
8999
9000/**
9001 *	dev_get_port_parent_id - Get the device's port parent identifier
9002 *	@dev: network device
9003 *	@ppid: pointer to a storage for the port's parent identifier
9004 *	@recurse: allow/disallow recursion to lower devices
9005 *
9006 *	Get the devices's port parent identifier
9007 */
9008int dev_get_port_parent_id(struct net_device *dev,
9009			   struct netdev_phys_item_id *ppid,
9010			   bool recurse)
9011{
9012	const struct net_device_ops *ops = dev->netdev_ops;
9013	struct netdev_phys_item_id first = { };
9014	struct net_device *lower_dev;
9015	struct list_head *iter;
9016	int err;
9017
9018	if (ops->ndo_get_port_parent_id) {
9019		err = ops->ndo_get_port_parent_id(dev, ppid);
9020		if (err != -EOPNOTSUPP)
9021			return err;
9022	}
9023
9024	err = devlink_compat_switch_id_get(dev, ppid);
9025	if (!recurse || err != -EOPNOTSUPP)
9026		return err;
9027
9028	netdev_for_each_lower_dev(dev, lower_dev, iter) {
9029		err = dev_get_port_parent_id(lower_dev, ppid, true);
9030		if (err)
9031			break;
9032		if (!first.id_len)
9033			first = *ppid;
9034		else if (memcmp(&first, ppid, sizeof(*ppid)))
9035			return -EOPNOTSUPP;
9036	}
9037
9038	return err;
9039}
9040EXPORT_SYMBOL(dev_get_port_parent_id);
9041
9042/**
9043 *	netdev_port_same_parent_id - Indicate if two network devices have
9044 *	the same port parent identifier
9045 *	@a: first network device
9046 *	@b: second network device
9047 */
9048bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
9049{
9050	struct netdev_phys_item_id a_id = { };
9051	struct netdev_phys_item_id b_id = { };
9052
9053	if (dev_get_port_parent_id(a, &a_id, true) ||
9054	    dev_get_port_parent_id(b, &b_id, true))
9055		return false;
9056
9057	return netdev_phys_item_id_same(&a_id, &b_id);
9058}
9059EXPORT_SYMBOL(netdev_port_same_parent_id);
9060
9061/**
9062 *	dev_change_proto_down - set carrier according to proto_down.
9063 *
9064 *	@dev: device
9065 *	@proto_down: new value
9066 */
9067int dev_change_proto_down(struct net_device *dev, bool proto_down)
9068{
9069	if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
9070		return -EOPNOTSUPP;
9071	if (!netif_device_present(dev))
9072		return -ENODEV;
9073	if (proto_down)
9074		netif_carrier_off(dev);
9075	else
9076		netif_carrier_on(dev);
9077	dev->proto_down = proto_down;
9078	return 0;
9079}
9080
9081/**
9082 *	dev_change_proto_down_reason - proto down reason
9083 *
9084 *	@dev: device
9085 *	@mask: proto down mask
9086 *	@value: proto down value
9087 */
9088void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
9089				  u32 value)
9090{
9091	int b;
9092
9093	if (!mask) {
9094		dev->proto_down_reason = value;
9095	} else {
9096		for_each_set_bit(b, &mask, 32) {
9097			if (value & (1 << b))
9098				dev->proto_down_reason |= BIT(b);
9099			else
9100				dev->proto_down_reason &= ~BIT(b);
9101		}
9102	}
9103}
9104
9105struct bpf_xdp_link {
9106	struct bpf_link link;
9107	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9108	int flags;
9109};
9110
9111static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9112{
9113	if (flags & XDP_FLAGS_HW_MODE)
9114		return XDP_MODE_HW;
9115	if (flags & XDP_FLAGS_DRV_MODE)
9116		return XDP_MODE_DRV;
9117	if (flags & XDP_FLAGS_SKB_MODE)
9118		return XDP_MODE_SKB;
9119	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9120}
9121
9122static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9123{
9124	switch (mode) {
9125	case XDP_MODE_SKB:
9126		return generic_xdp_install;
9127	case XDP_MODE_DRV:
9128	case XDP_MODE_HW:
9129		return dev->netdev_ops->ndo_bpf;
9130	default:
9131		return NULL;
9132	}
9133}
9134
9135static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9136					 enum bpf_xdp_mode mode)
9137{
9138	return dev->xdp_state[mode].link;
9139}
9140
9141static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9142				     enum bpf_xdp_mode mode)
9143{
9144	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9145
9146	if (link)
9147		return link->link.prog;
9148	return dev->xdp_state[mode].prog;
9149}
9150
9151u8 dev_xdp_prog_count(struct net_device *dev)
9152{
9153	u8 count = 0;
9154	int i;
9155
9156	for (i = 0; i < __MAX_XDP_MODE; i++)
9157		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9158			count++;
9159	return count;
9160}
9161EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
9162
9163u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9164{
9165	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9166
9167	return prog ? prog->aux->id : 0;
9168}
9169
9170static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9171			     struct bpf_xdp_link *link)
9172{
9173	dev->xdp_state[mode].link = link;
9174	dev->xdp_state[mode].prog = NULL;
9175}
9176
9177static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9178			     struct bpf_prog *prog)
9179{
9180	dev->xdp_state[mode].link = NULL;
9181	dev->xdp_state[mode].prog = prog;
9182}
9183
9184static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9185			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9186			   u32 flags, struct bpf_prog *prog)
9187{
9188	struct netdev_bpf xdp;
9189	int err;
9190
9191	memset(&xdp, 0, sizeof(xdp));
9192	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9193	xdp.extack = extack;
9194	xdp.flags = flags;
9195	xdp.prog = prog;
9196
9197	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
9198	 * "moved" into driver), so they don't increment it on their own, but
9199	 * they do decrement refcnt when program is detached or replaced.
9200	 * Given net_device also owns link/prog, we need to bump refcnt here
9201	 * to prevent drivers from underflowing it.
9202	 */
9203	if (prog)
9204		bpf_prog_inc(prog);
9205	err = bpf_op(dev, &xdp);
9206	if (err) {
9207		if (prog)
9208			bpf_prog_put(prog);
9209		return err;
9210	}
9211
9212	if (mode != XDP_MODE_HW)
9213		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9214
9215	return 0;
9216}
9217
9218static void dev_xdp_uninstall(struct net_device *dev)
9219{
9220	struct bpf_xdp_link *link;
9221	struct bpf_prog *prog;
9222	enum bpf_xdp_mode mode;
9223	bpf_op_t bpf_op;
9224
9225	ASSERT_RTNL();
9226
9227	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9228		prog = dev_xdp_prog(dev, mode);
9229		if (!prog)
9230			continue;
9231
9232		bpf_op = dev_xdp_bpf_op(dev, mode);
9233		if (!bpf_op)
9234			continue;
9235
9236		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9237
9238		/* auto-detach link from net device */
9239		link = dev_xdp_link(dev, mode);
9240		if (link)
9241			link->dev = NULL;
9242		else
9243			bpf_prog_put(prog);
9244
9245		dev_xdp_set_link(dev, mode, NULL);
9246	}
9247}
9248
9249static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9250			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9251			  struct bpf_prog *old_prog, u32 flags)
9252{
9253	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9254	struct bpf_prog *cur_prog;
9255	struct net_device *upper;
9256	struct list_head *iter;
9257	enum bpf_xdp_mode mode;
9258	bpf_op_t bpf_op;
9259	int err;
9260
9261	ASSERT_RTNL();
9262
9263	/* either link or prog attachment, never both */
9264	if (link && (new_prog || old_prog))
9265		return -EINVAL;
9266	/* link supports only XDP mode flags */
9267	if (link && (flags & ~XDP_FLAGS_MODES)) {
9268		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9269		return -EINVAL;
9270	}
9271	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9272	if (num_modes > 1) {
9273		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9274		return -EINVAL;
9275	}
9276	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9277	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9278		NL_SET_ERR_MSG(extack,
9279			       "More than one program loaded, unset mode is ambiguous");
9280		return -EINVAL;
9281	}
9282	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9283	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9284		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9285		return -EINVAL;
9286	}
9287
9288	mode = dev_xdp_mode(dev, flags);
9289	/* can't replace attached link */
9290	if (dev_xdp_link(dev, mode)) {
9291		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9292		return -EBUSY;
9293	}
9294
9295	/* don't allow if an upper device already has a program */
9296	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9297		if (dev_xdp_prog_count(upper) > 0) {
9298			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9299			return -EEXIST;
9300		}
9301	}
9302
9303	cur_prog = dev_xdp_prog(dev, mode);
9304	/* can't replace attached prog with link */
9305	if (link && cur_prog) {
9306		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9307		return -EBUSY;
9308	}
9309	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9310		NL_SET_ERR_MSG(extack, "Active program does not match expected");
9311		return -EEXIST;
9312	}
9313
9314	/* put effective new program into new_prog */
9315	if (link)
9316		new_prog = link->link.prog;
9317
9318	if (new_prog) {
9319		bool offload = mode == XDP_MODE_HW;
9320		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9321					       ? XDP_MODE_DRV : XDP_MODE_SKB;
9322
9323		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9324			NL_SET_ERR_MSG(extack, "XDP program already attached");
9325			return -EBUSY;
9326		}
9327		if (!offload && dev_xdp_prog(dev, other_mode)) {
9328			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9329			return -EEXIST;
9330		}
9331		if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
9332			NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
9333			return -EINVAL;
9334		}
9335		if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
9336			NL_SET_ERR_MSG(extack, "Program bound to different device");
9337			return -EINVAL;
9338		}
9339		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9340			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9341			return -EINVAL;
9342		}
9343		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9344			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9345			return -EINVAL;
9346		}
9347	}
9348
9349	/* don't call drivers if the effective program didn't change */
9350	if (new_prog != cur_prog) {
9351		bpf_op = dev_xdp_bpf_op(dev, mode);
9352		if (!bpf_op) {
9353			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9354			return -EOPNOTSUPP;
9355		}
9356
9357		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9358		if (err)
9359			return err;
9360	}
9361
9362	if (link)
9363		dev_xdp_set_link(dev, mode, link);
9364	else
9365		dev_xdp_set_prog(dev, mode, new_prog);
9366	if (cur_prog)
9367		bpf_prog_put(cur_prog);
9368
9369	return 0;
9370}
9371
9372static int dev_xdp_attach_link(struct net_device *dev,
9373			       struct netlink_ext_ack *extack,
9374			       struct bpf_xdp_link *link)
9375{
9376	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9377}
9378
9379static int dev_xdp_detach_link(struct net_device *dev,
9380			       struct netlink_ext_ack *extack,
9381			       struct bpf_xdp_link *link)
9382{
9383	enum bpf_xdp_mode mode;
9384	bpf_op_t bpf_op;
9385
9386	ASSERT_RTNL();
9387
9388	mode = dev_xdp_mode(dev, link->flags);
9389	if (dev_xdp_link(dev, mode) != link)
9390		return -EINVAL;
9391
9392	bpf_op = dev_xdp_bpf_op(dev, mode);
9393	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9394	dev_xdp_set_link(dev, mode, NULL);
9395	return 0;
9396}
9397
9398static void bpf_xdp_link_release(struct bpf_link *link)
9399{
9400	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9401
9402	rtnl_lock();
9403
9404	/* if racing with net_device's tear down, xdp_link->dev might be
9405	 * already NULL, in which case link was already auto-detached
9406	 */
9407	if (xdp_link->dev) {
9408		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9409		xdp_link->dev = NULL;
9410	}
9411
9412	rtnl_unlock();
9413}
9414
9415static int bpf_xdp_link_detach(struct bpf_link *link)
9416{
9417	bpf_xdp_link_release(link);
9418	return 0;
9419}
9420
9421static void bpf_xdp_link_dealloc(struct bpf_link *link)
9422{
9423	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9424
9425	kfree(xdp_link);
9426}
9427
9428static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9429				     struct seq_file *seq)
9430{
9431	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9432	u32 ifindex = 0;
9433
9434	rtnl_lock();
9435	if (xdp_link->dev)
9436		ifindex = xdp_link->dev->ifindex;
9437	rtnl_unlock();
9438
9439	seq_printf(seq, "ifindex:\t%u\n", ifindex);
9440}
9441
9442static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9443				       struct bpf_link_info *info)
9444{
9445	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9446	u32 ifindex = 0;
9447
9448	rtnl_lock();
9449	if (xdp_link->dev)
9450		ifindex = xdp_link->dev->ifindex;
9451	rtnl_unlock();
9452
9453	info->xdp.ifindex = ifindex;
9454	return 0;
9455}
9456
9457static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9458			       struct bpf_prog *old_prog)
9459{
9460	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9461	enum bpf_xdp_mode mode;
9462	bpf_op_t bpf_op;
9463	int err = 0;
9464
9465	rtnl_lock();
9466
9467	/* link might have been auto-released already, so fail */
9468	if (!xdp_link->dev) {
9469		err = -ENOLINK;
9470		goto out_unlock;
9471	}
9472
9473	if (old_prog && link->prog != old_prog) {
9474		err = -EPERM;
9475		goto out_unlock;
9476	}
9477	old_prog = link->prog;
9478	if (old_prog->type != new_prog->type ||
9479	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
9480		err = -EINVAL;
9481		goto out_unlock;
9482	}
9483
9484	if (old_prog == new_prog) {
9485		/* no-op, don't disturb drivers */
9486		bpf_prog_put(new_prog);
9487		goto out_unlock;
9488	}
9489
9490	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9491	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9492	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9493			      xdp_link->flags, new_prog);
9494	if (err)
9495		goto out_unlock;
9496
9497	old_prog = xchg(&link->prog, new_prog);
9498	bpf_prog_put(old_prog);
9499
9500out_unlock:
9501	rtnl_unlock();
9502	return err;
9503}
9504
9505static const struct bpf_link_ops bpf_xdp_link_lops = {
9506	.release = bpf_xdp_link_release,
9507	.dealloc = bpf_xdp_link_dealloc,
9508	.detach = bpf_xdp_link_detach,
9509	.show_fdinfo = bpf_xdp_link_show_fdinfo,
9510	.fill_link_info = bpf_xdp_link_fill_link_info,
9511	.update_prog = bpf_xdp_link_update,
9512};
9513
9514int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9515{
9516	struct net *net = current->nsproxy->net_ns;
9517	struct bpf_link_primer link_primer;
9518	struct netlink_ext_ack extack = {};
9519	struct bpf_xdp_link *link;
9520	struct net_device *dev;
9521	int err, fd;
9522
9523	rtnl_lock();
9524	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9525	if (!dev) {
9526		rtnl_unlock();
9527		return -EINVAL;
9528	}
9529
9530	link = kzalloc(sizeof(*link), GFP_USER);
9531	if (!link) {
9532		err = -ENOMEM;
9533		goto unlock;
9534	}
9535
9536	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9537	link->dev = dev;
9538	link->flags = attr->link_create.flags;
9539
9540	err = bpf_link_prime(&link->link, &link_primer);
9541	if (err) {
9542		kfree(link);
9543		goto unlock;
9544	}
9545
9546	err = dev_xdp_attach_link(dev, &extack, link);
9547	rtnl_unlock();
9548
9549	if (err) {
9550		link->dev = NULL;
9551		bpf_link_cleanup(&link_primer);
9552		trace_bpf_xdp_link_attach_failed(extack._msg);
9553		goto out_put_dev;
9554	}
9555
9556	fd = bpf_link_settle(&link_primer);
9557	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
9558	dev_put(dev);
9559	return fd;
9560
9561unlock:
9562	rtnl_unlock();
9563
9564out_put_dev:
9565	dev_put(dev);
9566	return err;
9567}
9568
9569/**
9570 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
9571 *	@dev: device
9572 *	@extack: netlink extended ack
9573 *	@fd: new program fd or negative value to clear
9574 *	@expected_fd: old program fd that userspace expects to replace or clear
9575 *	@flags: xdp-related flags
9576 *
9577 *	Set or clear a bpf program for a device
9578 */
9579int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9580		      int fd, int expected_fd, u32 flags)
9581{
9582	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9583	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9584	int err;
9585
9586	ASSERT_RTNL();
9587
9588	if (fd >= 0) {
9589		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9590						 mode != XDP_MODE_SKB);
9591		if (IS_ERR(new_prog))
9592			return PTR_ERR(new_prog);
9593	}
9594
9595	if (expected_fd >= 0) {
9596		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9597						 mode != XDP_MODE_SKB);
9598		if (IS_ERR(old_prog)) {
9599			err = PTR_ERR(old_prog);
9600			old_prog = NULL;
9601			goto err_out;
9602		}
9603	}
9604
9605	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9606
9607err_out:
9608	if (err && new_prog)
9609		bpf_prog_put(new_prog);
9610	if (old_prog)
9611		bpf_prog_put(old_prog);
9612	return err;
9613}
9614
9615/**
9616 * dev_index_reserve() - allocate an ifindex in a namespace
9617 * @net: the applicable net namespace
9618 * @ifindex: requested ifindex, pass %0 to get one allocated
9619 *
9620 * Allocate a ifindex for a new device. Caller must either use the ifindex
9621 * to store the device (via list_netdevice()) or call dev_index_release()
9622 * to give the index up.
9623 *
9624 * Return: a suitable unique value for a new device interface number or -errno.
9625 */
9626static int dev_index_reserve(struct net *net, u32 ifindex)
9627{
9628	int err;
9629
9630	if (ifindex > INT_MAX) {
9631		DEBUG_NET_WARN_ON_ONCE(1);
9632		return -EINVAL;
9633	}
9634
9635	if (!ifindex)
9636		err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
9637				      xa_limit_31b, &net->ifindex, GFP_KERNEL);
9638	else
9639		err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
9640	if (err < 0)
9641		return err;
9642
9643	return ifindex;
9644}
9645
9646static void dev_index_release(struct net *net, int ifindex)
9647{
9648	/* Expect only unused indexes, unlist_netdevice() removes the used */
9649	WARN_ON(xa_erase(&net->dev_by_index, ifindex));
9650}
9651
9652/* Delayed registration/unregisteration */
9653LIST_HEAD(net_todo_list);
9654DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9655
9656static void net_set_todo(struct net_device *dev)
9657{
9658	list_add_tail(&dev->todo_list, &net_todo_list);
9659	atomic_inc(&dev_net(dev)->dev_unreg_count);
9660}
9661
9662static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9663	struct net_device *upper, netdev_features_t features)
9664{
9665	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9666	netdev_features_t feature;
9667	int feature_bit;
9668
9669	for_each_netdev_feature(upper_disables, feature_bit) {
9670		feature = __NETIF_F_BIT(feature_bit);
9671		if (!(upper->wanted_features & feature)
9672		    && (features & feature)) {
9673			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9674				   &feature, upper->name);
9675			features &= ~feature;
9676		}
9677	}
9678
9679	return features;
9680}
9681
9682static void netdev_sync_lower_features(struct net_device *upper,
9683	struct net_device *lower, netdev_features_t features)
9684{
9685	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9686	netdev_features_t feature;
9687	int feature_bit;
9688
9689	for_each_netdev_feature(upper_disables, feature_bit) {
9690		feature = __NETIF_F_BIT(feature_bit);
9691		if (!(features & feature) && (lower->features & feature)) {
9692			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9693				   &feature, lower->name);
9694			lower->wanted_features &= ~feature;
9695			__netdev_update_features(lower);
9696
9697			if (unlikely(lower->features & feature))
9698				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9699					    &feature, lower->name);
9700			else
9701				netdev_features_change(lower);
9702		}
9703	}
9704}
9705
9706static netdev_features_t netdev_fix_features(struct net_device *dev,
9707	netdev_features_t features)
9708{
9709	/* Fix illegal checksum combinations */
9710	if ((features & NETIF_F_HW_CSUM) &&
9711	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9712		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9713		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9714	}
9715
9716	/* TSO requires that SG is present as well. */
9717	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9718		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9719		features &= ~NETIF_F_ALL_TSO;
9720	}
9721
9722	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9723					!(features & NETIF_F_IP_CSUM)) {
9724		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9725		features &= ~NETIF_F_TSO;
9726		features &= ~NETIF_F_TSO_ECN;
9727	}
9728
9729	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9730					 !(features & NETIF_F_IPV6_CSUM)) {
9731		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9732		features &= ~NETIF_F_TSO6;
9733	}
9734
9735	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9736	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9737		features &= ~NETIF_F_TSO_MANGLEID;
9738
9739	/* TSO ECN requires that TSO is present as well. */
9740	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9741		features &= ~NETIF_F_TSO_ECN;
9742
9743	/* Software GSO depends on SG. */
9744	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9745		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9746		features &= ~NETIF_F_GSO;
9747	}
9748
9749	/* GSO partial features require GSO partial be set */
9750	if ((features & dev->gso_partial_features) &&
9751	    !(features & NETIF_F_GSO_PARTIAL)) {
9752		netdev_dbg(dev,
9753			   "Dropping partially supported GSO features since no GSO partial.\n");
9754		features &= ~dev->gso_partial_features;
9755	}
9756
9757	if (!(features & NETIF_F_RXCSUM)) {
9758		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9759		 * successfully merged by hardware must also have the
9760		 * checksum verified by hardware.  If the user does not
9761		 * want to enable RXCSUM, logically, we should disable GRO_HW.
9762		 */
9763		if (features & NETIF_F_GRO_HW) {
9764			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9765			features &= ~NETIF_F_GRO_HW;
9766		}
9767	}
9768
9769	/* LRO/HW-GRO features cannot be combined with RX-FCS */
9770	if (features & NETIF_F_RXFCS) {
9771		if (features & NETIF_F_LRO) {
9772			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9773			features &= ~NETIF_F_LRO;
9774		}
9775
9776		if (features & NETIF_F_GRO_HW) {
9777			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9778			features &= ~NETIF_F_GRO_HW;
9779		}
9780	}
9781
9782	if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
9783		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
9784		features &= ~NETIF_F_LRO;
9785	}
9786
9787	if (features & NETIF_F_HW_TLS_TX) {
9788		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
9789			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
9790		bool hw_csum = features & NETIF_F_HW_CSUM;
9791
9792		if (!ip_csum && !hw_csum) {
9793			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
9794			features &= ~NETIF_F_HW_TLS_TX;
9795		}
9796	}
9797
9798	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9799		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9800		features &= ~NETIF_F_HW_TLS_RX;
9801	}
9802
9803	return features;
9804}
9805
9806int __netdev_update_features(struct net_device *dev)
9807{
9808	struct net_device *upper, *lower;
9809	netdev_features_t features;
9810	struct list_head *iter;
9811	int err = -1;
9812
9813	ASSERT_RTNL();
9814
9815	features = netdev_get_wanted_features(dev);
9816
9817	if (dev->netdev_ops->ndo_fix_features)
9818		features = dev->netdev_ops->ndo_fix_features(dev, features);
9819
9820	/* driver might be less strict about feature dependencies */
9821	features = netdev_fix_features(dev, features);
9822
9823	/* some features can't be enabled if they're off on an upper device */
9824	netdev_for_each_upper_dev_rcu(dev, upper, iter)
9825		features = netdev_sync_upper_features(dev, upper, features);
9826
9827	if (dev->features == features)
9828		goto sync_lower;
9829
9830	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9831		&dev->features, &features);
9832
9833	if (dev->netdev_ops->ndo_set_features)
9834		err = dev->netdev_ops->ndo_set_features(dev, features);
9835	else
9836		err = 0;
9837
9838	if (unlikely(err < 0)) {
9839		netdev_err(dev,
9840			"set_features() failed (%d); wanted %pNF, left %pNF\n",
9841			err, &features, &dev->features);
9842		/* return non-0 since some features might have changed and
9843		 * it's better to fire a spurious notification than miss it
9844		 */
9845		return -1;
9846	}
9847
9848sync_lower:
9849	/* some features must be disabled on lower devices when disabled
9850	 * on an upper device (think: bonding master or bridge)
9851	 */
9852	netdev_for_each_lower_dev(dev, lower, iter)
9853		netdev_sync_lower_features(dev, lower, features);
9854
9855	if (!err) {
9856		netdev_features_t diff = features ^ dev->features;
9857
9858		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9859			/* udp_tunnel_{get,drop}_rx_info both need
9860			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9861			 * device, or they won't do anything.
9862			 * Thus we need to update dev->features
9863			 * *before* calling udp_tunnel_get_rx_info,
9864			 * but *after* calling udp_tunnel_drop_rx_info.
9865			 */
9866			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9867				dev->features = features;
9868				udp_tunnel_get_rx_info(dev);
9869			} else {
9870				udp_tunnel_drop_rx_info(dev);
9871			}
9872		}
9873
9874		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9875			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9876				dev->features = features;
9877				err |= vlan_get_rx_ctag_filter_info(dev);
9878			} else {
9879				vlan_drop_rx_ctag_filter_info(dev);
9880			}
9881		}
9882
9883		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9884			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9885				dev->features = features;
9886				err |= vlan_get_rx_stag_filter_info(dev);
9887			} else {
9888				vlan_drop_rx_stag_filter_info(dev);
9889			}
9890		}
9891
9892		dev->features = features;
9893	}
9894
9895	return err < 0 ? 0 : 1;
9896}
9897
9898/**
9899 *	netdev_update_features - recalculate device features
9900 *	@dev: the device to check
9901 *
9902 *	Recalculate dev->features set and send notifications if it
9903 *	has changed. Should be called after driver or hardware dependent
9904 *	conditions might have changed that influence the features.
9905 */
9906void netdev_update_features(struct net_device *dev)
9907{
9908	if (__netdev_update_features(dev))
9909		netdev_features_change(dev);
9910}
9911EXPORT_SYMBOL(netdev_update_features);
9912
9913/**
9914 *	netdev_change_features - recalculate device features
9915 *	@dev: the device to check
9916 *
9917 *	Recalculate dev->features set and send notifications even
9918 *	if they have not changed. Should be called instead of
9919 *	netdev_update_features() if also dev->vlan_features might
9920 *	have changed to allow the changes to be propagated to stacked
9921 *	VLAN devices.
9922 */
9923void netdev_change_features(struct net_device *dev)
9924{
9925	__netdev_update_features(dev);
9926	netdev_features_change(dev);
9927}
9928EXPORT_SYMBOL(netdev_change_features);
9929
9930/**
9931 *	netif_stacked_transfer_operstate -	transfer operstate
9932 *	@rootdev: the root or lower level device to transfer state from
9933 *	@dev: the device to transfer operstate to
9934 *
9935 *	Transfer operational state from root to device. This is normally
9936 *	called when a stacking relationship exists between the root
9937 *	device and the device(a leaf device).
9938 */
9939void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9940					struct net_device *dev)
9941{
9942	if (rootdev->operstate == IF_OPER_DORMANT)
9943		netif_dormant_on(dev);
9944	else
9945		netif_dormant_off(dev);
9946
9947	if (rootdev->operstate == IF_OPER_TESTING)
9948		netif_testing_on(dev);
9949	else
9950		netif_testing_off(dev);
9951
9952	if (netif_carrier_ok(rootdev))
9953		netif_carrier_on(dev);
9954	else
9955		netif_carrier_off(dev);
9956}
9957EXPORT_SYMBOL(netif_stacked_transfer_operstate);
9958
9959static int netif_alloc_rx_queues(struct net_device *dev)
9960{
9961	unsigned int i, count = dev->num_rx_queues;
9962	struct netdev_rx_queue *rx;
9963	size_t sz = count * sizeof(*rx);
9964	int err = 0;
9965
9966	BUG_ON(count < 1);
9967
9968	rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
9969	if (!rx)
9970		return -ENOMEM;
9971
9972	dev->_rx = rx;
9973
9974	for (i = 0; i < count; i++) {
9975		rx[i].dev = dev;
9976
9977		/* XDP RX-queue setup */
9978		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
9979		if (err < 0)
9980			goto err_rxq_info;
9981	}
9982	return 0;
9983
9984err_rxq_info:
9985	/* Rollback successful reg's and free other resources */
9986	while (i--)
9987		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9988	kvfree(dev->_rx);
9989	dev->_rx = NULL;
9990	return err;
9991}
9992
9993static void netif_free_rx_queues(struct net_device *dev)
9994{
9995	unsigned int i, count = dev->num_rx_queues;
9996
9997	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
9998	if (!dev->_rx)
9999		return;
10000
10001	for (i = 0; i < count; i++)
10002		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10003
10004	kvfree(dev->_rx);
10005}
10006
10007static void netdev_init_one_queue(struct net_device *dev,
10008				  struct netdev_queue *queue, void *_unused)
10009{
10010	/* Initialize queue lock */
10011	spin_lock_init(&queue->_xmit_lock);
10012	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10013	queue->xmit_lock_owner = -1;
10014	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10015	queue->dev = dev;
10016#ifdef CONFIG_BQL
10017	dql_init(&queue->dql, HZ);
10018#endif
10019}
10020
10021static void netif_free_tx_queues(struct net_device *dev)
10022{
10023	kvfree(dev->_tx);
10024}
10025
10026static int netif_alloc_netdev_queues(struct net_device *dev)
10027{
10028	unsigned int count = dev->num_tx_queues;
10029	struct netdev_queue *tx;
10030	size_t sz = count * sizeof(*tx);
10031
10032	if (count < 1 || count > 0xffff)
10033		return -EINVAL;
10034
10035	tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10036	if (!tx)
10037		return -ENOMEM;
10038
10039	dev->_tx = tx;
10040
10041	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10042	spin_lock_init(&dev->tx_global_lock);
10043
10044	return 0;
10045}
10046
10047void netif_tx_stop_all_queues(struct net_device *dev)
10048{
10049	unsigned int i;
10050
10051	for (i = 0; i < dev->num_tx_queues; i++) {
10052		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10053
10054		netif_tx_stop_queue(txq);
10055	}
10056}
10057EXPORT_SYMBOL(netif_tx_stop_all_queues);
10058
10059static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10060{
10061	void __percpu *v;
10062
10063	/* Drivers implementing ndo_get_peer_dev must support tstat
10064	 * accounting, so that skb_do_redirect() can bump the dev's
10065	 * RX stats upon network namespace switch.
10066	 */
10067	if (dev->netdev_ops->ndo_get_peer_dev &&
10068	    dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10069		return -EOPNOTSUPP;
10070
10071	switch (dev->pcpu_stat_type) {
10072	case NETDEV_PCPU_STAT_NONE:
10073		return 0;
10074	case NETDEV_PCPU_STAT_LSTATS:
10075		v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10076		break;
10077	case NETDEV_PCPU_STAT_TSTATS:
10078		v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10079		break;
10080	case NETDEV_PCPU_STAT_DSTATS:
10081		v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10082		break;
10083	default:
10084		return -EINVAL;
10085	}
10086
10087	return v ? 0 : -ENOMEM;
10088}
10089
10090static void netdev_do_free_pcpu_stats(struct net_device *dev)
10091{
10092	switch (dev->pcpu_stat_type) {
10093	case NETDEV_PCPU_STAT_NONE:
10094		return;
10095	case NETDEV_PCPU_STAT_LSTATS:
10096		free_percpu(dev->lstats);
10097		break;
10098	case NETDEV_PCPU_STAT_TSTATS:
10099		free_percpu(dev->tstats);
10100		break;
10101	case NETDEV_PCPU_STAT_DSTATS:
10102		free_percpu(dev->dstats);
10103		break;
10104	}
10105}
10106
10107/**
10108 * register_netdevice() - register a network device
10109 * @dev: device to register
10110 *
10111 * Take a prepared network device structure and make it externally accessible.
10112 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
10113 * Callers must hold the rtnl lock - you may want register_netdev()
10114 * instead of this.
10115 */
10116int register_netdevice(struct net_device *dev)
10117{
10118	int ret;
10119	struct net *net = dev_net(dev);
10120
10121	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10122		     NETDEV_FEATURE_COUNT);
10123	BUG_ON(dev_boot_phase);
10124	ASSERT_RTNL();
10125
10126	might_sleep();
10127
10128	/* When net_device's are persistent, this will be fatal. */
10129	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10130	BUG_ON(!net);
10131
10132	ret = ethtool_check_ops(dev->ethtool_ops);
10133	if (ret)
10134		return ret;
10135
10136	spin_lock_init(&dev->addr_list_lock);
10137	netdev_set_addr_lockdep_class(dev);
10138
10139	ret = dev_get_valid_name(net, dev, dev->name);
10140	if (ret < 0)
10141		goto out;
10142
10143	ret = -ENOMEM;
10144	dev->name_node = netdev_name_node_head_alloc(dev);
10145	if (!dev->name_node)
10146		goto out;
10147
10148	/* Init, if this function is available */
10149	if (dev->netdev_ops->ndo_init) {
10150		ret = dev->netdev_ops->ndo_init(dev);
10151		if (ret) {
10152			if (ret > 0)
10153				ret = -EIO;
10154			goto err_free_name;
10155		}
10156	}
10157
10158	if (((dev->hw_features | dev->features) &
10159	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
10160	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10161	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10162		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10163		ret = -EINVAL;
10164		goto err_uninit;
10165	}
10166
10167	ret = netdev_do_alloc_pcpu_stats(dev);
10168	if (ret)
10169		goto err_uninit;
10170
10171	ret = dev_index_reserve(net, dev->ifindex);
10172	if (ret < 0)
10173		goto err_free_pcpu;
10174	dev->ifindex = ret;
10175
10176	/* Transfer changeable features to wanted_features and enable
10177	 * software offloads (GSO and GRO).
10178	 */
10179	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10180	dev->features |= NETIF_F_SOFT_FEATURES;
10181
10182	if (dev->udp_tunnel_nic_info) {
10183		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10184		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10185	}
10186
10187	dev->wanted_features = dev->features & dev->hw_features;
10188
10189	if (!(dev->flags & IFF_LOOPBACK))
10190		dev->hw_features |= NETIF_F_NOCACHE_COPY;
10191
10192	/* If IPv4 TCP segmentation offload is supported we should also
10193	 * allow the device to enable segmenting the frame with the option
10194	 * of ignoring a static IP ID value.  This doesn't enable the
10195	 * feature itself but allows the user to enable it later.
10196	 */
10197	if (dev->hw_features & NETIF_F_TSO)
10198		dev->hw_features |= NETIF_F_TSO_MANGLEID;
10199	if (dev->vlan_features & NETIF_F_TSO)
10200		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10201	if (dev->mpls_features & NETIF_F_TSO)
10202		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10203	if (dev->hw_enc_features & NETIF_F_TSO)
10204		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10205
10206	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10207	 */
10208	dev->vlan_features |= NETIF_F_HIGHDMA;
10209
10210	/* Make NETIF_F_SG inheritable to tunnel devices.
10211	 */
10212	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10213
10214	/* Make NETIF_F_SG inheritable to MPLS.
10215	 */
10216	dev->mpls_features |= NETIF_F_SG;
10217
10218	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10219	ret = notifier_to_errno(ret);
10220	if (ret)
10221		goto err_ifindex_release;
10222
10223	ret = netdev_register_kobject(dev);
10224	write_lock(&dev_base_lock);
10225	dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
10226	write_unlock(&dev_base_lock);
10227	if (ret)
10228		goto err_uninit_notify;
10229
10230	__netdev_update_features(dev);
10231
10232	/*
10233	 *	Default initial state at registry is that the
10234	 *	device is present.
10235	 */
10236
10237	set_bit(__LINK_STATE_PRESENT, &dev->state);
10238
10239	linkwatch_init_dev(dev);
10240
10241	dev_init_scheduler(dev);
10242
10243	netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10244	list_netdevice(dev);
10245
10246	add_device_randomness(dev->dev_addr, dev->addr_len);
10247
10248	/* If the device has permanent device address, driver should
10249	 * set dev_addr and also addr_assign_type should be set to
10250	 * NET_ADDR_PERM (default value).
10251	 */
10252	if (dev->addr_assign_type == NET_ADDR_PERM)
10253		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10254
10255	/* Notify protocols, that a new device appeared. */
10256	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10257	ret = notifier_to_errno(ret);
10258	if (ret) {
10259		/* Expect explicit free_netdev() on failure */
10260		dev->needs_free_netdev = false;
10261		unregister_netdevice_queue(dev, NULL);
10262		goto out;
10263	}
10264	/*
10265	 *	Prevent userspace races by waiting until the network
10266	 *	device is fully setup before sending notifications.
10267	 */
10268	if (!dev->rtnl_link_ops ||
10269	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10270		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10271
10272out:
10273	return ret;
10274
10275err_uninit_notify:
10276	call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10277err_ifindex_release:
10278	dev_index_release(net, dev->ifindex);
10279err_free_pcpu:
10280	netdev_do_free_pcpu_stats(dev);
10281err_uninit:
10282	if (dev->netdev_ops->ndo_uninit)
10283		dev->netdev_ops->ndo_uninit(dev);
10284	if (dev->priv_destructor)
10285		dev->priv_destructor(dev);
10286err_free_name:
10287	netdev_name_node_free(dev->name_node);
10288	goto out;
10289}
10290EXPORT_SYMBOL(register_netdevice);
10291
10292/**
10293 *	init_dummy_netdev	- init a dummy network device for NAPI
10294 *	@dev: device to init
10295 *
10296 *	This takes a network device structure and initialize the minimum
10297 *	amount of fields so it can be used to schedule NAPI polls without
10298 *	registering a full blown interface. This is to be used by drivers
10299 *	that need to tie several hardware interfaces to a single NAPI
10300 *	poll scheduler due to HW limitations.
10301 */
10302int init_dummy_netdev(struct net_device *dev)
10303{
10304	/* Clear everything. Note we don't initialize spinlocks
10305	 * are they aren't supposed to be taken by any of the
10306	 * NAPI code and this dummy netdev is supposed to be
10307	 * only ever used for NAPI polls
10308	 */
10309	memset(dev, 0, sizeof(struct net_device));
10310
10311	/* make sure we BUG if trying to hit standard
10312	 * register/unregister code path
10313	 */
10314	dev->reg_state = NETREG_DUMMY;
10315
10316	/* NAPI wants this */
10317	INIT_LIST_HEAD(&dev->napi_list);
10318
10319	/* a dummy interface is started by default */
10320	set_bit(__LINK_STATE_PRESENT, &dev->state);
10321	set_bit(__LINK_STATE_START, &dev->state);
10322
10323	/* napi_busy_loop stats accounting wants this */
10324	dev_net_set(dev, &init_net);
10325
10326	/* Note : We dont allocate pcpu_refcnt for dummy devices,
10327	 * because users of this 'device' dont need to change
10328	 * its refcount.
10329	 */
10330
10331	return 0;
10332}
10333EXPORT_SYMBOL_GPL(init_dummy_netdev);
10334
10335
10336/**
10337 *	register_netdev	- register a network device
10338 *	@dev: device to register
10339 *
10340 *	Take a completed network device structure and add it to the kernel
10341 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10342 *	chain. 0 is returned on success. A negative errno code is returned
10343 *	on a failure to set up the device, or if the name is a duplicate.
10344 *
10345 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10346 *	and expands the device name if you passed a format string to
10347 *	alloc_netdev.
10348 */
10349int register_netdev(struct net_device *dev)
10350{
10351	int err;
10352
10353	if (rtnl_lock_killable())
10354		return -EINTR;
10355	err = register_netdevice(dev);
10356	rtnl_unlock();
10357	return err;
10358}
10359EXPORT_SYMBOL(register_netdev);
10360
10361int netdev_refcnt_read(const struct net_device *dev)
10362{
10363#ifdef CONFIG_PCPU_DEV_REFCNT
10364	int i, refcnt = 0;
10365
10366	for_each_possible_cpu(i)
10367		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10368	return refcnt;
10369#else
10370	return refcount_read(&dev->dev_refcnt);
10371#endif
10372}
10373EXPORT_SYMBOL(netdev_refcnt_read);
10374
10375int netdev_unregister_timeout_secs __read_mostly = 10;
10376
10377#define WAIT_REFS_MIN_MSECS 1
10378#define WAIT_REFS_MAX_MSECS 250
10379/**
10380 * netdev_wait_allrefs_any - wait until all references are gone.
10381 * @list: list of net_devices to wait on
10382 *
10383 * This is called when unregistering network devices.
10384 *
10385 * Any protocol or device that holds a reference should register
10386 * for netdevice notification, and cleanup and put back the
10387 * reference if they receive an UNREGISTER event.
10388 * We can get stuck here if buggy protocols don't correctly
10389 * call dev_put.
10390 */
10391static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10392{
10393	unsigned long rebroadcast_time, warning_time;
10394	struct net_device *dev;
10395	int wait = 0;
10396
10397	rebroadcast_time = warning_time = jiffies;
10398
10399	list_for_each_entry(dev, list, todo_list)
10400		if (netdev_refcnt_read(dev) == 1)
10401			return dev;
10402
10403	while (true) {
10404		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10405			rtnl_lock();
10406
10407			/* Rebroadcast unregister notification */
10408			list_for_each_entry(dev, list, todo_list)
10409				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10410
10411			__rtnl_unlock();
10412			rcu_barrier();
10413			rtnl_lock();
10414
10415			list_for_each_entry(dev, list, todo_list)
10416				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10417					     &dev->state)) {
10418					/* We must not have linkwatch events
10419					 * pending on unregister. If this
10420					 * happens, we simply run the queue
10421					 * unscheduled, resulting in a noop
10422					 * for this device.
10423					 */
10424					linkwatch_run_queue();
10425					break;
10426				}
10427
10428			__rtnl_unlock();
10429
10430			rebroadcast_time = jiffies;
10431		}
10432
10433		if (!wait) {
10434			rcu_barrier();
10435			wait = WAIT_REFS_MIN_MSECS;
10436		} else {
10437			msleep(wait);
10438			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10439		}
10440
10441		list_for_each_entry(dev, list, todo_list)
10442			if (netdev_refcnt_read(dev) == 1)
10443				return dev;
10444
10445		if (time_after(jiffies, warning_time +
10446			       READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10447			list_for_each_entry(dev, list, todo_list) {
10448				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10449					 dev->name, netdev_refcnt_read(dev));
10450				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10451			}
10452
10453			warning_time = jiffies;
10454		}
10455	}
10456}
10457
10458/* The sequence is:
10459 *
10460 *	rtnl_lock();
10461 *	...
10462 *	register_netdevice(x1);
10463 *	register_netdevice(x2);
10464 *	...
10465 *	unregister_netdevice(y1);
10466 *	unregister_netdevice(y2);
10467 *      ...
10468 *	rtnl_unlock();
10469 *	free_netdev(y1);
10470 *	free_netdev(y2);
10471 *
10472 * We are invoked by rtnl_unlock().
10473 * This allows us to deal with problems:
10474 * 1) We can delete sysfs objects which invoke hotplug
10475 *    without deadlocking with linkwatch via keventd.
10476 * 2) Since we run with the RTNL semaphore not held, we can sleep
10477 *    safely in order to wait for the netdev refcnt to drop to zero.
10478 *
10479 * We must not return until all unregister events added during
10480 * the interval the lock was held have been completed.
10481 */
10482void netdev_run_todo(void)
10483{
10484	struct net_device *dev, *tmp;
10485	struct list_head list;
10486#ifdef CONFIG_LOCKDEP
10487	struct list_head unlink_list;
10488
10489	list_replace_init(&net_unlink_list, &unlink_list);
10490
10491	while (!list_empty(&unlink_list)) {
10492		struct net_device *dev = list_first_entry(&unlink_list,
10493							  struct net_device,
10494							  unlink_list);
10495		list_del_init(&dev->unlink_list);
10496		dev->nested_level = dev->lower_level - 1;
10497	}
10498#endif
10499
10500	/* Snapshot list, allow later requests */
10501	list_replace_init(&net_todo_list, &list);
10502
10503	__rtnl_unlock();
10504
10505	/* Wait for rcu callbacks to finish before next phase */
10506	if (!list_empty(&list))
10507		rcu_barrier();
10508
10509	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
10510		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10511			netdev_WARN(dev, "run_todo but not unregistering\n");
10512			list_del(&dev->todo_list);
10513			continue;
10514		}
10515
10516		write_lock(&dev_base_lock);
10517		dev->reg_state = NETREG_UNREGISTERED;
10518		write_unlock(&dev_base_lock);
10519		linkwatch_forget_dev(dev);
10520	}
10521
10522	while (!list_empty(&list)) {
10523		dev = netdev_wait_allrefs_any(&list);
10524		list_del(&dev->todo_list);
10525
10526		/* paranoia */
10527		BUG_ON(netdev_refcnt_read(dev) != 1);
10528		BUG_ON(!list_empty(&dev->ptype_all));
10529		BUG_ON(!list_empty(&dev->ptype_specific));
10530		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10531		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10532
10533		netdev_do_free_pcpu_stats(dev);
10534		if (dev->priv_destructor)
10535			dev->priv_destructor(dev);
10536		if (dev->needs_free_netdev)
10537			free_netdev(dev);
10538
10539		if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
10540			wake_up(&netdev_unregistering_wq);
10541
10542		/* Free network device */
10543		kobject_put(&dev->dev.kobj);
10544	}
10545}
10546
10547/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10548 * all the same fields in the same order as net_device_stats, with only
10549 * the type differing, but rtnl_link_stats64 may have additional fields
10550 * at the end for newer counters.
10551 */
10552void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10553			     const struct net_device_stats *netdev_stats)
10554{
10555	size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10556	const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10557	u64 *dst = (u64 *)stats64;
10558
10559	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10560	for (i = 0; i < n; i++)
10561		dst[i] = (unsigned long)atomic_long_read(&src[i]);
10562	/* zero out counters that only exist in rtnl_link_stats64 */
10563	memset((char *)stats64 + n * sizeof(u64), 0,
10564	       sizeof(*stats64) - n * sizeof(u64));
10565}
10566EXPORT_SYMBOL(netdev_stats_to_stats64);
10567
10568struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
10569{
10570	struct net_device_core_stats __percpu *p;
10571
10572	p = alloc_percpu_gfp(struct net_device_core_stats,
10573			     GFP_ATOMIC | __GFP_NOWARN);
10574
10575	if (p && cmpxchg(&dev->core_stats, NULL, p))
10576		free_percpu(p);
10577
10578	/* This READ_ONCE() pairs with the cmpxchg() above */
10579	return READ_ONCE(dev->core_stats);
10580}
10581EXPORT_SYMBOL(netdev_core_stats_alloc);
10582
10583/**
10584 *	dev_get_stats	- get network device statistics
10585 *	@dev: device to get statistics from
10586 *	@storage: place to store stats
10587 *
10588 *	Get network statistics from device. Return @storage.
10589 *	The device driver may provide its own method by setting
10590 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10591 *	otherwise the internal statistics structure is used.
10592 */
10593struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10594					struct rtnl_link_stats64 *storage)
10595{
10596	const struct net_device_ops *ops = dev->netdev_ops;
10597	const struct net_device_core_stats __percpu *p;
10598
10599	if (ops->ndo_get_stats64) {
10600		memset(storage, 0, sizeof(*storage));
10601		ops->ndo_get_stats64(dev, storage);
10602	} else if (ops->ndo_get_stats) {
10603		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10604	} else {
10605		netdev_stats_to_stats64(storage, &dev->stats);
10606	}
10607
10608	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10609	p = READ_ONCE(dev->core_stats);
10610	if (p) {
10611		const struct net_device_core_stats *core_stats;
10612		int i;
10613
10614		for_each_possible_cpu(i) {
10615			core_stats = per_cpu_ptr(p, i);
10616			storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10617			storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10618			storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10619			storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10620		}
10621	}
10622	return storage;
10623}
10624EXPORT_SYMBOL(dev_get_stats);
10625
10626/**
10627 *	dev_fetch_sw_netstats - get per-cpu network device statistics
10628 *	@s: place to store stats
10629 *	@netstats: per-cpu network stats to read from
10630 *
10631 *	Read per-cpu network statistics and populate the related fields in @s.
10632 */
10633void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10634			   const struct pcpu_sw_netstats __percpu *netstats)
10635{
10636	int cpu;
10637
10638	for_each_possible_cpu(cpu) {
10639		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10640		const struct pcpu_sw_netstats *stats;
10641		unsigned int start;
10642
10643		stats = per_cpu_ptr(netstats, cpu);
10644		do {
10645			start = u64_stats_fetch_begin(&stats->syncp);
10646			rx_packets = u64_stats_read(&stats->rx_packets);
10647			rx_bytes   = u64_stats_read(&stats->rx_bytes);
10648			tx_packets = u64_stats_read(&stats->tx_packets);
10649			tx_bytes   = u64_stats_read(&stats->tx_bytes);
10650		} while (u64_stats_fetch_retry(&stats->syncp, start));
10651
10652		s->rx_packets += rx_packets;
10653		s->rx_bytes   += rx_bytes;
10654		s->tx_packets += tx_packets;
10655		s->tx_bytes   += tx_bytes;
10656	}
10657}
10658EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10659
10660/**
10661 *	dev_get_tstats64 - ndo_get_stats64 implementation
10662 *	@dev: device to get statistics from
10663 *	@s: place to store stats
10664 *
10665 *	Populate @s from dev->stats and dev->tstats. Can be used as
10666 *	ndo_get_stats64() callback.
10667 */
10668void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10669{
10670	netdev_stats_to_stats64(s, &dev->stats);
10671	dev_fetch_sw_netstats(s, dev->tstats);
10672}
10673EXPORT_SYMBOL_GPL(dev_get_tstats64);
10674
10675struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10676{
10677	struct netdev_queue *queue = dev_ingress_queue(dev);
10678
10679#ifdef CONFIG_NET_CLS_ACT
10680	if (queue)
10681		return queue;
10682	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10683	if (!queue)
10684		return NULL;
10685	netdev_init_one_queue(dev, queue, NULL);
10686	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10687	RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
10688	rcu_assign_pointer(dev->ingress_queue, queue);
10689#endif
10690	return queue;
10691}
10692
10693static const struct ethtool_ops default_ethtool_ops;
10694
10695void netdev_set_default_ethtool_ops(struct net_device *dev,
10696				    const struct ethtool_ops *ops)
10697{
10698	if (dev->ethtool_ops == &default_ethtool_ops)
10699		dev->ethtool_ops = ops;
10700}
10701EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10702
10703/**
10704 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10705 * @dev: netdev to enable the IRQ coalescing on
10706 *
10707 * Sets a conservative default for SW IRQ coalescing. Users can use
10708 * sysfs attributes to override the default values.
10709 */
10710void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10711{
10712	WARN_ON(dev->reg_state == NETREG_REGISTERED);
10713
10714	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
10715		dev->gro_flush_timeout = 20000;
10716		dev->napi_defer_hard_irqs = 1;
10717	}
10718}
10719EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
10720
10721void netdev_freemem(struct net_device *dev)
10722{
10723	char *addr = (char *)dev - dev->padded;
10724
10725	kvfree(addr);
10726}
10727
10728/**
10729 * alloc_netdev_mqs - allocate network device
10730 * @sizeof_priv: size of private data to allocate space for
10731 * @name: device name format string
10732 * @name_assign_type: origin of device name
10733 * @setup: callback to initialize device
10734 * @txqs: the number of TX subqueues to allocate
10735 * @rxqs: the number of RX subqueues to allocate
10736 *
10737 * Allocates a struct net_device with private data area for driver use
10738 * and performs basic initialization.  Also allocates subqueue structs
10739 * for each queue on the device.
10740 */
10741struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10742		unsigned char name_assign_type,
10743		void (*setup)(struct net_device *),
10744		unsigned int txqs, unsigned int rxqs)
10745{
10746	struct net_device *dev;
10747	unsigned int alloc_size;
10748	struct net_device *p;
10749
10750	BUG_ON(strlen(name) >= sizeof(dev->name));
10751
10752	if (txqs < 1) {
10753		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10754		return NULL;
10755	}
10756
10757	if (rxqs < 1) {
10758		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10759		return NULL;
10760	}
10761
10762	alloc_size = sizeof(struct net_device);
10763	if (sizeof_priv) {
10764		/* ensure 32-byte alignment of private area */
10765		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10766		alloc_size += sizeof_priv;
10767	}
10768	/* ensure 32-byte alignment of whole construct */
10769	alloc_size += NETDEV_ALIGN - 1;
10770
10771	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10772	if (!p)
10773		return NULL;
10774
10775	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10776	dev->padded = (char *)dev - (char *)p;
10777
10778	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
10779#ifdef CONFIG_PCPU_DEV_REFCNT
10780	dev->pcpu_refcnt = alloc_percpu(int);
10781	if (!dev->pcpu_refcnt)
10782		goto free_dev;
10783	__dev_hold(dev);
10784#else
10785	refcount_set(&dev->dev_refcnt, 1);
10786#endif
10787
10788	if (dev_addr_init(dev))
10789		goto free_pcpu;
10790
10791	dev_mc_init(dev);
10792	dev_uc_init(dev);
10793
10794	dev_net_set(dev, &init_net);
10795
10796	dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
10797	dev->xdp_zc_max_segs = 1;
10798	dev->gso_max_segs = GSO_MAX_SEGS;
10799	dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
10800	dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
10801	dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
10802	dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
10803	dev->tso_max_segs = TSO_MAX_SEGS;
10804	dev->upper_level = 1;
10805	dev->lower_level = 1;
10806#ifdef CONFIG_LOCKDEP
10807	dev->nested_level = 0;
10808	INIT_LIST_HEAD(&dev->unlink_list);
10809#endif
10810
10811	INIT_LIST_HEAD(&dev->napi_list);
10812	INIT_LIST_HEAD(&dev->unreg_list);
10813	INIT_LIST_HEAD(&dev->close_list);
10814	INIT_LIST_HEAD(&dev->link_watch_list);
10815	INIT_LIST_HEAD(&dev->adj_list.upper);
10816	INIT_LIST_HEAD(&dev->adj_list.lower);
10817	INIT_LIST_HEAD(&dev->ptype_all);
10818	INIT_LIST_HEAD(&dev->ptype_specific);
10819	INIT_LIST_HEAD(&dev->net_notifier_list);
10820#ifdef CONFIG_NET_SCHED
10821	hash_init(dev->qdisc_hash);
10822#endif
10823	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10824	setup(dev);
10825
10826	if (!dev->tx_queue_len) {
10827		dev->priv_flags |= IFF_NO_QUEUE;
10828		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10829	}
10830
10831	dev->num_tx_queues = txqs;
10832	dev->real_num_tx_queues = txqs;
10833	if (netif_alloc_netdev_queues(dev))
10834		goto free_all;
10835
10836	dev->num_rx_queues = rxqs;
10837	dev->real_num_rx_queues = rxqs;
10838	if (netif_alloc_rx_queues(dev))
10839		goto free_all;
10840
10841	strcpy(dev->name, name);
10842	dev->name_assign_type = name_assign_type;
10843	dev->group = INIT_NETDEV_GROUP;
10844	if (!dev->ethtool_ops)
10845		dev->ethtool_ops = &default_ethtool_ops;
10846
10847	nf_hook_netdev_init(dev);
10848
10849	return dev;
10850
10851free_all:
10852	free_netdev(dev);
10853	return NULL;
10854
10855free_pcpu:
10856#ifdef CONFIG_PCPU_DEV_REFCNT
10857	free_percpu(dev->pcpu_refcnt);
10858free_dev:
10859#endif
10860	netdev_freemem(dev);
10861	return NULL;
10862}
10863EXPORT_SYMBOL(alloc_netdev_mqs);
10864
10865/**
10866 * free_netdev - free network device
10867 * @dev: device
10868 *
10869 * This function does the last stage of destroying an allocated device
10870 * interface. The reference to the device object is released. If this
10871 * is the last reference then it will be freed.Must be called in process
10872 * context.
10873 */
10874void free_netdev(struct net_device *dev)
10875{
10876	struct napi_struct *p, *n;
10877
10878	might_sleep();
10879
10880	/* When called immediately after register_netdevice() failed the unwind
10881	 * handling may still be dismantling the device. Handle that case by
10882	 * deferring the free.
10883	 */
10884	if (dev->reg_state == NETREG_UNREGISTERING) {
10885		ASSERT_RTNL();
10886		dev->needs_free_netdev = true;
10887		return;
10888	}
10889
10890	netif_free_tx_queues(dev);
10891	netif_free_rx_queues(dev);
10892
10893	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10894
10895	/* Flush device addresses */
10896	dev_addr_flush(dev);
10897
10898	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10899		netif_napi_del(p);
10900
10901	ref_tracker_dir_exit(&dev->refcnt_tracker);
10902#ifdef CONFIG_PCPU_DEV_REFCNT
10903	free_percpu(dev->pcpu_refcnt);
10904	dev->pcpu_refcnt = NULL;
10905#endif
10906	free_percpu(dev->core_stats);
10907	dev->core_stats = NULL;
10908	free_percpu(dev->xdp_bulkq);
10909	dev->xdp_bulkq = NULL;
10910
10911	/*  Compatibility with error handling in drivers */
10912	if (dev->reg_state == NETREG_UNINITIALIZED) {
10913		netdev_freemem(dev);
10914		return;
10915	}
10916
10917	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10918	dev->reg_state = NETREG_RELEASED;
10919
10920	/* will free via device release */
10921	put_device(&dev->dev);
10922}
10923EXPORT_SYMBOL(free_netdev);
10924
10925/**
10926 *	synchronize_net -  Synchronize with packet receive processing
10927 *
10928 *	Wait for packets currently being received to be done.
10929 *	Does not block later packets from starting.
10930 */
10931void synchronize_net(void)
10932{
10933	might_sleep();
10934	if (rtnl_is_locked())
10935		synchronize_rcu_expedited();
10936	else
10937		synchronize_rcu();
10938}
10939EXPORT_SYMBOL(synchronize_net);
10940
10941/**
10942 *	unregister_netdevice_queue - remove device from the kernel
10943 *	@dev: device
10944 *	@head: list
10945 *
10946 *	This function shuts down a device interface and removes it
10947 *	from the kernel tables.
10948 *	If head not NULL, device is queued to be unregistered later.
10949 *
10950 *	Callers must hold the rtnl semaphore.  You may want
10951 *	unregister_netdev() instead of this.
10952 */
10953
10954void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10955{
10956	ASSERT_RTNL();
10957
10958	if (head) {
10959		list_move_tail(&dev->unreg_list, head);
10960	} else {
10961		LIST_HEAD(single);
10962
10963		list_add(&dev->unreg_list, &single);
10964		unregister_netdevice_many(&single);
10965	}
10966}
10967EXPORT_SYMBOL(unregister_netdevice_queue);
10968
10969void unregister_netdevice_many_notify(struct list_head *head,
10970				      u32 portid, const struct nlmsghdr *nlh)
10971{
10972	struct net_device *dev, *tmp;
10973	LIST_HEAD(close_head);
10974
10975	BUG_ON(dev_boot_phase);
10976	ASSERT_RTNL();
10977
10978	if (list_empty(head))
10979		return;
10980
10981	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
10982		/* Some devices call without registering
10983		 * for initialization unwind. Remove those
10984		 * devices and proceed with the remaining.
10985		 */
10986		if (dev->reg_state == NETREG_UNINITIALIZED) {
10987			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
10988				 dev->name, dev);
10989
10990			WARN_ON(1);
10991			list_del(&dev->unreg_list);
10992			continue;
10993		}
10994		dev->dismantle = true;
10995		BUG_ON(dev->reg_state != NETREG_REGISTERED);
10996	}
10997
10998	/* If device is running, close it first. */
10999	list_for_each_entry(dev, head, unreg_list)
11000		list_add_tail(&dev->close_list, &close_head);
11001	dev_close_many(&close_head, true);
11002
11003	list_for_each_entry(dev, head, unreg_list) {
11004		/* And unlink it from device chain. */
11005		write_lock(&dev_base_lock);
11006		unlist_netdevice(dev, false);
11007		dev->reg_state = NETREG_UNREGISTERING;
11008		write_unlock(&dev_base_lock);
11009	}
11010	flush_all_backlogs();
11011
11012	synchronize_net();
11013
11014	list_for_each_entry(dev, head, unreg_list) {
11015		struct sk_buff *skb = NULL;
11016
11017		/* Shutdown queueing discipline. */
11018		dev_shutdown(dev);
11019		dev_tcx_uninstall(dev);
11020		dev_xdp_uninstall(dev);
11021		bpf_dev_bound_netdev_unregister(dev);
11022
11023		netdev_offload_xstats_disable_all(dev);
11024
11025		/* Notify protocols, that we are about to destroy
11026		 * this device. They should clean all the things.
11027		 */
11028		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11029
11030		if (!dev->rtnl_link_ops ||
11031		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11032			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11033						     GFP_KERNEL, NULL, 0,
11034						     portid, nlh);
11035
11036		/*
11037		 *	Flush the unicast and multicast chains
11038		 */
11039		dev_uc_flush(dev);
11040		dev_mc_flush(dev);
11041
11042		netdev_name_node_alt_flush(dev);
11043		netdev_name_node_free(dev->name_node);
11044
11045		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11046
11047		if (dev->netdev_ops->ndo_uninit)
11048			dev->netdev_ops->ndo_uninit(dev);
11049
11050		if (skb)
11051			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11052
11053		/* Notifier chain MUST detach us all upper devices. */
11054		WARN_ON(netdev_has_any_upper_dev(dev));
11055		WARN_ON(netdev_has_any_lower_dev(dev));
11056
11057		/* Remove entries from kobject tree */
11058		netdev_unregister_kobject(dev);
11059#ifdef CONFIG_XPS
11060		/* Remove XPS queueing entries */
11061		netif_reset_xps_queues_gt(dev, 0);
11062#endif
11063	}
11064
11065	synchronize_net();
11066
11067	list_for_each_entry(dev, head, unreg_list) {
11068		netdev_put(dev, &dev->dev_registered_tracker);
11069		net_set_todo(dev);
11070	}
11071
11072	list_del(head);
11073}
11074
11075/**
11076 *	unregister_netdevice_many - unregister many devices
11077 *	@head: list of devices
11078 *
11079 *  Note: As most callers use a stack allocated list_head,
11080 *  we force a list_del() to make sure stack wont be corrupted later.
11081 */
11082void unregister_netdevice_many(struct list_head *head)
11083{
11084	unregister_netdevice_many_notify(head, 0, NULL);
11085}
11086EXPORT_SYMBOL(unregister_netdevice_many);
11087
11088/**
11089 *	unregister_netdev - remove device from the kernel
11090 *	@dev: device
11091 *
11092 *	This function shuts down a device interface and removes it
11093 *	from the kernel tables.
11094 *
11095 *	This is just a wrapper for unregister_netdevice that takes
11096 *	the rtnl semaphore.  In general you want to use this and not
11097 *	unregister_netdevice.
11098 */
11099void unregister_netdev(struct net_device *dev)
11100{
11101	rtnl_lock();
11102	unregister_netdevice(dev);
11103	rtnl_unlock();
11104}
11105EXPORT_SYMBOL(unregister_netdev);
11106
11107/**
11108 *	__dev_change_net_namespace - move device to different nethost namespace
11109 *	@dev: device
11110 *	@net: network namespace
11111 *	@pat: If not NULL name pattern to try if the current device name
11112 *	      is already taken in the destination network namespace.
11113 *	@new_ifindex: If not zero, specifies device index in the target
11114 *	              namespace.
11115 *
11116 *	This function shuts down a device interface and moves it
11117 *	to a new network namespace. On success 0 is returned, on
11118 *	a failure a netagive errno code is returned.
11119 *
11120 *	Callers must hold the rtnl semaphore.
11121 */
11122
11123int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11124			       const char *pat, int new_ifindex)
11125{
11126	struct netdev_name_node *name_node;
11127	struct net *net_old = dev_net(dev);
11128	char new_name[IFNAMSIZ] = {};
11129	int err, new_nsid;
11130
11131	ASSERT_RTNL();
11132
11133	/* Don't allow namespace local devices to be moved. */
11134	err = -EINVAL;
11135	if (dev->features & NETIF_F_NETNS_LOCAL)
11136		goto out;
11137
11138	/* Ensure the device has been registrered */
11139	if (dev->reg_state != NETREG_REGISTERED)
11140		goto out;
11141
11142	/* Get out if there is nothing todo */
11143	err = 0;
11144	if (net_eq(net_old, net))
11145		goto out;
11146
11147	/* Pick the destination device name, and ensure
11148	 * we can use it in the destination network namespace.
11149	 */
11150	err = -EEXIST;
11151	if (netdev_name_in_use(net, dev->name)) {
11152		/* We get here if we can't use the current device name */
11153		if (!pat)
11154			goto out;
11155		err = dev_prep_valid_name(net, dev, pat, new_name);
11156		if (err < 0)
11157			goto out;
11158	}
11159	/* Check that none of the altnames conflicts. */
11160	err = -EEXIST;
11161	netdev_for_each_altname(dev, name_node)
11162		if (netdev_name_in_use(net, name_node->name))
11163			goto out;
11164
11165	/* Check that new_ifindex isn't used yet. */
11166	if (new_ifindex) {
11167		err = dev_index_reserve(net, new_ifindex);
11168		if (err < 0)
11169			goto out;
11170	} else {
11171		/* If there is an ifindex conflict assign a new one */
11172		err = dev_index_reserve(net, dev->ifindex);
11173		if (err == -EBUSY)
11174			err = dev_index_reserve(net, 0);
11175		if (err < 0)
11176			goto out;
11177		new_ifindex = err;
11178	}
11179
11180	/*
11181	 * And now a mini version of register_netdevice unregister_netdevice.
11182	 */
11183
11184	/* If device is running close it first. */
11185	dev_close(dev);
11186
11187	/* And unlink it from device chain */
11188	unlist_netdevice(dev, true);
11189
11190	synchronize_net();
11191
11192	/* Shutdown queueing discipline. */
11193	dev_shutdown(dev);
11194
11195	/* Notify protocols, that we are about to destroy
11196	 * this device. They should clean all the things.
11197	 *
11198	 * Note that dev->reg_state stays at NETREG_REGISTERED.
11199	 * This is wanted because this way 8021q and macvlan know
11200	 * the device is just moving and can keep their slaves up.
11201	 */
11202	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11203	rcu_barrier();
11204
11205	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11206
11207	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11208			    new_ifindex);
11209
11210	/*
11211	 *	Flush the unicast and multicast chains
11212	 */
11213	dev_uc_flush(dev);
11214	dev_mc_flush(dev);
11215
11216	/* Send a netdev-removed uevent to the old namespace */
11217	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11218	netdev_adjacent_del_links(dev);
11219
11220	/* Move per-net netdevice notifiers that are following the netdevice */
11221	move_netdevice_notifiers_dev_net(dev, net);
11222
11223	/* Actually switch the network namespace */
11224	dev_net_set(dev, net);
11225	dev->ifindex = new_ifindex;
11226
11227	/* Send a netdev-add uevent to the new namespace */
11228	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11229	netdev_adjacent_add_links(dev);
11230
11231	if (new_name[0]) /* Rename the netdev to prepared name */
11232		strscpy(dev->name, new_name, IFNAMSIZ);
11233
11234	/* Fixup kobjects */
11235	err = device_rename(&dev->dev, dev->name);
11236	WARN_ON(err);
11237
11238	/* Adapt owner in case owning user namespace of target network
11239	 * namespace is different from the original one.
11240	 */
11241	err = netdev_change_owner(dev, net_old, net);
11242	WARN_ON(err);
11243
11244	/* Add the device back in the hashes */
11245	list_netdevice(dev);
11246
11247	/* Notify protocols, that a new device appeared. */
11248	call_netdevice_notifiers(NETDEV_REGISTER, dev);
11249
11250	/*
11251	 *	Prevent userspace races by waiting until the network
11252	 *	device is fully setup before sending notifications.
11253	 */
11254	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11255
11256	synchronize_net();
11257	err = 0;
11258out:
11259	return err;
11260}
11261EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11262
11263static int dev_cpu_dead(unsigned int oldcpu)
11264{
11265	struct sk_buff **list_skb;
11266	struct sk_buff *skb;
11267	unsigned int cpu;
11268	struct softnet_data *sd, *oldsd, *remsd = NULL;
11269
11270	local_irq_disable();
11271	cpu = smp_processor_id();
11272	sd = &per_cpu(softnet_data, cpu);
11273	oldsd = &per_cpu(softnet_data, oldcpu);
11274
11275	/* Find end of our completion_queue. */
11276	list_skb = &sd->completion_queue;
11277	while (*list_skb)
11278		list_skb = &(*list_skb)->next;
11279	/* Append completion queue from offline CPU. */
11280	*list_skb = oldsd->completion_queue;
11281	oldsd->completion_queue = NULL;
11282
11283	/* Append output queue from offline CPU. */
11284	if (oldsd->output_queue) {
11285		*sd->output_queue_tailp = oldsd->output_queue;
11286		sd->output_queue_tailp = oldsd->output_queue_tailp;
11287		oldsd->output_queue = NULL;
11288		oldsd->output_queue_tailp = &oldsd->output_queue;
11289	}
11290	/* Append NAPI poll list from offline CPU, with one exception :
11291	 * process_backlog() must be called by cpu owning percpu backlog.
11292	 * We properly handle process_queue & input_pkt_queue later.
11293	 */
11294	while (!list_empty(&oldsd->poll_list)) {
11295		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11296							    struct napi_struct,
11297							    poll_list);
11298
11299		list_del_init(&napi->poll_list);
11300		if (napi->poll == process_backlog)
11301			napi->state = 0;
11302		else
11303			____napi_schedule(sd, napi);
11304	}
11305
11306	raise_softirq_irqoff(NET_TX_SOFTIRQ);
11307	local_irq_enable();
11308
11309#ifdef CONFIG_RPS
11310	remsd = oldsd->rps_ipi_list;
11311	oldsd->rps_ipi_list = NULL;
11312#endif
11313	/* send out pending IPI's on offline CPU */
11314	net_rps_send_ipi(remsd);
11315
11316	/* Process offline CPU's input_pkt_queue */
11317	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11318		netif_rx(skb);
11319		input_queue_head_incr(oldsd);
11320	}
11321	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11322		netif_rx(skb);
11323		input_queue_head_incr(oldsd);
11324	}
11325
11326	return 0;
11327}
11328
11329/**
11330 *	netdev_increment_features - increment feature set by one
11331 *	@all: current feature set
11332 *	@one: new feature set
11333 *	@mask: mask feature set
11334 *
11335 *	Computes a new feature set after adding a device with feature set
11336 *	@one to the master device with current feature set @all.  Will not
11337 *	enable anything that is off in @mask. Returns the new feature set.
11338 */
11339netdev_features_t netdev_increment_features(netdev_features_t all,
11340	netdev_features_t one, netdev_features_t mask)
11341{
11342	if (mask & NETIF_F_HW_CSUM)
11343		mask |= NETIF_F_CSUM_MASK;
11344	mask |= NETIF_F_VLAN_CHALLENGED;
11345
11346	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11347	all &= one | ~NETIF_F_ALL_FOR_ALL;
11348
11349	/* If one device supports hw checksumming, set for all. */
11350	if (all & NETIF_F_HW_CSUM)
11351		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11352
11353	return all;
11354}
11355EXPORT_SYMBOL(netdev_increment_features);
11356
11357static struct hlist_head * __net_init netdev_create_hash(void)
11358{
11359	int i;
11360	struct hlist_head *hash;
11361
11362	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11363	if (hash != NULL)
11364		for (i = 0; i < NETDEV_HASHENTRIES; i++)
11365			INIT_HLIST_HEAD(&hash[i]);
11366
11367	return hash;
11368}
11369
11370/* Initialize per network namespace state */
11371static int __net_init netdev_init(struct net *net)
11372{
11373	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11374		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11375
11376	INIT_LIST_HEAD(&net->dev_base_head);
11377
11378	net->dev_name_head = netdev_create_hash();
11379	if (net->dev_name_head == NULL)
11380		goto err_name;
11381
11382	net->dev_index_head = netdev_create_hash();
11383	if (net->dev_index_head == NULL)
11384		goto err_idx;
11385
11386	xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11387
11388	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11389
11390	return 0;
11391
11392err_idx:
11393	kfree(net->dev_name_head);
11394err_name:
11395	return -ENOMEM;
11396}
11397
11398/**
11399 *	netdev_drivername - network driver for the device
11400 *	@dev: network device
11401 *
11402 *	Determine network driver for device.
11403 */
11404const char *netdev_drivername(const struct net_device *dev)
11405{
11406	const struct device_driver *driver;
11407	const struct device *parent;
11408	const char *empty = "";
11409
11410	parent = dev->dev.parent;
11411	if (!parent)
11412		return empty;
11413
11414	driver = parent->driver;
11415	if (driver && driver->name)
11416		return driver->name;
11417	return empty;
11418}
11419
11420static void __netdev_printk(const char *level, const struct net_device *dev,
11421			    struct va_format *vaf)
11422{
11423	if (dev && dev->dev.parent) {
11424		dev_printk_emit(level[1] - '0',
11425				dev->dev.parent,
11426				"%s %s %s%s: %pV",
11427				dev_driver_string(dev->dev.parent),
11428				dev_name(dev->dev.parent),
11429				netdev_name(dev), netdev_reg_state(dev),
11430				vaf);
11431	} else if (dev) {
11432		printk("%s%s%s: %pV",
11433		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
11434	} else {
11435		printk("%s(NULL net_device): %pV", level, vaf);
11436	}
11437}
11438
11439void netdev_printk(const char *level, const struct net_device *dev,
11440		   const char *format, ...)
11441{
11442	struct va_format vaf;
11443	va_list args;
11444
11445	va_start(args, format);
11446
11447	vaf.fmt = format;
11448	vaf.va = &args;
11449
11450	__netdev_printk(level, dev, &vaf);
11451
11452	va_end(args);
11453}
11454EXPORT_SYMBOL(netdev_printk);
11455
11456#define define_netdev_printk_level(func, level)			\
11457void func(const struct net_device *dev, const char *fmt, ...)	\
11458{								\
11459	struct va_format vaf;					\
11460	va_list args;						\
11461								\
11462	va_start(args, fmt);					\
11463								\
11464	vaf.fmt = fmt;						\
11465	vaf.va = &args;						\
11466								\
11467	__netdev_printk(level, dev, &vaf);			\
11468								\
11469	va_end(args);						\
11470}								\
11471EXPORT_SYMBOL(func);
11472
11473define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11474define_netdev_printk_level(netdev_alert, KERN_ALERT);
11475define_netdev_printk_level(netdev_crit, KERN_CRIT);
11476define_netdev_printk_level(netdev_err, KERN_ERR);
11477define_netdev_printk_level(netdev_warn, KERN_WARNING);
11478define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11479define_netdev_printk_level(netdev_info, KERN_INFO);
11480
11481static void __net_exit netdev_exit(struct net *net)
11482{
11483	kfree(net->dev_name_head);
11484	kfree(net->dev_index_head);
11485	xa_destroy(&net->dev_by_index);
11486	if (net != &init_net)
11487		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11488}
11489
11490static struct pernet_operations __net_initdata netdev_net_ops = {
11491	.init = netdev_init,
11492	.exit = netdev_exit,
11493};
11494
11495static void __net_exit default_device_exit_net(struct net *net)
11496{
11497	struct netdev_name_node *name_node, *tmp;
11498	struct net_device *dev, *aux;
11499	/*
11500	 * Push all migratable network devices back to the
11501	 * initial network namespace
11502	 */
11503	ASSERT_RTNL();
11504	for_each_netdev_safe(net, dev, aux) {
11505		int err;
11506		char fb_name[IFNAMSIZ];
11507
11508		/* Ignore unmoveable devices (i.e. loopback) */
11509		if (dev->features & NETIF_F_NETNS_LOCAL)
11510			continue;
11511
11512		/* Leave virtual devices for the generic cleanup */
11513		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11514			continue;
11515
11516		/* Push remaining network devices to init_net */
11517		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11518		if (netdev_name_in_use(&init_net, fb_name))
11519			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11520
11521		netdev_for_each_altname_safe(dev, name_node, tmp)
11522			if (netdev_name_in_use(&init_net, name_node->name)) {
11523				netdev_name_node_del(name_node);
11524				synchronize_rcu();
11525				__netdev_name_node_alt_destroy(name_node);
11526			}
11527
11528		err = dev_change_net_namespace(dev, &init_net, fb_name);
11529		if (err) {
11530			pr_emerg("%s: failed to move %s to init_net: %d\n",
11531				 __func__, dev->name, err);
11532			BUG();
11533		}
11534	}
11535}
11536
11537static void __net_exit default_device_exit_batch(struct list_head *net_list)
11538{
11539	/* At exit all network devices most be removed from a network
11540	 * namespace.  Do this in the reverse order of registration.
11541	 * Do this across as many network namespaces as possible to
11542	 * improve batching efficiency.
11543	 */
11544	struct net_device *dev;
11545	struct net *net;
11546	LIST_HEAD(dev_kill_list);
11547
11548	rtnl_lock();
11549	list_for_each_entry(net, net_list, exit_list) {
11550		default_device_exit_net(net);
11551		cond_resched();
11552	}
11553
11554	list_for_each_entry(net, net_list, exit_list) {
11555		for_each_netdev_reverse(net, dev) {
11556			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11557				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11558			else
11559				unregister_netdevice_queue(dev, &dev_kill_list);
11560		}
11561	}
11562	unregister_netdevice_many(&dev_kill_list);
11563	rtnl_unlock();
11564}
11565
11566static struct pernet_operations __net_initdata default_device_ops = {
11567	.exit_batch = default_device_exit_batch,
11568};
11569
11570/*
11571 *	Initialize the DEV module. At boot time this walks the device list and
11572 *	unhooks any devices that fail to initialise (normally hardware not
11573 *	present) and leaves us with a valid list of present and active devices.
11574 *
11575 */
11576
11577/*
11578 *       This is called single threaded during boot, so no need
11579 *       to take the rtnl semaphore.
11580 */
11581static int __init net_dev_init(void)
11582{
11583	int i, rc = -ENOMEM;
11584
11585	BUG_ON(!dev_boot_phase);
11586
11587	if (dev_proc_init())
11588		goto out;
11589
11590	if (netdev_kobject_init())
11591		goto out;
11592
11593	INIT_LIST_HEAD(&ptype_all);
11594	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11595		INIT_LIST_HEAD(&ptype_base[i]);
11596
11597	if (register_pernet_subsys(&netdev_net_ops))
11598		goto out;
11599
11600	/*
11601	 *	Initialise the packet receive queues.
11602	 */
11603
11604	for_each_possible_cpu(i) {
11605		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11606		struct softnet_data *sd = &per_cpu(softnet_data, i);
11607
11608		INIT_WORK(flush, flush_backlog);
11609
11610		skb_queue_head_init(&sd->input_pkt_queue);
11611		skb_queue_head_init(&sd->process_queue);
11612#ifdef CONFIG_XFRM_OFFLOAD
11613		skb_queue_head_init(&sd->xfrm_backlog);
11614#endif
11615		INIT_LIST_HEAD(&sd->poll_list);
11616		sd->output_queue_tailp = &sd->output_queue;
11617#ifdef CONFIG_RPS
11618		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11619		sd->cpu = i;
11620#endif
11621		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
11622		spin_lock_init(&sd->defer_lock);
11623
11624		init_gro_hash(&sd->backlog);
11625		sd->backlog.poll = process_backlog;
11626		sd->backlog.weight = weight_p;
11627	}
11628
11629	dev_boot_phase = 0;
11630
11631	/* The loopback device is special if any other network devices
11632	 * is present in a network namespace the loopback device must
11633	 * be present. Since we now dynamically allocate and free the
11634	 * loopback device ensure this invariant is maintained by
11635	 * keeping the loopback device as the first device on the
11636	 * list of network devices.  Ensuring the loopback devices
11637	 * is the first device that appears and the last network device
11638	 * that disappears.
11639	 */
11640	if (register_pernet_device(&loopback_net_ops))
11641		goto out;
11642
11643	if (register_pernet_device(&default_device_ops))
11644		goto out;
11645
11646	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11647	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11648
11649	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11650				       NULL, dev_cpu_dead);
11651	WARN_ON(rc < 0);
11652	rc = 0;
11653out:
11654	return rc;
11655}
11656
11657subsys_initcall(net_dev_init);
11658