xref: /kernel/linux/linux-5.10/net/core/dev.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 *      NET3    Protocol independent device support routines.
4 *
5 *	Derived from the non IP parts of dev.c 1.0.19
6 *              Authors:	Ross Biro
7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
9 *
10 *	Additional Authors:
11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
13 *		David Hinds <dahinds@users.sourceforge.net>
14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15 *		Adam Sulmicki <adam@cfar.umd.edu>
16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
17 *
18 *	Changes:
19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
20 *                                      to 2 if register_netdev gets called
21 *                                      before net_dev_init & also removed a
22 *                                      few lines of code in the process.
23 *		Alan Cox	:	device private ioctl copies fields back.
24 *		Alan Cox	:	Transmit queue code does relevant
25 *					stunts to keep the queue safe.
26 *		Alan Cox	:	Fixed double lock.
27 *		Alan Cox	:	Fixed promisc NULL pointer trap
28 *		????????	:	Support the full private ioctl range
29 *		Alan Cox	:	Moved ioctl permission check into
30 *					drivers
31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
32 *		Alan Cox	:	100 backlog just doesn't cut it when
33 *					you start doing multicast video 8)
34 *		Alan Cox	:	Rewrote net_bh and list manager.
35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
36 *		Alan Cox	:	Took out transmit every packet pass
37 *					Saved a few bytes in the ioctl handler
38 *		Alan Cox	:	Network driver sets packet type before
39 *					calling netif_rx. Saves a function
40 *					call a packet.
41 *		Alan Cox	:	Hashed net_bh()
42 *		Richard Kooijman:	Timestamp fixes.
43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
44 *		Alan Cox	:	Device lock protection.
45 *              Alan Cox        :       Fixed nasty side effect of device close
46 *					changes.
47 *		Rudi Cilibrasi	:	Pass the right thing to
48 *					set_mac_address()
49 *		Dave Miller	:	32bit quantity for the device lock to
50 *					make it work out on a Sparc.
51 *		Bjorn Ekwall	:	Added KERNELD hack.
52 *		Alan Cox	:	Cleaned up the backlog initialise.
53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
54 *					1 device.
55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
56 *					is no device open function.
57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
59 *		Cyrus Durgin	:	Cleaned for KMOD
60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
61 *					A network device unload needs to purge
62 *					the backlog queue.
63 *	Paul Rusty Russell	:	SIOCSIFNAME
64 *              Pekka Riikonen  :	Netdev boot-time settings code
65 *              Andrew Morton   :       Make unregister_netdevice wait
66 *                                      indefinitely on dev->refcnt
67 *              J Hadi Salim    :       - Backlog queue sampling
68 *				        - netif_rx() feedback
69 */
70
71#include <linux/uaccess.h>
72#include <linux/bitops.h>
73#include <linux/capability.h>
74#include <linux/cpu.h>
75#include <linux/types.h>
76#include <linux/kernel.h>
77#include <linux/hash.h>
78#include <linux/slab.h>
79#include <linux/sched.h>
80#include <linux/sched/mm.h>
81#include <linux/mutex.h>
82#include <linux/rwsem.h>
83#include <linux/string.h>
84#include <linux/mm.h>
85#include <linux/socket.h>
86#include <linux/sockios.h>
87#include <linux/errno.h>
88#include <linux/interrupt.h>
89#include <linux/if_ether.h>
90#include <linux/netdevice.h>
91#include <linux/etherdevice.h>
92#include <linux/ethtool.h>
93#include <linux/skbuff.h>
94#include <linux/bpf.h>
95#include <linux/bpf_trace.h>
96#include <net/net_namespace.h>
97#include <net/sock.h>
98#include <net/busy_poll.h>
99#include <linux/rtnetlink.h>
100#include <linux/stat.h>
101#include <net/dsa.h>
102#include <net/dst.h>
103#include <net/dst_metadata.h>
104#include <net/pkt_sched.h>
105#include <net/pkt_cls.h>
106#include <net/checksum.h>
107#include <net/xfrm.h>
108#include <linux/highmem.h>
109#include <linux/init.h>
110#include <linux/module.h>
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
114#include <net/iw_handler.h>
115#include <asm/current.h>
116#include <linux/audit.h>
117#include <linux/dmaengine.h>
118#include <linux/err.h>
119#include <linux/ctype.h>
120#include <linux/if_arp.h>
121#include <linux/if_vlan.h>
122#include <linux/ip.h>
123#include <net/ip.h>
124#include <net/mpls.h>
125#include <linux/ipv6.h>
126#include <linux/in.h>
127#include <linux/jhash.h>
128#include <linux/random.h>
129#include <trace/events/napi.h>
130#include <trace/events/net.h>
131#include <trace/events/skb.h>
132#include <linux/inetdevice.h>
133#include <linux/cpu_rmap.h>
134#include <linux/static_key.h>
135#include <linux/hashtable.h>
136#include <linux/vmalloc.h>
137#include <linux/if_macvlan.h>
138#include <linux/errqueue.h>
139#include <linux/hrtimer.h>
140#include <linux/netfilter_ingress.h>
141#include <linux/crash_dump.h>
142#include <linux/sctp.h>
143#include <net/udp_tunnel.h>
144#include <linux/net_namespace.h>
145#include <linux/indirect_call_wrapper.h>
146#include <net/devlink.h>
147#include <linux/pm_runtime.h>
148#include <linux/prandom.h>
149
150#include "net-sysfs.h"
151
152#define MAX_GRO_SKBS 8
153
154/* This should be increased if a protocol with a bigger head is added. */
155#define GRO_MAX_HEAD (MAX_HEADER + 128)
156
157static DEFINE_SPINLOCK(ptype_lock);
158static DEFINE_SPINLOCK(offload_lock);
159struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
160struct list_head ptype_all __read_mostly;	/* Taps */
161static struct list_head offload_base __read_mostly;
162
163static int netif_rx_internal(struct sk_buff *skb);
164static int call_netdevice_notifiers_info(unsigned long val,
165					 struct netdev_notifier_info *info);
166static int call_netdevice_notifiers_extack(unsigned long val,
167					   struct net_device *dev,
168					   struct netlink_ext_ack *extack);
169static struct napi_struct *napi_by_id(unsigned int napi_id);
170
171/*
172 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
173 * semaphore.
174 *
175 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
176 *
177 * Writers must hold the rtnl semaphore while they loop through the
178 * dev_base_head list, and hold dev_base_lock for writing when they do the
179 * actual updates.  This allows pure readers to access the list even
180 * while a writer is preparing to update it.
181 *
182 * To put it another way, dev_base_lock is held for writing only to
183 * protect against pure readers; the rtnl semaphore provides the
184 * protection against other writers.
185 *
186 * See, for example usages, register_netdevice() and
187 * unregister_netdevice(), which must be called with the rtnl
188 * semaphore held.
189 */
190DEFINE_RWLOCK(dev_base_lock);
191EXPORT_SYMBOL(dev_base_lock);
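
/*
 * Editor's illustrative sketch (not part of the original file): a pure
 * reader of the device list, as described above, that relies on RCU
 * rather than dev_base_lock. The function name is hypothetical.
 */
static int __maybe_unused example_count_running_devs(struct net *net)
{
	struct net_device *dev;
	int n = 0;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (netif_running(dev))
			n++;
	rcu_read_unlock();

	return n;
}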
192
193static DEFINE_MUTEX(ifalias_mutex);
194
195/* protects napi_hash addition/deletion and napi_gen_id */
196static DEFINE_SPINLOCK(napi_hash_lock);
197
198static unsigned int napi_gen_id = NR_CPUS;
199static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
200
201static DECLARE_RWSEM(devnet_rename_sem);
202
203static inline void dev_base_seq_inc(struct net *net)
204{
205	while (++net->dev_base_seq == 0)
206		;
207}
208
209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210{
211	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
212
213	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214}
215
216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217{
218	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219}
220
221static inline void rps_lock(struct softnet_data *sd)
222{
223#ifdef CONFIG_RPS
224	spin_lock(&sd->input_pkt_queue.lock);
225#endif
226}
227
228static inline void rps_unlock(struct softnet_data *sd)
229{
230#ifdef CONFIG_RPS
231	spin_unlock(&sd->input_pkt_queue.lock);
232#endif
233}
234
235static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
236						       const char *name)
237{
238	struct netdev_name_node *name_node;
239
240	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
241	if (!name_node)
242		return NULL;
243	INIT_HLIST_NODE(&name_node->hlist);
244	name_node->dev = dev;
245	name_node->name = name;
246	return name_node;
247}
248
249static struct netdev_name_node *
250netdev_name_node_head_alloc(struct net_device *dev)
251{
252	struct netdev_name_node *name_node;
253
254	name_node = netdev_name_node_alloc(dev, dev->name);
255	if (!name_node)
256		return NULL;
257	INIT_LIST_HEAD(&name_node->list);
258	return name_node;
259}
260
261static void netdev_name_node_free(struct netdev_name_node *name_node)
262{
263	kfree(name_node);
264}
265
266static void netdev_name_node_add(struct net *net,
267				 struct netdev_name_node *name_node)
268{
269	hlist_add_head_rcu(&name_node->hlist,
270			   dev_name_hash(net, name_node->name));
271}
272
273static void netdev_name_node_del(struct netdev_name_node *name_node)
274{
275	hlist_del_rcu(&name_node->hlist);
276}
277
278static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
279							const char *name)
280{
281	struct hlist_head *head = dev_name_hash(net, name);
282	struct netdev_name_node *name_node;
283
284	hlist_for_each_entry(name_node, head, hlist)
285		if (!strcmp(name_node->name, name))
286			return name_node;
287	return NULL;
288}
289
290static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
291							    const char *name)
292{
293	struct hlist_head *head = dev_name_hash(net, name);
294	struct netdev_name_node *name_node;
295
296	hlist_for_each_entry_rcu(name_node, head, hlist)
297		if (!strcmp(name_node->name, name))
298			return name_node;
299	return NULL;
300}
301
302int netdev_name_node_alt_create(struct net_device *dev, const char *name)
303{
304	struct netdev_name_node *name_node;
305	struct net *net = dev_net(dev);
306
307	name_node = netdev_name_node_lookup(net, name);
308	if (name_node)
309		return -EEXIST;
310	name_node = netdev_name_node_alloc(dev, name);
311	if (!name_node)
312		return -ENOMEM;
313	netdev_name_node_add(net, name_node);
314	/* The node that holds dev->name acts as a head of per-device list. */
315	list_add_tail(&name_node->list, &dev->name_node->list);
316
317	return 0;
318}
319EXPORT_SYMBOL(netdev_name_node_alt_create);
320
321static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
322{
323	list_del(&name_node->list);
324	netdev_name_node_del(name_node);
325	kfree(name_node->name);
326	netdev_name_node_free(name_node);
327}
328
329int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
330{
331	struct netdev_name_node *name_node;
332	struct net *net = dev_net(dev);
333
334	name_node = netdev_name_node_lookup(net, name);
335	if (!name_node)
336		return -ENOENT;
337	/* lookup might have found our primary name or a name belonging
338	 * to another device.
339	 */
340	if (name_node == dev->name_node || name_node->dev != dev)
341		return -EINVAL;
342
343	__netdev_name_node_alt_destroy(name_node);
344
345	return 0;
346}
347EXPORT_SYMBOL(netdev_name_node_alt_destroy);
348
349static void netdev_name_node_alt_flush(struct net_device *dev)
350{
351	struct netdev_name_node *name_node, *tmp;
352
353	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
354		__netdev_name_node_alt_destroy(name_node);
355}
356
357/* Device list insertion */
358static void list_netdevice(struct net_device *dev)
359{
360	struct net *net = dev_net(dev);
361
362	ASSERT_RTNL();
363
364	write_lock_bh(&dev_base_lock);
365	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
366	netdev_name_node_add(net, dev->name_node);
367	hlist_add_head_rcu(&dev->index_hlist,
368			   dev_index_hash(net, dev->ifindex));
369	write_unlock_bh(&dev_base_lock);
370
371	dev_base_seq_inc(net);
372}
373
374/* Device list removal
375 * caller must respect an RCU grace period before freeing/reusing dev
376 */
377static void unlist_netdevice(struct net_device *dev)
378{
379	ASSERT_RTNL();
380
381	/* Unlink dev from the device chain */
382	write_lock_bh(&dev_base_lock);
383	list_del_rcu(&dev->dev_list);
384	netdev_name_node_del(dev->name_node);
385	hlist_del_rcu(&dev->index_hlist);
386	write_unlock_bh(&dev_base_lock);
387
388	dev_base_seq_inc(dev_net(dev));
389}
390
391/*
392 *	Our notifier list
393 */
394
395static RAW_NOTIFIER_HEAD(netdev_chain);
396
397/*
398 *	Device drivers call our routines to queue packets here. We empty the
399 *	queue in the local softnet handler.
400 */
401
402DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
403EXPORT_PER_CPU_SYMBOL(softnet_data);
404
405#ifdef CONFIG_LOCKDEP
406/*
407 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
408 * according to dev->type
409 */
410static const unsigned short netdev_lock_type[] = {
411	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
412	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
413	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
414	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
415	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
416	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
417	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
418	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
419	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
420	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
421	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
422	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
423	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
424	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
425	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
426
427static const char *const netdev_lock_name[] = {
428	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
429	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
430	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
431	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
432	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
433	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
434	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
435	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
436	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
437	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
438	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
439	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
440	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
441	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
442	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
443
444static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
445static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
446
447static inline unsigned short netdev_lock_pos(unsigned short dev_type)
448{
449	int i;
450
451	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
452		if (netdev_lock_type[i] == dev_type)
453			return i;
454	/* the last key is used by default */
455	return ARRAY_SIZE(netdev_lock_type) - 1;
456}
457
458static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
459						 unsigned short dev_type)
460{
461	int i;
462
463	i = netdev_lock_pos(dev_type);
464	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
465				   netdev_lock_name[i]);
466}
467
468static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
469{
470	int i;
471
472	i = netdev_lock_pos(dev->type);
473	lockdep_set_class_and_name(&dev->addr_list_lock,
474				   &netdev_addr_lock_key[i],
475				   netdev_lock_name[i]);
476}
477#else
478static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
479						 unsigned short dev_type)
480{
481}
482
483static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
484{
485}
486#endif
487
488/*******************************************************************************
489 *
490 *		Protocol management and registration routines
491 *
492 *******************************************************************************/
493
494
495/*
496 *	Add a protocol ID to the list. Now that the input handler is
497 *	smarter we can dispense with all the messy stuff that used to be
498 *	here.
499 *
500 *	BEWARE!!! Protocol handlers that mangle input packets
501 *	MUST BE last in the hash buckets, and handler checking
502 *	MUST start from the promiscuous ptype_all chain in net_bh.
503 *	This is true now; do not change it.
504 *	Explanation: if a packet-mangling protocol handler were
505 *	first on the list, it could not detect that the packet is
506 *	cloned and must be copied on write, so it would modify the
507 *	clone in place and subsequent readers would see a broken packet.
508 *							--ANK (980803)
509 */
510
511static inline struct list_head *ptype_head(const struct packet_type *pt)
512{
513	if (pt->type == htons(ETH_P_ALL))
514		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
515	else
516		return pt->dev ? &pt->dev->ptype_specific :
517				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
518}
519
520/**
521 *	dev_add_pack - add packet handler
522 *	@pt: packet type declaration
523 *
524 *	Add a protocol handler to the networking stack. The passed &packet_type
525 *	is linked into kernel lists and may not be freed until it has been
526 *	removed from the kernel lists.
527 *
528 *	This call does not sleep, therefore it cannot
529 *	guarantee that all CPUs that are in the middle of receiving packets
530 *	will see the new packet type (until the next received packet).
531 */
532
533void dev_add_pack(struct packet_type *pt)
534{
535	struct list_head *head = ptype_head(pt);
536
537	spin_lock(&ptype_lock);
538	list_add_rcu(&pt->list, head);
539	spin_unlock(&ptype_lock);
540}
541EXPORT_SYMBOL(dev_add_pack);
542
543/**
544 *	__dev_remove_pack	 - remove packet handler
545 *	@pt: packet type declaration
546 *
547 *	Remove a protocol handler that was previously added to the kernel
548 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
549 *	from the kernel lists and can be freed or reused once this function
550 *	returns.
551 *
552 *      The packet type might still be in use by receivers
553 *	and must not be freed until after all the CPUs have gone
554 *	through a quiescent state.
555 */
556void __dev_remove_pack(struct packet_type *pt)
557{
558	struct list_head *head = ptype_head(pt);
559	struct packet_type *pt1;
560
561	spin_lock(&ptype_lock);
562
563	list_for_each_entry(pt1, head, list) {
564		if (pt == pt1) {
565			list_del_rcu(&pt->list);
566			goto out;
567		}
568	}
569
570	pr_warn("dev_remove_pack: %p not found\n", pt);
571out:
572	spin_unlock(&ptype_lock);
573}
574EXPORT_SYMBOL(__dev_remove_pack);
575
576/**
577 *	dev_remove_pack	 - remove packet handler
578 *	@pt: packet type declaration
579 *
580 *	Remove a protocol handler that was previously added to the kernel
581 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
582 *	from the kernel lists and can be freed or reused once this function
583 *	returns.
584 *
585 *	This call sleeps to guarantee that no CPU is looking at the packet
586 *	type after return.
587 */
588void dev_remove_pack(struct packet_type *pt)
589{
590	__dev_remove_pack(pt);
591
592	synchronize_net();
593}
594EXPORT_SYMBOL(dev_remove_pack);
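
/*
 * Editor's illustrative sketch (not part of the original file): a minimal
 * ETH_P_ALL tap registered with dev_add_pack() and removed with
 * dev_remove_pack(). All example_* names are hypothetical.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler owns the reference it was given; drop it when done. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),
	.func = example_tap_rcv,
};

static void __maybe_unused example_tap_attach(void)
{
	dev_add_pack(&example_tap);
}

static void __maybe_unused example_tap_detach(void)
{
	/* Sleeps until no CPU can still be executing the handler. */
	dev_remove_pack(&example_tap);
}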
595
596
597/**
598 *	dev_add_offload - register offload handlers
599 *	@po: protocol offload declaration
600 *
601 *	Add protocol offload handlers to the networking stack. The passed
602 *	&packet_offload is linked into kernel lists and may not be freed until
603 *	it has been removed from the kernel lists.
604 *
605 *	This call does not sleep, therefore it cannot
606 *	guarantee that all CPUs that are in the middle of receiving packets
607 *	will see the new offload handlers (until the next received packet).
608 */
609void dev_add_offload(struct packet_offload *po)
610{
611	struct packet_offload *elem;
612
613	spin_lock(&offload_lock);
614	list_for_each_entry(elem, &offload_base, list) {
615		if (po->priority < elem->priority)
616			break;
617	}
618	list_add_rcu(&po->list, elem->list.prev);
619	spin_unlock(&offload_lock);
620}
621EXPORT_SYMBOL(dev_add_offload);
622
623/**
624 *	__dev_remove_offload	 - remove offload handler
625 *	@po: packet offload declaration
626 *
627 *	Remove a protocol offload handler that was previously added to the
628 *	kernel offload handlers by dev_add_offload(). The passed &packet_offload
629 *	is removed from the kernel lists and can be freed or reused once this
630 *	function returns.
631 *
632 *      The packet type might still be in use by receivers
633 *	and must not be freed until after all the CPUs have gone
634 *	through a quiescent state.
635 */
636static void __dev_remove_offload(struct packet_offload *po)
637{
638	struct list_head *head = &offload_base;
639	struct packet_offload *po1;
640
641	spin_lock(&offload_lock);
642
643	list_for_each_entry(po1, head, list) {
644		if (po == po1) {
645			list_del_rcu(&po->list);
646			goto out;
647		}
648	}
649
650	pr_warn("dev_remove_offload: %p not found\n", po);
651out:
652	spin_unlock(&offload_lock);
653}
654
655/**
656 *	dev_remove_offload	 - remove packet offload handler
657 *	@po: packet offload declaration
658 *
659 *	Remove a packet offload handler that was previously added to the kernel
660 *	offload handlers by dev_add_offload(). The passed &packet_offload is
661 *	removed from the kernel lists and can be freed or reused once this
662 *	function returns.
663 *
664 *	This call sleeps to guarantee that no CPU is looking at the packet
665 *	type after return.
666 */
667void dev_remove_offload(struct packet_offload *po)
668{
669	__dev_remove_offload(po);
670
671	synchronize_net();
672}
673EXPORT_SYMBOL(dev_remove_offload);
674
675/******************************************************************************
676 *
677 *		      Device Boot-time Settings Routines
678 *
679 ******************************************************************************/
680
681/* Boot time configuration table */
682static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
683
684/**
685 *	netdev_boot_setup_add	- add new setup entry
686 *	@name: name of the device
687 *	@map: configured settings for the device
688 *
689 *	Adds a new setup entry to the dev_boot_setup list.  The function
690 *	returns 0 on error and 1 on success.  This is a generic routine for
691 *	all netdevices.
692 */
693static int netdev_boot_setup_add(char *name, struct ifmap *map)
694{
695	struct netdev_boot_setup *s;
696	int i;
697
698	s = dev_boot_setup;
699	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
700		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
701			memset(s[i].name, 0, sizeof(s[i].name));
702			strlcpy(s[i].name, name, IFNAMSIZ);
703			memcpy(&s[i].map, map, sizeof(s[i].map));
704			break;
705		}
706	}
707
708	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
709}
710
711/**
712 * netdev_boot_setup_check	- check boot time settings
713 * @dev: the netdevice
714 *
715 * Check boot time settings for the device.
716 * Any settings found are applied to the device so that they can be used
717 * later during device probing.
718 * Returns 0 if no settings were found, 1 if they were.
719 */
720int netdev_boot_setup_check(struct net_device *dev)
721{
722	struct netdev_boot_setup *s = dev_boot_setup;
723	int i;
724
725	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
726		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
727		    !strcmp(dev->name, s[i].name)) {
728			dev->irq = s[i].map.irq;
729			dev->base_addr = s[i].map.base_addr;
730			dev->mem_start = s[i].map.mem_start;
731			dev->mem_end = s[i].map.mem_end;
732			return 1;
733		}
734	}
735	return 0;
736}
737EXPORT_SYMBOL(netdev_boot_setup_check);
738
739
740/**
741 * netdev_boot_base	- get address from boot time settings
742 * @prefix: prefix for network device
743 * @unit: id for network device
744 *
745 * Check boot time settings for the base address of the device.
746 * Any settings found are applied to the device so that they can be used
747 * later during device probing.
748 * Returns 0 if no settings were found.
749 */
750unsigned long netdev_boot_base(const char *prefix, int unit)
751{
752	const struct netdev_boot_setup *s = dev_boot_setup;
753	char name[IFNAMSIZ];
754	int i;
755
756	sprintf(name, "%s%d", prefix, unit);
757
758	/*
759	 * If the device is already registered then return a base of 1
760	 * to indicate that this interface should not be probed
761	 */
762	if (__dev_get_by_name(&init_net, name))
763		return 1;
764
765	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
766		if (!strcmp(name, s[i].name))
767			return s[i].map.base_addr;
768	return 0;
769}
770
771/*
772 * Saves the boot-time configured settings for any netdevice.
773 */
774int __init netdev_boot_setup(char *str)
775{
776	int ints[5];
777	struct ifmap map;
778
779	str = get_options(str, ARRAY_SIZE(ints), ints);
780	if (!str || !*str)
781		return 0;
782
783	/* Save settings */
784	memset(&map, 0, sizeof(map));
785	if (ints[0] > 0)
786		map.irq = ints[1];
787	if (ints[0] > 1)
788		map.base_addr = ints[2];
789	if (ints[0] > 2)
790		map.mem_start = ints[3];
791	if (ints[0] > 3)
792		map.mem_end = ints[4];
793
794	/* Add new entry to the list */
795	return netdev_boot_setup_add(str, &map);
796}
797
798__setup("netdev=", netdev_boot_setup);
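
/*
 * Editor's note (not part of the original file): as parsed above, the
 * "netdev=" boot parameter takes up to four integers followed by the
 * device name, in the order irq, base_addr, mem_start, mem_end, e.g.
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * Omitted values are left at zero.
 */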
799
800/*******************************************************************************
801 *
802 *			    Device Interface Subroutines
803 *
804 *******************************************************************************/
805
806/**
807 *	dev_get_iflink	- get 'iflink' value of an interface
808 *	@dev: targeted interface
809 *
810 *	Indicates the ifindex the interface is linked to.
811 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
812 */
813
814int dev_get_iflink(const struct net_device *dev)
815{
816	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
817		return dev->netdev_ops->ndo_get_iflink(dev);
818
819	return dev->ifindex;
820}
821EXPORT_SYMBOL(dev_get_iflink);
822
823/**
824 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
825 *	@dev: targeted interface
826 *	@skb: The packet.
827 *
828 *	For better visibility of tunnel traffic, OVS needs to retrieve
829 *	egress tunnel information for a packet. This API allows the
830 *	caller to get that information.
831 */
832int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
833{
834	struct ip_tunnel_info *info;
835
836	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
837		return -EINVAL;
838
839	info = skb_tunnel_info_unclone(skb);
840	if (!info)
841		return -ENOMEM;
842	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
843		return -EINVAL;
844
845	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
846}
847EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
848
849/**
850 *	__dev_get_by_name	- find a device by its name
851 *	@net: the applicable net namespace
852 *	@name: name to find
853 *
854 *	Find an interface by name. Must be called under RTNL semaphore
855 *	or @dev_base_lock. If the name is found a pointer to the device
856 *	is returned. If the name is not found then %NULL is returned. The
857 *	reference counters are not incremented so the caller must be
858 *	careful with locks.
859 */
860
861struct net_device *__dev_get_by_name(struct net *net, const char *name)
862{
863	struct netdev_name_node *node_name;
864
865	node_name = netdev_name_node_lookup(net, name);
866	return node_name ? node_name->dev : NULL;
867}
868EXPORT_SYMBOL(__dev_get_by_name);
869
870/**
871 * dev_get_by_name_rcu	- find a device by its name
872 * @net: the applicable net namespace
873 * @name: name to find
874 *
875 * Find an interface by name.
876 * If the name is found a pointer to the device is returned.
877 * If the name is not found then %NULL is returned.
878 * The reference counters are not incremented so the caller must be
879 * careful with locks. The caller must hold RCU lock.
880 */
881
882struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
883{
884	struct netdev_name_node *node_name;
885
886	node_name = netdev_name_node_lookup_rcu(net, name);
887	return node_name ? node_name->dev : NULL;
888}
889EXPORT_SYMBOL(dev_get_by_name_rcu);
890
891/**
892 *	dev_get_by_name		- find a device by its name
893 *	@net: the applicable net namespace
894 *	@name: name to find
895 *
896 *	Find an interface by name. This can be called from any
897 *	context and does its own locking. The returned handle has
898 *	the usage count incremented and the caller must use dev_put() to
899 *	release it when it is no longer needed. %NULL is returned if no
900 *	matching device is found.
901 */
902
903struct net_device *dev_get_by_name(struct net *net, const char *name)
904{
905	struct net_device *dev;
906
907	rcu_read_lock();
908	dev = dev_get_by_name_rcu(net, name);
909	if (dev)
910		dev_hold(dev);
911	rcu_read_unlock();
912	return dev;
913}
914EXPORT_SYMBOL(dev_get_by_name);
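
/*
 * Editor's illustrative sketch (not part of the original file): the
 * hold/put pattern that dev_get_by_name() expects of its callers. The
 * function name is hypothetical.
 */
static bool __maybe_unused example_dev_is_running(struct net *net,
						  const char *name)
{
	struct net_device *dev;
	bool running;

	dev = dev_get_by_name(net, name);
	if (!dev)
		return false;

	running = netif_running(dev);
	dev_put(dev);	/* release the reference taken by dev_get_by_name() */

	return running;
}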
915
916/**
917 *	__dev_get_by_index - find a device by its ifindex
918 *	@net: the applicable net namespace
919 *	@ifindex: index of device
920 *
921 *	Search for an interface by index. Returns a pointer to the device,
922 *	or %NULL if the device is not found. The device has not
923 *	had its reference counter increased so the caller must be careful
924 *	about locking. The caller must hold either the RTNL semaphore
925 *	or @dev_base_lock.
926 */
927
928struct net_device *__dev_get_by_index(struct net *net, int ifindex)
929{
930	struct net_device *dev;
931	struct hlist_head *head = dev_index_hash(net, ifindex);
932
933	hlist_for_each_entry(dev, head, index_hlist)
934		if (dev->ifindex == ifindex)
935			return dev;
936
937	return NULL;
938}
939EXPORT_SYMBOL(__dev_get_by_index);
940
941/**
942 *	dev_get_by_index_rcu - find a device by its ifindex
943 *	@net: the applicable net namespace
944 *	@ifindex: index of device
945 *
946 *	Search for an interface by index. Returns a pointer to the device,
947 *	or %NULL if the device is not found. The device has not
948 *	had its reference counter increased so the caller must be careful
949 *	about locking. The caller must hold RCU lock.
950 */
951
952struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
953{
954	struct net_device *dev;
955	struct hlist_head *head = dev_index_hash(net, ifindex);
956
957	hlist_for_each_entry_rcu(dev, head, index_hlist)
958		if (dev->ifindex == ifindex)
959			return dev;
960
961	return NULL;
962}
963EXPORT_SYMBOL(dev_get_by_index_rcu);
964
965
966/**
967 *	dev_get_by_index - find a device by its ifindex
968 *	@net: the applicable net namespace
969 *	@ifindex: index of device
970 *
971 *	Search for an interface by index. Returns a pointer to the device,
972 *	or NULL if the device is not found. The device returned has
973 *	had a reference added and the pointer is safe until the user calls
974 *	dev_put to indicate they have finished with it.
975 */
976
977struct net_device *dev_get_by_index(struct net *net, int ifindex)
978{
979	struct net_device *dev;
980
981	rcu_read_lock();
982	dev = dev_get_by_index_rcu(net, ifindex);
983	if (dev)
984		dev_hold(dev);
985	rcu_read_unlock();
986	return dev;
987}
988EXPORT_SYMBOL(dev_get_by_index);
989
990/**
991 *	dev_get_by_napi_id - find a device by napi_id
992 *	@napi_id: ID of the NAPI struct
993 *
994 *	Search for an interface by NAPI ID. Returns a pointer to the device,
995 *	or %NULL if the device is not found. The device has not had
996 *	its reference counter increased so the caller must be careful
997 *	about locking. The caller must hold RCU lock.
998 */
999
1000struct net_device *dev_get_by_napi_id(unsigned int napi_id)
1001{
1002	struct napi_struct *napi;
1003
1004	WARN_ON_ONCE(!rcu_read_lock_held());
1005
1006	if (napi_id < MIN_NAPI_ID)
1007		return NULL;
1008
1009	napi = napi_by_id(napi_id);
1010
1011	return napi ? napi->dev : NULL;
1012}
1013EXPORT_SYMBOL(dev_get_by_napi_id);
1014
1015/**
1016 *	netdev_get_name - get a netdevice name, knowing its ifindex.
1017 *	@net: network namespace
1018 *	@name: a pointer to the buffer where the name will be stored.
1019 *	@ifindex: the ifindex of the interface to get the name from.
1020 */
1021int netdev_get_name(struct net *net, char *name, int ifindex)
1022{
1023	struct net_device *dev;
1024	int ret;
1025
1026	down_read(&devnet_rename_sem);
1027	rcu_read_lock();
1028
1029	dev = dev_get_by_index_rcu(net, ifindex);
1030	if (!dev) {
1031		ret = -ENODEV;
1032		goto out;
1033	}
1034
1035	strcpy(name, dev->name);
1036
1037	ret = 0;
1038out:
1039	rcu_read_unlock();
1040	up_read(&devnet_rename_sem);
1041	return ret;
1042}
1043
1044/**
1045 *	dev_getbyhwaddr_rcu - find a device by its hardware address
1046 *	@net: the applicable net namespace
1047 *	@type: media type of device
1048 *	@ha: hardware address
1049 *
1050 *	Search for an interface by MAC address. Returns a pointer to the
1051 *	device, or NULL if the device is not found.
1052 *	The caller must hold RCU or RTNL.
1053 *	The returned device has not had its ref count increased
1054 *	and the caller must therefore be careful about locking
1055 *
1056 */
1057
1058struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
1059				       const char *ha)
1060{
1061	struct net_device *dev;
1062
1063	for_each_netdev_rcu(net, dev)
1064		if (dev->type == type &&
1065		    !memcmp(dev->dev_addr, ha, dev->addr_len))
1066			return dev;
1067
1068	return NULL;
1069}
1070EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
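
/*
 * Editor's illustrative sketch (not part of the original file): resolving
 * an Ethernet device by MAC address under RCU, without taking a reference.
 * The function name is hypothetical.
 */
static int __maybe_unused example_ifindex_by_mac(struct net *net,
						 const char *mac)
{
	struct net_device *dev;
	int ifindex = 0;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	if (dev)
		ifindex = dev->ifindex;
	rcu_read_unlock();

	return ifindex;
}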
1071
1072struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1073{
1074	struct net_device *dev;
1075
1076	ASSERT_RTNL();
1077	for_each_netdev(net, dev)
1078		if (dev->type == type)
1079			return dev;
1080
1081	return NULL;
1082}
1083EXPORT_SYMBOL(__dev_getfirstbyhwtype);
1084
1085struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1086{
1087	struct net_device *dev, *ret = NULL;
1088
1089	rcu_read_lock();
1090	for_each_netdev_rcu(net, dev)
1091		if (dev->type == type) {
1092			dev_hold(dev);
1093			ret = dev;
1094			break;
1095		}
1096	rcu_read_unlock();
1097	return ret;
1098}
1099EXPORT_SYMBOL(dev_getfirstbyhwtype);
1100
1101/**
1102 *	__dev_get_by_flags - find any device with given flags
1103 *	@net: the applicable net namespace
1104 *	@if_flags: IFF_* values
1105 *	@mask: bitmask of bits in if_flags to check
1106 *
1107 *	Search for any interface with the given flags. Returns a pointer to the
1108 *	device, or NULL if no such device is found. Must be called inside
1109 *	rtnl_lock(); the result's refcount is unchanged.
1110 */
1111
1112struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1113				      unsigned short mask)
1114{
1115	struct net_device *dev, *ret;
1116
1117	ASSERT_RTNL();
1118
1119	ret = NULL;
1120	for_each_netdev(net, dev) {
1121		if (((dev->flags ^ if_flags) & mask) == 0) {
1122			ret = dev;
1123			break;
1124		}
1125	}
1126	return ret;
1127}
1128EXPORT_SYMBOL(__dev_get_by_flags);
1129
1130/**
1131 *	dev_valid_name - check if name is okay for network device
1132 *	@name: name string
1133 *
1134 *	Network device names need to be valid file names to
1135 *	allow sysfs to work.  We also disallow any kind of
1136 *	whitespace.
1137 */
1138bool dev_valid_name(const char *name)
1139{
1140	if (*name == '\0')
1141		return false;
1142	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1143		return false;
1144	if (!strcmp(name, ".") || !strcmp(name, ".."))
1145		return false;
1146
1147	while (*name) {
1148		if (*name == '/' || *name == ':' || isspace(*name))
1149			return false;
1150		name++;
1151	}
1152	return true;
1153}
1154EXPORT_SYMBOL(dev_valid_name);
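
/*
 * Editor's note (not part of the original file): per the checks above,
 * names such as "eth0" or "uplink" are accepted, while "", ".", "..",
 * "a b", "a/b", "a:b" and any name of IFNAMSIZ (16) or more characters
 * are rejected.
 */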
1155
1156/**
1157 *	__dev_alloc_name - allocate a name for a device
1158 *	@net: network namespace to allocate the device name in
1159 *	@name: name format string
1160 *	@buf:  scratch buffer and result name string
1161 *
1162 *	Passed a format string - eg "lt%d" - it will try to find a suitable
1163 *	id. It scans the list of devices to build up a free map, then chooses
1164 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1165 *	while allocating the name and adding the device in order to avoid
1166 *	duplicates.
1167 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1168 *	Returns the number of the unit assigned or a negative errno code.
1169 */
1170
1171static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1172{
1173	int i = 0;
1174	const char *p;
1175	const int max_netdevices = 8*PAGE_SIZE;
1176	unsigned long *inuse;
1177	struct net_device *d;
1178
1179	if (!dev_valid_name(name))
1180		return -EINVAL;
1181
1182	p = strchr(name, '%');
1183	if (p) {
1184		/*
1185		 * Verify the string as this thing may have come from
1186		 * the user.  There must be exactly one "%d" and no other "%"
1187		 * characters.
1188		 */
1189		if (p[1] != 'd' || strchr(p + 2, '%'))
1190			return -EINVAL;
1191
1192		/* Use one page as a bit array of possible slots */
1193		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1194		if (!inuse)
1195			return -ENOMEM;
1196
1197		for_each_netdev(net, d) {
1198			struct netdev_name_node *name_node;
1199			list_for_each_entry(name_node, &d->name_node->list, list) {
1200				if (!sscanf(name_node->name, name, &i))
1201					continue;
1202				if (i < 0 || i >= max_netdevices)
1203					continue;
1204
1205				/*  avoid cases where sscanf is not exact inverse of printf */
1206				snprintf(buf, IFNAMSIZ, name, i);
1207				if (!strncmp(buf, name_node->name, IFNAMSIZ))
1208					set_bit(i, inuse);
1209			}
1210			if (!sscanf(d->name, name, &i))
1211				continue;
1212			if (i < 0 || i >= max_netdevices)
1213				continue;
1214
1215			/*  avoid cases where sscanf is not exact inverse of printf */
1216			snprintf(buf, IFNAMSIZ, name, i);
1217			if (!strncmp(buf, d->name, IFNAMSIZ))
1218				set_bit(i, inuse);
1219		}
1220
1221		i = find_first_zero_bit(inuse, max_netdevices);
1222		free_page((unsigned long) inuse);
1223	}
1224
1225	snprintf(buf, IFNAMSIZ, name, i);
1226	if (!__dev_get_by_name(net, buf))
1227		return i;
1228
1229	/* It is possible to run out of possible slots
1230	 * when the name is long and there isn't enough space left
1231	 * for the digits, or if all bits are used.
1232	 */
1233	return -ENFILE;
1234}
1235
1236static int dev_alloc_name_ns(struct net *net,
1237			     struct net_device *dev,
1238			     const char *name)
1239{
1240	char buf[IFNAMSIZ];
1241	int ret;
1242
1243	BUG_ON(!net);
1244	ret = __dev_alloc_name(net, name, buf);
1245	if (ret >= 0)
1246		strlcpy(dev->name, buf, IFNAMSIZ);
1247	return ret;
1248}
1249
1250/**
1251 *	dev_alloc_name - allocate a name for a device
1252 *	@dev: device
1253 *	@name: name format string
1254 *
1255 *	Passed a format string - eg "lt%d" - it will try to find a suitable
1256 *	id. It scans the list of devices to build up a free map, then chooses
1257 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1258 *	while allocating the name and adding the device in order to avoid
1259 *	duplicates.
1260 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1261 *	Returns the number of the unit assigned or a negative errno code.
1262 */
1263
1264int dev_alloc_name(struct net_device *dev, const char *name)
1265{
1266	return dev_alloc_name_ns(dev_net(dev), dev, name);
1267}
1268EXPORT_SYMBOL(dev_alloc_name);
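
/*
 * Editor's illustrative sketch (not part of the original file): requesting
 * the next free "foo%d" name for a device before it is registered. The
 * function name and "foo%d" prefix are hypothetical.
 */
static int __maybe_unused example_pick_name(struct net_device *dev)
{
	int unit;

	/* On success dev->name now holds e.g. "foo0" and unit is 0. */
	unit = dev_alloc_name(dev, "foo%d");
	if (unit < 0)
		return unit;

	return 0;
}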
1269
1270static int dev_get_valid_name(struct net *net, struct net_device *dev,
1271			      const char *name)
1272{
1273	BUG_ON(!net);
1274
1275	if (!dev_valid_name(name))
1276		return -EINVAL;
1277
1278	if (strchr(name, '%'))
1279		return dev_alloc_name_ns(net, dev, name);
1280	else if (__dev_get_by_name(net, name))
1281		return -EEXIST;
1282	else if (dev->name != name)
1283		strlcpy(dev->name, name, IFNAMSIZ);
1284
1285	return 0;
1286}
1287
1288/**
1289 *	dev_change_name - change name of a device
1290 *	@dev: device
1291 *	@newname: name (or format string) must be at least IFNAMSIZ
1292 *
1293 *	Change the name of a device. A format string such as "eth%d"
1294 *	can be passed for wildcarding.
1295 */
1296int dev_change_name(struct net_device *dev, const char *newname)
1297{
1298	unsigned char old_assign_type;
1299	char oldname[IFNAMSIZ];
1300	int err = 0;
1301	int ret;
1302	struct net *net;
1303
1304	ASSERT_RTNL();
1305	BUG_ON(!dev_net(dev));
1306
1307	net = dev_net(dev);
1308
1309	/* Some auto-enslaved devices e.g. failover slaves are
1310	 * special, as userspace might rename the device after
1311	 * the interface had been brought up and running since
1312	 * the point the kernel initiated auto-enslavement. Allow
1313	 * live name change even when these slave devices are
1314	 * up and running.
1315	 *
1316	 * Typically, users of these auto-enslaving devices
1317	 * don't actually care about slave name change, as
1318	 * they are supposed to operate on master interface
1319	 * directly.
1320	 */
1321	if (dev->flags & IFF_UP &&
1322	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
1323		return -EBUSY;
1324
1325	down_write(&devnet_rename_sem);
1326
1327	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1328		up_write(&devnet_rename_sem);
1329		return 0;
1330	}
1331
1332	memcpy(oldname, dev->name, IFNAMSIZ);
1333
1334	err = dev_get_valid_name(net, dev, newname);
1335	if (err < 0) {
1336		up_write(&devnet_rename_sem);
1337		return err;
1338	}
1339
1340	if (oldname[0] && !strchr(oldname, '%'))
1341		netdev_info(dev, "renamed from %s\n", oldname);
1342
1343	old_assign_type = dev->name_assign_type;
1344	dev->name_assign_type = NET_NAME_RENAMED;
1345
1346rollback:
1347	ret = device_rename(&dev->dev, dev->name);
1348	if (ret) {
1349		memcpy(dev->name, oldname, IFNAMSIZ);
1350		dev->name_assign_type = old_assign_type;
1351		up_write(&devnet_rename_sem);
1352		return ret;
1353	}
1354
1355	up_write(&devnet_rename_sem);
1356
1357	netdev_adjacent_rename_links(dev, oldname);
1358
1359	write_lock_bh(&dev_base_lock);
1360	netdev_name_node_del(dev->name_node);
1361	write_unlock_bh(&dev_base_lock);
1362
1363	synchronize_rcu();
1364
1365	write_lock_bh(&dev_base_lock);
1366	netdev_name_node_add(net, dev->name_node);
1367	write_unlock_bh(&dev_base_lock);
1368
1369	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1370	ret = notifier_to_errno(ret);
1371
1372	if (ret) {
1373		/* err >= 0 after dev_alloc_name() or stores the first errno */
1374		if (err >= 0) {
1375			err = ret;
1376			down_write(&devnet_rename_sem);
1377			memcpy(dev->name, oldname, IFNAMSIZ);
1378			memcpy(oldname, newname, IFNAMSIZ);
1379			dev->name_assign_type = old_assign_type;
1380			old_assign_type = NET_NAME_RENAMED;
1381			goto rollback;
1382		} else {
1383			pr_err("%s: name change rollback failed: %d\n",
1384			       dev->name, ret);
1385		}
1386	}
1387
1388	return err;
1389}
1390
1391/**
1392 *	dev_set_alias - change ifalias of a device
1393 *	@dev: device
1394 *	@alias: name up to IFALIASZ
1395 *	@len: limit of bytes to copy from @alias
1396 *
1397 *	Set the ifalias for a device.
1398 */
1399int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1400{
1401	struct dev_ifalias *new_alias = NULL;
1402
1403	if (len >= IFALIASZ)
1404		return -EINVAL;
1405
1406	if (len) {
1407		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1408		if (!new_alias)
1409			return -ENOMEM;
1410
1411		memcpy(new_alias->ifalias, alias, len);
1412		new_alias->ifalias[len] = 0;
1413	}
1414
1415	mutex_lock(&ifalias_mutex);
1416	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1417					mutex_is_locked(&ifalias_mutex));
1418	mutex_unlock(&ifalias_mutex);
1419
1420	if (new_alias)
1421		kfree_rcu(new_alias, rcuhead);
1422
1423	return len;
1424}
1425EXPORT_SYMBOL(dev_set_alias);
1426
1427/**
1428 *	dev_get_alias - get ifalias of a device
1429 *	@dev: device
1430 *	@name: buffer to store name of ifalias
1431 *	@len: size of buffer
1432 *
1433 *	Get the ifalias for a device.  The caller must make sure dev cannot go
1434 *	away, e.g. by holding the RCU read lock or a reference to the device.
1435 */
1436int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1437{
1438	const struct dev_ifalias *alias;
1439	int ret = 0;
1440
1441	rcu_read_lock();
1442	alias = rcu_dereference(dev->ifalias);
1443	if (alias)
1444		ret = snprintf(name, len, "%s", alias->ifalias);
1445	rcu_read_unlock();
1446
1447	return ret;
1448}
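
/*
 * Editor's illustrative sketch (not part of the original file): setting an
 * interface alias and reading it back. The caller is assumed to hold a
 * reference on @dev; the function name and alias string are hypothetical.
 */
static void __maybe_unused example_label_dev(struct net_device *dev)
{
	char buf[IFALIASZ];

	dev_set_alias(dev, "uplink", strlen("uplink"));
	if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
		netdev_info(dev, "alias set to %s\n", buf);
}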
1449
1450/**
1451 *	netdev_features_change - device changes features
1452 *	@dev: device to cause notification
1453 *
1454 *	Called to indicate a device has changed features.
1455 */
1456void netdev_features_change(struct net_device *dev)
1457{
1458	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1459}
1460EXPORT_SYMBOL(netdev_features_change);
1461
1462/**
1463 *	netdev_state_change - device changes state
1464 *	@dev: device to cause notification
1465 *
1466 *	Called to indicate a device has changed state. This function calls
1467 *	the notifier chains for netdev_chain and sends a NEWLINK message
1468 *	to the routing socket.
1469 */
1470void netdev_state_change(struct net_device *dev)
1471{
1472	if (dev->flags & IFF_UP) {
1473		struct netdev_notifier_change_info change_info = {
1474			.info.dev = dev,
1475		};
1476
1477		call_netdevice_notifiers_info(NETDEV_CHANGE,
1478					      &change_info.info);
1479		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1480	}
1481}
1482EXPORT_SYMBOL(netdev_state_change);
1483
1484/**
1485 * netdev_notify_peers - notify network peers about existence of @dev
1486 * @dev: network device
1487 *
1488 * Generate traffic such that interested network peers are aware of
1489 * @dev, such as by generating a gratuitous ARP. This may be used when
1490 * a device wants to inform the rest of the network about some sort of
1491 * reconfiguration such as a failover event or virtual machine
1492 * migration.
1493 */
1494void netdev_notify_peers(struct net_device *dev)
1495{
1496	rtnl_lock();
1497	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1498	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1499	rtnl_unlock();
1500}
1501EXPORT_SYMBOL(netdev_notify_peers);
1502
1503static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1504{
1505	const struct net_device_ops *ops = dev->netdev_ops;
1506	int ret;
1507
1508	ASSERT_RTNL();
1509
1510	if (!netif_device_present(dev)) {
1511		/* may be detached because parent is runtime-suspended */
1512		if (dev->dev.parent)
1513			pm_runtime_resume(dev->dev.parent);
1514		if (!netif_device_present(dev))
1515			return -ENODEV;
1516	}
1517
1518	/* Block netpoll from trying to do any rx path servicing.
1519	 * If we don't do this there is a chance ndo_poll_controller
1520	 * or ndo_poll may be running while we open the device
1521	 */
1522	netpoll_poll_disable(dev);
1523
1524	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1525	ret = notifier_to_errno(ret);
1526	if (ret)
1527		return ret;
1528
1529	set_bit(__LINK_STATE_START, &dev->state);
1530
1531	if (ops->ndo_validate_addr)
1532		ret = ops->ndo_validate_addr(dev);
1533
1534	if (!ret && ops->ndo_open)
1535		ret = ops->ndo_open(dev);
1536
1537	netpoll_poll_enable(dev);
1538
1539	if (ret)
1540		clear_bit(__LINK_STATE_START, &dev->state);
1541	else {
1542		dev->flags |= IFF_UP;
1543		dev_set_rx_mode(dev);
1544		dev_activate(dev);
1545		add_device_randomness(dev->dev_addr, dev->addr_len);
1546	}
1547
1548	return ret;
1549}
1550
1551/**
1552 *	dev_open	- prepare an interface for use.
1553 *	@dev: device to open
1554 *	@extack: netlink extended ack
1555 *
1556 *	Takes a device from down to up state. The device's private open
1557 *	function is invoked and then the multicast lists are loaded. Finally
1558 *	the device is moved into the up state and a %NETDEV_UP message is
1559 *	sent to the netdev notifier chain.
1560 *
1561 *	Calling this function on an active interface is a nop. On a failure
1562 *	a negative errno code is returned.
1563 */
1564int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1565{
1566	int ret;
1567
1568	if (dev->flags & IFF_UP)
1569		return 0;
1570
1571	ret = __dev_open(dev, extack);
1572	if (ret < 0)
1573		return ret;
1574
1575	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1576	call_netdevice_notifiers(NETDEV_UP, dev);
1577
1578	return ret;
1579}
1580EXPORT_SYMBOL(dev_open);
1581
1582static void __dev_close_many(struct list_head *head)
1583{
1584	struct net_device *dev;
1585
1586	ASSERT_RTNL();
1587	might_sleep();
1588
1589	list_for_each_entry(dev, head, close_list) {
1590		/* Temporarily disable netpoll until the interface is down */
1591		netpoll_poll_disable(dev);
1592
1593		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1594
1595		clear_bit(__LINK_STATE_START, &dev->state);
1596
1597		/* Synchronize to the scheduled poll. We cannot touch the poll list;
1598		 * it may even be on a different CPU. So just clear netif_running().
1599		 *
1600		 * dev->stop() will invoke napi_disable() on all of its
1601		 * napi_struct instances on this device.
1602		 */
1603		smp_mb__after_atomic(); /* Commit netif_running(). */
1604	}
1605
1606	dev_deactivate_many(head);
1607
1608	list_for_each_entry(dev, head, close_list) {
1609		const struct net_device_ops *ops = dev->netdev_ops;
1610
1611		/*
1612		 *	Call the device specific close. This cannot fail.
1613		 *	It is only called if the device is UP.
1614		 *
1615		 *	We allow it to be called even after a DETACH hot-plug
1616		 *	event.
1617		 */
1618		if (ops->ndo_stop)
1619			ops->ndo_stop(dev);
1620
1621		dev->flags &= ~IFF_UP;
1622		netpoll_poll_enable(dev);
1623	}
1624}
1625
1626static void __dev_close(struct net_device *dev)
1627{
1628	LIST_HEAD(single);
1629
1630	list_add(&dev->close_list, &single);
1631	__dev_close_many(&single);
1632	list_del(&single);
1633}
1634
1635void dev_close_many(struct list_head *head, bool unlink)
1636{
1637	struct net_device *dev, *tmp;
1638
1639	/* Remove the devices that don't need to be closed */
1640	list_for_each_entry_safe(dev, tmp, head, close_list)
1641		if (!(dev->flags & IFF_UP))
1642			list_del_init(&dev->close_list);
1643
1644	__dev_close_many(head);
1645
1646	list_for_each_entry_safe(dev, tmp, head, close_list) {
1647		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1648		call_netdevice_notifiers(NETDEV_DOWN, dev);
1649		if (unlink)
1650			list_del_init(&dev->close_list);
1651	}
1652}
1653EXPORT_SYMBOL(dev_close_many);
1654
1655/**
1656 *	dev_close - shutdown an interface.
1657 *	@dev: device to shutdown
1658 *
1659 *	This function moves an active device into down state. A
1660 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1661 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1662 *	chain.
1663 */
1664void dev_close(struct net_device *dev)
1665{
1666	if (dev->flags & IFF_UP) {
1667		LIST_HEAD(single);
1668
1669		list_add(&dev->close_list, &single);
1670		dev_close_many(&single, true);
1671		list_del(&single);
1672	}
1673}
1674EXPORT_SYMBOL(dev_close);
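
/*
 * Editor's illustrative sketch (not part of the original file): bringing an
 * interface up and back down from kernel code, under the RTNL lock that
 * both helpers require. The function name is hypothetical.
 */
static int __maybe_unused example_bounce_dev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev, NULL);	/* NULL: no extended ack to fill in */
	if (!err)
		dev_close(dev);
	rtnl_unlock();

	return err;
}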
1675
1676
1677/**
1678 *	dev_disable_lro - disable Large Receive Offload on a device
1679 *	@dev: device
1680 *
1681 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1682 *	called under RTNL.  This is needed if received packets may be
1683 *	forwarded to another interface.
1684 */
1685void dev_disable_lro(struct net_device *dev)
1686{
1687	struct net_device *lower_dev;
1688	struct list_head *iter;
1689
1690	dev->wanted_features &= ~NETIF_F_LRO;
1691	netdev_update_features(dev);
1692
1693	if (unlikely(dev->features & NETIF_F_LRO))
1694		netdev_WARN(dev, "failed to disable LRO!\n");
1695
1696	netdev_for_each_lower_dev(dev, lower_dev, iter)
1697		dev_disable_lro(lower_dev);
1698}
1699EXPORT_SYMBOL(dev_disable_lro);
1700
1701/**
1702 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1703 *	@dev: device
1704 *
1705 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1706 *	called under RTNL.  This is needed if Generic XDP is installed on
1707 *	the device.
1708 */
1709static void dev_disable_gro_hw(struct net_device *dev)
1710{
1711	dev->wanted_features &= ~NETIF_F_GRO_HW;
1712	netdev_update_features(dev);
1713
1714	if (unlikely(dev->features & NETIF_F_GRO_HW))
1715		netdev_WARN(dev, "failed to disable GRO_HW!\n");
1716}
1717
1718const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1719{
1720#define N(val) 						\
1721	case NETDEV_##val:				\
1722		return "NETDEV_" __stringify(val);
1723	switch (cmd) {
1724	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1725	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1726	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1727	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1728	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1729	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1730	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1731	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1732	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1733	N(PRE_CHANGEADDR)
1734	}
1735#undef N
1736	return "UNKNOWN_NETDEV_EVENT";
1737}
1738EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1739
1740static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1741				   struct net_device *dev)
1742{
1743	struct netdev_notifier_info info = {
1744		.dev = dev,
1745	};
1746
1747	return nb->notifier_call(nb, val, &info);
1748}
1749
1750static int call_netdevice_register_notifiers(struct notifier_block *nb,
1751					     struct net_device *dev)
1752{
1753	int err;
1754
1755	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1756	err = notifier_to_errno(err);
1757	if (err)
1758		return err;
1759
1760	if (!(dev->flags & IFF_UP))
1761		return 0;
1762
1763	call_netdevice_notifier(nb, NETDEV_UP, dev);
1764	return 0;
1765}
1766
1767static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1768						struct net_device *dev)
1769{
1770	if (dev->flags & IFF_UP) {
1771		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1772					dev);
1773		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1774	}
1775	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1776}
1777
1778static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1779						 struct net *net)
1780{
1781	struct net_device *dev;
1782	int err;
1783
1784	for_each_netdev(net, dev) {
1785		err = call_netdevice_register_notifiers(nb, dev);
1786		if (err)
1787			goto rollback;
1788	}
1789	return 0;
1790
1791rollback:
1792	for_each_netdev_continue_reverse(net, dev)
1793		call_netdevice_unregister_notifiers(nb, dev);
1794	return err;
1795}
1796
1797static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1798						    struct net *net)
1799{
1800	struct net_device *dev;
1801
1802	for_each_netdev(net, dev)
1803		call_netdevice_unregister_notifiers(nb, dev);
1804}
1805
1806static int dev_boot_phase = 1;
1807
1808/**
1809 * register_netdevice_notifier - register a network notifier block
1810 * @nb: notifier
1811 *
1812 * Register a notifier to be called when network device events occur.
1813 * The notifier passed is linked into the kernel structures and must
1814 * not be reused until it has been unregistered. A negative errno code
1815 * is returned on a failure.
1816 *
1817 * When registered, all registration and up events are replayed
1818 * to the new notifier to give it a race-free
1819 * view of the network device list.
1820 */
1821
1822int register_netdevice_notifier(struct notifier_block *nb)
1823{
1824	struct net *net;
1825	int err;
1826
1827	/* Close race with setup_net() and cleanup_net() */
1828	down_write(&pernet_ops_rwsem);
1829	rtnl_lock();
1830	err = raw_notifier_chain_register(&netdev_chain, nb);
1831	if (err)
1832		goto unlock;
1833	if (dev_boot_phase)
1834		goto unlock;
1835	for_each_net(net) {
1836		err = call_netdevice_register_net_notifiers(nb, net);
1837		if (err)
1838			goto rollback;
1839	}
1840
1841unlock:
1842	rtnl_unlock();
1843	up_write(&pernet_ops_rwsem);
1844	return err;
1845
1846rollback:
1847	for_each_net_continue_reverse(net)
1848		call_netdevice_unregister_net_notifiers(nb, net);
1849
1850	raw_notifier_chain_unregister(&netdev_chain, nb);
1851	goto unlock;
1852}
1853EXPORT_SYMBOL(register_netdevice_notifier);
1854
1855/**
1856 * unregister_netdevice_notifier - unregister a network notifier block
1857 * @nb: notifier
1858 *
1859 * Unregister a notifier previously registered by
1860 * register_netdevice_notifier(). The notifier is unlinked from the
1861 * kernel structures and may then be reused. A negative errno code
1862 * is returned on a failure.
1863 *
1864 * After unregistering, unregister and down device events are synthesized
1865 * for all devices on the device list and sent to the removed notifier,
1866 * removing the need for special-case cleanup code.
1867 */
1868
1869int unregister_netdevice_notifier(struct notifier_block *nb)
1870{
1871	struct net *net;
1872	int err;
1873
1874	/* Close race with setup_net() and cleanup_net() */
1875	down_write(&pernet_ops_rwsem);
1876	rtnl_lock();
1877	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1878	if (err)
1879		goto unlock;
1880
1881	for_each_net(net)
1882		call_netdevice_unregister_net_notifiers(nb, net);
1883
1884unlock:
1885	rtnl_unlock();
1886	up_write(&pernet_ops_rwsem);
1887	return err;
1888}
1889EXPORT_SYMBOL(unregister_netdevice_notifier);
1890
1891static int __register_netdevice_notifier_net(struct net *net,
1892					     struct notifier_block *nb,
1893					     bool ignore_call_fail)
1894{
1895	int err;
1896
1897	err = raw_notifier_chain_register(&net->netdev_chain, nb);
1898	if (err)
1899		return err;
1900	if (dev_boot_phase)
1901		return 0;
1902
1903	err = call_netdevice_register_net_notifiers(nb, net);
1904	if (err && !ignore_call_fail)
1905		goto chain_unregister;
1906
1907	return 0;
1908
1909chain_unregister:
1910	raw_notifier_chain_unregister(&net->netdev_chain, nb);
1911	return err;
1912}
1913
1914static int __unregister_netdevice_notifier_net(struct net *net,
1915					       struct notifier_block *nb)
1916{
1917	int err;
1918
1919	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1920	if (err)
1921		return err;
1922
1923	call_netdevice_unregister_net_notifiers(nb, net);
1924	return 0;
1925}
1926
1927/**
1928 * register_netdevice_notifier_net - register a per-netns network notifier block
1929 * @net: network namespace
1930 * @nb: notifier
1931 *
1932 * Register a notifier to be called when network device events occur.
1933 * The notifier passed is linked into the kernel structures and must
1934 * not be reused until it has been unregistered. A negative errno code
1935 * is returned on a failure.
1936 *
1937 * When registered, all registration and up events are replayed
1938 * to the new notifier to allow the device to have a race-free
1939 * view of the network device list.
1940 */
1941
1942int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1943{
1944	int err;
1945
1946	rtnl_lock();
1947	err = __register_netdevice_notifier_net(net, nb, false);
1948	rtnl_unlock();
1949	return err;
1950}
1951EXPORT_SYMBOL(register_netdevice_notifier_net);
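
/* Example (illustrative sketch): the same hypothetical notifier block from
 * the sketch above could instead be registered for events in a single
 * namespace, here init_net:
 *
 *	err = register_netdevice_notifier_net(&init_net, &foo_nb);
 *	...
 *	unregister_netdevice_notifier_net(&init_net, &foo_nb);
 */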
1952
1953/**
1954 * unregister_netdevice_notifier_net - unregister a per-netns
1955 *                                     network notifier block
1956 * @net: network namespace
1957 * @nb: notifier
1958 *
1959 * Unregister a notifier previously registered by
1960 * register_netdevice_notifier_net(). The notifier is unlinked from the
1961 * kernel structures and may then be reused. A negative errno code
1962 * is returned on a failure.
1963 *
1964 * After unregistering, unregister and down device events are synthesized
1965 * for all devices on the device list and sent to the removed notifier,
1966 * removing the need for special-case cleanup code.
1967 */
1968
1969int unregister_netdevice_notifier_net(struct net *net,
1970				      struct notifier_block *nb)
1971{
1972	int err;
1973
1974	rtnl_lock();
1975	err = __unregister_netdevice_notifier_net(net, nb);
1976	rtnl_unlock();
1977	return err;
1978}
1979EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1980
1981int register_netdevice_notifier_dev_net(struct net_device *dev,
1982					struct notifier_block *nb,
1983					struct netdev_net_notifier *nn)
1984{
1985	int err;
1986
1987	rtnl_lock();
1988	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1989	if (!err) {
1990		nn->nb = nb;
1991		list_add(&nn->list, &dev->net_notifier_list);
1992	}
1993	rtnl_unlock();
1994	return err;
1995}
1996EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1997
1998int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1999					  struct notifier_block *nb,
2000					  struct netdev_net_notifier *nn)
2001{
2002	int err;
2003
2004	rtnl_lock();
2005	list_del(&nn->list);
2006	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
2007	rtnl_unlock();
2008	return err;
2009}
2010EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
2011
2012static void move_netdevice_notifiers_dev_net(struct net_device *dev,
2013					     struct net *net)
2014{
2015	struct netdev_net_notifier *nn;
2016
2017	list_for_each_entry(nn, &dev->net_notifier_list, list) {
2018		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
2019		__register_netdevice_notifier_net(net, nn->nb, true);
2020	}
2021}
2022
2023/**
2024 *	call_netdevice_notifiers_info - call all network notifier blocks
2025 *	@val: value passed unmodified to notifier function
2026 *	@info: notifier information data
2027 *
2028 *	Call all network notifier blocks.  Parameters and return value
2029 *	are as for raw_notifier_call_chain().
2030 */
2031
2032static int call_netdevice_notifiers_info(unsigned long val,
2033					 struct netdev_notifier_info *info)
2034{
2035	struct net *net = dev_net(info->dev);
2036	int ret;
2037
2038	ASSERT_RTNL();
2039
2040	/* Run per-netns notifier block chain first, then run the global one.
2041	 * Hopefully, one day, the global one is going to be removed after
2042	 * all notifier block registrants get converted to be per-netns.
2043	 */
2044	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2045	if (ret & NOTIFY_STOP_MASK)
2046		return ret;
2047	return raw_notifier_call_chain(&netdev_chain, val, info);
2048}
2049
2050static int call_netdevice_notifiers_extack(unsigned long val,
2051					   struct net_device *dev,
2052					   struct netlink_ext_ack *extack)
2053{
2054	struct netdev_notifier_info info = {
2055		.dev = dev,
2056		.extack = extack,
2057	};
2058
2059	return call_netdevice_notifiers_info(val, &info);
2060}
2061
2062/**
2063 *	call_netdevice_notifiers - call all network notifier blocks
2064 *      @val: value passed unmodified to notifier function
2065 *      @dev: net_device pointer passed unmodified to notifier function
2066 *
2067 *	Call all network notifier blocks.  Parameters and return value
2068 *	are as for raw_notifier_call_chain().
2069 */
2070
2071int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2072{
2073	return call_netdevice_notifiers_extack(val, dev, NULL);
2074}
2075EXPORT_SYMBOL(call_netdevice_notifiers);
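
/* Example (illustrative sketch): core code typically fires one of these
 * events for a device under RTNL, e.g. after its state changed:
 *
 *	ASSERT_RTNL();
 *	call_netdevice_notifiers(NETDEV_CHANGE, dev);
 */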
2076
2077/**
2078 *	call_netdevice_notifiers_mtu - call all network notifier blocks
2079 *	@val: value passed unmodified to notifier function
2080 *	@dev: net_device pointer passed unmodified to notifier function
2081 *	@arg: additional u32 argument passed to the notifier function
2082 *
2083 *	Call all network notifier blocks.  Parameters and return value
2084 *	are as for raw_notifier_call_chain().
2085 */
2086static int call_netdevice_notifiers_mtu(unsigned long val,
2087					struct net_device *dev, u32 arg)
2088{
2089	struct netdev_notifier_info_ext info = {
2090		.info.dev = dev,
2091		.ext.mtu = arg,
2092	};
2093
2094	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2095
2096	return call_netdevice_notifiers_info(val, &info.info);
2097}
2098
2099#ifdef CONFIG_NET_INGRESS
2100static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2101
2102void net_inc_ingress_queue(void)
2103{
2104	static_branch_inc(&ingress_needed_key);
2105}
2106EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2107
2108void net_dec_ingress_queue(void)
2109{
2110	static_branch_dec(&ingress_needed_key);
2111}
2112EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2113#endif
2114
2115#ifdef CONFIG_NET_EGRESS
2116static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2117
2118void net_inc_egress_queue(void)
2119{
2120	static_branch_inc(&egress_needed_key);
2121}
2122EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2123
2124void net_dec_egress_queue(void)
2125{
2126	static_branch_dec(&egress_needed_key);
2127}
2128EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2129#endif
2130
2131static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2132#ifdef CONFIG_JUMP_LABEL
2133static atomic_t netstamp_needed_deferred;
2134static atomic_t netstamp_wanted;
2135static void netstamp_clear(struct work_struct *work)
2136{
2137	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2138	int wanted;
2139
2140	wanted = atomic_add_return(deferred, &netstamp_wanted);
2141	if (wanted > 0)
2142		static_branch_enable(&netstamp_needed_key);
2143	else
2144		static_branch_disable(&netstamp_needed_key);
2145}
2146static DECLARE_WORK(netstamp_work, netstamp_clear);
2147#endif
2148
2149void net_enable_timestamp(void)
2150{
2151#ifdef CONFIG_JUMP_LABEL
2152	int wanted;
2153
2154	while (1) {
2155		wanted = atomic_read(&netstamp_wanted);
2156		if (wanted <= 0)
2157			break;
2158		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
2159			return;
2160	}
2161	atomic_inc(&netstamp_needed_deferred);
2162	schedule_work(&netstamp_work);
2163#else
2164	static_branch_inc(&netstamp_needed_key);
2165#endif
2166}
2167EXPORT_SYMBOL(net_enable_timestamp);
2168
2169void net_disable_timestamp(void)
2170{
2171#ifdef CONFIG_JUMP_LABEL
2172	int wanted;
2173
2174	while (1) {
2175		wanted = atomic_read(&netstamp_wanted);
2176		if (wanted <= 1)
2177			break;
2178		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
2179			return;
2180	}
2181	atomic_dec(&netstamp_needed_deferred);
2182	schedule_work(&netstamp_work);
2183#else
2184	static_branch_dec(&netstamp_needed_key);
2185#endif
2186}
2187EXPORT_SYMBOL(net_disable_timestamp);
2188
2189static inline void net_timestamp_set(struct sk_buff *skb)
2190{
2191	skb->tstamp = 0;
2192	if (static_branch_unlikely(&netstamp_needed_key))
2193		__net_timestamp(skb);
2194}
2195
2196#define net_timestamp_check(COND, SKB)				\
2197	if (static_branch_unlikely(&netstamp_needed_key)) {	\
2198		if ((COND) && !(SKB)->tstamp)			\
2199			__net_timestamp(SKB);			\
2200	}							\
2201
2202bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2203{
2204	unsigned int len;
2205
2206	if (!(dev->flags & IFF_UP))
2207		return false;
2208
2209	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
2210	if (skb->len <= len)
2211		return true;
2212
2213	/* if TSO is enabled, we don't care about the length as the packet
2214	 * could be forwarded without being segmented first
2215	 */
2216	if (skb_is_gso(skb))
2217		return true;
2218
2219	return false;
2220}
2221EXPORT_SYMBOL_GPL(is_skb_forwardable);
2222
2223int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2224{
2225	int ret = ____dev_forward_skb(dev, skb);
2226
2227	if (likely(!ret)) {
2228		skb->protocol = eth_type_trans(skb, dev);
2229		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2230	}
2231
2232	return ret;
2233}
2234EXPORT_SYMBOL_GPL(__dev_forward_skb);
2235
2236/**
2237 * dev_forward_skb - loopback an skb to another netif
2238 *
2239 * @dev: destination network device
2240 * @skb: buffer to forward
2241 *
2242 * return values:
2243 *	NET_RX_SUCCESS	(no congestion)
2244 *	NET_RX_DROP     (packet was dropped, but freed)
2245 *
2246 * dev_forward_skb can be used for injecting an skb from the
2247 * start_xmit function of one device into the receive queue
2248 * of another device.
2249 *
2250 * The receiving device may be in another namespace, so
2251 * we have to clear all information in the skb that could
2252 * impact namespace isolation.
2253 */
2254int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2255{
2256	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2257}
2258EXPORT_SYMBOL_GPL(dev_forward_skb);
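
/* Example (illustrative sketch): a virtual driver's ndo_start_xmit() might
 * hand a packet to its peer device like this; "peer" is a hypothetical
 * struct net_device pointer held by the driver:
 *
 *	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *		dev->stats.tx_packets++;
 *	else
 *		dev->stats.tx_dropped++;
 */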
2259
2260static inline int deliver_skb(struct sk_buff *skb,
2261			      struct packet_type *pt_prev,
2262			      struct net_device *orig_dev)
2263{
2264	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2265		return -ENOMEM;
2266	refcount_inc(&skb->users);
2267	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2268}
2269
2270static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2271					  struct packet_type **pt,
2272					  struct net_device *orig_dev,
2273					  __be16 type,
2274					  struct list_head *ptype_list)
2275{
2276	struct packet_type *ptype, *pt_prev = *pt;
2277
2278	list_for_each_entry_rcu(ptype, ptype_list, list) {
2279		if (ptype->type != type)
2280			continue;
2281		if (pt_prev)
2282			deliver_skb(skb, pt_prev, orig_dev);
2283		pt_prev = ptype;
2284	}
2285	*pt = pt_prev;
2286}
2287
2288static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2289{
2290	if (!ptype->af_packet_priv || !skb->sk)
2291		return false;
2292
2293	if (ptype->id_match)
2294		return ptype->id_match(ptype, skb->sk);
2295	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2296		return true;
2297
2298	return false;
2299}
2300
2301/**
2302 * dev_nit_active - return true if any network interface taps are in use
2303 *
2304 * @dev: network device to check for the presence of taps
2305 */
2306bool dev_nit_active(struct net_device *dev)
2307{
2308	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2309}
2310EXPORT_SYMBOL_GPL(dev_nit_active);
2311
2312/*
2313 *	Support routine. Sends outgoing frames to any network
2314 *	taps currently in use.
2315 */
2316
2317void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2318{
2319	struct packet_type *ptype;
2320	struct sk_buff *skb2 = NULL;
2321	struct packet_type *pt_prev = NULL;
2322	struct list_head *ptype_list = &ptype_all;
2323
2324	rcu_read_lock();
2325again:
2326	list_for_each_entry_rcu(ptype, ptype_list, list) {
2327		if (READ_ONCE(ptype->ignore_outgoing))
2328			continue;
2329
2330		/* Never send packets back to the socket
2331		 * they originated from - MvS (miquels@drinkel.ow.org)
2332		 */
2333		if (skb_loop_sk(ptype, skb))
2334			continue;
2335
2336		if (pt_prev) {
2337			deliver_skb(skb2, pt_prev, skb->dev);
2338			pt_prev = ptype;
2339			continue;
2340		}
2341
2342		/* need to clone skb, done only once */
2343		skb2 = skb_clone(skb, GFP_ATOMIC);
2344		if (!skb2)
2345			goto out_unlock;
2346
2347		net_timestamp_set(skb2);
2348
2349		/* skb->nh should be correctly
2350		 * set by the sender, so that the check below is
2351		 * just protection against buggy protocols.
2352		 */
2353		skb_reset_mac_header(skb2);
2354
2355		if (skb_network_header(skb2) < skb2->data ||
2356		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2357			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2358					     ntohs(skb2->protocol),
2359					     dev->name);
2360			skb_reset_network_header(skb2);
2361		}
2362
2363		skb2->transport_header = skb2->network_header;
2364		skb2->pkt_type = PACKET_OUTGOING;
2365		pt_prev = ptype;
2366	}
2367
2368	if (ptype_list == &ptype_all) {
2369		ptype_list = &dev->ptype_all;
2370		goto again;
2371	}
2372out_unlock:
2373	if (pt_prev) {
2374		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2375			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2376		else
2377			kfree_skb(skb2);
2378	}
2379	rcu_read_unlock();
2380}
2381EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2382
2383/**
2384 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2385 * @dev: Network device
2386 * @txq: number of queues available
2387 *
2388 * If real_num_tx_queues is changed the tc mappings may no longer be
2389 * valid. To resolve this, verify that each tc mapping remains valid, and
2390 * if not, zero the mapping. Once no priorities map to a given
2391 * offset/count pair, it is no longer used. In the worst case, if TC0
2392 * is invalid, nothing can be done, so priority mappings are disabled
2393 * entirely. It is expected that drivers will fix this mapping if they
2394 * can before calling netif_set_real_num_tx_queues.
2395 */
2396static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2397{
2398	int i;
2399	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2400
2401	/* If TC0 is invalidated disable TC mapping */
2402	if (tc->offset + tc->count > txq) {
2403		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2404		dev->num_tc = 0;
2405		return;
2406	}
2407
2408	/* Invalidated prio-to-tc mappings are reset to TC0 */
2409	for (i = 1; i < TC_BITMASK + 1; i++) {
2410		int q = netdev_get_prio_tc_map(dev, i);
2411
2412		tc = &dev->tc_to_txq[q];
2413		if (tc->offset + tc->count > txq) {
2414			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2415				i, q);
2416			netdev_set_prio_tc_map(dev, i, 0);
2417		}
2418	}
2419}
2420
2421int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2422{
2423	if (dev->num_tc) {
2424		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2425		int i;
2426
2427		/* walk through the TCs and see if it falls into any of them */
2428		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2429			if ((txq - tc->offset) < tc->count)
2430				return i;
2431		}
2432
2433		/* didn't find it, just return -1 to indicate no match */
2434		return -1;
2435	}
2436
2437	return 0;
2438}
2439EXPORT_SYMBOL(netdev_txq_to_tc);
2440
2441#ifdef CONFIG_XPS
2442struct static_key xps_needed __read_mostly;
2443EXPORT_SYMBOL(xps_needed);
2444struct static_key xps_rxqs_needed __read_mostly;
2445EXPORT_SYMBOL(xps_rxqs_needed);
2446static DEFINE_MUTEX(xps_map_mutex);
2447#define xmap_dereference(P)		\
2448	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2449
2450static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2451			     int tci, u16 index)
2452{
2453	struct xps_map *map = NULL;
2454	int pos;
2455
2456	if (dev_maps)
2457		map = xmap_dereference(dev_maps->attr_map[tci]);
2458	if (!map)
2459		return false;
2460
2461	for (pos = map->len; pos--;) {
2462		if (map->queues[pos] != index)
2463			continue;
2464
2465		if (map->len > 1) {
2466			map->queues[pos] = map->queues[--map->len];
2467			break;
2468		}
2469
2470		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2471		kfree_rcu(map, rcu);
2472		return false;
2473	}
2474
2475	return true;
2476}
2477
2478static bool remove_xps_queue_cpu(struct net_device *dev,
2479				 struct xps_dev_maps *dev_maps,
2480				 int cpu, u16 offset, u16 count)
2481{
2482	int num_tc = dev->num_tc ? : 1;
2483	bool active = false;
2484	int tci;
2485
2486	for (tci = cpu * num_tc; num_tc--; tci++) {
2487		int i, j;
2488
2489		for (i = count, j = offset; i--; j++) {
2490			if (!remove_xps_queue(dev_maps, tci, j))
2491				break;
2492		}
2493
2494		active |= i < 0;
2495	}
2496
2497	return active;
2498}
2499
2500static void reset_xps_maps(struct net_device *dev,
2501			   struct xps_dev_maps *dev_maps,
2502			   bool is_rxqs_map)
2503{
2504	if (is_rxqs_map) {
2505		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2506		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2507	} else {
2508		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2509	}
2510	static_key_slow_dec_cpuslocked(&xps_needed);
2511	kfree_rcu(dev_maps, rcu);
2512}
2513
2514static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2515			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2516			   u16 offset, u16 count, bool is_rxqs_map)
2517{
2518	bool active = false;
2519	int i, j;
2520
2521	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2522	     j < nr_ids;)
2523		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2524					       count);
2525	if (!active)
2526		reset_xps_maps(dev, dev_maps, is_rxqs_map);
2527
2528	if (!is_rxqs_map) {
2529		for (i = offset + (count - 1); count--; i--) {
2530			netdev_queue_numa_node_write(
2531				netdev_get_tx_queue(dev, i),
2532				NUMA_NO_NODE);
2533		}
2534	}
2535}
2536
2537static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2538				   u16 count)
2539{
2540	const unsigned long *possible_mask = NULL;
2541	struct xps_dev_maps *dev_maps;
2542	unsigned int nr_ids;
2543
2544	if (!static_key_false(&xps_needed))
2545		return;
2546
2547	cpus_read_lock();
2548	mutex_lock(&xps_map_mutex);
2549
2550	if (static_key_false(&xps_rxqs_needed)) {
2551		dev_maps = xmap_dereference(dev->xps_rxqs_map);
2552		if (dev_maps) {
2553			nr_ids = dev->num_rx_queues;
2554			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2555				       offset, count, true);
2556		}
2557	}
2558
2559	dev_maps = xmap_dereference(dev->xps_cpus_map);
2560	if (!dev_maps)
2561		goto out_no_maps;
2562
2563	if (num_possible_cpus() > 1)
2564		possible_mask = cpumask_bits(cpu_possible_mask);
2565	nr_ids = nr_cpu_ids;
2566	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2567		       false);
2568
2569out_no_maps:
2570	mutex_unlock(&xps_map_mutex);
2571	cpus_read_unlock();
2572}
2573
2574static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2575{
2576	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2577}
2578
2579static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2580				      u16 index, bool is_rxqs_map)
2581{
2582	struct xps_map *new_map;
2583	int alloc_len = XPS_MIN_MAP_ALLOC;
2584	int i, pos;
2585
2586	for (pos = 0; map && pos < map->len; pos++) {
2587		if (map->queues[pos] != index)
2588			continue;
2589		return map;
2590	}
2591
2592	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
2593	if (map) {
2594		if (pos < map->alloc_len)
2595			return map;
2596
2597		alloc_len = map->alloc_len * 2;
2598	}
2599
2600	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2601	 * map
2602	 */
2603	if (is_rxqs_map)
2604		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2605	else
2606		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2607				       cpu_to_node(attr_index));
2608	if (!new_map)
2609		return NULL;
2610
2611	for (i = 0; i < pos; i++)
2612		new_map->queues[i] = map->queues[i];
2613	new_map->alloc_len = alloc_len;
2614	new_map->len = pos;
2615
2616	return new_map;
2617}
2618
2619/* Must be called under cpus_read_lock */
2620int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2621			  u16 index, bool is_rxqs_map)
2622{
2623	const unsigned long *online_mask = NULL, *possible_mask = NULL;
2624	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2625	int i, j, tci, numa_node_id = -2;
2626	int maps_sz, num_tc = 1, tc = 0;
2627	struct xps_map *map, *new_map;
2628	bool active = false;
2629	unsigned int nr_ids;
2630
2631	WARN_ON_ONCE(index >= dev->num_tx_queues);
2632
2633	if (dev->num_tc) {
2634		/* Do not allow XPS on subordinate device directly */
2635		num_tc = dev->num_tc;
2636		if (num_tc < 0)
2637			return -EINVAL;
2638
2639		/* If queue belongs to subordinate dev use its map */
2640		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2641
2642		tc = netdev_txq_to_tc(dev, index);
2643		if (tc < 0)
2644			return -EINVAL;
2645	}
2646
2647	mutex_lock(&xps_map_mutex);
2648	if (is_rxqs_map) {
2649		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2650		dev_maps = xmap_dereference(dev->xps_rxqs_map);
2651		nr_ids = dev->num_rx_queues;
2652	} else {
2653		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2654		if (num_possible_cpus() > 1) {
2655			online_mask = cpumask_bits(cpu_online_mask);
2656			possible_mask = cpumask_bits(cpu_possible_mask);
2657		}
2658		dev_maps = xmap_dereference(dev->xps_cpus_map);
2659		nr_ids = nr_cpu_ids;
2660	}
2661
2662	if (maps_sz < L1_CACHE_BYTES)
2663		maps_sz = L1_CACHE_BYTES;
2664
2665	/* allocate memory for queue storage */
2666	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2667	     j < nr_ids;) {
2668		if (!new_dev_maps)
2669			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2670		if (!new_dev_maps) {
2671			mutex_unlock(&xps_map_mutex);
2672			return -ENOMEM;
2673		}
2674
2675		tci = j * num_tc + tc;
2676		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2677				 NULL;
2678
2679		map = expand_xps_map(map, j, index, is_rxqs_map);
2680		if (!map)
2681			goto error;
2682
2683		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2684	}
2685
2686	if (!new_dev_maps)
2687		goto out_no_new_maps;
2688
2689	if (!dev_maps) {
2690		/* Increment static keys at most once per type */
2691		static_key_slow_inc_cpuslocked(&xps_needed);
2692		if (is_rxqs_map)
2693			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2694	}
2695
2696	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2697	     j < nr_ids;) {
2698		/* copy maps belonging to foreign traffic classes */
2699		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2700			/* fill in the new device map from the old device map */
2701			map = xmap_dereference(dev_maps->attr_map[tci]);
2702			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2703		}
2704
2705		/* We need to explicitly update tci as the previous loop
2706		 * could break out early if dev_maps is NULL.
2707		 */
2708		tci = j * num_tc + tc;
2709
2710		if (netif_attr_test_mask(j, mask, nr_ids) &&
2711		    netif_attr_test_online(j, online_mask, nr_ids)) {
2712			/* add tx-queue to CPU/rx-queue maps */
2713			int pos = 0;
2714
2715			map = xmap_dereference(new_dev_maps->attr_map[tci]);
2716			while ((pos < map->len) && (map->queues[pos] != index))
2717				pos++;
2718
2719			if (pos == map->len)
2720				map->queues[map->len++] = index;
2721#ifdef CONFIG_NUMA
2722			if (!is_rxqs_map) {
2723				if (numa_node_id == -2)
2724					numa_node_id = cpu_to_node(j);
2725				else if (numa_node_id != cpu_to_node(j))
2726					numa_node_id = -1;
2727			}
2728#endif
2729		} else if (dev_maps) {
2730			/* fill in the new device map from the old device map */
2731			map = xmap_dereference(dev_maps->attr_map[tci]);
2732			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2733		}
2734
2735		/* copy maps belonging to foreign traffic classes */
2736		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2737			/* fill in the new device map from the old device map */
2738			map = xmap_dereference(dev_maps->attr_map[tci]);
2739			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2740		}
2741	}
2742
2743	if (is_rxqs_map)
2744		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2745	else
2746		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2747
2748	/* Cleanup old maps */
2749	if (!dev_maps)
2750		goto out_no_old_maps;
2751
2752	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2753	     j < nr_ids;) {
2754		for (i = num_tc, tci = j * num_tc; i--; tci++) {
2755			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2756			map = xmap_dereference(dev_maps->attr_map[tci]);
2757			if (map && map != new_map)
2758				kfree_rcu(map, rcu);
2759		}
2760	}
2761
2762	kfree_rcu(dev_maps, rcu);
2763
2764out_no_old_maps:
2765	dev_maps = new_dev_maps;
2766	active = true;
2767
2768out_no_new_maps:
2769	if (!is_rxqs_map) {
2770		/* update Tx queue numa node */
2771		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2772					     (numa_node_id >= 0) ?
2773					     numa_node_id : NUMA_NO_NODE);
2774	}
2775
2776	if (!dev_maps)
2777		goto out_no_maps;
2778
2779	/* removes tx-queue from unused CPUs/rx-queues */
2780	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2781	     j < nr_ids;) {
2782		for (i = tc, tci = j * num_tc; i--; tci++)
2783			active |= remove_xps_queue(dev_maps, tci, index);
2784		if (!netif_attr_test_mask(j, mask, nr_ids) ||
2785		    !netif_attr_test_online(j, online_mask, nr_ids))
2786			active |= remove_xps_queue(dev_maps, tci, index);
2787		for (i = num_tc - tc, tci++; --i; tci++)
2788			active |= remove_xps_queue(dev_maps, tci, index);
2789	}
2790
2791	/* free map if not active */
2792	if (!active)
2793		reset_xps_maps(dev, dev_maps, is_rxqs_map);
2794
2795out_no_maps:
2796	mutex_unlock(&xps_map_mutex);
2797
2798	return 0;
2799error:
2800	/* remove any maps that we added */
2801	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2802	     j < nr_ids;) {
2803		for (i = num_tc, tci = j * num_tc; i--; tci++) {
2804			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2805			map = dev_maps ?
2806			      xmap_dereference(dev_maps->attr_map[tci]) :
2807			      NULL;
2808			if (new_map && new_map != map)
2809				kfree(new_map);
2810		}
2811	}
2812
2813	mutex_unlock(&xps_map_mutex);
2814
2815	kfree(new_dev_maps);
2816	return -ENOMEM;
2817}
2818EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2819
2820int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2821			u16 index)
2822{
2823	int ret;
2824
2825	cpus_read_lock();
2826	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2827	cpus_read_unlock();
2828
2829	return ret;
2830}
2831EXPORT_SYMBOL(netif_set_xps_queue);
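
/* Example (illustrative sketch): a driver could pin Tx queue 3 to CPUs 0-1
 * roughly like this; error handling is omitted:
 *
 *	cpumask_var_t mask;
 *
 *	if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpumask_set_cpu(0, mask);
 *		cpumask_set_cpu(1, mask);
 *		netif_set_xps_queue(dev, mask, 3);
 *		free_cpumask_var(mask);
 *	}
 */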
2832
2833#endif
2834static void netdev_unbind_all_sb_channels(struct net_device *dev)
2835{
2836	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2837
2838	/* Unbind any subordinate channels */
2839	while (txq-- != &dev->_tx[0]) {
2840		if (txq->sb_dev)
2841			netdev_unbind_sb_channel(dev, txq->sb_dev);
2842	}
2843}
2844
2845void netdev_reset_tc(struct net_device *dev)
2846{
2847#ifdef CONFIG_XPS
2848	netif_reset_xps_queues_gt(dev, 0);
2849#endif
2850	netdev_unbind_all_sb_channels(dev);
2851
2852	/* Reset TC configuration of device */
2853	dev->num_tc = 0;
2854	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2855	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2856}
2857EXPORT_SYMBOL(netdev_reset_tc);
2858
2859int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2860{
2861	if (tc >= dev->num_tc)
2862		return -EINVAL;
2863
2864#ifdef CONFIG_XPS
2865	netif_reset_xps_queues(dev, offset, count);
2866#endif
2867	dev->tc_to_txq[tc].count = count;
2868	dev->tc_to_txq[tc].offset = offset;
2869	return 0;
2870}
2871EXPORT_SYMBOL(netdev_set_tc_queue);
2872
2873int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2874{
2875	if (num_tc > TC_MAX_QUEUE)
2876		return -EINVAL;
2877
2878#ifdef CONFIG_XPS
2879	netif_reset_xps_queues_gt(dev, 0);
2880#endif
2881	netdev_unbind_all_sb_channels(dev);
2882
2883	dev->num_tc = num_tc;
2884	return 0;
2885}
2886EXPORT_SYMBOL(netdev_set_num_tc);
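
/* Example (illustrative sketch): a driver handling an mqprio offload might
 * carve its Tx queues into two traffic classes of four queues each:
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	// tc 0 -> queues 0..3
 *	netdev_set_tc_queue(dev, 1, 4, 4);	// tc 1 -> queues 4..7
 */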
2887
2888void netdev_unbind_sb_channel(struct net_device *dev,
2889			      struct net_device *sb_dev)
2890{
2891	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2892
2893#ifdef CONFIG_XPS
2894	netif_reset_xps_queues_gt(sb_dev, 0);
2895#endif
2896	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2897	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2898
2899	while (txq-- != &dev->_tx[0]) {
2900		if (txq->sb_dev == sb_dev)
2901			txq->sb_dev = NULL;
2902	}
2903}
2904EXPORT_SYMBOL(netdev_unbind_sb_channel);
2905
2906int netdev_bind_sb_channel_queue(struct net_device *dev,
2907				 struct net_device *sb_dev,
2908				 u8 tc, u16 count, u16 offset)
2909{
2910	/* Make certain the sb_dev and dev are already configured */
2911	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2912		return -EINVAL;
2913
2914	/* We cannot hand out queues we don't have */
2915	if ((offset + count) > dev->real_num_tx_queues)
2916		return -EINVAL;
2917
2918	/* Record the mapping */
2919	sb_dev->tc_to_txq[tc].count = count;
2920	sb_dev->tc_to_txq[tc].offset = offset;
2921
2922	/* Provide a way for Tx queue to find the tc_to_txq map or
2923	 * XPS map for itself.
2924	 */
2925	while (count--)
2926		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2927
2928	return 0;
2929}
2930EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2931
2932int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2933{
2934	/* Do not use a multiqueue device to represent a subordinate channel */
2935	if (netif_is_multiqueue(dev))
2936		return -ENODEV;
2937
2938	/* We allow channels 1 - 32767 to be used for subordinate channels.
2939	 * Channel 0 is meant to be "native" mode and used only to represent
2940	 * the main root device. We allow writing 0 to reset the device back
2941	 * to normal mode after being used as a subordinate channel.
2942	 */
2943	if (channel > S16_MAX)
2944		return -EINVAL;
2945
2946	dev->num_tc = -channel;
2947
2948	return 0;
2949}
2950EXPORT_SYMBOL(netdev_set_sb_channel);
2951
2952/*
2953 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2954 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2955 */
2956int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2957{
2958	bool disabling;
2959	int rc;
2960
2961	disabling = txq < dev->real_num_tx_queues;
2962
2963	if (txq < 1 || txq > dev->num_tx_queues)
2964		return -EINVAL;
2965
2966	if (dev->reg_state == NETREG_REGISTERED ||
2967	    dev->reg_state == NETREG_UNREGISTERING) {
2968		ASSERT_RTNL();
2969
2970		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2971						  txq);
2972		if (rc)
2973			return rc;
2974
2975		if (dev->num_tc)
2976			netif_setup_tc(dev, txq);
2977
2978		dev_qdisc_change_real_num_tx(dev, txq);
2979
2980		dev->real_num_tx_queues = txq;
2981
2982		if (disabling) {
2983			synchronize_net();
2984			qdisc_reset_all_tx_gt(dev, txq);
2985#ifdef CONFIG_XPS
2986			netif_reset_xps_queues_gt(dev, txq);
2987#endif
2988		}
2989	} else {
2990		dev->real_num_tx_queues = txq;
2991	}
2992
2993	return 0;
2994}
2995EXPORT_SYMBOL(netif_set_real_num_tx_queues);
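
/* Example (illustrative sketch): after (re)negotiating its channel count, a
 * driver typically resizes the active queue sets under RTNL; "nch" is a
 * hypothetical channel count and error handling is omitted:
 *
 *	rtnl_lock();
 *	netif_set_real_num_tx_queues(dev, nch);
 *	netif_set_real_num_rx_queues(dev, nch);
 *	rtnl_unlock();
 */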
2996
2997#ifdef CONFIG_SYSFS
2998/**
2999 *	netif_set_real_num_rx_queues - set actual number of RX queues used
3000 *	@dev: Network device
3001 *	@rxq: Actual number of RX queues
3002 *
3003 *	This must be called either with the rtnl_lock held or before
3004 *	registration of the net device.  Returns 0 on success, or a
3005 *	negative error code.  If called before registration, it always
3006 *	succeeds.
3007 */
3008int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
3009{
3010	int rc;
3011
3012	if (rxq < 1 || rxq > dev->num_rx_queues)
3013		return -EINVAL;
3014
3015	if (dev->reg_state == NETREG_REGISTERED) {
3016		ASSERT_RTNL();
3017
3018		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
3019						  rxq);
3020		if (rc)
3021			return rc;
3022	}
3023
3024	dev->real_num_rx_queues = rxq;
3025	return 0;
3026}
3027EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3028#endif
3029
3030/**
3031 * netif_get_num_default_rss_queues - default number of RSS queues
3032 *
3033 * This routine should set an upper limit on the number of RSS queues
3034 * used by default by multiqueue devices.
3035 */
3036int netif_get_num_default_rss_queues(void)
3037{
3038	return is_kdump_kernel() ?
3039		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
3040}
3041EXPORT_SYMBOL(netif_get_num_default_rss_queues);
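
/* Example (illustrative sketch): drivers usually clamp their requested queue
 * count with this helper at probe time; "hw_max_queues" is hypothetical:
 *
 *	nqueues = min_t(int, hw_max_queues,
 *			netif_get_num_default_rss_queues());
 */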
3042
3043static void __netif_reschedule(struct Qdisc *q)
3044{
3045	struct softnet_data *sd;
3046	unsigned long flags;
3047
3048	local_irq_save(flags);
3049	sd = this_cpu_ptr(&softnet_data);
3050	q->next_sched = NULL;
3051	*sd->output_queue_tailp = q;
3052	sd->output_queue_tailp = &q->next_sched;
3053	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3054	local_irq_restore(flags);
3055}
3056
3057void __netif_schedule(struct Qdisc *q)
3058{
3059	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3060		__netif_reschedule(q);
3061}
3062EXPORT_SYMBOL(__netif_schedule);
3063
3064struct dev_kfree_skb_cb {
3065	enum skb_free_reason reason;
3066};
3067
3068static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3069{
3070	return (struct dev_kfree_skb_cb *)skb->cb;
3071}
3072
3073void netif_schedule_queue(struct netdev_queue *txq)
3074{
3075	rcu_read_lock();
3076	if (!netif_xmit_stopped(txq)) {
3077		struct Qdisc *q = rcu_dereference(txq->qdisc);
3078
3079		__netif_schedule(q);
3080	}
3081	rcu_read_unlock();
3082}
3083EXPORT_SYMBOL(netif_schedule_queue);
3084
3085void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3086{
3087	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3088		struct Qdisc *q;
3089
3090		rcu_read_lock();
3091		q = rcu_dereference(dev_queue->qdisc);
3092		__netif_schedule(q);
3093		rcu_read_unlock();
3094	}
3095}
3096EXPORT_SYMBOL(netif_tx_wake_queue);
3097
3098void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
3099{
3100	unsigned long flags;
3101
3102	if (unlikely(!skb))
3103		return;
3104
3105	if (likely(refcount_read(&skb->users) == 1)) {
3106		smp_rmb();
3107		refcount_set(&skb->users, 0);
3108	} else if (likely(!refcount_dec_and_test(&skb->users))) {
3109		return;
3110	}
3111	get_kfree_skb_cb(skb)->reason = reason;
3112	local_irq_save(flags);
3113	skb->next = __this_cpu_read(softnet_data.completion_queue);
3114	__this_cpu_write(softnet_data.completion_queue, skb);
3115	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3116	local_irq_restore(flags);
3117}
3118EXPORT_SYMBOL(__dev_kfree_skb_irq);
3119
3120void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3121{
3122	if (in_irq() || irqs_disabled())
3123		__dev_kfree_skb_irq(skb, reason);
3124	else if (unlikely(reason == SKB_REASON_DROPPED))
3125		kfree_skb(skb);
3126	else
3127		consume_skb(skb);
3128}
3129EXPORT_SYMBOL(__dev_kfree_skb_any);
3130
3131
3132/**
3133 * netif_device_detach - mark device as removed
3134 * @dev: network device
3135 *
3136 * Mark device as removed from system and therefore no longer available.
3137 */
3138void netif_device_detach(struct net_device *dev)
3139{
3140	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3141	    netif_running(dev)) {
3142		netif_tx_stop_all_queues(dev);
3143	}
3144}
3145EXPORT_SYMBOL(netif_device_detach);
3146
3147/**
3148 * netif_device_attach - mark device as attached
3149 * @dev: network device
3150 *
3151 * Mark device as attached to the system and restart if needed.
3152 */
3153void netif_device_attach(struct net_device *dev)
3154{
3155	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3156	    netif_running(dev)) {
3157		netif_tx_wake_all_queues(dev);
3158		__netdev_watchdog_up(dev);
3159	}
3160}
3161EXPORT_SYMBOL(netif_device_attach);
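
/* Example (illustrative sketch): drivers commonly pair detach/attach in their
 * power-management callbacks; "foo_suspend" and "foo_resume" are hypothetical:
 *
 *	static int foo_suspend(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		netif_device_detach(netdev);
 *		return 0;
 *	}
 *
 *	static int foo_resume(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		netif_device_attach(netdev);
 *		return 0;
 *	}
 */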
3162
3163/*
3164 * Returns a Tx hash based on the given packet descriptor and a Tx queue
3165 * count to be used as a distribution range.
3166 */
3167static u16 skb_tx_hash(const struct net_device *dev,
3168		       const struct net_device *sb_dev,
3169		       struct sk_buff *skb)
3170{
3171	u32 hash;
3172	u16 qoffset = 0;
3173	u16 qcount = dev->real_num_tx_queues;
3174
3175	if (dev->num_tc) {
3176		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3177
3178		qoffset = sb_dev->tc_to_txq[tc].offset;
3179		qcount = sb_dev->tc_to_txq[tc].count;
3180		if (unlikely(!qcount)) {
3181			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3182					     sb_dev->name, qoffset, tc);
3183			qoffset = 0;
3184			qcount = dev->real_num_tx_queues;
3185		}
3186	}
3187
3188	if (skb_rx_queue_recorded(skb)) {
3189		hash = skb_get_rx_queue(skb);
3190		if (hash >= qoffset)
3191			hash -= qoffset;
3192		while (unlikely(hash >= qcount))
3193			hash -= qcount;
3194		return hash + qoffset;
3195	}
3196
3197	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3198}
3199
3200static void skb_warn_bad_offload(const struct sk_buff *skb)
3201{
3202	static const netdev_features_t null_features;
3203	struct net_device *dev = skb->dev;
3204	const char *name = "";
3205
3206	if (!net_ratelimit())
3207		return;
3208
3209	if (dev) {
3210		if (dev->dev.parent)
3211			name = dev_driver_string(dev->dev.parent);
3212		else
3213			name = netdev_name(dev);
3214	}
3215	skb_dump(KERN_WARNING, skb, false);
3216	WARN(1, "%s: caps=(%pNF, %pNF)\n",
3217	     name, dev ? &dev->features : &null_features,
3218	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
3219}
3220
3221/*
3222 * Invalidate hardware checksum when packet is to be mangled, and
3223 * complete checksum manually on outgoing path.
3224 */
3225int skb_checksum_help(struct sk_buff *skb)
3226{
3227	__wsum csum;
3228	int ret = 0, offset;
3229
3230	if (skb->ip_summed == CHECKSUM_COMPLETE)
3231		goto out_set_summed;
3232
3233	if (unlikely(skb_shinfo(skb)->gso_size)) {
3234		skb_warn_bad_offload(skb);
3235		return -EINVAL;
3236	}
3237
3238	/* Before computing a checksum, we should make sure no frag could
3239	 * be modified by an external entity: the checksum could be wrong.
3240	 */
3241	if (skb_has_shared_frag(skb)) {
3242		ret = __skb_linearize(skb);
3243		if (ret)
3244			goto out;
3245	}
3246
3247	offset = skb_checksum_start_offset(skb);
3248	ret = -EINVAL;
3249	if (WARN_ON_ONCE(offset >= skb_headlen(skb)))
3250		goto out;
3251
3252	csum = skb_checksum(skb, offset, skb->len - offset, 0);
3253
3254	offset += skb->csum_offset;
3255	if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb)))
3256		goto out;
3257
3258	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3259	if (ret)
3260		goto out;
3261
3262	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3263out_set_summed:
3264	skb->ip_summed = CHECKSUM_NONE;
3265out:
3266	return ret;
3267}
3268EXPORT_SYMBOL(skb_checksum_help);
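
/* Example (illustrative sketch): a driver that cannot offload the checksum of
 * a given packet falls back to software checksumming before transmit; "drop"
 * is a hypothetical label that frees the skb:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
 *		goto drop;
 */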
3269
3270int skb_crc32c_csum_help(struct sk_buff *skb)
3271{
3272	__le32 crc32c_csum;
3273	int ret = 0, offset, start;
3274
3275	if (skb->ip_summed != CHECKSUM_PARTIAL)
3276		goto out;
3277
3278	if (unlikely(skb_is_gso(skb)))
3279		goto out;
3280
3281	/* Before computing a checksum, we should make sure no frag could
3282	 * be modified by an external entity : checksum could be wrong.
3283	 * be modified by an external entity: the checksum could be wrong.
3284	if (unlikely(skb_has_shared_frag(skb))) {
3285		ret = __skb_linearize(skb);
3286		if (ret)
3287			goto out;
3288	}
3289	start = skb_checksum_start_offset(skb);
3290	offset = start + offsetof(struct sctphdr, checksum);
3291	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3292		ret = -EINVAL;
3293		goto out;
3294	}
3295
3296	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3297	if (ret)
3298		goto out;
3299
3300	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3301						  skb->len - start, ~(__u32)0,
3302						  crc32c_csum_stub));
3303	*(__le32 *)(skb->data + offset) = crc32c_csum;
3304	skb->ip_summed = CHECKSUM_NONE;
3305	skb->csum_not_inet = 0;
3306out:
3307	return ret;
3308}
3309
3310__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3311{
3312	__be16 type = skb->protocol;
3313
3314	/* Tunnel gso handlers can set protocol to ethernet. */
3315	if (type == htons(ETH_P_TEB)) {
3316		struct ethhdr *eth;
3317
3318		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3319			return 0;
3320
3321		eth = (struct ethhdr *)skb->data;
3322		type = eth->h_proto;
3323	}
3324
3325	return vlan_get_protocol_and_depth(skb, type, depth);
3326}
3327
3328/**
3329 *	skb_mac_gso_segment - mac layer segmentation handler.
3330 *	@skb: buffer to segment
3331 *	@features: features for the output path (see dev->features)
3332 */
3333struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3334				    netdev_features_t features)
3335{
3336	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3337	struct packet_offload *ptype;
3338	int vlan_depth = skb->mac_len;
3339	__be16 type = skb_network_protocol(skb, &vlan_depth);
3340
3341	if (unlikely(!type))
3342		return ERR_PTR(-EINVAL);
3343
3344	__skb_pull(skb, vlan_depth);
3345
3346	rcu_read_lock();
3347	list_for_each_entry_rcu(ptype, &offload_base, list) {
3348		if (ptype->type == type && ptype->callbacks.gso_segment) {
3349			segs = ptype->callbacks.gso_segment(skb, features);
3350			break;
3351		}
3352	}
3353	rcu_read_unlock();
3354
3355	__skb_push(skb, skb->data - skb_mac_header(skb));
3356
3357	return segs;
3358}
3359EXPORT_SYMBOL(skb_mac_gso_segment);
3360
3361
3362/* openvswitch calls this on the rx path, so we need a different check.
3363 */
3364static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3365{
3366	if (tx_path)
3367		return skb->ip_summed != CHECKSUM_PARTIAL &&
3368		       skb->ip_summed != CHECKSUM_UNNECESSARY;
3369
3370	return skb->ip_summed == CHECKSUM_NONE;
3371}
3372
3373/**
3374 *	__skb_gso_segment - Perform segmentation on skb.
3375 *	@skb: buffer to segment
3376 *	@features: features for the output path (see dev->features)
3377 *	@tx_path: whether it is called in TX path
3378 *
3379 *	This function segments the given skb and returns a list of segments.
3380 *
3381 *	It may return NULL if the skb requires no segmentation.  This is
3382 *	only possible when GSO is used for verifying header integrity.
3383 *
3384 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3385 */
3386struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3387				  netdev_features_t features, bool tx_path)
3388{
3389	struct sk_buff *segs;
3390
3391	if (unlikely(skb_needs_check(skb, tx_path))) {
3392		int err;
3393
3394		/* We're going to init ->check field in TCP or UDP header */
3395		err = skb_cow_head(skb, 0);
3396		if (err < 0)
3397			return ERR_PTR(err);
3398	}
3399
3400	/* Only report GSO partial support if it will enable us to
3401	 * support segmentation on this frame without needing additional
3402	 * work.
3403	 */
3404	if (features & NETIF_F_GSO_PARTIAL) {
3405		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3406		struct net_device *dev = skb->dev;
3407
3408		partial_features |= dev->features & dev->gso_partial_features;
3409		if (!skb_gso_ok(skb, features | partial_features))
3410			features &= ~NETIF_F_GSO_PARTIAL;
3411	}
3412
3413	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3414		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3415
3416	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3417	SKB_GSO_CB(skb)->encap_level = 0;
3418
3419	skb_reset_mac_header(skb);
3420	skb_reset_mac_len(skb);
3421
3422	segs = skb_mac_gso_segment(skb, features);
3423
3424	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3425		skb_warn_bad_offload(skb);
3426
3427	return segs;
3428}
3429EXPORT_SYMBOL(__skb_gso_segment);
3430
3431/* Take action when hardware reception checksum errors are detected. */
3432#ifdef CONFIG_BUG
3433void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3434{
3435	if (net_ratelimit()) {
3436		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3437		skb_dump(KERN_ERR, skb, true);
3438		dump_stack();
3439	}
3440}
3441EXPORT_SYMBOL(netdev_rx_csum_fault);
3442#endif
3443
3444/* XXX: check that highmem exists at all on the given machine. */
3445static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3446{
3447#ifdef CONFIG_HIGHMEM
3448	int i;
3449
3450	if (!(dev->features & NETIF_F_HIGHDMA)) {
3451		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3452			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3453
3454			if (PageHighMem(skb_frag_page(frag)))
3455				return 1;
3456		}
3457	}
3458#endif
3459	return 0;
3460}
3461
3462/* If this is an MPLS offload request, verify we are testing hardware MPLS
3463 * features instead of standard features for the netdev.
3464 */
3465#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3466static netdev_features_t net_mpls_features(struct sk_buff *skb,
3467					   netdev_features_t features,
3468					   __be16 type)
3469{
3470	if (eth_p_mpls(type))
3471		features &= skb->dev->mpls_features;
3472
3473	return features;
3474}
3475#else
3476static netdev_features_t net_mpls_features(struct sk_buff *skb,
3477					   netdev_features_t features,
3478					   __be16 type)
3479{
3480	return features;
3481}
3482#endif
3483
3484static netdev_features_t harmonize_features(struct sk_buff *skb,
3485	netdev_features_t features)
3486{
3487	__be16 type;
3488
3489	type = skb_network_protocol(skb, NULL);
3490	features = net_mpls_features(skb, features, type);
3491
3492	if (skb->ip_summed != CHECKSUM_NONE &&
3493	    !can_checksum_protocol(features, type)) {
3494		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3495	}
3496	if (illegal_highdma(skb->dev, skb))
3497		features &= ~NETIF_F_SG;
3498
3499	return features;
3500}
3501
3502netdev_features_t passthru_features_check(struct sk_buff *skb,
3503					  struct net_device *dev,
3504					  netdev_features_t features)
3505{
3506	return features;
3507}
3508EXPORT_SYMBOL(passthru_features_check);
3509
3510static netdev_features_t dflt_features_check(struct sk_buff *skb,
3511					     struct net_device *dev,
3512					     netdev_features_t features)
3513{
3514	return vlan_features_check(skb, features);
3515}
3516
3517static netdev_features_t gso_features_check(const struct sk_buff *skb,
3518					    struct net_device *dev,
3519					    netdev_features_t features)
3520{
3521	u16 gso_segs = skb_shinfo(skb)->gso_segs;
3522
3523	if (gso_segs > dev->gso_max_segs)
3524		return features & ~NETIF_F_GSO_MASK;
3525
3526	if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
3527		return features & ~NETIF_F_GSO_MASK;
3528
3529	if (!skb_shinfo(skb)->gso_type) {
3530		skb_warn_bad_offload(skb);
3531		return features & ~NETIF_F_GSO_MASK;
3532	}
3533
3534	/* Support for GSO partial features requires software
3535	 * intervention before we can actually process the packets,
3536	 * so we need to strip support for any partial features now;
3537	 * we can pull them back in after we have partially
3538	 * segmented the frame.
3539	 */
3540	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3541		features &= ~dev->gso_partial_features;
3542
3543	/* Make sure to clear the IPv4 ID mangling feature if the
3544	 * IPv4 header has the potential to be fragmented.
3545	 */
3546	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3547		struct iphdr *iph = skb->encapsulation ?
3548				    inner_ip_hdr(skb) : ip_hdr(skb);
3549
3550		if (!(iph->frag_off & htons(IP_DF)))
3551			features &= ~NETIF_F_TSO_MANGLEID;
3552	}
3553
3554	return features;
3555}
3556
3557netdev_features_t netif_skb_features(struct sk_buff *skb)
3558{
3559	struct net_device *dev = skb->dev;
3560	netdev_features_t features = dev->features;
3561
3562	if (skb_is_gso(skb))
3563		features = gso_features_check(skb, dev, features);
3564
3565	/* If this is an encapsulation offload request, verify we are testing
3566	 * hardware encapsulation features instead of standard
3567	 * features for the netdev.
3568	 */
3569	if (skb->encapsulation)
3570		features &= dev->hw_enc_features;
3571
3572	if (skb_vlan_tagged(skb))
3573		features = netdev_intersect_features(features,
3574						     dev->vlan_features |
3575						     NETIF_F_HW_VLAN_CTAG_TX |
3576						     NETIF_F_HW_VLAN_STAG_TX);
3577
3578	if (dev->netdev_ops->ndo_features_check)
3579		features &= dev->netdev_ops->ndo_features_check(skb, dev,
3580								features);
3581	else
3582		features &= dflt_features_check(skb, dev, features);
3583
3584	return harmonize_features(skb, features);
3585}
3586EXPORT_SYMBOL(netif_skb_features);
3587
3588static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3589		    struct netdev_queue *txq, bool more)
3590{
3591	unsigned int len;
3592	int rc;
3593
3594	if (dev_nit_active(dev))
3595		dev_queue_xmit_nit(skb, dev);
3596
3597	len = skb->len;
3598	PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
3599	trace_net_dev_start_xmit(skb, dev);
3600	rc = netdev_start_xmit(skb, dev, txq, more);
3601	trace_net_dev_xmit(skb, rc, dev, len);
3602
3603	return rc;
3604}
3605
3606struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3607				    struct netdev_queue *txq, int *ret)
3608{
3609	struct sk_buff *skb = first;
3610	int rc = NETDEV_TX_OK;
3611
3612	while (skb) {
3613		struct sk_buff *next = skb->next;
3614
3615		skb_mark_not_on_list(skb);
3616		rc = xmit_one(skb, dev, txq, next != NULL);
3617		if (unlikely(!dev_xmit_complete(rc))) {
3618			skb->next = next;
3619			goto out;
3620		}
3621
3622		skb = next;
3623		if (netif_tx_queue_stopped(txq) && skb) {
3624			rc = NETDEV_TX_BUSY;
3625			break;
3626		}
3627	}
3628
3629out:
3630	*ret = rc;
3631	return skb;
3632}
3633
3634static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3635					  netdev_features_t features)
3636{
3637	if (skb_vlan_tag_present(skb) &&
3638	    !vlan_hw_offload_capable(features, skb->vlan_proto))
3639		skb = __vlan_hwaccel_push_inside(skb);
3640	return skb;
3641}
3642
3643int skb_csum_hwoffload_help(struct sk_buff *skb,
3644			    const netdev_features_t features)
3645{
3646	if (unlikely(skb_csum_is_sctp(skb)))
3647		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3648			skb_crc32c_csum_help(skb);
3649
3650	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3651}
3652EXPORT_SYMBOL(skb_csum_hwoffload_help);
3653
3654static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3655{
3656	netdev_features_t features;
3657
3658	features = netif_skb_features(skb);
3659	skb = validate_xmit_vlan(skb, features);
3660	if (unlikely(!skb))
3661		goto out_null;
3662
3663	skb = sk_validate_xmit_skb(skb, dev);
3664	if (unlikely(!skb))
3665		goto out_null;
3666
3667	if (netif_needs_gso(skb, features)) {
3668		struct sk_buff *segs;
3669
3670		segs = skb_gso_segment(skb, features);
3671		if (IS_ERR(segs)) {
3672			goto out_kfree_skb;
3673		} else if (segs) {
3674			consume_skb(skb);
3675			skb = segs;
3676		}
3677	} else {
3678		if (skb_needs_linearize(skb, features) &&
3679		    __skb_linearize(skb))
3680			goto out_kfree_skb;
3681
3682		/* If packet is not checksummed and device does not
3683		 * support checksumming for this protocol, complete
3684		 * checksumming here.
3685		 */
3686		if (skb->ip_summed == CHECKSUM_PARTIAL) {
3687			if (skb->encapsulation)
3688				skb_set_inner_transport_header(skb,
3689							       skb_checksum_start_offset(skb));
3690			else
3691				skb_set_transport_header(skb,
3692							 skb_checksum_start_offset(skb));
3693			if (skb_csum_hwoffload_help(skb, features))
3694				goto out_kfree_skb;
3695		}
3696	}
3697
3698	skb = validate_xmit_xfrm(skb, features, again);
3699
3700	return skb;
3701
3702out_kfree_skb:
3703	kfree_skb(skb);
3704out_null:
3705	atomic_long_inc(&dev->tx_dropped);
3706	return NULL;
3707}
3708
3709struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3710{
3711	struct sk_buff *next, *head = NULL, *tail;
3712
3713	for (; skb != NULL; skb = next) {
3714		next = skb->next;
3715		skb_mark_not_on_list(skb);
3716
3717		/* in case skb won't be segmented, point it to itself */
3718		skb->prev = skb;
3719
3720		skb = validate_xmit_skb(skb, dev, again);
3721		if (!skb)
3722			continue;
3723
3724		if (!head)
3725			head = skb;
3726		else
3727			tail->next = skb;
3728		/* If skb was segmented, skb->prev points to
3729		 * the last segment. If not, it still contains skb.
3730		 */
3731		tail = skb->prev;
3732	}
3733	return head;
3734}
3735EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3736
3737static void qdisc_pkt_len_init(struct sk_buff *skb)
3738{
3739	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3740
3741	qdisc_skb_cb(skb)->pkt_len = skb->len;
3742
3743	/* To get a more precise estimate of bytes sent on the wire,
3744	 * we add to pkt_len the header size of all segments
3745	 */
3746	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3747		unsigned int hdr_len;
3748		u16 gso_segs = shinfo->gso_segs;
3749
3750		/* mac layer + network layer */
3751		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3752
3753		/* + transport layer */
3754		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3755			const struct tcphdr *th;
3756			struct tcphdr _tcphdr;
3757
3758			th = skb_header_pointer(skb, skb_transport_offset(skb),
3759						sizeof(_tcphdr), &_tcphdr);
3760			if (likely(th))
3761				hdr_len += __tcp_hdrlen(th);
3762		} else {
3763			struct udphdr _udphdr;
3764
3765			if (skb_header_pointer(skb, skb_transport_offset(skb),
3766					       sizeof(_udphdr), &_udphdr))
3767				hdr_len += sizeof(struct udphdr);
3768		}
3769
3770		if (shinfo->gso_type & SKB_GSO_DODGY)
3771			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3772						shinfo->gso_size);
3773
3774		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3775	}
3776}
3777
3778static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3779				 struct net_device *dev,
3780				 struct netdev_queue *txq)
3781{
3782	spinlock_t *root_lock = qdisc_lock(q);
3783	struct sk_buff *to_free = NULL;
3784	bool contended;
3785	int rc;
3786
3787	qdisc_calculate_pkt_len(skb, q);
3788
3789	if (q->flags & TCQ_F_NOLOCK) {
3790		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3791		if (likely(!netif_xmit_frozen_or_stopped(txq)))
3792			qdisc_run(q);
3793
3794		if (unlikely(to_free))
3795			kfree_skb_list(to_free);
3796		return rc;
3797	}
3798
3799	/*
3800	 * Heuristic to force contended enqueues to serialize on a
3801	 * separate lock before trying to get the qdisc main lock.
3802	 * This permits the qdisc->running owner to get the lock more
3803	 * often and dequeue packets faster.
3804	 */
3805	contended = qdisc_is_running(q);
3806	if (unlikely(contended))
3807		spin_lock(&q->busylock);
3808
3809	spin_lock(root_lock);
3810	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3811		__qdisc_drop(skb, &to_free);
3812		rc = NET_XMIT_DROP;
3813	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3814		   qdisc_run_begin(q)) {
3815		/*
3816		 * This is a work-conserving queue; there are no old skbs
3817		 * waiting to be sent out; and the qdisc is not running -
3818		 * xmit the skb directly.
3819		 */
3820
3821		qdisc_bstats_update(q, skb);
3822
3823		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3824			if (unlikely(contended)) {
3825				spin_unlock(&q->busylock);
3826				contended = false;
3827			}
3828			__qdisc_run(q);
3829		}
3830
3831		qdisc_run_end(q);
3832		rc = NET_XMIT_SUCCESS;
3833	} else {
3834		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3835		if (qdisc_run_begin(q)) {
3836			if (unlikely(contended)) {
3837				spin_unlock(&q->busylock);
3838				contended = false;
3839			}
3840			__qdisc_run(q);
3841			qdisc_run_end(q);
3842		}
3843	}
3844	spin_unlock(root_lock);
3845	if (unlikely(to_free))
3846		kfree_skb_list(to_free);
3847	if (unlikely(contended))
3848		spin_unlock(&q->busylock);
3849	return rc;
3850}
3851
3852#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3853static void skb_update_prio(struct sk_buff *skb)
3854{
3855	const struct netprio_map *map;
3856	const struct sock *sk;
3857	unsigned int prioidx;
3858
3859	if (skb->priority)
3860		return;
3861	map = rcu_dereference_bh(skb->dev->priomap);
3862	if (!map)
3863		return;
3864	sk = skb_to_full_sk(skb);
3865	if (!sk)
3866		return;
3867
3868	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3869
3870	if (prioidx < map->priomap_len)
3871		skb->priority = map->priomap[prioidx];
3872}
3873#else
3874#define skb_update_prio(skb)
3875#endif
3876
3877/**
3878 *	dev_loopback_xmit - loop back @skb
3879 *	@net: network namespace this loopback is happening in
3880 *	@sk:  sk needed to be a netfilter okfn
3881 *	@skb: buffer to transmit
3882 */
3883int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3884{
3885	skb_reset_mac_header(skb);
3886	__skb_pull(skb, skb_network_offset(skb));
3887	skb->pkt_type = PACKET_LOOPBACK;
3888	if (skb->ip_summed == CHECKSUM_NONE)
3889		skb->ip_summed = CHECKSUM_UNNECESSARY;
3890	WARN_ON(!skb_dst(skb));
3891	skb_dst_force(skb);
3892	netif_rx_ni(skb);
3893	return 0;
3894}
3895EXPORT_SYMBOL(dev_loopback_xmit);
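/* A minimal sketch of how an L3 sender might use dev_loopback_xmit() as the
 * okfn of an NF_HOOK() invocation to loop a clone of a multicast packet back
 * to local listeners (this roughly mirrors the IPv4 multicast output path;
 * the surrounding clone/check is illustrative only):
 *
 *	struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 *
 *	if (newskb)
 *		NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 *			net, sk, newskb, NULL, newskb->dev,
 *			dev_loopback_xmit);
 */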
3896
3897#ifdef CONFIG_NET_EGRESS
3898static struct sk_buff *
3899sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3900{
3901	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3902	struct tcf_result cl_res;
3903
3904	if (!miniq)
3905		return skb;
3906
3907	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3908	qdisc_skb_cb(skb)->mru = 0;
3909	mini_qdisc_bstats_cpu_update(miniq, skb);
3910
3911	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3912	case TC_ACT_OK:
3913	case TC_ACT_RECLASSIFY:
3914		skb->tc_index = TC_H_MIN(cl_res.classid);
3915		break;
3916	case TC_ACT_SHOT:
3917		mini_qdisc_qstats_cpu_drop(miniq);
3918		*ret = NET_XMIT_DROP;
3919		kfree_skb(skb);
3920		return NULL;
3921	case TC_ACT_STOLEN:
3922	case TC_ACT_QUEUED:
3923	case TC_ACT_TRAP:
3924		*ret = NET_XMIT_SUCCESS;
3925		consume_skb(skb);
3926		return NULL;
3927	case TC_ACT_REDIRECT:
3928		/* No need to push/pop skb's mac_header here on egress! */
3929		skb_do_redirect(skb);
3930		*ret = NET_XMIT_SUCCESS;
3931		return NULL;
3932	default:
3933		break;
3934	}
3935
3936	return skb;
3937}
3938#endif /* CONFIG_NET_EGRESS */
3939
3940#ifdef CONFIG_XPS
3941static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3942			       struct xps_dev_maps *dev_maps, unsigned int tci)
3943{
3944	struct xps_map *map;
3945	int queue_index = -1;
3946
3947	if (dev->num_tc) {
3948		tci *= dev->num_tc;
3949		tci += netdev_get_prio_tc_map(dev, skb->priority);
3950	}
3951
3952	map = rcu_dereference(dev_maps->attr_map[tci]);
3953	if (map) {
3954		if (map->len == 1)
3955			queue_index = map->queues[0];
3956		else
3957			queue_index = map->queues[reciprocal_scale(
3958						skb_get_hash(skb), map->len)];
3959		if (unlikely(queue_index >= dev->real_num_tx_queues))
3960			queue_index = -1;
3961	}
3962	return queue_index;
3963}
3964#endif
3965
3966static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3967			 struct sk_buff *skb)
3968{
3969#ifdef CONFIG_XPS
3970	struct xps_dev_maps *dev_maps;
3971	struct sock *sk = skb->sk;
3972	int queue_index = -1;
3973
3974	if (!static_key_false(&xps_needed))
3975		return -1;
3976
3977	rcu_read_lock();
3978	if (!static_key_false(&xps_rxqs_needed))
3979		goto get_cpus_map;
3980
3981	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3982	if (dev_maps) {
3983		int tci = sk_rx_queue_get(sk);
3984
3985		if (tci >= 0 && tci < dev->num_rx_queues)
3986			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3987							  tci);
3988	}
3989
3990get_cpus_map:
3991	if (queue_index < 0) {
3992		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3993		if (dev_maps) {
3994			unsigned int tci = skb->sender_cpu - 1;
3995
3996			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3997							  tci);
3998		}
3999	}
4000	rcu_read_unlock();
4001
4002	return queue_index;
4003#else
4004	return -1;
4005#endif
4006}
4007
4008u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4009		     struct net_device *sb_dev)
4010{
4011	return 0;
4012}
4013EXPORT_SYMBOL(dev_pick_tx_zero);
4014
4015u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4016		       struct net_device *sb_dev)
4017{
4018	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4019}
4020EXPORT_SYMBOL(dev_pick_tx_cpu_id);
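/* Both helpers above have the ndo_select_queue() prototype, so a driver that
 * simply wants per-CPU (or single-queue) TX selection can plug one in
 * directly.  Hypothetical sketch ("foo_*" names are placeholders):
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_start_xmit   = foo_start_xmit,
 *		.ndo_select_queue = dev_pick_tx_cpu_id,
 *	};
 */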
4021
4022u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4023		     struct net_device *sb_dev)
4024{
4025	struct sock *sk = skb->sk;
4026	int queue_index = sk_tx_queue_get(sk);
4027
4028	sb_dev = sb_dev ? : dev;
4029
4030	if (queue_index < 0 || skb->ooo_okay ||
4031	    queue_index >= dev->real_num_tx_queues) {
4032		int new_index = get_xps_queue(dev, sb_dev, skb);
4033
4034		if (new_index < 0)
4035			new_index = skb_tx_hash(dev, sb_dev, skb);
4036
4037		if (queue_index != new_index && sk &&
4038		    sk_fullsock(sk) &&
4039		    rcu_access_pointer(sk->sk_dst_cache))
4040			sk_tx_queue_set(sk, new_index);
4041
4042		queue_index = new_index;
4043	}
4044
4045	return queue_index;
4046}
4047EXPORT_SYMBOL(netdev_pick_tx);
4048
4049struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4050					 struct sk_buff *skb,
4051					 struct net_device *sb_dev)
4052{
4053	int queue_index = 0;
4054
4055#ifdef CONFIG_XPS
4056	u32 sender_cpu = skb->sender_cpu - 1;
4057
4058	if (sender_cpu >= (u32)NR_CPUS)
4059		skb->sender_cpu = raw_smp_processor_id() + 1;
4060#endif
4061
4062	if (dev->real_num_tx_queues != 1) {
4063		const struct net_device_ops *ops = dev->netdev_ops;
4064
4065		if (ops->ndo_select_queue)
4066			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4067		else
4068			queue_index = netdev_pick_tx(dev, skb, sb_dev);
4069
4070		queue_index = netdev_cap_txqueue(dev, queue_index);
4071	}
4072
4073	skb_set_queue_mapping(skb, queue_index);
4074	return netdev_get_tx_queue(dev, queue_index);
4075}
4076
4077/**
4078 *	__dev_queue_xmit - transmit a buffer
4079 *	@skb: buffer to transmit
4080 *	@sb_dev: subordinate device used for L2 forwarding offload
4081 *
4082 *	Queue a buffer for transmission to a network device. The caller must
4083 *	have set the device and priority and built the buffer before calling
4084 *	this function. The function can be called from an interrupt.
4085 *
4086 *	A negative errno code is returned on a failure. A success does not
4087 *	guarantee the frame will be transmitted as it may be dropped due
4088 *	to congestion or traffic shaping.
4089 *
4090 * -----------------------------------------------------------------------------------
4091 *      I notice this method can also return errors from the queue disciplines,
4092 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
4093 *      be positive.
4094 *
4095 *      Regardless of the return value, the skb is consumed, so it is currently
4096 *      difficult to retry a send to this method.  (You can bump the ref count
4097 *      before sending to hold a reference for retry if you are careful.)
4098 *
4099 *      When calling this method, interrupts MUST be enabled.  This is because
4100 *      the BH enable code must have IRQs enabled so that it will not deadlock.
4101 *          --BLG
4102 */
4103static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4104{
4105	struct net_device *dev = skb->dev;
4106	struct netdev_queue *txq;
4107	struct Qdisc *q;
4108	int rc = -ENOMEM;
4109	bool again = false;
4110
4111	skb_reset_mac_header(skb);
4112	skb_assert_len(skb);
4113
4114	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4115		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
4116
4117	/* Disable soft irqs for various locks below. Also
4118	 * stops preemption for RCU.
4119	 */
4120	rcu_read_lock_bh();
4121
4122	skb_update_prio(skb);
4123
4124	qdisc_pkt_len_init(skb);
4125#ifdef CONFIG_NET_CLS_ACT
4126	skb->tc_at_ingress = 0;
4127# ifdef CONFIG_NET_EGRESS
4128	if (static_branch_unlikely(&egress_needed_key)) {
4129		skb = sch_handle_egress(skb, &rc, dev);
4130		if (!skb)
4131			goto out;
4132	}
4133# endif
4134#endif
4135	/* If device/qdisc don't need skb->dst, release it right now while
4136	 * it's hot in this cpu's cache.
4137	 */
4138	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4139		skb_dst_drop(skb);
4140	else
4141		skb_dst_force(skb);
4142
4143	txq = netdev_core_pick_tx(dev, skb, sb_dev);
4144	q = rcu_dereference_bh(txq->qdisc);
4145
4146	trace_net_dev_queue(skb);
4147	if (q->enqueue) {
4148		rc = __dev_xmit_skb(skb, q, dev, txq);
4149		goto out;
4150	}
4151
4152	/* The device has no queue. Common case for software devices:
4153	 * loopback, all sorts of tunnels...
4154	 *
4155	 * Really, it is unlikely that netif_tx_lock protection is necessary
4156	 * here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
4157	 * counters.)
4158	 * However, it is possible that they rely on the protection
4159	 * we provide here.
4160	 *
4161	 * Check this and remove the lock if so -- it is not prone to deadlocks.
4162	 * Or remove the noqueue qdisc altogether; that is even simpler 8)
4163	 */
4164	if (dev->flags & IFF_UP) {
4165		int cpu = smp_processor_id(); /* ok because BHs are off */
4166
4167		/* Other cpus might concurrently change txq->xmit_lock_owner
4168		 * to -1 or to their cpu id, but not to our id.
4169		 */
4170		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4171			if (dev_xmit_recursion())
4172				goto recursion_alert;
4173
4174			skb = validate_xmit_skb(skb, dev, &again);
4175			if (!skb)
4176				goto out;
4177
4178			PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
4179			HARD_TX_LOCK(dev, txq, cpu);
4180
4181			if (!netif_xmit_stopped(txq)) {
4182				dev_xmit_recursion_inc();
4183				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4184				dev_xmit_recursion_dec();
4185				if (dev_xmit_complete(rc)) {
4186					HARD_TX_UNLOCK(dev, txq);
4187					goto out;
4188				}
4189			}
4190			HARD_TX_UNLOCK(dev, txq);
4191			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4192					     dev->name);
4193		} else {
4194			/* Recursion is detected! It is possible,
4195			 * unfortunately
4196			 */
4197recursion_alert:
4198			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4199					     dev->name);
4200		}
4201	}
4202
4203	rc = -ENETDOWN;
4204	rcu_read_unlock_bh();
4205
4206	atomic_long_inc(&dev->tx_dropped);
4207	kfree_skb_list(skb);
4208	return rc;
4209out:
4210	rcu_read_unlock_bh();
4211	return rc;
4212}
4213
4214int dev_queue_xmit(struct sk_buff *skb)
4215{
4216	return __dev_queue_xmit(skb, NULL);
4217}
4218EXPORT_SYMBOL(dev_queue_xmit);
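/* A minimal, hypothetical sender built on dev_queue_xmit().  Header
 * construction (e.g. via dev_hard_header()) is the caller's job and the skb
 * is consumed whatever the outcome; "payload", "len" and "proto" below are
 * placeholders, not values taken from this file:
 *
 *	struct sk_buff *skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *	skb_put_data(skb, payload, len);
 *	skb->dev = dev;
 *	skb->protocol = htons(proto);
 *	return dev_queue_xmit(skb);
 */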
4219
4220int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
4221{
4222	return __dev_queue_xmit(skb, sb_dev);
4223}
4224EXPORT_SYMBOL(dev_queue_xmit_accel);
4225
4226int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4227{
4228	struct net_device *dev = skb->dev;
4229	struct sk_buff *orig_skb = skb;
4230	struct netdev_queue *txq;
4231	int ret = NETDEV_TX_BUSY;
4232	bool again = false;
4233
4234	if (unlikely(!netif_running(dev) ||
4235		     !netif_carrier_ok(dev)))
4236		goto drop;
4237
4238	skb = validate_xmit_skb_list(skb, dev, &again);
4239	if (skb != orig_skb)
4240		goto drop;
4241
4242	skb_set_queue_mapping(skb, queue_id);
4243	txq = skb_get_tx_queue(dev, skb);
4244	PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
4245
4246	local_bh_disable();
4247
4248	dev_xmit_recursion_inc();
4249	HARD_TX_LOCK(dev, txq, smp_processor_id());
4250	if (!netif_xmit_frozen_or_drv_stopped(txq))
4251		ret = netdev_start_xmit(skb, dev, txq, false);
4252	HARD_TX_UNLOCK(dev, txq);
4253	dev_xmit_recursion_dec();
4254
4255	local_bh_enable();
4256	return ret;
4257drop:
4258	atomic_long_inc(&dev->tx_dropped);
4259	kfree_skb_list(skb);
4260	return NET_XMIT_DROP;
4261}
4262EXPORT_SYMBOL(__dev_direct_xmit);
4263
4264/*************************************************************************
4265 *			Receiver routines
4266 *************************************************************************/
4267
4268int netdev_max_backlog __read_mostly = 1000;
4269EXPORT_SYMBOL(netdev_max_backlog);
4270
4271int netdev_tstamp_prequeue __read_mostly = 1;
4272int netdev_budget __read_mostly = 300;
4273/* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
4274unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4275int weight_p __read_mostly = 64;           /* old backlog weight */
4276int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4277int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4278int dev_rx_weight __read_mostly = 64;
4279int dev_tx_weight __read_mostly = 64;
4280/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4281int gro_normal_batch __read_mostly = 8;
4282
4283/* Called with irq disabled */
4284static inline void ____napi_schedule(struct softnet_data *sd,
4285				     struct napi_struct *napi)
4286{
4287	list_add_tail(&napi->poll_list, &sd->poll_list);
4288	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4289}
4290
4291#ifdef CONFIG_RPS
4292
4293/* One global table that all flow-based protocols share. */
4294struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4295EXPORT_SYMBOL(rps_sock_flow_table);
4296u32 rps_cpu_mask __read_mostly;
4297EXPORT_SYMBOL(rps_cpu_mask);
4298
4299struct static_key_false rps_needed __read_mostly;
4300EXPORT_SYMBOL(rps_needed);
4301struct static_key_false rfs_needed __read_mostly;
4302EXPORT_SYMBOL(rfs_needed);
4303
4304static struct rps_dev_flow *
4305set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4306	    struct rps_dev_flow *rflow, u16 next_cpu)
4307{
4308	if (next_cpu < nr_cpu_ids) {
4309#ifdef CONFIG_RFS_ACCEL
4310		struct netdev_rx_queue *rxqueue;
4311		struct rps_dev_flow_table *flow_table;
4312		struct rps_dev_flow *old_rflow;
4313		u32 flow_id;
4314		u16 rxq_index;
4315		int rc;
4316
4317		/* Should we steer this flow to a different hardware queue? */
4318		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4319		    !(dev->features & NETIF_F_NTUPLE))
4320			goto out;
4321		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4322		if (rxq_index == skb_get_rx_queue(skb))
4323			goto out;
4324
4325		rxqueue = dev->_rx + rxq_index;
4326		flow_table = rcu_dereference(rxqueue->rps_flow_table);
4327		if (!flow_table)
4328			goto out;
4329		flow_id = skb_get_hash(skb) & flow_table->mask;
4330		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4331							rxq_index, flow_id);
4332		if (rc < 0)
4333			goto out;
4334		old_rflow = rflow;
4335		rflow = &flow_table->flows[flow_id];
4336		rflow->filter = rc;
4337		if (old_rflow->filter == rflow->filter)
4338			old_rflow->filter = RPS_NO_FILTER;
4339	out:
4340#endif
4341		rflow->last_qtail =
4342			per_cpu(softnet_data, next_cpu).input_queue_head;
4343	}
4344
4345	rflow->cpu = next_cpu;
4346	return rflow;
4347}
4348
4349/*
4350 * get_rps_cpu is called from netif_receive_skb and returns the target
4351 * CPU from the RPS map of the receiving queue for a given skb.
4352 * rcu_read_lock must be held on entry.
4353 */
4354static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4355		       struct rps_dev_flow **rflowp)
4356{
4357	const struct rps_sock_flow_table *sock_flow_table;
4358	struct netdev_rx_queue *rxqueue = dev->_rx;
4359	struct rps_dev_flow_table *flow_table;
4360	struct rps_map *map;
4361	int cpu = -1;
4362	u32 tcpu;
4363	u32 hash;
4364
4365	if (skb_rx_queue_recorded(skb)) {
4366		u16 index = skb_get_rx_queue(skb);
4367
4368		if (unlikely(index >= dev->real_num_rx_queues)) {
4369			WARN_ONCE(dev->real_num_rx_queues > 1,
4370				  "%s received packet on queue %u, but number "
4371				  "of RX queues is %u\n",
4372				  dev->name, index, dev->real_num_rx_queues);
4373			goto done;
4374		}
4375		rxqueue += index;
4376	}
4377
4378	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4379
4380	flow_table = rcu_dereference(rxqueue->rps_flow_table);
4381	map = rcu_dereference(rxqueue->rps_map);
4382	if (!flow_table && !map)
4383		goto done;
4384
4385	skb_reset_network_header(skb);
4386	hash = skb_get_hash(skb);
4387	if (!hash)
4388		goto done;
4389
4390	sock_flow_table = rcu_dereference(rps_sock_flow_table);
4391	if (flow_table && sock_flow_table) {
4392		struct rps_dev_flow *rflow;
4393		u32 next_cpu;
4394		u32 ident;
4395
4396		/* First check into global flow table if there is a match.
4397		 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4398		 */
4399		ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4400		if ((ident ^ hash) & ~rps_cpu_mask)
4401			goto try_rps;
4402
4403		next_cpu = ident & rps_cpu_mask;
4404
4405		/* OK, now we know there is a match,
4406		 * we can look at the local (per receive queue) flow table
4407		 */
4408		rflow = &flow_table->flows[hash & flow_table->mask];
4409		tcpu = rflow->cpu;
4410
4411		/*
4412		 * If the desired CPU (where last recvmsg was done) is
4413		 * different from current CPU (one in the rx-queue flow
4414		 * table entry), switch if one of the following holds:
4415		 *   - Current CPU is unset (>= nr_cpu_ids).
4416		 *   - Current CPU is offline.
4417		 *   - The current CPU's queue tail has advanced beyond the
4418		 *     last packet that was enqueued using this table entry.
4419		 *     This guarantees that all previous packets for the flow
4420		 *     have been dequeued, thus preserving in order delivery.
4421		 */
4422		if (unlikely(tcpu != next_cpu) &&
4423		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4424		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4425		      rflow->last_qtail)) >= 0)) {
4426			tcpu = next_cpu;
4427			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4428		}
4429
4430		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4431			*rflowp = rflow;
4432			cpu = tcpu;
4433			goto done;
4434		}
4435	}
4436
4437try_rps:
4438
4439	if (map) {
4440		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4441		if (cpu_online(tcpu)) {
4442			cpu = tcpu;
4443			goto done;
4444		}
4445	}
4446
4447done:
4448	return cpu;
4449}
4450
4451#ifdef CONFIG_RFS_ACCEL
4452
4453/**
4454 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4455 * @dev: Device on which the filter was set
4456 * @rxq_index: RX queue index
4457 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4458 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4459 *
4460 * Drivers that implement ndo_rx_flow_steer() should periodically call
4461 * this function for each installed filter and remove the filters for
4462 * which it returns %true.
4463 */
4464bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4465			 u32 flow_id, u16 filter_id)
4466{
4467	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4468	struct rps_dev_flow_table *flow_table;
4469	struct rps_dev_flow *rflow;
4470	bool expire = true;
4471	unsigned int cpu;
4472
4473	rcu_read_lock();
4474	flow_table = rcu_dereference(rxqueue->rps_flow_table);
4475	if (flow_table && flow_id <= flow_table->mask) {
4476		rflow = &flow_table->flows[flow_id];
4477		cpu = READ_ONCE(rflow->cpu);
4478		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4479		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4480			   rflow->last_qtail) <
4481		     (int)(10 * flow_table->mask)))
4482			expire = false;
4483	}
4484	rcu_read_unlock();
4485	return expire;
4486}
4487EXPORT_SYMBOL(rps_may_expire_flow);
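/* Sketch of the periodic expiry scan described above for a hypothetical
 * driver that remembers its steering filters in a per-queue array ("foo_*",
 * "queue" and the field names are placeholders):
 *
 *	for (i = 0; i < queue->n_filters; i++) {
 *		struct foo_filter *f = &queue->filters[i];
 *
 *		if (!f->installed)
 *			continue;
 *		if (rps_may_expire_flow(netdev, queue->index,
 *					f->flow_id, f->filter_id))
 *			foo_remove_hw_filter(queue, f);
 *	}
 */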
4488
4489#endif /* CONFIG_RFS_ACCEL */
4490
4491/* Called from hardirq (IPI) context */
4492static void rps_trigger_softirq(void *data)
4493{
4494	struct softnet_data *sd = data;
4495
4496	____napi_schedule(sd, &sd->backlog);
4497	sd->received_rps++;
4498}
4499
4500#endif /* CONFIG_RPS */
4501
4502/*
4503 * Check if this softnet_data structure belongs to another CPU
4504 * If yes, queue it to our IPI list and return 1
4505 * If no, return 0
4506 */
4507static int rps_ipi_queued(struct softnet_data *sd)
4508{
4509#ifdef CONFIG_RPS
4510	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4511
4512	if (sd != mysd) {
4513		sd->rps_ipi_next = mysd->rps_ipi_list;
4514		mysd->rps_ipi_list = sd;
4515
4516		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4517		return 1;
4518	}
4519#endif /* CONFIG_RPS */
4520	return 0;
4521}
4522
4523#ifdef CONFIG_NET_FLOW_LIMIT
4524int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4525#endif
4526
4527static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4528{
4529#ifdef CONFIG_NET_FLOW_LIMIT
4530	struct sd_flow_limit *fl;
4531	struct softnet_data *sd;
4532	unsigned int old_flow, new_flow;
4533
4534	if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
4535		return false;
4536
4537	sd = this_cpu_ptr(&softnet_data);
4538
4539	rcu_read_lock();
4540	fl = rcu_dereference(sd->flow_limit);
4541	if (fl) {
4542		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4543		old_flow = fl->history[fl->history_head];
4544		fl->history[fl->history_head] = new_flow;
4545
4546		fl->history_head++;
4547		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4548
4549		if (likely(fl->buckets[old_flow]))
4550			fl->buckets[old_flow]--;
4551
4552		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4553			fl->count++;
4554			rcu_read_unlock();
4555			return true;
4556		}
4557	}
4558	rcu_read_unlock();
4559#endif
4560	return false;
4561}
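/* Flow limiting is off by default; an administrator opts CPUs in through the
 * net.core.flow_limit_cpu_bitmap sysctl, and the per-CPU table size above is
 * tunable via net.core.flow_limit_table_len, e.g. (illustrative shell):
 *
 *	echo f > /proc/sys/net/core/flow_limit_cpu_bitmap
 *	echo 8192 > /proc/sys/net/core/flow_limit_table_len
 */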
4562
4563/*
4564 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4565 * queue (may be a remote CPU queue).
4566 */
4567static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4568			      unsigned int *qtail)
4569{
4570	struct softnet_data *sd;
4571	unsigned long flags;
4572	unsigned int qlen;
4573
4574	sd = &per_cpu(softnet_data, cpu);
4575
4576	local_irq_save(flags);
4577
4578	rps_lock(sd);
4579	if (!netif_running(skb->dev))
4580		goto drop;
4581	qlen = skb_queue_len(&sd->input_pkt_queue);
4582	if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
4583		if (qlen) {
4584enqueue:
4585			__skb_queue_tail(&sd->input_pkt_queue, skb);
4586			input_queue_tail_incr_save(sd, qtail);
4587			rps_unlock(sd);
4588			local_irq_restore(flags);
4589			return NET_RX_SUCCESS;
4590		}
4591
4592		/* Schedule NAPI for the backlog device.
4593		 * We can use a non-atomic operation since we own the queue lock
4594		 */
4595		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4596			if (!rps_ipi_queued(sd))
4597				____napi_schedule(sd, &sd->backlog);
4598		}
4599		goto enqueue;
4600	}
4601
4602drop:
4603	sd->dropped++;
4604	rps_unlock(sd);
4605
4606	local_irq_restore(flags);
4607
4608	atomic_long_inc(&skb->dev->rx_dropped);
4609	kfree_skb(skb);
4610	return NET_RX_DROP;
4611}
4612
4613static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4614{
4615	struct net_device *dev = skb->dev;
4616	struct netdev_rx_queue *rxqueue;
4617
4618	rxqueue = dev->_rx;
4619
4620	if (skb_rx_queue_recorded(skb)) {
4621		u16 index = skb_get_rx_queue(skb);
4622
4623		if (unlikely(index >= dev->real_num_rx_queues)) {
4624			WARN_ONCE(dev->real_num_rx_queues > 1,
4625				  "%s received packet on queue %u, but number "
4626				  "of RX queues is %u\n",
4627				  dev->name, index, dev->real_num_rx_queues);
4628
4629			return rxqueue; /* Return first rxqueue */
4630		}
4631		rxqueue += index;
4632	}
4633	return rxqueue;
4634}
4635
4636static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4637				     struct xdp_buff *xdp,
4638				     struct bpf_prog *xdp_prog)
4639{
4640	struct netdev_rx_queue *rxqueue;
4641	void *orig_data, *orig_data_end;
4642	u32 metalen, act = XDP_DROP;
4643	__be16 orig_eth_type;
4644	struct ethhdr *eth;
4645	bool orig_bcast;
4646	int hlen, off;
4647	u32 mac_len;
4648
4649	/* Reinjected packets coming from act_mirred or similar should
4650	 * not get XDP generic processing.
4651	 */
4652	if (skb_is_redirected(skb))
4653		return XDP_PASS;
4654
4655	/* XDP packets must be linear and must have sufficient headroom
4656	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4657	 * native XDP provides, thus we need to do it here as well.
4658	 */
4659	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4660	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4661		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4662		int troom = skb->tail + skb->data_len - skb->end;
4663
4664		/* In case we have to go down this path and also linearize,
4665		 * let's do the pskb_expand_head() work just once here.
4666		 */
4667		if (pskb_expand_head(skb,
4668				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4669				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4670			goto do_drop;
4671		if (skb_linearize(skb))
4672			goto do_drop;
4673	}
4674
4675	/* The XDP program wants to see the packet starting at the MAC
4676	 * header.
4677	 */
4678	mac_len = skb->data - skb_mac_header(skb);
4679	hlen = skb_headlen(skb) + mac_len;
4680	xdp->data = skb->data - mac_len;
4681	xdp->data_meta = xdp->data;
4682	xdp->data_end = xdp->data + hlen;
4683	xdp->data_hard_start = skb->data - skb_headroom(skb);
4684
4685	/* SKB "head" area always has tailroom for skb_shared_info */
4686	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
4687	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4688
4689	orig_data_end = xdp->data_end;
4690	orig_data = xdp->data;
4691	eth = (struct ethhdr *)xdp->data;
4692	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4693	orig_eth_type = eth->h_proto;
4694
4695	rxqueue = netif_get_rxqueue(skb);
4696	xdp->rxq = &rxqueue->xdp_rxq;
4697
4698	act = bpf_prog_run_xdp(xdp_prog, xdp);
4699
4700	/* check if bpf_xdp_adjust_head was used */
4701	off = xdp->data - orig_data;
4702	if (off) {
4703		if (off > 0)
4704			__skb_pull(skb, off);
4705		else if (off < 0)
4706			__skb_push(skb, -off);
4707
4708		skb->mac_header += off;
4709		skb_reset_network_header(skb);
4710	}
4711
4712	/* check if bpf_xdp_adjust_tail was used */
4713	off = xdp->data_end - orig_data_end;
4714	if (off != 0) {
4715		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4716		skb->len += off; /* positive on grow, negative on shrink */
4717	}
4718
4719	/* check if XDP changed the eth hdr such that the SKB needs an update */
4720	eth = (struct ethhdr *)xdp->data;
4721	if ((orig_eth_type != eth->h_proto) ||
4722	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4723		__skb_push(skb, ETH_HLEN);
4724		skb->protocol = eth_type_trans(skb, skb->dev);
4725	}
4726
4727	switch (act) {
4728	case XDP_REDIRECT:
4729	case XDP_TX:
4730		__skb_push(skb, mac_len);
4731		break;
4732	case XDP_PASS:
4733		metalen = xdp->data - xdp->data_meta;
4734		if (metalen)
4735			skb_metadata_set(skb, metalen);
4736		break;
4737	default:
4738		bpf_warn_invalid_xdp_action(act);
4739		fallthrough;
4740	case XDP_ABORTED:
4741		trace_xdp_exception(skb->dev, xdp_prog, act);
4742		fallthrough;
4743	case XDP_DROP:
4744	do_drop:
4745		kfree_skb(skb);
4746		break;
4747	}
4748
4749	return act;
4750}
4751
4752/* When doing generic XDP we have to bypass the qdisc layer and the
4753 * network taps in order to match in-driver-XDP behavior.
4754 */
4755void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4756{
4757	struct net_device *dev = skb->dev;
4758	struct netdev_queue *txq;
4759	bool free_skb = true;
4760	int cpu, rc;
4761
4762	txq = netdev_core_pick_tx(dev, skb, NULL);
4763	cpu = smp_processor_id();
4764	HARD_TX_LOCK(dev, txq, cpu);
4765	if (!netif_xmit_stopped(txq)) {
4766		rc = netdev_start_xmit(skb, dev, txq, 0);
4767		if (dev_xmit_complete(rc))
4768			free_skb = false;
4769	}
4770	HARD_TX_UNLOCK(dev, txq);
4771	if (free_skb) {
4772		trace_xdp_exception(dev, xdp_prog, XDP_TX);
4773		kfree_skb(skb);
4774	}
4775}
4776
4777static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4778
4779int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4780{
4781	if (xdp_prog) {
4782		struct xdp_buff xdp;
4783		u32 act;
4784		int err;
4785
4786		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4787		if (act != XDP_PASS) {
4788			switch (act) {
4789			case XDP_REDIRECT:
4790				err = xdp_do_generic_redirect(skb->dev, skb,
4791							      &xdp, xdp_prog);
4792				if (err)
4793					goto out_redir;
4794				break;
4795			case XDP_TX:
4796				generic_xdp_tx(skb, xdp_prog);
4797				break;
4798			}
4799			return XDP_DROP;
4800		}
4801	}
4802	return XDP_PASS;
4803out_redir:
4804	kfree_skb(skb);
4805	return XDP_DROP;
4806}
4807EXPORT_SYMBOL_GPL(do_xdp_generic);
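/* Generic XDP is what this path implements: the program runs on the skb after
 * the driver, not inside it.  With iproute2 it is typically attached in this
 * mode explicitly, e.g. (illustrative command; object and section names are
 * assumptions):
 *
 *	ip link set dev eth0 xdpgeneric obj xdp_prog.o sec xdp
 */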
4808
4809static int netif_rx_internal(struct sk_buff *skb)
4810{
4811	int ret;
4812
4813	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
4814
4815	trace_netif_rx(skb);
4816
4817#ifdef CONFIG_RPS
4818	if (static_branch_unlikely(&rps_needed)) {
4819		struct rps_dev_flow voidflow, *rflow = &voidflow;
4820		int cpu;
4821
4822		preempt_disable();
4823		rcu_read_lock();
4824
4825		cpu = get_rps_cpu(skb->dev, skb, &rflow);
4826		if (cpu < 0)
4827			cpu = smp_processor_id();
4828
4829		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4830
4831		rcu_read_unlock();
4832		preempt_enable();
4833	} else
4834#endif
4835	{
4836		unsigned int qtail;
4837
4838		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4839		put_cpu();
4840	}
4841	return ret;
4842}
4843
4844/**
4845 *	netif_rx	-	post buffer to the network code
4846 *	@skb: buffer to post
4847 *
4848 *	This function receives a packet from a device driver and queues it for
4849 *	the upper (protocol) levels to process.  It always succeeds. The buffer
4850 *	may be dropped during processing for congestion control or by the
4851 *	protocol layers.
4852 *
4853 *	return values:
4854 *	NET_RX_SUCCESS	(no congestion)
4855 *	NET_RX_DROP     (packet was dropped)
4856 *
4857 */
4858
4859int netif_rx(struct sk_buff *skb)
4860{
4861	int ret;
4862
4863	trace_netif_rx_entry(skb);
4864
4865	ret = netif_rx_internal(skb);
4866	trace_netif_rx_exit(ret);
4867
4868	return ret;
4869}
4870EXPORT_SYMBOL(netif_rx);
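/* Sketch of the classic non-NAPI receive path described above, from a
 * hypothetical driver's interrupt handler ("hw_buf"/"hw_len" stand in for
 * whatever the hardware provides):
 *
 *	skb = netdev_alloc_skb(dev, hw_len);
 *	if (!skb)
 *		return;
 *	skb_put_data(skb, hw_buf, hw_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */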
4871
4872int netif_rx_ni(struct sk_buff *skb)
4873{
4874	int err;
4875
4876	trace_netif_rx_ni_entry(skb);
4877
4878	preempt_disable();
4879	err = netif_rx_internal(skb);
4880	if (local_softirq_pending())
4881		do_softirq();
4882	preempt_enable();
4883	trace_netif_rx_ni_exit(err);
4884
4885	return err;
4886}
4887EXPORT_SYMBOL(netif_rx_ni);
4888
4889int netif_rx_any_context(struct sk_buff *skb)
4890{
4891	/*
4892	 * If invoked from contexts which do not invoke bottom half
4893	 * processing either at return from interrupt or when softirqs are
4894	 * re-enabled, use netif_rx_ni() which invokes bottom half processing
4895	 * directly.
4896	 */
4897	if (in_interrupt())
4898		return netif_rx(skb);
4899	else
4900		return netif_rx_ni(skb);
4901}
4902EXPORT_SYMBOL(netif_rx_any_context);
4903
4904static __latent_entropy void net_tx_action(struct softirq_action *h)
4905{
4906	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4907
4908	if (sd->completion_queue) {
4909		struct sk_buff *clist;
4910
4911		local_irq_disable();
4912		clist = sd->completion_queue;
4913		sd->completion_queue = NULL;
4914		local_irq_enable();
4915
4916		while (clist) {
4917			struct sk_buff *skb = clist;
4918
4919			clist = clist->next;
4920
4921			WARN_ON(refcount_read(&skb->users));
4922			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4923				trace_consume_skb(skb);
4924			else
4925				trace_kfree_skb(skb, net_tx_action);
4926
4927			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4928				__kfree_skb(skb);
4929			else
4930				__kfree_skb_defer(skb);
4931		}
4932
4933		__kfree_skb_flush();
4934	}
4935
4936	if (sd->output_queue) {
4937		struct Qdisc *head;
4938
4939		local_irq_disable();
4940		head = sd->output_queue;
4941		sd->output_queue = NULL;
4942		sd->output_queue_tailp = &sd->output_queue;
4943		local_irq_enable();
4944
4945		rcu_read_lock();
4946
4947		while (head) {
4948			struct Qdisc *q = head;
4949			spinlock_t *root_lock = NULL;
4950
4951			head = head->next_sched;
4952
4953			/* We need to make sure head->next_sched is read
4954			 * before clearing __QDISC_STATE_SCHED
4955			 */
4956			smp_mb__before_atomic();
4957
4958			if (!(q->flags & TCQ_F_NOLOCK)) {
4959				root_lock = qdisc_lock(q);
4960				spin_lock(root_lock);
4961			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
4962						     &q->state))) {
4963				/* There is a synchronize_net() between
4964				 * STATE_DEACTIVATED flag being set and
4965				 * qdisc_reset()/some_qdisc_is_busy() in
4966				 * dev_deactivate(), so we can safely bail out
4967				 * early here to avoid data race between
4968				 * qdisc_deactivate() and some_qdisc_is_busy()
4969				 * for lockless qdisc.
4970				 */
4971				clear_bit(__QDISC_STATE_SCHED, &q->state);
4972				continue;
4973			}
4974
4975			clear_bit(__QDISC_STATE_SCHED, &q->state);
4976			qdisc_run(q);
4977			if (root_lock)
4978				spin_unlock(root_lock);
4979		}
4980
4981		rcu_read_unlock();
4982	}
4983
4984	xfrm_dev_backlog(sd);
4985}
4986
4987#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4988/* This hook is defined here for ATM LANE */
4989int (*br_fdb_test_addr_hook)(struct net_device *dev,
4990			     unsigned char *addr) __read_mostly;
4991EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4992#endif
4993
4994static inline struct sk_buff *
4995sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4996		   struct net_device *orig_dev, bool *another)
4997{
4998#ifdef CONFIG_NET_CLS_ACT
4999	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
5000	struct tcf_result cl_res;
5001
5002	/* If there's at least one ingress present somewhere (so
5003	 * we get here via enabled static key), remaining devices
5004	 * that are not configured with an ingress qdisc will bail
5005	 * out here.
5006	 */
5007	if (!miniq)
5008		return skb;
5009
5010	if (*pt_prev) {
5011		*ret = deliver_skb(skb, *pt_prev, orig_dev);
5012		*pt_prev = NULL;
5013	}
5014
5015	qdisc_skb_cb(skb)->pkt_len = skb->len;
5016	qdisc_skb_cb(skb)->mru = 0;
5017	skb->tc_at_ingress = 1;
5018	mini_qdisc_bstats_cpu_update(miniq, skb);
5019
5020	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
5021				     &cl_res, false)) {
5022	case TC_ACT_OK:
5023	case TC_ACT_RECLASSIFY:
5024		skb->tc_index = TC_H_MIN(cl_res.classid);
5025		break;
5026	case TC_ACT_SHOT:
5027		mini_qdisc_qstats_cpu_drop(miniq);
5028		kfree_skb(skb);
5029		return NULL;
5030	case TC_ACT_STOLEN:
5031	case TC_ACT_QUEUED:
5032	case TC_ACT_TRAP:
5033		consume_skb(skb);
5034		return NULL;
5035	case TC_ACT_REDIRECT:
5036		/* skb_mac_header check was done by cls/act_bpf, so
5037		 * we can safely push the L2 header back before
5038		 * redirecting to another netdev
5039		 */
5040		__skb_push(skb, skb->mac_len);
5041		if (skb_do_redirect(skb) == -EAGAIN) {
5042			__skb_pull(skb, skb->mac_len);
5043			*another = true;
5044			break;
5045		}
5046		return NULL;
5047	case TC_ACT_CONSUMED:
5048		return NULL;
5049	default:
5050		break;
5051	}
5052#endif /* CONFIG_NET_CLS_ACT */
5053	return skb;
5054}
5055
5056/**
5057 *	netdev_is_rx_handler_busy - check if receive handler is registered
5058 *	@dev: device to check
5059 *
5060 *	Check if a receive handler is already registered for a given device.
5061 *	Return true if there is one.
5062 *
5063 *	The caller must hold the rtnl_mutex.
5064 */
5065bool netdev_is_rx_handler_busy(struct net_device *dev)
5066{
5067	ASSERT_RTNL();
5068	return dev && rtnl_dereference(dev->rx_handler);
5069}
5070EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5071
5072/**
5073 *	netdev_rx_handler_register - register receive handler
5074 *	@dev: device to register a handler for
5075 *	@rx_handler: receive handler to register
5076 *	@rx_handler_data: data pointer that is used by rx handler
5077 *
5078 *	Register a receive handler for a device. This handler will then be
5079 *	called from __netif_receive_skb. A negative errno code is returned
5080 *	on a failure.
5081 *
5082 *	The caller must hold the rtnl_mutex.
5083 *
5084 *	For a general description of rx_handler, see enum rx_handler_result.
5085 */
5086int netdev_rx_handler_register(struct net_device *dev,
5087			       rx_handler_func_t *rx_handler,
5088			       void *rx_handler_data)
5089{
5090	if (netdev_is_rx_handler_busy(dev))
5091		return -EBUSY;
5092
5093	if (dev->priv_flags & IFF_NO_RX_HANDLER)
5094		return -EINVAL;
5095
5096	/* Note: rx_handler_data must be set before rx_handler */
5097	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5098	rcu_assign_pointer(dev->rx_handler, rx_handler);
5099
5100	return 0;
5101}
5102EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
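/* Sketch of a macvlan/bridge-style upper device claiming a lower device's
 * receive path; every name prefixed "foo_" (and "lowerdev"/"port") is a
 * placeholder.  Registration must run under rtnl_lock():
 *
 *	static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		// inspect, steal or rewrite the skb here
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	err = netdev_rx_handler_register(lowerdev, foo_handle_frame, port);
 */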
5103
5104/**
5105 *	netdev_rx_handler_unregister - unregister receive handler
5106 *	@dev: device to unregister a handler from
5107 *
5108 *	Unregister a receive handler from a device.
5109 *
5110 *	The caller must hold the rtnl_mutex.
5111 */
5112void netdev_rx_handler_unregister(struct net_device *dev)
5113{
5115	ASSERT_RTNL();
5116	RCU_INIT_POINTER(dev->rx_handler, NULL);
5117	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
5118	 * section is guaranteed to see a non-NULL rx_handler_data
5119	 * as well.
5120	 */
5121	synchronize_net();
5122	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5123}
5124EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5125
5126/*
5127 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5128 * the special handling of PFMEMALLOC skbs.
5129 */
5130static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5131{
5132	switch (skb->protocol) {
5133	case htons(ETH_P_ARP):
5134	case htons(ETH_P_IP):
5135	case htons(ETH_P_IPV6):
5136	case htons(ETH_P_8021Q):
5137	case htons(ETH_P_8021AD):
5138		return true;
5139	default:
5140		return false;
5141	}
5142}
5143
5144static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5145			     int *ret, struct net_device *orig_dev)
5146{
5147	if (nf_hook_ingress_active(skb)) {
5148		int ingress_retval;
5149
5150		if (*pt_prev) {
5151			*ret = deliver_skb(skb, *pt_prev, orig_dev);
5152			*pt_prev = NULL;
5153		}
5154
5155		rcu_read_lock();
5156		ingress_retval = nf_hook_ingress(skb);
5157		rcu_read_unlock();
5158		return ingress_retval;
5159	}
5160	return 0;
5161}
5162
5163static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5164				    struct packet_type **ppt_prev)
5165{
5166	struct packet_type *ptype, *pt_prev;
5167	rx_handler_func_t *rx_handler;
5168	struct sk_buff *skb = *pskb;
5169	struct net_device *orig_dev;
5170	bool deliver_exact = false;
5171	int ret = NET_RX_DROP;
5172	__be16 type;
5173
5174	net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
5175
5176	trace_netif_receive_skb(skb);
5177
5178	orig_dev = skb->dev;
5179
5180	skb_reset_network_header(skb);
5181	if (!skb_transport_header_was_set(skb))
5182		skb_reset_transport_header(skb);
5183	skb_reset_mac_len(skb);
5184
5185	pt_prev = NULL;
5186
5187another_round:
5188	skb->skb_iif = skb->dev->ifindex;
5189
5190	__this_cpu_inc(softnet_data.processed);
5191
5192	if (static_branch_unlikely(&generic_xdp_needed_key)) {
5193		int ret2;
5194
5195		preempt_disable();
5196		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5197		preempt_enable();
5198
5199		if (ret2 != XDP_PASS) {
5200			ret = NET_RX_DROP;
5201			goto out;
5202		}
5203		skb_reset_mac_len(skb);
5204	}
5205
5206	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5207	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5208		skb = skb_vlan_untag(skb);
5209		if (unlikely(!skb))
5210			goto out;
5211	}
5212
5213	if (skb_skip_tc_classify(skb))
5214		goto skip_classify;
5215
5216	if (pfmemalloc)
5217		goto skip_taps;
5218
5219	list_for_each_entry_rcu(ptype, &ptype_all, list) {
5220		if (pt_prev)
5221			ret = deliver_skb(skb, pt_prev, orig_dev);
5222		pt_prev = ptype;
5223	}
5224
5225	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5226		if (pt_prev)
5227			ret = deliver_skb(skb, pt_prev, orig_dev);
5228		pt_prev = ptype;
5229	}
5230
5231skip_taps:
5232#ifdef CONFIG_NET_INGRESS
5233	if (static_branch_unlikely(&ingress_needed_key)) {
5234		bool another = false;
5235
5236		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5237					 &another);
5238		if (another)
5239			goto another_round;
5240		if (!skb)
5241			goto out;
5242
5243		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5244			goto out;
5245	}
5246#endif
5247	skb_reset_redirect(skb);
5248skip_classify:
5249	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5250		goto drop;
5251
5252	if (skb_vlan_tag_present(skb)) {
5253		if (pt_prev) {
5254			ret = deliver_skb(skb, pt_prev, orig_dev);
5255			pt_prev = NULL;
5256		}
5257		if (vlan_do_receive(&skb))
5258			goto another_round;
5259		else if (unlikely(!skb))
5260			goto out;
5261	}
5262
5263	rx_handler = rcu_dereference(skb->dev->rx_handler);
5264	if (rx_handler) {
5265		if (pt_prev) {
5266			ret = deliver_skb(skb, pt_prev, orig_dev);
5267			pt_prev = NULL;
5268		}
5269		switch (rx_handler(&skb)) {
5270		case RX_HANDLER_CONSUMED:
5271			ret = NET_RX_SUCCESS;
5272			goto out;
5273		case RX_HANDLER_ANOTHER:
5274			goto another_round;
5275		case RX_HANDLER_EXACT:
5276			deliver_exact = true;
			fallthrough;
5277		case RX_HANDLER_PASS:
5278			break;
5279		default:
5280			BUG();
5281		}
5282	}
5283
5284	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5285check_vlan_id:
5286		if (skb_vlan_tag_get_id(skb)) {
5287			/* Vlan id is non 0 and vlan_do_receive() above couldn't
5288			 * find vlan device.
5289			 */
5290			skb->pkt_type = PACKET_OTHERHOST;
5291		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5292			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5293			/* Outer header is 802.1P with vlan 0, inner header is
5294			 * 802.1Q or 802.1AD and vlan_do_receive() above could
5295			 * not find vlan dev for vlan id 0.
5296			 */
5297			__vlan_hwaccel_clear_tag(skb);
5298			skb = skb_vlan_untag(skb);
5299			if (unlikely(!skb))
5300				goto out;
5301			if (vlan_do_receive(&skb))
5302				/* After stripping off 802.1P header with vlan 0
5303				 * vlan dev is found for inner header.
5304				 */
5305				goto another_round;
5306			else if (unlikely(!skb))
5307				goto out;
5308			else
5309				/* We have stripped the outer 802.1P vlan 0 header
5310				 * but could not find a vlan dev.
5311				 * Check the vlan id again to set OTHERHOST.
5312				 */
5313				goto check_vlan_id;
5314		}
5315		/* Note: we might in the future use prio bits
5316		 * and set skb->priority like in vlan_do_receive().
5317		 * For the time being, just ignore the Priority Code Point.
5318		 */
5319		__vlan_hwaccel_clear_tag(skb);
5320	}
5321
5322	type = skb->protocol;
5323
5324	/* deliver only exact match when indicated */
5325	if (likely(!deliver_exact)) {
5326		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5327				       &ptype_base[ntohs(type) &
5328						   PTYPE_HASH_MASK]);
5329	}
5330
5331	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5332			       &orig_dev->ptype_specific);
5333
5334	if (unlikely(skb->dev != orig_dev)) {
5335		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5336				       &skb->dev->ptype_specific);
5337	}
5338
5339	if (pt_prev) {
5340		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5341			goto drop;
5342		*ppt_prev = pt_prev;
5343	} else {
5344drop:
5345		if (!deliver_exact)
5346			atomic_long_inc(&skb->dev->rx_dropped);
5347		else
5348			atomic_long_inc(&skb->dev->rx_nohandler);
5349		kfree_skb(skb);
5350		/* Jamal, now you will not be able to escape explaining
5351		 * to me how you were going to use this. :-)
5352		 */
5353		ret = NET_RX_DROP;
5354	}
5355
5356out:
5357	/* The invariant here is that if *ppt_prev is not NULL
5358	 * then skb should also be non-NULL.
5359	 *
5360	 * Apparently *ppt_prev assignment above holds this invariant due to
5361	 * skb dereferencing near it.
5362	 */
5363	*pskb = skb;
5364	return ret;
5365}
5366
5367static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5368{
5369	struct net_device *orig_dev = skb->dev;
5370	struct packet_type *pt_prev = NULL;
5371	int ret;
5372
5373	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5374	if (pt_prev)
5375		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5376					 skb->dev, pt_prev, orig_dev);
5377	return ret;
5378}
5379
5380/**
5381 *	netif_receive_skb_core - special purpose version of netif_receive_skb
5382 *	@skb: buffer to process
5383 *
5384 *	More direct receive version of netif_receive_skb().  It should
5385 *	only be used by callers that have a need to skip RPS and Generic XDP.
5386 *	Caller must also take care of handling ``(page_is_)pfmemalloc`` buffers itself.
5387 *
5388 *	This function may only be called from softirq context and interrupts
5389 *	should be enabled.
5390 *
5391 *	Return values (usually ignored):
5392 *	NET_RX_SUCCESS: no congestion
5393 *	NET_RX_DROP: packet was dropped
5394 */
5395int netif_receive_skb_core(struct sk_buff *skb)
5396{
5397	int ret;
5398
5399	rcu_read_lock();
5400	ret = __netif_receive_skb_one_core(skb, false);
5401	rcu_read_unlock();
5402
5403	return ret;
5404}
5405EXPORT_SYMBOL(netif_receive_skb_core);
5406
5407static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5408						  struct packet_type *pt_prev,
5409						  struct net_device *orig_dev)
5410{
5411	struct sk_buff *skb, *next;
5412
5413	if (!pt_prev)
5414		return;
5415	if (list_empty(head))
5416		return;
5417	if (pt_prev->list_func != NULL)
5418		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5419				   ip_list_rcv, head, pt_prev, orig_dev);
5420	else
5421		list_for_each_entry_safe(skb, next, head, list) {
5422			skb_list_del_init(skb);
5423			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5424		}
5425}
5426
5427static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5428{
5429	/* Fast-path assumptions:
5430	 * - There is no RX handler.
5431	 * - Only one packet_type matches.
5432	 * If either of these fails, we will end up doing some per-packet
5433	 * processing in-line, then handling the 'last ptype' for the whole
5434	 * sublist.  This can't cause out-of-order delivery to any single ptype,
5435	 * because the 'last ptype' must be constant across the sublist, and all
5436	 * other ptypes are handled per-packet.
5437	 */
5438	/* Current (common) ptype of sublist */
5439	struct packet_type *pt_curr = NULL;
5440	/* Current (common) orig_dev of sublist */
5441	struct net_device *od_curr = NULL;
5442	struct list_head sublist;
5443	struct sk_buff *skb, *next;
5444
5445	INIT_LIST_HEAD(&sublist);
5446	list_for_each_entry_safe(skb, next, head, list) {
5447		struct net_device *orig_dev = skb->dev;
5448		struct packet_type *pt_prev = NULL;
5449
5450		skb_list_del_init(skb);
5451		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5452		if (!pt_prev)
5453			continue;
5454		if (pt_curr != pt_prev || od_curr != orig_dev) {
5455			/* dispatch old sublist */
5456			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5457			/* start new sublist */
5458			INIT_LIST_HEAD(&sublist);
5459			pt_curr = pt_prev;
5460			od_curr = orig_dev;
5461		}
5462		list_add_tail(&skb->list, &sublist);
5463	}
5464
5465	/* dispatch final sublist */
5466	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5467}
5468
5469static int __netif_receive_skb(struct sk_buff *skb)
5470{
5471	int ret;
5472
5473	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5474		unsigned int noreclaim_flag;
5475
5476		/*
5477		 * PFMEMALLOC skbs are special, they should
5478		 * - be delivered to SOCK_MEMALLOC sockets only
5479		 * - stay away from userspace
5480		 * - have bounded memory usage
5481		 *
5482		 * Use PF_MEMALLOC as this saves us from propagating the allocation
5483		 * context down to all allocation sites.
5484		 */
5485		noreclaim_flag = memalloc_noreclaim_save();
5486		ret = __netif_receive_skb_one_core(skb, true);
5487		memalloc_noreclaim_restore(noreclaim_flag);
5488	} else
5489		ret = __netif_receive_skb_one_core(skb, false);
5490
5491	return ret;
5492}
5493
5494static void __netif_receive_skb_list(struct list_head *head)
5495{
5496	unsigned long noreclaim_flag = 0;
5497	struct sk_buff *skb, *next;
5498	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5499
5500	list_for_each_entry_safe(skb, next, head, list) {
5501		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5502			struct list_head sublist;
5503
5504			/* Handle the previous sublist */
5505			list_cut_before(&sublist, head, &skb->list);
5506			if (!list_empty(&sublist))
5507				__netif_receive_skb_list_core(&sublist, pfmemalloc);
5508			pfmemalloc = !pfmemalloc;
5509			/* See comments in __netif_receive_skb */
5510			if (pfmemalloc)
5511				noreclaim_flag = memalloc_noreclaim_save();
5512			else
5513				memalloc_noreclaim_restore(noreclaim_flag);
5514		}
5515	}
5516	/* Handle the remaining sublist */
5517	if (!list_empty(head))
5518		__netif_receive_skb_list_core(head, pfmemalloc);
5519	/* Restore pflags */
5520	if (pfmemalloc)
5521		memalloc_noreclaim_restore(noreclaim_flag);
5522}
5523
5524static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5525{
5526	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5527	struct bpf_prog *new = xdp->prog;
5528	int ret = 0;
5529
5530	if (new) {
5531		u32 i;
5532
5533		mutex_lock(&new->aux->used_maps_mutex);
5534
5535		/* generic XDP does not work with DEVMAPs that can
5536		 * have a bpf_prog installed on an entry
5537		 */
5538		for (i = 0; i < new->aux->used_map_cnt; i++) {
5539			if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
5540			    cpu_map_prog_allowed(new->aux->used_maps[i])) {
5541				mutex_unlock(&new->aux->used_maps_mutex);
5542				return -EINVAL;
5543			}
5544		}
5545
5546		mutex_unlock(&new->aux->used_maps_mutex);
5547	}
5548
5549	switch (xdp->command) {
5550	case XDP_SETUP_PROG:
5551		rcu_assign_pointer(dev->xdp_prog, new);
5552		if (old)
5553			bpf_prog_put(old);
5554
5555		if (old && !new) {
5556			static_branch_dec(&generic_xdp_needed_key);
5557		} else if (new && !old) {
5558			static_branch_inc(&generic_xdp_needed_key);
5559			dev_disable_lro(dev);
5560			dev_disable_gro_hw(dev);
5561		}
5562		break;
5563
5564	default:
5565		ret = -EINVAL;
5566		break;
5567	}
5568
5569	return ret;
5570}
5571
5572static int netif_receive_skb_internal(struct sk_buff *skb)
5573{
5574	int ret;
5575
5576	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5577
5578	if (skb_defer_rx_timestamp(skb))
5579		return NET_RX_SUCCESS;
5580
5581	rcu_read_lock();
5582#ifdef CONFIG_RPS
5583	if (static_branch_unlikely(&rps_needed)) {
5584		struct rps_dev_flow voidflow, *rflow = &voidflow;
5585		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5586
5587		if (cpu >= 0) {
5588			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5589			rcu_read_unlock();
5590			return ret;
5591		}
5592	}
5593#endif
5594	ret = __netif_receive_skb(skb);
5595	rcu_read_unlock();
5596	return ret;
5597}
5598
5599static void netif_receive_skb_list_internal(struct list_head *head)
5600{
5601	struct sk_buff *skb, *next;
5602	struct list_head sublist;
5603
5604	INIT_LIST_HEAD(&sublist);
5605	list_for_each_entry_safe(skb, next, head, list) {
5606		net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5607		skb_list_del_init(skb);
5608		if (!skb_defer_rx_timestamp(skb))
5609			list_add_tail(&skb->list, &sublist);
5610	}
5611	list_splice_init(&sublist, head);
5612
5613	rcu_read_lock();
5614#ifdef CONFIG_RPS
5615	if (static_branch_unlikely(&rps_needed)) {
5616		list_for_each_entry_safe(skb, next, head, list) {
5617			struct rps_dev_flow voidflow, *rflow = &voidflow;
5618			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5619
5620			if (cpu >= 0) {
5621				/* Will be handled, remove from list */
5622				skb_list_del_init(skb);
5623				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5624			}
5625		}
5626	}
5627#endif
5628	__netif_receive_skb_list(head);
5629	rcu_read_unlock();
5630}
5631
5632/**
5633 *	netif_receive_skb - process receive buffer from network
5634 *	@skb: buffer to process
5635 *
5636 *	netif_receive_skb() is the main receive data processing function.
5637 *	It always succeeds. The buffer may be dropped during processing
5638 *	for congestion control or by the protocol layers.
5639 *
5640 *	This function may only be called from softirq context and interrupts
5641 *	should be enabled.
5642 *
5643 *	Return values (usually ignored):
5644 *	NET_RX_SUCCESS: no congestion
5645 *	NET_RX_DROP: packet was dropped
5646 */
5647int netif_receive_skb(struct sk_buff *skb)
5648{
5649	int ret;
5650
5651	trace_netif_receive_skb_entry(skb);
5652
5653	ret = netif_receive_skb_internal(skb);
5654	trace_netif_receive_skb_exit(ret);
5655
5656	return ret;
5657}
5658EXPORT_SYMBOL(netif_receive_skb);
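/* Typical caller: the NAPI poll routine of a hypothetical driver ("foo_*"
 * names and the ring layout are placeholders), feeding frames to the stack
 * while honouring the poll budget:
 *
 *	while (work_done < budget &&
 *	       (skb = foo_ring_next_skb(ring)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, napi->dev);
 *		netif_receive_skb(skb);
 *		work_done++;
 *	}
 */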
5659
5660/**
5661 *	netif_receive_skb_list - process many receive buffers from network
5662 *	@head: list of skbs to process.
5663 *
5664 *	Since return value of netif_receive_skb() is normally ignored, and
5665 *	wouldn't be meaningful for a list, this function returns void.
5666 *
5667 *	This function may only be called from softirq context and interrupts
5668 *	should be enabled.
5669 */
5670void netif_receive_skb_list(struct list_head *head)
5671{
5672	struct sk_buff *skb;
5673
5674	if (list_empty(head))
5675		return;
5676	if (trace_netif_receive_skb_list_entry_enabled()) {
5677		list_for_each_entry(skb, head, list)
5678			trace_netif_receive_skb_list_entry(skb);
5679	}
5680	netif_receive_skb_list_internal(head);
5681	trace_netif_receive_skb_list_exit(0);
5682}
5683EXPORT_SYMBOL(netif_receive_skb_list);
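/* The list variant amortizes per-packet overhead.  A hypothetical driver can
 * batch frames on a local list (skb->list is the anchor) and hand them over
 * in a single call:
 *
 *	LIST_HEAD(rx_list);
 *
 *	while ((skb = foo_ring_next_skb(ring)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, dev);
 *		list_add_tail(&skb->list, &rx_list);
 *	}
 *	netif_receive_skb_list(&rx_list);
 */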
5684
5685static DEFINE_PER_CPU(struct work_struct, flush_works);
5686
5687/* Network device is going away, flush any packets still pending */
5688static void flush_backlog(struct work_struct *work)
5689{
5690	struct sk_buff *skb, *tmp;
5691	struct softnet_data *sd;
5692
5693	local_bh_disable();
5694	sd = this_cpu_ptr(&softnet_data);
5695
5696	local_irq_disable();
5697	rps_lock(sd);
5698	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5699		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5700			__skb_unlink(skb, &sd->input_pkt_queue);
5701			dev_kfree_skb_irq(skb);
5702			input_queue_head_incr(sd);
5703		}
5704	}
5705	rps_unlock(sd);
5706	local_irq_enable();
5707
5708	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5709		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5710			__skb_unlink(skb, &sd->process_queue);
5711			kfree_skb(skb);
5712			input_queue_head_incr(sd);
5713		}
5714	}
5715	local_bh_enable();
5716}
5717
5718static bool flush_required(int cpu)
5719{
5720#if IS_ENABLED(CONFIG_RPS)
5721	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5722	bool do_flush;
5723
5724	local_irq_disable();
5725	rps_lock(sd);
5726
5727	/* as insertion into process_queue happens with the rps lock held,
5728	 * process_queue access may race only with dequeue
5729	 */
5730	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5731		   !skb_queue_empty_lockless(&sd->process_queue);
5732	rps_unlock(sd);
5733	local_irq_enable();
5734
5735	return do_flush;
5736#endif
5737	/* without RPS we can't safely check input_pkt_queue: during a
5738	 * concurrent remote skb_queue_splice() we can detect as empty both
5739	 * input_pkt_queue and process_queue even if the latter could end-up
5740	 * containing a lot of packets.
5741	 */
5742	return true;
5743}
5744
5745static void flush_all_backlogs(void)
5746{
5747	static cpumask_t flush_cpus;
5748	unsigned int cpu;
5749
5750	/* since we are under rtnl lock protection we can use static data
5751	 * for the cpumask and avoid allocating the possibly large mask
5752	 * on the stack
5753	 */
5754	ASSERT_RTNL();
5755
5756	get_online_cpus();
5757
5758	cpumask_clear(&flush_cpus);
5759	for_each_online_cpu(cpu) {
5760		if (flush_required(cpu)) {
5761			queue_work_on(cpu, system_highpri_wq,
5762				      per_cpu_ptr(&flush_works, cpu));
5763			cpumask_set_cpu(cpu, &flush_cpus);
5764		}
5765	}
5766
5767	/* Packets may still be in flight on the CPUs we are not flushing;
5768	 * synchronize_net() in unregister_netdevice_many() will take care of
5769	 * them.
5770	 */
5771	for_each_cpu(cpu, &flush_cpus)
5772		flush_work(per_cpu_ptr(&flush_works, cpu));
5773
5774	put_online_cpus();
5775}
5776
5777/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5778static void gro_normal_list(struct napi_struct *napi)
5779{
5780	if (!napi->rx_count)
5781		return;
5782	netif_receive_skb_list_internal(&napi->rx_list);
5783	INIT_LIST_HEAD(&napi->rx_list);
5784	napi->rx_count = 0;
5785}
5786
5787/* Queue one GRO_NORMAL SKB up for list processing. If the batch size is
5788 * exceeded, pass the whole batch up to the stack.
5789 */
5790static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
5791{
5792	list_add_tail(&skb->list, &napi->rx_list);
5793	napi->rx_count += segs;
5794	if (napi->rx_count >= gro_normal_batch)
5795		gro_normal_list(napi);
5796}
5797
5798INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5799INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5800static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
5801{
5802	struct packet_offload *ptype;
5803	__be16 type = skb->protocol;
5804	struct list_head *head = &offload_base;
5805	int err = -ENOENT;
5806
5807	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5808
5809	if (NAPI_GRO_CB(skb)->count == 1) {
5810		skb_shinfo(skb)->gso_size = 0;
5811		goto out;
5812	}
5813
5814	rcu_read_lock();
5815	list_for_each_entry_rcu(ptype, head, list) {
5816		if (ptype->type != type || !ptype->callbacks.gro_complete)
5817			continue;
5818
5819		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5820					 ipv6_gro_complete, inet_gro_complete,
5821					 skb, 0);
5822		break;
5823	}
5824	rcu_read_unlock();
5825
5826	if (err) {
5827		WARN_ON(&ptype->list == head);
5828		kfree_skb(skb);
5829		return NET_RX_SUCCESS;
5830	}
5831
5832out:
5833	gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
5834	return NET_RX_SUCCESS;
5835}
5836
5837static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5838				   bool flush_old)
5839{
5840	struct list_head *head = &napi->gro_hash[index].list;
5841	struct sk_buff *skb, *p;
5842
5843	list_for_each_entry_safe_reverse(skb, p, head, list) {
5844		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5845			return;
5846		skb_list_del_init(skb);
5847		napi_gro_complete(napi, skb);
5848		napi->gro_hash[index].count--;
5849	}
5850
5851	if (!napi->gro_hash[index].count)
5852		__clear_bit(index, &napi->gro_bitmask);
5853}
5854
5855/* napi->gro_hash[].list contains packets ordered by age, with the
5856 * youngest packets at the head of the list.
5857 * Complete skbs in reverse order (oldest first) to reduce latencies.
5858 */
5859void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5860{
5861	unsigned long bitmask = napi->gro_bitmask;
5862	unsigned int i, base = ~0U;
5863
5864	while ((i = ffs(bitmask)) != 0) {
5865		bitmask >>= i;
5866		base += i;
5867		__napi_gro_flush_chain(napi, base, flush_old);
5868	}
5869}
5870EXPORT_SYMBOL(napi_gro_flush);
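
/*
 * The base/ffs() arithmetic above is compact but a little opaque.  A minimal
 * userspace-compilable sketch (not kernel code) of the same bucket-walk
 * order, assuming the libc ffs() from <strings.h>, which uses the same
 * 1-based convention as the kernel helper:
 */
#include <stdio.h>
#include <strings.h>

int main(void)
{
	unsigned int bitmask = 0x0a;	/* buckets 1 and 3 populated */
	unsigned int i, base = ~0U;

	while ((i = ffs(bitmask)) != 0) {
		bitmask >>= i;
		base += i;	/* unsigned wrap turns ~0U + 2 into 1 */
		printf("flush GRO bucket %u\n", base);	/* prints 1, then 3 */
	}
	return 0;
}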
5871
5872static struct list_head *gro_list_prepare(struct napi_struct *napi,
5873					  struct sk_buff *skb)
5874{
5875	unsigned int maclen = skb->dev->hard_header_len;
5876	u32 hash = skb_get_hash_raw(skb);
5877	struct list_head *head;
5878	struct sk_buff *p;
5879
5880	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5881	list_for_each_entry(p, head, list) {
5882		unsigned long diffs;
5883
5884		NAPI_GRO_CB(p)->flush = 0;
5885
5886		if (hash != skb_get_hash_raw(p)) {
5887			NAPI_GRO_CB(p)->same_flow = 0;
5888			continue;
5889		}
5890
5891		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5892		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5893		if (skb_vlan_tag_present(p))
5894			diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
5895		diffs |= skb_metadata_dst_cmp(p, skb);
5896		diffs |= skb_metadata_differs(p, skb);
5897		if (maclen == ETH_HLEN)
5898			diffs |= compare_ether_header(skb_mac_header(p),
5899						      skb_mac_header(skb));
5900		else if (!diffs)
5901			diffs = memcmp(skb_mac_header(p),
5902				       skb_mac_header(skb),
5903				       maclen);
5904
5905		diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
5906#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
5907		if (!diffs) {
5908			struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT);
5909			struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT);
5910
5911			diffs |= (!!p_ext) ^ (!!skb_ext);
5912			if (!diffs && unlikely(skb_ext))
5913				diffs |= p_ext->chain ^ skb_ext->chain;
5914		}
5915#endif
5916
5917		NAPI_GRO_CB(p)->same_flow = !diffs;
5918	}
5919
5920	return head;
5921}
5922
5923static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
5924{
5925	const struct skb_shared_info *pinfo = skb_shinfo(skb);
5926	const skb_frag_t *frag0 = &pinfo->frags[0];
5927
5928	NAPI_GRO_CB(skb)->data_offset = 0;
5929	NAPI_GRO_CB(skb)->frag0 = NULL;
5930	NAPI_GRO_CB(skb)->frag0_len = 0;
5931
5932	if (!skb_headlen(skb) && pinfo->nr_frags &&
5933	    !PageHighMem(skb_frag_page(frag0)) &&
5934	    (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
5935		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5936		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5937						    skb_frag_size(frag0),
5938						    skb->end - skb->tail);
5939	}
5940}
5941
5942static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5943{
5944	struct skb_shared_info *pinfo = skb_shinfo(skb);
5945
5946	BUG_ON(skb->end - skb->tail < grow);
5947
5948	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5949
5950	skb->data_len -= grow;
5951	skb->tail += grow;
5952
5953	skb_frag_off_add(&pinfo->frags[0], grow);
5954	skb_frag_size_sub(&pinfo->frags[0], grow);
5955
5956	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5957		skb_frag_unref(skb, 0);
5958		memmove(pinfo->frags, pinfo->frags + 1,
5959			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
5960	}
5961}
5962
5963static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
5964{
5965	struct sk_buff *oldest;
5966
5967	oldest = list_last_entry(head, struct sk_buff, list);
5968
5969	/* We are called with head length >= MAX_GRO_SKBS, so this is
5970	 * impossible.
5971	 */
5972	if (WARN_ON_ONCE(!oldest))
5973		return;
5974
5975	/* Do not adjust napi->gro_hash[].count; the caller is adding a new
5976	 * SKB to the chain.
5977	 */
5978	skb_list_del_init(oldest);
5979	napi_gro_complete(napi, oldest);
5980}
5981
5982INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5983							   struct sk_buff *));
5984INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5985							   struct sk_buff *));
5986static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5987{
5988	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5989	struct list_head *head = &offload_base;
5990	struct packet_offload *ptype;
5991	__be16 type = skb->protocol;
5992	struct list_head *gro_head;
5993	struct sk_buff *pp = NULL;
5994	enum gro_result ret;
5995	int same_flow;
5996	int grow;
5997
5998	if (netif_elide_gro(skb->dev))
5999		goto normal;
6000
6001	gro_head = gro_list_prepare(napi, skb);
6002
6003	rcu_read_lock();
6004	list_for_each_entry_rcu(ptype, head, list) {
6005		if (ptype->type != type || !ptype->callbacks.gro_receive)
6006			continue;
6007
6008		skb_set_network_header(skb, skb_gro_offset(skb));
6009		skb_reset_mac_len(skb);
6010		NAPI_GRO_CB(skb)->same_flow = 0;
6011		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
6012		NAPI_GRO_CB(skb)->free = 0;
6013		NAPI_GRO_CB(skb)->encap_mark = 0;
6014		NAPI_GRO_CB(skb)->recursion_counter = 0;
6015		NAPI_GRO_CB(skb)->is_fou = 0;
6016		NAPI_GRO_CB(skb)->is_atomic = 1;
6017		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
6018
6019		/* Setup for GRO checksum validation */
6020		switch (skb->ip_summed) {
6021		case CHECKSUM_COMPLETE:
6022			NAPI_GRO_CB(skb)->csum = skb->csum;
6023			NAPI_GRO_CB(skb)->csum_valid = 1;
6024			NAPI_GRO_CB(skb)->csum_cnt = 0;
6025			break;
6026		case CHECKSUM_UNNECESSARY:
6027			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
6028			NAPI_GRO_CB(skb)->csum_valid = 0;
6029			break;
6030		default:
6031			NAPI_GRO_CB(skb)->csum_cnt = 0;
6032			NAPI_GRO_CB(skb)->csum_valid = 0;
6033		}
6034
6035		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
6036					ipv6_gro_receive, inet_gro_receive,
6037					gro_head, skb);
6038		break;
6039	}
6040	rcu_read_unlock();
6041
6042	if (&ptype->list == head)
6043		goto normal;
6044
6045	if (PTR_ERR(pp) == -EINPROGRESS) {
6046		ret = GRO_CONSUMED;
6047		goto ok;
6048	}
6049
6050	same_flow = NAPI_GRO_CB(skb)->same_flow;
6051	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
6052
6053	if (pp) {
6054		skb_list_del_init(pp);
6055		napi_gro_complete(napi, pp);
6056		napi->gro_hash[hash].count--;
6057	}
6058
6059	if (same_flow)
6060		goto ok;
6061
6062	if (NAPI_GRO_CB(skb)->flush)
6063		goto normal;
6064
6065	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
6066		gro_flush_oldest(napi, gro_head);
6067	} else {
6068		napi->gro_hash[hash].count++;
6069	}
6070	NAPI_GRO_CB(skb)->count = 1;
6071	NAPI_GRO_CB(skb)->age = jiffies;
6072	NAPI_GRO_CB(skb)->last = skb;
6073	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
6074	list_add(&skb->list, gro_head);
6075	ret = GRO_HELD;
6076
6077pull:
6078	grow = skb_gro_offset(skb) - skb_headlen(skb);
6079	if (grow > 0)
6080		gro_pull_from_frag0(skb, grow);
6081ok:
6082	if (napi->gro_hash[hash].count) {
6083		if (!test_bit(hash, &napi->gro_bitmask))
6084			__set_bit(hash, &napi->gro_bitmask);
6085	} else if (test_bit(hash, &napi->gro_bitmask)) {
6086		__clear_bit(hash, &napi->gro_bitmask);
6087	}
6088
6089	return ret;
6090
6091normal:
6092	ret = GRO_NORMAL;
6093	goto pull;
6094}
6095
6096struct packet_offload *gro_find_receive_by_type(__be16 type)
6097{
6098	struct list_head *offload_head = &offload_base;
6099	struct packet_offload *ptype;
6100
6101	list_for_each_entry_rcu(ptype, offload_head, list) {
6102		if (ptype->type != type || !ptype->callbacks.gro_receive)
6103			continue;
6104		return ptype;
6105	}
6106	return NULL;
6107}
6108EXPORT_SYMBOL(gro_find_receive_by_type);
6109
6110struct packet_offload *gro_find_complete_by_type(__be16 type)
6111{
6112	struct list_head *offload_head = &offload_base;
6113	struct packet_offload *ptype;
6114
6115	list_for_each_entry_rcu(ptype, offload_head, list) {
6116		if (ptype->type != type || !ptype->callbacks.gro_complete)
6117			continue;
6118		return ptype;
6119	}
6120	return NULL;
6121}
6122EXPORT_SYMBOL(gro_find_complete_by_type);
6123
6124static void napi_skb_free_stolen_head(struct sk_buff *skb)
6125{
6126	nf_reset_ct(skb);
6127	skb_dst_drop(skb);
6128	skb_ext_put(skb);
6129	kmem_cache_free(skbuff_head_cache, skb);
6130}
6131
6132static gro_result_t napi_skb_finish(struct napi_struct *napi,
6133				    struct sk_buff *skb,
6134				    gro_result_t ret)
6135{
6136	switch (ret) {
6137	case GRO_NORMAL:
6138		gro_normal_one(napi, skb, 1);
6139		break;
6140
6141	case GRO_DROP:
6142		kfree_skb(skb);
6143		break;
6144
6145	case GRO_MERGED_FREE:
6146		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
6147			napi_skb_free_stolen_head(skb);
6148		else
6149			__kfree_skb(skb);
6150		break;
6151
6152	case GRO_HELD:
6153	case GRO_MERGED:
6154	case GRO_CONSUMED:
6155		break;
6156	}
6157
6158	return ret;
6159}
6160
6161gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
6162{
6163	gro_result_t ret;
6164
6165	skb_mark_napi_id(skb, napi);
6166	trace_napi_gro_receive_entry(skb);
6167
6168	skb_gro_reset_offset(skb, 0);
6169
6170	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
6171	trace_napi_gro_receive_exit(ret);
6172
6173	return ret;
6174}
6175EXPORT_SYMBOL(napi_gro_receive);
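
/*
 * Example (illustrative sketch): the usual per-packet hand-off from a NAPI
 * poll loop.  struct mydrv_queue and the copy-based RX are hypothetical;
 * napi_alloc_skb(), skb_put_data(), eth_type_trans() and napi_gro_receive()
 * are the real APIs being demonstrated.
 */
static void mydrv_rx_one(struct mydrv_queue *q, const void *buf,
			 unsigned int len)
{
	struct sk_buff *skb;

	skb = napi_alloc_skb(&q->napi, len);
	if (unlikely(!skb))
		return;		/* drop: no memory for an skb head */

	skb_put_data(skb, buf, len);	/* frame includes the Ethernet header */
	skb->protocol = eth_type_trans(skb, q->netdev);
	napi_gro_receive(&q->napi, skb);
}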
6176
6177static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
6178{
6179	if (unlikely(skb->pfmemalloc)) {
6180		consume_skb(skb);
6181		return;
6182	}
6183	__skb_pull(skb, skb_headlen(skb));
6184	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
6185	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
6186	__vlan_hwaccel_clear_tag(skb);
6187	skb->dev = napi->dev;
6188	skb->skb_iif = 0;
6189
6190	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
6191	skb->pkt_type = PACKET_HOST;
6192
6193	skb->encapsulation = 0;
6194	skb_shinfo(skb)->gso_type = 0;
6195	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6196	skb_ext_reset(skb);
6197	nf_reset_ct(skb);
6198
6199	napi->skb = skb;
6200}
6201
6202struct sk_buff *napi_get_frags(struct napi_struct *napi)
6203{
6204	struct sk_buff *skb = napi->skb;
6205
6206	if (!skb) {
6207		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
6208		if (skb) {
6209			napi->skb = skb;
6210			skb_mark_napi_id(skb, napi);
6211		}
6212	}
6213	return skb;
6214}
6215EXPORT_SYMBOL(napi_get_frags);
6216
6217static gro_result_t napi_frags_finish(struct napi_struct *napi,
6218				      struct sk_buff *skb,
6219				      gro_result_t ret)
6220{
6221	switch (ret) {
6222	case GRO_NORMAL:
6223	case GRO_HELD:
6224		__skb_push(skb, ETH_HLEN);
6225		skb->protocol = eth_type_trans(skb, skb->dev);
6226		if (ret == GRO_NORMAL)
6227			gro_normal_one(napi, skb, 1);
6228		break;
6229
6230	case GRO_DROP:
6231		napi_reuse_skb(napi, skb);
6232		break;
6233
6234	case GRO_MERGED_FREE:
6235		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
6236			napi_skb_free_stolen_head(skb);
6237		else
6238			napi_reuse_skb(napi, skb);
6239		break;
6240
6241	case GRO_MERGED:
6242	case GRO_CONSUMED:
6243		break;
6244	}
6245
6246	return ret;
6247}
6248
6249/* The upper GRO stack assumes the network header starts at gro_offset=0.
6250 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
6251 * we copy the Ethernet header into skb->data to have a common layout.
6252 */
6253static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
6254{
6255	struct sk_buff *skb = napi->skb;
6256	const struct ethhdr *eth;
6257	unsigned int hlen = sizeof(*eth);
6258
6259	napi->skb = NULL;
6260
6261	skb_reset_mac_header(skb);
6262	skb_gro_reset_offset(skb, hlen);
6263
6264	if (unlikely(skb_gro_header_hard(skb, hlen))) {
6265		eth = skb_gro_header_slow(skb, hlen, 0);
6266		if (unlikely(!eth)) {
6267			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
6268					     __func__, napi->dev->name);
6269			napi_reuse_skb(napi, skb);
6270			return NULL;
6271		}
6272	} else {
6273		eth = (const struct ethhdr *)skb->data;
6274		gro_pull_from_frag0(skb, hlen);
6275		NAPI_GRO_CB(skb)->frag0 += hlen;
6276		NAPI_GRO_CB(skb)->frag0_len -= hlen;
6277	}
6278	__skb_pull(skb, hlen);
6279
6280	/*
6281	 * This works because the only protocols we care about don't require
6282	 * special handling.
6283	 * We'll fix it up properly in napi_frags_finish()
6284	 */
6285	skb->protocol = eth->h_proto;
6286
6287	return skb;
6288}
6289
6290gro_result_t napi_gro_frags(struct napi_struct *napi)
6291{
6292	gro_result_t ret;
6293	struct sk_buff *skb = napi_frags_skb(napi);
6294
6295	if (!skb)
6296		return GRO_DROP;
6297
6298	trace_napi_gro_frags_entry(skb);
6299
6300	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6301	trace_napi_gro_frags_exit(ret);
6302
6303	return ret;
6304}
6305EXPORT_SYMBOL(napi_gro_frags);
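
/*
 * Example (illustrative sketch): page-based RX through napi_get_frags() /
 * napi_gro_frags(), as a few drivers do it.  The page/offset/len/truesize
 * values are assumed to come from a driver-private RX descriptor, and the
 * driver is assumed to already hold a page reference for the frag.
 */
static void mydrv_rx_frag(struct napi_struct *napi, struct page *page,
			  unsigned int offset, unsigned int len,
			  unsigned int truesize)
{
	struct sk_buff *skb;

	skb = napi_get_frags(napi);
	if (unlikely(!skb))
		return;		/* drop; the page stays with the driver */

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += truesize;

	/* GRO pulls the Ethernet header out of frag0 itself, see
	 * napi_frags_skb() above.
	 */
	napi_gro_frags(napi);
}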
6306
6307/* Compute the checksum from gro_offset and return the folded value
6308 * after adding in any pseudo checksum.
6309 */
6310__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
6311{
6312	__wsum wsum;
6313	__sum16 sum;
6314
6315	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
6316
6317	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
6318	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6319	/* See comments in __skb_checksum_complete(). */
6320	if (likely(!sum)) {
6321		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
6322		    !skb->csum_complete_sw)
6323			netdev_rx_csum_fault(skb->dev, skb);
6324	}
6325
6326	NAPI_GRO_CB(skb)->csum = wsum;
6327	NAPI_GRO_CB(skb)->csum_valid = 1;
6328
6329	return sum;
6330}
6331EXPORT_SYMBOL(__skb_gro_checksum_complete);
6332
6333static void net_rps_send_ipi(struct softnet_data *remsd)
6334{
6335#ifdef CONFIG_RPS
6336	while (remsd) {
6337		struct softnet_data *next = remsd->rps_ipi_next;
6338
6339		if (cpu_online(remsd->cpu))
6340			smp_call_function_single_async(remsd->cpu, &remsd->csd);
6341		remsd = next;
6342	}
6343#endif
6344}
6345
6346/*
6347 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
6348 * Note: called with local irq disabled, but exits with local irq enabled.
6349 */
6350static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6351{
6352#ifdef CONFIG_RPS
6353	struct softnet_data *remsd = sd->rps_ipi_list;
6354
6355	if (remsd) {
6356		sd->rps_ipi_list = NULL;
6357
6358		local_irq_enable();
6359
6360		/* Send pending IPIs to kick RPS processing on remote CPUs. */
6361		net_rps_send_ipi(remsd);
6362	} else
6363#endif
6364		local_irq_enable();
6365}
6366
6367static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6368{
6369#ifdef CONFIG_RPS
6370	return sd->rps_ipi_list != NULL;
6371#else
6372	return false;
6373#endif
6374}
6375
6376static int process_backlog(struct napi_struct *napi, int quota)
6377{
6378	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6379	bool again = true;
6380	int work = 0;
6381
6382	/* Check if we have pending IPIs; it's better to send them now
6383	 * rather than waiting for net_rx_action() to finish.
6384	 */
6385	if (sd_has_rps_ipi_waiting(sd)) {
6386		local_irq_disable();
6387		net_rps_action_and_irq_enable(sd);
6388	}
6389
6390	napi->weight = READ_ONCE(dev_rx_weight);
6391	while (again) {
6392		struct sk_buff *skb;
6393
6394		while ((skb = __skb_dequeue(&sd->process_queue))) {
6395			rcu_read_lock();
6396			__netif_receive_skb(skb);
6397			rcu_read_unlock();
6398			input_queue_head_incr(sd);
6399			if (++work >= quota)
6400				return work;
6401
6402		}
6403
6404		local_irq_disable();
6405		rps_lock(sd);
6406		if (skb_queue_empty(&sd->input_pkt_queue)) {
6407			/*
6408			 * Inline a custom version of __napi_complete().
6409			 * Only the current CPU owns and manipulates this NAPI,
6410			 * and NAPI_STATE_SCHED is the only possible flag set
6411			 * on the backlog.
6412			 * We can use a plain write instead of clear_bit(),
6413			 * and we don't need an smp_mb() memory barrier.
6414			 */
6415			napi->state = 0;
6416			again = false;
6417		} else {
6418			skb_queue_splice_tail_init(&sd->input_pkt_queue,
6419						   &sd->process_queue);
6420		}
6421		rps_unlock(sd);
6422		local_irq_enable();
6423	}
6424
6425	return work;
6426}
6427
6428/**
6429 * __napi_schedule - schedule for receive
6430 * @n: entry to schedule
6431 *
6432 * The entry's receive function will be scheduled to run.
6433 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6434 */
6435void __napi_schedule(struct napi_struct *n)
6436{
6437	unsigned long flags;
6438
6439	local_irq_save(flags);
6440	____napi_schedule(this_cpu_ptr(&softnet_data), n);
6441	local_irq_restore(flags);
6442}
6443EXPORT_SYMBOL(__napi_schedule);
6444
6445/**
6446 *	napi_schedule_prep - check if napi can be scheduled
6447 *	@n: napi context
6448 *
6449 * Test if NAPI routine is already running, and if not mark
6450 * it as running.  This is used as a condition variable to
6451 * ensure only one NAPI poll instance runs.  We also make
6452 * sure there is no pending NAPI disable.
6453 */
6454bool napi_schedule_prep(struct napi_struct *n)
6455{
6456	unsigned long val, new;
6457
6458	do {
6459		val = READ_ONCE(n->state);
6460		if (unlikely(val & NAPIF_STATE_DISABLE))
6461			return false;
6462		new = val | NAPIF_STATE_SCHED;
6463
6464		/* Sets the STATE_MISSED bit if STATE_SCHED was already set.
6465		 * This was suggested by Alexander Duyck, as the compiler
6466		 * emits better code than:
6467		 * if (val & NAPIF_STATE_SCHED)
6468		 *     new |= NAPIF_STATE_MISSED;
6469		 */
6470		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6471						   NAPIF_STATE_MISSED;
6472	} while (cmpxchg(&n->state, val, new) != val);
6473
6474	return !(val & NAPIF_STATE_SCHED);
6475}
6476EXPORT_SYMBOL(napi_schedule_prep);
6477
6478/**
6479 * __napi_schedule_irqoff - schedule for receive
6480 * @n: entry to schedule
6481 *
6482 * Variant of __napi_schedule() assuming hard irqs are masked.
6483 *
6484 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6485 * because the interrupt disabled assumption might not be true
6486 * due to force-threaded interrupts and spinlock substitution.
6487 */
6488void __napi_schedule_irqoff(struct napi_struct *n)
6489{
6490	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6491		____napi_schedule(this_cpu_ptr(&softnet_data), n);
6492	else
6493		__napi_schedule(n);
6494}
6495EXPORT_SYMBOL(__napi_schedule_irqoff);
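
/*
 * Example (illustrative sketch): the canonical hard-irq handler that pairs
 * with the helpers above.  struct mydrv_priv and mydrv_mask_rx_irq() are
 * hypothetical stand-ins for device specifics.
 */
static irqreturn_t mydrv_interrupt(int irq, void *data)
{
	struct mydrv_priv *priv = data;

	if (napi_schedule_prep(&priv->napi)) {
		/* Quiesce the device interrupt until the poll loop is done,
		 * then let softirq context do the actual RX work.
		 */
		mydrv_mask_rx_irq(priv);
		__napi_schedule_irqoff(&priv->napi);
	}

	return IRQ_HANDLED;
}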
6496
6497bool napi_complete_done(struct napi_struct *n, int work_done)
6498{
6499	unsigned long flags, val, new, timeout = 0;
6500	bool ret = true;
6501
6502	/*
6503	 * 1) Don't let napi dequeue from the cpu poll list
6504	 *    just in case it's running on a different CPU.
6505	 * 2) If we are busy polling, do nothing here, we have
6506	 *    the guarantee we will be called later.
6507	 */
6508	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6509				 NAPIF_STATE_IN_BUSY_POLL)))
6510		return false;
6511
6512	if (work_done) {
6513		if (n->gro_bitmask)
6514			timeout = READ_ONCE(n->dev->gro_flush_timeout);
6515		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6516	}
6517	if (n->defer_hard_irqs_count > 0) {
6518		n->defer_hard_irqs_count--;
6519		timeout = READ_ONCE(n->dev->gro_flush_timeout);
6520		if (timeout)
6521			ret = false;
6522	}
6523	if (n->gro_bitmask) {
6524		/* When the NAPI instance uses a timeout and keeps postponing
6525		 * it, we need to somehow bound the time packets are kept in
6526		 * the GRO layer.
6527		 */
6528		napi_gro_flush(n, !!timeout);
6529	}
6530
6531	gro_normal_list(n);
6532
6533	if (unlikely(!list_empty(&n->poll_list))) {
6534		/* If n->poll_list is not empty, we need to mask irqs */
6535		local_irq_save(flags);
6536		list_del_init(&n->poll_list);
6537		local_irq_restore(flags);
6538	}
6539
6540	do {
6541		val = READ_ONCE(n->state);
6542
6543		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6544
6545		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6546
6547		/* If STATE_MISSED was set, leave STATE_SCHED set,
6548		 * because we will call napi->poll() one more time.
6549		 * This C code was suggested by Alexander Duyck to help gcc.
6550		 */
6551		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6552						    NAPIF_STATE_SCHED;
6553	} while (cmpxchg(&n->state, val, new) != val);
6554
6555	if (unlikely(val & NAPIF_STATE_MISSED)) {
6556		__napi_schedule(n);
6557		return false;
6558	}
6559
6560	if (timeout)
6561		hrtimer_start(&n->timer, ns_to_ktime(timeout),
6562			      HRTIMER_MODE_REL_PINNED);
6563	return ret;
6564}
6565EXPORT_SYMBOL(napi_complete_done);
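
/*
 * Example (illustrative sketch): how a driver poll callback typically uses
 * napi_complete_done().  mydrv_clean_rx() and mydrv_unmask_rx_irq() are
 * hypothetical; the budget handling is the part being demonstrated.
 */
static int mydrv_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
	int work_done;

	work_done = mydrv_clean_rx(priv, budget);	/* <= budget frames */

	/* Only re-arm the device interrupt if the budget was not exhausted
	 * and the core agrees the NAPI can be retired; it may keep the
	 * instance scheduled because of gro_flush_timeout or deferred irqs.
	 */
	if (work_done < budget && napi_complete_done(napi, work_done))
		mydrv_unmask_rx_irq(priv);

	return work_done;
}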
6566
6567/* must be called under rcu_read_lock(), as we don't take a reference */
6568static struct napi_struct *napi_by_id(unsigned int napi_id)
6569{
6570	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6571	struct napi_struct *napi;
6572
6573	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6574		if (napi->napi_id == napi_id)
6575			return napi;
6576
6577	return NULL;
6578}
6579
6580#if defined(CONFIG_NET_RX_BUSY_POLL)
6581
6582#define BUSY_POLL_BUDGET 8
6583
6584static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6585{
6586	int rc;
6587
6588	/* Busy polling means there is a high chance device driver hard irq
6589	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6590	 * set in napi_schedule_prep().
6591	 * Since we are about to call napi->poll() once more, we can safely
6592	 * clear NAPI_STATE_MISSED.
6593	 *
6594	 * Note: x86 could use a single "lock and ..." instruction
6595	 * to perform these two clear_bit() calls.
6596	 */
6597	clear_bit(NAPI_STATE_MISSED, &napi->state);
6598	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6599
6600	local_bh_disable();
6601
6602	/* All we really want here is to re-enable device interrupts.
6603	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6604	 */
6605	rc = napi->poll(napi, BUSY_POLL_BUDGET);
6606	/* We can't gro_normal_list() here, because napi->poll() might have
6607	 * rearmed the napi (napi_complete_done()) in which case it could
6608	 * already be running on another CPU.
6609	 */
6610	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6611	netpoll_poll_unlock(have_poll_lock);
6612	if (rc == BUSY_POLL_BUDGET) {
6613		/* As the whole budget was spent, we still own the napi, so we can
6614		 * safely handle the rx_list.
6615		 */
6616		gro_normal_list(napi);
6617		__napi_schedule(napi);
6618	}
6619	local_bh_enable();
6620}
6621
6622void napi_busy_loop(unsigned int napi_id,
6623		    bool (*loop_end)(void *, unsigned long),
6624		    void *loop_end_arg)
6625{
6626	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6627	int (*napi_poll)(struct napi_struct *napi, int budget);
6628	void *have_poll_lock = NULL;
6629	struct napi_struct *napi;
6630
6631restart:
6632	napi_poll = NULL;
6633
6634	rcu_read_lock();
6635
6636	napi = napi_by_id(napi_id);
6637	if (!napi)
6638		goto out;
6639
6640	preempt_disable();
6641	for (;;) {
6642		int work = 0;
6643
6644		local_bh_disable();
6645		if (!napi_poll) {
6646			unsigned long val = READ_ONCE(napi->state);
6647
6648			/* If multiple threads are competing for this napi,
6649			 * we avoid dirtying napi->state as much as we can.
6650			 */
6651			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6652				   NAPIF_STATE_IN_BUSY_POLL))
6653				goto count;
6654			if (cmpxchg(&napi->state, val,
6655				    val | NAPIF_STATE_IN_BUSY_POLL |
6656					  NAPIF_STATE_SCHED) != val)
6657				goto count;
6658			have_poll_lock = netpoll_poll_lock(napi);
6659			napi_poll = napi->poll;
6660		}
6661		work = napi_poll(napi, BUSY_POLL_BUDGET);
6662		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6663		gro_normal_list(napi);
6664count:
6665		if (work > 0)
6666			__NET_ADD_STATS(dev_net(napi->dev),
6667					LINUX_MIB_BUSYPOLLRXPACKETS, work);
6668		local_bh_enable();
6669
6670		if (!loop_end || loop_end(loop_end_arg, start_time))
6671			break;
6672
6673		if (unlikely(need_resched())) {
6674			if (napi_poll)
6675				busy_poll_stop(napi, have_poll_lock);
6676			preempt_enable();
6677			rcu_read_unlock();
6678			cond_resched();
6679			if (loop_end(loop_end_arg, start_time))
6680				return;
6681			goto restart;
6682		}
6683		cpu_relax();
6684	}
6685	if (napi_poll)
6686		busy_poll_stop(napi, have_poll_lock);
6687	preempt_enable();
6688out:
6689	rcu_read_unlock();
6690}
6691EXPORT_SYMBOL(napi_busy_loop);
6692
6693#endif /* CONFIG_NET_RX_BUSY_POLL */
6694
6695static void napi_hash_add(struct napi_struct *napi)
6696{
6697	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6698		return;
6699
6700	spin_lock(&napi_hash_lock);
6701
6702	/* 0..NR_CPUS range is reserved for sender_cpu use */
6703	do {
6704		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6705			napi_gen_id = MIN_NAPI_ID;
6706	} while (napi_by_id(napi_gen_id));
6707	napi->napi_id = napi_gen_id;
6708
6709	hlist_add_head_rcu(&napi->napi_hash_node,
6710			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6711
6712	spin_unlock(&napi_hash_lock);
6713}
6714
6715/* Warning: the caller is responsible for making sure an RCU grace period
6716 * has elapsed before freeing the memory containing @napi.
6717 */
6718static void napi_hash_del(struct napi_struct *napi)
6719{
6720	spin_lock(&napi_hash_lock);
6721
6722	hlist_del_init_rcu(&napi->napi_hash_node);
6723
6724	spin_unlock(&napi_hash_lock);
6725}
6726
6727static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6728{
6729	struct napi_struct *napi;
6730
6731	napi = container_of(timer, struct napi_struct, timer);
6732
6733	/* Note: we use a relaxed variant of napi_schedule_prep() not setting
6734	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6735	 */
6736	if (!napi_disable_pending(napi) &&
6737	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6738		__napi_schedule_irqoff(napi);
6739
6740	return HRTIMER_NORESTART;
6741}
6742
6743static void init_gro_hash(struct napi_struct *napi)
6744{
6745	int i;
6746
6747	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6748		INIT_LIST_HEAD(&napi->gro_hash[i].list);
6749		napi->gro_hash[i].count = 0;
6750	}
6751	napi->gro_bitmask = 0;
6752}
6753
6754void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6755		    int (*poll)(struct napi_struct *, int), int weight)
6756{
6757	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6758		return;
6759
6760	INIT_LIST_HEAD(&napi->poll_list);
6761	INIT_HLIST_NODE(&napi->napi_hash_node);
6762	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6763	napi->timer.function = napi_watchdog;
6764	init_gro_hash(napi);
6765	napi->skb = NULL;
6766	INIT_LIST_HEAD(&napi->rx_list);
6767	napi->rx_count = 0;
6768	napi->poll = poll;
6769	if (weight > NAPI_POLL_WEIGHT)
6770		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6771				weight);
6772	napi->weight = weight;
6773	napi->dev = dev;
6774#ifdef CONFIG_NETPOLL
6775	napi->poll_owner = -1;
6776#endif
6777	set_bit(NAPI_STATE_SCHED, &napi->state);
6778	set_bit(NAPI_STATE_NPSVC, &napi->state);
6779	list_add_rcu(&napi->dev_list, &dev->napi_list);
6780	napi_hash_add(napi);
6781}
6782EXPORT_SYMBOL(netif_napi_add);
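
/*
 * Example (illustrative sketch): registering the NAPI context, typically
 * from the driver's probe path.  struct mydrv_priv and mydrv_poll() are
 * hypothetical; NAPI_POLL_WEIGHT is the conventional weight of 64.
 */
static void mydrv_init_napi(struct net_device *netdev)
{
	struct mydrv_priv *priv = netdev_priv(netdev);

	netif_napi_add(netdev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
	/* The instance starts with SCHED and NPSVC set (see above); it only
	 * becomes usable once napi_enable() clears them, usually in ndo_open.
	 */
}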
6783
6784void napi_disable(struct napi_struct *n)
6785{
6786	might_sleep();
6787	set_bit(NAPI_STATE_DISABLE, &n->state);
6788
6789	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6790		msleep(1);
6791	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6792		msleep(1);
6793
6794	hrtimer_cancel(&n->timer);
6795
6796	clear_bit(NAPI_STATE_DISABLE, &n->state);
6797}
6798EXPORT_SYMBOL(napi_disable);
6799
6800static void flush_gro_hash(struct napi_struct *napi)
6801{
6802	int i;
6803
6804	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6805		struct sk_buff *skb, *n;
6806
6807		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6808			kfree_skb(skb);
6809		napi->gro_hash[i].count = 0;
6810	}
6811}
6812
6813/* Must be called in process context */
6814void __netif_napi_del(struct napi_struct *napi)
6815{
6816	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6817		return;
6818
6819	napi_hash_del(napi);
6820	list_del_rcu(&napi->dev_list);
6821	napi_free_frags(napi);
6822
6823	flush_gro_hash(napi);
6824	napi->gro_bitmask = 0;
6825}
6826EXPORT_SYMBOL(__netif_napi_del);
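
/*
 * Example (illustrative sketch): the matching teardown order.  Drivers use
 * the netif_napi_del() wrapper from netdevice.h, which calls
 * __netif_napi_del() and then waits for an RCU grace period; struct
 * mydrv_priv is hypothetical.
 */
static void mydrv_teardown_napi(struct mydrv_priv *priv)
{
	napi_disable(&priv->napi);	/* wait out any in-flight poll */
	netif_napi_del(&priv->napi);	/* unhash, unlink and flush GRO */
}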
6827
6828static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6829{
6830	void *have;
6831	int work, weight;
6832
6833	list_del_init(&n->poll_list);
6834
6835	have = netpoll_poll_lock(n);
6836
6837	weight = n->weight;
6838
6839	/* This NAPI_STATE_SCHED test is for avoiding a race
6840	 * with netpoll's poll_napi().  Only the entity which
6841	 * obtains the lock and sees NAPI_STATE_SCHED set will
6842	 * actually make the ->poll() call.  Therefore we avoid
6843	 * accidentally calling ->poll() when NAPI is not scheduled.
6844	 */
6845	work = 0;
6846	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6847		work = n->poll(n, weight);
6848		trace_napi_poll(n, work, weight);
6849	}
6850
6851	if (unlikely(work > weight))
6852		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6853			    n->poll, work, weight);
6854
6855	if (likely(work < weight))
6856		goto out_unlock;
6857
6858	/* Drivers must not modify the NAPI state if they
6859	 * consume the entire weight.  In such cases this code
6860	 * still "owns" the NAPI instance and therefore can
6861	 * move the instance around on the list at will.
6862	 */
6863	if (unlikely(napi_disable_pending(n))) {
6864		napi_complete(n);
6865		goto out_unlock;
6866	}
6867
6868	if (n->gro_bitmask) {
6869		/* Flush packets that are too old.
6870		 * If HZ < 1000, flush all packets.
6871		 */
6872		napi_gro_flush(n, HZ >= 1000);
6873	}
6874
6875	gro_normal_list(n);
6876
6877	/* Some drivers may have called napi_schedule
6878	 * prior to exhausting their budget.
6879	 */
6880	if (unlikely(!list_empty(&n->poll_list))) {
6881		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6882			     n->dev ? n->dev->name : "backlog");
6883		goto out_unlock;
6884	}
6885
6886	list_add_tail(&n->poll_list, repoll);
6887
6888out_unlock:
6889	netpoll_poll_unlock(have);
6890
6891	return work;
6892}
6893
6894static __latent_entropy void net_rx_action(struct softirq_action *h)
6895{
6896	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6897	unsigned long time_limit = jiffies +
6898		usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
6899	int budget = READ_ONCE(netdev_budget);
6900	LIST_HEAD(list);
6901	LIST_HEAD(repoll);
6902
6903	local_irq_disable();
6904	list_splice_init(&sd->poll_list, &list);
6905	local_irq_enable();
6906
6907	for (;;) {
6908		struct napi_struct *n;
6909
6910		if (list_empty(&list)) {
6911			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6912				goto out;
6913			break;
6914		}
6915
6916		n = list_first_entry(&list, struct napi_struct, poll_list);
6917		budget -= napi_poll(n, &repoll);
6918
6919		/* If the softirq window is exhausted then punt.
6920		 * Allow this to run for up to 2 jiffies, which allows
6921		 * an average latency of 1.5/HZ.
6922		 */
6923		if (unlikely(budget <= 0 ||
6924			     time_after_eq(jiffies, time_limit))) {
6925			sd->time_squeeze++;
6926			break;
6927		}
6928	}
6929
6930	local_irq_disable();
6931
6932	list_splice_tail_init(&sd->poll_list, &list);
6933	list_splice_tail(&repoll, &list);
6934	list_splice(&list, &sd->poll_list);
6935	if (!list_empty(&sd->poll_list))
6936		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
6937
6938	net_rps_action_and_irq_enable(sd);
6939out:
6940	__kfree_skb_flush();
6941}
6942
6943struct netdev_adjacent {
6944	struct net_device *dev;
6945
6946	/* upper master flag; there can be only one master device per list */
6947	bool master;
6948
6949	/* lookup ignore flag */
6950	bool ignore;
6951
6952	/* counter for the number of times this device was added to us */
6953	u16 ref_nr;
6954
6955	/* private field for the users */
6956	void *private;
6957
6958	struct list_head list;
6959	struct rcu_head rcu;
6960};
6961
6962static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6963						 struct list_head *adj_list)
6964{
6965	struct netdev_adjacent *adj;
6966
6967	list_for_each_entry(adj, adj_list, list) {
6968		if (adj->dev == adj_dev)
6969			return adj;
6970	}
6971	return NULL;
6972}
6973
6974static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6975				    struct netdev_nested_priv *priv)
6976{
6977	struct net_device *dev = (struct net_device *)priv->data;
6978
6979	return upper_dev == dev;
6980}
6981
6982/**
6983 * netdev_has_upper_dev - Check if device is linked to an upper device
6984 * @dev: device
6985 * @upper_dev: upper device to check
6986 *
6987 * Find out if a device is linked to the specified upper device and return
6988 * true if it is. Note that this checks only the immediate upper device,
6989 * not the complete stack of devices. The caller must hold the RTNL lock.
6990 */
6991bool netdev_has_upper_dev(struct net_device *dev,
6992			  struct net_device *upper_dev)
6993{
6994	struct netdev_nested_priv priv = {
6995		.data = (void *)upper_dev,
6996	};
6997
6998	ASSERT_RTNL();
6999
7000	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7001					     &priv);
7002}
7003EXPORT_SYMBOL(netdev_has_upper_dev);
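
/*
 * Example (illustrative sketch): a typical RTNL-protected check, e.g. from
 * a NETDEV_CHANGEUPPER notifier, asking whether @dev is already stacked
 * directly under @upper.  mydrv_is_directly_under() is hypothetical.
 */
static bool mydrv_is_directly_under(struct net_device *dev,
				    struct net_device *upper)
{
	ASSERT_RTNL();		/* netdev_has_upper_dev() requires RTNL */

	return netdev_has_upper_dev(dev, upper);
}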
7004
7005/**
7006 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
7007 * @dev: device
7008 * @upper_dev: upper device to check
7009 *
7010 * Find out if a device is linked to the specified upper device and return
7011 * true if it is. Note that this checks the entire upper device chain.
7012 * The caller must hold the RCU read lock.
7013 */
7014
7015bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
7016				  struct net_device *upper_dev)
7017{
7018	struct netdev_nested_priv priv = {
7019		.data = (void *)upper_dev,
7020	};
7021
7022	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7023					       &priv);
7024}
7025EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
7026
7027/**
7028 * netdev_has_any_upper_dev - Check if device is linked to some device
7029 * @dev: device
7030 *
7031 * Find out if a device is linked to an upper device and return true
7032 * if it is. The caller must hold the RTNL lock.
7033 */
7034bool netdev_has_any_upper_dev(struct net_device *dev)
7035{
7036	ASSERT_RTNL();
7037
7038	return !list_empty(&dev->adj_list.upper);
7039}
7040EXPORT_SYMBOL(netdev_has_any_upper_dev);
7041
7042/**
7043 * netdev_master_upper_dev_get - Get master upper device
7044 * @dev: device
7045 *
7046 * Find a master upper device and return a pointer to it, or NULL if there
7047 * is none. The caller must hold the RTNL lock.
7048 */
7049struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
7050{
7051	struct netdev_adjacent *upper;
7052
7053	ASSERT_RTNL();
7054
7055	if (list_empty(&dev->adj_list.upper))
7056		return NULL;
7057
7058	upper = list_first_entry(&dev->adj_list.upper,
7059				 struct netdev_adjacent, list);
7060	if (likely(upper->master))
7061		return upper->dev;
7062	return NULL;
7063}
7064EXPORT_SYMBOL(netdev_master_upper_dev_get);
7065
7066static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
7067{
7068	struct netdev_adjacent *upper;
7069
7070	ASSERT_RTNL();
7071
7072	if (list_empty(&dev->adj_list.upper))
7073		return NULL;
7074
7075	upper = list_first_entry(&dev->adj_list.upper,
7076				 struct netdev_adjacent, list);
7077	if (likely(upper->master) && !upper->ignore)
7078		return upper->dev;
7079	return NULL;
7080}
7081
7082/**
7083 * netdev_has_any_lower_dev - Check if device is linked to some device
7084 * @dev: device
7085 *
7086 * Find out if a device is linked to a lower device and return true
7087 * if it is. The caller must hold the RTNL lock.
7088 */
7089static bool netdev_has_any_lower_dev(struct net_device *dev)
7090{
7091	ASSERT_RTNL();
7092
7093	return !list_empty(&dev->adj_list.lower);
7094}
7095
7096void *netdev_adjacent_get_private(struct list_head *adj_list)
7097{
7098	struct netdev_adjacent *adj;
7099
7100	adj = list_entry(adj_list, struct netdev_adjacent, list);
7101
7102	return adj->private;
7103}
7104EXPORT_SYMBOL(netdev_adjacent_get_private);
7105
7106/**
7107 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
7108 * @dev: device
7109 * @iter: list_head ** of the current position
7110 *
7111 * Gets the next device from the dev's upper list, starting from the iter
7112 * position. The caller must hold the RCU read lock.
7113 */
7114struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7115						 struct list_head **iter)
7116{
7117	struct netdev_adjacent *upper;
7118
7119	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7120
7121	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7122
7123	if (&upper->list == &dev->adj_list.upper)
7124		return NULL;
7125
7126	*iter = &upper->list;
7127
7128	return upper->dev;
7129}
7130EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
7131
7132static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7133						  struct list_head **iter,
7134						  bool *ignore)
7135{
7136	struct netdev_adjacent *upper;
7137
7138	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7139
7140	if (&upper->list == &dev->adj_list.upper)
7141		return NULL;
7142
7143	*iter = &upper->list;
7144	*ignore = upper->ignore;
7145
7146	return upper->dev;
7147}
7148
7149static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7150						    struct list_head **iter)
7151{
7152	struct netdev_adjacent *upper;
7153
7154	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7155
7156	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7157
7158	if (&upper->list == &dev->adj_list.upper)
7159		return NULL;
7160
7161	*iter = &upper->list;
7162
7163	return upper->dev;
7164}
7165
7166static int __netdev_walk_all_upper_dev(struct net_device *dev,
7167				       int (*fn)(struct net_device *dev,
7168					 struct netdev_nested_priv *priv),
7169				       struct netdev_nested_priv *priv)
7170{
7171	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7172	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7173	int ret, cur = 0;
7174	bool ignore;
7175
7176	now = dev;
7177	iter = &dev->adj_list.upper;
7178
7179	while (1) {
7180		if (now != dev) {
7181			ret = fn(now, priv);
7182			if (ret)
7183				return ret;
7184		}
7185
7186		next = NULL;
7187		while (1) {
7188			udev = __netdev_next_upper_dev(now, &iter, &ignore);
7189			if (!udev)
7190				break;
7191			if (ignore)
7192				continue;
7193
7194			next = udev;
7195			niter = &udev->adj_list.upper;
7196			dev_stack[cur] = now;
7197			iter_stack[cur++] = iter;
7198			break;
7199		}
7200
7201		if (!next) {
7202			if (!cur)
7203				return 0;
7204			next = dev_stack[--cur];
7205			niter = iter_stack[cur];
7206		}
7207
7208		now = next;
7209		iter = niter;
7210	}
7211
7212	return 0;
7213}
7214
7215int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7216				  int (*fn)(struct net_device *dev,
7217					    struct netdev_nested_priv *priv),
7218				  struct netdev_nested_priv *priv)
7219{
7220	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7221	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7222	int ret, cur = 0;
7223
7224	now = dev;
7225	iter = &dev->adj_list.upper;
7226
7227	while (1) {
7228		if (now != dev) {
7229			ret = fn(now, priv);
7230			if (ret)
7231				return ret;
7232		}
7233
7234		next = NULL;
7235		while (1) {
7236			udev = netdev_next_upper_dev_rcu(now, &iter);
7237			if (!udev)
7238				break;
7239
7240			next = udev;
7241			niter = &udev->adj_list.upper;
7242			dev_stack[cur] = now;
7243			iter_stack[cur++] = iter;
7244			break;
7245		}
7246
7247		if (!next) {
7248			if (!cur)
7249				return 0;
7250			next = dev_stack[--cur];
7251			niter = iter_stack[cur];
7252		}
7253
7254		now = next;
7255		iter = niter;
7256	}
7257
7258	return 0;
7259}
7260EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
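
/*
 * Example (illustrative sketch): the netdev_nested_priv pattern used with
 * the walker above, here counting every upper device reachable from @dev.
 * The mydrv_* names are hypothetical; the caller is assumed to be inside
 * an RCU read-side critical section.
 */
static int mydrv_count_one_upper(struct net_device *upper,
				 struct netdev_nested_priv *priv)
{
	unsigned int *count = (unsigned int *)priv->data;

	(*count)++;
	return 0;		/* a non-zero return would stop the walk */
}

static unsigned int mydrv_count_uppers(struct net_device *dev)
{
	unsigned int count = 0;
	struct netdev_nested_priv priv = {
		.data = (void *)&count,
	};

	netdev_walk_all_upper_dev_rcu(dev, mydrv_count_one_upper, &priv);
	return count;
}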
7261
7262static bool __netdev_has_upper_dev(struct net_device *dev,
7263				   struct net_device *upper_dev)
7264{
7265	struct netdev_nested_priv priv = {
7266		.flags = 0,
7267		.data = (void *)upper_dev,
7268	};
7269
7270	ASSERT_RTNL();
7271
7272	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7273					   &priv);
7274}
7275
7276/**
7277 * netdev_lower_get_next_private - Get the next ->private from the
7278 *				   lower neighbour list
7279 * @dev: device
7280 * @iter: list_head ** of the current position
7281 *
7282 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7283 * list, starting from the iter position. The caller must hold either the
7284 * RTNL lock or its own locking that guarantees that the neighbour lower
7285 * list will remain unchanged.
7286 */
7287void *netdev_lower_get_next_private(struct net_device *dev,
7288				    struct list_head **iter)
7289{
7290	struct netdev_adjacent *lower;
7291
7292	lower = list_entry(*iter, struct netdev_adjacent, list);
7293
7294	if (&lower->list == &dev->adj_list.lower)
7295		return NULL;
7296
7297	*iter = lower->list.next;
7298
7299	return lower->private;
7300}
7301EXPORT_SYMBOL(netdev_lower_get_next_private);
7302
7303/**
7304 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7305 *				       lower neighbour list, RCU
7306 *				       variant
7307 * @dev: device
7308 * @iter: list_head ** of the current position
7309 *
7310 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7311 * list, starting from the iter position. The caller must hold the RCU read lock.
7312 */
7313void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7314					struct list_head **iter)
7315{
7316	struct netdev_adjacent *lower;
7317
7318	WARN_ON_ONCE(!rcu_read_lock_held());
7319
7320	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7321
7322	if (&lower->list == &dev->adj_list.lower)
7323		return NULL;
7324
7325	*iter = &lower->list;
7326
7327	return lower->private;
7328}
7329EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7330
7331/**
7332 * netdev_lower_get_next - Get the next device from the lower neighbour
7333 *                         list
7334 * @dev: device
7335 * @iter: list_head ** of the current position
7336 *
7337 * Gets the next netdev_adjacent from the dev's lower neighbour
7338 * list, starting from the iter position. The caller must hold the RTNL lock or
7339 * its own locking that guarantees that the neighbour lower
7340 * list will remain unchanged.
7341 */
7342void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7343{
7344	struct netdev_adjacent *lower;
7345
7346	lower = list_entry(*iter, struct netdev_adjacent, list);
7347
7348	if (&lower->list == &dev->adj_list.lower)
7349		return NULL;
7350
7351	*iter = lower->list.next;
7352
7353	return lower->dev;
7354}
7355EXPORT_SYMBOL(netdev_lower_get_next);
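
/*
 * Example (illustrative sketch): open-coded iteration with
 * netdev_lower_get_next().  In-tree users normally go through the
 * netdev_for_each_lower_dev() helper macro, which expands to roughly this
 * loop; mydrv_walk_lowers() is hypothetical.
 */
static void mydrv_walk_lowers(struct net_device *dev)
{
	struct net_device *ldev;
	struct list_head *iter;

	ASSERT_RTNL();		/* or equivalent protection of the adj lists */

	for (iter = dev->adj_list.lower.next,
	     ldev = netdev_lower_get_next(dev, &iter);
	     ldev;
	     ldev = netdev_lower_get_next(dev, &iter))
		netdev_dbg(dev, "lower device: %s\n", ldev->name);
}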
7356
7357static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7358						struct list_head **iter)
7359{
7360	struct netdev_adjacent *lower;
7361
7362	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7363
7364	if (&lower->list == &dev->adj_list.lower)
7365		return NULL;
7366
7367	*iter = &lower->list;
7368
7369	return lower->dev;
7370}
7371
7372static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7373						  struct list_head **iter,
7374						  bool *ignore)
7375{
7376	struct netdev_adjacent *lower;
7377
7378	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7379
7380	if (&lower->list == &dev->adj_list.lower)
7381		return NULL;
7382
7383	*iter = &lower->list;
7384	*ignore = lower->ignore;
7385
7386	return lower->dev;
7387}
7388
7389int netdev_walk_all_lower_dev(struct net_device *dev,
7390			      int (*fn)(struct net_device *dev,
7391					struct netdev_nested_priv *priv),
7392			      struct netdev_nested_priv *priv)
7393{
7394	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7395	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7396	int ret, cur = 0;
7397
7398	now = dev;
7399	iter = &dev->adj_list.lower;
7400
7401	while (1) {
7402		if (now != dev) {
7403			ret = fn(now, priv);
7404			if (ret)
7405				return ret;
7406		}
7407
7408		next = NULL;
7409		while (1) {
7410			ldev = netdev_next_lower_dev(now, &iter);
7411			if (!ldev)
7412				break;
7413
7414			next = ldev;
7415			niter = &ldev->adj_list.lower;
7416			dev_stack[cur] = now;
7417			iter_stack[cur++] = iter;
7418			break;
7419		}
7420
7421		if (!next) {
7422			if (!cur)
7423				return 0;
7424			next = dev_stack[--cur];
7425			niter = iter_stack[cur];
7426		}
7427
7428		now = next;
7429		iter = niter;
7430	}
7431
7432	return 0;
7433}
7434EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7435
7436static int __netdev_walk_all_lower_dev(struct net_device *dev,
7437				       int (*fn)(struct net_device *dev,
7438					 struct netdev_nested_priv *priv),
7439				       struct netdev_nested_priv *priv)
7440{
7441	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7442	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7443	int ret, cur = 0;
7444	bool ignore;
7445
7446	now = dev;
7447	iter = &dev->adj_list.lower;
7448
7449	while (1) {
7450		if (now != dev) {
7451			ret = fn(now, priv);
7452			if (ret)
7453				return ret;
7454		}
7455
7456		next = NULL;
7457		while (1) {
7458			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7459			if (!ldev)
7460				break;
7461			if (ignore)
7462				continue;
7463
7464			next = ldev;
7465			niter = &ldev->adj_list.lower;
7466			dev_stack[cur] = now;
7467			iter_stack[cur++] = iter;
7468			break;
7469		}
7470
7471		if (!next) {
7472			if (!cur)
7473				return 0;
7474			next = dev_stack[--cur];
7475			niter = iter_stack[cur];
7476		}
7477
7478		now = next;
7479		iter = niter;
7480	}
7481
7482	return 0;
7483}
7484
7485struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7486					     struct list_head **iter)
7487{
7488	struct netdev_adjacent *lower;
7489
7490	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7491	if (&lower->list == &dev->adj_list.lower)
7492		return NULL;
7493
7494	*iter = &lower->list;
7495
7496	return lower->dev;
7497}
7498EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7499
7500static u8 __netdev_upper_depth(struct net_device *dev)
7501{
7502	struct net_device *udev;
7503	struct list_head *iter;
7504	u8 max_depth = 0;
7505	bool ignore;
7506
7507	for (iter = &dev->adj_list.upper,
7508	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7509	     udev;
7510	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7511		if (ignore)
7512			continue;
7513		if (max_depth < udev->upper_level)
7514			max_depth = udev->upper_level;
7515	}
7516
7517	return max_depth;
7518}
7519
7520static u8 __netdev_lower_depth(struct net_device *dev)
7521{
7522	struct net_device *ldev;
7523	struct list_head *iter;
7524	u8 max_depth = 0;
7525	bool ignore;
7526
7527	for (iter = &dev->adj_list.lower,
7528	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7529	     ldev;
7530	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7531		if (ignore)
7532			continue;
7533		if (max_depth < ldev->lower_level)
7534			max_depth = ldev->lower_level;
7535	}
7536
7537	return max_depth;
7538}
7539
7540static int __netdev_update_upper_level(struct net_device *dev,
7541				       struct netdev_nested_priv *__unused)
7542{
7543	dev->upper_level = __netdev_upper_depth(dev) + 1;
7544	return 0;
7545}
7546
7547static int __netdev_update_lower_level(struct net_device *dev,
7548				       struct netdev_nested_priv *priv)
7549{
7550	dev->lower_level = __netdev_lower_depth(dev) + 1;
7551
7552#ifdef CONFIG_LOCKDEP
7553	if (!priv)
7554		return 0;
7555
7556	if (priv->flags & NESTED_SYNC_IMM)
7557		dev->nested_level = dev->lower_level - 1;
7558	if (priv->flags & NESTED_SYNC_TODO)
7559		net_unlink_todo(dev);
7560#endif
7561	return 0;
7562}
7563
7564int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7565				  int (*fn)(struct net_device *dev,
7566					    struct netdev_nested_priv *priv),
7567				  struct netdev_nested_priv *priv)
7568{
7569	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7570	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7571	int ret, cur = 0;
7572
7573	now = dev;
7574	iter = &dev->adj_list.lower;
7575
7576	while (1) {
7577		if (now != dev) {
7578			ret = fn(now, priv);
7579			if (ret)
7580				return ret;
7581		}
7582
7583		next = NULL;
7584		while (1) {
7585			ldev = netdev_next_lower_dev_rcu(now, &iter);
7586			if (!ldev)
7587				break;
7588
7589			next = ldev;
7590			niter = &ldev->adj_list.lower;
7591			dev_stack[cur] = now;
7592			iter_stack[cur++] = iter;
7593			break;
7594		}
7595
7596		if (!next) {
7597			if (!cur)
7598				return 0;
7599			next = dev_stack[--cur];
7600			niter = iter_stack[cur];
7601		}
7602
7603		now = next;
7604		iter = niter;
7605	}
7606
7607	return 0;
7608}
7609EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7610
7611/**
7612 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7613 *				       lower neighbour list, RCU
7614 *				       variant
7615 * @dev: device
7616 *
7617 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7618 * list. The caller must hold the RCU read lock.
7619 */
7620void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7621{
7622	struct netdev_adjacent *lower;
7623
7624	lower = list_first_or_null_rcu(&dev->adj_list.lower,
7625			struct netdev_adjacent, list);
7626	if (lower)
7627		return lower->private;
7628	return NULL;
7629}
7630EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7631
7632/**
7633 * netdev_master_upper_dev_get_rcu - Get master upper device
7634 * @dev: device
7635 *
7636 * Find a master upper device and return a pointer to it, or NULL if there
7637 * is none. The caller must hold the RCU read lock.
7638 */
7639struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7640{
7641	struct netdev_adjacent *upper;
7642
7643	upper = list_first_or_null_rcu(&dev->adj_list.upper,
7644				       struct netdev_adjacent, list);
7645	if (upper && likely(upper->master))
7646		return upper->dev;
7647	return NULL;
7648}
7649EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7650
7651static int netdev_adjacent_sysfs_add(struct net_device *dev,
7652			      struct net_device *adj_dev,
7653			      struct list_head *dev_list)
7654{
7655	char linkname[IFNAMSIZ+7];
7656
7657	sprintf(linkname, dev_list == &dev->adj_list.upper ?
7658		"upper_%s" : "lower_%s", adj_dev->name);
7659	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7660				 linkname);
7661}
7662static void netdev_adjacent_sysfs_del(struct net_device *dev,
7663			       char *name,
7664			       struct list_head *dev_list)
7665{
7666	char linkname[IFNAMSIZ+7];
7667
7668	sprintf(linkname, dev_list == &dev->adj_list.upper ?
7669		"upper_%s" : "lower_%s", name);
7670	sysfs_remove_link(&(dev->dev.kobj), linkname);
7671}
7672
7673static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7674						 struct net_device *adj_dev,
7675						 struct list_head *dev_list)
7676{
7677	return (dev_list == &dev->adj_list.upper ||
7678		dev_list == &dev->adj_list.lower) &&
7679		net_eq(dev_net(dev), dev_net(adj_dev));
7680}
7681
7682static int __netdev_adjacent_dev_insert(struct net_device *dev,
7683					struct net_device *adj_dev,
7684					struct list_head *dev_list,
7685					void *private, bool master)
7686{
7687	struct netdev_adjacent *adj;
7688	int ret;
7689
7690	adj = __netdev_find_adj(adj_dev, dev_list);
7691
7692	if (adj) {
7693		adj->ref_nr += 1;
7694		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7695			 dev->name, adj_dev->name, adj->ref_nr);
7696
7697		return 0;
7698	}
7699
7700	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7701	if (!adj)
7702		return -ENOMEM;
7703
7704	adj->dev = adj_dev;
7705	adj->master = master;
7706	adj->ref_nr = 1;
7707	adj->private = private;
7708	adj->ignore = false;
7709	dev_hold(adj_dev);
7710
7711	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7712		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7713
7714	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7715		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7716		if (ret)
7717			goto free_adj;
7718	}
7719
7720	/* Ensure that master link is always the first item in list. */
7721	if (master) {
7722		ret = sysfs_create_link(&(dev->dev.kobj),
7723					&(adj_dev->dev.kobj), "master");
7724		if (ret)
7725			goto remove_symlinks;
7726
7727		list_add_rcu(&adj->list, dev_list);
7728	} else {
7729		list_add_tail_rcu(&adj->list, dev_list);
7730	}
7731
7732	return 0;
7733
7734remove_symlinks:
7735	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7736		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7737free_adj:
7738	kfree(adj);
7739	dev_put(adj_dev);
7740
7741	return ret;
7742}
7743
7744static void __netdev_adjacent_dev_remove(struct net_device *dev,
7745					 struct net_device *adj_dev,
7746					 u16 ref_nr,
7747					 struct list_head *dev_list)
7748{
7749	struct netdev_adjacent *adj;
7750
7751	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7752		 dev->name, adj_dev->name, ref_nr);
7753
7754	adj = __netdev_find_adj(adj_dev, dev_list);
7755
7756	if (!adj) {
7757		pr_err("Adjacency does not exist for device %s from %s\n",
7758		       dev->name, adj_dev->name);
7759		WARN_ON(1);
7760		return;
7761	}
7762
7763	if (adj->ref_nr > ref_nr) {
7764		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7765			 dev->name, adj_dev->name, ref_nr,
7766			 adj->ref_nr - ref_nr);
7767		adj->ref_nr -= ref_nr;
7768		return;
7769	}
7770
7771	if (adj->master)
7772		sysfs_remove_link(&(dev->dev.kobj), "master");
7773
7774	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7775		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7776
7777	list_del_rcu(&adj->list);
7778	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7779		 adj_dev->name, dev->name, adj_dev->name);
7780	dev_put(adj_dev);
7781	kfree_rcu(adj, rcu);
7782}
7783
7784static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7785					    struct net_device *upper_dev,
7786					    struct list_head *up_list,
7787					    struct list_head *down_list,
7788					    void *private, bool master)
7789{
7790	int ret;
7791
7792	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7793					   private, master);
7794	if (ret)
7795		return ret;
7796
7797	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7798					   private, false);
7799	if (ret) {
7800		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7801		return ret;
7802	}
7803
7804	return 0;
7805}
7806
7807static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7808					       struct net_device *upper_dev,
7809					       u16 ref_nr,
7810					       struct list_head *up_list,
7811					       struct list_head *down_list)
7812{
7813	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7814	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7815}
7816
7817static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7818						struct net_device *upper_dev,
7819						void *private, bool master)
7820{
7821	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7822						&dev->adj_list.upper,
7823						&upper_dev->adj_list.lower,
7824						private, master);
7825}
7826
7827static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7828						   struct net_device *upper_dev)
7829{
7830	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7831					   &dev->adj_list.upper,
7832					   &upper_dev->adj_list.lower);
7833}
7834
7835static int __netdev_upper_dev_link(struct net_device *dev,
7836				   struct net_device *upper_dev, bool master,
7837				   void *upper_priv, void *upper_info,
7838				   struct netdev_nested_priv *priv,
7839				   struct netlink_ext_ack *extack)
7840{
7841	struct netdev_notifier_changeupper_info changeupper_info = {
7842		.info = {
7843			.dev = dev,
7844			.extack = extack,
7845		},
7846		.upper_dev = upper_dev,
7847		.master = master,
7848		.linking = true,
7849		.upper_info = upper_info,
7850	};
7851	struct net_device *master_dev;
7852	int ret = 0;
7853
7854	ASSERT_RTNL();
7855
7856	if (dev == upper_dev)
7857		return -EBUSY;
7858
7859	/* To prevent loops, check that dev is not an upper device of upper_dev. */
7860	if (__netdev_has_upper_dev(upper_dev, dev))
7861		return -EBUSY;
7862
7863	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7864		return -EMLINK;
7865
7866	if (!master) {
7867		if (__netdev_has_upper_dev(dev, upper_dev))
7868			return -EEXIST;
7869	} else {
7870		master_dev = __netdev_master_upper_dev_get(dev);
7871		if (master_dev)
7872			return master_dev == upper_dev ? -EEXIST : -EBUSY;
7873	}
7874
7875	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7876					    &changeupper_info.info);
7877	ret = notifier_to_errno(ret);
7878	if (ret)
7879		return ret;
7880
7881	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7882						   master);
7883	if (ret)
7884		return ret;
7885
7886	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7887					    &changeupper_info.info);
7888	ret = notifier_to_errno(ret);
7889	if (ret)
7890		goto rollback;
7891
7892	__netdev_update_upper_level(dev, NULL);
7893	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7894
7895	__netdev_update_lower_level(upper_dev, priv);
7896	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7897				    priv);
7898
7899	return 0;
7900
7901rollback:
7902	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7903
7904	return ret;
7905}
7906
7907/**
7908 * netdev_upper_dev_link - Add a link to the upper device
7909 * @dev: device
7910 * @upper_dev: new upper device
7911 * @extack: netlink extended ack
7912 *
7913 * Adds a link to a device which is upper to this one. The caller must hold
7914 * the RTNL lock. On failure a negative errno code is returned.
7915 * On success the reference counts are adjusted and the function
7916 * returns zero.
7917 */
7918int netdev_upper_dev_link(struct net_device *dev,
7919			  struct net_device *upper_dev,
7920			  struct netlink_ext_ack *extack)
7921{
7922	struct netdev_nested_priv priv = {
7923		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7924		.data = NULL,
7925	};
7926
7927	return __netdev_upper_dev_link(dev, upper_dev, false,
7928				       NULL, NULL, &priv, extack);
7929}
7930EXPORT_SYMBOL(netdev_upper_dev_link);
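
/*
 * Usage sketch (illustrative, not part of the original source; "vlan_dev"
 * and "real_dev" are hypothetical identifiers). A stacking driver links its
 * device above a lower device under the RTNL lock and unlinks it again on
 * teardown:
 *
 *	ASSERT_RTNL();
 *	err = netdev_upper_dev_link(real_dev, vlan_dev, extack);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(real_dev, vlan_dev);
 */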
7931
7932/**
7933 * netdev_master_upper_dev_link - Add a master link to the upper device
7934 * @dev: device
7935 * @upper_dev: new upper device
7936 * @upper_priv: upper device private
7937 * @upper_info: upper info to be passed down via notifier
7938 * @extack: netlink extended ack
7939 *
7940 * Adds a link to a device which is upper to this one. In this case, only
7941 * one master upper device can be linked, although other non-master devices
7942 * might be linked as well. The caller must hold the RTNL lock.
7943 * On a failure a negative errno code is returned. On success the reference
7944 * counts are adjusted and the function returns zero.
7945 */
7946int netdev_master_upper_dev_link(struct net_device *dev,
7947				 struct net_device *upper_dev,
7948				 void *upper_priv, void *upper_info,
7949				 struct netlink_ext_ack *extack)
7950{
7951	struct netdev_nested_priv priv = {
7952		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7953		.data = NULL,
7954	};
7955
7956	return __netdev_upper_dev_link(dev, upper_dev, true,
7957				       upper_priv, upper_info, &priv, extack);
7958}
7959EXPORT_SYMBOL(netdev_master_upper_dev_link);
7960
7961static void __netdev_upper_dev_unlink(struct net_device *dev,
7962				      struct net_device *upper_dev,
7963				      struct netdev_nested_priv *priv)
7964{
7965	struct netdev_notifier_changeupper_info changeupper_info = {
7966		.info = {
7967			.dev = dev,
7968		},
7969		.upper_dev = upper_dev,
7970		.linking = false,
7971	};
7972
7973	ASSERT_RTNL();
7974
7975	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7976
7977	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7978				      &changeupper_info.info);
7979
7980	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7981
7982	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7983				      &changeupper_info.info);
7984
7985	__netdev_update_upper_level(dev, NULL);
7986	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7987
7988	__netdev_update_lower_level(upper_dev, priv);
7989	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7990				    priv);
7991}
7992
7993/**
7994 * netdev_upper_dev_unlink - Removes a link to upper device
7995 * @dev: device
7996 * @upper_dev: upper device to unlink
7997 *
7998 * Removes the link to a device which is upper to this one. The caller must hold
7999 * the RTNL lock.
8000 */
8001void netdev_upper_dev_unlink(struct net_device *dev,
8002			     struct net_device *upper_dev)
8003{
8004	struct netdev_nested_priv priv = {
8005		.flags = NESTED_SYNC_TODO,
8006		.data = NULL,
8007	};
8008
8009	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
8010}
8011EXPORT_SYMBOL(netdev_upper_dev_unlink);
8012
8013static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
8014				      struct net_device *lower_dev,
8015				      bool val)
8016{
8017	struct netdev_adjacent *adj;
8018
8019	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
8020	if (adj)
8021		adj->ignore = val;
8022
8023	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
8024	if (adj)
8025		adj->ignore = val;
8026}
8027
8028static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
8029					struct net_device *lower_dev)
8030{
8031	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
8032}
8033
8034static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
8035				       struct net_device *lower_dev)
8036{
8037	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
8038}
8039
8040int netdev_adjacent_change_prepare(struct net_device *old_dev,
8041				   struct net_device *new_dev,
8042				   struct net_device *dev,
8043				   struct netlink_ext_ack *extack)
8044{
8045	struct netdev_nested_priv priv = {
8046		.flags = 0,
8047		.data = NULL,
8048	};
8049	int err;
8050
8051	if (!new_dev)
8052		return 0;
8053
8054	if (old_dev && new_dev != old_dev)
8055		netdev_adjacent_dev_disable(dev, old_dev);
8056	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
8057				      extack);
8058	if (err) {
8059		if (old_dev && new_dev != old_dev)
8060			netdev_adjacent_dev_enable(dev, old_dev);
8061		return err;
8062	}
8063
8064	return 0;
8065}
8066EXPORT_SYMBOL(netdev_adjacent_change_prepare);
8067
8068void netdev_adjacent_change_commit(struct net_device *old_dev,
8069				   struct net_device *new_dev,
8070				   struct net_device *dev)
8071{
8072	struct netdev_nested_priv priv = {
8073		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8074		.data = NULL,
8075	};
8076
8077	if (!new_dev || !old_dev)
8078		return;
8079
8080	if (new_dev == old_dev)
8081		return;
8082
8083	netdev_adjacent_dev_enable(dev, old_dev);
8084	__netdev_upper_dev_unlink(old_dev, dev, &priv);
8085}
8086EXPORT_SYMBOL(netdev_adjacent_change_commit);
8087
8088void netdev_adjacent_change_abort(struct net_device *old_dev,
8089				  struct net_device *new_dev,
8090				  struct net_device *dev)
8091{
8092	struct netdev_nested_priv priv = {
8093		.flags = 0,
8094		.data = NULL,
8095	};
8096
8097	if (!new_dev)
8098		return;
8099
8100	if (old_dev && new_dev != old_dev)
8101		netdev_adjacent_dev_enable(dev, old_dev);
8102
8103	__netdev_upper_dev_unlink(new_dev, dev, &priv);
8104}
8105EXPORT_SYMBOL(netdev_adjacent_change_abort);
8106
8107/**
8108 * netdev_bonding_info_change - Dispatch event about slave change
8109 * @dev: device
8110 * @bonding_info: info to dispatch
8111 *
8112 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8113 * The caller must hold the RTNL lock.
8114 */
8115void netdev_bonding_info_change(struct net_device *dev,
8116				struct netdev_bonding_info *bonding_info)
8117{
8118	struct netdev_notifier_bonding_info info = {
8119		.info.dev = dev,
8120	};
8121
8122	memcpy(&info.bonding_info, bonding_info,
8123	       sizeof(struct netdev_bonding_info));
8124	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8125				      &info.info);
8126}
8127EXPORT_SYMBOL(netdev_bonding_info_change);
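
/*
 * Usage sketch (illustrative; "slave_dev", "bond_slave_count" and
 * "slave_state" are hypothetical). A bonding-style driver fills
 * struct netdev_bonding_info and dispatches it for a slave under the
 * RTNL lock:
 *
 *	struct netdev_bonding_info info = { };
 *
 *	ASSERT_RTNL();
 *	info.master.num_slaves = bond_slave_count;
 *	info.slave.state = slave_state;
 *	netdev_bonding_info_change(slave_dev, &info);
 */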
8128
8129/**
8130 * netdev_get_xmit_slave - Get the xmit slave of master device
8131 * @dev: device
8132 * @skb: The packet
8133 * @all_slaves: assume all the slaves are active
8134 *
8135 * The reference counters are not incremented so the caller must be
8136 * careful with locks. The caller must hold the RCU read lock.
8137 * %NULL is returned if no slave is found.
8138 */
8139
8140struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8141					 struct sk_buff *skb,
8142					 bool all_slaves)
8143{
8144	const struct net_device_ops *ops = dev->netdev_ops;
8145
8146	if (!ops->ndo_get_xmit_slave)
8147		return NULL;
8148	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8149}
8150EXPORT_SYMBOL(netdev_get_xmit_slave);
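
/*
 * Usage sketch (illustrative; "bond_dev" is hypothetical). Callers resolve
 * the egress slave for a packet under the RCU read lock, since no reference
 * is taken on the returned device:
 *
 *	struct net_device *slave;
 *
 *	rcu_read_lock();
 *	slave = netdev_get_xmit_slave(bond_dev, skb, false);
 *	if (slave)
 *		... use slave without leaving the RCU section ...
 *	rcu_read_unlock();
 */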
8151
8152static void netdev_adjacent_add_links(struct net_device *dev)
8153{
8154	struct netdev_adjacent *iter;
8155
8156	struct net *net = dev_net(dev);
8157
8158	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8159		if (!net_eq(net, dev_net(iter->dev)))
8160			continue;
8161		netdev_adjacent_sysfs_add(iter->dev, dev,
8162					  &iter->dev->adj_list.lower);
8163		netdev_adjacent_sysfs_add(dev, iter->dev,
8164					  &dev->adj_list.upper);
8165	}
8166
8167	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8168		if (!net_eq(net, dev_net(iter->dev)))
8169			continue;
8170		netdev_adjacent_sysfs_add(iter->dev, dev,
8171					  &iter->dev->adj_list.upper);
8172		netdev_adjacent_sysfs_add(dev, iter->dev,
8173					  &dev->adj_list.lower);
8174	}
8175}
8176
8177static void netdev_adjacent_del_links(struct net_device *dev)
8178{
8179	struct netdev_adjacent *iter;
8180
8181	struct net *net = dev_net(dev);
8182
8183	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8184		if (!net_eq(net, dev_net(iter->dev)))
8185			continue;
8186		netdev_adjacent_sysfs_del(iter->dev, dev->name,
8187					  &iter->dev->adj_list.lower);
8188		netdev_adjacent_sysfs_del(dev, iter->dev->name,
8189					  &dev->adj_list.upper);
8190	}
8191
8192	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8193		if (!net_eq(net, dev_net(iter->dev)))
8194			continue;
8195		netdev_adjacent_sysfs_del(iter->dev, dev->name,
8196					  &iter->dev->adj_list.upper);
8197		netdev_adjacent_sysfs_del(dev, iter->dev->name,
8198					  &dev->adj_list.lower);
8199	}
8200}
8201
8202void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8203{
8204	struct netdev_adjacent *iter;
8205
8206	struct net *net = dev_net(dev);
8207
8208	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8209		if (!net_eq(net, dev_net(iter->dev)))
8210			continue;
8211		netdev_adjacent_sysfs_del(iter->dev, oldname,
8212					  &iter->dev->adj_list.lower);
8213		netdev_adjacent_sysfs_add(iter->dev, dev,
8214					  &iter->dev->adj_list.lower);
8215	}
8216
8217	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8218		if (!net_eq(net, dev_net(iter->dev)))
8219			continue;
8220		netdev_adjacent_sysfs_del(iter->dev, oldname,
8221					  &iter->dev->adj_list.upper);
8222		netdev_adjacent_sysfs_add(iter->dev, dev,
8223					  &iter->dev->adj_list.upper);
8224	}
8225}
8226
8227void *netdev_lower_dev_get_private(struct net_device *dev,
8228				   struct net_device *lower_dev)
8229{
8230	struct netdev_adjacent *lower;
8231
8232	if (!lower_dev)
8233		return NULL;
8234	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8235	if (!lower)
8236		return NULL;
8237
8238	return lower->private;
8239}
8240EXPORT_SYMBOL(netdev_lower_dev_get_private);
8241
8242
8243/**
8244 * netdev_lower_state_changed - Dispatch event about lower device state change
8245 * @lower_dev: device
8246 * @lower_state_info: state to dispatch
8247 *
8248 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8249 * The caller must hold the RTNL lock.
8250 */
8251void netdev_lower_state_changed(struct net_device *lower_dev,
8252				void *lower_state_info)
8253{
8254	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8255		.info.dev = lower_dev,
8256	};
8257
8258	ASSERT_RTNL();
8259	changelowerstate_info.lower_state_info = lower_state_info;
8260	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8261				      &changelowerstate_info.info);
8262}
8263EXPORT_SYMBOL(netdev_lower_state_changed);
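
/*
 * Usage sketch (illustrative; "slave_dev" is hypothetical). A LAG driver
 * reports a lower device's link and tx state under the RTNL lock:
 *
 *	struct netdev_lag_lower_state_info state = {
 *		.link_up    = true,
 *		.tx_enabled = true,
 *	};
 *
 *	ASSERT_RTNL();
 *	netdev_lower_state_changed(slave_dev, &state);
 */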
8264
8265static void dev_change_rx_flags(struct net_device *dev, int flags)
8266{
8267	const struct net_device_ops *ops = dev->netdev_ops;
8268
8269	if (ops->ndo_change_rx_flags)
8270		ops->ndo_change_rx_flags(dev, flags);
8271}
8272
8273static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8274{
8275	unsigned int old_flags = dev->flags;
8276	kuid_t uid;
8277	kgid_t gid;
8278
8279	ASSERT_RTNL();
8280
8281	dev->flags |= IFF_PROMISC;
8282	dev->promiscuity += inc;
8283	if (dev->promiscuity == 0) {
8284		/*
8285		 * Avoid overflow.
8286		 * If inc causes overflow, untouch promisc and return error.
8287		 */
8288		if (inc < 0)
8289			dev->flags &= ~IFF_PROMISC;
8290		else {
8291			dev->promiscuity -= inc;
8292			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
8293				dev->name);
8294			return -EOVERFLOW;
8295		}
8296	}
8297	if (dev->flags != old_flags) {
8298		pr_info("device %s %s promiscuous mode\n",
8299			dev->name,
8300			dev->flags & IFF_PROMISC ? "entered" : "left");
8301		if (audit_enabled) {
8302			current_uid_gid(&uid, &gid);
8303			audit_log(audit_context(), GFP_ATOMIC,
8304				  AUDIT_ANOM_PROMISCUOUS,
8305				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8306				  dev->name, (dev->flags & IFF_PROMISC),
8307				  (old_flags & IFF_PROMISC),
8308				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
8309				  from_kuid(&init_user_ns, uid),
8310				  from_kgid(&init_user_ns, gid),
8311				  audit_get_sessionid(current));
8312		}
8313
8314		dev_change_rx_flags(dev, IFF_PROMISC);
8315	}
8316	if (notify)
8317		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
8318	return 0;
8319}
8320
8321/**
8322 *	dev_set_promiscuity	- update promiscuity count on a device
8323 *	@dev: device
8324 *	@inc: modifier
8325 *
8326 *	Add or remove promiscuity from a device. While the count in the device
8327 *	remains above zero the interface remains promiscuous. Once it hits zero
8328 *	the device reverts to normal filtering operation. A negative @inc
8329 *	value is used to drop promiscuity on the device.
8330 *	Return 0 if successful or a negative errno code on error.
8331 */
8332int dev_set_promiscuity(struct net_device *dev, int inc)
8333{
8334	unsigned int old_flags = dev->flags;
8335	int err;
8336
8337	err = __dev_set_promiscuity(dev, inc, true);
8338	if (err < 0)
8339		return err;
8340	if (dev->flags != old_flags)
8341		dev_set_rx_mode(dev);
8342	return err;
8343}
8344EXPORT_SYMBOL(dev_set_promiscuity);
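
/*
 * Usage sketch (illustrative). A feature that needs to see all frames bumps
 * the promiscuity count while active and drops it again on teardown, both
 * under the RTNL lock:
 *
 *	ASSERT_RTNL();
 *	err = dev_set_promiscuity(dev, 1);
 *	if (err)
 *		return err;
 *	...
 *	dev_set_promiscuity(dev, -1);
 */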
8345
8346static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8347{
8348	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8349
8350	ASSERT_RTNL();
8351
8352	dev->flags |= IFF_ALLMULTI;
8353	dev->allmulti += inc;
8354	if (dev->allmulti == 0) {
8355		/*
8356		 * Avoid overflow.
8357		 * If inc causes overflow, untouch allmulti and return error.
8358		 */
8359		if (inc < 0)
8360			dev->flags &= ~IFF_ALLMULTI;
8361		else {
8362			dev->allmulti -= inc;
8363			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
8364				dev->name);
8365			return -EOVERFLOW;
8366		}
8367	}
8368	if (dev->flags ^ old_flags) {
8369		dev_change_rx_flags(dev, IFF_ALLMULTI);
8370		dev_set_rx_mode(dev);
8371		if (notify)
8372			__dev_notify_flags(dev, old_flags,
8373					   dev->gflags ^ old_gflags);
8374	}
8375	return 0;
8376}
8377
8378/**
8379 *	dev_set_allmulti	- update allmulti count on a device
8380 *	@dev: device
8381 *	@inc: modifier
8382 *
8383 *	Add or remove reception of all multicast frames to a device. While the
8384 *	count in the device remains above zero the interface remains listening
8385 *	to all multicast frames. Once it hits zero the device reverts to normal
8386 *	filtering operation. A negative @inc value is used to drop the counter
8387 *	when releasing a resource needing all multicasts.
8388 *	Return 0 if successful or a negative errno code on error.
8389 */
8390
8391int dev_set_allmulti(struct net_device *dev, int inc)
8392{
8393	return __dev_set_allmulti(dev, inc, true);
8394}
8395EXPORT_SYMBOL(dev_set_allmulti);
8396
8397/*
8398 *	Upload unicast and multicast address lists to device and
8399 *	configure RX filtering. When the device doesn't support unicast
8400 *	filtering it is put in promiscuous mode while unicast addresses
8401 *	are present.
8402 */
8403void __dev_set_rx_mode(struct net_device *dev)
8404{
8405	const struct net_device_ops *ops = dev->netdev_ops;
8406
8407	/* dev_open will call this function so the list will stay sane. */
8408	if (!(dev->flags&IFF_UP))
8409		return;
8410
8411	if (!netif_device_present(dev))
8412		return;
8413
8414	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8415		/* Unicast address changes may only happen under the rtnl,
8416		 * therefore calling __dev_set_promiscuity here is safe.
8417		 */
8418		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8419			__dev_set_promiscuity(dev, 1, false);
8420			dev->uc_promisc = true;
8421		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8422			__dev_set_promiscuity(dev, -1, false);
8423			dev->uc_promisc = false;
8424		}
8425	}
8426
8427	if (ops->ndo_set_rx_mode)
8428		ops->ndo_set_rx_mode(dev);
8429}
8430
8431void dev_set_rx_mode(struct net_device *dev)
8432{
8433	netif_addr_lock_bh(dev);
8434	__dev_set_rx_mode(dev);
8435	netif_addr_unlock_bh(dev);
8436}
8437
8438/**
8439 *	dev_get_flags - get flags reported to userspace
8440 *	@dev: device
8441 *
8442 *	Get the combination of flag bits exported through APIs to userspace.
8443 */
8444unsigned int dev_get_flags(const struct net_device *dev)
8445{
8446	unsigned int flags;
8447
8448	flags = (dev->flags & ~(IFF_PROMISC |
8449				IFF_ALLMULTI |
8450				IFF_RUNNING |
8451				IFF_LOWER_UP |
8452				IFF_DORMANT)) |
8453		(dev->gflags & (IFF_PROMISC |
8454				IFF_ALLMULTI));
8455
8456	if (netif_running(dev)) {
8457		if (netif_oper_up(dev))
8458			flags |= IFF_RUNNING;
8459		if (netif_carrier_ok(dev))
8460			flags |= IFF_LOWER_UP;
8461		if (netif_dormant(dev))
8462			flags |= IFF_DORMANT;
8463	}
8464
8465	return flags;
8466}
8467EXPORT_SYMBOL(dev_get_flags);
8468
8469int __dev_change_flags(struct net_device *dev, unsigned int flags,
8470		       struct netlink_ext_ack *extack)
8471{
8472	unsigned int old_flags = dev->flags;
8473	int ret;
8474
8475	ASSERT_RTNL();
8476
8477	/*
8478	 *	Set the flags on our device.
8479	 */
8480
8481	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8482			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8483			       IFF_AUTOMEDIA)) |
8484		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8485				    IFF_ALLMULTI));
8486
8487	/*
8488	 *	Load in the correct multicast list now the flags have changed.
8489	 */
8490
8491	if ((old_flags ^ flags) & IFF_MULTICAST)
8492		dev_change_rx_flags(dev, IFF_MULTICAST);
8493
8494	dev_set_rx_mode(dev);
8495
8496	/*
8497	 *	Have we downed the interface? We handle IFF_UP ourselves
8498	 *	according to user attempts to set it, rather than blindly
8499	 *	setting it.
8500	 */
8501
8502	ret = 0;
8503	if ((old_flags ^ flags) & IFF_UP) {
8504		if (old_flags & IFF_UP)
8505			__dev_close(dev);
8506		else
8507			ret = __dev_open(dev, extack);
8508	}
8509
8510	if ((flags ^ dev->gflags) & IFF_PROMISC) {
8511		int inc = (flags & IFF_PROMISC) ? 1 : -1;
8512		unsigned int old_flags = dev->flags;
8513
8514		dev->gflags ^= IFF_PROMISC;
8515
8516		if (__dev_set_promiscuity(dev, inc, false) >= 0)
8517			if (dev->flags != old_flags)
8518				dev_set_rx_mode(dev);
8519	}
8520
8521	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8522	 * is important. Some (broken) drivers set IFF_PROMISC when
8523	 * IFF_ALLMULTI is requested, without asking us and without reporting.
8524	 */
8525	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8526		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8527
8528		dev->gflags ^= IFF_ALLMULTI;
8529		__dev_set_allmulti(dev, inc, false);
8530	}
8531
8532	return ret;
8533}
8534
8535void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8536			unsigned int gchanges)
8537{
8538	unsigned int changes = dev->flags ^ old_flags;
8539
8540	if (gchanges)
8541		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
8542
8543	if (changes & IFF_UP) {
8544		if (dev->flags & IFF_UP)
8545			call_netdevice_notifiers(NETDEV_UP, dev);
8546		else
8547			call_netdevice_notifiers(NETDEV_DOWN, dev);
8548	}
8549
8550	if (dev->flags & IFF_UP &&
8551	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8552		struct netdev_notifier_change_info change_info = {
8553			.info = {
8554				.dev = dev,
8555			},
8556			.flags_changed = changes,
8557		};
8558
8559		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8560	}
8561}
8562
8563/**
8564 *	dev_change_flags - change device settings
8565 *	@dev: device
8566 *	@flags: device state flags
8567 *	@extack: netlink extended ack
8568 *
8569 *	Change device settings based on the supplied state flags. The flags are
8570 *	in the userspace-exported format.
8571 */
8572int dev_change_flags(struct net_device *dev, unsigned int flags,
8573		     struct netlink_ext_ack *extack)
8574{
8575	int ret;
8576	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8577
8578	ret = __dev_change_flags(dev, flags, extack);
8579	if (ret < 0)
8580		return ret;
8581
8582	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8583	__dev_notify_flags(dev, old_flags, changes);
8584	return ret;
8585}
8586EXPORT_SYMBOL(dev_change_flags);
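
/*
 * Usage sketch (illustrative). Bringing an interface administratively up
 * from kernel code, under the RTNL lock:
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
 *	rtnl_unlock();
 */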
8587
8588int __dev_set_mtu(struct net_device *dev, int new_mtu)
8589{
8590	const struct net_device_ops *ops = dev->netdev_ops;
8591
8592	if (ops->ndo_change_mtu)
8593		return ops->ndo_change_mtu(dev, new_mtu);
8594
8595	/* Pairs with all the lockless reads of dev->mtu in the stack */
8596	WRITE_ONCE(dev->mtu, new_mtu);
8597	return 0;
8598}
8599EXPORT_SYMBOL(__dev_set_mtu);
8600
8601int dev_validate_mtu(struct net_device *dev, int new_mtu,
8602		     struct netlink_ext_ack *extack)
8603{
8604	/* MTU must be positive, and in range */
8605	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8606		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8607		return -EINVAL;
8608	}
8609
8610	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8611		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8612		return -EINVAL;
8613	}
8614	return 0;
8615}
8616
8617/**
8618 *	dev_set_mtu_ext - Change maximum transfer unit
8619 *	@dev: device
8620 *	@new_mtu: new transfer unit
8621 *	@extack: netlink extended ack
8622 *
8623 *	Change the maximum transfer size of the network device.
8624 */
8625int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8626		    struct netlink_ext_ack *extack)
8627{
8628	int err, orig_mtu;
8629
8630	if (new_mtu == dev->mtu)
8631		return 0;
8632
8633	err = dev_validate_mtu(dev, new_mtu, extack);
8634	if (err)
8635		return err;
8636
8637	if (!netif_device_present(dev))
8638		return -ENODEV;
8639
8640	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8641	err = notifier_to_errno(err);
8642	if (err)
8643		return err;
8644
8645	orig_mtu = dev->mtu;
8646	err = __dev_set_mtu(dev, new_mtu);
8647
8648	if (!err) {
8649		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8650						   orig_mtu);
8651		err = notifier_to_errno(err);
8652		if (err) {
8653			/* setting mtu back and notifying everyone again,
8654			 * so that they have a chance to revert changes.
8655			 */
8656			__dev_set_mtu(dev, orig_mtu);
8657			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8658						     new_mtu);
8659		}
8660	}
8661	return err;
8662}
8663
8664int dev_set_mtu(struct net_device *dev, int new_mtu)
8665{
8666	struct netlink_ext_ack extack;
8667	int err;
8668
8669	memset(&extack, 0, sizeof(extack));
8670	err = dev_set_mtu_ext(dev, new_mtu, &extack);
8671	if (err && extack._msg)
8672		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8673	return err;
8674}
8675EXPORT_SYMBOL(dev_set_mtu);
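
/*
 * Usage sketch (illustrative). Changing the MTU from kernel code; the call
 * validates the value against dev->min_mtu/dev->max_mtu, notifies
 * NETDEV_CHANGEMTU listeners and rolls back if one of them objects:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 */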
8676
8677/**
8678 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
8679 *	@dev: device
8680 *	@new_len: new tx queue length
8681 */
8682int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8683{
8684	unsigned int orig_len = dev->tx_queue_len;
8685	int res;
8686
8687	if (new_len != (unsigned int)new_len)
8688		return -ERANGE;
8689
8690	if (new_len != orig_len) {
8691		dev->tx_queue_len = new_len;
8692		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8693		res = notifier_to_errno(res);
8694		if (res)
8695			goto err_rollback;
8696		res = dev_qdisc_change_tx_queue_len(dev);
8697		if (res)
8698			goto err_rollback;
8699	}
8700
8701	return 0;
8702
8703err_rollback:
8704	netdev_err(dev, "refused to change device tx_queue_len\n");
8705	dev->tx_queue_len = orig_len;
8706	return res;
8707}
8708
8709/**
8710 *	dev_set_group - Change group this device belongs to
8711 *	@dev: device
8712 *	@new_group: group this device should belong to
8713 */
8714void dev_set_group(struct net_device *dev, int new_group)
8715{
8716	dev->group = new_group;
8717}
8718EXPORT_SYMBOL(dev_set_group);
8719
8720/**
8721 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8722 *	@dev: device
8723 *	@addr: new address
8724 *	@extack: netlink extended ack
8725 */
8726int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8727			      struct netlink_ext_ack *extack)
8728{
8729	struct netdev_notifier_pre_changeaddr_info info = {
8730		.info.dev = dev,
8731		.info.extack = extack,
8732		.dev_addr = addr,
8733	};
8734	int rc;
8735
8736	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8737	return notifier_to_errno(rc);
8738}
8739EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8740
8741/**
8742 *	dev_set_mac_address - Change Media Access Control Address
8743 *	@dev: device
8744 *	@sa: new address
8745 *	@extack: netlink extended ack
8746 *
8747 *	Change the hardware (MAC) address of the device
8748 */
8749int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8750			struct netlink_ext_ack *extack)
8751{
8752	const struct net_device_ops *ops = dev->netdev_ops;
8753	int err;
8754
8755	if (!ops->ndo_set_mac_address)
8756		return -EOPNOTSUPP;
8757	if (sa->sa_family != dev->type)
8758		return -EINVAL;
8759	if (!netif_device_present(dev))
8760		return -ENODEV;
8761	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8762	if (err)
8763		return err;
8764	err = ops->ndo_set_mac_address(dev, sa);
8765	if (err)
8766		return err;
8767	dev->addr_assign_type = NET_ADDR_SET;
8768	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8769	add_device_randomness(dev->dev_addr, dev->addr_len);
8770	return 0;
8771}
8772EXPORT_SYMBOL(dev_set_mac_address);
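
/*
 * Usage sketch (illustrative; "new_mac" is a hypothetical buffer of
 * dev->addr_len bytes). Setting a hardware address from kernel code under
 * the RTNL lock:
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa, NULL);
 */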
8773
8774static DECLARE_RWSEM(dev_addr_sem);
8775
8776int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8777			     struct netlink_ext_ack *extack)
8778{
8779	int ret;
8780
8781	down_write(&dev_addr_sem);
8782	ret = dev_set_mac_address(dev, sa, extack);
8783	up_write(&dev_addr_sem);
8784	return ret;
8785}
8786EXPORT_SYMBOL(dev_set_mac_address_user);
8787
8788int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8789{
8790	size_t size = sizeof(sa->sa_data_min);
8791	struct net_device *dev;
8792	int ret = 0;
8793
8794	down_read(&dev_addr_sem);
8795	rcu_read_lock();
8796
8797	dev = dev_get_by_name_rcu(net, dev_name);
8798	if (!dev) {
8799		ret = -ENODEV;
8800		goto unlock;
8801	}
8802	if (!dev->addr_len)
8803		memset(sa->sa_data, 0, size);
8804	else
8805		memcpy(sa->sa_data, dev->dev_addr,
8806		       min_t(size_t, size, dev->addr_len));
8807	sa->sa_family = dev->type;
8808
8809unlock:
8810	rcu_read_unlock();
8811	up_read(&dev_addr_sem);
8812	return ret;
8813}
8814EXPORT_SYMBOL(dev_get_mac_address);
8815
8816/**
8817 *	dev_change_carrier - Change device carrier
8818 *	@dev: device
8819 *	@new_carrier: new value
8820 *
8821 *	Change device carrier
8822 */
8823int dev_change_carrier(struct net_device *dev, bool new_carrier)
8824{
8825	const struct net_device_ops *ops = dev->netdev_ops;
8826
8827	if (!ops->ndo_change_carrier)
8828		return -EOPNOTSUPP;
8829	if (!netif_device_present(dev))
8830		return -ENODEV;
8831	return ops->ndo_change_carrier(dev, new_carrier);
8832}
8833EXPORT_SYMBOL(dev_change_carrier);
8834
8835/**
8836 *	dev_get_phys_port_id - Get device physical port ID
8837 *	@dev: device
8838 *	@ppid: port ID
8839 *
8840 *	Get device physical port ID
8841 */
8842int dev_get_phys_port_id(struct net_device *dev,
8843			 struct netdev_phys_item_id *ppid)
8844{
8845	const struct net_device_ops *ops = dev->netdev_ops;
8846
8847	if (!ops->ndo_get_phys_port_id)
8848		return -EOPNOTSUPP;
8849	return ops->ndo_get_phys_port_id(dev, ppid);
8850}
8851EXPORT_SYMBOL(dev_get_phys_port_id);
8852
8853/**
8854 *	dev_get_phys_port_name - Get device physical port name
8855 *	@dev: device
8856 *	@name: port name
8857 *	@len: limit of bytes to copy to name
8858 *
8859 *	Get device physical port name
8860 */
8861int dev_get_phys_port_name(struct net_device *dev,
8862			   char *name, size_t len)
8863{
8864	const struct net_device_ops *ops = dev->netdev_ops;
8865	int err;
8866
8867	if (ops->ndo_get_phys_port_name) {
8868		err = ops->ndo_get_phys_port_name(dev, name, len);
8869		if (err != -EOPNOTSUPP)
8870			return err;
8871	}
8872	return devlink_compat_phys_port_name_get(dev, name, len);
8873}
8874EXPORT_SYMBOL(dev_get_phys_port_name);
8875
8876/**
8877 *	dev_get_port_parent_id - Get the device's port parent identifier
8878 *	@dev: network device
8879 *	@ppid: pointer to a storage for the port's parent identifier
8880 *	@recurse: allow/disallow recursion to lower devices
8881 *
8882 *	Get the device's port parent identifier
8883 */
8884int dev_get_port_parent_id(struct net_device *dev,
8885			   struct netdev_phys_item_id *ppid,
8886			   bool recurse)
8887{
8888	const struct net_device_ops *ops = dev->netdev_ops;
8889	struct netdev_phys_item_id first = { };
8890	struct net_device *lower_dev;
8891	struct list_head *iter;
8892	int err;
8893
8894	if (ops->ndo_get_port_parent_id) {
8895		err = ops->ndo_get_port_parent_id(dev, ppid);
8896		if (err != -EOPNOTSUPP)
8897			return err;
8898	}
8899
8900	err = devlink_compat_switch_id_get(dev, ppid);
8901	if (!err || err != -EOPNOTSUPP)
8902		return err;
8903
8904	if (!recurse)
8905		return -EOPNOTSUPP;
8906
8907	netdev_for_each_lower_dev(dev, lower_dev, iter) {
8908		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8909		if (err)
8910			break;
8911		if (!first.id_len)
8912			first = *ppid;
8913		else if (memcmp(&first, ppid, sizeof(*ppid)))
8914			return -EOPNOTSUPP;
8915	}
8916
8917	return err;
8918}
8919EXPORT_SYMBOL(dev_get_port_parent_id);
8920
8921/**
8922 *	netdev_port_same_parent_id - Indicate if two network devices have
8923 *	the same port parent identifier
8924 *	@a: first network device
8925 *	@b: second network device
8926 */
8927bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8928{
8929	struct netdev_phys_item_id a_id = { };
8930	struct netdev_phys_item_id b_id = { };
8931
8932	if (dev_get_port_parent_id(a, &a_id, true) ||
8933	    dev_get_port_parent_id(b, &b_id, true))
8934		return false;
8935
8936	return netdev_phys_item_id_same(&a_id, &b_id);
8937}
8938EXPORT_SYMBOL(netdev_port_same_parent_id);
8939
8940/**
8941 *	dev_change_proto_down - update protocol port state information
8942 *	@dev: device
8943 *	@proto_down: new value
8944 *
8945 *	This info can be used by switch drivers to set the phys state of the
8946 *	port.
8947 */
8948int dev_change_proto_down(struct net_device *dev, bool proto_down)
8949{
8950	const struct net_device_ops *ops = dev->netdev_ops;
8951
8952	if (!ops->ndo_change_proto_down)
8953		return -EOPNOTSUPP;
8954	if (!netif_device_present(dev))
8955		return -ENODEV;
8956	return ops->ndo_change_proto_down(dev, proto_down);
8957}
8958EXPORT_SYMBOL(dev_change_proto_down);
8959
8960/**
8961 *	dev_change_proto_down_generic - generic implementation for
8962 *	ndo_change_proto_down that sets carrier according to
8963 *	proto_down.
8964 *
8965 *	@dev: device
8966 *	@proto_down: new value
8967 */
8968int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
8969{
8970	if (proto_down)
8971		netif_carrier_off(dev);
8972	else
8973		netif_carrier_on(dev);
8974	dev->proto_down = proto_down;
8975	return 0;
8976}
8977EXPORT_SYMBOL(dev_change_proto_down_generic);
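
/*
 * Usage sketch (illustrative; "foo_netdev_ops" is hypothetical). Drivers
 * without device-specific proto_down handling can wire the generic helper
 * straight into their ops:
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		...
 *		.ndo_change_proto_down	= dev_change_proto_down_generic,
 *	};
 */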
8978
8979/**
8980 *	dev_change_proto_down_reason - proto down reason
8981 *
8982 *	@dev: device
8983 *	@mask: proto down mask
8984 *	@value: proto down value
8985 */
8986void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
8987				  u32 value)
8988{
8989	int b;
8990
8991	if (!mask) {
8992		dev->proto_down_reason = value;
8993	} else {
8994		for_each_set_bit(b, &mask, 32) {
8995			if (value & (1 << b))
8996				dev->proto_down_reason |= BIT(b);
8997			else
8998				dev->proto_down_reason &= ~BIT(b);
8999		}
9000	}
9001}
9002EXPORT_SYMBOL(dev_change_proto_down_reason);
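
/*
 * Worked example (illustrative). With mask 0x6 and value 0x2, bit 1 of
 * proto_down_reason is set and bit 2 is cleared while all other bits are
 * left untouched; a mask of 0 replaces the whole reason word with value:
 *
 *	dev_change_proto_down_reason(dev, 0x6, 0x2);
 */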
9003
9004struct bpf_xdp_link {
9005	struct bpf_link link;
9006	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9007	int flags;
9008};
9009
9010static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9011{
9012	if (flags & XDP_FLAGS_HW_MODE)
9013		return XDP_MODE_HW;
9014	if (flags & XDP_FLAGS_DRV_MODE)
9015		return XDP_MODE_DRV;
9016	if (flags & XDP_FLAGS_SKB_MODE)
9017		return XDP_MODE_SKB;
9018	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9019}
9020
9021static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9022{
9023	switch (mode) {
9024	case XDP_MODE_SKB:
9025		return generic_xdp_install;
9026	case XDP_MODE_DRV:
9027	case XDP_MODE_HW:
9028		return dev->netdev_ops->ndo_bpf;
9029	default:
9030		return NULL;
9031	}
9032}
9033
9034static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9035					 enum bpf_xdp_mode mode)
9036{
9037	return dev->xdp_state[mode].link;
9038}
9039
9040static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9041				     enum bpf_xdp_mode mode)
9042{
9043	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9044
9045	if (link)
9046		return link->link.prog;
9047	return dev->xdp_state[mode].prog;
9048}
9049
9050static u8 dev_xdp_prog_count(struct net_device *dev)
9051{
9052	u8 count = 0;
9053	int i;
9054
9055	for (i = 0; i < __MAX_XDP_MODE; i++)
9056		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9057			count++;
9058	return count;
9059}
9060
9061u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9062{
9063	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9064
9065	return prog ? prog->aux->id : 0;
9066}
9067
9068static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9069			     struct bpf_xdp_link *link)
9070{
9071	dev->xdp_state[mode].link = link;
9072	dev->xdp_state[mode].prog = NULL;
9073}
9074
9075static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9076			     struct bpf_prog *prog)
9077{
9078	dev->xdp_state[mode].link = NULL;
9079	dev->xdp_state[mode].prog = prog;
9080}
9081
9082static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9083			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9084			   u32 flags, struct bpf_prog *prog)
9085{
9086	struct netdev_bpf xdp;
9087	int err;
9088
9089	memset(&xdp, 0, sizeof(xdp));
9090	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9091	xdp.extack = extack;
9092	xdp.flags = flags;
9093	xdp.prog = prog;
9094
9095	/* Drivers assume refcnt is already incremented (i.e., prog pointer is
9096	 * "moved" into driver), so they don't increment it on their own, but
9097	 * they do decrement refcnt when program is detached or replaced.
9098	 * Given net_device also owns link/prog, we need to bump refcnt here
9099	 * to prevent drivers from underflowing it.
9100	 */
9101	if (prog)
9102		bpf_prog_inc(prog);
9103	err = bpf_op(dev, &xdp);
9104	if (err) {
9105		if (prog)
9106			bpf_prog_put(prog);
9107		return err;
9108	}
9109
9110	if (mode != XDP_MODE_HW)
9111		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9112
9113	return 0;
9114}
9115
9116static void dev_xdp_uninstall(struct net_device *dev)
9117{
9118	struct bpf_xdp_link *link;
9119	struct bpf_prog *prog;
9120	enum bpf_xdp_mode mode;
9121	bpf_op_t bpf_op;
9122
9123	ASSERT_RTNL();
9124
9125	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9126		prog = dev_xdp_prog(dev, mode);
9127		if (!prog)
9128			continue;
9129
9130		bpf_op = dev_xdp_bpf_op(dev, mode);
9131		if (!bpf_op)
9132			continue;
9133
9134		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9135
9136		/* auto-detach link from net device */
9137		link = dev_xdp_link(dev, mode);
9138		if (link)
9139			link->dev = NULL;
9140		else
9141			bpf_prog_put(prog);
9142
9143		dev_xdp_set_link(dev, mode, NULL);
9144	}
9145}
9146
9147static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9148			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9149			  struct bpf_prog *old_prog, u32 flags)
9150{
9151	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9152	struct bpf_prog *cur_prog;
9153	enum bpf_xdp_mode mode;
9154	bpf_op_t bpf_op;
9155	int err;
9156
9157	ASSERT_RTNL();
9158
9159	/* either link or prog attachment, never both */
9160	if (link && (new_prog || old_prog))
9161		return -EINVAL;
9162	/* link supports only XDP mode flags */
9163	if (link && (flags & ~XDP_FLAGS_MODES)) {
9164		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9165		return -EINVAL;
9166	}
9167	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9168	if (num_modes > 1) {
9169		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9170		return -EINVAL;
9171	}
9172	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9173	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9174		NL_SET_ERR_MSG(extack,
9175			       "More than one program loaded, unset mode is ambiguous");
9176		return -EINVAL;
9177	}
9178	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9179	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9180		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9181		return -EINVAL;
9182	}
9183
9184	mode = dev_xdp_mode(dev, flags);
9185	/* can't replace attached link */
9186	if (dev_xdp_link(dev, mode)) {
9187		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9188		return -EBUSY;
9189	}
9190
9191	cur_prog = dev_xdp_prog(dev, mode);
9192	/* can't replace attached prog with link */
9193	if (link && cur_prog) {
9194		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9195		return -EBUSY;
9196	}
9197	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9198		NL_SET_ERR_MSG(extack, "Active program does not match expected");
9199		return -EEXIST;
9200	}
9201
9202	/* put effective new program into new_prog */
9203	if (link)
9204		new_prog = link->link.prog;
9205
9206	if (new_prog) {
9207		bool offload = mode == XDP_MODE_HW;
9208		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9209					       ? XDP_MODE_DRV : XDP_MODE_SKB;
9210
9211		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9212			NL_SET_ERR_MSG(extack, "XDP program already attached");
9213			return -EBUSY;
9214		}
9215		if (!offload && dev_xdp_prog(dev, other_mode)) {
9216			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9217			return -EEXIST;
9218		}
9219		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
9220			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
9221			return -EINVAL;
9222		}
9223		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9224			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9225			return -EINVAL;
9226		}
9227		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9228			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9229			return -EINVAL;
9230		}
9231	}
9232
9233	/* don't call drivers if the effective program didn't change */
9234	if (new_prog != cur_prog) {
9235		bpf_op = dev_xdp_bpf_op(dev, mode);
9236		if (!bpf_op) {
9237			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9238			return -EOPNOTSUPP;
9239		}
9240
9241		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9242		if (err)
9243			return err;
9244	}
9245
9246	if (link)
9247		dev_xdp_set_link(dev, mode, link);
9248	else
9249		dev_xdp_set_prog(dev, mode, new_prog);
9250	if (cur_prog)
9251		bpf_prog_put(cur_prog);
9252
9253	return 0;
9254}
9255
9256static int dev_xdp_attach_link(struct net_device *dev,
9257			       struct netlink_ext_ack *extack,
9258			       struct bpf_xdp_link *link)
9259{
9260	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9261}
9262
9263static int dev_xdp_detach_link(struct net_device *dev,
9264			       struct netlink_ext_ack *extack,
9265			       struct bpf_xdp_link *link)
9266{
9267	enum bpf_xdp_mode mode;
9268	bpf_op_t bpf_op;
9269
9270	ASSERT_RTNL();
9271
9272	mode = dev_xdp_mode(dev, link->flags);
9273	if (dev_xdp_link(dev, mode) != link)
9274		return -EINVAL;
9275
9276	bpf_op = dev_xdp_bpf_op(dev, mode);
9277	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9278	dev_xdp_set_link(dev, mode, NULL);
9279	return 0;
9280}
9281
9282static void bpf_xdp_link_release(struct bpf_link *link)
9283{
9284	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9285
9286	rtnl_lock();
9287
9288	/* if racing with net_device's tear down, xdp_link->dev might be
9289	 * already NULL, in which case link was already auto-detached
9290	 */
9291	if (xdp_link->dev) {
9292		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9293		xdp_link->dev = NULL;
9294	}
9295
9296	rtnl_unlock();
9297}
9298
9299static int bpf_xdp_link_detach(struct bpf_link *link)
9300{
9301	bpf_xdp_link_release(link);
9302	return 0;
9303}
9304
9305static void bpf_xdp_link_dealloc(struct bpf_link *link)
9306{
9307	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9308
9309	kfree(xdp_link);
9310}
9311
9312static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9313				     struct seq_file *seq)
9314{
9315	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9316	u32 ifindex = 0;
9317
9318	rtnl_lock();
9319	if (xdp_link->dev)
9320		ifindex = xdp_link->dev->ifindex;
9321	rtnl_unlock();
9322
9323	seq_printf(seq, "ifindex:\t%u\n", ifindex);
9324}
9325
9326static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9327				       struct bpf_link_info *info)
9328{
9329	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9330	u32 ifindex = 0;
9331
9332	rtnl_lock();
9333	if (xdp_link->dev)
9334		ifindex = xdp_link->dev->ifindex;
9335	rtnl_unlock();
9336
9337	info->xdp.ifindex = ifindex;
9338	return 0;
9339}
9340
9341static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9342			       struct bpf_prog *old_prog)
9343{
9344	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9345	enum bpf_xdp_mode mode;
9346	bpf_op_t bpf_op;
9347	int err = 0;
9348
9349	rtnl_lock();
9350
9351	/* link might have been auto-released already, so fail */
9352	if (!xdp_link->dev) {
9353		err = -ENOLINK;
9354		goto out_unlock;
9355	}
9356
9357	if (old_prog && link->prog != old_prog) {
9358		err = -EPERM;
9359		goto out_unlock;
9360	}
9361	old_prog = link->prog;
9362	if (old_prog->type != new_prog->type ||
9363	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
9364		err = -EINVAL;
9365		goto out_unlock;
9366	}
9367
9368	if (old_prog == new_prog) {
9369		/* no-op, don't disturb drivers */
9370		bpf_prog_put(new_prog);
9371		goto out_unlock;
9372	}
9373
9374	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9375	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9376	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9377			      xdp_link->flags, new_prog);
9378	if (err)
9379		goto out_unlock;
9380
9381	old_prog = xchg(&link->prog, new_prog);
9382	bpf_prog_put(old_prog);
9383
9384out_unlock:
9385	rtnl_unlock();
9386	return err;
9387}
9388
9389static const struct bpf_link_ops bpf_xdp_link_lops = {
9390	.release = bpf_xdp_link_release,
9391	.dealloc = bpf_xdp_link_dealloc,
9392	.detach = bpf_xdp_link_detach,
9393	.show_fdinfo = bpf_xdp_link_show_fdinfo,
9394	.fill_link_info = bpf_xdp_link_fill_link_info,
9395	.update_prog = bpf_xdp_link_update,
9396};
9397
9398int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9399{
9400	struct net *net = current->nsproxy->net_ns;
9401	struct bpf_link_primer link_primer;
9402	struct bpf_xdp_link *link;
9403	struct net_device *dev;
9404	int err, fd;
9405
9406	rtnl_lock();
9407	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9408	if (!dev) {
9409		rtnl_unlock();
9410		return -EINVAL;
9411	}
9412
9413	link = kzalloc(sizeof(*link), GFP_USER);
9414	if (!link) {
9415		err = -ENOMEM;
9416		goto unlock;
9417	}
9418
9419	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9420	link->dev = dev;
9421	link->flags = attr->link_create.flags;
9422
9423	err = bpf_link_prime(&link->link, &link_primer);
9424	if (err) {
9425		kfree(link);
9426		goto unlock;
9427	}
9428
9429	err = dev_xdp_attach_link(dev, NULL, link);
9430	rtnl_unlock();
9431
9432	if (err) {
9433		link->dev = NULL;
9434		bpf_link_cleanup(&link_primer);
9435		goto out_put_dev;
9436	}
9437
9438	fd = bpf_link_settle(&link_primer);
9439	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
9440	dev_put(dev);
9441	return fd;
9442
9443unlock:
9444	rtnl_unlock();
9445
9446out_put_dev:
9447	dev_put(dev);
9448	return err;
9449}
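
/*
 * Usage sketch (illustrative, seen from userspace; assumes libbpf and a
 * hypothetical already-loaded XDP program fd "prog_fd" plus interface index
 * "ifindex"). A BPF_LINK_CREATE request with attach type BPF_XDP ends up in
 * bpf_xdp_link_attach() above:
 *
 *	int link_fd = bpf_link_create(prog_fd, ifindex, BPF_XDP, NULL);
 *	if (link_fd < 0)
 *		... handle error ...
 */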
9450
9451/**
9452 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
9453 *	@dev: device
9454 *	@extack: netlink extended ack
9455 *	@fd: new program fd or negative value to clear
9456 *	@expected_fd: old program fd that userspace expects to replace or clear
9457 *	@flags: xdp-related flags
9458 *
9459 *	Set or clear a bpf program for a device
9460 */
9461int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9462		      int fd, int expected_fd, u32 flags)
9463{
9464	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9465	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9466	int err;
9467
9468	ASSERT_RTNL();
9469
9470	if (fd >= 0) {
9471		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9472						 mode != XDP_MODE_SKB);
9473		if (IS_ERR(new_prog))
9474			return PTR_ERR(new_prog);
9475	}
9476
9477	if (expected_fd >= 0) {
9478		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9479						 mode != XDP_MODE_SKB);
9480		if (IS_ERR(old_prog)) {
9481			err = PTR_ERR(old_prog);
9482			old_prog = NULL;
9483			goto err_out;
9484		}
9485	}
9486
9487	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9488
9489err_out:
9490	if (err && new_prog)
9491		bpf_prog_put(new_prog);
9492	if (old_prog)
9493		bpf_prog_put(old_prog);
9494	return err;
9495}
9496
9497/**
9498 *	dev_new_index	-	allocate an ifindex
9499 *	@net: the applicable net namespace
9500 *
9501 *	Returns a suitable unique value for a new device interface
9502 *	number.  The caller must hold the rtnl semaphore or the
9503 *	dev_base_lock to be sure it remains unique.
9504 */
9505static int dev_new_index(struct net *net)
9506{
9507	int ifindex = net->ifindex;
9508
9509	for (;;) {
9510		if (++ifindex <= 0)
9511			ifindex = 1;
9512		if (!__dev_get_by_index(net, ifindex))
9513			return net->ifindex = ifindex;
9514	}
9515}
9516
9517/* Delayed registration/unregisteration */
9518static LIST_HEAD(net_todo_list);
9519DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9520
9521static void net_set_todo(struct net_device *dev)
9522{
9523	list_add_tail(&dev->todo_list, &net_todo_list);
9524	dev_net(dev)->dev_unreg_count++;
9525}
9526
9527static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9528	struct net_device *upper, netdev_features_t features)
9529{
9530	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9531	netdev_features_t feature;
9532	int feature_bit;
9533
9534	for_each_netdev_feature(upper_disables, feature_bit) {
9535		feature = __NETIF_F_BIT(feature_bit);
9536		if (!(upper->wanted_features & feature)
9537		    && (features & feature)) {
9538			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9539				   &feature, upper->name);
9540			features &= ~feature;
9541		}
9542	}
9543
9544	return features;
9545}
9546
9547static void netdev_sync_lower_features(struct net_device *upper,
9548	struct net_device *lower, netdev_features_t features)
9549{
9550	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9551	netdev_features_t feature;
9552	int feature_bit;
9553
9554	for_each_netdev_feature(upper_disables, feature_bit) {
9555		feature = __NETIF_F_BIT(feature_bit);
9556		if (!(features & feature) && (lower->features & feature)) {
9557			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9558				   &feature, lower->name);
9559			lower->wanted_features &= ~feature;
9560			__netdev_update_features(lower);
9561
9562			if (unlikely(lower->features & feature))
9563				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9564					    &feature, lower->name);
9565			else
9566				netdev_features_change(lower);
9567		}
9568	}
9569}
9570
9571static netdev_features_t netdev_fix_features(struct net_device *dev,
9572	netdev_features_t features)
9573{
9574	/* Fix illegal checksum combinations */
9575	if ((features & NETIF_F_HW_CSUM) &&
9576	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9577		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9578		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9579	}
9580
9581	/* TSO requires that SG is present as well. */
9582	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9583		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9584		features &= ~NETIF_F_ALL_TSO;
9585	}
9586
9587	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9588					!(features & NETIF_F_IP_CSUM)) {
9589		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9590		features &= ~NETIF_F_TSO;
9591		features &= ~NETIF_F_TSO_ECN;
9592	}
9593
9594	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9595					 !(features & NETIF_F_IPV6_CSUM)) {
9596		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9597		features &= ~NETIF_F_TSO6;
9598	}
9599
9600	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9601	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9602		features &= ~NETIF_F_TSO_MANGLEID;
9603
9604	/* TSO ECN requires that TSO is present as well. */
9605	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9606		features &= ~NETIF_F_TSO_ECN;
9607
9608	/* Software GSO depends on SG. */
9609	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9610		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9611		features &= ~NETIF_F_GSO;
9612	}
9613
9614	/* GSO partial features require GSO partial be set */
9615	if ((features & dev->gso_partial_features) &&
9616	    !(features & NETIF_F_GSO_PARTIAL)) {
9617		netdev_dbg(dev,
9618			   "Dropping partially supported GSO features since no GSO partial.\n");
9619		features &= ~dev->gso_partial_features;
9620	}
9621
9622	if (!(features & NETIF_F_RXCSUM)) {
9623		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9624		 * successfully merged by hardware must also have the
9625		 * checksum verified by hardware.  If the user does not
9626		 * want to enable RXCSUM, logically, we should disable GRO_HW.
9627		 */
9628		if (features & NETIF_F_GRO_HW) {
9629			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9630			features &= ~NETIF_F_GRO_HW;
9631		}
9632	}
9633
9634	/* LRO/HW-GRO features cannot be combined with RX-FCS */
9635	if (features & NETIF_F_RXFCS) {
9636		if (features & NETIF_F_LRO) {
9637			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9638			features &= ~NETIF_F_LRO;
9639		}
9640
9641		if (features & NETIF_F_GRO_HW) {
9642			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9643			features &= ~NETIF_F_GRO_HW;
9644		}
9645	}
9646
9647	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9648		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9649		features &= ~NETIF_F_HW_TLS_RX;
9650	}
9651
9652	return features;
9653}
9654
9655int __netdev_update_features(struct net_device *dev)
9656{
9657	struct net_device *upper, *lower;
9658	netdev_features_t features;
9659	struct list_head *iter;
9660	int err = -1;
9661
9662	ASSERT_RTNL();
9663
9664	features = netdev_get_wanted_features(dev);
9665
9666	if (dev->netdev_ops->ndo_fix_features)
9667		features = dev->netdev_ops->ndo_fix_features(dev, features);
9668
9669	/* driver might be less strict about feature dependencies */
9670	features = netdev_fix_features(dev, features);
9671
9672	/* some features can't be enabled if they're off on an upper device */
9673	netdev_for_each_upper_dev_rcu(dev, upper, iter)
9674		features = netdev_sync_upper_features(dev, upper, features);
9675
9676	if (dev->features == features)
9677		goto sync_lower;
9678
9679	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9680		&dev->features, &features);
9681
9682	if (dev->netdev_ops->ndo_set_features)
9683		err = dev->netdev_ops->ndo_set_features(dev, features);
9684	else
9685		err = 0;
9686
9687	if (unlikely(err < 0)) {
9688		netdev_err(dev,
9689			"set_features() failed (%d); wanted %pNF, left %pNF\n",
9690			err, &features, &dev->features);
9691		/* return non-0 since some features might have changed and
9692		 * it's better to fire a spurious notification than miss it
9693		 */
9694		return -1;
9695	}
9696
9697sync_lower:
9698	/* some features must be disabled on lower devices when disabled
9699	 * on an upper device (think: bonding master or bridge)
9700	 */
9701	netdev_for_each_lower_dev(dev, lower, iter)
9702		netdev_sync_lower_features(dev, lower, features);
9703
9704	if (!err) {
9705		netdev_features_t diff = features ^ dev->features;
9706
9707		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9708			/* udp_tunnel_{get,drop}_rx_info both need
9709			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9710			 * device, or they won't do anything.
9711			 * Thus we need to update dev->features
9712			 * *before* calling udp_tunnel_get_rx_info,
9713			 * but *after* calling udp_tunnel_drop_rx_info.
9714			 */
9715			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9716				dev->features = features;
9717				udp_tunnel_get_rx_info(dev);
9718			} else {
9719				udp_tunnel_drop_rx_info(dev);
9720			}
9721		}
9722
9723		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9724			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9725				dev->features = features;
9726				err |= vlan_get_rx_ctag_filter_info(dev);
9727			} else {
9728				vlan_drop_rx_ctag_filter_info(dev);
9729			}
9730		}
9731
9732		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9733			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9734				dev->features = features;
9735				err |= vlan_get_rx_stag_filter_info(dev);
9736			} else {
9737				vlan_drop_rx_stag_filter_info(dev);
9738			}
9739		}
9740
9741		dev->features = features;
9742	}
9743
9744	return err < 0 ? 0 : 1;
9745}
9746
9747/**
9748 *	netdev_update_features - recalculate device features
9749 *	@dev: the device to check
9750 *
9751 *	Recalculate the dev->features set and send notifications if it
9752 *	has changed. Should be called after driver- or hardware-dependent
9753 *	conditions that influence the features might have changed.
9754 */
9755void netdev_update_features(struct net_device *dev)
9756{
9757	if (__netdev_update_features(dev))
9758		netdev_features_change(dev);
9759}
9760EXPORT_SYMBOL(netdev_update_features);
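
/*
 * Illustrative sketch, not part of this file: how a driver typically
 * plugs into the feature machinery above.  ndo_fix_features may only
 * clear bits it cannot honour, ndo_set_features programs the hardware,
 * and netdev_update_features() is re-run under RTNL whenever a
 * driver-side condition that influences features changes.  The
 * "exdrv" names and the MTU-dependent checksum limit are hypothetical.
 */
struct exdrv_priv {
	int csum_max_mtu;	/* hypothetical hardware limit */
};

static netdev_features_t exdrv_fix_features(struct net_device *dev,
					    netdev_features_t features)
{
	struct exdrv_priv *priv = netdev_priv(dev);

	/* Drop TX checksum offload (and, via netdev_fix_features() above,
	 * the TSO bits that depend on it) for oversized MTUs.
	 */
	if (dev->mtu > priv->csum_max_mtu)
		features &= ~(NETIF_F_HW_CSUM | NETIF_F_IP_CSUM |
			      NETIF_F_IPV6_CSUM);
	return features;
}

static int exdrv_set_features(struct net_device *dev,
			      netdev_features_t features)
{
	netdev_features_t changed = dev->features ^ features;

	if (changed & NETIF_F_RXCSUM) {
		/* program the (hypothetical) RX checksum enable bit here */
	}
	return 0;
}

static int exdrv_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;
	/* RTNL is held for ndo_change_mtu, so this is safe to call */
	netdev_update_features(dev);
	return 0;
}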
9761
9762/**
9763 *	netdev_change_features - recalculate device features
9764 *	@dev: the device to check
9765 *
9766 *	Recalculate dev->features set and send notifications even
9767 *	Recalculate the dev->features set and send notifications even
9768 *	if they have not changed. Should be called instead of
9769 *	netdev_update_features() if dev->vlan_features might also have
9770 *	changed, to allow the changes to be propagated to stacked
9771 *	VLAN devices.
9772void netdev_change_features(struct net_device *dev)
9773{
9774	__netdev_update_features(dev);
9775	netdev_features_change(dev);
9776}
9777EXPORT_SYMBOL(netdev_change_features);
9778
9779/**
9780 *	netif_stacked_transfer_operstate -	transfer operstate
9781 *	@rootdev: the root or lower level device to transfer state from
9782 *	@dev: the device to transfer operstate to
9783 *
9784 *	Transfer operational state from root to device. This is normally
9785 *	called when a stacking relationship exists between the root
9786 *	device and the device (a leaf device).
9787 */
9788void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9789					struct net_device *dev)
9790{
9791	if (rootdev->operstate == IF_OPER_DORMANT)
9792		netif_dormant_on(dev);
9793	else
9794		netif_dormant_off(dev);
9795
9796	if (rootdev->operstate == IF_OPER_TESTING)
9797		netif_testing_on(dev);
9798	else
9799		netif_testing_off(dev);
9800
9801	if (netif_carrier_ok(rootdev))
9802		netif_carrier_on(dev);
9803	else
9804		netif_carrier_off(dev);
9805}
9806EXPORT_SYMBOL(netif_stacked_transfer_operstate);
9807
9808static int netif_alloc_rx_queues(struct net_device *dev)
9809{
9810	unsigned int i, count = dev->num_rx_queues;
9811	struct netdev_rx_queue *rx;
9812	size_t sz = count * sizeof(*rx);
9813	int err = 0;
9814
9815	BUG_ON(count < 1);
9816
9817	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9818	if (!rx)
9819		return -ENOMEM;
9820
9821	dev->_rx = rx;
9822
9823	for (i = 0; i < count; i++) {
9824		rx[i].dev = dev;
9825
9826		/* XDP RX-queue setup */
9827		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
9828		if (err < 0)
9829			goto err_rxq_info;
9830	}
9831	return 0;
9832
9833err_rxq_info:
9834	/* Rollback successful reg's and free other resources */
9835	while (i--)
9836		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9837	kvfree(dev->_rx);
9838	dev->_rx = NULL;
9839	return err;
9840}
9841
9842static void netif_free_rx_queues(struct net_device *dev)
9843{
9844	unsigned int i, count = dev->num_rx_queues;
9845
9846	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
9847	if (!dev->_rx)
9848		return;
9849
9850	for (i = 0; i < count; i++)
9851		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
9852
9853	kvfree(dev->_rx);
9854}
9855
9856static void netdev_init_one_queue(struct net_device *dev,
9857				  struct netdev_queue *queue, void *_unused)
9858{
9859	/* Initialize queue lock */
9860	spin_lock_init(&queue->_xmit_lock);
9861	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
9862	queue->xmit_lock_owner = -1;
9863	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
9864	queue->dev = dev;
9865#ifdef CONFIG_BQL
9866	dql_init(&queue->dql, HZ);
9867#endif
9868}
9869
9870static void netif_free_tx_queues(struct net_device *dev)
9871{
9872	kvfree(dev->_tx);
9873}
9874
9875static int netif_alloc_netdev_queues(struct net_device *dev)
9876{
9877	unsigned int count = dev->num_tx_queues;
9878	struct netdev_queue *tx;
9879	size_t sz = count * sizeof(*tx);
9880
9881	if (count < 1 || count > 0xffff)
9882		return -EINVAL;
9883
9884	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9885	if (!tx)
9886		return -ENOMEM;
9887
9888	dev->_tx = tx;
9889
9890	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
9891	spin_lock_init(&dev->tx_global_lock);
9892
9893	return 0;
9894}
9895
9896void netif_tx_stop_all_queues(struct net_device *dev)
9897{
9898	unsigned int i;
9899
9900	for (i = 0; i < dev->num_tx_queues; i++) {
9901		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
9902
9903		netif_tx_stop_queue(txq);
9904	}
9905}
9906EXPORT_SYMBOL(netif_tx_stop_all_queues);
9907
9908/**
9909 *	register_netdevice	- register a network device
9910 *	@dev: device to register
9911 *
9912 *	Take a completed network device structure and add it to the kernel
9913 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9914 *	chain. 0 is returned on success. A negative errno code is returned
9915 *	on a failure to set up the device, or if the name is a duplicate.
9916 *
9917 *	Callers must hold the rtnl semaphore. You may want
9918 *	register_netdev() instead of this.
9919 *
9920 *	BUGS:
9921 *	The locking appears insufficient to guarantee two parallel registers
9922 *	will not get the same name.
9923 */
9924
9925int register_netdevice(struct net_device *dev)
9926{
9927	int ret;
9928	struct net *net = dev_net(dev);
9929
9930	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
9931		     NETDEV_FEATURE_COUNT);
9932	BUG_ON(dev_boot_phase);
9933	ASSERT_RTNL();
9934
9935	might_sleep();
9936
9937	/* When net_device's are persistent, this will be fatal. */
9938	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9939	BUG_ON(!net);
9940
9941	ret = ethtool_check_ops(dev->ethtool_ops);
9942	if (ret)
9943		return ret;
9944
9945	spin_lock_init(&dev->addr_list_lock);
9946	netdev_set_addr_lockdep_class(dev);
9947
9948	ret = dev_get_valid_name(net, dev, dev->name);
9949	if (ret < 0)
9950		goto out;
9951
9952	ret = -ENOMEM;
9953	dev->name_node = netdev_name_node_head_alloc(dev);
9954	if (!dev->name_node)
9955		goto out;
9956
9957	/* Init, if this function is available */
9958	if (dev->netdev_ops->ndo_init) {
9959		ret = dev->netdev_ops->ndo_init(dev);
9960		if (ret) {
9961			if (ret > 0)
9962				ret = -EIO;
9963			goto err_free_name;
9964		}
9965	}
9966
9967	if (((dev->hw_features | dev->features) &
9968	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
9969	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
9970	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
9971		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
9972		ret = -EINVAL;
9973		goto err_uninit;
9974	}
9975
9976	ret = -EBUSY;
9977	if (!dev->ifindex)
9978		dev->ifindex = dev_new_index(net);
9979	else if (__dev_get_by_index(net, dev->ifindex))
9980		goto err_uninit;
9981
9982	/* Transfer changeable features to wanted_features and enable
9983	 * software offloads (GSO and GRO).
9984	 */
9985	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
9986	dev->features |= NETIF_F_SOFT_FEATURES;
9987
9988	if (dev->netdev_ops->ndo_udp_tunnel_add) {
9989		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9990		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9991	}
9992
9993	dev->wanted_features = dev->features & dev->hw_features;
9994
9995	if (!(dev->flags & IFF_LOOPBACK))
9996		dev->hw_features |= NETIF_F_NOCACHE_COPY;
9997
9998	/* If IPv4 TCP segmentation offload is supported we should also
9999	 * allow the device to enable segmenting the frame with the option
10000	 * of ignoring a static IP ID value.  This doesn't enable the
10001	 * feature itself but allows the user to enable it later.
10002	 */
10003	if (dev->hw_features & NETIF_F_TSO)
10004		dev->hw_features |= NETIF_F_TSO_MANGLEID;
10005	if (dev->vlan_features & NETIF_F_TSO)
10006		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10007	if (dev->mpls_features & NETIF_F_TSO)
10008		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10009	if (dev->hw_enc_features & NETIF_F_TSO)
10010		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10011
10012	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10013	 */
10014	dev->vlan_features |= NETIF_F_HIGHDMA;
10015
10016	/* Make NETIF_F_SG inheritable to tunnel devices.
10017	 */
10018	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10019
10020	/* Make NETIF_F_SG inheritable to MPLS.
10021	 */
10022	dev->mpls_features |= NETIF_F_SG;
10023
10024	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10025	ret = notifier_to_errno(ret);
10026	if (ret)
10027		goto err_uninit;
10028
10029	ret = netdev_register_kobject(dev);
10030	if (ret) {
10031		dev->reg_state = NETREG_UNREGISTERED;
10032		goto err_uninit;
10033	}
10034	dev->reg_state = NETREG_REGISTERED;
10035
10036	__netdev_update_features(dev);
10037
10038	/*
10039	 *	Default initial state at registration is that the
10040	 *	device is present.
10041	 */
10042
10043	set_bit(__LINK_STATE_PRESENT, &dev->state);
10044
10045	linkwatch_init_dev(dev);
10046
10047	dev_init_scheduler(dev);
10048	dev_hold(dev);
10049	list_netdevice(dev);
10050	add_device_randomness(dev->dev_addr, dev->addr_len);
10051
10052	/* If the device has a permanent device address, the driver should
10053	 * set dev_addr and addr_assign_type should be set to
10054	 * NET_ADDR_PERM (the default value).
10055	 */
10056	if (dev->addr_assign_type == NET_ADDR_PERM)
10057		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10058
10059	/* Notify protocols that a new device appeared. */
10060	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10061	ret = notifier_to_errno(ret);
10062	if (ret) {
10063		/* Expect explicit free_netdev() on failure */
10064		dev->needs_free_netdev = false;
10065		unregister_netdevice_queue(dev, NULL);
10066		goto out;
10067	}
10068	/*
10069	 *	Prevent userspace races by waiting until the network
10070	 *	device is fully set up before sending notifications.
10071	 */
10072	if (!dev->rtnl_link_ops ||
10073	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10074		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10075
10076out:
10077	return ret;
10078
10079err_uninit:
10080	if (dev->netdev_ops->ndo_uninit)
10081		dev->netdev_ops->ndo_uninit(dev);
10082	if (dev->priv_destructor)
10083		dev->priv_destructor(dev);
10084err_free_name:
10085	netdev_name_node_free(dev->name_node);
10086	goto out;
10087}
10088EXPORT_SYMBOL(register_netdevice);
10089
10090/**
10091 *	init_dummy_netdev	- init a dummy network device for NAPI
10092 *	@dev: device to init
10093 *
10094 *	This takes a network device structure and initializes the minimum
10095 *	number of fields so it can be used to schedule NAPI polls without
10096 *	registering a full-blown interface. This is to be used by drivers
10097 *	that need to tie several hardware interfaces to a single NAPI
10098 *	poll scheduler due to HW limitations.
10099 */
10100int init_dummy_netdev(struct net_device *dev)
10101{
10102	/* Clear everything. Note we don't initialize spinlocks
10103	 * as they aren't supposed to be taken by any of the
10104	 * NAPI code and this dummy netdev is supposed to be
10105	 * only ever used for NAPI polls
10106	 */
10107	memset(dev, 0, sizeof(struct net_device));
10108
10109	/* make sure we BUG if trying to hit standard
10110	 * register/unregister code path
10111	 */
10112	dev->reg_state = NETREG_DUMMY;
10113
10114	/* NAPI wants this */
10115	INIT_LIST_HEAD(&dev->napi_list);
10116
10117	/* a dummy interface is started by default */
10118	set_bit(__LINK_STATE_PRESENT, &dev->state);
10119	set_bit(__LINK_STATE_START, &dev->state);
10120
10121	/* napi_busy_loop stats accounting wants this */
10122	dev_net_set(dev, &init_net);
10123
10124	/* Note: We don't allocate pcpu_refcnt for dummy devices,
10125	 * because users of this 'device' don't need to change
10126	 * its refcount.
10127	 */
10128
10129	return 0;
10130}
10131EXPORT_SYMBOL_GPL(init_dummy_netdev);
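
/*
 * Illustrative sketch of the pattern init_dummy_netdev() exists for: a
 * driver whose hardware shares one interrupt/DMA engine across several
 * ports hangs its NAPI context off a dummy netdev rather than any of
 * the real interfaces.  The "exhw" structure layout and poll routine
 * are hypothetical; only the dummy netdev + netif_napi_add() pairing
 * is the real API.
 */
struct exhw {
	struct net_device napi_dev;	/* never registered, NAPI host only */
	struct napi_struct napi;
};

static int exhw_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	/* ... process up to @budget completed descriptors here ... */

	if (work_done < budget)
		napi_complete_done(napi, work_done);
	return work_done;
}

static void exhw_setup_napi(struct exhw *hw)
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, exhw_poll, NAPI_POLL_WEIGHT);
	napi_enable(&hw->napi);
}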
10132
10133
10134/**
10135 *	register_netdev	- register a network device
10136 *	@dev: device to register
10137 *
10138 *	Take a completed network device structure and add it to the kernel
10139 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10140 *	chain. 0 is returned on success. A negative errno code is returned
10141 *	on a failure to set up the device, or if the name is a duplicate.
10142 *
10143 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10144 *	and expands the device name if you passed a format string to
10145 *	alloc_netdev.
10146 */
10147int register_netdev(struct net_device *dev)
10148{
10149	int err;
10150
10151	if (rtnl_lock_killable())
10152		return -EINTR;
10153	err = register_netdevice(dev);
10154	rtnl_unlock();
10155	return err;
10156}
10157EXPORT_SYMBOL(register_netdev);
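
/*
 * Illustrative sketch of the minimal life cycle register_netdev() is
 * meant for.  The "exnic" ops are hypothetical stand-ins; the
 * alloc_etherdev() / register_netdev() pairing on the way up and the
 * unregister_netdev() / free_netdev() pairing on the way down follow
 * the rules documented in this file.
 */
static int exnic_open(struct net_device *dev)
{
	netif_start_queue(dev);
	return 0;
}

static int exnic_stop(struct net_device *dev)
{
	netif_stop_queue(dev);
	return 0;
}

static netdev_tx_t exnic_start_xmit(struct sk_buff *skb,
				    struct net_device *dev)
{
	/* a real driver would hand @skb to its hardware here */
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}

static const struct net_device_ops exnic_netdev_ops = {
	.ndo_open	= exnic_open,
	.ndo_stop	= exnic_stop,
	.ndo_start_xmit	= exnic_start_xmit,
};

static struct net_device *exnic_create(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* no private area in this sketch */
	if (!dev)
		return NULL;

	dev->netdev_ops = &exnic_netdev_ops;
	eth_hw_addr_random(dev);

	err = register_netdev(dev);	/* takes and releases RTNL itself */
	if (err) {
		free_netdev(dev);	/* never registered, free directly */
		return NULL;
	}
	return dev;
}

static void exnic_destroy(struct net_device *dev)
{
	unregister_netdev(dev);		/* waits for outstanding references */
	free_netdev(dev);
}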
10158
10159int netdev_refcnt_read(const struct net_device *dev)
10160{
10161	int i, refcnt = 0;
10162
10163	for_each_possible_cpu(i)
10164		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10165	return refcnt;
10166}
10167EXPORT_SYMBOL(netdev_refcnt_read);
10168
10169#define WAIT_REFS_MIN_MSECS 1
10170#define WAIT_REFS_MAX_MSECS 250
10171/**
10172 * netdev_wait_allrefs - wait until all references are gone.
10173 * @dev: target net_device
10174 *
10175 * This is called when unregistering network devices.
10176 *
10177 * Any protocol or device that holds a reference should register
10178 * for netdevice notifications, and clean up and release the
10179 * reference if it receives an UNREGISTER event.
10180 * We can get stuck here if buggy protocols don't correctly
10181 * call dev_put.
10182 */
10183static void netdev_wait_allrefs(struct net_device *dev)
10184{
10185	unsigned long rebroadcast_time, warning_time;
10186	int wait = 0, refcnt;
10187
10188	linkwatch_forget_dev(dev);
10189
10190	rebroadcast_time = warning_time = jiffies;
10191	refcnt = netdev_refcnt_read(dev);
10192
10193	while (refcnt != 0) {
10194		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10195			rtnl_lock();
10196
10197			/* Rebroadcast unregister notification */
10198			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10199
10200			__rtnl_unlock();
10201			rcu_barrier();
10202			rtnl_lock();
10203
10204			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10205				     &dev->state)) {
10206				/* We must not have linkwatch events
10207				 * pending on unregister. If this
10208				 * happens, we simply run the queue
10209				 * unscheduled, resulting in a noop
10210				 * for this device.
10211				 */
10212				linkwatch_run_queue();
10213			}
10214
10215			__rtnl_unlock();
10216
10217			rebroadcast_time = jiffies;
10218		}
10219
10220		if (!wait) {
10221			rcu_barrier();
10222			wait = WAIT_REFS_MIN_MSECS;
10223		} else {
10224			msleep(wait);
10225			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10226		}
10227
10228		refcnt = netdev_refcnt_read(dev);
10229
10230		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
10231			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10232				 dev->name, refcnt);
10233			warning_time = jiffies;
10234		}
10235	}
10236}
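
/*
 * Illustrative sketch of the contract described above: a subsystem
 * that keeps a long-lived reference registers a netdevice notifier and
 * drops that reference on NETDEV_UNREGISTER, so netdev_wait_allrefs()
 * can finish.  The "exsub" single-device tracking is hypothetical;
 * dev_hold()/dev_put() and the notifier plumbing are the real APIs.
 */
static struct net_device *exsub_tracked_dev;	/* owns one reference */

static int exsub_netdev_event(struct notifier_block *nb,
			      unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_REGISTER:
		if (!exsub_tracked_dev) {
			dev_hold(dev);		/* take our reference */
			exsub_tracked_dev = dev;
		}
		break;
	case NETDEV_UNREGISTER:
		if (exsub_tracked_dev == dev) {
			exsub_tracked_dev = NULL;
			dev_put(dev);		/* let the refcount hit zero */
		}
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block exsub_notifier = {
	.notifier_call = exsub_netdev_event,
};

/* hooked up elsewhere with register_netdevice_notifier(&exsub_notifier) */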
10237
10238/* The sequence is:
10239 *
10240 *	rtnl_lock();
10241 *	...
10242 *	register_netdevice(x1);
10243 *	register_netdevice(x2);
10244 *	...
10245 *	unregister_netdevice(y1);
10246 *	unregister_netdevice(y2);
10247 *      ...
10248 *	rtnl_unlock();
10249 *	free_netdev(y1);
10250 *	free_netdev(y2);
10251 *
10252 * We are invoked by rtnl_unlock().
10253 * This allows us to deal with problems:
10254 * 1) We can delete sysfs objects which invoke hotplug
10255 *    without deadlocking with linkwatch via keventd.
10256 * 2) Since we run with the RTNL semaphore not held, we can sleep
10257 *    safely in order to wait for the netdev refcnt to drop to zero.
10258 *
10259 * We must not return until all unregister events added during
10260 * the interval the lock was held have been completed.
10261 */
10262void netdev_run_todo(void)
10263{
10264	struct list_head list;
10265#ifdef CONFIG_LOCKDEP
10266	struct list_head unlink_list;
10267
10268	list_replace_init(&net_unlink_list, &unlink_list);
10269
10270	while (!list_empty(&unlink_list)) {
10271		struct net_device *dev = list_first_entry(&unlink_list,
10272							  struct net_device,
10273							  unlink_list);
10274		list_del_init(&dev->unlink_list);
10275		dev->nested_level = dev->lower_level - 1;
10276	}
10277#endif
10278
10279	/* Snapshot list, allow later requests */
10280	list_replace_init(&net_todo_list, &list);
10281
10282	__rtnl_unlock();
10283
10284
10285	/* Wait for rcu callbacks to finish before next phase */
10286	if (!list_empty(&list))
10287		rcu_barrier();
10288
10289	while (!list_empty(&list)) {
10290		struct net_device *dev
10291			= list_first_entry(&list, struct net_device, todo_list);
10292		list_del(&dev->todo_list);
10293
10294		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10295			pr_err("network todo '%s' but state %d\n",
10296			       dev->name, dev->reg_state);
10297			dump_stack();
10298			continue;
10299		}
10300
10301		dev->reg_state = NETREG_UNREGISTERED;
10302
10303		netdev_wait_allrefs(dev);
10304
10305		/* paranoia */
10306		BUG_ON(netdev_refcnt_read(dev));
10307		BUG_ON(!list_empty(&dev->ptype_all));
10308		BUG_ON(!list_empty(&dev->ptype_specific));
10309		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10310		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10311
10312		if (dev->priv_destructor)
10313			dev->priv_destructor(dev);
10314		if (dev->needs_free_netdev)
10315			free_netdev(dev);
10316
10317		/* Report a network device has been unregistered */
10318		rtnl_lock();
10319		dev_net(dev)->dev_unreg_count--;
10320		__rtnl_unlock();
10321		wake_up(&netdev_unregistering_wq);
10322
10323		/* Free network device */
10324		kobject_put(&dev->dev.kobj);
10325	}
10326}
10327
10328/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10329 * all the same fields in the same order as net_device_stats, with only
10330 * the type differing, but rtnl_link_stats64 may have additional fields
10331 * at the end for newer counters.
10332 */
10333void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10334			     const struct net_device_stats *netdev_stats)
10335{
10336	size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10337	const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10338	u64 *dst = (u64 *)stats64;
10339
10340	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10341	for (i = 0; i < n; i++)
10342		dst[i] = (unsigned long)atomic_long_read(&src[i]);
10343	/* zero out counters that only exist in rtnl_link_stats64 */
10344	memset((char *)stats64 + n * sizeof(u64), 0,
10345	       sizeof(*stats64) - n * sizeof(u64));
10346}
10347EXPORT_SYMBOL(netdev_stats_to_stats64);
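
/*
 * Illustrative sketch of the legacy path this conversion serves: a
 * driver without 64-bit counters updates the embedded dev->stats from
 * its TX/RX paths and returns it from .ndo_get_stats; dev_get_stats()
 * below then widens the result with netdev_stats_to_stats64().  The
 * "exold" name is hypothetical.
 */
static struct net_device_stats *exold_get_stats(struct net_device *dev)
{
	/* dev->stats was already updated in the driver's datapath */
	return &dev->stats;
}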
10348
10349/**
10350 *	dev_get_stats	- get network device statistics
10351 *	@dev: device to get statistics from
10352 *	@storage: place to store stats
10353 *
10354 *	Get network statistics from device. Return @storage.
10355 *	The device driver may provide its own method by setting
10356 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10357 *	otherwise the internal statistics structure is used.
10358 */
10359struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10360					struct rtnl_link_stats64 *storage)
10361{
10362	const struct net_device_ops *ops = dev->netdev_ops;
10363
10364	if (ops->ndo_get_stats64) {
10365		memset(storage, 0, sizeof(*storage));
10366		ops->ndo_get_stats64(dev, storage);
10367	} else if (ops->ndo_get_stats) {
10368		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10369	} else {
10370		netdev_stats_to_stats64(storage, &dev->stats);
10371	}
10372	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10373	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10374	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10375	return storage;
10376}
10377EXPORT_SYMBOL(dev_get_stats);
10378
10379/**
10380 *	dev_fetch_sw_netstats - get per-cpu network device statistics
10381 *	@s: place to store stats
10382 *	@netstats: per-cpu network stats to read from
10383 *
10384 *	Read per-cpu network statistics and populate the related fields in @s.
10385 */
10386void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10387			   const struct pcpu_sw_netstats __percpu *netstats)
10388{
10389	int cpu;
10390
10391	for_each_possible_cpu(cpu) {
10392		const struct pcpu_sw_netstats *stats;
10393		struct pcpu_sw_netstats tmp;
10394		unsigned int start;
10395
10396		stats = per_cpu_ptr(netstats, cpu);
10397		do {
10398			start = u64_stats_fetch_begin_irq(&stats->syncp);
10399			tmp.rx_packets = stats->rx_packets;
10400			tmp.rx_bytes   = stats->rx_bytes;
10401			tmp.tx_packets = stats->tx_packets;
10402			tmp.tx_bytes   = stats->tx_bytes;
10403		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
10404
10405		s->rx_packets += tmp.rx_packets;
10406		s->rx_bytes   += tmp.rx_bytes;
10407		s->tx_packets += tmp.tx_packets;
10408		s->tx_bytes   += tmp.tx_bytes;
10409	}
10410}
10411EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
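
/*
 * Illustrative sketch of the per-CPU pattern dev_fetch_sw_netstats()
 * consumes: a software device (the "extun" names are hypothetical)
 * allocates struct pcpu_sw_netstats, bumps the counters in its
 * datapath under u64_stats_update_begin/end, and forwards straight to
 * the helper from its .ndo_get_stats64.
 */
static int extun_init(struct net_device *dev)
{
	/* freed again from .ndo_uninit (not shown) with free_percpu() */
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	return dev->tstats ? 0 : -ENOMEM;
}

static void extun_count_rx(struct net_device *dev, unsigned int len)
{
	struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += len;
	u64_stats_update_end(&stats->syncp);
}

static void extun_get_stats64(struct net_device *dev,
			      struct rtnl_link_stats64 *s)
{
	dev_fetch_sw_netstats(s, dev->tstats);
	/* add any device-private drop counters to *s here if needed */
}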
10412
10413struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10414{
10415	struct netdev_queue *queue = dev_ingress_queue(dev);
10416
10417#ifdef CONFIG_NET_CLS_ACT
10418	if (queue)
10419		return queue;
10420	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10421	if (!queue)
10422		return NULL;
10423	netdev_init_one_queue(dev, queue, NULL);
10424	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10425	queue->qdisc_sleeping = &noop_qdisc;
10426	rcu_assign_pointer(dev->ingress_queue, queue);
10427#endif
10428	return queue;
10429}
10430
10431static const struct ethtool_ops default_ethtool_ops;
10432
10433void netdev_set_default_ethtool_ops(struct net_device *dev,
10434				    const struct ethtool_ops *ops)
10435{
10436	if (dev->ethtool_ops == &default_ethtool_ops)
10437		dev->ethtool_ops = ops;
10438}
10439EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10440
10441void netdev_freemem(struct net_device *dev)
10442{
10443	char *addr = (char *)dev - dev->padded;
10444
10445	kvfree(addr);
10446}
10447
10448/**
10449 * alloc_netdev_mqs - allocate network device
10450 * @sizeof_priv: size of private data to allocate space for
10451 * @name: device name format string
10452 * @name_assign_type: origin of device name
10453 * @setup: callback to initialize device
10454 * @txqs: the number of TX subqueues to allocate
10455 * @rxqs: the number of RX subqueues to allocate
10456 *
10457 * Allocates a struct net_device with private data area for driver use
10458 * and performs basic initialization.  Also allocates subqueue structs
10459 * for each queue on the device.
10460 */
10461struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10462		unsigned char name_assign_type,
10463		void (*setup)(struct net_device *),
10464		unsigned int txqs, unsigned int rxqs)
10465{
10466	struct net_device *dev;
10467	unsigned int alloc_size;
10468	struct net_device *p;
10469
10470	BUG_ON(strlen(name) >= sizeof(dev->name));
10471
10472	if (txqs < 1) {
10473		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10474		return NULL;
10475	}
10476
10477	if (rxqs < 1) {
10478		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10479		return NULL;
10480	}
10481
10482	alloc_size = sizeof(struct net_device);
10483	if (sizeof_priv) {
10484		/* ensure 32-byte alignment of private area */
10485		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10486		alloc_size += sizeof_priv;
10487	}
10488	/* ensure 32-byte alignment of whole construct */
10489	alloc_size += NETDEV_ALIGN - 1;
10490
10491	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10492	if (!p)
10493		return NULL;
10494
10495	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10496	dev->padded = (char *)dev - (char *)p;
10497
10498	dev->pcpu_refcnt = alloc_percpu(int);
10499	if (!dev->pcpu_refcnt)
10500		goto free_dev;
10501
10502	if (dev_addr_init(dev))
10503		goto free_pcpu;
10504
10505	dev_mc_init(dev);
10506	dev_uc_init(dev);
10507
10508	dev_net_set(dev, &init_net);
10509
10510	dev->gso_max_size = GSO_MAX_SIZE;
10511	dev->gso_max_segs = GSO_MAX_SEGS;
10512	dev->upper_level = 1;
10513	dev->lower_level = 1;
10514#ifdef CONFIG_LOCKDEP
10515	dev->nested_level = 0;
10516	INIT_LIST_HEAD(&dev->unlink_list);
10517#endif
10518
10519	INIT_LIST_HEAD(&dev->napi_list);
10520	INIT_LIST_HEAD(&dev->unreg_list);
10521	INIT_LIST_HEAD(&dev->close_list);
10522	INIT_LIST_HEAD(&dev->link_watch_list);
10523	INIT_LIST_HEAD(&dev->adj_list.upper);
10524	INIT_LIST_HEAD(&dev->adj_list.lower);
10525	INIT_LIST_HEAD(&dev->ptype_all);
10526	INIT_LIST_HEAD(&dev->ptype_specific);
10527	INIT_LIST_HEAD(&dev->net_notifier_list);
10528#ifdef CONFIG_NET_SCHED
10529	hash_init(dev->qdisc_hash);
10530#endif
10531	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10532	setup(dev);
10533
10534	if (!dev->tx_queue_len) {
10535		dev->priv_flags |= IFF_NO_QUEUE;
10536		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10537	}
10538
10539	dev->num_tx_queues = txqs;
10540	dev->real_num_tx_queues = txqs;
10541	if (netif_alloc_netdev_queues(dev))
10542		goto free_all;
10543
10544	dev->num_rx_queues = rxqs;
10545	dev->real_num_rx_queues = rxqs;
10546	if (netif_alloc_rx_queues(dev))
10547		goto free_all;
10548
10549	strcpy(dev->name, name);
10550	dev->name_assign_type = name_assign_type;
10551	dev->group = INIT_NETDEV_GROUP;
10552	if (!dev->ethtool_ops)
10553		dev->ethtool_ops = &default_ethtool_ops;
10554
10555	nf_hook_ingress_init(dev);
10556
10557	return dev;
10558
10559free_all:
10560	free_netdev(dev);
10561	return NULL;
10562
10563free_pcpu:
10564	free_percpu(dev->pcpu_refcnt);
10565free_dev:
10566	netdev_freemem(dev);
10567	return NULL;
10568}
10569EXPORT_SYMBOL(alloc_netdev_mqs);
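
/*
 * Illustrative sketch: carving out a multi-queue device with the
 * allocator above.  The "exmq" names and the symmetric queue counts
 * are hypothetical; ether_setup() and NET_NAME_UNKNOWN are the usual
 * choices for an Ethernet-like device named from the "eth%d" template.
 */
struct exmq_priv {
	unsigned int nr_channels;	/* hypothetical private state */
};

static void exmq_setup(struct net_device *dev)
{
	ether_setup(dev);	/* Ethernet defaults: type, MTU, broadcast */
}

static struct net_device *exmq_alloc(unsigned int nr_channels)
{
	struct net_device *dev;

	/* one TX and one RX queue per channel in this sketch */
	dev = alloc_netdev_mqs(sizeof(struct exmq_priv), "eth%d",
			       NET_NAME_UNKNOWN, exmq_setup,
			       nr_channels, nr_channels);
	if (dev) {
		struct exmq_priv *priv = netdev_priv(dev);

		priv->nr_channels = nr_channels;
	}
	return dev;
}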
10570
10571/**
10572 * free_netdev - free network device
10573 * @dev: device
10574 *
10575 * This function does the last stage of destroying an allocated device
10576 * interface. The reference to the device object is released. If this
10577 * is the last reference then it will be freed. Must be called in process
10578 * context.
10579 */
10580void free_netdev(struct net_device *dev)
10581{
10582	struct napi_struct *p, *n;
10583
10584	might_sleep();
10585
10586	/* When called immediately after register_netdevice() failed, the unwind
10587	 * handling may still be dismantling the device. Handle that case by
10588	 * deferring the free.
10589	 */
10590	if (dev->reg_state == NETREG_UNREGISTERING) {
10591		ASSERT_RTNL();
10592		dev->needs_free_netdev = true;
10593		return;
10594	}
10595
10596	netif_free_tx_queues(dev);
10597	netif_free_rx_queues(dev);
10598
10599	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10600
10601	/* Flush device addresses */
10602	dev_addr_flush(dev);
10603
10604	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10605		netif_napi_del(p);
10606
10607	free_percpu(dev->pcpu_refcnt);
10608	dev->pcpu_refcnt = NULL;
10609	free_percpu(dev->xdp_bulkq);
10610	dev->xdp_bulkq = NULL;
10611
10612	/*  Compatibility with error handling in drivers */
10613	if (dev->reg_state == NETREG_UNINITIALIZED) {
10614		netdev_freemem(dev);
10615		return;
10616	}
10617
10618	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10619	dev->reg_state = NETREG_RELEASED;
10620
10621	/* will free via device release */
10622	put_device(&dev->dev);
10623}
10624EXPORT_SYMBOL(free_netdev);
10625
10626/**
10627 *	synchronize_net -  Synchronize with packet receive processing
10628 *
10629 *	Wait for packets currently being received to be done.
10630 *	Does not block later packets from starting.
10631 */
10632void synchronize_net(void)
10633{
10634	might_sleep();
10635	if (rtnl_is_locked())
10636		synchronize_rcu_expedited();
10637	else
10638		synchronize_rcu();
10639}
10640EXPORT_SYMBOL(synchronize_net);
10641
10642/**
10643 *	unregister_netdevice_queue - remove device from the kernel
10644 *	@dev: device
10645 *	@head: list
10646 *
10647 *	This function shuts down a device interface and removes it
10648 *	from the kernel tables.
10649 *	If @head is not NULL, the device is queued to be unregistered later.
10650 *
10651 *	Callers must hold the rtnl semaphore.  You may want
10652 *	unregister_netdev() instead of this.
10653 */
10654
10655void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10656{
10657	ASSERT_RTNL();
10658
10659	if (head) {
10660		list_move_tail(&dev->unreg_list, head);
10661	} else {
10662		LIST_HEAD(single);
10663
10664		list_add(&dev->unreg_list, &single);
10665		unregister_netdevice_many(&single);
10666	}
10667}
10668EXPORT_SYMBOL(unregister_netdevice_queue);
10669
10670/**
10671 *	unregister_netdevice_many - unregister many devices
10672 *	@head: list of devices
10673 *
10674 *  Note: As most callers use a stack-allocated list_head,
10675 *  we force a list_del() to make sure the stack won't be corrupted later.
10676 */
10677void unregister_netdevice_many(struct list_head *head)
10678{
10679	struct net_device *dev, *tmp;
10680	LIST_HEAD(close_head);
10681
10682	BUG_ON(dev_boot_phase);
10683	ASSERT_RTNL();
10684
10685	if (list_empty(head))
10686		return;
10687
10688	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
10689		/* Some devices call this without ever having registered,
10690		 * as part of initialization unwind. Remove those
10691		 * devices and proceed with the remaining ones.
10692		 */
10693		if (dev->reg_state == NETREG_UNINITIALIZED) {
10694			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
10695				 dev->name, dev);
10696
10697			WARN_ON(1);
10698			list_del(&dev->unreg_list);
10699			continue;
10700		}
10701		dev->dismantle = true;
10702		BUG_ON(dev->reg_state != NETREG_REGISTERED);
10703	}
10704
10705	/* If device is running, close it first. */
10706	list_for_each_entry(dev, head, unreg_list)
10707		list_add_tail(&dev->close_list, &close_head);
10708	dev_close_many(&close_head, true);
10709
10710	list_for_each_entry(dev, head, unreg_list) {
10711		/* And unlink it from device chain. */
10712		unlist_netdevice(dev);
10713
10714		dev->reg_state = NETREG_UNREGISTERING;
10715	}
10716	flush_all_backlogs();
10717
10718	synchronize_net();
10719
10720	list_for_each_entry(dev, head, unreg_list) {
10721		struct sk_buff *skb = NULL;
10722
10723		/* Shutdown queueing discipline. */
10724		dev_shutdown(dev);
10725
10726		dev_xdp_uninstall(dev);
10727
10728		/* Notify protocols that we are about to destroy
10729		 * this device, so they can clean up all of their state.
10730		 */
10731		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10732
10733		if (!dev->rtnl_link_ops ||
10734		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10735			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
10736						     GFP_KERNEL, NULL, 0);
10737
10738		/*
10739		 *	Flush the unicast and multicast chains
10740		 */
10741		dev_uc_flush(dev);
10742		dev_mc_flush(dev);
10743
10744		netdev_name_node_alt_flush(dev);
10745		netdev_name_node_free(dev->name_node);
10746
10747		if (dev->netdev_ops->ndo_uninit)
10748			dev->netdev_ops->ndo_uninit(dev);
10749
10750		if (skb)
10751			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
10752
10753		/* The notifier chain MUST detach all upper devices from us. */
10754		WARN_ON(netdev_has_any_upper_dev(dev));
10755		WARN_ON(netdev_has_any_lower_dev(dev));
10756
10757		/* Remove entries from kobject tree */
10758		netdev_unregister_kobject(dev);
10759#ifdef CONFIG_XPS
10760		/* Remove XPS queueing entries */
10761		netif_reset_xps_queues_gt(dev, 0);
10762#endif
10763	}
10764
10765	synchronize_net();
10766
10767	list_for_each_entry(dev, head, unreg_list) {
10768		dev_put(dev);
10769		net_set_todo(dev);
10770	}
10771
10772	list_del(head);
10773}
10774EXPORT_SYMBOL(unregister_netdevice_many);
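
/*
 * Illustrative sketch of the batching this helper enables: tear down a
 * group of devices with one RTNL critical section and one round of
 * notifications, then free them outside the lock, exactly as laid out
 * in the "sequence" comment before netdev_run_todo().  The "exbatch"
 * name is hypothetical and the devices are assumed not to use
 * needs_free_netdev (otherwise the core frees them itself).
 */
static void exbatch_destroy(struct net_device **devs, int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();			/* runs netdev_run_todo() */

	for (i = 0; i < n; i++)
		free_netdev(devs[i]);
}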
10775
10776/**
10777 *	unregister_netdev - remove device from the kernel
10778 *	@dev: device
10779 *
10780 *	This function shuts down a device interface and removes it
10781 *	from the kernel tables.
10782 *
10783 *	This is just a wrapper for unregister_netdevice that takes
10784 *	the rtnl semaphore.  In general you want to use this and not
10785 *	unregister_netdevice.
10786 */
10787void unregister_netdev(struct net_device *dev)
10788{
10789	rtnl_lock();
10790	unregister_netdevice(dev);
10791	rtnl_unlock();
10792}
10793EXPORT_SYMBOL(unregister_netdev);
10794
10795/**
10796 *	dev_change_net_namespace - move device to a different network namespace
10797 *	@dev: device
10798 *	@net: network namespace
10799 *	@pat: If not NULL, name pattern to try if the current device name
10800 *	      is already taken in the destination network namespace.
10801 *
10802 *	This function shuts down a device interface and moves it
10803 *	to a new network namespace. On success 0 is returned, on
10804 *	a failure a negative errno code is returned.
10805 *
10806 *	Callers must hold the rtnl semaphore.
10807 */
10808
10809int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10810{
10811	struct net *net_old = dev_net(dev);
10812	int err, new_nsid, new_ifindex;
10813
10814	ASSERT_RTNL();
10815
10816	/* Don't allow namespace local devices to be moved. */
10817	err = -EINVAL;
10818	if (dev->features & NETIF_F_NETNS_LOCAL)
10819		goto out;
10820
10821	/* Ensure the device has been registered */
10822	if (dev->reg_state != NETREG_REGISTERED)
10823		goto out;
10824
10825	/* Get out if there is nothing to do */
10826	err = 0;
10827	if (net_eq(net_old, net))
10828		goto out;
10829
10830	/* Pick the destination device name, and ensure
10831	 * we can use it in the destination network namespace.
10832	 */
10833	err = -EEXIST;
10834	if (__dev_get_by_name(net, dev->name)) {
10835		/* We get here if we can't use the current device name */
10836		if (!pat)
10837			goto out;
10838		err = dev_get_valid_name(net, dev, pat);
10839		if (err < 0)
10840			goto out;
10841	}
10842
10843	/*
10844	 * And now a mini version of register_netdevice and unregister_netdevice.
10845	 */
10846
10847	/* If device is running close it first. */
10848	dev_close(dev);
10849
10850	/* And unlink it from device chain */
10851	unlist_netdevice(dev);
10852
10853	synchronize_net();
10854
10855	/* Shutdown queueing discipline. */
10856	dev_shutdown(dev);
10857
10858	/* Notify protocols that we are about to destroy
10859	 * this device, so they can clean up all of their state.
10860	 *
10861	 * Note that dev->reg_state stays at NETREG_REGISTERED.
10862	 * This is intentional, so that 8021q and macvlan know
10863	 * the device is just moving and can keep their slaves up.
10864	 */
10865	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10866	rcu_barrier();
10867
10868	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10869	/* If there is an ifindex conflict assign a new one */
10870	if (__dev_get_by_index(net, dev->ifindex))
10871		new_ifindex = dev_new_index(net);
10872	else
10873		new_ifindex = dev->ifindex;
10874
10875	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10876			    new_ifindex);
10877
10878	/*
10879	 *	Flush the unicast and multicast chains
10880	 */
10881	dev_uc_flush(dev);
10882	dev_mc_flush(dev);
10883
10884	/* Send a netdev-removed uevent to the old namespace */
10885	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10886	netdev_adjacent_del_links(dev);
10887
10888	/* Move per-net netdevice notifiers that are following the netdevice */
10889	move_netdevice_notifiers_dev_net(dev, net);
10890
10891	/* Actually switch the network namespace */
10892	dev_net_set(dev, net);
10893	dev->ifindex = new_ifindex;
10894
10895	/* Send a netdev-add uevent to the new namespace */
10896	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10897	netdev_adjacent_add_links(dev);
10898
10899	/* Fixup kobjects */
10900	err = device_rename(&dev->dev, dev->name);
10901	WARN_ON(err);
10902
10903	/* Adapt owner in case owning user namespace of target network
10904	 * namespace is different from the original one.
10905	 */
10906	err = netdev_change_owner(dev, net_old, net);
10907	WARN_ON(err);
10908
10909	/* Add the device back in the hashes */
10910	list_netdevice(dev);
10911
10912	/* Notify protocols that a new device appeared. */
10913	call_netdevice_notifiers(NETDEV_REGISTER, dev);
10914
10915	/*
10916	 *	Prevent userspace races by waiting until the network
10917	 *	device is fully set up before sending notifications.
10918	 */
10919	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10920
10921	synchronize_net();
10922	err = 0;
10923out:
10924	return err;
10925}
10926EXPORT_SYMBOL_GPL(dev_change_net_namespace);
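
/*
 * Illustrative sketch: moving a device into another network namespace
 * with the helper above.  RTNL must already be held by the caller; the
 * "dev%d" fallback pattern is only used when the current name clashes
 * in the target namespace, mirroring default_device_exit() below.  The
 * "exmove" wrapper is hypothetical.
 */
static int exmove_to_netns(struct net_device *dev, struct net *target)
{
	int err;

	ASSERT_RTNL();
	err = dev_change_net_namespace(dev, target, "dev%d");
	if (err)
		netdev_err(dev, "move to target netns failed: %d\n", err);
	return err;
}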
10927
10928static int dev_cpu_dead(unsigned int oldcpu)
10929{
10930	struct sk_buff **list_skb;
10931	struct sk_buff *skb;
10932	unsigned int cpu;
10933	struct softnet_data *sd, *oldsd, *remsd = NULL;
10934
10935	local_irq_disable();
10936	cpu = smp_processor_id();
10937	sd = &per_cpu(softnet_data, cpu);
10938	oldsd = &per_cpu(softnet_data, oldcpu);
10939
10940	/* Find end of our completion_queue. */
10941	list_skb = &sd->completion_queue;
10942	while (*list_skb)
10943		list_skb = &(*list_skb)->next;
10944	/* Append completion queue from offline CPU. */
10945	*list_skb = oldsd->completion_queue;
10946	oldsd->completion_queue = NULL;
10947
10948	/* Append output queue from offline CPU. */
10949	if (oldsd->output_queue) {
10950		*sd->output_queue_tailp = oldsd->output_queue;
10951		sd->output_queue_tailp = oldsd->output_queue_tailp;
10952		oldsd->output_queue = NULL;
10953		oldsd->output_queue_tailp = &oldsd->output_queue;
10954	}
10955	/* Append NAPI poll list from offline CPU, with one exception:
10956	 * process_backlog() must be called by the CPU owning the percpu backlog.
10957	 * We properly handle process_queue & input_pkt_queue later.
10958	 */
10959	while (!list_empty(&oldsd->poll_list)) {
10960		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10961							    struct napi_struct,
10962							    poll_list);
10963
10964		list_del_init(&napi->poll_list);
10965		if (napi->poll == process_backlog)
10966			napi->state = 0;
10967		else
10968			____napi_schedule(sd, napi);
10969	}
10970
10971	raise_softirq_irqoff(NET_TX_SOFTIRQ);
10972	local_irq_enable();
10973
10974#ifdef CONFIG_RPS
10975	remsd = oldsd->rps_ipi_list;
10976	oldsd->rps_ipi_list = NULL;
10977#endif
10978	/* send out pending IPI's on offline CPU */
10979	net_rps_send_ipi(remsd);
10980
10981	/* Process offline CPU's input_pkt_queue */
10982	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10983		netif_rx_ni(skb);
10984		input_queue_head_incr(oldsd);
10985	}
10986	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10987		netif_rx_ni(skb);
10988		input_queue_head_incr(oldsd);
10989	}
10990
10991	return 0;
10992}
10993
10994/**
10995 *	netdev_increment_features - increment feature set by one
10996 *	@all: current feature set
10997 *	@one: new feature set
10998 *	@mask: mask feature set
10999 *
11000 *	Computes a new feature set after adding a device with feature set
11001 *	@one to the master device with current feature set @all.  Will not
11002 *	enable anything that is off in @mask. Returns the new feature set.
11003 */
11004netdev_features_t netdev_increment_features(netdev_features_t all,
11005	netdev_features_t one, netdev_features_t mask)
11006{
11007	if (mask & NETIF_F_HW_CSUM)
11008		mask |= NETIF_F_CSUM_MASK;
11009	mask |= NETIF_F_VLAN_CHALLENGED;
11010
11011	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11012	all &= one | ~NETIF_F_ALL_FOR_ALL;
11013
11014	/* If one device supports hw checksumming, set it for all. */
11015	if (all & NETIF_F_HW_CSUM)
11016		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11017
11018	return all;
11019}
11020EXPORT_SYMBOL(netdev_increment_features);
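
/*
 * Illustrative sketch: how a bonding-style master typically consumes
 * the helper above, folding every lower device into one feature set
 * and letting the core re-validate and propagate it.  The "exagg"
 * name, the starting set and the mask choice are hypothetical.
 */
static void exagg_compute_features(struct net_device *master)
{
	netdev_features_t vlan_features = NETIF_F_ALL_FOR_ALL;
	struct net_device *lower;
	struct list_head *iter;

	/* caller holds RTNL, which protects the adjacency lists */
	netdev_for_each_lower_dev(master, lower, iter)
		vlan_features = netdev_increment_features(vlan_features,
							  lower->vlan_features,
							  master->hw_features);

	master->vlan_features = vlan_features;
	/* vlan_features changed, so use the unconditional variant */
	netdev_change_features(master);
}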
11021
11022static struct hlist_head * __net_init netdev_create_hash(void)
11023{
11024	int i;
11025	struct hlist_head *hash;
11026
11027	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11028	if (hash != NULL)
11029		for (i = 0; i < NETDEV_HASHENTRIES; i++)
11030			INIT_HLIST_HEAD(&hash[i]);
11031
11032	return hash;
11033}
11034
11035/* Initialize per network namespace state */
11036static int __net_init netdev_init(struct net *net)
11037{
11038	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11039		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11040
11041	if (net != &init_net)
11042		INIT_LIST_HEAD(&net->dev_base_head);
11043
11044	net->dev_name_head = netdev_create_hash();
11045	if (net->dev_name_head == NULL)
11046		goto err_name;
11047
11048	net->dev_index_head = netdev_create_hash();
11049	if (net->dev_index_head == NULL)
11050		goto err_idx;
11051
11052	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11053
11054	return 0;
11055
11056err_idx:
11057	kfree(net->dev_name_head);
11058err_name:
11059	return -ENOMEM;
11060}
11061
11062/**
11063 *	netdev_drivername - network driver for the device
11064 *	@dev: network device
11065 *
11066 *	Determine network driver for device.
11067 */
11068const char *netdev_drivername(const struct net_device *dev)
11069{
11070	const struct device_driver *driver;
11071	const struct device *parent;
11072	const char *empty = "";
11073
11074	parent = dev->dev.parent;
11075	if (!parent)
11076		return empty;
11077
11078	driver = parent->driver;
11079	if (driver && driver->name)
11080		return driver->name;
11081	return empty;
11082}
11083
11084static void __netdev_printk(const char *level, const struct net_device *dev,
11085			    struct va_format *vaf)
11086{
11087	if (dev && dev->dev.parent) {
11088		dev_printk_emit(level[1] - '0',
11089				dev->dev.parent,
11090				"%s %s %s%s: %pV",
11091				dev_driver_string(dev->dev.parent),
11092				dev_name(dev->dev.parent),
11093				netdev_name(dev), netdev_reg_state(dev),
11094				vaf);
11095	} else if (dev) {
11096		printk("%s%s%s: %pV",
11097		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
11098	} else {
11099		printk("%s(NULL net_device): %pV", level, vaf);
11100	}
11101}
11102
11103void netdev_printk(const char *level, const struct net_device *dev,
11104		   const char *format, ...)
11105{
11106	struct va_format vaf;
11107	va_list args;
11108
11109	va_start(args, format);
11110
11111	vaf.fmt = format;
11112	vaf.va = &args;
11113
11114	__netdev_printk(level, dev, &vaf);
11115
11116	va_end(args);
11117}
11118EXPORT_SYMBOL(netdev_printk);
11119
11120#define define_netdev_printk_level(func, level)			\
11121void func(const struct net_device *dev, const char *fmt, ...)	\
11122{								\
11123	struct va_format vaf;					\
11124	va_list args;						\
11125								\
11126	va_start(args, fmt);					\
11127								\
11128	vaf.fmt = fmt;						\
11129	vaf.va = &args;						\
11130								\
11131	__netdev_printk(level, dev, &vaf);			\
11132								\
11133	va_end(args);						\
11134}								\
11135EXPORT_SYMBOL(func);
11136
11137define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11138define_netdev_printk_level(netdev_alert, KERN_ALERT);
11139define_netdev_printk_level(netdev_crit, KERN_CRIT);
11140define_netdev_printk_level(netdev_err, KERN_ERR);
11141define_netdev_printk_level(netdev_warn, KERN_WARNING);
11142define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11143define_netdev_printk_level(netdev_info, KERN_INFO);
11144
11145static void __net_exit netdev_exit(struct net *net)
11146{
11147	kfree(net->dev_name_head);
11148	kfree(net->dev_index_head);
11149	if (net != &init_net)
11150		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11151}
11152
11153static struct pernet_operations __net_initdata netdev_net_ops = {
11154	.init = netdev_init,
11155	.exit = netdev_exit,
11156};
11157
11158static void __net_exit default_device_exit(struct net *net)
11159{
11160	struct net_device *dev, *aux;
11161	/*
11162	 * Push all migratable network devices back to the
11163	 * initial network namespace
11164	 */
11165	rtnl_lock();
11166	for_each_netdev_safe(net, dev, aux) {
11167		int err;
11168		char fb_name[IFNAMSIZ];
11169
11170		/* Ignore unmovable devices (e.g. loopback) */
11171		if (dev->features & NETIF_F_NETNS_LOCAL)
11172			continue;
11173
11174		/* Leave virtual devices for the generic cleanup */
11175		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11176			continue;
11177
11178		/* Push remaining network devices to init_net */
11179		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11180		if (__dev_get_by_name(&init_net, fb_name))
11181			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11182		err = dev_change_net_namespace(dev, &init_net, fb_name);
11183		if (err) {
11184			pr_emerg("%s: failed to move %s to init_net: %d\n",
11185				 __func__, dev->name, err);
11186			BUG();
11187		}
11188	}
11189	rtnl_unlock();
11190}
11191
11192static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
11193{
11194	/* Return with the rtnl_lock held when there are no network
11195	 * devices unregistering in any network namespace in net_list.
11196	 */
11197	struct net *net;
11198	bool unregistering;
11199	DEFINE_WAIT_FUNC(wait, woken_wake_function);
11200
11201	add_wait_queue(&netdev_unregistering_wq, &wait);
11202	for (;;) {
11203		unregistering = false;
11204		rtnl_lock();
11205		list_for_each_entry(net, net_list, exit_list) {
11206			if (net->dev_unreg_count > 0) {
11207				unregistering = true;
11208				break;
11209			}
11210		}
11211		if (!unregistering)
11212			break;
11213		__rtnl_unlock();
11214
11215		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
11216	}
11217	remove_wait_queue(&netdev_unregistering_wq, &wait);
11218}
11219
11220static void __net_exit default_device_exit_batch(struct list_head *net_list)
11221{
11222	/* At exit all network devices must be removed from a network
11223	 * namespace.  Do this in the reverse order of registration.
11224	 * Do this across as many network namespaces as possible to
11225	 * improve batching efficiency.
11226	 */
11227	struct net_device *dev;
11228	struct net *net;
11229	LIST_HEAD(dev_kill_list);
11230
11231	/* To prevent network device cleanup code from dereferencing
11232	 * loopback devices or network devices that have been freed,
11233	 * wait here for all pending unregistrations to complete
11234	 * before unregistering the loopback device and allowing the
11235	 * network namespace to be freed.
11236	 *
11237	 * The netdev todo list containing all network device
11238	 * unregistrations that happen in default_device_exit_batch
11239	 * will run in the rtnl_unlock() at the end of
11240	 * default_device_exit_batch.
11241	 */
11242	rtnl_lock_unregistering(net_list);
11243	list_for_each_entry(net, net_list, exit_list) {
11244		for_each_netdev_reverse(net, dev) {
11245			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11246				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11247			else
11248				unregister_netdevice_queue(dev, &dev_kill_list);
11249		}
11250	}
11251	unregister_netdevice_many(&dev_kill_list);
11252	rtnl_unlock();
11253}
11254
11255static struct pernet_operations __net_initdata default_device_ops = {
11256	.exit = default_device_exit,
11257	.exit_batch = default_device_exit_batch,
11258};
11259
11260/*
11261 *	Initialize the DEV module. At boot time this walks the device list and
11262 *	unhooks any devices that fail to initialise (normally hardware not
11263 *	present) and leaves us with a valid list of present and active devices.
11264 *
11265 */
11266
11267/*
11268 *       This is called single-threaded during boot, so no need
11269 *       to take the rtnl semaphore.
11270 */
11271static int __init net_dev_init(void)
11272{
11273	int i, rc = -ENOMEM;
11274
11275	BUG_ON(!dev_boot_phase);
11276
11277	if (dev_proc_init())
11278		goto out;
11279
11280	if (netdev_kobject_init())
11281		goto out;
11282
11283	INIT_LIST_HEAD(&ptype_all);
11284	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11285		INIT_LIST_HEAD(&ptype_base[i]);
11286
11287	INIT_LIST_HEAD(&offload_base);
11288
11289	if (register_pernet_subsys(&netdev_net_ops))
11290		goto out;
11291
11292	/*
11293	 *	Initialise the packet receive queues.
11294	 */
11295
11296	for_each_possible_cpu(i) {
11297		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11298		struct softnet_data *sd = &per_cpu(softnet_data, i);
11299
11300		INIT_WORK(flush, flush_backlog);
11301
11302		skb_queue_head_init(&sd->input_pkt_queue);
11303		skb_queue_head_init(&sd->process_queue);
11304#ifdef CONFIG_XFRM_OFFLOAD
11305		skb_queue_head_init(&sd->xfrm_backlog);
11306#endif
11307		INIT_LIST_HEAD(&sd->poll_list);
11308		sd->output_queue_tailp = &sd->output_queue;
11309#ifdef CONFIG_RPS
11310		sd->csd.func = rps_trigger_softirq;
11311		sd->csd.info = sd;
11312		sd->cpu = i;
11313#endif
11314
11315		init_gro_hash(&sd->backlog);
11316		sd->backlog.poll = process_backlog;
11317		sd->backlog.weight = weight_p;
11318	}
11319
11320	dev_boot_phase = 0;
11321
11322	/* The loopback device is special: if any other network device
11323	 * is present in a network namespace, the loopback device must
11324	 * be present too. Since we now dynamically allocate and free the
11325	 * loopback device, ensure this invariant is maintained by
11326	 * keeping the loopback device as the first device on the
11327	 * list of network devices, so that it is the first device that
11328	 * appears and the last network device that disappears.
11330	 */
11331	if (register_pernet_device(&loopback_net_ops))
11332		goto out;
11333
11334	if (register_pernet_device(&default_device_ops))
11335		goto out;
11336
11337	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11338	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11339
11340	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11341				       NULL, dev_cpu_dead);
11342	WARN_ON(rc < 0);
11343	rc = 0;
11344out:
11345	return rc;
11346}
11347
11348subsys_initcall(net_dev_init);
11349