xref: /kernel/linux/linux-5.10/net/ipv6/route.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 *	Linux INET6 implementation
4 *	FIB front-end.
5 *
6 *	Authors:
7 *	Pedro Roque		<roque@di.fc.ul.pt>
8 */
9
10/*	Changes:
11 *
12 *	YOSHIFUJI Hideaki @USAGI
13 *		reworked default router selection.
14 *		- respect outgoing interface
15 *		- select from (probably) reachable routers (i.e.
16 *		routers in REACHABLE, STALE, DELAY or PROBE states).
17 *		- always select the same router if it is (probably)
18 *		reachable.  otherwise, round-robin the list.
19 *	Ville Nuorvala
20 *		Fixed routing subtrees.
21 */
22
23#define pr_fmt(fmt) "IPv6: " fmt
24
25#include <linux/capability.h>
26#include <linux/errno.h>
27#include <linux/export.h>
28#include <linux/types.h>
29#include <linux/times.h>
30#include <linux/socket.h>
31#include <linux/sockios.h>
32#include <linux/net.h>
33#include <linux/route.h>
34#include <linux/netdevice.h>
35#include <linux/in6.h>
36#include <linux/mroute6.h>
37#include <linux/init.h>
38#include <linux/if_arp.h>
39#include <linux/proc_fs.h>
40#include <linux/seq_file.h>
41#include <linux/nsproxy.h>
42#include <linux/slab.h>
43#include <linux/jhash.h>
44#include <linux/siphash.h>
45#include <net/net_namespace.h>
46#include <net/snmp.h>
47#include <net/ipv6.h>
48#include <net/ip6_fib.h>
49#include <net/ip6_route.h>
50#include <net/ndisc.h>
51#include <net/addrconf.h>
52#include <net/tcp.h>
53#include <linux/rtnetlink.h>
54#include <net/dst.h>
55#include <net/dst_metadata.h>
56#include <net/xfrm.h>
57#include <net/netevent.h>
58#include <net/netlink.h>
59#include <net/rtnh.h>
60#include <net/lwtunnel.h>
61#include <net/ip_tunnels.h>
62#include <net/l3mdev.h>
63#include <net/ip.h>
64#include <linux/uaccess.h>
65#include <linux/btf_ids.h>
66
67#ifdef CONFIG_SYSCTL
68#include <linux/sysctl.h>
69#endif
70
71static int ip6_rt_type_to_error(u8 fib6_type);
72
73#define CREATE_TRACE_POINTS
74#include <trace/events/fib6.h>
75EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
76#undef CREATE_TRACE_POINTS
77
78enum rt6_nud_state {
79	RT6_NUD_FAIL_HARD = -3,
80	RT6_NUD_FAIL_PROBE = -2,
81	RT6_NUD_FAIL_DO_RR = -1,
82	RT6_NUD_SUCCEED = 1
83};
84
85static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
86static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
87static unsigned int	 ip6_mtu(const struct dst_entry *dst);
88static void		ip6_negative_advice(struct sock *sk,
89					    struct dst_entry *dst);
90static void		ip6_dst_destroy(struct dst_entry *);
91static void		ip6_dst_ifdown(struct dst_entry *,
92				       struct net_device *dev, int how);
93static void		 ip6_dst_gc(struct dst_ops *ops);
94
95static int		ip6_pkt_discard(struct sk_buff *skb);
96static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
97static int		ip6_pkt_prohibit(struct sk_buff *skb);
98static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
99static void		ip6_link_failure(struct sk_buff *skb);
100static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
101					   struct sk_buff *skb, u32 mtu,
102					   bool confirm_neigh);
103static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104					struct sk_buff *skb);
105static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106			   int strict);
107static size_t rt6_nlmsg_size(struct fib6_info *f6i);
108static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109			 struct fib6_info *rt, struct dst_entry *dst,
110			 struct in6_addr *dest, struct in6_addr *src,
111			 int iif, int type, u32 portid, u32 seq,
112			 unsigned int flags);
113static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114					   const struct in6_addr *daddr,
115					   const struct in6_addr *saddr);
116
117#ifdef CONFIG_IPV6_ROUTE_INFO
118static struct fib6_info *rt6_add_route_info(struct net *net,
119					   const struct in6_addr *prefix, int prefixlen,
120					   const struct in6_addr *gwaddr,
121					   struct net_device *dev,
122					   unsigned int pref);
123static struct fib6_info *rt6_get_route_info(struct net *net,
124					   const struct in6_addr *prefix, int prefixlen,
125					   const struct in6_addr *gwaddr,
126					   struct net_device *dev);
127#endif
128
129struct uncached_list {
130	spinlock_t		lock;
131	struct list_head	head;
132};
133
134static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
136void rt6_uncached_list_add(struct rt6_info *rt)
137{
138	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139
140	rt->rt6i_uncached_list = ul;
141
142	spin_lock_bh(&ul->lock);
143	list_add_tail(&rt->rt6i_uncached, &ul->head);
144	spin_unlock_bh(&ul->lock);
145}
146
147void rt6_uncached_list_del(struct rt6_info *rt)
148{
149	if (!list_empty(&rt->rt6i_uncached)) {
150		struct uncached_list *ul = rt->rt6i_uncached_list;
151		struct net *net = dev_net(rt->dst.dev);
152
153		spin_lock_bh(&ul->lock);
154		list_del(&rt->rt6i_uncached);
155		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156		spin_unlock_bh(&ul->lock);
157	}
158}
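
/* Editor's note: a minimal usage sketch (not part of the source), modeled on
 * icmp6_dst_alloc(). Callers that create a dst which is never attached to a
 * fib6 node are expected to pair the helpers above roughly like this:
 *
 *	rt = ip6_dst_alloc(net, dev, 0);
 *	if (rt) {
 *		rt6_uncached_list_add(rt);
 *		atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
 *	}
 *	// ...later, ip6_dst_destroy() calls rt6_uncached_list_del()
 *
 * The per-CPU lists keep the add path contention-free; only device teardown
 * (rt6_uncached_list_flush_dev below) has to walk every CPU's list.
 */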
159
160static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161{
162	struct net_device *loopback_dev = net->loopback_dev;
163	int cpu;
164
165	if (dev == loopback_dev)
166		return;
167
168	for_each_possible_cpu(cpu) {
169		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170		struct rt6_info *rt;
171
172		spin_lock_bh(&ul->lock);
173		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174			struct inet6_dev *rt_idev = rt->rt6i_idev;
175			struct net_device *rt_dev = rt->dst.dev;
176
177			if (rt_idev->dev == dev) {
178				rt->rt6i_idev = in6_dev_get(loopback_dev);
179				in6_dev_put(rt_idev);
180			}
181
182			if (rt_dev == dev) {
183				rt->dst.dev = blackhole_netdev;
184				dev_hold(rt->dst.dev);
185				dev_put(rt_dev);
186			}
187		}
188		spin_unlock_bh(&ul->lock);
189	}
190}
191
192static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193					     struct sk_buff *skb,
194					     const void *daddr)
195{
196	if (!ipv6_addr_any(p))
197		return (const void *) p;
198	else if (skb)
199		return &ipv6_hdr(skb)->daddr;
200	return daddr;
201}
202
203struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204				   struct net_device *dev,
205				   struct sk_buff *skb,
206				   const void *daddr)
207{
208	struct neighbour *n;
209
210	daddr = choose_neigh_daddr(gw, skb, daddr);
211	n = __ipv6_neigh_lookup(dev, daddr);
212	if (n)
213		return n;
214
215	n = neigh_create(&nd_tbl, daddr, dev);
216	return IS_ERR(n) ? NULL : n;
217}
218
219static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220					      struct sk_buff *skb,
221					      const void *daddr)
222{
223	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224
225	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
226				dst->dev, skb, daddr);
227}
228
229static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
230{
231	struct net_device *dev = dst->dev;
232	struct rt6_info *rt = (struct rt6_info *)dst;
233
234	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
235	if (!daddr)
236		return;
237	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
238		return;
239	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
240		return;
241	__ipv6_confirm_neigh(dev, daddr);
242}
243
244static struct dst_ops ip6_dst_ops_template = {
245	.family			=	AF_INET6,
246	.gc			=	ip6_dst_gc,
247	.gc_thresh		=	1024,
248	.check			=	ip6_dst_check,
249	.default_advmss		=	ip6_default_advmss,
250	.mtu			=	ip6_mtu,
251	.cow_metrics		=	dst_cow_metrics_generic,
252	.destroy		=	ip6_dst_destroy,
253	.ifdown			=	ip6_dst_ifdown,
254	.negative_advice	=	ip6_negative_advice,
255	.link_failure		=	ip6_link_failure,
256	.update_pmtu		=	ip6_rt_update_pmtu,
257	.redirect		=	rt6_do_redirect,
258	.local_out		=	__ip6_local_out,
259	.neigh_lookup		=	ip6_dst_neigh_lookup,
260	.confirm_neigh		=	ip6_confirm_neigh,
261};
262
263static struct dst_ops ip6_dst_blackhole_ops = {
264	.family			= AF_INET6,
265	.default_advmss		= ip6_default_advmss,
266	.neigh_lookup		= ip6_dst_neigh_lookup,
267	.check			= ip6_dst_check,
268	.destroy		= ip6_dst_destroy,
269	.cow_metrics		= dst_cow_metrics_generic,
270	.update_pmtu		= dst_blackhole_update_pmtu,
271	.redirect		= dst_blackhole_redirect,
272	.mtu			= dst_blackhole_mtu,
273};
274
275static const u32 ip6_template_metrics[RTAX_MAX] = {
276	[RTAX_HOPLIMIT - 1] = 0,
277};
278
279static const struct fib6_info fib6_null_entry_template = {
280	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
281	.fib6_protocol  = RTPROT_KERNEL,
282	.fib6_metric	= ~(u32)0,
283	.fib6_ref	= REFCOUNT_INIT(1),
284	.fib6_type	= RTN_UNREACHABLE,
285	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
286};
287
288static const struct rt6_info ip6_null_entry_template = {
289	.dst = {
290		.__refcnt	= ATOMIC_INIT(1),
291		.__use		= 1,
292		.obsolete	= DST_OBSOLETE_FORCE_CHK,
293		.error		= -ENETUNREACH,
294		.input		= ip6_pkt_discard,
295		.output		= ip6_pkt_discard_out,
296	},
297	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
298};
299
300#ifdef CONFIG_IPV6_MULTIPLE_TABLES
301
302static const struct rt6_info ip6_prohibit_entry_template = {
303	.dst = {
304		.__refcnt	= ATOMIC_INIT(1),
305		.__use		= 1,
306		.obsolete	= DST_OBSOLETE_FORCE_CHK,
307		.error		= -EACCES,
308		.input		= ip6_pkt_prohibit,
309		.output		= ip6_pkt_prohibit_out,
310	},
311	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
312};
313
314static const struct rt6_info ip6_blk_hole_entry_template = {
315	.dst = {
316		.__refcnt	= ATOMIC_INIT(1),
317		.__use		= 1,
318		.obsolete	= DST_OBSOLETE_FORCE_CHK,
319		.error		= -EINVAL,
320		.input		= dst_discard,
321		.output		= dst_discard_out,
322	},
323	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
324};
325
326#endif
327
328static void rt6_info_init(struct rt6_info *rt)
329{
330	struct dst_entry *dst = &rt->dst;
331
332	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
333	INIT_LIST_HEAD(&rt->rt6i_uncached);
334}
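
/* Editor's note: the memset() above relies on 'dst' being the first member
 * of struct rt6_info, so 'dst + 1' points at the first byte past the
 * embedded dst_entry and the size subtraction covers exactly the remaining
 * fields:
 *
 *	struct rt6_info {
 *		struct dst_entry dst;	// preserved (set up by dst_alloc)
 *		// ...everything after this is zeroed by rt6_info_init()
 *	};
 */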
335
336/* allocate dst with ip6_dst_ops */
337struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
338			       int flags)
339{
340	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
341					1, DST_OBSOLETE_FORCE_CHK, flags);
342
343	if (rt) {
344		rt6_info_init(rt);
345		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
346	}
347
348	return rt;
349}
350EXPORT_SYMBOL(ip6_dst_alloc);
351
352static void ip6_dst_destroy(struct dst_entry *dst)
353{
354	struct rt6_info *rt = (struct rt6_info *)dst;
355	struct fib6_info *from;
356	struct inet6_dev *idev;
357
358	ip_dst_metrics_put(dst);
359	rt6_uncached_list_del(rt);
360
361	idev = rt->rt6i_idev;
362	if (idev) {
363		rt->rt6i_idev = NULL;
364		in6_dev_put(idev);
365	}
366
367	from = xchg((__force struct fib6_info **)&rt->from, NULL);
368	fib6_info_release(from);
369}
370
371static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
372			   int how)
373{
374	struct rt6_info *rt = (struct rt6_info *)dst;
375	struct inet6_dev *idev = rt->rt6i_idev;
376	struct net_device *loopback_dev =
377		dev_net(dev)->loopback_dev;
378
379	if (idev && idev->dev != loopback_dev) {
380		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
381		if (loopback_idev) {
382			rt->rt6i_idev = loopback_idev;
383			in6_dev_put(idev);
384		}
385	}
386}
387
388static bool __rt6_check_expired(const struct rt6_info *rt)
389{
390	if (rt->rt6i_flags & RTF_EXPIRES)
391		return time_after(jiffies, rt->dst.expires);
392	else
393		return false;
394}
395
396static bool rt6_check_expired(const struct rt6_info *rt)
397{
398	struct fib6_info *from;
399
400	from = rcu_dereference(rt->from);
401
402	if (rt->rt6i_flags & RTF_EXPIRES) {
403		if (time_after(jiffies, rt->dst.expires))
404			return true;
405	} else if (from) {
406		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
407			fib6_check_expired(from);
408	}
409	return false;
410}
411
412void fib6_select_path(const struct net *net, struct fib6_result *res,
413		      struct flowi6 *fl6, int oif, bool have_oif_match,
414		      const struct sk_buff *skb, int strict)
415{
416	struct fib6_info *sibling, *next_sibling;
417	struct fib6_info *match = res->f6i;
418
419	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
420		goto out;
421
422	if (match->nh && have_oif_match && res->nh)
423		return;
424
425	/* We might have already computed the hash for ICMPv6 errors. In such
426	 * a case it will always be non-zero. Otherwise, now is the time to do it.
427	 */
428	if (!fl6->mp_hash &&
429	    (!match->nh || nexthop_is_multipath(match->nh)))
430		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
431
432	if (unlikely(match->nh)) {
433		nexthop_path_fib6_result(res, fl6->mp_hash);
434		return;
435	}
436
437	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
438		goto out;
439
440	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
441				 fib6_siblings) {
442		const struct fib6_nh *nh = sibling->fib6_nh;
443		int nh_upper_bound;
444
445		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
446		if (fl6->mp_hash > nh_upper_bound)
447			continue;
448		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
449			break;
450		match = sibling;
451		break;
452	}
453
454out:
455	res->f6i = match;
456	res->nh = match->fib6_nh;
457}
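
/* Editor's note: a worked example of the hash-threshold selection above,
 * with assumed numbers. For two siblings with weights 1 and 3, the kernel
 * spreads fib_nh_upper_bound across the hash space (scaled to 100 here for
 * readability; the real bounds are scaled to INT_MAX):
 *
 *	nexthop A: upper_bound = 25	(weight 1)
 *	nexthop B: upper_bound = 100	(weight 3)
 *
 * A flow whose mp_hash is 40 skips A (40 > 25) and selects B (40 <= 100),
 * so traffic follows the configured weights while each flow stays pinned
 * to a single nexthop, since the hash is per-flow rather than per-packet.
 */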
458
459/*
460 *	Route lookup. rcu_read_lock() should be held.
461 */
462
463static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
464			       const struct in6_addr *saddr, int oif, int flags)
465{
466	const struct net_device *dev;
467
468	if (nh->fib_nh_flags & RTNH_F_DEAD)
469		return false;
470
471	dev = nh->fib_nh_dev;
472	if (oif) {
473		if (dev->ifindex == oif)
474			return true;
475	} else {
476		if (ipv6_chk_addr(net, saddr, dev,
477				  flags & RT6_LOOKUP_F_IFACE))
478			return true;
479	}
480
481	return false;
482}
483
484struct fib6_nh_dm_arg {
485	struct net		*net;
486	const struct in6_addr	*saddr;
487	int			oif;
488	int			flags;
489	struct fib6_nh		*nh;
490};
491
492static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
493{
494	struct fib6_nh_dm_arg *arg = _arg;
495
496	arg->nh = nh;
497	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
498				  arg->flags);
499}
500
501/* returns fib6_nh from nexthop or NULL */
502static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
503					struct fib6_result *res,
504					const struct in6_addr *saddr,
505					int oif, int flags)
506{
507	struct fib6_nh_dm_arg arg = {
508		.net   = net,
509		.saddr = saddr,
510		.oif   = oif,
511		.flags = flags,
512	};
513
514	if (nexthop_is_blackhole(nh))
515		return NULL;
516
517	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
518		return arg.nh;
519
520	return NULL;
521}
522
523static void rt6_device_match(struct net *net, struct fib6_result *res,
524			     const struct in6_addr *saddr, int oif, int flags)
525{
526	struct fib6_info *f6i = res->f6i;
527	struct fib6_info *spf6i;
528	struct fib6_nh *nh;
529
530	if (!oif && ipv6_addr_any(saddr)) {
531		if (unlikely(f6i->nh)) {
532			nh = nexthop_fib6_nh(f6i->nh);
533			if (nexthop_is_blackhole(f6i->nh))
534				goto out_blackhole;
535		} else {
536			nh = f6i->fib6_nh;
537		}
538		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
539			goto out;
540	}
541
542	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
543		bool matched = false;
544
545		if (unlikely(spf6i->nh)) {
546			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
547					      oif, flags);
548			if (nh)
549				matched = true;
550		} else {
551			nh = spf6i->fib6_nh;
552			if (__rt6_device_match(net, nh, saddr, oif, flags))
553				matched = true;
554		}
555		if (matched) {
556			res->f6i = spf6i;
557			goto out;
558		}
559	}
560
561	if (oif && flags & RT6_LOOKUP_F_IFACE) {
562		res->f6i = net->ipv6.fib6_null_entry;
563		nh = res->f6i->fib6_nh;
564		goto out;
565	}
566
567	if (unlikely(f6i->nh)) {
568		nh = nexthop_fib6_nh(f6i->nh);
569		if (nexthop_is_blackhole(f6i->nh))
570			goto out_blackhole;
571	} else {
572		nh = f6i->fib6_nh;
573	}
574
575	if (nh->fib_nh_flags & RTNH_F_DEAD) {
576		res->f6i = net->ipv6.fib6_null_entry;
577		nh = res->f6i->fib6_nh;
578	}
579out:
580	res->nh = nh;
581	res->fib6_type = res->f6i->fib6_type;
582	res->fib6_flags = res->f6i->fib6_flags;
583	return;
584
585out_blackhole:
586	res->fib6_flags |= RTF_REJECT;
587	res->fib6_type = RTN_BLACKHOLE;
588	res->nh = nh;
589}
590
591#ifdef CONFIG_IPV6_ROUTER_PREF
592struct __rt6_probe_work {
593	struct work_struct work;
594	struct in6_addr target;
595	struct net_device *dev;
596};
597
598static void rt6_probe_deferred(struct work_struct *w)
599{
600	struct in6_addr mcaddr;
601	struct __rt6_probe_work *work =
602		container_of(w, struct __rt6_probe_work, work);
603
604	addrconf_addr_solict_mult(&work->target, &mcaddr);
605	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
606	dev_put(work->dev);
607	kfree(work);
608}
609
610static void rt6_probe(struct fib6_nh *fib6_nh)
611{
612	struct __rt6_probe_work *work = NULL;
613	const struct in6_addr *nh_gw;
614	unsigned long last_probe;
615	struct neighbour *neigh;
616	struct net_device *dev;
617	struct inet6_dev *idev;
618
619	/*
620	 * Okay, this does not seem to be appropriate
621	 * for now; however, we need to check whether it
622	 * really is, aka Router Reachability Probing.
623	 *
624	 * Router Reachability Probe MUST be rate-limited
625	 * to no more than one per minute.
626	 */
627	if (!fib6_nh->fib_nh_gw_family)
628		return;
629
630	nh_gw = &fib6_nh->fib_nh_gw6;
631	dev = fib6_nh->fib_nh_dev;
632	rcu_read_lock_bh();
633	last_probe = READ_ONCE(fib6_nh->last_probe);
634	idev = __in6_dev_get(dev);
635	if (!idev)
636		goto out;
637	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
638	if (neigh) {
639		if (neigh->nud_state & NUD_VALID)
640			goto out;
641
642		write_lock(&neigh->lock);
643		if (!(neigh->nud_state & NUD_VALID) &&
644		    time_after(jiffies,
645			       neigh->updated + idev->cnf.rtr_probe_interval)) {
646			work = kmalloc(sizeof(*work), GFP_ATOMIC);
647			if (work)
648				__neigh_set_probe_once(neigh);
649		}
650		write_unlock(&neigh->lock);
651	} else if (time_after(jiffies, last_probe +
652				       idev->cnf.rtr_probe_interval)) {
653		work = kmalloc(sizeof(*work), GFP_ATOMIC);
654	}
655
656	if (!work || cmpxchg(&fib6_nh->last_probe,
657			     last_probe, jiffies) != last_probe) {
658		kfree(work);
659	} else {
660		INIT_WORK(&work->work, rt6_probe_deferred);
661		work->target = *nh_gw;
662		dev_hold(dev);
663		work->dev = dev;
664		schedule_work(&work->work);
665	}
666
667out:
668	rcu_read_unlock_bh();
669}
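
/* Editor's note: the cmpxchg() on last_probe above is what enforces the
 * rate limit without a dedicated lock: of all CPUs that read the same
 * last_probe value and decide to probe, only the one whose
 * cmpxchg(&fib6_nh->last_probe, last_probe, jiffies) succeeds schedules
 * the deferred neighbour solicitation; the losers free their work item
 * and back off until rtr_probe_interval elapses again.
 */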
670#else
671static inline void rt6_probe(struct fib6_nh *fib6_nh)
672{
673}
674#endif
675
676/*
677 * Default Router Selection (RFC 2461 6.3.6)
678 */
679static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
680{
681	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
682	struct neighbour *neigh;
683
684	rcu_read_lock_bh();
685	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
686					  &fib6_nh->fib_nh_gw6);
687	if (neigh) {
688		read_lock(&neigh->lock);
689		if (neigh->nud_state & NUD_VALID)
690			ret = RT6_NUD_SUCCEED;
691#ifdef CONFIG_IPV6_ROUTER_PREF
692		else if (!(neigh->nud_state & NUD_FAILED))
693			ret = RT6_NUD_SUCCEED;
694		else
695			ret = RT6_NUD_FAIL_PROBE;
696#endif
697		read_unlock(&neigh->lock);
698	} else {
699		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
700		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
701	}
702	rcu_read_unlock_bh();
703
704	return ret;
705}
706
707static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
708			   int strict)
709{
710	int m = 0;
711
712	if (!oif || nh->fib_nh_dev->ifindex == oif)
713		m = 2;
714
715	if (!m && (strict & RT6_LOOKUP_F_IFACE))
716		return RT6_NUD_FAIL_HARD;
717#ifdef CONFIG_IPV6_ROUTER_PREF
718	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
719#endif
720	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
721	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
722		int n = rt6_check_neigh(nh);
723		if (n < 0)
724			return n;
725	}
726	return m;
727}
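
/* Editor's note: a small worked example of the score above, assuming the
 * usual RTF_PREF encoding (IPV6_DECODE_PREF() yields 1:low, 2:medium,
 * 3:high). A route whose device matches oif gets m = 2; a "high" router
 * preference adds 3 << 2, giving 2 | 12 = 14, while a "medium" one gives
 * 2 | 8 = 10, so find_match() prefers the high-preference router. The
 * negative RT6_NUD_* values short-circuit scoring altogether.
 */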
728
729static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
730		       int oif, int strict, int *mpri, bool *do_rr)
731{
732	bool match_do_rr = false;
733	bool rc = false;
734	int m;
735
736	if (nh->fib_nh_flags & RTNH_F_DEAD)
737		goto out;
738
739	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
740	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
741	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
742		goto out;
743
744	m = rt6_score_route(nh, fib6_flags, oif, strict);
745	if (m == RT6_NUD_FAIL_DO_RR) {
746		match_do_rr = true;
747		m = 0; /* lowest valid score */
748	} else if (m == RT6_NUD_FAIL_HARD) {
749		goto out;
750	}
751
752	if (strict & RT6_LOOKUP_F_REACHABLE)
753		rt6_probe(nh);
754
755	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
756	if (m > *mpri) {
757		*do_rr = match_do_rr;
758		*mpri = m;
759		rc = true;
760	}
761out:
762	return rc;
763}
764
765struct fib6_nh_frl_arg {
766	u32		flags;
767	int		oif;
768	int		strict;
769	int		*mpri;
770	bool		*do_rr;
771	struct fib6_nh	*nh;
772};
773
774static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
775{
776	struct fib6_nh_frl_arg *arg = _arg;
777
778	arg->nh = nh;
779	return find_match(nh, arg->flags, arg->oif, arg->strict,
780			  arg->mpri, arg->do_rr);
781}
782
783static void __find_rr_leaf(struct fib6_info *f6i_start,
784			   struct fib6_info *nomatch, u32 metric,
785			   struct fib6_result *res, struct fib6_info **cont,
786			   int oif, int strict, bool *do_rr, int *mpri)
787{
788	struct fib6_info *f6i;
789
790	for (f6i = f6i_start;
791	     f6i && f6i != nomatch;
792	     f6i = rcu_dereference(f6i->fib6_next)) {
793		bool matched = false;
794		struct fib6_nh *nh;
795
796		if (cont && f6i->fib6_metric != metric) {
797			*cont = f6i;
798			return;
799		}
800
801		if (fib6_check_expired(f6i))
802			continue;
803
804		if (unlikely(f6i->nh)) {
805			struct fib6_nh_frl_arg arg = {
806				.flags  = f6i->fib6_flags,
807				.oif    = oif,
808				.strict = strict,
809				.mpri   = mpri,
810				.do_rr  = do_rr
811			};
812
813			if (nexthop_is_blackhole(f6i->nh)) {
814				res->fib6_flags = RTF_REJECT;
815				res->fib6_type = RTN_BLACKHOLE;
816				res->f6i = f6i;
817				res->nh = nexthop_fib6_nh(f6i->nh);
818				return;
819			}
820			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
821						     &arg)) {
822				matched = true;
823				nh = arg.nh;
824			}
825		} else {
826			nh = f6i->fib6_nh;
827			if (find_match(nh, f6i->fib6_flags, oif, strict,
828				       mpri, do_rr))
829				matched = true;
830		}
831		if (matched) {
832			res->f6i = f6i;
833			res->nh = nh;
834			res->fib6_flags = f6i->fib6_flags;
835			res->fib6_type = f6i->fib6_type;
836		}
837	}
838}
839
840static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
841			 struct fib6_info *rr_head, int oif, int strict,
842			 bool *do_rr, struct fib6_result *res)
843{
844	u32 metric = rr_head->fib6_metric;
845	struct fib6_info *cont = NULL;
846	int mpri = -1;
847
848	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
849		       oif, strict, do_rr, &mpri);
850
851	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
852		       oif, strict, do_rr, &mpri);
853
854	if (res->f6i || !cont)
855		return;
856
857	__find_rr_leaf(cont, NULL, metric, res, NULL,
858		       oif, strict, do_rr, &mpri);
859}
860
861static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
862		       struct fib6_result *res, int strict)
863{
864	struct fib6_info *leaf = rcu_dereference(fn->leaf);
865	struct fib6_info *rt0;
866	bool do_rr = false;
867	int key_plen;
868
869	/* make sure this function or its helpers set f6i */
870	res->f6i = NULL;
871
872	if (!leaf || leaf == net->ipv6.fib6_null_entry)
873		goto out;
874
875	rt0 = rcu_dereference(fn->rr_ptr);
876	if (!rt0)
877		rt0 = leaf;
878
879	/* Double check to make sure fn is not an intermediate node
880	 * and fn->leaf does not point to its child's leaf
881	 * (This might happen if all routes under fn are deleted from
882	 * the tree and fib6_repair_tree() is called on the node.)
883	 */
884	key_plen = rt0->fib6_dst.plen;
885#ifdef CONFIG_IPV6_SUBTREES
886	if (rt0->fib6_src.plen)
887		key_plen = rt0->fib6_src.plen;
888#endif
889	if (fn->fn_bit != key_plen)
890		goto out;
891
892	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
893	if (do_rr) {
894		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
895
896		/* no entries matched; do round-robin */
897		if (!next || next->fib6_metric != rt0->fib6_metric)
898			next = leaf;
899
900		if (next != rt0) {
901			spin_lock_bh(&leaf->fib6_table->tb6_lock);
902			/* make sure next is not being deleted from the tree */
903			if (next->fib6_node)
904				rcu_assign_pointer(fn->rr_ptr, next);
905			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
906		}
907	}
908
909out:
910	if (!res->f6i) {
911		res->f6i = net->ipv6.fib6_null_entry;
912		res->nh = res->f6i->fib6_nh;
913		res->fib6_flags = res->f6i->fib6_flags;
914		res->fib6_type = res->f6i->fib6_type;
915	}
916}
917
918static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
919{
920	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
921	       res->nh->fib_nh_gw_family;
922}
923
924#ifdef CONFIG_IPV6_ROUTE_INFO
925int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
926		  const struct in6_addr *gwaddr)
927{
928	struct net *net = dev_net(dev);
929	struct route_info *rinfo = (struct route_info *) opt;
930	struct in6_addr prefix_buf, *prefix;
931	unsigned int pref;
932	unsigned long lifetime;
933	struct fib6_info *rt;
934
935	if (len < sizeof(struct route_info)) {
936		return -EINVAL;
937	}
938
939	/* Sanity check for prefix_len and length */
940	if (rinfo->length > 3) {
941		return -EINVAL;
942	} else if (rinfo->prefix_len > 128) {
943		return -EINVAL;
944	} else if (rinfo->prefix_len > 64) {
945		if (rinfo->length < 2) {
946			return -EINVAL;
947		}
948	} else if (rinfo->prefix_len > 0) {
949		if (rinfo->length < 1) {
950			return -EINVAL;
951		}
952	}
953
954	pref = rinfo->route_pref;
955	if (pref == ICMPV6_ROUTER_PREF_INVALID)
956		return -EINVAL;
957
958	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
959
960	if (rinfo->length == 3)
961		prefix = (struct in6_addr *)rinfo->prefix;
962	else {
963		/* this function is safe */
964		ipv6_addr_prefix(&prefix_buf,
965				 (struct in6_addr *)rinfo->prefix,
966				 rinfo->prefix_len);
967		prefix = &prefix_buf;
968	}
969
970	if (rinfo->prefix_len == 0)
971		rt = rt6_get_dflt_router(net, gwaddr, dev);
972	else
973		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
974					gwaddr, dev);
975
976	if (rt && !lifetime) {
977		ip6_del_rt(net, rt, false);
978		rt = NULL;
979	}
980
981	if (!rt && lifetime)
982		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
983					dev, pref);
984	else if (rt)
985		rt->fib6_flags = RTF_ROUTEINFO |
986				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
987
988	if (rt) {
989		if (!addrconf_finite_timeout(lifetime))
990			fib6_clean_expires(rt);
991		else
992			fib6_set_expires(rt, jiffies + HZ * lifetime);
993
994		fib6_info_release(rt);
995	}
996	return 0;
997}
998#endif
999
1000/*
1001 *	Misc support functions
1002 */
1003
1004/* called with rcu_read_lock() held */
1005static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
1006{
1007	struct net_device *dev = res->nh->fib_nh_dev;
1008
1009	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1010		/* for copies of local routes, dst->dev needs to be the
1011		 * device itself if it is a master device, the master device
1012		 * if the device is enslaved, and the loopback device by default
1013		 */
1014		if (netif_is_l3_slave(dev) &&
1015		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
1016			dev = l3mdev_master_dev_rcu(dev);
1017		else if (!netif_is_l3_master(dev))
1018			dev = dev_net(dev)->loopback_dev;
1019		/* the remaining case is netif_is_l3_master(dev) being true,
1020		 * in which case dev itself is the device we want returned
1021		 */
1022	}
1023
1024	return dev;
1025}
1026
1027static const int fib6_prop[RTN_MAX + 1] = {
1028	[RTN_UNSPEC]	= 0,
1029	[RTN_UNICAST]	= 0,
1030	[RTN_LOCAL]	= 0,
1031	[RTN_BROADCAST]	= 0,
1032	[RTN_ANYCAST]	= 0,
1033	[RTN_MULTICAST]	= 0,
1034	[RTN_BLACKHOLE]	= -EINVAL,
1035	[RTN_UNREACHABLE] = -EHOSTUNREACH,
1036	[RTN_PROHIBIT]	= -EACCES,
1037	[RTN_THROW]	= -EAGAIN,
1038	[RTN_NAT]	= -EINVAL,
1039	[RTN_XRESOLVE]	= -EINVAL,
1040};
1041
1042static int ip6_rt_type_to_error(u8 fib6_type)
1043{
1044	return fib6_prop[fib6_type];
1045}
1046
1047static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
1048{
1049	unsigned short flags = 0;
1050
1051	if (rt->dst_nocount)
1052		flags |= DST_NOCOUNT;
1053	if (rt->dst_nopolicy)
1054		flags |= DST_NOPOLICY;
1055
1056	return flags;
1057}
1058
1059static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
1060{
1061	rt->dst.error = ip6_rt_type_to_error(fib6_type);
1062
1063	switch (fib6_type) {
1064	case RTN_BLACKHOLE:
1065		rt->dst.output = dst_discard_out;
1066		rt->dst.input = dst_discard;
1067		break;
1068	case RTN_PROHIBIT:
1069		rt->dst.output = ip6_pkt_prohibit_out;
1070		rt->dst.input = ip6_pkt_prohibit;
1071		break;
1072	case RTN_THROW:
1073	case RTN_UNREACHABLE:
1074	default:
1075		rt->dst.output = ip6_pkt_discard_out;
1076		rt->dst.input = ip6_pkt_discard;
1077		break;
1078	}
1079}
1080
1081static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
1082{
1083	struct fib6_info *f6i = res->f6i;
1084
1085	if (res->fib6_flags & RTF_REJECT) {
1086		ip6_rt_init_dst_reject(rt, res->fib6_type);
1087		return;
1088	}
1089
1090	rt->dst.error = 0;
1091	rt->dst.output = ip6_output;
1092
1093	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
1094		rt->dst.input = ip6_input;
1095	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
1096		rt->dst.input = ip6_mc_input;
1097	} else {
1098		rt->dst.input = ip6_forward;
1099	}
1100
1101	if (res->nh->fib_nh_lws) {
1102		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
1103		lwtunnel_set_redirect(&rt->dst);
1104	}
1105
1106	rt->dst.lastuse = jiffies;
1107}
1108
1109/* Caller must already hold reference to @from */
1110static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1111{
1112	rt->rt6i_flags &= ~RTF_EXPIRES;
1113	rcu_assign_pointer(rt->from, from);
1114	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1115}
1116
1117/* Caller must already hold reference to f6i in result */
1118static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1119{
1120	const struct fib6_nh *nh = res->nh;
1121	const struct net_device *dev = nh->fib_nh_dev;
1122	struct fib6_info *f6i = res->f6i;
1123
1124	ip6_rt_init_dst(rt, res);
1125
1126	rt->rt6i_dst = f6i->fib6_dst;
1127	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1128	rt->rt6i_flags = res->fib6_flags;
1129	if (nh->fib_nh_gw_family) {
1130		rt->rt6i_gateway = nh->fib_nh_gw6;
1131		rt->rt6i_flags |= RTF_GATEWAY;
1132	}
1133	rt6_set_from(rt, f6i);
1134#ifdef CONFIG_IPV6_SUBTREES
1135	rt->rt6i_src = f6i->fib6_src;
1136#endif
1137}
1138
1139static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1140					struct in6_addr *saddr)
1141{
1142	struct fib6_node *pn, *sn;
1143	while (1) {
1144		if (fn->fn_flags & RTN_TL_ROOT)
1145			return NULL;
1146		pn = rcu_dereference(fn->parent);
1147		sn = FIB6_SUBTREE(pn);
1148		if (sn && sn != fn)
1149			fn = fib6_node_lookup(sn, NULL, saddr);
1150		else
1151			fn = pn;
1152		if (fn->fn_flags & RTN_RTINFO)
1153			return fn;
1154	}
1155}
1156
1157static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1158{
1159	struct rt6_info *rt = *prt;
1160
1161	if (dst_hold_safe(&rt->dst))
1162		return true;
1163	if (net) {
1164		rt = net->ipv6.ip6_null_entry;
1165		dst_hold(&rt->dst);
1166	} else {
1167		rt = NULL;
1168	}
1169	*prt = rt;
1170	return false;
1171}
1172
1173/* called with rcu_read_lock() held */
1174static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1175{
1176	struct net_device *dev = res->nh->fib_nh_dev;
1177	struct fib6_info *f6i = res->f6i;
1178	unsigned short flags;
1179	struct rt6_info *nrt;
1180
1181	if (!fib6_info_hold_safe(f6i))
1182		goto fallback;
1183
1184	flags = fib6_info_dst_flags(f6i);
1185	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1186	if (!nrt) {
1187		fib6_info_release(f6i);
1188		goto fallback;
1189	}
1190
1191	ip6_rt_copy_init(nrt, res);
1192	return nrt;
1193
1194fallback:
1195	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1196	dst_hold(&nrt->dst);
1197	return nrt;
1198}
1199
1200INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
1201					     struct fib6_table *table,
1202					     struct flowi6 *fl6,
1203					     const struct sk_buff *skb,
1204					     int flags)
1205{
1206	struct fib6_result res = {};
1207	struct fib6_node *fn;
1208	struct rt6_info *rt;
1209
1210	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1211		flags &= ~RT6_LOOKUP_F_IFACE;
1212
1213	rcu_read_lock();
1214	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1215restart:
1216	res.f6i = rcu_dereference(fn->leaf);
1217	if (!res.f6i)
1218		res.f6i = net->ipv6.fib6_null_entry;
1219	else
1220		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1221				 flags);
1222
1223	if (res.f6i == net->ipv6.fib6_null_entry) {
1224		fn = fib6_backtrack(fn, &fl6->saddr);
1225		if (fn)
1226			goto restart;
1227
1228		rt = net->ipv6.ip6_null_entry;
1229		dst_hold(&rt->dst);
1230		goto out;
1231	} else if (res.fib6_flags & RTF_REJECT) {
1232		goto do_create;
1233	}
1234
1235	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1236			 fl6->flowi6_oif != 0, skb, flags);
1237
1238	/* Search through exception table */
1239	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1240	if (rt) {
1241		if (ip6_hold_safe(net, &rt))
1242			dst_use_noref(&rt->dst, jiffies);
1243	} else {
1244do_create:
1245		rt = ip6_create_rt_rcu(&res);
1246	}
1247
1248out:
1249	trace_fib6_table_lookup(net, &res, table, fl6);
1250
1251	rcu_read_unlock();
1252
1253	return rt;
1254}
1255
1256struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1257				   const struct sk_buff *skb, int flags)
1258{
1259	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1260}
1261EXPORT_SYMBOL_GPL(ip6_route_lookup);
1262
1263struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1264			    const struct in6_addr *saddr, int oif,
1265			    const struct sk_buff *skb, int strict)
1266{
1267	struct flowi6 fl6 = {
1268		.flowi6_oif = oif,
1269		.daddr = *daddr,
1270	};
1271	struct dst_entry *dst;
1272	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1273
1274	if (saddr) {
1275		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1276		flags |= RT6_LOOKUP_F_HAS_SADDR;
1277	}
1278
1279	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1280	if (dst->error == 0)
1281		return (struct rt6_info *) dst;
1282
1283	dst_release(dst);
1284
1285	return NULL;
1286}
1287EXPORT_SYMBOL(rt6_lookup);
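
/* Editor's note: a minimal caller sketch (hypothetical, for illustration).
 * rt6_lookup() returns a referenced rt6_info or NULL, so a successful
 * lookup must be balanced with a release:
 *
 *	struct rt6_info *rt;
 *
 *	rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *	if (rt) {
 *		// ...use rt->dst.dev, rt->rt6i_gateway, etc...
 *		ip6_rt_put(rt);
 *	}
 */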
1288
1289/* ip6_ins_rt is called with FREE table->tb6_lock.
1290 * It takes a new route entry; if the addition fails for any reason,
1291 * the route is released.
1292 * Caller must hold dst before calling it.
1293 */
1294
1295static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1296			struct netlink_ext_ack *extack)
1297{
1298	int err;
1299	struct fib6_table *table;
1300
1301	table = rt->fib6_table;
1302	spin_lock_bh(&table->tb6_lock);
1303	err = fib6_add(&table->tb6_root, rt, info, extack);
1304	spin_unlock_bh(&table->tb6_lock);
1305
1306	return err;
1307}
1308
1309int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1310{
1311	struct nl_info info = {	.nl_net = net, };
1312
1313	return __ip6_ins_rt(rt, &info, NULL);
1314}
1315
1316static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1317					   const struct in6_addr *daddr,
1318					   const struct in6_addr *saddr)
1319{
1320	struct fib6_info *f6i = res->f6i;
1321	struct net_device *dev;
1322	struct rt6_info *rt;
1323
1324	/*
1325	 *	Clone the route.
1326	 */
1327
1328	if (!fib6_info_hold_safe(f6i))
1329		return NULL;
1330
1331	dev = ip6_rt_get_dev_rcu(res);
1332	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1333	if (!rt) {
1334		fib6_info_release(f6i);
1335		return NULL;
1336	}
1337
1338	ip6_rt_copy_init(rt, res);
1339	rt->rt6i_flags |= RTF_CACHE;
1340	rt->rt6i_dst.addr = *daddr;
1341	rt->rt6i_dst.plen = 128;
1342
1343	if (!rt6_is_gw_or_nonexthop(res)) {
1344		if (f6i->fib6_dst.plen != 128 &&
1345		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1346			rt->rt6i_flags |= RTF_ANYCAST;
1347#ifdef CONFIG_IPV6_SUBTREES
1348		if (rt->rt6i_src.plen && saddr) {
1349			rt->rt6i_src.addr = *saddr;
1350			rt->rt6i_src.plen = 128;
1351		}
1352#endif
1353	}
1354
1355	return rt;
1356}
1357
1358static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1359{
1360	struct fib6_info *f6i = res->f6i;
1361	unsigned short flags = fib6_info_dst_flags(f6i);
1362	struct net_device *dev;
1363	struct rt6_info *pcpu_rt;
1364
1365	if (!fib6_info_hold_safe(f6i))
1366		return NULL;
1367
1368	rcu_read_lock();
1369	dev = ip6_rt_get_dev_rcu(res);
1370	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
1371	rcu_read_unlock();
1372	if (!pcpu_rt) {
1373		fib6_info_release(f6i);
1374		return NULL;
1375	}
1376	ip6_rt_copy_init(pcpu_rt, res);
1377	pcpu_rt->rt6i_flags |= RTF_PCPU;
1378
1379	if (f6i->nh)
1380		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
1381
1382	return pcpu_rt;
1383}
1384
1385static bool rt6_is_valid(const struct rt6_info *rt6)
1386{
1387	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
1388}
1389
1390/* It should be called with rcu_read_lock() acquired */
1391static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1392{
1393	struct rt6_info *pcpu_rt;
1394
1395	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
1396
1397	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
1398		struct rt6_info *prev, **p;
1399
1400		p = this_cpu_ptr(res->nh->rt6i_pcpu);
1401		/* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
1402		prev = xchg(p, NULL);
1403		if (prev) {
1404			dst_dev_put(&prev->dst);
1405			dst_release(&prev->dst);
1406		}
1407
1408		pcpu_rt = NULL;
1409	}
1410
1411	return pcpu_rt;
1412}
1413
1414static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1415					    const struct fib6_result *res)
1416{
1417	struct rt6_info *pcpu_rt, *prev, **p;
1418
1419	pcpu_rt = ip6_rt_pcpu_alloc(res);
1420	if (!pcpu_rt)
1421		return NULL;
1422
1423	p = this_cpu_ptr(res->nh->rt6i_pcpu);
1424	prev = cmpxchg(p, NULL, pcpu_rt);
1425	BUG_ON(prev);
1426
1427	if (res->f6i->fib6_destroying) {
1428		struct fib6_info *from;
1429
1430		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1431		fib6_info_release(from);
1432	}
1433
1434	return pcpu_rt;
1435}
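
/* Editor's note: the cmpxchg(p, NULL, pcpu_rt) above publishes the new
 * per-CPU dst only if the slot is still empty. BUG_ON(prev) holds because
 * this path is reached only after rt6_get_pcpu_route() observed a NULL
 * slot and runs with bottom halves disabled (see ip6_pol_route()), so
 * writers to this CPU's slot are serialized. The fib6_destroying re-check
 * afterwards drops the 'from' reference if the fib6_info began dying
 * while the pcpu route was being created.
 */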
1436
1437/* exception hash table implementation
1438 */
1439static DEFINE_SPINLOCK(rt6_exception_lock);
1440
1441/* Remove rt6_ex from hash table and free the memory
1442 * Caller must hold rt6_exception_lock
1443 */
1444static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1445				 struct rt6_exception *rt6_ex)
1446{
1447	struct fib6_info *from;
1448	struct net *net;
1449
1450	if (!bucket || !rt6_ex)
1451		return;
1452
1453	net = dev_net(rt6_ex->rt6i->dst.dev);
1454	net->ipv6.rt6_stats->fib_rt_cache--;
1455
1456	/* completely purge the exception to allow releasing the held resources:
1457	 * some [sk] cache may keep the dst around for an unlimited time
1458	 */
1459	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1460	fib6_info_release(from);
1461	dst_dev_put(&rt6_ex->rt6i->dst);
1462
1463	hlist_del_rcu(&rt6_ex->hlist);
1464	dst_release(&rt6_ex->rt6i->dst);
1465	kfree_rcu(rt6_ex, rcu);
1466	WARN_ON_ONCE(!bucket->depth);
1467	bucket->depth--;
1468}
1469
1470/* Remove oldest rt6_ex in bucket and free the memory
1471 * Caller must hold rt6_exception_lock
1472 */
1473static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1474{
1475	struct rt6_exception *rt6_ex, *oldest = NULL;
1476
1477	if (!bucket)
1478		return;
1479
1480	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1481		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1482			oldest = rt6_ex;
1483	}
1484	rt6_remove_exception(bucket, oldest);
1485}
1486
1487static u32 rt6_exception_hash(const struct in6_addr *dst,
1488			      const struct in6_addr *src)
1489{
1490	static siphash_key_t rt6_exception_key __read_mostly;
1491	struct {
1492		struct in6_addr dst;
1493		struct in6_addr src;
1494	} __aligned(SIPHASH_ALIGNMENT) combined = {
1495		.dst = *dst,
1496	};
1497	u64 val;
1498
1499	net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
1500
1501#ifdef CONFIG_IPV6_SUBTREES
1502	if (src)
1503		combined.src = *src;
1504#endif
1505	val = siphash(&combined, sizeof(combined), &rt6_exception_key);
1506
1507	return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1508}
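
/* Editor's note: the hashing scheme above, in outline. A keyed siphash over
 * the (dst, src) pair is folded down to the bucket index:
 *
 *	idx = hash_64(siphash(dst|src, key), FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
 *
 * Because the per-boot key is random, remote senders cannot predict which
 * bucket a destination lands in; together with the randomized max depth in
 * rt6_insert_exception() this hardens the exception table against
 * hash-flooding and side-channel probing.
 */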
1509
1510/* Helper function to find the cached rt in the hash table
1511 * and update bucket pointer to point to the bucket for this
1512 * (daddr, saddr) pair
1513 * Caller must hold rt6_exception_lock
1514 */
1515static struct rt6_exception *
1516__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1517			      const struct in6_addr *daddr,
1518			      const struct in6_addr *saddr)
1519{
1520	struct rt6_exception *rt6_ex;
1521	u32 hval;
1522
1523	if (!(*bucket) || !daddr)
1524		return NULL;
1525
1526	hval = rt6_exception_hash(daddr, saddr);
1527	*bucket += hval;
1528
1529	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1530		struct rt6_info *rt6 = rt6_ex->rt6i;
1531		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1532
1533#ifdef CONFIG_IPV6_SUBTREES
1534		if (matched && saddr)
1535			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1536#endif
1537		if (matched)
1538			return rt6_ex;
1539	}
1540	return NULL;
1541}
1542
1543/* Helper function to find the cached rt in the hash table
1544 * and update bucket pointer to point to the bucket for this
1545 * (daddr, saddr) pair
1546 * Caller must hold rcu_read_lock()
1547 */
1548static struct rt6_exception *
1549__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1550			 const struct in6_addr *daddr,
1551			 const struct in6_addr *saddr)
1552{
1553	struct rt6_exception *rt6_ex;
1554	u32 hval;
1555
1556	WARN_ON_ONCE(!rcu_read_lock_held());
1557
1558	if (!(*bucket) || !daddr)
1559		return NULL;
1560
1561	hval = rt6_exception_hash(daddr, saddr);
1562	*bucket += hval;
1563
1564	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1565		struct rt6_info *rt6 = rt6_ex->rt6i;
1566		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1567
1568#ifdef CONFIG_IPV6_SUBTREES
1569		if (matched && saddr)
1570			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1571#endif
1572		if (matched)
1573			return rt6_ex;
1574	}
1575	return NULL;
1576}
1577
1578static unsigned int fib6_mtu(const struct fib6_result *res)
1579{
1580	const struct fib6_nh *nh = res->nh;
1581	unsigned int mtu;
1582
1583	if (res->f6i->fib6_pmtu) {
1584		mtu = res->f6i->fib6_pmtu;
1585	} else {
1586		struct net_device *dev = nh->fib_nh_dev;
1587		struct inet6_dev *idev;
1588
1589		rcu_read_lock();
1590		idev = __in6_dev_get(dev);
1591		mtu = idev->cnf.mtu6;
1592		rcu_read_unlock();
1593	}
1594
1595	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1596
1597	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1598}
1599
1600#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL
1601
1602/* used when only access to the bucket is needed and the flushed bit is
1603 * not relevant (i.e., all bucket users except rt6_insert_exception);
1604 *
1605 * called under rcu lock; sometimes called with rt6_exception_lock held
1606 */
1607static
1608struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
1609						       spinlock_t *lock)
1610{
1611	struct rt6_exception_bucket *bucket;
1612
1613	if (lock)
1614		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1615						   lockdep_is_held(lock));
1616	else
1617		bucket = rcu_dereference(nh->rt6i_exception_bucket);
1618
1619	/* remove bucket flushed bit if set */
1620	if (bucket) {
1621		unsigned long p = (unsigned long)bucket;
1622
1623		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
1624		bucket = (struct rt6_exception_bucket *)p;
1625	}
1626
1627	return bucket;
1628}
1629
1630static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
1631{
1632	unsigned long p = (unsigned long)bucket;
1633
1634	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
1635}
1636
1637/* called with rt6_exception_lock held */
1638static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
1639					      spinlock_t *lock)
1640{
1641	struct rt6_exception_bucket *bucket;
1642	unsigned long p;
1643
1644	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1645					   lockdep_is_held(lock));
1646
1647	p = (unsigned long)bucket;
1648	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
1649	bucket = (struct rt6_exception_bucket *)p;
1650	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1651}
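
/* Editor's note: the three helpers above implement a tagged pointer. The
 * bucket array is at least word-aligned, so bit 0 of the pointer is free
 * to serve as the "flushed" flag:
 *
 *	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;		// mark flushed
 *	flushed = p & FIB6_EXCEPTION_BUCKET_FLUSHED;	// test
 *	bucket = p & ~FIB6_EXCEPTION_BUCKET_FLUSHED;	// recover pointer
 *
 * A flush can thus mark a nexthop dead-for-exceptions without an extra
 * field, and rt6_insert_exception() refuses to repopulate a bucket whose
 * pointer carries the flag.
 */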
1652
1653static int rt6_insert_exception(struct rt6_info *nrt,
1654				const struct fib6_result *res)
1655{
1656	struct net *net = dev_net(nrt->dst.dev);
1657	struct rt6_exception_bucket *bucket;
1658	struct fib6_info *f6i = res->f6i;
1659	struct in6_addr *src_key = NULL;
1660	struct rt6_exception *rt6_ex;
1661	struct fib6_nh *nh = res->nh;
1662	int max_depth;
1663	int err = 0;
1664
1665	spin_lock_bh(&rt6_exception_lock);
1666
1667	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1668					  lockdep_is_held(&rt6_exception_lock));
1669	if (!bucket) {
1670		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1671				 GFP_ATOMIC);
1672		if (!bucket) {
1673			err = -ENOMEM;
1674			goto out;
1675		}
1676		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1677	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
1678		err = -EINVAL;
1679		goto out;
1680	}
1681
1682#ifdef CONFIG_IPV6_SUBTREES
1683	/* fib6_src.plen != 0 indicates f6i is in subtree
1684	 * and exception table is indexed by a hash of
1685	 * both fib6_dst and fib6_src.
1686	 * Otherwise, the exception table is indexed by
1687	 * a hash of only fib6_dst.
1688	 */
1689	if (f6i->fib6_src.plen)
1690		src_key = &nrt->rt6i_src.addr;
1691#endif
1692	/* rt6_mtu_change() might lower mtu on f6i.
1693	 * Only insert this exception route if its mtu
1694	 * is less than f6i's mtu value.
1695	 */
1696	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1697		err = -EINVAL;
1698		goto out;
1699	}
1700
1701	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1702					       src_key);
1703	if (rt6_ex)
1704		rt6_remove_exception(bucket, rt6_ex);
1705
1706	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1707	if (!rt6_ex) {
1708		err = -ENOMEM;
1709		goto out;
1710	}
1711	rt6_ex->rt6i = nrt;
1712	rt6_ex->stamp = jiffies;
1713	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1714	bucket->depth++;
1715	net->ipv6.rt6_stats->fib_rt_cache++;
1716
1717	/* Randomize max depth to avoid some side-channel attacks. */
1718	max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH);
1719	while (bucket->depth > max_depth)
1720		rt6_exception_remove_oldest(bucket);
1721
1722out:
1723	spin_unlock_bh(&rt6_exception_lock);
1724
1725	/* Update fn->fn_sernum to invalidate all cached dst */
1726	if (!err) {
1727		spin_lock_bh(&f6i->fib6_table->tb6_lock);
1728		fib6_update_sernum(net, f6i);
1729		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1730		fib6_force_start_gc(net);
1731	}
1732
1733	return err;
1734}
1735
1736static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
1737{
1738	struct rt6_exception_bucket *bucket;
1739	struct rt6_exception *rt6_ex;
1740	struct hlist_node *tmp;
1741	int i;
1742
1743	spin_lock_bh(&rt6_exception_lock);
1744
1745	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1746	if (!bucket)
1747		goto out;
1748
1749	/* Prevent rt6_insert_exception() from recreating the bucket list */
1750	if (!from)
1751		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
1752
1753	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1754		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
1755			if (!from ||
1756			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
1757				rt6_remove_exception(bucket, rt6_ex);
1758		}
1759		WARN_ON_ONCE(!from && bucket->depth);
1760		bucket++;
1761	}
1762out:
1763	spin_unlock_bh(&rt6_exception_lock);
1764}
1765
1766static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
1767{
1768	struct fib6_info *f6i = arg;
1769
1770	fib6_nh_flush_exceptions(nh, f6i);
1771
1772	return 0;
1773}
1774
1775void rt6_flush_exceptions(struct fib6_info *f6i)
1776{
1777	if (f6i->nh)
1778		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
1779					 f6i);
1780	else
1781		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
1782}
1783
1784/* Find the cached rt in the hash table inside the passed-in rt
1785 * Caller has to hold rcu_read_lock()
1786 */
1787static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1788					   const struct in6_addr *daddr,
1789					   const struct in6_addr *saddr)
1790{
1791	const struct in6_addr *src_key = NULL;
1792	struct rt6_exception_bucket *bucket;
1793	struct rt6_exception *rt6_ex;
1794	struct rt6_info *ret = NULL;
1795
1796#ifdef CONFIG_IPV6_SUBTREES
1797	/* fib6_src.plen != 0 indicates f6i is in subtree
1798	 * and exception table is indexed by a hash of
1799	 * both fib6_dst and fib6_src.
1800	 * However, the src addr used to create the hash
1801	 * might not be exactly the passed in saddr which
1802	 * is a /128 addr from the flow.
1803	 * So we need to use f6i->fib6_src to redo lookup
1804	 * if the passed in saddr does not find anything.
1805	 * (See the logic in ip6_rt_cache_alloc() on how
1806	 * rt->rt6i_src is updated.)
1807	 */
1808	if (res->f6i->fib6_src.plen)
1809		src_key = saddr;
1810find_ex:
1811#endif
1812	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
1813	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1814
1815	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1816		ret = rt6_ex->rt6i;
1817
1818#ifdef CONFIG_IPV6_SUBTREES
1819	/* Use fib6_src as src_key and redo lookup */
1820	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1821		src_key = &res->f6i->fib6_src.addr;
1822		goto find_ex;
1823	}
1824#endif
1825
1826	return ret;
1827}
1828
1829/* Remove the passed-in cached rt from the hash table that contains it */
1830static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
1831				    const struct rt6_info *rt)
1832{
1833	const struct in6_addr *src_key = NULL;
1834	struct rt6_exception_bucket *bucket;
1835	struct rt6_exception *rt6_ex;
1836	int err;
1837
1838	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
1839		return -ENOENT;
1840
1841	spin_lock_bh(&rt6_exception_lock);
1842	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1843
1844#ifdef CONFIG_IPV6_SUBTREES
1845	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1846	 * and exception table is indexed by a hash of
1847	 * both rt6i_dst and rt6i_src.
1848	 * Otherwise, the exception table is indexed by
1849	 * a hash of only rt6i_dst.
1850	 */
1851	if (plen)
1852		src_key = &rt->rt6i_src.addr;
1853#endif
1854	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1855					       &rt->rt6i_dst.addr,
1856					       src_key);
1857	if (rt6_ex) {
1858		rt6_remove_exception(bucket, rt6_ex);
1859		err = 0;
1860	} else {
1861		err = -ENOENT;
1862	}
1863
1864	spin_unlock_bh(&rt6_exception_lock);
1865	return err;
1866}
1867
1868struct fib6_nh_excptn_arg {
1869	struct rt6_info	*rt;
1870	int		plen;
1871};
1872
1873static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
1874{
1875	struct fib6_nh_excptn_arg *arg = _arg;
1876	int err;
1877
1878	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
1879	if (err == 0)
1880		return 1;
1881
1882	return 0;
1883}
1884
1885static int rt6_remove_exception_rt(struct rt6_info *rt)
1886{
1887	struct fib6_info *from;
1888
1889	from = rcu_dereference(rt->from);
1890	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1891		return -EINVAL;
1892
1893	if (from->nh) {
1894		struct fib6_nh_excptn_arg arg = {
1895			.rt = rt,
1896			.plen = from->fib6_src.plen
1897		};
1898		int rc;
1899
1900		/* rc = 1 means an entry was found */
1901		rc = nexthop_for_each_fib6_nh(from->nh,
1902					      rt6_nh_remove_exception_rt,
1903					      &arg);
1904		return rc ? 0 : -ENOENT;
1905	}
1906
1907	return fib6_nh_remove_exception(from->fib6_nh,
1908					from->fib6_src.plen, rt);
1909}
1910
1911/* Find the rt6_ex which contains the passed-in rt cache and
1912 * refresh its stamp
1913 */
1914static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
1915				     const struct rt6_info *rt)
1916{
1917	const struct in6_addr *src_key = NULL;
1918	struct rt6_exception_bucket *bucket;
1919	struct rt6_exception *rt6_ex;
1920
1921	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
1922#ifdef CONFIG_IPV6_SUBTREES
1923	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1924	 * and exception table is indexed by a hash of
1925	 * both rt6i_dst and rt6i_src.
1926	 * Otherwise, the exception table is indexed by
1927	 * a hash of only rt6i_dst.
1928	 */
1929	if (plen)
1930		src_key = &rt->rt6i_src.addr;
1931#endif
1932	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
1933	if (rt6_ex)
1934		rt6_ex->stamp = jiffies;
1935}
1936
1937struct fib6_nh_match_arg {
1938	const struct net_device *dev;
1939	const struct in6_addr	*gw;
1940	struct fib6_nh		*match;
1941};
1942
1943/* determine if fib6_nh has the given device and gateway */
1944static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
1945{
1946	struct fib6_nh_match_arg *arg = _arg;
1947
1948	if (arg->dev != nh->fib_nh_dev ||
1949	    (arg->gw && !nh->fib_nh_gw_family) ||
1950	    (!arg->gw && nh->fib_nh_gw_family) ||
1951	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
1952		return 0;
1953
1954	arg->match = nh;
1955
1956	/* found a match, break the loop */
1957	return 1;
1958}
1959
1960static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1961{
1962	struct fib6_info *from;
1963	struct fib6_nh *fib6_nh;
1964
1965	rcu_read_lock();
1966
1967	from = rcu_dereference(rt->from);
1968	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1969		goto unlock;
1970
1971	if (from->nh) {
1972		struct fib6_nh_match_arg arg = {
1973			.dev = rt->dst.dev,
1974			.gw = &rt->rt6i_gateway,
1975		};
1976
1977		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
1978
1979		if (!arg.match)
1980			goto unlock;
1981		fib6_nh = arg.match;
1982	} else {
1983		fib6_nh = from->fib6_nh;
1984	}
1985	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
1986unlock:
1987	rcu_read_unlock();
1988}
1989
1990static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1991					 struct rt6_info *rt, int mtu)
1992{
1993	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1994	 * lowest MTU in the path: always allow updating the route PMTU to
1995	 * reflect PMTU decreases.
1996	 *
1997	 * If the new MTU is higher, and the route PMTU is equal to the local
1998	 * MTU, this means the old MTU is the lowest in the path, so allow
1999	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
2000	 * handle this.
2001	 */
2002
2003	if (dst_mtu(&rt->dst) >= mtu)
2004		return true;
2005
2006	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
2007		return true;
2008
2009	return false;
2010}
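
/* Editor's note: a numeric example of the rule above (values assumed).
 * Take a cached PMTU of 1400 on a link whose mtu6 is 1500:
 *
 *	- link MTU drops to 1280: 1400 >= 1280, update allowed (decrease);
 *	- link MTU rises to 9000: 1400 < 9000 and 1400 != 1500, refused,
 *	  since some other hop - not this link - imposed the 1400 limit;
 *	- had the cached PMTU been 1500 (== mtu6), the raise would be
 *	  allowed and PMTU discovery would rediscover any lower remote limit.
 */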
2011
2012static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
2013				       const struct fib6_nh *nh, int mtu)
2014{
2015	struct rt6_exception_bucket *bucket;
2016	struct rt6_exception *rt6_ex;
2017	int i;
2018
2019	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2020	if (!bucket)
2021		return;
2022
2023	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2024		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
2025			struct rt6_info *entry = rt6_ex->rt6i;
2026
2027			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
2028			 * route), the metrics of its rt->from have already
2029			 * been updated.
2030			 */
2031			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
2032			    rt6_mtu_change_route_allowed(idev, entry, mtu))
2033				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
2034		}
2035		bucket++;
2036	}
2037}
2038
2039#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2040
2041static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
2042					    const struct in6_addr *gateway)
2043{
2044	struct rt6_exception_bucket *bucket;
2045	struct rt6_exception *rt6_ex;
2046	struct hlist_node *tmp;
2047	int i;
2048
2049	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2050		return;
2051
2052	spin_lock_bh(&rt6_exception_lock);
2053	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2054	if (bucket) {
2055		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2056			hlist_for_each_entry_safe(rt6_ex, tmp,
2057						  &bucket->chain, hlist) {
2058				struct rt6_info *entry = rt6_ex->rt6i;
2059
2060				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
2061				    RTF_CACHE_GATEWAY &&
2062				    ipv6_addr_equal(gateway,
2063						    &entry->rt6i_gateway)) {
2064					rt6_remove_exception(bucket, rt6_ex);
2065				}
2066			}
2067			bucket++;
2068		}
2069	}
2070
2071	spin_unlock_bh(&rt6_exception_lock);
2072}
2073
2074static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
2075				      struct rt6_exception *rt6_ex,
2076				      struct fib6_gc_args *gc_args,
2077				      unsigned long now)
2078{
2079	struct rt6_info *rt = rt6_ex->rt6i;
2080
2081	/* We prune and obsolete aged-out and non-gateway exceptions even
2082	 * if others still hold references to them, so that on the next
2083	 * dst_check() such references can be dropped.
2084	 * RTF_EXPIRES exceptions (e.g. PMTU-generated ones) are pruned
2085	 * once expired, independently of their aging, per RFC 8201 section 4.
2086	 */
2087	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
2088		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
2089			RT6_TRACE("aging clone %p\n", rt);
2090			rt6_remove_exception(bucket, rt6_ex);
2091			return;
2092		}
2093	} else if (time_after(jiffies, rt->dst.expires)) {
2094		RT6_TRACE("purging expired route %p\n", rt);
2095		rt6_remove_exception(bucket, rt6_ex);
2096		return;
2097	}
2098
2099	if (rt->rt6i_flags & RTF_GATEWAY) {
2100		struct neighbour *neigh;
2101		__u8 neigh_flags = 0;
2102
2103		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
2104		if (neigh)
2105			neigh_flags = neigh->flags;
2106
2107		if (!(neigh_flags & NTF_ROUTER)) {
2108			RT6_TRACE("purging route %p via non-router but gateway\n",
2109				  rt);
2110			rt6_remove_exception(bucket, rt6_ex);
2111			return;
2112		}
2113	}
2114
2115	gc_args->more++;
2116}
2117
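/* Walk @nh's exception buckets, letting rt6_age_examine_exception()
 * prune expired and aged-out entries; surviving entries are counted
 * in gc_args->more.
 */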
2118static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
2119				   struct fib6_gc_args *gc_args,
2120				   unsigned long now)
2121{
2122	struct rt6_exception_bucket *bucket;
2123	struct rt6_exception *rt6_ex;
2124	struct hlist_node *tmp;
2125	int i;
2126
2127	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2128		return;
2129
2130	rcu_read_lock_bh();
2131	spin_lock(&rt6_exception_lock);
2132	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2133	if (bucket) {
2134		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2135			hlist_for_each_entry_safe(rt6_ex, tmp,
2136						  &bucket->chain, hlist) {
2137				rt6_age_examine_exception(bucket, rt6_ex,
2138							  gc_args, now);
2139			}
2140			bucket++;
2141		}
2142	}
2143	spin_unlock(&rt6_exception_lock);
2144	rcu_read_unlock_bh();
2145}
2146
2147struct fib6_nh_age_excptn_arg {
2148	struct fib6_gc_args	*gc_args;
2149	unsigned long		now;
2150};
2151
2152static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
2153{
2154	struct fib6_nh_age_excptn_arg *arg = _arg;
2155
2156	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
2157	return 0;
2158}
2159
2160void rt6_age_exceptions(struct fib6_info *f6i,
2161			struct fib6_gc_args *gc_args,
2162			unsigned long now)
2163{
2164	if (f6i->nh) {
2165		struct fib6_nh_age_excptn_arg arg = {
2166			.gc_args = gc_args,
2167			.now = now
2168		};
2169
2170		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
2171					 &arg);
2172	} else {
2173		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
2174	}
2175}
2176
2177/* must be called with rcu lock held */
2178int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
2179		      struct flowi6 *fl6, struct fib6_result *res, int strict)
2180{
2181	struct fib6_node *fn, *saved_fn;
2182
2183	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2184	saved_fn = fn;
2185
2186	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2187		oif = 0;
2188
2189redo_rt6_select:
2190	rt6_select(net, fn, oif, res, strict);
2191	if (res->f6i == net->ipv6.fib6_null_entry) {
2192		fn = fib6_backtrack(fn, &fl6->saddr);
2193		if (fn)
2194			goto redo_rt6_select;
2195		else if (strict & RT6_LOOKUP_F_REACHABLE) {
2196			/* also consider unreachable route */
2197			strict &= ~RT6_LOOKUP_F_REACHABLE;
2198			fn = saved_fn;
2199			goto redo_rt6_select;
2200		}
2201	}
2202
2203	trace_fib6_table_lookup(net, res, table, fl6);
2204
2205	return 0;
2206}
2207
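/* Core policy-routing lookup.  After the fib lookup and multipath
 * selection, the result is (in order of preference) a cached exception
 * route, a one-off RTF_CACHE clone for the FLOWI_FLAG_KNOWN_NH case
 * (kept on the uncached list), or the per-cpu copy of the fib entry.
 */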
2208struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
2209			       int oif, struct flowi6 *fl6,
2210			       const struct sk_buff *skb, int flags)
2211{
2212	struct fib6_result res = {};
2213	struct rt6_info *rt = NULL;
2214	int strict = 0;
2215
2216	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
2217		     !rcu_read_lock_held());
2218
2219	strict |= flags & RT6_LOOKUP_F_IFACE;
2220	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
2221	if (net->ipv6.devconf_all->forwarding == 0)
2222		strict |= RT6_LOOKUP_F_REACHABLE;
2223
2224	rcu_read_lock();
2225
2226	fib6_table_lookup(net, table, oif, fl6, &res, strict);
2227	if (res.f6i == net->ipv6.fib6_null_entry)
2228		goto out;
2229
2230	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
2231
2232	/* Search through exception table */
2233	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
2234	if (rt) {
2235		goto out;
2236	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
2237			    !res.nh->fib_nh_gw_family)) {
2238		/* Create an RTF_CACHE clone which will not be
2239		 * owned by the fib6 tree.  It is for the special case where
2240		 * the daddr in the skb during the neighbor look-up is different
2241		 * from the fl6->daddr used to look up the route here.
2242		 */
2243		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
2244
2245		if (rt) {
2246			/* 1 refcnt is taken during ip6_rt_cache_alloc().
2247			 * As rt6_uncached_list_add() does not consume refcnt,
2248			 * this refcnt is always returned to the caller even
2249			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
2250			 */
2251			rt6_uncached_list_add(rt);
2252			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2253			rcu_read_unlock();
2254
2255			return rt;
2256		}
2257	} else {
2258		/* Get a percpu copy */
2259		local_bh_disable();
2260		rt = rt6_get_pcpu_route(&res);
2261
2262		if (!rt)
2263			rt = rt6_make_pcpu_route(net, &res);
2264
2265		local_bh_enable();
2266	}
2267out:
2268	if (!rt)
2269		rt = net->ipv6.ip6_null_entry;
2270	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
2271		ip6_hold_safe(net, &rt);
2272	rcu_read_unlock();
2273
2274	return rt;
2275}
2276EXPORT_SYMBOL_GPL(ip6_pol_route);
2277
2278INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
2279					    struct fib6_table *table,
2280					    struct flowi6 *fl6,
2281					    const struct sk_buff *skb,
2282					    int flags)
2283{
2284	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
2285}
2286
2287struct dst_entry *ip6_route_input_lookup(struct net *net,
2288					 struct net_device *dev,
2289					 struct flowi6 *fl6,
2290					 const struct sk_buff *skb,
2291					 int flags)
2292{
2293	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
2294		flags |= RT6_LOOKUP_F_IFACE;
2295
2296	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
2297}
2298EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
2299
2300static void ip6_multipath_l3_keys(const struct sk_buff *skb,
2301				  struct flow_keys *keys,
2302				  struct flow_keys *flkeys)
2303{
2304	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
2305	const struct ipv6hdr *key_iph = outer_iph;
2306	struct flow_keys *_flkeys = flkeys;
2307	const struct ipv6hdr *inner_iph;
2308	const struct icmp6hdr *icmph;
2309	struct ipv6hdr _inner_iph;
2310	struct icmp6hdr _icmph;
2311
2312	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2313		goto out;
2314
2315	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2316				   sizeof(_icmph), &_icmph);
2317	if (!icmph)
2318		goto out;
2319
2320	if (!icmpv6_is_err(icmph->icmp6_type))
2321		goto out;
2322
2323	inner_iph = skb_header_pointer(skb,
2324				       skb_transport_offset(skb) + sizeof(*icmph),
2325				       sizeof(_inner_iph), &_inner_iph);
2326	if (!inner_iph)
2327		goto out;
2328
2329	key_iph = inner_iph;
2330	_flkeys = NULL;
2331out:
2332	if (_flkeys) {
2333		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2334		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2335		keys->tags.flow_label = _flkeys->tags.flow_label;
2336		keys->basic.ip_proto = _flkeys->basic.ip_proto;
2337	} else {
2338		keys->addrs.v6addrs.src = key_iph->saddr;
2339		keys->addrs.v6addrs.dst = key_iph->daddr;
2340		keys->tags.flow_label = ip6_flowlabel(key_iph);
2341		keys->basic.ip_proto = key_iph->nexthdr;
2342	}
2343}
2344
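/* Hash policies, per net.ipv6.fib_multipath_hash_policy:
 * 0 - L3 (addresses, flow label, protocol; the inner header is used
 *     for ICMPv6 errors),
 * 1 - L4 five-tuple,
 * 2 - L3 of the inner packet when one can be dissected, otherwise as
 *     policy 0.
 */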
2345/* if skb is set it will be used and fl6 can be NULL */
2346u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2347		       const struct sk_buff *skb, struct flow_keys *flkeys)
2348{
2349	struct flow_keys hash_keys;
2350	u32 mhash;
2351
2352	switch (ip6_multipath_hash_policy(net)) {
2353	case 0:
2354		memset(&hash_keys, 0, sizeof(hash_keys));
2355		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2356		if (skb) {
2357			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2358		} else {
2359			hash_keys.addrs.v6addrs.src = fl6->saddr;
2360			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2361			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2362			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2363		}
2364		break;
2365	case 1:
2366		if (skb) {
2367			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2368			struct flow_keys keys;
2369
2370			/* short-circuit if we already have L4 hash present */
2371			if (skb->l4_hash)
2372				return skb_get_hash_raw(skb) >> 1;
2373
2374			memset(&hash_keys, 0, sizeof(hash_keys));
2375
2376			if (!flkeys) {
2377				skb_flow_dissect_flow_keys(skb, &keys, flag);
2378				flkeys = &keys;
2379			}
2380			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2381			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2382			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2383			hash_keys.ports.src = flkeys->ports.src;
2384			hash_keys.ports.dst = flkeys->ports.dst;
2385			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2386		} else {
2387			memset(&hash_keys, 0, sizeof(hash_keys));
2388			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2389			hash_keys.addrs.v6addrs.src = fl6->saddr;
2390			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2391			hash_keys.ports.src = fl6->fl6_sport;
2392			hash_keys.ports.dst = fl6->fl6_dport;
2393			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2394		}
2395		break;
2396	case 2:
2397		memset(&hash_keys, 0, sizeof(hash_keys));
2398		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2399		if (skb) {
2400			struct flow_keys keys;
2401
2402			if (!flkeys) {
2403				skb_flow_dissect_flow_keys(skb, &keys, 0);
2404				flkeys = &keys;
2405			}
2406
2407			/* Inner can be v4 or v6 */
2408			if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2409				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2410				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2411				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2412			} else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2413				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2414				hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2415				hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2416				hash_keys.tags.flow_label = flkeys->tags.flow_label;
2417				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2418			} else {
2419				/* Same as case 0 */
2420				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2421				ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2422			}
2423		} else {
2424			/* Same as case 0 */
2425			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2426			hash_keys.addrs.v6addrs.src = fl6->saddr;
2427			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2428			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2429			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2430		}
2431		break;
2432	}
2433	mhash = flow_hash_from_keys(&hash_keys);
2434
2435	return mhash >> 1;
2436}
2437
2438/* Called with rcu held */
2439void ip6_route_input(struct sk_buff *skb)
2440{
2441	const struct ipv6hdr *iph = ipv6_hdr(skb);
2442	struct net *net = dev_net(skb->dev);
2443	int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
2444	struct ip_tunnel_info *tun_info;
2445	struct flowi6 fl6 = {
2446		.flowi6_iif = skb->dev->ifindex,
2447		.daddr = iph->daddr,
2448		.saddr = iph->saddr,
2449		.flowlabel = ip6_flowinfo(iph),
2450		.flowi6_mark = skb->mark,
2451		.flowi6_proto = iph->nexthdr,
2452	};
2453	struct flow_keys *flkeys = NULL, _flkeys;
2454
2455	tun_info = skb_tunnel_info(skb);
2456	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2457		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2458
2459	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2460		flkeys = &_flkeys;
2461
2462	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2463		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2464	skb_dst_drop(skb);
2465	skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
2466						      &fl6, skb, flags));
2467}
2468
2469INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
2470					     struct fib6_table *table,
2471					     struct flowi6 *fl6,
2472					     const struct sk_buff *skb,
2473					     int flags)
2474{
2475	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2476}
2477
2478struct dst_entry *ip6_route_output_flags_noref(struct net *net,
2479					       const struct sock *sk,
2480					       struct flowi6 *fl6, int flags)
2481{
2482	bool any_src;
2483
2484	if (ipv6_addr_type(&fl6->daddr) &
2485	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2486		struct dst_entry *dst;
2487
2488		/* This function does not take a refcnt on the dst */
2489		dst = l3mdev_link_scope_lookup(net, fl6);
2490		if (dst)
2491			return dst;
2492	}
2493
2494	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2495
2496	flags |= RT6_LOOKUP_F_DST_NOREF;
2497	any_src = ipv6_addr_any(&fl6->saddr);
2498	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2499	    (fl6->flowi6_oif && any_src))
2500		flags |= RT6_LOOKUP_F_IFACE;
2501
2502	if (!any_src)
2503		flags |= RT6_LOOKUP_F_HAS_SADDR;
2504	else if (sk)
2505		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2506
2507	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2508}
2509EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
2510
2511struct dst_entry *ip6_route_output_flags(struct net *net,
2512					 const struct sock *sk,
2513					 struct flowi6 *fl6,
2514					 int flags)
2515{
2516	struct dst_entry *dst;
2517	struct rt6_info *rt6;
2518
2519	rcu_read_lock();
2520	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
2521	rt6 = (struct rt6_info *)dst;
2522	/* For dst cached in uncached_list, refcnt is already taken. */
2523	if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
2524		dst = &net->ipv6.ip6_null_entry->dst;
2525		dst_hold(dst);
2526	}
2527	rcu_read_unlock();
2528
2529	return dst;
2530}
2531EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2532
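/* Clone dst_orig into a blackhole dst: the copy keeps the original
 * metrics, gateway and addresses but discards every packet via
 * dst_discard{,_out}.  The reference on dst_orig is consumed.
 */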
2533struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2534{
2535	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2536	struct net_device *loopback_dev = net->loopback_dev;
2537	struct dst_entry *new = NULL;
2538
2539	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2540		       DST_OBSOLETE_DEAD, 0);
2541	if (rt) {
2542		rt6_info_init(rt);
2543		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2544
2545		new = &rt->dst;
2546		new->__use = 1;
2547		new->input = dst_discard;
2548		new->output = dst_discard_out;
2549
2550		dst_copy_metrics(new, &ort->dst);
2551
2552		rt->rt6i_idev = in6_dev_get(loopback_dev);
2553		rt->rt6i_gateway = ort->rt6i_gateway;
2554		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2555
2556		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2557#ifdef CONFIG_IPV6_SUBTREES
2558		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2559#endif
2560	}
2561
2562	dst_release(dst_orig);
2563	return new ? new : ERR_PTR(-ENOMEM);
2564}
2565
2566/*
2567 *	Destination cache support functions
2568 */
2569
2570static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2571{
2572	u32 rt_cookie = 0;
2573
2574	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2575		return false;
2576
2577	if (fib6_check_expired(f6i))
2578		return false;
2579
2580	return true;
2581}
2582
2583static struct dst_entry *rt6_check(struct rt6_info *rt,
2584				   struct fib6_info *from,
2585				   u32 cookie)
2586{
2587	u32 rt_cookie = 0;
2588
2589	if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
2590	    rt_cookie != cookie)
2591		return NULL;
2592
2593	if (rt6_check_expired(rt))
2594		return NULL;
2595
2596	return &rt->dst;
2597}
2598
2599static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2600					    struct fib6_info *from,
2601					    u32 cookie)
2602{
2603	if (!__rt6_check_expired(rt) &&
2604	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2605	    fib6_check(from, cookie))
2606		return &rt->dst;
2607	else
2608		return NULL;
2609}
2610
2611static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2612{
2613	struct dst_entry *dst_ret;
2614	struct fib6_info *from;
2615	struct rt6_info *rt;
2616
2617	rt = container_of(dst, struct rt6_info, dst);
2618
2619	if (rt->sernum)
2620		return rt6_is_valid(rt) ? dst : NULL;
2621
2622	rcu_read_lock();
2623
2624	/* All IPv6 dsts are created with ->obsolete set to the value
2625	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
2626	 * down into this function.
2627	 */
2628
2629	from = rcu_dereference(rt->from);
2630
2631	if (from && (rt->rt6i_flags & RTF_PCPU ||
2632	    unlikely(!list_empty(&rt->rt6i_uncached))))
2633		dst_ret = rt6_dst_from_check(rt, from, cookie);
2634	else
2635		dst_ret = rt6_check(rt, from, cookie);
2636
2637	rcu_read_unlock();
2638
2639	return dst_ret;
2640}
2641
2642static void ip6_negative_advice(struct sock *sk,
2643				struct dst_entry *dst)
2644{
2645	struct rt6_info *rt = (struct rt6_info *) dst;
2646
2647	if (rt->rt6i_flags & RTF_CACHE) {
2648		rcu_read_lock();
2649		if (rt6_check_expired(rt)) {
2650			/* counteract the dst_release() in sk_dst_reset() */
2651			dst_hold(dst);
2652			sk_dst_reset(sk);
2653
2654			rt6_remove_exception_rt(rt);
2655		}
2656		rcu_read_unlock();
2657		return;
2658	}
2659	sk_dst_reset(sk);
2660}
2661
2662static void ip6_link_failure(struct sk_buff *skb)
2663{
2664	struct rt6_info *rt;
2665
2666	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2667
2668	rt = (struct rt6_info *) skb_dst(skb);
2669	if (rt) {
2670		rcu_read_lock();
2671		if (rt->rt6i_flags & RTF_CACHE) {
2672			rt6_remove_exception_rt(rt);
2673		} else {
2674			struct fib6_info *from;
2675			struct fib6_node *fn;
2676
2677			from = rcu_dereference(rt->from);
2678			if (from) {
2679				fn = rcu_dereference(from->fib6_node);
2680				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2681					WRITE_ONCE(fn->fn_sernum, -1);
2682			}
2683		}
2684		rcu_read_unlock();
2685	}
2686}
2687
2688static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2689{
2690	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2691		struct fib6_info *from;
2692
2693		rcu_read_lock();
2694		from = rcu_dereference(rt0->from);
2695		if (from)
2696			rt0->dst.expires = from->expires;
2697		rcu_read_unlock();
2698	}
2699
2700	dst_set_expires(&rt0->dst, timeout);
2701	rt0->rt6i_flags |= RTF_EXPIRES;
2702}
2703
2704static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2705{
2706	struct net *net = dev_net(rt->dst.dev);
2707
2708	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2709	rt->rt6i_flags |= RTF_MODIFIED;
2710	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2711}
2712
2713static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2714{
2715	return !(rt->rt6i_flags & RTF_CACHE) &&
2716		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2717}
2718
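/* Handle a PMTU update for dst.  Routes that cannot take a cached
 * clone (already RTF_CACHE, or detached from a fib entry) are updated
 * in place; otherwise an RTF_CACHE clone carrying the reduced MTU is
 * allocated and inserted into the exception table of the matching
 * fib6_nh.
 */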
2719static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2720				 const struct ipv6hdr *iph, u32 mtu,
2721				 bool confirm_neigh)
2722{
2723	const struct in6_addr *daddr, *saddr;
2724	struct rt6_info *rt6 = (struct rt6_info *)dst;
2725
2726	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU).
2727	 * IPv6 PMTU discovery isn't optional, so 'mtu lock' cannot disable it.
2728	 * [see also comment in rt6_mtu_change_route()]
2729	 */
2730
2731	if (iph) {
2732		daddr = &iph->daddr;
2733		saddr = &iph->saddr;
2734	} else if (sk) {
2735		daddr = &sk->sk_v6_daddr;
2736		saddr = &inet6_sk(sk)->saddr;
2737	} else {
2738		daddr = NULL;
2739		saddr = NULL;
2740	}
2741
2742	if (confirm_neigh)
2743		dst_confirm_neigh(dst, daddr);
2744
2745	if (mtu < IPV6_MIN_MTU)
2746		return;
2747	if (mtu >= dst_mtu(dst))
2748		return;
2749
2750	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2751		rt6_do_update_pmtu(rt6, mtu);
2752		/* update rt6_ex->stamp for cache */
2753		if (rt6->rt6i_flags & RTF_CACHE)
2754			rt6_update_exception_stamp_rt(rt6);
2755	} else if (daddr) {
2756		struct fib6_result res = {};
2757		struct rt6_info *nrt6;
2758
2759		rcu_read_lock();
2760		res.f6i = rcu_dereference(rt6->from);
2761		if (!res.f6i)
2762			goto out_unlock;
2763
2764		res.fib6_flags = res.f6i->fib6_flags;
2765		res.fib6_type = res.f6i->fib6_type;
2766
2767		if (res.f6i->nh) {
2768			struct fib6_nh_match_arg arg = {
2769				.dev = dst->dev,
2770				.gw = &rt6->rt6i_gateway,
2771			};
2772
2773			nexthop_for_each_fib6_nh(res.f6i->nh,
2774						 fib6_nh_find_match, &arg);
2775
2776			/* The fib6_info uses a nexthop that has no fib6_nh
2777			 * matching dst->dev + gw. This should be impossible.
2778			 */
2779			if (!arg.match)
2780				goto out_unlock;
2781
2782			res.nh = arg.match;
2783		} else {
2784			res.nh = res.f6i->fib6_nh;
2785		}
2786
2787		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2788		if (nrt6) {
2789			rt6_do_update_pmtu(nrt6, mtu);
2790			if (rt6_insert_exception(nrt6, &res))
2791				dst_release_immediate(&nrt6->dst);
2792		}
2793out_unlock:
2794		rcu_read_unlock();
2795	}
2796}
2797
2798static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2799			       struct sk_buff *skb, u32 mtu,
2800			       bool confirm_neigh)
2801{
2802	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
2803			     confirm_neigh);
2804}
2805
2806void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2807		     int oif, u32 mark, kuid_t uid)
2808{
2809	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2810	struct dst_entry *dst;
2811	struct flowi6 fl6 = {
2812		.flowi6_oif = oif,
2813		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2814		.daddr = iph->daddr,
2815		.saddr = iph->saddr,
2816		.flowlabel = ip6_flowinfo(iph),
2817		.flowi6_uid = uid,
2818	};
2819
2820	dst = ip6_route_output(net, NULL, &fl6);
2821	if (!dst->error)
2822		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
2823	dst_release(dst);
2824}
2825EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2826
2827void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2828{
2829	int oif = sk->sk_bound_dev_if;
2830	struct dst_entry *dst;
2831
2832	if (!oif && skb->dev)
2833		oif = l3mdev_master_ifindex(skb->dev);
2834
2835	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2836
2837	dst = __sk_dst_get(sk);
2838	if (!dst || !dst->obsolete ||
2839	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2840		return;
2841
2842	bh_lock_sock(sk);
2843	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2844		ip6_datagram_dst_update(sk, false);
2845	bh_unlock_sock(sk);
2846}
2847EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2848
2849void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2850			   const struct flowi6 *fl6)
2851{
2852#ifdef CONFIG_IPV6_SUBTREES
2853	struct ipv6_pinfo *np = inet6_sk(sk);
2854#endif
2855
2856	ip6_dst_store(sk, dst,
2857		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2858		      &sk->sk_v6_daddr : NULL,
2859#ifdef CONFIG_IPV6_SUBTREES
2860		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2861		      &np->saddr :
2862#endif
2863		      NULL);
2864}
2865
2866static bool ip6_redirect_nh_match(const struct fib6_result *res,
2867				  struct flowi6 *fl6,
2868				  const struct in6_addr *gw,
2869				  struct rt6_info **ret)
2870{
2871	const struct fib6_nh *nh = res->nh;
2872
2873	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2874	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2875		return false;
2876
2877	/* The rt_cache's gateway might be different from its 'parent'
2878	 * in the case of an IP redirect.
2879	 * So we keep searching in the exception table if the gateway
2880	 * is different.
2881	 */
2882	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2883		struct rt6_info *rt_cache;
2884
2885		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2886		if (rt_cache &&
2887		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2888			*ret = rt_cache;
2889			return true;
2890		}
2891		return false;
2892	}
2893	return true;
2894}
2895
2896struct fib6_nh_rd_arg {
2897	struct fib6_result	*res;
2898	struct flowi6		*fl6;
2899	const struct in6_addr	*gw;
2900	struct rt6_info		**ret;
2901};
2902
2903static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
2904{
2905	struct fib6_nh_rd_arg *arg = _arg;
2906
2907	arg->res->nh = nh;
2908	return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
2909}
2910
2911/* Handle redirects */
2912struct ip6rd_flowi {
2913	struct flowi6 fl6;
2914	struct in6_addr gateway;
2915};
2916
2917INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
2918					     struct fib6_table *table,
2919					     struct flowi6 *fl6,
2920					     const struct sk_buff *skb,
2921					     int flags)
2922{
2923	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2924	struct rt6_info *ret = NULL;
2925	struct fib6_result res = {};
2926	struct fib6_nh_rd_arg arg = {
2927		.res = &res,
2928		.fl6 = fl6,
2929		.gw  = &rdfl->gateway,
2930		.ret = &ret
2931	};
2932	struct fib6_info *rt;
2933	struct fib6_node *fn;
2934
2935	/* l3mdev_update_flow overrides oif if the device is enslaved; in
2936	 * this case we must match on the real ingress device, so reset it
2937	 */
2938	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2939		fl6->flowi6_oif = skb->dev->ifindex;
2940
2941	/* Get the "current" route for this destination and
2942	 * check if the redirect has come from an appropriate router.
2943	 *
2944	 * RFC 4861 specifies that redirects should only be
2945	 * accepted if they come from the nexthop to the target.
2946	 * Due to the way the routes are chosen, this notion
2947	 * is a bit fuzzy and one might need to check all possible
2948	 * routes.
2949	 */
2950
2951	rcu_read_lock();
2952	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2953restart:
2954	for_each_fib6_node_rt_rcu(fn) {
2955		res.f6i = rt;
2956		if (fib6_check_expired(rt))
2957			continue;
2958		if (rt->fib6_flags & RTF_REJECT)
2959			break;
2960		if (unlikely(rt->nh)) {
2961			if (nexthop_is_blackhole(rt->nh))
2962				continue;
2963			/* on match, res->nh is filled in and potentially ret */
2964			if (nexthop_for_each_fib6_nh(rt->nh,
2965						     fib6_nh_redirect_match,
2966						     &arg))
2967				goto out;
2968		} else {
2969			res.nh = rt->fib6_nh;
2970			if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
2971						  &ret))
2972				goto out;
2973		}
2974	}
2975
2976	if (!rt)
2977		rt = net->ipv6.fib6_null_entry;
2978	else if (rt->fib6_flags & RTF_REJECT) {
2979		ret = net->ipv6.ip6_null_entry;
2980		goto out;
2981	}
2982
2983	if (rt == net->ipv6.fib6_null_entry) {
2984		fn = fib6_backtrack(fn, &fl6->saddr);
2985		if (fn)
2986			goto restart;
2987	}
2988
2989	res.f6i = rt;
2990	res.nh = rt->fib6_nh;
2991out:
2992	if (ret) {
2993		ip6_hold_safe(net, &ret);
2994	} else {
2995		res.fib6_flags = res.f6i->fib6_flags;
2996		res.fib6_type = res.f6i->fib6_type;
2997		ret = ip6_create_rt_rcu(&res);
2998	}
2999
3000	rcu_read_unlock();
3001
3002	trace_fib6_table_lookup(net, &res, table, fl6);
3003	return ret;
3004};
3005
3006static struct dst_entry *ip6_route_redirect(struct net *net,
3007					    const struct flowi6 *fl6,
3008					    const struct sk_buff *skb,
3009					    const struct in6_addr *gateway)
3010{
3011	int flags = RT6_LOOKUP_F_HAS_SADDR;
3012	struct ip6rd_flowi rdfl;
3013
3014	rdfl.fl6 = *fl6;
3015	rdfl.gateway = *gateway;
3016
3017	return fib6_rule_lookup(net, &rdfl.fl6, skb,
3018				flags, __ip6_route_redirect);
3019}
3020
3021void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
3022		  kuid_t uid)
3023{
3024	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
3025	struct dst_entry *dst;
3026	struct flowi6 fl6 = {
3027		.flowi6_iif = LOOPBACK_IFINDEX,
3028		.flowi6_oif = oif,
3029		.flowi6_mark = mark,
3030		.daddr = iph->daddr,
3031		.saddr = iph->saddr,
3032		.flowlabel = ip6_flowinfo(iph),
3033		.flowi6_uid = uid,
3034	};
3035
3036	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
3037	rt6_do_redirect(dst, NULL, skb);
3038	dst_release(dst);
3039}
3040EXPORT_SYMBOL_GPL(ip6_redirect);
3041
3042void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
3043{
3044	const struct ipv6hdr *iph = ipv6_hdr(skb);
3045	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
3046	struct dst_entry *dst;
3047	struct flowi6 fl6 = {
3048		.flowi6_iif = LOOPBACK_IFINDEX,
3049		.flowi6_oif = oif,
3050		.daddr = msg->dest,
3051		.saddr = iph->daddr,
3052		.flowi6_uid = sock_net_uid(net, NULL),
3053	};
3054
3055	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
3056	rt6_do_redirect(dst, NULL, skb);
3057	dst_release(dst);
3058}
3059
3060void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
3061{
3062	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
3063		     sk->sk_uid);
3064}
3065EXPORT_SYMBOL_GPL(ip6_sk_redirect);
3066
3067static unsigned int ip6_default_advmss(const struct dst_entry *dst)
3068{
3069	struct net_device *dev = dst->dev;
3070	unsigned int mtu = dst_mtu(dst);
3071	struct net *net = dev_net(dev);
3072
3073	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
3074
3075	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
3076		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
3077
3078	/*
3079	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
3080	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
3081	 * IPV6_MAXPLEN is also valid and means: "any MSS,
3082	 * rely only on PMTU discovery"
3083	 */
3084	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
3085		mtu = IPV6_MAXPLEN;
3086	return mtu;
3087}
3088
3089static unsigned int ip6_mtu(const struct dst_entry *dst)
3090{
3091	struct inet6_dev *idev;
3092	unsigned int mtu;
3093
3094	mtu = dst_metric_raw(dst, RTAX_MTU);
3095	if (mtu)
3096		goto out;
3097
3098	mtu = IPV6_MIN_MTU;
3099
3100	rcu_read_lock();
3101	idev = __in6_dev_get(dst->dev);
3102	if (idev)
3103		mtu = idev->cnf.mtu6;
3104	rcu_read_unlock();
3105
3106out:
3107	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
3108
3109	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
3110}
3111
3112/* MTU selection:
3113 * 1. mtu on route is locked - use it
3114 * 2. mtu from nexthop exception
3115 * 3. mtu from egress device
3116 *
3117 * based on ip6_dst_mtu_forward and exception logic of
3118 * rt6_find_cached_rt; called with rcu_read_lock
3119 */
3120u32 ip6_mtu_from_fib6(const struct fib6_result *res,
3121		      const struct in6_addr *daddr,
3122		      const struct in6_addr *saddr)
3123{
3124	const struct fib6_nh *nh = res->nh;
3125	struct fib6_info *f6i = res->f6i;
3126	struct inet6_dev *idev;
3127	struct rt6_info *rt;
3128	u32 mtu = 0;
3129
3130	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
3131		mtu = f6i->fib6_pmtu;
3132		if (mtu)
3133			goto out;
3134	}
3135
3136	rt = rt6_find_cached_rt(res, daddr, saddr);
3137	if (unlikely(rt)) {
3138		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
3139	} else {
3140		struct net_device *dev = nh->fib_nh_dev;
3141
3142		mtu = IPV6_MIN_MTU;
3143		idev = __in6_dev_get(dev);
3144		if (idev && idev->cnf.mtu6 > mtu)
3145			mtu = idev->cnf.mtu6;
3146	}
3147
3148	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
3149out:
3150	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
3151}
3152
3153struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
3154				  struct flowi6 *fl6)
3155{
3156	struct dst_entry *dst;
3157	struct rt6_info *rt;
3158	struct inet6_dev *idev = in6_dev_get(dev);
3159	struct net *net = dev_net(dev);
3160
3161	if (unlikely(!idev))
3162		return ERR_PTR(-ENODEV);
3163
3164	rt = ip6_dst_alloc(net, dev, 0);
3165	if (unlikely(!rt)) {
3166		in6_dev_put(idev);
3167		dst = ERR_PTR(-ENOMEM);
3168		goto out;
3169	}
3170
3171	rt->dst.input = ip6_input;
3172	rt->dst.output  = ip6_output;
3173	rt->rt6i_gateway  = fl6->daddr;
3174	rt->rt6i_dst.addr = fl6->daddr;
3175	rt->rt6i_dst.plen = 128;
3176	rt->rt6i_idev     = idev;
3177	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
3178
3179	/* Add this dst into uncached_list so that rt6_disable_ip() can
3180	 * properly release the net_device.
3181	 */
3182	rt6_uncached_list_add(rt);
3183	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
3184
3185	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
3186
3187out:
3188	return dst;
3189}
3190
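/* dst_ops garbage collector.  A full fib6_run_gc() pass is rate
 * limited by ip6_rt_gc_min_interval; ip6_rt_gc_expire grows with every
 * pass, is reset to half of ip6_rt_gc_timeout once the table drops
 * below gc_thresh again, and decays by 1/2^ip6_rt_gc_elasticity on
 * each invocation.
 */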
3191static void ip6_dst_gc(struct dst_ops *ops)
3192{
3193	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
3194	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
3195	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
3196	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
3197	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
3198	unsigned int val;
3199	int entries;
3200
3201	entries = dst_entries_get_fast(ops);
3202	if (entries > ops->gc_thresh)
3203		entries = dst_entries_get_slow(ops);
3204
3205	if (time_after(rt_last_gc + rt_min_interval, jiffies))
3206		goto out;
3207
3208	fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
3209	entries = dst_entries_get_slow(ops);
3210	if (entries < ops->gc_thresh)
3211		atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
3212out:
3213	val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
3214	atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
3215}
3216
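/* Look up @gw_addr in table @tbid; used when validating the gateway of
 * a route being added.  On success, @res describes the route that
 * would be used to reach the gateway.
 */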
3217static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
3218			       const struct in6_addr *gw_addr, u32 tbid,
3219			       int flags, struct fib6_result *res)
3220{
3221	struct flowi6 fl6 = {
3222		.flowi6_oif = cfg->fc_ifindex,
3223		.daddr = *gw_addr,
3224		.saddr = cfg->fc_prefsrc,
3225	};
3226	struct fib6_table *table;
3227	int err;
3228
3229	table = fib6_get_table(net, tbid);
3230	if (!table)
3231		return -EINVAL;
3232
3233	if (!ipv6_addr_any(&cfg->fc_prefsrc))
3234		flags |= RT6_LOOKUP_F_HAS_SADDR;
3235
3236	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
3237
3238	err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
3239	if (!err && res->f6i != net->ipv6.fib6_null_entry)
3240		fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
3241				 cfg->fc_ifindex != 0, NULL, flags);
3242
3243	return err;
3244}
3245
3246static int ip6_route_check_nh_onlink(struct net *net,
3247				     struct fib6_config *cfg,
3248				     const struct net_device *dev,
3249				     struct netlink_ext_ack *extack)
3250{
3251	u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
3252	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3253	struct fib6_result res = {};
3254	int err;
3255
3256	err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
3257	if (!err && !(res.fib6_flags & RTF_REJECT) &&
3258	    /* ignore match if it is the default route */
3259	    !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
3260	    (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
3261		NL_SET_ERR_MSG(extack,
3262			       "Nexthop has invalid gateway or device mismatch");
3263		err = -EINVAL;
3264	}
3265
3266	return err;
3267}
3268
3269static int ip6_route_check_nh(struct net *net,
3270			      struct fib6_config *cfg,
3271			      struct net_device **_dev,
3272			      struct inet6_dev **idev)
3273{
3274	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3275	struct net_device *dev = _dev ? *_dev : NULL;
3276	int flags = RT6_LOOKUP_F_IFACE;
3277	struct fib6_result res = {};
3278	int err = -EHOSTUNREACH;
3279
3280	if (cfg->fc_table) {
3281		err = ip6_nh_lookup_table(net, cfg, gw_addr,
3282					  cfg->fc_table, flags, &res);
3283		/* The route to gw_addr must not require a gateway itself or be
3284		 * a reject route. If a device is given, it must match the result.
3285		 */
3286		if (err || res.fib6_flags & RTF_REJECT ||
3287		    res.nh->fib_nh_gw_family ||
3288		    (dev && dev != res.nh->fib_nh_dev))
3289			err = -EHOSTUNREACH;
3290	}
3291
3292	if (err < 0) {
3293		struct flowi6 fl6 = {
3294			.flowi6_oif = cfg->fc_ifindex,
3295			.daddr = *gw_addr,
3296		};
3297
3298		err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
3299		if (err || res.fib6_flags & RTF_REJECT ||
3300		    res.nh->fib_nh_gw_family)
3301			err = -EHOSTUNREACH;
3302
3303		if (err)
3304			return err;
3305
3306		fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
3307				 cfg->fc_ifindex != 0, NULL, flags);
3308	}
3309
3310	err = 0;
3311	if (dev) {
3312		if (dev != res.nh->fib_nh_dev)
3313			err = -EHOSTUNREACH;
3314	} else {
3315		*_dev = dev = res.nh->fib_nh_dev;
3316		dev_hold(dev);
3317		*idev = in6_dev_get(dev);
3318	}
3319
3320	return err;
3321}
3322
3323static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
3324			   struct net_device **_dev, struct inet6_dev **idev,
3325			   struct netlink_ext_ack *extack)
3326{
3327	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3328	int gwa_type = ipv6_addr_type(gw_addr);
3329	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
3330	const struct net_device *dev = *_dev;
3331	bool need_addr_check = !dev;
3332	int err = -EINVAL;
3333
3334	/* If gw_addr is local we will fail to detect this in case the
3335	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
3336	 * will return the already-added prefix route via the interface
3337	 * that the prefix route was assigned to, which might be non-loopback.
3338	 */
3339	if (dev &&
3340	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3341		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3342		goto out;
3343	}
3344
3345	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
3346		/* IPv6 strictly inhibits using non-link-local
3347		 * addresses as nexthop addresses.
3348		 * Otherwise, the router will not be able to send redirects.
3349		 * It is very good, but in some (rare!) circumstances
3350		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
3351		 * some exceptions. --ANK
3352		 * We allow IPv4-mapped nexthops to support RFC 4798-style
3353		 * addressing.
3354		 */
3355		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
3356			NL_SET_ERR_MSG(extack, "Invalid gateway address");
3357			goto out;
3358		}
3359
3360		rcu_read_lock();
3361
3362		if (cfg->fc_flags & RTNH_F_ONLINK)
3363			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
3364		else
3365			err = ip6_route_check_nh(net, cfg, _dev, idev);
3366
3367		rcu_read_unlock();
3368
3369		if (err)
3370			goto out;
3371	}
3372
3373	/* reload in case device was changed */
3374	dev = *_dev;
3375
3376	err = -EINVAL;
3377	if (!dev) {
3378		NL_SET_ERR_MSG(extack, "Egress device not specified");
3379		goto out;
3380	} else if (dev->flags & IFF_LOOPBACK) {
3381		NL_SET_ERR_MSG(extack,
3382			       "Egress device can not be loopback device for this route");
3383		goto out;
3384	}
3385
3386	/* if we did not check gw_addr above, do so now that the
3387	 * egress device has been resolved.
3388	 */
3389	if (need_addr_check &&
3390	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3391		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3392		goto out;
3393	}
3394
3395	err = 0;
3396out:
3397	return err;
3398}
3399
3400static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
3401{
3402	if ((flags & RTF_REJECT) ||
3403	    (dev && (dev->flags & IFF_LOOPBACK) &&
3404	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3405	     !(flags & (RTF_ANYCAST | RTF_LOCAL))))
3406		return true;
3407
3408	return false;
3409}
3410
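/* Resolve and validate the nexthop described by @cfg: take references
 * on the egress device and its inet6_dev, validate the gateway when
 * RTF_GATEWAY is set, initialise lwtunnel state and allocate the
 * per-cpu route cache.  On error, all acquired references are dropped.
 */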
3411int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
3412		 struct fib6_config *cfg, gfp_t gfp_flags,
3413		 struct netlink_ext_ack *extack)
3414{
3415	struct net_device *dev = NULL;
3416	struct inet6_dev *idev = NULL;
3417	int addr_type;
3418	int err;
3419
3420	fib6_nh->fib_nh_family = AF_INET6;
3421#ifdef CONFIG_IPV6_ROUTER_PREF
3422	fib6_nh->last_probe = jiffies;
3423#endif
3424	if (cfg->fc_is_fdb) {
3425		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3426		fib6_nh->fib_nh_gw_family = AF_INET6;
3427		return 0;
3428	}
3429
3430	err = -ENODEV;
3431	if (cfg->fc_ifindex) {
3432		dev = dev_get_by_index(net, cfg->fc_ifindex);
3433		if (!dev)
3434			goto out;
3435		idev = in6_dev_get(dev);
3436		if (!idev)
3437			goto out;
3438	}
3439
3440	if (cfg->fc_flags & RTNH_F_ONLINK) {
3441		if (!dev) {
3442			NL_SET_ERR_MSG(extack,
3443				       "Nexthop device required for onlink");
3444			goto out;
3445		}
3446
3447		if (!(dev->flags & IFF_UP)) {
3448			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3449			err = -ENETDOWN;
3450			goto out;
3451		}
3452
3453		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3454	}
3455
3456	fib6_nh->fib_nh_weight = 1;
3457
3458	/* We cannot add true routes via loopback here, as they
3459	 * would result in kernel looping; promote them to reject routes.
3460	 */
3461	addr_type = ipv6_addr_type(&cfg->fc_dst);
3462	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3463		/* hold loopback dev/idev if we haven't done so. */
3464		if (dev != net->loopback_dev) {
3465			if (dev) {
3466				dev_put(dev);
3467				in6_dev_put(idev);
3468			}
3469			dev = net->loopback_dev;
3470			dev_hold(dev);
3471			idev = in6_dev_get(dev);
3472			if (!idev) {
3473				err = -ENODEV;
3474				goto out;
3475			}
3476		}
3477		goto pcpu_alloc;
3478	}
3479
3480	if (cfg->fc_flags & RTF_GATEWAY) {
3481		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3482		if (err)
3483			goto out;
3484
3485		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3486		fib6_nh->fib_nh_gw_family = AF_INET6;
3487	}
3488
3489	err = -ENODEV;
3490	if (!dev)
3491		goto out;
3492
3493	if (!idev || idev->cnf.disable_ipv6) {
3494		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3495		err = -EACCES;
3496		goto out;
3497	}
3498
3499	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3500		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3501		err = -ENETDOWN;
3502		goto out;
3503	}
3504
3505	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3506	    !netif_carrier_ok(dev))
3507		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3508
3509	err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
3510				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3511	if (err)
3512		goto out;
3513
3514pcpu_alloc:
3515	fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
3516	if (!fib6_nh->rt6i_pcpu) {
3517		err = -ENOMEM;
3518		goto out;
3519	}
3520
3521	fib6_nh->fib_nh_dev = dev;
3522	fib6_nh->fib_nh_oif = dev->ifindex;
3523	err = 0;
3524out:
3525	if (idev)
3526		in6_dev_put(idev);
3527
3528	if (err) {
3529		lwtstate_put(fib6_nh->fib_nh_lws);
3530		fib6_nh->fib_nh_lws = NULL;
3531		if (dev)
3532			dev_put(dev);
3533	}
3534
3535	return err;
3536}
3537
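/* Tear down everything fib6_nh_init() set up: flush and free the
 * exception bucket, release the per-cpu route cache and drop the
 * device and lwtunnel references via fib_nh_common_release().
 */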
3538void fib6_nh_release(struct fib6_nh *fib6_nh)
3539{
3540	struct rt6_exception_bucket *bucket;
3541
3542	rcu_read_lock();
3543
3544	fib6_nh_flush_exceptions(fib6_nh, NULL);
3545	bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
3546	if (bucket) {
3547		rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
3548		kfree(bucket);
3549	}
3550
3551	rcu_read_unlock();
3552
3553	if (fib6_nh->rt6i_pcpu) {
3554		int cpu;
3555
3556		for_each_possible_cpu(cpu) {
3557			struct rt6_info **ppcpu_rt;
3558			struct rt6_info *pcpu_rt;
3559
3560			ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3561			pcpu_rt = *ppcpu_rt;
3562			if (pcpu_rt) {
3563				dst_dev_put(&pcpu_rt->dst);
3564				dst_release(&pcpu_rt->dst);
3565				*ppcpu_rt = NULL;
3566			}
3567		}
3568
3569		free_percpu(fib6_nh->rt6i_pcpu);
3570	}
3571
3572	fib_nh_common_release(&fib6_nh->nh_common);
3573}
3574
3575void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
3576{
3577	int cpu;
3578
3579	if (!fib6_nh->rt6i_pcpu)
3580		return;
3581
3582	for_each_possible_cpu(cpu) {
3583		struct rt6_info *pcpu_rt, **ppcpu_rt;
3584
3585		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3586		pcpu_rt = xchg(ppcpu_rt, NULL);
3587		if (pcpu_rt) {
3588			dst_dev_put(&pcpu_rt->dst);
3589			dst_release(&pcpu_rt->dst);
3590		}
3591	}
3592}
3593
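/* Validate @cfg and build a fully initialised fib6_info from it.  The
 * entry is not linked into a table yet; ip6_route_add() does that via
 * __ip6_ins_rt().
 */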
3594static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3595					      gfp_t gfp_flags,
3596					      struct netlink_ext_ack *extack)
3597{
3598	struct net *net = cfg->fc_nlinfo.nl_net;
3599	struct fib6_info *rt = NULL;
3600	struct nexthop *nh = NULL;
3601	struct fib6_table *table;
3602	struct fib6_nh *fib6_nh;
3603	int err = -EINVAL;
3604	int addr_type;
3605
3606	/* RTF_PCPU is an internal flag; can not be set by userspace */
3607	if (cfg->fc_flags & RTF_PCPU) {
3608		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3609		goto out;
3610	}
3611
3612	/* RTF_CACHE is an internal flag; can not be set by userspace */
3613	if (cfg->fc_flags & RTF_CACHE) {
3614		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3615		goto out;
3616	}
3617
3618	if (cfg->fc_type > RTN_MAX) {
3619		NL_SET_ERR_MSG(extack, "Invalid route type");
3620		goto out;
3621	}
3622
3623	if (cfg->fc_dst_len > 128) {
3624		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3625		goto out;
3626	}
3627	if (cfg->fc_src_len > 128) {
3628		NL_SET_ERR_MSG(extack, "Invalid source address length");
3629		goto out;
3630	}
3631#ifndef CONFIG_IPV6_SUBTREES
3632	if (cfg->fc_src_len) {
3633		NL_SET_ERR_MSG(extack,
3634			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3635		goto out;
3636	}
3637#endif
3638	if (cfg->fc_nh_id) {
3639		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
3640		if (!nh) {
3641			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
3642			goto out;
3643		}
3644		err = fib6_check_nexthop(nh, cfg, extack);
3645		if (err)
3646			goto out;
3647	}
3648
3649	err = -ENOBUFS;
3650	if (cfg->fc_nlinfo.nlh &&
3651	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3652		table = fib6_get_table(net, cfg->fc_table);
3653		if (!table) {
3654			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3655			table = fib6_new_table(net, cfg->fc_table);
3656		}
3657	} else {
3658		table = fib6_new_table(net, cfg->fc_table);
3659	}
3660
3661	if (!table)
3662		goto out;
3663
3664	err = -ENOMEM;
3665	rt = fib6_info_alloc(gfp_flags, !nh);
3666	if (!rt)
3667		goto out;
3668
3669	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3670					       extack);
3671	if (IS_ERR(rt->fib6_metrics)) {
3672		err = PTR_ERR(rt->fib6_metrics);
3673		/* Do not leave garbage there. */
3674		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3675		goto out_free;
3676	}
3677
3678	if (cfg->fc_flags & RTF_ADDRCONF)
3679		rt->dst_nocount = true;
3680
3681	if (cfg->fc_flags & RTF_EXPIRES)
3682		fib6_set_expires(rt, jiffies +
3683				clock_t_to_jiffies(cfg->fc_expires));
3684	else
3685		fib6_clean_expires(rt);
3686
3687	if (cfg->fc_protocol == RTPROT_UNSPEC)
3688		cfg->fc_protocol = RTPROT_BOOT;
3689	rt->fib6_protocol = cfg->fc_protocol;
3690
3691	rt->fib6_table = table;
3692	rt->fib6_metric = cfg->fc_metric;
3693	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3694	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3695
3696	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3697	rt->fib6_dst.plen = cfg->fc_dst_len;
3698
3699#ifdef CONFIG_IPV6_SUBTREES
3700	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3701	rt->fib6_src.plen = cfg->fc_src_len;
3702#endif
3703	if (nh) {
3704		if (rt->fib6_src.plen) {
3705			NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
3706			goto out_free;
3707		}
3708		if (!nexthop_get(nh)) {
3709			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
3710			goto out_free;
3711		}
3712		rt->nh = nh;
3713		fib6_nh = nexthop_fib6_nh(rt->nh);
3714	} else {
3715		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
3716		if (err)
3717			goto out;
3718
3719		fib6_nh = rt->fib6_nh;
3720
3721		/* We cannot add true routes via loopback here, as they would
3722		 * result in kernel looping; promote them to reject routes.
3723		 */
3724		addr_type = ipv6_addr_type(&cfg->fc_dst);
3725		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
3726				   addr_type))
3727			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3728	}
3729
3730	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3731		struct net_device *dev = fib6_nh->fib_nh_dev;
3732
3733		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3734			NL_SET_ERR_MSG(extack, "Invalid source address");
3735			err = -EINVAL;
3736			goto out;
3737		}
3738		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3739		rt->fib6_prefsrc.plen = 128;
3740	} else
3741		rt->fib6_prefsrc.plen = 0;
3742
3743	return rt;
3744out:
3745	fib6_info_release(rt);
3746	return ERR_PTR(err);
3747out_free:
3748	ip_fib_metrics_put(rt->fib6_metrics);
3749	kfree(rt);
3750	return ERR_PTR(err);
3751}
3752
3753int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3754		  struct netlink_ext_ack *extack)
3755{
3756	struct fib6_info *rt;
3757	int err;
3758
3759	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3760	if (IS_ERR(rt))
3761		return PTR_ERR(rt);
3762
3763	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3764	fib6_info_release(rt);
3765
3766	return err;
3767}
3768
3769static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3770{
3771	struct net *net = info->nl_net;
3772	struct fib6_table *table;
3773	int err;
3774
3775	if (rt == net->ipv6.fib6_null_entry) {
3776		err = -ENOENT;
3777		goto out;
3778	}
3779
3780	table = rt->fib6_table;
3781	spin_lock_bh(&table->tb6_lock);
3782	err = fib6_del(rt, info);
3783	spin_unlock_bh(&table->tb6_lock);
3784
3785out:
3786	fib6_info_release(rt);
3787	return err;
3788}
3789
3790int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
3791{
3792	struct nl_info info = {
3793		.nl_net = net,
3794		.skip_notify = skip_notify
3795	};
3796
3797	return __ip6_del_rt(rt, &info);
3798}
3799
3800static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3801{
3802	struct nl_info *info = &cfg->fc_nlinfo;
3803	struct net *net = info->nl_net;
3804	struct sk_buff *skb = NULL;
3805	struct fib6_table *table;
3806	int err = -ENOENT;
3807
3808	if (rt == net->ipv6.fib6_null_entry)
3809		goto out_put;
3810	table = rt->fib6_table;
3811	spin_lock_bh(&table->tb6_lock);
3812
3813	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3814		struct fib6_info *sibling, *next_sibling;
3815		struct fib6_node *fn;
3816
3817		/* prefer to send a single notification with all hops */
3818		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3819		if (skb) {
3820			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3821
3822			if (rt6_fill_node(net, skb, rt, NULL,
3823					  NULL, NULL, 0, RTM_DELROUTE,
3824					  info->portid, seq, 0) < 0) {
3825				kfree_skb(skb);
3826				skb = NULL;
3827			} else
3828				info->skip_notify = 1;
3829		}
3830
3831		/* 'rt' points to the first sibling route. If it is not the
3832		 * leaf, then we do not need to send a notification. Otherwise,
3833		 * we need to check if the last sibling has a next route or not
3834		 * and emit a replace or delete notification, respectively.
3835		 */
3836		info->skip_notify_kernel = 1;
3837		fn = rcu_dereference_protected(rt->fib6_node,
3838					    lockdep_is_held(&table->tb6_lock));
3839		if (rcu_access_pointer(fn->leaf) == rt) {
3840			struct fib6_info *last_sibling, *replace_rt;
3841
3842			last_sibling = list_last_entry(&rt->fib6_siblings,
3843						       struct fib6_info,
3844						       fib6_siblings);
3845			replace_rt = rcu_dereference_protected(
3846					    last_sibling->fib6_next,
3847					    lockdep_is_held(&table->tb6_lock));
3848			if (replace_rt)
3849				call_fib6_entry_notifiers_replace(net,
3850								  replace_rt);
3851			else
3852				call_fib6_multipath_entry_notifiers(net,
3853						       FIB_EVENT_ENTRY_DEL,
3854						       rt, rt->fib6_nsiblings,
3855						       NULL);
3856		}
3857		list_for_each_entry_safe(sibling, next_sibling,
3858					 &rt->fib6_siblings,
3859					 fib6_siblings) {
3860			err = fib6_del(sibling, info);
3861			if (err)
3862				goto out_unlock;
3863		}
3864	}
3865
3866	err = fib6_del(rt, info);
3867out_unlock:
3868	spin_unlock_bh(&table->tb6_lock);
3869out_put:
3870	fib6_info_release(rt);
3871
3872	if (skb) {
3873		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3874			    info->nlh, gfp_any());
3875	}
3876	return err;
3877}
3878
3879static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3880{
3881	int rc = -ESRCH;
3882
3883	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3884		goto out;
3885
3886	if (cfg->fc_flags & RTF_GATEWAY &&
3887	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3888		goto out;
3889
3890	rc = rt6_remove_exception_rt(rt);
3891out:
3892	return rc;
3893}
3894
3895static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
3896			     struct fib6_nh *nh)
3897{
3898	struct fib6_result res = {
3899		.f6i = rt,
3900		.nh = nh,
3901	};
3902	struct rt6_info *rt_cache;
3903
3904	rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
3905	if (rt_cache)
3906		return __ip6_del_cached_rt(rt_cache, cfg);
3907
3908	return 0;
3909}
3910
3911struct fib6_nh_del_cached_rt_arg {
3912	struct fib6_config *cfg;
3913	struct fib6_info *f6i;
3914};
3915
3916static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
3917{
3918	struct fib6_nh_del_cached_rt_arg *arg = _arg;
3919	int rc;
3920
3921	rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
3922	return rc != -ESRCH ? rc : 0;
3923}
3924
3925static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
3926{
3927	struct fib6_nh_del_cached_rt_arg arg = {
3928		.cfg = cfg,
3929		.f6i = f6i
3930	};
3931
3932	return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
3933}
3934
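/* Delete the route(s) matching @cfg.  An RTF_CACHE request removes
 * only the matching exception-table clone; otherwise the first fib
 * entry matching metric, protocol, device and gateway is unlinked,
 * via __ip6_del_rt_siblings() unless a specific gateway was given.
 */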
3935static int ip6_route_del(struct fib6_config *cfg,
3936			 struct netlink_ext_ack *extack)
3937{
3938	struct fib6_table *table;
3939	struct fib6_info *rt;
3940	struct fib6_node *fn;
3941	int err = -ESRCH;
3942
3943	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3944	if (!table) {
3945		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3946		return err;
3947	}
3948
3949	rcu_read_lock();
3950
3951	fn = fib6_locate(&table->tb6_root,
3952			 &cfg->fc_dst, cfg->fc_dst_len,
3953			 &cfg->fc_src, cfg->fc_src_len,
3954			 !(cfg->fc_flags & RTF_CACHE));
3955
3956	if (fn) {
3957		for_each_fib6_node_rt_rcu(fn) {
3958			struct fib6_nh *nh;
3959
3960			if (rt->nh && cfg->fc_nh_id &&
3961			    rt->nh->id != cfg->fc_nh_id)
3962				continue;
3963
3964			if (cfg->fc_flags & RTF_CACHE) {
3965				int rc = 0;
3966
3967				if (rt->nh) {
3968					rc = ip6_del_cached_rt_nh(cfg, rt);
3969				} else if (cfg->fc_nh_id) {
3970					continue;
3971				} else {
3972					nh = rt->fib6_nh;
3973					rc = ip6_del_cached_rt(cfg, rt, nh);
3974				}
3975				if (rc != -ESRCH) {
3976					rcu_read_unlock();
3977					return rc;
3978				}
3979				continue;
3980			}
3981
3982			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3983				continue;
3984			if (cfg->fc_protocol &&
3985			    cfg->fc_protocol != rt->fib6_protocol)
3986				continue;
3987
3988			if (rt->nh) {
3989				if (!fib6_info_hold_safe(rt))
3990					continue;
3991				rcu_read_unlock();
3992
3993				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3994			}
3995			if (cfg->fc_nh_id)
3996				continue;
3997
3998			nh = rt->fib6_nh;
3999			if (cfg->fc_ifindex &&
4000			    (!nh->fib_nh_dev ||
4001			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
4002				continue;
4003			if (cfg->fc_flags & RTF_GATEWAY &&
4004			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
4005				continue;
4006			if (!fib6_info_hold_safe(rt))
4007				continue;
4008			rcu_read_unlock();
4009
4010			/* if a gateway was specified, delete only that one route */
4011			if (cfg->fc_flags & RTF_GATEWAY)
4012				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
4013
4014			return __ip6_del_rt_siblings(rt, cfg);
4015		}
4016	}
4017	rcu_read_unlock();
4018
4019	return err;
4020}
4021
4022static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
4023{
4024	struct netevent_redirect netevent;
4025	struct rt6_info *rt, *nrt = NULL;
4026	struct fib6_result res = {};
4027	struct ndisc_options ndopts;
4028	struct inet6_dev *in6_dev;
4029	struct neighbour *neigh;
4030	struct rd_msg *msg;
4031	int optlen, on_link;
4032	u8 *lladdr;
4033
4034	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
4035	optlen -= sizeof(*msg);
4036
4037	if (optlen < 0) {
4038		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
4039		return;
4040	}
4041
4042	msg = (struct rd_msg *)icmp6_hdr(skb);
4043
4044	if (ipv6_addr_is_multicast(&msg->dest)) {
4045		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
4046		return;
4047	}
4048
4049	on_link = 0;
4050	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
4051		on_link = 1;
4052	} else if (ipv6_addr_type(&msg->target) !=
4053		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
4054		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
4055		return;
4056	}
4057
4058	in6_dev = __in6_dev_get(skb->dev);
4059	if (!in6_dev)
4060		return;
4061	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
4062		return;
4063
4064	/* RFC2461 8.1:
4065	 *	The IP source address of the Redirect MUST be the same as the current
4066	 *	first-hop router for the specified ICMP Destination Address.
4067	 */
4068
4069	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
4070		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
4071		return;
4072	}
4073
4074	lladdr = NULL;
4075	if (ndopts.nd_opts_tgt_lladdr) {
4076		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
4077					     skb->dev);
4078		if (!lladdr) {
4079			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
4080			return;
4081		}
4082	}
4083
4084	rt = (struct rt6_info *) dst;
4085	if (rt->rt6i_flags & RTF_REJECT) {
4086		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
4087		return;
4088	}
4089
4090	/* Redirect received -> path was valid.
4091	 * Look, redirects are sent only in response to data packets,
4092	 * so that this nexthop apparently is reachable. --ANK
4093	 */
4094	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
4095
4096	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
4097	if (!neigh)
4098		return;
4099
4100	/*
4101	 *	We have finally decided to accept it.
4102	 */
4103
4104	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
4105		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
4106		     NEIGH_UPDATE_F_OVERRIDE|
4107		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
4108				     NEIGH_UPDATE_F_ISROUTER)),
4109		     NDISC_REDIRECT, &ndopts);
4110
4111	rcu_read_lock();
4112	res.f6i = rcu_dereference(rt->from);
4113	if (!res.f6i)
4114		goto out;
4115
4116	if (res.f6i->nh) {
4117		struct fib6_nh_match_arg arg = {
4118			.dev = dst->dev,
4119			.gw = &rt->rt6i_gateway,
4120		};
4121
4122		nexthop_for_each_fib6_nh(res.f6i->nh,
4123					 fib6_nh_find_match, &arg);
4124
4125		/* The fib6_info uses a nexthop object, but none of its
4126		 * fib6_nh entries uses dst->dev. This should be impossible.
4127		 */
4128		if (!arg.match)
4129			goto out;
4130		res.nh = arg.match;
4131	} else {
4132		res.nh = res.f6i->fib6_nh;
4133	}
4134
4135	res.fib6_flags = res.f6i->fib6_flags;
4136	res.fib6_type = res.f6i->fib6_type;
4137	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
4138	if (!nrt)
4139		goto out;
4140
4141	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
4142	if (on_link)
4143		nrt->rt6i_flags &= ~RTF_GATEWAY;
4144
4145	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
4146
4147	/* rt6_insert_exception() will take care of duplicated exceptions */
4148	if (rt6_insert_exception(nrt, &res)) {
4149		dst_release_immediate(&nrt->dst);
4150		goto out;
4151	}
4152
4153	netevent.old = &rt->dst;
4154	netevent.new = &nrt->dst;
4155	netevent.daddr = &msg->dest;
4156	netevent.neigh = neigh;
4157	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
4158
4159out:
4160	rcu_read_unlock();
4161	neigh_release(neigh);
4162}
4163
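/*
 * Editorial summary (not part of the original source): the handler above
 * validates the ICMPv6 redirect (length, non-multicast destination,
 * link-local unicast target unless dest == target, accept_redirects on
 * and forwarding off), updates the neighbour cache from the target
 * lladdr option via ndisc_update(), and then inserts an RTF_CACHE
 * exception clone whose gateway is the redirect target, so subsequent
 * lookups for msg->dest use the new first hop.
 */
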
4164#ifdef CONFIG_IPV6_ROUTE_INFO
4165static struct fib6_info *rt6_get_route_info(struct net *net,
4166					   const struct in6_addr *prefix, int prefixlen,
4167					   const struct in6_addr *gwaddr,
4168					   struct net_device *dev)
4169{
4170	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4171	int ifindex = dev->ifindex;
4172	struct fib6_node *fn;
4173	struct fib6_info *rt = NULL;
4174	struct fib6_table *table;
4175
4176	table = fib6_get_table(net, tb_id);
4177	if (!table)
4178		return NULL;
4179
4180	rcu_read_lock();
4181	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
4182	if (!fn)
4183		goto out;
4184
4185	for_each_fib6_node_rt_rcu(fn) {
4186		/* these routes do not use nexthops */
4187		if (rt->nh)
4188			continue;
4189		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
4190			continue;
4191		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
4192		    !rt->fib6_nh->fib_nh_gw_family)
4193			continue;
4194		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
4195			continue;
4196		if (!fib6_info_hold_safe(rt))
4197			continue;
4198		break;
4199	}
4200out:
4201	rcu_read_unlock();
4202	return rt;
4203}
4204
4205static struct fib6_info *rt6_add_route_info(struct net *net,
4206					   const struct in6_addr *prefix, int prefixlen,
4207					   const struct in6_addr *gwaddr,
4208					   struct net_device *dev,
4209					   unsigned int pref)
4210{
4211	struct fib6_config cfg = {
4212		.fc_metric	= IP6_RT_PRIO_USER,
4213		.fc_ifindex	= dev->ifindex,
4214		.fc_dst_len	= prefixlen,
4215		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
4216				  RTF_UP | RTF_PREF(pref),
4217		.fc_protocol = RTPROT_RA,
4218		.fc_type = RTN_UNICAST,
4219		.fc_nlinfo.portid = 0,
4220		.fc_nlinfo.nlh = NULL,
4221		.fc_nlinfo.nl_net = net,
4222	};
4223
4224	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4225	cfg.fc_dst = *prefix;
4226	cfg.fc_gateway = *gwaddr;
4227
4228	/* We should treat it as a default route if prefix length is 0. */
4229	if (!prefixlen)
4230		cfg.fc_flags |= RTF_DEFAULT;
4231
4232	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
4233
4234	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
4235}
4236#endif
4237
4238struct fib6_info *rt6_get_dflt_router(struct net *net,
4239				     const struct in6_addr *addr,
4240				     struct net_device *dev)
4241{
4242	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
4243	struct fib6_info *rt;
4244	struct fib6_table *table;
4245
4246	table = fib6_get_table(net, tb_id);
4247	if (!table)
4248		return NULL;
4249
4250	rcu_read_lock();
4251	for_each_fib6_node_rt_rcu(&table->tb6_root) {
4252		struct fib6_nh *nh;
4253
4254		/* RA routes do not use nexthops */
4255		if (rt->nh)
4256			continue;
4257
4258		nh = rt->fib6_nh;
4259		if (dev == nh->fib_nh_dev &&
4260		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
4261		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
4262			break;
4263	}
4264	if (rt && !fib6_info_hold_safe(rt))
4265		rt = NULL;
4266	rcu_read_unlock();
4267	return rt;
4268}
4269
4270struct fib6_info *rt6_add_dflt_router(struct net *net,
4271				     const struct in6_addr *gwaddr,
4272				     struct net_device *dev,
4273				     unsigned int pref)
4274{
4275	struct fib6_config cfg = {
4276		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
4277		.fc_metric	= IP6_RT_PRIO_USER,
4278		.fc_ifindex	= dev->ifindex,
4279		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
4280				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
4281		.fc_protocol = RTPROT_RA,
4282		.fc_type = RTN_UNICAST,
4283		.fc_nlinfo.portid = 0,
4284		.fc_nlinfo.nlh = NULL,
4285		.fc_nlinfo.nl_net = net,
4286	};
4287
4288	cfg.fc_gateway = *gwaddr;
4289
4290	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
4291		struct fib6_table *table;
4292
4293		table = fib6_get_table(dev_net(dev), cfg.fc_table);
4294		if (table)
4295			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
4296	}
4297
4298	return rt6_get_dflt_router(net, gwaddr, dev);
4299}
4300
4301static void __rt6_purge_dflt_routers(struct net *net,
4302				     struct fib6_table *table)
4303{
4304	struct fib6_info *rt;
4305
4306restart:
4307	rcu_read_lock();
4308	for_each_fib6_node_rt_rcu(&table->tb6_root) {
4309		struct net_device *dev = fib6_info_nh_dev(rt);
4310		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
4311
4312		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
4313		    (!idev || idev->cnf.accept_ra != 2) &&
4314		    fib6_info_hold_safe(rt)) {
4315			rcu_read_unlock();
4316			ip6_del_rt(net, rt, false);
4317			goto restart;
4318		}
4319	}
4320	rcu_read_unlock();
4321
4322	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
4323}
4324
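/*
 * Editorial note (not part of the original source): the restart label
 * above implements a common pattern for deleting under RCU. The walk
 * runs under rcu_read_lock(), but ip6_del_rt() modifies the tree, so
 * the walker pins the entry with fib6_info_hold_safe(), drops the RCU
 * read lock, deletes the route, and restarts from the root because its
 * previous position may have been unlinked in the meantime.
 */
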
4325void rt6_purge_dflt_routers(struct net *net)
4326{
4327	struct fib6_table *table;
4328	struct hlist_head *head;
4329	unsigned int h;
4330
4331	rcu_read_lock();
4332
4333	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
4334		head = &net->ipv6.fib_table_hash[h];
4335		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
4336			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
4337				__rt6_purge_dflt_routers(net, table);
4338		}
4339	}
4340
4341	rcu_read_unlock();
4342}
4343
4344static void rtmsg_to_fib6_config(struct net *net,
4345				 struct in6_rtmsg *rtmsg,
4346				 struct fib6_config *cfg)
4347{
4348	*cfg = (struct fib6_config){
4349		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
4350			 : RT6_TABLE_MAIN,
4351		.fc_ifindex = rtmsg->rtmsg_ifindex,
4352		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
4353		.fc_expires = rtmsg->rtmsg_info,
4354		.fc_dst_len = rtmsg->rtmsg_dst_len,
4355		.fc_src_len = rtmsg->rtmsg_src_len,
4356		.fc_flags = rtmsg->rtmsg_flags,
4357		.fc_type = rtmsg->rtmsg_type,
4358
4359		.fc_nlinfo.nl_net = net,
4360
4361		.fc_dst = rtmsg->rtmsg_dst,
4362		.fc_src = rtmsg->rtmsg_src,
4363		.fc_gateway = rtmsg->rtmsg_gateway,
4364	};
4365}
4366
4367int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
4368{
4369	struct fib6_config cfg;
4370	int err;
4371
4372	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
4373		return -EINVAL;
4374	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
4375		return -EPERM;
4376
4377	rtmsg_to_fib6_config(net, rtmsg, &cfg);
4378
4379	rtnl_lock();
4380	switch (cmd) {
4381	case SIOCADDRT:
4382		err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
4383		break;
4384	case SIOCDELRT:
4385		err = ip6_route_del(&cfg, NULL);
4386		break;
4387	}
4388	rtnl_unlock();
4389	return err;
4390}
4391
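/*
 * Editorial illustration (not part of the original source): a minimal,
 * hypothetical userspace sketch of the legacy ioctl interface handled
 * above, kept under #if 0 since it is not kernel code. Field names
 * follow struct in6_rtmsg from <linux/ipv6_route.h>; the addresses and
 * device name are made-up examples and error handling is omitted.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/ipv6_route.h>

int main(void)
{
	struct in6_rtmsg rt = { 0 };
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	inet_pton(AF_INET6, "2001:db8:2::", &rt.rtmsg_dst);
	inet_pton(AF_INET6, "fe80::1", &rt.rtmsg_gateway);
	rt.rtmsg_dst_len = 64;
	rt.rtmsg_metric = 1;	/* 0 would default to IP6_RT_PRIO_USER */
	rt.rtmsg_flags = RTF_UP | RTF_GATEWAY;
	rt.rtmsg_ifindex = if_nametoindex("eth0");	/* hypothetical device */

	ioctl(fd, SIOCADDRT, &rt);	/* needs CAP_NET_ADMIN, see above */
	close(fd);
	return 0;
}
#endif
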
4392/*
4393 *	Drop the packet on the floor
4394 */
4395
4396static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
4397{
4398	struct dst_entry *dst = skb_dst(skb);
4399	struct net *net = dev_net(dst->dev);
4400	struct inet6_dev *idev;
4401	int type;
4402
4403	if (netif_is_l3_master(skb->dev) ||
4404	    dst->dev == net->loopback_dev)
4405		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
4406	else
4407		idev = ip6_dst_idev(dst);
4408
4409	switch (ipstats_mib_noroutes) {
4410	case IPSTATS_MIB_INNOROUTES:
4411		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
4412		if (type == IPV6_ADDR_ANY) {
4413			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
4414			break;
4415		}
4416		fallthrough;
4417	case IPSTATS_MIB_OUTNOROUTES:
4418		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
4419		break;
4420	}
4421
4422	/* Start over by dropping the dst for l3mdev case */
4423	if (netif_is_l3_master(skb->dev))
4424		skb_dst_drop(skb);
4425
4426	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
4427	kfree_skb(skb);
4428	return 0;
4429}
4430
4431static int ip6_pkt_discard(struct sk_buff *skb)
4432{
4433	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
4434}
4435
4436static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4437{
4438	skb->dev = skb_dst(skb)->dev;
4439	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
4440}
4441
4442static int ip6_pkt_prohibit(struct sk_buff *skb)
4443{
4444	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
4445}
4446
4447static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4448{
4449	skb->dev = skb_dst(skb)->dev;
4450	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
4451}
4452
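/*
 * Editorial note (not part of the original source): these handlers back
 * the reject-type routes. ICMPV6_NOROUTE maps to ICMPv6 destination
 * unreachable code 0 and ICMPV6_ADM_PROHIBITED to code 1, matching e.g.
 *
 *   ip -6 route add unreachable 2001:db8:dead::/48
 *   ip -6 route add prohibit    2001:db8:beef::/48
 *
 * (a blackhole route, by contrast, drops silently via dst_discard).
 */
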
4453/*
4454 *	Allocate a dst for local (unicast / anycast) address.
4455 */
4456
4457struct fib6_info *addrconf_f6i_alloc(struct net *net,
4458				     struct inet6_dev *idev,
4459				     const struct in6_addr *addr,
4460				     bool anycast, gfp_t gfp_flags)
4461{
4462	struct fib6_config cfg = {
4463		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
4464		.fc_ifindex = idev->dev->ifindex,
4465		.fc_flags = RTF_UP | RTF_NONEXTHOP,
4466		.fc_dst = *addr,
4467		.fc_dst_len = 128,
4468		.fc_protocol = RTPROT_KERNEL,
4469		.fc_nlinfo.nl_net = net,
4470		.fc_ignore_dev_down = true,
4471	};
4472	struct fib6_info *f6i;
4473
4474	if (anycast) {
4475		cfg.fc_type = RTN_ANYCAST;
4476		cfg.fc_flags |= RTF_ANYCAST;
4477	} else {
4478		cfg.fc_type = RTN_LOCAL;
4479		cfg.fc_flags |= RTF_LOCAL;
4480	}
4481
4482	f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
4483	if (!IS_ERR(f6i)) {
4484		f6i->dst_nocount = true;
4485
4486		if (!anycast &&
4487		    (net->ipv6.devconf_all->disable_policy ||
4488		     idev->cnf.disable_policy))
4489			f6i->dst_nopolicy = true;
4490	}
4491
4492	return f6i;
4493}
4494
4495/* remove deleted ip from prefsrc entries */
4496struct arg_dev_net_ip {
4497	struct net_device *dev;
4498	struct net *net;
4499	struct in6_addr *addr;
4500};
4501
4502static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
4503{
4504	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
4505	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
4506	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
4507
4508	if (!rt->nh &&
4509	    ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
4510	    rt != net->ipv6.fib6_null_entry &&
4511	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
4512		spin_lock_bh(&rt6_exception_lock);
4513		/* remove prefsrc entry */
4514		rt->fib6_prefsrc.plen = 0;
4515		spin_unlock_bh(&rt6_exception_lock);
4516	}
4517	return 0;
4518}
4519
4520void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
4521{
4522	struct net *net = dev_net(ifp->idev->dev);
4523	struct arg_dev_net_ip adni = {
4524		.dev = ifp->idev->dev,
4525		.net = net,
4526		.addr = &ifp->addr,
4527	};
4528	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
4529}
4530
4531#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
4532
4533/* Remove routers and update dst entries when a gateway turns into a host. */
4534static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
4535{
4536	struct in6_addr *gateway = (struct in6_addr *)arg;
4537	struct fib6_nh *nh;
4538
4539	/* RA routes do not use nexthops */
4540	if (rt->nh)
4541		return 0;
4542
4543	nh = rt->fib6_nh;
4544	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
4545	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
4546		return -1;
4547
4548	/* Further clean up cached routes in the exception table.
4549	 * This is needed because a cached route may have a different
4550	 * gateway than its 'parent' in the case of an IP redirect.
4551	 */
4552	fib6_nh_exceptions_clean_tohost(nh, gateway);
4553
4554	return 0;
4555}
4556
4557void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
4558{
4559	fib6_clean_all(net, fib6_clean_tohost, gateway);
4560}
4561
4562struct arg_netdev_event {
4563	const struct net_device *dev;
4564	union {
4565		unsigned char nh_flags;
4566		unsigned long event;
4567	};
4568};
4569
4570static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
4571{
4572	struct fib6_info *iter;
4573	struct fib6_node *fn;
4574
4575	fn = rcu_dereference_protected(rt->fib6_node,
4576			lockdep_is_held(&rt->fib6_table->tb6_lock));
4577	iter = rcu_dereference_protected(fn->leaf,
4578			lockdep_is_held(&rt->fib6_table->tb6_lock));
4579	while (iter) {
4580		if (iter->fib6_metric == rt->fib6_metric &&
4581		    rt6_qualify_for_ecmp(iter))
4582			return iter;
4583		iter = rcu_dereference_protected(iter->fib6_next,
4584				lockdep_is_held(&rt->fib6_table->tb6_lock));
4585	}
4586
4587	return NULL;
4588}
4589
4590/* only called for fib entries with builtin fib6_nh */
4591static bool rt6_is_dead(const struct fib6_info *rt)
4592{
4593	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
4594	    (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
4595	     ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
4596		return true;
4597
4598	return false;
4599}
4600
4601static int rt6_multipath_total_weight(const struct fib6_info *rt)
4602{
4603	struct fib6_info *iter;
4604	int total = 0;
4605
4606	if (!rt6_is_dead(rt))
4607		total += rt->fib6_nh->fib_nh_weight;
4608
4609	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
4610		if (!rt6_is_dead(iter))
4611			total += iter->fib6_nh->fib_nh_weight;
4612	}
4613
4614	return total;
4615}
4616
4617static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
4618{
4619	int upper_bound = -1;
4620
4621	if (!rt6_is_dead(rt)) {
4622		*weight += rt->fib6_nh->fib_nh_weight;
4623		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
4624						    total) - 1;
4625	}
4626	atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
4627}
4628
4629static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
4630{
4631	struct fib6_info *iter;
4632	int weight = 0;
4633
4634	rt6_upper_bound_set(rt, &weight, total);
4635
4636	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4637		rt6_upper_bound_set(iter, &weight, total);
4638}
4639
4640void rt6_multipath_rebalance(struct fib6_info *rt)
4641{
4642	struct fib6_info *first;
4643	int total;
4644
4645	/* If the entire multipath route was marked for flushing, there
4646	 * is no need to rebalance upon the removal of every sibling
4647	 * route.
4648	 */
4649	if (!rt->fib6_nsiblings || rt->should_flush)
4650		return;
4651
4652	/* During lookup, routes are evaluated in order, so we need to
4653	 * make sure upper bounds are assigned from the first sibling
4654	 * onwards.
4655	 */
4656	first = rt6_multipath_first_sibling(rt);
4657	if (WARN_ON_ONCE(!first))
4658		return;
4659
4660	total = rt6_multipath_total_weight(first);
4661	rt6_multipath_upper_bound_set(first, total);
4662}
4663
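/*
 * Editorial worked example (not part of the original source): for three
 * live siblings with fib_nh_weight 2, 1 and 1, total = 4, and
 * rt6_upper_bound_set() assigns the cumulative upper bounds
 *
 *   (2/4) * 2^31 - 1,  (3/4) * 2^31 - 1,  (4/4) * 2^31 - 1.
 *
 * Path selection compares the multipath flow hash against these bounds
 * (see fib6_select_path()), so each nexthop carries a share of flows
 * proportional to weight / total; a dead nexthop keeps the initial
 * upper_bound of -1 and is never selected.
 */
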
4664static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4665{
4666	const struct arg_netdev_event *arg = p_arg;
4667	struct net *net = dev_net(arg->dev);
4668
4669	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
4670	    rt->fib6_nh->fib_nh_dev == arg->dev) {
4671		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
4672		fib6_update_sernum_upto_root(net, rt);
4673		rt6_multipath_rebalance(rt);
4674	}
4675
4676	return 0;
4677}
4678
4679void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4680{
4681	struct arg_netdev_event arg = {
4682		.dev = dev,
4683		{
4684			.nh_flags = nh_flags,
4685		},
4686	};
4687
4688	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4689		arg.nh_flags |= RTNH_F_LINKDOWN;
4690
4691	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4692}
4693
4694/* only called for fib entries with inline fib6_nh */
4695static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4696				   const struct net_device *dev)
4697{
4698	struct fib6_info *iter;
4699
4700	if (rt->fib6_nh->fib_nh_dev == dev)
4701		return true;
4702	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4703		if (iter->fib6_nh->fib_nh_dev == dev)
4704			return true;
4705
4706	return false;
4707}
4708
4709static void rt6_multipath_flush(struct fib6_info *rt)
4710{
4711	struct fib6_info *iter;
4712
4713	rt->should_flush = 1;
4714	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4715		iter->should_flush = 1;
4716}
4717
4718static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4719					     const struct net_device *down_dev)
4720{
4721	struct fib6_info *iter;
4722	unsigned int dead = 0;
4723
4724	if (rt->fib6_nh->fib_nh_dev == down_dev ||
4725	    rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4726		dead++;
4727	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4728		if (iter->fib6_nh->fib_nh_dev == down_dev ||
4729		    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4730			dead++;
4731
4732	return dead;
4733}
4734
4735static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4736				       const struct net_device *dev,
4737				       unsigned char nh_flags)
4738{
4739	struct fib6_info *iter;
4740
4741	if (rt->fib6_nh->fib_nh_dev == dev)
4742		rt->fib6_nh->fib_nh_flags |= nh_flags;
4743	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4744		if (iter->fib6_nh->fib_nh_dev == dev)
4745			iter->fib6_nh->fib_nh_flags |= nh_flags;
4746}
4747
4748/* called with write lock held for table with rt */
4749static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4750{
4751	const struct arg_netdev_event *arg = p_arg;
4752	const struct net_device *dev = arg->dev;
4753	struct net *net = dev_net(dev);
4754
4755	if (rt == net->ipv6.fib6_null_entry || rt->nh)
4756		return 0;
4757
4758	switch (arg->event) {
4759	case NETDEV_UNREGISTER:
4760		return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4761	case NETDEV_DOWN:
4762		if (rt->should_flush)
4763			return -1;
4764		if (!rt->fib6_nsiblings)
4765			return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4766		if (rt6_multipath_uses_dev(rt, dev)) {
4767			unsigned int count;
4768
4769			count = rt6_multipath_dead_count(rt, dev);
4770			if (rt->fib6_nsiblings + 1 == count) {
4771				rt6_multipath_flush(rt);
4772				return -1;
4773			}
4774			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4775						   RTNH_F_LINKDOWN);
4776			fib6_update_sernum(net, rt);
4777			rt6_multipath_rebalance(rt);
4778		}
4779		return -2;
4780	case NETDEV_CHANGE:
4781		if (rt->fib6_nh->fib_nh_dev != dev ||
4782		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4783			break;
4784		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
4785		rt6_multipath_rebalance(rt);
4786		break;
4787	}
4788
4789	return 0;
4790}
4791
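/*
 * Editorial note (an assumption based on fib6_clean_node() in ip6_fib.c,
 * not part of the original source): a return of 0 above keeps the entry
 * and -1 asks the walker to delete it, while -2 is returned for a
 * multipath route that was not flushed and tells the walker to skip
 * ahead past its remaining siblings, since this callback has already
 * handled the whole group via rt6_multipath_nh_flags_set().
 */
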
4792void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4793{
4794	struct arg_netdev_event arg = {
4795		.dev = dev,
4796		{
4797			.event = event,
4798		},
4799	};
4800	struct net *net = dev_net(dev);
4801
4802	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4803		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4804	else
4805		fib6_clean_all(net, fib6_ifdown, &arg);
4806}
4807
4808void rt6_disable_ip(struct net_device *dev, unsigned long event)
4809{
4810	rt6_sync_down_dev(dev, event);
4811	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4812	neigh_ifdown(&nd_tbl, dev);
4813}
4814
4815struct rt6_mtu_change_arg {
4816	struct net_device *dev;
4817	unsigned int mtu;
4818	struct fib6_info *f6i;
4819};
4820
4821static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4822{
4823	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4824	struct fib6_info *f6i = arg->f6i;
4825
4826	/* For an administrative MTU increase, there is no way to discover
4827	 * an IPv6 PMTU increase, so the PMTU must be updated here.
4828	 * Since RFC 1981 doesn't cover administrative MTU increases,
4829	 * updating the PMTU on such an increase (e.g. jumbo frames) is a MUST.
4830	 */
4831	if (nh->fib_nh_dev == arg->dev) {
4832		struct inet6_dev *idev = __in6_dev_get(arg->dev);
4833		u32 mtu = f6i->fib6_pmtu;
4834
4835		if (mtu >= arg->mtu ||
4836		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4837			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4838
4839		spin_lock_bh(&rt6_exception_lock);
4840		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4841		spin_unlock_bh(&rt6_exception_lock);
4842	}
4843
4844	return 0;
4845}
4846
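/*
 * Editorial worked example (not part of the original source, and
 * assuming idev->cnf.mtu6 still holds the old device MTU while this
 * walk runs): raising a device MTU from 1500 to 9000 updates RTAX_MTU
 * to 9000 for a route whose stored MTU was 1500, i.e. one that merely
 * tracked the device, but leaves a route with a smaller PMTU learned
 * from the network (say 1280) untouched. Lowering the MTU instead hits
 * the mtu >= arg->mtu branch and clamps such routes down.
 */
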
4847static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
4848{
4849	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4850	struct inet6_dev *idev;
4851
4852	/* In IPv6, PMTU discovery is not optional,
4853	 * so the RTAX_MTU lock cannot disable it.
4854	 * We still use this lock to block changes
4855	 * caused by addrconf/ndisc.
4856	 */
4857
4858	idev = __in6_dev_get(arg->dev);
4859	if (!idev)
4860		return 0;
4861
4862	if (fib6_metric_locked(f6i, RTAX_MTU))
4863		return 0;
4864
4865	arg->f6i = f6i;
4866	if (f6i->nh) {
4867		/* fib6_nh_mtu_change only returns 0, so this is safe */
4868		return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
4869						arg);
4870	}
4871
4872	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
4873}
4874
4875void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4876{
4877	struct rt6_mtu_change_arg arg = {
4878		.dev = dev,
4879		.mtu = mtu,
4880	};
4881
4882	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4883}
4884
4885static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4886	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
4887	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4888	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4889	[RTA_OIF]               = { .type = NLA_U32 },
4890	[RTA_IIF]		= { .type = NLA_U32 },
4891	[RTA_PRIORITY]          = { .type = NLA_U32 },
4892	[RTA_METRICS]           = { .type = NLA_NESTED },
4893	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4894	[RTA_PREF]              = { .type = NLA_U8 },
4895	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4896	[RTA_ENCAP]		= { .type = NLA_NESTED },
4897	[RTA_EXPIRES]		= { .type = NLA_U32 },
4898	[RTA_UID]		= { .type = NLA_U32 },
4899	[RTA_MARK]		= { .type = NLA_U32 },
4900	[RTA_TABLE]		= { .type = NLA_U32 },
4901	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4902	[RTA_SPORT]		= { .type = NLA_U16 },
4903	[RTA_DPORT]		= { .type = NLA_U16 },
4904	[RTA_NH_ID]		= { .type = NLA_U32 },
4905};
4906
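/*
 * Editorial note (not part of the original source): the
 * .strict_start_type entry in the policy above marks RTA_DPORT + 1 as
 * the first attribute type to be validated strictly, so attribute
 * types newer than RTA_DPORT are rejected when unknown or malformed
 * even though the message itself is parsed with the deprecated
 * (liberal) parser.
 */
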
4907static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4908			      struct fib6_config *cfg,
4909			      struct netlink_ext_ack *extack)
4910{
4911	struct rtmsg *rtm;
4912	struct nlattr *tb[RTA_MAX+1];
4913	unsigned int pref;
4914	int err;
4915
4916	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4917				     rtm_ipv6_policy, extack);
4918	if (err < 0)
4919		goto errout;
4920
4921	err = -EINVAL;
4922	rtm = nlmsg_data(nlh);
4923
4924	*cfg = (struct fib6_config){
4925		.fc_table = rtm->rtm_table,
4926		.fc_dst_len = rtm->rtm_dst_len,
4927		.fc_src_len = rtm->rtm_src_len,
4928		.fc_flags = RTF_UP,
4929		.fc_protocol = rtm->rtm_protocol,
4930		.fc_type = rtm->rtm_type,
4931
4932		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4933		.fc_nlinfo.nlh = nlh,
4934		.fc_nlinfo.nl_net = sock_net(skb->sk),
4935	};
4936
4937	if (rtm->rtm_type == RTN_UNREACHABLE ||
4938	    rtm->rtm_type == RTN_BLACKHOLE ||
4939	    rtm->rtm_type == RTN_PROHIBIT ||
4940	    rtm->rtm_type == RTN_THROW)
4941		cfg->fc_flags |= RTF_REJECT;
4942
4943	if (rtm->rtm_type == RTN_LOCAL)
4944		cfg->fc_flags |= RTF_LOCAL;
4945
4946	if (rtm->rtm_flags & RTM_F_CLONED)
4947		cfg->fc_flags |= RTF_CACHE;
4948
4949	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4950
4951	if (tb[RTA_NH_ID]) {
4952		if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
4953		    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
4954			NL_SET_ERR_MSG(extack,
4955				       "Nexthop specification and nexthop id are mutually exclusive");
4956			goto errout;
4957		}
4958		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
4959	}
4960
4961	if (tb[RTA_GATEWAY]) {
4962		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4963		cfg->fc_flags |= RTF_GATEWAY;
4964	}
4965	if (tb[RTA_VIA]) {
4966		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4967		goto errout;
4968	}
4969
4970	if (tb[RTA_DST]) {
4971		int plen = (rtm->rtm_dst_len + 7) >> 3;
4972
4973		if (nla_len(tb[RTA_DST]) < plen)
4974			goto errout;
4975
4976		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4977	}
4978
4979	if (tb[RTA_SRC]) {
4980		int plen = (rtm->rtm_src_len + 7) >> 3;
4981
4982		if (nla_len(tb[RTA_SRC]) < plen)
4983			goto errout;
4984
4985		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4986	}
4987
4988	if (tb[RTA_PREFSRC])
4989		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4990
4991	if (tb[RTA_OIF])
4992		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4993
4994	if (tb[RTA_PRIORITY])
4995		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4996
4997	if (tb[RTA_METRICS]) {
4998		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4999		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
5000	}
5001
5002	if (tb[RTA_TABLE])
5003		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
5004
5005	if (tb[RTA_MULTIPATH]) {
5006		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
5007		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
5008
5009		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
5010						     cfg->fc_mp_len, extack);
5011		if (err < 0)
5012			goto errout;
5013	}
5014
5015	if (tb[RTA_PREF]) {
5016		pref = nla_get_u8(tb[RTA_PREF]);
5017		if (pref != ICMPV6_ROUTER_PREF_LOW &&
5018		    pref != ICMPV6_ROUTER_PREF_HIGH)
5019			pref = ICMPV6_ROUTER_PREF_MEDIUM;
5020		cfg->fc_flags |= RTF_PREF(pref);
5021	}
5022
5023	if (tb[RTA_ENCAP])
5024		cfg->fc_encap = tb[RTA_ENCAP];
5025
5026	if (tb[RTA_ENCAP_TYPE]) {
5027		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
5028
5029		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
5030		if (err < 0)
5031			goto errout;
5032	}
5033
5034	if (tb[RTA_EXPIRES]) {
5035		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
5036
5037		if (addrconf_finite_timeout(timeout)) {
5038			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
5039			cfg->fc_flags |= RTF_EXPIRES;
5040		}
5041	}
5042
5043	err = 0;
5044errout:
5045	return err;
5046}
5047
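/*
 * Editorial illustration (not part of the original source): a minimal,
 * hypothetical userspace sketch building the RTM_NEWROUTE message that
 * rtm_to_fib6_config() parses above, kept under #if 0 since it is not
 * kernel code. Addresses and the ifindex are made-up examples; error
 * handling and reading the netlink ACK are omitted.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static void add_rta(struct nlmsghdr *n, int type, const void *data, int len)
{
	struct rtattr *rta = (struct rtattr *)((char *)n + NLMSG_ALIGN(n->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	char buf[512] = { 0 };
	struct nlmsghdr *n = (struct nlmsghdr *)buf;
	struct rtmsg *rtm;
	struct in6_addr dst, gw;
	unsigned int oif = 2;	/* hypothetical ifindex */
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	n->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
	n->nlmsg_type = RTM_NEWROUTE;
	n->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;

	rtm = NLMSG_DATA(n);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = 64;			/* -> cfg->fc_dst_len */
	rtm->rtm_table = RT_TABLE_MAIN;		/* -> cfg->fc_table */
	rtm->rtm_protocol = RTPROT_STATIC;
	rtm->rtm_type = RTN_UNICAST;

	inet_pton(AF_INET6, "2001:db8:1::", &dst);
	inet_pton(AF_INET6, "fe80::1", &gw);
	add_rta(n, RTA_DST, &dst, sizeof(dst));		/* -> cfg->fc_dst */
	add_rta(n, RTA_GATEWAY, &gw, sizeof(gw));	/* -> fc_gateway + RTF_GATEWAY */
	add_rta(n, RTA_OIF, &oif, sizeof(oif));		/* -> cfg->fc_ifindex */

	sendto(fd, n, n->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa));
	close(fd);
	return 0;
}
#endif
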
5048struct rt6_nh {
5049	struct fib6_info *fib6_info;
5050	struct fib6_config r_cfg;
5051	struct list_head next;
5052};
5053
5054static int ip6_route_info_append(struct net *net,
5055				 struct list_head *rt6_nh_list,
5056				 struct fib6_info *rt,
5057				 struct fib6_config *r_cfg)
5058{
5059	struct rt6_nh *nh;
5060	int err = -EEXIST;
5061
5062	list_for_each_entry(nh, rt6_nh_list, next) {
5063		/* check if fib6_info already exists */
5064		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
5065			return err;
5066	}
5067
5068	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
5069	if (!nh)
5070		return -ENOMEM;
5071	nh->fib6_info = rt;
5072	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
5073	list_add_tail(&nh->next, rt6_nh_list);
5074
5075	return 0;
5076}
5077
5078static void ip6_route_mpath_notify(struct fib6_info *rt,
5079				   struct fib6_info *rt_last,
5080				   struct nl_info *info,
5081				   __u16 nlflags)
5082{
5083	/* If this is an APPEND route, then rt points to the first route
5084	 * inserted and rt_last points to the last route inserted. Userspace
5085	 * wants a consistent dump of the route which starts at the first
5086	 * nexthop. Since sibling routes are always added at the end of
5087	 * the list, find the first sibling of the last route appended.
5088	 */
5089	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
5090		rt = list_first_entry(&rt_last->fib6_siblings,
5091				      struct fib6_info,
5092				      fib6_siblings);
5093	}
5094
5095	if (rt)
5096		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
5097}
5098
5099static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
5100{
5101	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
5102	bool should_notify = false;
5103	struct fib6_info *leaf;
5104	struct fib6_node *fn;
5105
5106	rcu_read_lock();
5107	fn = rcu_dereference(rt->fib6_node);
5108	if (!fn)
5109		goto out;
5110
5111	leaf = rcu_dereference(fn->leaf);
5112	if (!leaf)
5113		goto out;
5114
5115	if (rt == leaf ||
5116	    (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
5117	     rt6_qualify_for_ecmp(leaf)))
5118		should_notify = true;
5119out:
5120	rcu_read_unlock();
5121
5122	return should_notify;
5123}
5124
5125static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
5126			     struct netlink_ext_ack *extack)
5127{
5128	if (nla_len(nla) < sizeof(*gw)) {
5129		NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
5130		return -EINVAL;
5131	}
5132
5133	*gw = nla_get_in6_addr(nla);
5134
5135	return 0;
5136}
5137
5138static int ip6_route_multipath_add(struct fib6_config *cfg,
5139				   struct netlink_ext_ack *extack)
5140{
5141	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
5142	struct nl_info *info = &cfg->fc_nlinfo;
5143	struct fib6_config r_cfg;
5144	struct rtnexthop *rtnh;
5145	struct fib6_info *rt;
5146	struct rt6_nh *err_nh;
5147	struct rt6_nh *nh, *nh_safe;
5148	__u16 nlflags;
5149	int remaining;
5150	int attrlen;
5151	int err = 1;
5152	int nhn = 0;
5153	int replace = (cfg->fc_nlinfo.nlh &&
5154		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
5155	LIST_HEAD(rt6_nh_list);
5156
5157	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
5158	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
5159		nlflags |= NLM_F_APPEND;
5160
5161	remaining = cfg->fc_mp_len;
5162	rtnh = (struct rtnexthop *)cfg->fc_mp;
5163
5164	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
5165	 * fib6_info structs per nexthop
5166	 */
5167	while (rtnh_ok(rtnh, remaining)) {
5168		memcpy(&r_cfg, cfg, sizeof(*cfg));
5169		if (rtnh->rtnh_ifindex)
5170			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5171
5172		attrlen = rtnh_attrlen(rtnh);
5173		if (attrlen > 0) {
5174			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5175
5176			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5177			if (nla) {
5178				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
5179							extack);
5180				if (err)
5181					goto cleanup;
5182
5183				r_cfg.fc_flags |= RTF_GATEWAY;
5184			}
5185			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
5186
5187			/* RTA_ENCAP_TYPE length checked in
5188			 * lwtunnel_valid_encap_type_attr
5189			 */
5190			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
5191			if (nla)
5192				r_cfg.fc_encap_type = nla_get_u16(nla);
5193		}
5194
5195		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
5196		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
5197		if (IS_ERR(rt)) {
5198			err = PTR_ERR(rt);
5199			rt = NULL;
5200			goto cleanup;
5201		}
5202		if (!rt6_qualify_for_ecmp(rt)) {
5203			err = -EINVAL;
5204			NL_SET_ERR_MSG(extack,
5205				       "Device only routes can not be added for IPv6 using the multipath API.");
5206			fib6_info_release(rt);
5207			goto cleanup;
5208		}
5209
5210		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
5211
5212		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
5213					    rt, &r_cfg);
5214		if (err) {
5215			fib6_info_release(rt);
5216			goto cleanup;
5217		}
5218
5219		rtnh = rtnh_next(rtnh, &remaining);
5220	}
5221
5222	if (list_empty(&rt6_nh_list)) {
5223		NL_SET_ERR_MSG(extack,
5224			       "Invalid nexthop configuration - no valid nexthops");
5225		return -EINVAL;
5226	}
5227
5228	/* For add and replace, send one notification with all nexthops.
5229	 * Skip the notification in fib6_add_rt2node and send one with
5230	 * the full route when done.
5231	 */
5232	info->skip_notify = 1;
5233
5234	/* For add and replace, send one notification with all nexthops. For
5235	 * append, send one notification with all appended nexthops.
5236	 */
5237	info->skip_notify_kernel = 1;
5238
5239	err_nh = NULL;
5240	list_for_each_entry(nh, &rt6_nh_list, next) {
5241		err = __ip6_ins_rt(nh->fib6_info, info, extack);
5242
5243		if (err) {
5244			if (replace && nhn)
5245				NL_SET_ERR_MSG_MOD(extack,
5246						   "multipath route replace failed (check consistency of installed routes)");
5247			err_nh = nh;
5248			goto add_errout;
5249		}
5250		/* save reference to last route successfully inserted */
5251		rt_last = nh->fib6_info;
5252
5253		/* save reference to first route for notification */
5254		if (!rt_notif)
5255			rt_notif = nh->fib6_info;
5256
5257		/* Because each route is added like a single route, we remove
5258		 * these flags after the first nexthop: if there is a collision,
5259		 * we have already failed to add the first nexthop, since
5260		 * fib6_add_rt2node() has rejected it; when replacing, the old
5261		 * nexthops have been replaced by the first new one, and the
5262		 * rest should be appended to it.
5263		 */
5264		if (cfg->fc_nlinfo.nlh) {
5265			cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
5266							     NLM_F_REPLACE);
5267			cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
5268		}
5269		nhn++;
5270	}
5271
5272	/* An in-kernel notification should only be sent if the new
5273	 * multipath route is added as the first route in the node, or if
5274	 * it was appended to it. We pass 'rt_notif' since it is the first
5275	 * sibling and might allow us to skip some checks in the replace case.
5276	 */
5277	if (ip6_route_mpath_should_notify(rt_notif)) {
5278		enum fib_event_type fib_event;
5279
5280		if (rt_notif->fib6_nsiblings != nhn - 1)
5281			fib_event = FIB_EVENT_ENTRY_APPEND;
5282		else
5283			fib_event = FIB_EVENT_ENTRY_REPLACE;
5284
5285		err = call_fib6_multipath_entry_notifiers(info->nl_net,
5286							  fib_event, rt_notif,
5287							  nhn - 1, extack);
5288		if (err) {
5289			/* Delete all the siblings that were just added */
5290			err_nh = NULL;
5291			goto add_errout;
5292		}
5293	}
5294
5295	/* success ... tell user about new route */
5296	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5297	goto cleanup;
5298
5299add_errout:
5300	/* send notification for routes that were added so that
5301	 * the delete notifications sent by ip6_route_del are
5302	 * coherent
5303	 */
5304	if (rt_notif)
5305		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5306
5307	/* Delete routes that were already added */
5308	list_for_each_entry(nh, &rt6_nh_list, next) {
5309		if (err_nh == nh)
5310			break;
5311		ip6_route_del(&nh->r_cfg, extack);
5312	}
5313
5314cleanup:
5315	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
5316		fib6_info_release(nh->fib6_info);
5317		list_del(&nh->next);
5318		kfree(nh);
5319	}
5320
5321	return err;
5322}
5323
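/*
 * Editorial illustration (not part of the original source): the
 * RTA_MULTIPATH attribute parsed by the two functions above carries a
 * packed array of rtnexthop entries, each optionally followed by
 * nested attributes, per <linux/rtnetlink.h>:
 *
 *   RTA_MULTIPATH
 *     struct rtnexthop { rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex }
 *       RTA_GATEWAY (struct in6_addr)
 *       RTA_ENCAP / RTA_ENCAP_TYPE (optional)
 *     struct rtnexthop ...
 *       RTA_GATEWAY ...
 *
 * rtnh_ok()/rtnh_next() walk the array, and rtnh_hops + 1 becomes the
 * fib_nh_weight used later by rt6_multipath_upper_bound_set().
 */
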
5324static int ip6_route_multipath_del(struct fib6_config *cfg,
5325				   struct netlink_ext_ack *extack)
5326{
5327	struct fib6_config r_cfg;
5328	struct rtnexthop *rtnh;
5329	int last_err = 0;
5330	int remaining;
5331	int attrlen;
5332	int err;
5333
5334	remaining = cfg->fc_mp_len;
5335	rtnh = (struct rtnexthop *)cfg->fc_mp;
5336
5337	/* Parse a Multipath Entry */
5338	while (rtnh_ok(rtnh, remaining)) {
5339		memcpy(&r_cfg, cfg, sizeof(*cfg));
5340		if (rtnh->rtnh_ifindex)
5341			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5342
5343		attrlen = rtnh_attrlen(rtnh);
5344		if (attrlen > 0) {
5345			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5346
5347			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5348			if (nla) {
5349				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
5350							extack);
5351				if (err) {
5352					last_err = err;
5353					goto next_rtnh;
5354				}
5355
5356				r_cfg.fc_flags |= RTF_GATEWAY;
5357			}
5358		}
5359		err = ip6_route_del(&r_cfg, extack);
5360		if (err)
5361			last_err = err;
5362
5363next_rtnh:
5364		rtnh = rtnh_next(rtnh, &remaining);
5365	}
5366
5367	return last_err;
5368}
5369
5370static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
5371			      struct netlink_ext_ack *extack)
5372{
5373	struct fib6_config cfg;
5374	int err;
5375
5376	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
5377	if (err < 0)
5378		return err;
5379
5380	if (cfg.fc_nh_id &&
5381	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
5382		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
5383		return -EINVAL;
5384	}
5385
5386	if (cfg.fc_mp)
5387		return ip6_route_multipath_del(&cfg, extack);
5388	else {
5389		cfg.fc_delete_all_nh = 1;
5390		return ip6_route_del(&cfg, extack);
5391	}
5392}
5393
5394static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
5395			      struct netlink_ext_ack *extack)
5396{
5397	struct fib6_config cfg;
5398	int err;
5399
5400	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
5401	if (err < 0)
5402		return err;
5403
5404	if (cfg.fc_metric == 0)
5405		cfg.fc_metric = IP6_RT_PRIO_USER;
5406
5407	if (cfg.fc_mp)
5408		return ip6_route_multipath_add(&cfg, extack);
5409	else
5410		return ip6_route_add(&cfg, GFP_KERNEL, extack);
5411}
5412
5413/* add the overhead of this fib6_nh to nexthop_len */
5414static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
5415{
5416	int *nexthop_len = arg;
5417
5418	*nexthop_len += nla_total_size(0)	 /* RTA_MULTIPATH */
5419		     + NLA_ALIGN(sizeof(struct rtnexthop))
5420		     + nla_total_size(16); /* RTA_GATEWAY */
5421
5422	if (nh->fib_nh_lws) {
5423		/* RTA_ENCAP */
5424		*nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5425		/* RTA_ENCAP_TYPE (u16) */
5426		*nexthop_len += nla_total_size(2);
5427	}
5428
5429	return 0;
5430}
5431
5432static size_t rt6_nlmsg_size(struct fib6_info *f6i)
5433{
5434	int nexthop_len;
5435
5436	if (f6i->nh) {
5437		nexthop_len = nla_total_size(4); /* RTA_NH_ID */
5438		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
5439					 &nexthop_len);
5440	} else {
5441		struct fib6_info *sibling, *next_sibling;
5442		struct fib6_nh *nh = f6i->fib6_nh;
5443
5444		nexthop_len = 0;
5445		if (f6i->fib6_nsiblings) {
5446			rt6_nh_nlmsg_size(nh, &nexthop_len);
5447
5448			list_for_each_entry_safe(sibling, next_sibling,
5449						 &f6i->fib6_siblings, fib6_siblings) {
5450				rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
5451			}
5452		}
5453		nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5454	}
5455
5456	return NLMSG_ALIGN(sizeof(struct rtmsg))
5457	       + nla_total_size(16) /* RTA_SRC */
5458	       + nla_total_size(16) /* RTA_DST */
5459	       + nla_total_size(16) /* RTA_GATEWAY */
5460	       + nla_total_size(16) /* RTA_PREFSRC */
5461	       + nla_total_size(4) /* RTA_TABLE */
5462	       + nla_total_size(4) /* RTA_IIF */
5463	       + nla_total_size(4) /* RTA_OIF */
5464	       + nla_total_size(4) /* RTA_PRIORITY */
5465	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
5466	       + nla_total_size(sizeof(struct rta_cacheinfo))
5467	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
5468	       + nla_total_size(1) /* RTA_PREF */
5469	       + nexthop_len;
5470}
5471
5472static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
5473				 unsigned char *flags)
5474{
5475	if (nexthop_is_multipath(nh)) {
5476		struct nlattr *mp;
5477
5478		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
5479		if (!mp)
5480			goto nla_put_failure;
5481
5482		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
5483			goto nla_put_failure;
5484
5485		nla_nest_end(skb, mp);
5486	} else {
5487		struct fib6_nh *fib6_nh;
5488
5489		fib6_nh = nexthop_fib6_nh(nh);
5490		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
5491				     flags, false) < 0)
5492			goto nla_put_failure;
5493	}
5494
5495	return 0;
5496
5497nla_put_failure:
5498	return -EMSGSIZE;
5499}
5500
5501static int rt6_fill_node(struct net *net, struct sk_buff *skb,
5502			 struct fib6_info *rt, struct dst_entry *dst,
5503			 struct in6_addr *dest, struct in6_addr *src,
5504			 int iif, int type, u32 portid, u32 seq,
5505			 unsigned int flags)
5506{
5507	struct rt6_info *rt6 = (struct rt6_info *)dst;
5508	struct rt6key *rt6_dst, *rt6_src;
5509	u32 *pmetrics, table, rt6_flags;
5510	unsigned char nh_flags = 0;
5511	struct nlmsghdr *nlh;
5512	struct rtmsg *rtm;
5513	long expires = 0;
5514
5515	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
5516	if (!nlh)
5517		return -EMSGSIZE;
5518
5519	if (rt6) {
5520		rt6_dst = &rt6->rt6i_dst;
5521		rt6_src = &rt6->rt6i_src;
5522		rt6_flags = rt6->rt6i_flags;
5523	} else {
5524		rt6_dst = &rt->fib6_dst;
5525		rt6_src = &rt->fib6_src;
5526		rt6_flags = rt->fib6_flags;
5527	}
5528
5529	rtm = nlmsg_data(nlh);
5530	rtm->rtm_family = AF_INET6;
5531	rtm->rtm_dst_len = rt6_dst->plen;
5532	rtm->rtm_src_len = rt6_src->plen;
5533	rtm->rtm_tos = 0;
5534	if (rt->fib6_table)
5535		table = rt->fib6_table->tb6_id;
5536	else
5537		table = RT6_TABLE_UNSPEC;
5538	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
5539	if (nla_put_u32(skb, RTA_TABLE, table))
5540		goto nla_put_failure;
5541
5542	rtm->rtm_type = rt->fib6_type;
5543	rtm->rtm_flags = 0;
5544	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
5545	rtm->rtm_protocol = rt->fib6_protocol;
5546
5547	if (rt6_flags & RTF_CACHE)
5548		rtm->rtm_flags |= RTM_F_CLONED;
5549
5550	if (dest) {
5551		if (nla_put_in6_addr(skb, RTA_DST, dest))
5552			goto nla_put_failure;
5553		rtm->rtm_dst_len = 128;
5554	} else if (rtm->rtm_dst_len)
5555		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
5556			goto nla_put_failure;
5557#ifdef CONFIG_IPV6_SUBTREES
5558	if (src) {
5559		if (nla_put_in6_addr(skb, RTA_SRC, src))
5560			goto nla_put_failure;
5561		rtm->rtm_src_len = 128;
5562	} else if (rtm->rtm_src_len &&
5563		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
5564		goto nla_put_failure;
5565#endif
5566	if (iif) {
5567#ifdef CONFIG_IPV6_MROUTE
5568		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
5569			int err = ip6mr_get_route(net, skb, rtm, portid);
5570
5571			if (err == 0)
5572				return 0;
5573			if (err < 0)
5574				goto nla_put_failure;
5575		} else
5576#endif
5577			if (nla_put_u32(skb, RTA_IIF, iif))
5578				goto nla_put_failure;
5579	} else if (dest) {
5580		struct in6_addr saddr_buf;
5581		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
5582		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
5583			goto nla_put_failure;
5584	}
5585
5586	if (rt->fib6_prefsrc.plen) {
5587		struct in6_addr saddr_buf;
5588		saddr_buf = rt->fib6_prefsrc.addr;
5589		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
5590			goto nla_put_failure;
5591	}
5592
5593	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
5594	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
5595		goto nla_put_failure;
5596
5597	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
5598		goto nla_put_failure;
5599
5600	/* For multipath routes, walk the siblings list and add
5601	 * each as a nexthop within RTA_MULTIPATH.
5602	 */
5603	if (rt6) {
5604		if (rt6_flags & RTF_GATEWAY &&
5605		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
5606			goto nla_put_failure;
5607
5608		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
5609			goto nla_put_failure;
5610	} else if (rt->fib6_nsiblings) {
5611		struct fib6_info *sibling, *next_sibling;
5612		struct nlattr *mp;
5613
5614		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
5615		if (!mp)
5616			goto nla_put_failure;
5617
5618		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
5619				    rt->fib6_nh->fib_nh_weight, AF_INET6,
5620				    0) < 0)
5621			goto nla_put_failure;
5622
5623		list_for_each_entry_safe(sibling, next_sibling,
5624					 &rt->fib6_siblings, fib6_siblings) {
5625			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
5626					    sibling->fib6_nh->fib_nh_weight,
5627					    AF_INET6, 0) < 0)
5628				goto nla_put_failure;
5629		}
5630
5631		nla_nest_end(skb, mp);
5632	} else if (rt->nh) {
5633		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
5634			goto nla_put_failure;
5635
5636		if (nexthop_is_blackhole(rt->nh))
5637			rtm->rtm_type = RTN_BLACKHOLE;
5638
5639		if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
5640		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
5641			goto nla_put_failure;
5642
5643		rtm->rtm_flags |= nh_flags;
5644	} else {
5645		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
5646				     &nh_flags, false) < 0)
5647			goto nla_put_failure;
5648
5649		rtm->rtm_flags |= nh_flags;
5650	}
5651
5652	if (rt6_flags & RTF_EXPIRES) {
5653		expires = dst ? dst->expires : rt->expires;
5654		expires -= jiffies;
5655	}
5656
5657	if (!dst) {
5658		if (rt->offload)
5659			rtm->rtm_flags |= RTM_F_OFFLOAD;
5660		if (rt->trap)
5661			rtm->rtm_flags |= RTM_F_TRAP;
5662	}
5663
5664	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
5665		goto nla_put_failure;
5666
5667	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
5668		goto nla_put_failure;
5669
5670
5671	nlmsg_end(skb, nlh);
5672	return 0;
5673
5674nla_put_failure:
5675	nlmsg_cancel(skb, nlh);
5676	return -EMSGSIZE;
5677}
5678
5679static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
5680{
5681	const struct net_device *dev = arg;
5682
5683	if (nh->fib_nh_dev == dev)
5684		return 1;
5685
5686	return 0;
5687}
5688
5689static bool fib6_info_uses_dev(const struct fib6_info *f6i,
5690			       const struct net_device *dev)
5691{
5692	if (f6i->nh) {
5693		struct net_device *_dev = (struct net_device *)dev;
5694
5695		return !!nexthop_for_each_fib6_nh(f6i->nh,
5696						  fib6_info_nh_uses_dev,
5697						  _dev);
5698	}
5699
5700	if (f6i->fib6_nh->fib_nh_dev == dev)
5701		return true;
5702
5703	if (f6i->fib6_nsiblings) {
5704		struct fib6_info *sibling, *next_sibling;
5705
5706		list_for_each_entry_safe(sibling, next_sibling,
5707					 &f6i->fib6_siblings, fib6_siblings) {
5708			if (sibling->fib6_nh->fib_nh_dev == dev)
5709				return true;
5710		}
5711	}
5712
5713	return false;
5714}
5715
5716struct fib6_nh_exception_dump_walker {
5717	struct rt6_rtnl_dump_arg *dump;
5718	struct fib6_info *rt;
5719	unsigned int flags;
5720	unsigned int skip;
5721	unsigned int count;
5722};
5723
5724static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
5725{
5726	struct fib6_nh_exception_dump_walker *w = arg;
5727	struct rt6_rtnl_dump_arg *dump = w->dump;
5728	struct rt6_exception_bucket *bucket;
5729	struct rt6_exception *rt6_ex;
5730	int i, err;
5731
5732	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
5733	if (!bucket)
5734		return 0;
5735
5736	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
5737		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
5738			if (w->skip) {
5739				w->skip--;
5740				continue;
5741			}
5742
5743			/* Expiration of entries doesn't bump sernum, insertion
5744			 * does. Removal is triggered by insertion, so we can
5745			 * rely on the fact that if entries change between two
5746			 * partial dumps, this node is scanned again completely,
5747			 * see rt6_insert_exception() and fib6_dump_table().
5748			 *
5749			 * Count expired entries we go through as handled
5750			 * entries that we'll skip next time, in case of partial
5751			 * node dump. Otherwise, if entries expire meanwhile,
5752			 * we'll skip the wrong amount.
5753			 */
5754			if (rt6_check_expired(rt6_ex->rt6i)) {
5755				w->count++;
5756				continue;
5757			}
5758
5759			err = rt6_fill_node(dump->net, dump->skb, w->rt,
5760					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
5761					    RTM_NEWROUTE,
5762					    NETLINK_CB(dump->cb->skb).portid,
5763					    dump->cb->nlh->nlmsg_seq, w->flags);
5764			if (err)
5765				return err;
5766
5767			w->count++;
5768		}
5769		bucket++;
5770	}
5771
5772	return 0;
5773}
5774
5775/* Return -1 if done with the node, or the number of handled routes on a partial dump */
5776int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
5777{
5778	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
5779	struct fib_dump_filter *filter = &arg->filter;
5780	unsigned int flags = NLM_F_MULTI;
5781	struct net *net = arg->net;
5782	int count = 0;
5783
5784	if (rt == net->ipv6.fib6_null_entry)
5785		return -1;
5786
5787	if ((filter->flags & RTM_F_PREFIX) &&
5788	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
5789		/* success since this is not a prefix route */
5790		return -1;
5791	}
5792	if (filter->filter_set &&
5793	    ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
5794	     (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
5795	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
5796		return -1;
5797	}
5798
5799	if (filter->filter_set ||
5800	    !filter->dump_routes || !filter->dump_exceptions) {
5801		flags |= NLM_F_DUMP_FILTERED;
5802	}
5803
5804	if (filter->dump_routes) {
5805		if (skip) {
5806			skip--;
5807		} else {
5808			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
5809					  0, RTM_NEWROUTE,
5810					  NETLINK_CB(arg->cb->skb).portid,
5811					  arg->cb->nlh->nlmsg_seq, flags)) {
5812				return 0;
5813			}
5814			count++;
5815		}
5816	}
5817
5818	if (filter->dump_exceptions) {
5819		struct fib6_nh_exception_dump_walker w = { .dump = arg,
5820							   .rt = rt,
5821							   .flags = flags,
5822							   .skip = skip,
5823							   .count = 0 };
5824		int err;
5825
5826		rcu_read_lock();
5827		if (rt->nh) {
5828			err = nexthop_for_each_fib6_nh(rt->nh,
5829						       rt6_nh_dump_exceptions,
5830						       &w);
5831		} else {
5832			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
5833		}
5834		rcu_read_unlock();
5835
5836		if (err)
5837			return count += w.count;
5838	}
5839
5840	return -1;
5841}
5842
5843static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
5844					const struct nlmsghdr *nlh,
5845					struct nlattr **tb,
5846					struct netlink_ext_ack *extack)
5847{
5848	struct rtmsg *rtm;
5849	int i, err;
5850
5851	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
5852		NL_SET_ERR_MSG_MOD(extack,
5853				   "Invalid header for get route request");
5854		return -EINVAL;
5855	}
5856
5857	if (!netlink_strict_get_check(skb))
5858		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5859					      rtm_ipv6_policy, extack);
5860
5861	rtm = nlmsg_data(nlh);
5862	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
5863	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
5864	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
5865	    rtm->rtm_type) {
5866		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
5867		return -EINVAL;
5868	}
5869	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
5870		NL_SET_ERR_MSG_MOD(extack,
5871				   "Invalid flags for get route request");
5872		return -EINVAL;
5873	}
5874
5875	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
5876					    rtm_ipv6_policy, extack);
5877	if (err)
5878		return err;
5879
5880	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
5881	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
5882		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
5883		return -EINVAL;
5884	}
5885
5886	for (i = 0; i <= RTA_MAX; i++) {
5887		if (!tb[i])
5888			continue;
5889
5890		switch (i) {
5891		case RTA_SRC:
5892		case RTA_DST:
5893		case RTA_IIF:
5894		case RTA_OIF:
5895		case RTA_MARK:
5896		case RTA_UID:
5897		case RTA_SPORT:
5898		case RTA_DPORT:
5899		case RTA_IP_PROTO:
5900			break;
5901		default:
5902			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
5903			return -EINVAL;
5904		}
5905	}
5906
5907	return 0;
5908}
5909
5910static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
5911			      struct netlink_ext_ack *extack)
5912{
5913	struct net *net = sock_net(in_skb->sk);
5914	struct nlattr *tb[RTA_MAX+1];
5915	int err, iif = 0, oif = 0;
5916	struct fib6_info *from;
5917	struct dst_entry *dst;
5918	struct rt6_info *rt;
5919	struct sk_buff *skb;
5920	struct rtmsg *rtm;
5921	struct flowi6 fl6 = {};
5922	bool fibmatch;
5923
5924	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
5925	if (err < 0)
5926		goto errout;
5927
5928	err = -EINVAL;
5929	rtm = nlmsg_data(nlh);
5930	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
5931	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
5932
5933	if (tb[RTA_SRC]) {
5934		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
5935			goto errout;
5936
5937		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
5938	}
5939
5940	if (tb[RTA_DST]) {
5941		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
5942			goto errout;
5943
5944		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
5945	}
5946
5947	if (tb[RTA_IIF])
5948		iif = nla_get_u32(tb[RTA_IIF]);
5949
5950	if (tb[RTA_OIF])
5951		oif = nla_get_u32(tb[RTA_OIF]);
5952
5953	if (tb[RTA_MARK])
5954		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
5955
5956	if (tb[RTA_UID])
5957		fl6.flowi6_uid = make_kuid(current_user_ns(),
5958					   nla_get_u32(tb[RTA_UID]));
5959	else
5960		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5961
5962	if (tb[RTA_SPORT])
5963		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5964
5965	if (tb[RTA_DPORT])
5966		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5967
5968	if (tb[RTA_IP_PROTO]) {
5969		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5970						  &fl6.flowi6_proto, AF_INET6,
5971						  extack);
5972		if (err)
5973			goto errout;
5974	}
5975
5976	if (iif) {
5977		struct net_device *dev;
5978		int flags = 0;
5979
5980		rcu_read_lock();
5981
5982		dev = dev_get_by_index_rcu(net, iif);
5983		if (!dev) {
5984			rcu_read_unlock();
5985			err = -ENODEV;
5986			goto errout;
5987		}
5988
5989		fl6.flowi6_iif = iif;
5990
5991		if (!ipv6_addr_any(&fl6.saddr))
5992			flags |= RT6_LOOKUP_F_HAS_SADDR;
5993
5994		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5995
5996		rcu_read_unlock();
5997	} else {
5998		fl6.flowi6_oif = oif;
5999
6000		dst = ip6_route_output(net, NULL, &fl6);
6001	}
6002
6004	rt = container_of(dst, struct rt6_info, dst);
6005	if (rt->dst.error) {
6006		err = rt->dst.error;
6007		ip6_rt_put(rt);
6008		goto errout;
6009	}
6010
6011	if (rt == net->ipv6.ip6_null_entry) {
6012		err = rt->dst.error;
6013		ip6_rt_put(rt);
6014		goto errout;
6015	}
6016
6017	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
6018	if (!skb) {
6019		ip6_rt_put(rt);
6020		err = -ENOBUFS;
6021		goto errout;
6022	}
6023
6024	skb_dst_set(skb, &rt->dst);
6025
6026	rcu_read_lock();
6027	from = rcu_dereference(rt->from);
6028	if (from) {
6029		if (fibmatch)
6030			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
6031					    iif, RTM_NEWROUTE,
6032					    NETLINK_CB(in_skb).portid,
6033					    nlh->nlmsg_seq, 0);
6034		else
6035			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
6036					    &fl6.saddr, iif, RTM_NEWROUTE,
6037					    NETLINK_CB(in_skb).portid,
6038					    nlh->nlmsg_seq, 0);
6039	} else {
6040		err = -ENETUNREACH;
6041	}
6042	rcu_read_unlock();
6043
6044	if (err < 0) {
6045		kfree_skb(skb);
6046		goto errout;
6047	}
6048
6049	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
6050errout:
6051	return err;
6052}
6053
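/* Usage note: this is the handler behind "ip -6 route get". With
 * RTM_F_FIB_MATCH (iproute2's "fibmatch" keyword) the reply describes
 * the matched FIB entry rather than the resulting dst, and supplying
 * RTA_IIF switches the lookup from ip6_route_output() to
 * ip6_route_input_lookup() to emulate a packet arriving on that
 * interface.
 */
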
6054void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
6055		     unsigned int nlm_flags)
6056{
6057	struct sk_buff *skb;
6058	struct net *net = info->nl_net;
6059	u32 seq;
6060	int err;
6061
6062	err = -ENOBUFS;
6063	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
6064
6065	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6066	if (!skb)
6067		goto errout;
6068
6069	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
6070			    event, info->portid, seq, nlm_flags);
6071	if (err < 0) {
6072		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6073		WARN_ON(err == -EMSGSIZE);
6074		kfree_skb(skb);
6075		goto errout;
6076	}
6077	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6078		    info->nlh, gfp_any());
6079	return;
6080errout:
6081	if (err < 0)
6082		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6083}
6084
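/* inet6_rt_notify() may run in process or softirq context, hence
 * gfp_any() (GFP_KERNEL vs GFP_ATOMIC) for the skb allocation. On
 * failure the error is parked on the RTNLGRP_IPV6_ROUTE group via
 * rtnl_set_sk_err(), so subscribed listeners see ENOBUFS instead of
 * silently losing the event.
 */
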
6085void fib6_rt_update(struct net *net, struct fib6_info *rt,
6086		    struct nl_info *info)
6087{
6088	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
6089	struct sk_buff *skb;
6090	int err = -ENOBUFS;
6091
6092	/* call_fib6_entry_notifiers will be removed when in-kernel notifier
6093	 * is implemented and supported for nexthop objects
6094	 */
6095	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
6096
6097	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6098	if (!skb)
6099		goto errout;
6100
6101	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
6102			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
6103	if (err < 0) {
6104		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6105		WARN_ON(err == -EMSGSIZE);
6106		kfree_skb(skb);
6107		goto errout;
6108	}
6109	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6110		    info->nlh, gfp_any());
6111	return;
6112errout:
6113	if (err < 0)
6114		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6115}
6116
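/* Both notifiers above feed the RTNLGRP_IPV6_ROUTE multicast group and
 * can be watched from userspace with, e.g., "ip -6 monitor route";
 * fib6_rt_update() differs from inet6_rt_notify() only in always
 * sending RTM_NEWROUTE with NLM_F_REPLACE for an in-place update of an
 * existing entry.
 */
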
6117static int ip6_route_dev_notify(struct notifier_block *this,
6118				unsigned long event, void *ptr)
6119{
6120	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
6121	struct net *net = dev_net(dev);
6122
6123	if (!(dev->flags & IFF_LOOPBACK))
6124		return NOTIFY_OK;
6125
6126	if (event == NETDEV_REGISTER) {
6127		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
6128		net->ipv6.ip6_null_entry->dst.dev = dev;
6129		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
6130#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6131		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
6132		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
6133		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
6134		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
6135#endif
6136	} else if (event == NETDEV_UNREGISTER &&
6137		    dev->reg_state != NETREG_UNREGISTERED) {
6138		/* NETDEV_UNREGISTER can be fired multiple times by
6139		 * netdev_wait_allrefs(). Make sure we only do this once.
6140		 */
6141		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
6142#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6143		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
6144		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
6145#endif
6146	}
6147
6148	return NOTIFY_OK;
6149}
6150
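/* The null/prohibit/blackhole template dsts allocated in
 * ip6_route_net_init() below start out without a device; the notifier
 * above attaches them to a netns's loopback on NETDEV_REGISTER and
 * drops the idev references on the first NETDEV_UNREGISTER. init_net is
 * handled separately by ip6_route_init_special_entries(), since its
 * loopback registers before this notifier does.
 */
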
6151/*
6152 *	/proc
6153 */
6154
6155#ifdef CONFIG_PROC_FS
6156static int rt6_stats_seq_show(struct seq_file *seq, void *v)
6157{
6158	struct net *net = (struct net *)seq->private;
6159	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
6160		   net->ipv6.rt6_stats->fib_nodes,
6161		   net->ipv6.rt6_stats->fib_route_nodes,
6162		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
6163		   net->ipv6.rt6_stats->fib_rt_entries,
6164		   net->ipv6.rt6_stats->fib_rt_cache,
6165		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
6166		   net->ipv6.rt6_stats->fib_discarded_routes);
6167
6168	return 0;
6169}
6170#endif	/* CONFIG_PROC_FS */
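
/* Column legend for /proc/net/rt6_stats, in the order printed above
 * (all hex): fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, current dst entries (dst_entries_get_slow()) and
 * fib_discarded_routes.
 */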
6171
6172#ifdef CONFIG_SYSCTL
6173
6174static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
6175			      void *buffer, size_t *lenp, loff_t *ppos)
6176{
6177	struct net *net;
6178	int delay;
6179	int ret;
6180	if (!write)
6181		return -EINVAL;
6182
6183	net = (struct net *)ctl->extra1;
6184	delay = net->ipv6.sysctl.flush_delay;
6185	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
6186	if (ret)
6187		return ret;
6188
6189	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
6190	return 0;
6191}
6192
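/* Usage sketch: any integer written to the (write-only) flush sysctl
 * triggers a flush, e.g.
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 * Note that "delay" is sampled before proc_dointvec() stores the new
 * value, so each flush runs with the previously stored delay; the value
 * just written only takes effect on the next one.
 */
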
6193static struct ctl_table ipv6_route_table_template[] = {
6194	{
6195		.procname	=	"flush",
6196		.data		=	&init_net.ipv6.sysctl.flush_delay,
6197		.maxlen		=	sizeof(int),
6198		.mode		=	0200,
6199		.proc_handler	=	ipv6_sysctl_rtcache_flush
6200	},
6201	{
6202		.procname	=	"gc_thresh",
6203		.data		=	&ip6_dst_ops_template.gc_thresh,
6204		.maxlen		=	sizeof(int),
6205		.mode		=	0644,
6206		.proc_handler	=	proc_dointvec,
6207	},
6208	{
6209		.procname	=	"max_size",
6210		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
6211		.maxlen		=	sizeof(int),
6212		.mode		=	0644,
6213		.proc_handler	=	proc_dointvec,
6214	},
6215	{
6216		.procname	=	"gc_min_interval",
6217		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
6218		.maxlen		=	sizeof(int),
6219		.mode		=	0644,
6220		.proc_handler	=	proc_dointvec_jiffies,
6221	},
6222	{
6223		.procname	=	"gc_timeout",
6224		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
6225		.maxlen		=	sizeof(int),
6226		.mode		=	0644,
6227		.proc_handler	=	proc_dointvec_jiffies,
6228	},
6229	{
6230		.procname	=	"gc_interval",
6231		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
6232		.maxlen		=	sizeof(int),
6233		.mode		=	0644,
6234		.proc_handler	=	proc_dointvec_jiffies,
6235	},
6236	{
6237		.procname	=	"gc_elasticity",
6238		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
6239		.maxlen		=	sizeof(int),
6240		.mode		=	0644,
6241		.proc_handler	=	proc_dointvec,
6242	},
6243	{
6244		.procname	=	"mtu_expires",
6245		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
6246		.maxlen		=	sizeof(int),
6247		.mode		=	0644,
6248		.proc_handler	=	proc_dointvec_jiffies,
6249	},
6250	{
6251		.procname	=	"min_adv_mss",
6252		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
6253		.maxlen		=	sizeof(int),
6254		.mode		=	0644,
6255		.proc_handler	=	proc_dointvec,
6256	},
6257	{
6258		.procname	=	"gc_min_interval_ms",
6259		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
6260		.maxlen		=	sizeof(int),
6261		.mode		=	0644,
6262		.proc_handler	=	proc_dointvec_ms_jiffies,
6263	},
6264	{
6265		.procname	=	"skip_notify_on_dev_down",
6266		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
6267		.maxlen		=	sizeof(int),
6268		.mode		=	0644,
6269		.proc_handler	=	proc_dointvec_minmax,
6270		.extra1		=	SYSCTL_ZERO,
6271		.extra2		=	SYSCTL_ONE,
6272	},
6273	{ }
6274};
6275
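/* Keep this template and ipv6_route_sysctl_init() below in sync: the
 * per-netns copy is rewired by fixed index (table[0]..table[10]), so
 * adding or reordering entries here without renumbering those
 * assignments would silently bind a sysctl to the wrong field.
 */
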
6276struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
6277{
6278	struct ctl_table *table;
6279
6280	table = kmemdup(ipv6_route_table_template,
6281			sizeof(ipv6_route_table_template),
6282			GFP_KERNEL);
6283
6284	if (table) {
6285		table[0].data = &net->ipv6.sysctl.flush_delay;
6286		table[0].extra1 = net;
6287		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
6288		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
6289		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6290		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
6291		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
6292		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
6293		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
6294		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
6295		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6296		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
6297
6298		/* Don't export sysctls to unprivileged users */
6299		if (net->user_ns != &init_user_ns)
6300			table[0].procname = NULL;
6301	}
6302
6303	return table;
6304}
6305#endif
6306
6307static int __net_init ip6_route_net_init(struct net *net)
6308{
6309	int ret = -ENOMEM;
6310
6311	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
6312	       sizeof(net->ipv6.ip6_dst_ops));
6313
6314	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
6315		goto out_ip6_dst_ops;
6316
6317	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
6318	if (!net->ipv6.fib6_null_entry)
6319		goto out_ip6_dst_entries;
6320	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
6321	       sizeof(*net->ipv6.fib6_null_entry));
6322
6323	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
6324					   sizeof(*net->ipv6.ip6_null_entry),
6325					   GFP_KERNEL);
6326	if (!net->ipv6.ip6_null_entry)
6327		goto out_fib6_null_entry;
6328	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6329	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
6330			 ip6_template_metrics, true);
6331	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
6332
6333#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6334	net->ipv6.fib6_has_custom_rules = false;
6335	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
6336					       sizeof(*net->ipv6.ip6_prohibit_entry),
6337					       GFP_KERNEL);
6338	if (!net->ipv6.ip6_prohibit_entry)
6339		goto out_ip6_null_entry;
6340	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6341	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
6342			 ip6_template_metrics, true);
6343	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
6344
6345	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
6346					       sizeof(*net->ipv6.ip6_blk_hole_entry),
6347					       GFP_KERNEL);
6348	if (!net->ipv6.ip6_blk_hole_entry)
6349		goto out_ip6_prohibit_entry;
6350	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6351	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
6352			 ip6_template_metrics, true);
6353	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
6354#ifdef CONFIG_IPV6_SUBTREES
6355	net->ipv6.fib6_routes_require_src = 0;
6356#endif
6357#endif
6358
6359	net->ipv6.sysctl.flush_delay = 0;
6360	net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
6361	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
6362	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
6363	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
6364	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
6365	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
6366	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
6367	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
6368
6369	atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
6370
6371	ret = 0;
6372out:
6373	return ret;
6374
6375#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6376out_ip6_prohibit_entry:
6377	kfree(net->ipv6.ip6_prohibit_entry);
6378out_ip6_null_entry:
6379	kfree(net->ipv6.ip6_null_entry);
6380#endif
6381out_fib6_null_entry:
6382	kfree(net->ipv6.fib6_null_entry);
6383out_ip6_dst_entries:
6384	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
6385out_ip6_dst_ops:
6386	goto out;
6387}
6388
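/* The unwind labels above run in reverse order of allocation; the final
 * out_ip6_dst_ops label exists only so the first failure can share the
 * common "goto out" return path with ret still set to -ENOMEM.
 */
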
6389static void __net_exit ip6_route_net_exit(struct net *net)
6390{
6391	kfree(net->ipv6.fib6_null_entry);
6392	kfree(net->ipv6.ip6_null_entry);
6393#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6394	kfree(net->ipv6.ip6_prohibit_entry);
6395	kfree(net->ipv6.ip6_blk_hole_entry);
6396#endif
6397	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
6398}
6399
6400static int __net_init ip6_route_net_init_late(struct net *net)
6401{
6402#ifdef CONFIG_PROC_FS
6403	if (!proc_create_net("ipv6_route", 0, net->proc_net,
6404			     &ipv6_route_seq_ops,
6405			     sizeof(struct ipv6_route_iter)))
6406		return -ENOMEM;
6407
6408	if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
6409				    rt6_stats_seq_show, NULL)) {
6410		remove_proc_entry("ipv6_route", net->proc_net);
6411		return -ENOMEM;
6412	}
6413#endif
6414	return 0;
6415}
6416
6417static void __net_exit ip6_route_net_exit_late(struct net *net)
6418{
6419#ifdef CONFIG_PROC_FS
6420	remove_proc_entry("ipv6_route", net->proc_net);
6421	remove_proc_entry("rt6_stats", net->proc_net);
6422#endif
6423}
6424
6425static struct pernet_operations ip6_route_net_ops = {
6426	.init = ip6_route_net_init,
6427	.exit = ip6_route_net_exit,
6428};
6429
6430static int __net_init ipv6_inetpeer_init(struct net *net)
6431{
6432	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
6433
6434	if (!bp)
6435		return -ENOMEM;
6436	inet_peer_base_init(bp);
6437	net->ipv6.peers = bp;
6438	return 0;
6439}
6440
6441static void __net_exit ipv6_inetpeer_exit(struct net *net)
6442{
6443	struct inet_peer_base *bp = net->ipv6.peers;
6444
6445	net->ipv6.peers = NULL;
6446	inetpeer_invalidate_tree(bp);
6447	kfree(bp);
6448}
6449
6450static struct pernet_operations ipv6_inetpeer_ops = {
6451	.init	=	ipv6_inetpeer_init,
6452	.exit	=	ipv6_inetpeer_exit,
6453};
6454
6455static struct pernet_operations ip6_route_net_late_ops = {
6456	.init = ip6_route_net_init_late,
6457	.exit = ip6_route_net_exit_late,
6458};
6459
6460static struct notifier_block ip6_route_dev_notifier = {
6461	.notifier_call = ip6_route_dev_notify,
6462	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
6463};
6464
6465void __init ip6_route_init_special_entries(void)
6466{
6467	/* Registration of the loopback device happens before this code
6468	 * runs, so the loopback reference in rt6_info is not taken there;
6469	 * take it manually for init_net. */
6470	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
6471	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
6472	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6473#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6474	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
6475	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6476	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
6477	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6478#endif
6479}
6480
6481#if IS_BUILTIN(CONFIG_IPV6)
6482#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6483DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
6484
6485BTF_ID_LIST(btf_fib6_info_id)
6486BTF_ID(struct, fib6_info)
6487
6488static const struct bpf_iter_seq_info ipv6_route_seq_info = {
6489	.seq_ops		= &ipv6_route_seq_ops,
6490	.init_seq_private	= bpf_iter_init_seq_net,
6491	.fini_seq_private	= bpf_iter_fini_seq_net,
6492	.seq_priv_size		= sizeof(struct ipv6_route_iter),
6493};
6494
6495static struct bpf_iter_reg ipv6_route_reg_info = {
6496	.target			= "ipv6_route",
6497	.ctx_arg_info_size	= 1,
6498	.ctx_arg_info		= {
6499		{ offsetof(struct bpf_iter__ipv6_route, rt),
6500		  PTR_TO_BTF_ID_OR_NULL },
6501	},
6502	.seq_info		= &ipv6_route_seq_info,
6503};
6504
6505static int __init bpf_iter_register(void)
6506{
6507	ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
6508	return bpf_iter_reg_target(&ipv6_route_reg_info);
6509}
6510
6511static void bpf_iter_unregister(void)
6512{
6513	bpf_iter_unreg_target(&ipv6_route_reg_info);
6514}
6515#endif
6516#endif
6517
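/* Hedged usage sketch for the iterator target registered above: with a
 * matching bpf_iter program (the bpf_iter_ipv6_route selftest is one),
 * the iterator can be pinned and then read like a file:
 *	bpftool iter pin bpf_iter_ipv6_route.o /sys/fs/bpf/ip6routes
 *	cat /sys/fs/bpf/ip6routes
 * Each read walks the FIB and hands the program one fib6_info at a time
 * through the bpf_iter__ipv6_route context.
 */
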
6518int __init ip6_route_init(void)
6519{
6520	int ret;
6521	int cpu;
6522
6523	ret = -ENOMEM;
6524	ip6_dst_ops_template.kmem_cachep =
6525		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
6526				  SLAB_HWCACHE_ALIGN, NULL);
6527	if (!ip6_dst_ops_template.kmem_cachep)
6528		goto out;
6529
6530	ret = dst_entries_init(&ip6_dst_blackhole_ops);
6531	if (ret)
6532		goto out_kmem_cache;
6533
6534	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
6535	if (ret)
6536		goto out_dst_entries;
6537
6538	ret = register_pernet_subsys(&ip6_route_net_ops);
6539	if (ret)
6540		goto out_register_inetpeer;
6541
6542	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
6543
6544	ret = fib6_init();
6545	if (ret)
6546		goto out_register_subsys;
6547
6548	ret = xfrm6_init();
6549	if (ret)
6550		goto out_fib6_init;
6551
6552	ret = fib6_rules_init();
6553	if (ret)
6554		goto xfrm6_init;
6555
6556	ret = register_pernet_subsys(&ip6_route_net_late_ops);
6557	if (ret)
6558		goto fib6_rules_init;
6559
6560	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
6561				   inet6_rtm_newroute, NULL, 0);
6562	if (ret < 0)
6563		goto out_register_late_subsys;
6564
6565	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
6566				   inet6_rtm_delroute, NULL, 0);
6567	if (ret < 0)
6568		goto out_register_late_subsys;
6569
6570	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
6571				   inet6_rtm_getroute, NULL,
6572				   RTNL_FLAG_DOIT_UNLOCKED);
6573	if (ret < 0)
6574		goto out_register_late_subsys;
6575
6576	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
6577	if (ret)
6578		goto out_register_late_subsys;
6579
6580#if IS_BUILTIN(CONFIG_IPV6)
6581#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6582	ret = bpf_iter_register();
6583	if (ret)
6584		goto out_register_late_subsys;
6585#endif
6586#endif
6587
6588	for_each_possible_cpu(cpu) {
6589		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
6590
6591		INIT_LIST_HEAD(&ul->head);
6592		spin_lock_init(&ul->lock);
6593	}
6594
6595out:
6596	return ret;
6597
6598out_register_late_subsys:
6599	rtnl_unregister_all(PF_INET6);
6600	unregister_pernet_subsys(&ip6_route_net_late_ops);
6601fib6_rules_init:
6602	fib6_rules_cleanup();
6603xfrm6_init:
6604	xfrm6_fini();
6605out_fib6_init:
6606	fib6_gc_cleanup();
6607out_register_subsys:
6608	unregister_pernet_subsys(&ip6_route_net_ops);
6609out_register_inetpeer:
6610	unregister_pernet_subsys(&ipv6_inetpeer_ops);
6611out_dst_entries:
6612	dst_entries_destroy(&ip6_dst_blackhole_ops);
6613out_kmem_cache:
6614	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
6615	goto out;
6616}
6617
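/* ip6_route_cleanup() below is the teardown mirror of ip6_route_init():
 * each registration that succeeded above is unwound in roughly reverse
 * order, ending with the dst entry counters and the kmem cache.
 */
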
6618void ip6_route_cleanup(void)
6619{
6620#if IS_BUILTIN(CONFIG_IPV6)
6621#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6622	bpf_iter_unregister();
6623#endif
6624#endif
6625	unregister_netdevice_notifier(&ip6_route_dev_notifier);
6626	unregister_pernet_subsys(&ip6_route_net_late_ops);
6627	fib6_rules_cleanup();
6628	xfrm6_fini();
6629	fib6_gc_cleanup();
6630	unregister_pernet_subsys(&ipv6_inetpeer_ops);
6631	unregister_pernet_subsys(&ip6_route_net_ops);
6632	dst_entries_destroy(&ip6_dst_blackhole_ops);
6633	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
6634}
6635