xref: /kernel/linux/linux-5.10/net/ipv4/route.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
4 *		operating system.  INET is implemented using the  BSD Socket
5 *		interface as the means of communication with the user level.
6 *
7 *		ROUTE - implementation of the IP router.
8 *
9 * Authors:	Ross Biro
10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 *		Alan Cox	:	Verify area fixes.
17 *		Alan Cox	:	cli() protects routing changes
18 *		Rui Oliveira	:	ICMP routing table updates
19 *		(rco@di.uminho.pt)	Routing table insertion and update
20 *		Linus Torvalds	:	Rewrote bits to be sensible
21 *		Alan Cox	:	Added BSD route gw semantics
22 *		Alan Cox	:	Super /proc >4K
23 *		Alan Cox	:	MTU in route table
24 *		Alan Cox	: 	MSS actually. Also added the window
25 *					clamper.
26 *		Sam Lantinga	:	Fixed route matching in rt_del()
27 *		Alan Cox	:	Routing cache support.
28 *		Alan Cox	:	Removed compatibility cruft.
29 *		Alan Cox	:	RTF_REJECT support.
30 *		Alan Cox	:	TCP irtt support.
31 *		Jonathan Naylor	:	Added Metric support.
32 *	Miquel van Smoorenburg	:	BSD API fixes.
33 *	Miquel van Smoorenburg	:	Metrics.
34 *		Alan Cox	:	Use __u32 properly
35 *		Alan Cox	:	Aligned routing errors more closely with BSD
36 *					our system is still very different.
37 *		Alan Cox	:	Faster /proc handling
38 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39 *					routing caches and better behaviour.
40 *
41 *		Olaf Erb	:	irtt wasn't being copied right.
42 *		Bjorn Ekwall	:	Kerneld route support.
43 *		Alan Cox	:	Multicast fixed (I hope)
44 * 		Pavel Krauz	:	Limited broadcast fixed
45 *		Mike McLagan	:	Routing by source
46 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47 *					route.c and rewritten from scratch.
48 *		Andi Kleen	:	Load-limit warning messages.
49 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53 *		Marc Boucher	:	routing by fwmark
54 *	Robert Olsson		:	Added rt_cache statistics
55 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59 */
60
61#define pr_fmt(fmt) "IPv4: " fmt
62
63#include <linux/module.h>
64#include <linux/uaccess.h>
65#include <linux/bitops.h>
66#include <linux/types.h>
67#include <linux/kernel.h>
68#include <linux/mm.h>
69#include <linux/memblock.h>
70#include <linux/string.h>
71#include <linux/socket.h>
72#include <linux/sockios.h>
73#include <linux/errno.h>
74#include <linux/in.h>
75#include <linux/inet.h>
76#include <linux/netdevice.h>
77#include <linux/proc_fs.h>
78#include <linux/init.h>
79#include <linux/skbuff.h>
80#include <linux/inetdevice.h>
81#include <linux/igmp.h>
82#include <linux/pkt_sched.h>
83#include <linux/mroute.h>
84#include <linux/netfilter_ipv4.h>
85#include <linux/random.h>
86#include <linux/rcupdate.h>
87#include <linux/times.h>
88#include <linux/slab.h>
89#include <linux/jhash.h>
90#include <net/dst.h>
91#include <net/dst_metadata.h>
92#include <net/net_namespace.h>
93#include <net/protocol.h>
94#include <net/ip.h>
95#include <net/route.h>
96#include <net/inetpeer.h>
97#include <net/sock.h>
98#include <net/ip_fib.h>
99#include <net/nexthop.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
104#include <net/lwtunnel.h>
105#include <net/netevent.h>
106#include <net/rtnetlink.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110#include <net/secure_seq.h>
111#include <net/ip_tunnels.h>
112#include <net/l3mdev.h>
113
114#include "fib_lookup.h"
115
116#define RT_FL_TOS(oldflp4) \
117	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119#define RT_GC_TIMEOUT (300*HZ)
120
121static int ip_rt_max_size;
122static int ip_rt_redirect_number __read_mostly	= 9;
123static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly	= HZ;
126static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
128static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
129static int ip_rt_min_advmss __read_mostly	= 256;
130
131static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
132
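/* Editorial note (not part of the original source): these tunables are
 * exposed as sysctls under /proc/sys/net/ipv4/route/ (min_pmtu,
 * mtu_expires, redirect_number, redirect_load, error_cost, ...) via the
 * table registered later in this file when CONFIG_SYSCTL is enabled.
 * A minimal usage sketch, assuming the usual procfs paths:
 *
 *	# sysctl net.ipv4.route.mtu_expires=300
 *	# echo 552 > /proc/sys/net/ipv4/route/min_pmtu
 */
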
133/*
134 *	Interface to generic destination cache.
135 */
136
137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140static void		ipv4_negative_advice(struct sock *sk,
141					     struct dst_entry *dst);
142static void		 ipv4_link_failure(struct sk_buff *skb);
143static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
144					   struct sk_buff *skb, u32 mtu,
145					   bool confirm_neigh);
146static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147					struct sk_buff *skb);
148static void		ipv4_dst_destroy(struct dst_entry *dst);
149
150static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151{
152	WARN_ON(1);
153	return NULL;
154}
155
156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157					   struct sk_buff *skb,
158					   const void *daddr);
159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160
161static struct dst_ops ipv4_dst_ops = {
162	.family =		AF_INET,
163	.check =		ipv4_dst_check,
164	.default_advmss =	ipv4_default_advmss,
165	.mtu =			ipv4_mtu,
166	.cow_metrics =		ipv4_cow_metrics,
167	.destroy =		ipv4_dst_destroy,
168	.negative_advice =	ipv4_negative_advice,
169	.link_failure =		ipv4_link_failure,
170	.update_pmtu =		ip_rt_update_pmtu,
171	.redirect =		ip_do_redirect,
172	.local_out =		__ip_local_out,
173	.neigh_lookup =		ipv4_neigh_lookup,
174	.confirm_neigh =	ipv4_confirm_neigh,
175};
176
177#define ECN_OR_COST(class)	TC_PRIO_##class
178
179const __u8 ip_tos2prio[16] = {
180	TC_PRIO_BESTEFFORT,
181	ECN_OR_COST(BESTEFFORT),
182	TC_PRIO_BESTEFFORT,
183	ECN_OR_COST(BESTEFFORT),
184	TC_PRIO_BULK,
185	ECN_OR_COST(BULK),
186	TC_PRIO_BULK,
187	ECN_OR_COST(BULK),
188	TC_PRIO_INTERACTIVE,
189	ECN_OR_COST(INTERACTIVE),
190	TC_PRIO_INTERACTIVE,
191	ECN_OR_COST(INTERACTIVE),
192	TC_PRIO_INTERACTIVE_BULK,
193	ECN_OR_COST(INTERACTIVE_BULK),
194	TC_PRIO_INTERACTIVE_BULK,
195	ECN_OR_COST(INTERACTIVE_BULK)
196};
197EXPORT_SYMBOL(ip_tos2prio);
198
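/* Editorial sketch (not part of the original source): this table is
 * consumed through rt_tos2priority() in include/net/route.h, which maps
 * the four RFC 1349 TOS bits to a queueing priority:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. IPTOS_LOWDELAY (0x10) indexes entry 8 -> TC_PRIO_INTERACTIVE and
 * IPTOS_THROUGHPUT (0x08) indexes entry 4 -> TC_PRIO_BULK.
 */
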
199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201
202#ifdef CONFIG_PROC_FS
203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204{
205	if (*pos)
206		return NULL;
207	return SEQ_START_TOKEN;
208}
209
210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211{
212	++*pos;
213	return NULL;
214}
215
216static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217{
218}
219
220static int rt_cache_seq_show(struct seq_file *seq, void *v)
221{
222	if (v == SEQ_START_TOKEN)
223		seq_printf(seq, "%-127s\n",
224			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226			   "HHUptod\tSpecDst");
227	return 0;
228}
229
230static const struct seq_operations rt_cache_seq_ops = {
231	.start  = rt_cache_seq_start,
232	.next   = rt_cache_seq_next,
233	.stop   = rt_cache_seq_stop,
234	.show   = rt_cache_seq_show,
235};
236
237static int rt_cache_seq_open(struct inode *inode, struct file *file)
238{
239	return seq_open(file, &rt_cache_seq_ops);
240}
241
242static const struct proc_ops rt_cache_proc_ops = {
243	.proc_open	= rt_cache_seq_open,
244	.proc_read	= seq_read,
245	.proc_lseek	= seq_lseek,
246	.proc_release	= seq_release,
247};
248
249
250static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251{
252	int cpu;
253
254	if (*pos == 0)
255		return SEQ_START_TOKEN;
256
257	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258		if (!cpu_possible(cpu))
259			continue;
260		*pos = cpu+1;
261		return &per_cpu(rt_cache_stat, cpu);
262	}
263	return NULL;
264}
265
266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267{
268	int cpu;
269
270	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271		if (!cpu_possible(cpu))
272			continue;
273		*pos = cpu+1;
274		return &per_cpu(rt_cache_stat, cpu);
275	}
276	(*pos)++;
277	return NULL;
278
279}
280
281static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282{
283
284}
285
286static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287{
288	struct rt_cache_stat *st = v;
289
290	if (v == SEQ_START_TOKEN) {
291		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292		return 0;
293	}
294
295	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
296		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297		   dst_entries_get_slow(&ipv4_dst_ops),
298		   0, /* st->in_hit */
299		   st->in_slow_tot,
300		   st->in_slow_mc,
301		   st->in_no_route,
302		   st->in_brd,
303		   st->in_martian_dst,
304		   st->in_martian_src,
305
306		   0, /* st->out_hit */
307		   st->out_slow_tot,
308		   st->out_slow_mc,
309
310		   0, /* st->gc_total */
311		   0, /* st->gc_ignored */
312		   0, /* st->gc_goal_miss */
313		   0, /* st->gc_dst_overflow */
314		   0, /* st->in_hlist_search */
315		   0  /* st->out_hlist_search */
316		);
317	return 0;
318}
319
320static const struct seq_operations rt_cpu_seq_ops = {
321	.start  = rt_cpu_seq_start,
322	.next   = rt_cpu_seq_next,
323	.stop   = rt_cpu_seq_stop,
324	.show   = rt_cpu_seq_show,
325};
326
327
328static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329{
330	return seq_open(file, &rt_cpu_seq_ops);
331}
332
333static const struct proc_ops rt_cpu_proc_ops = {
334	.proc_open	= rt_cpu_seq_open,
335	.proc_read	= seq_read,
336	.proc_lseek	= seq_lseek,
337	.proc_release	= seq_release,
338};
339
340#ifdef CONFIG_IP_ROUTE_CLASSID
341static int rt_acct_proc_show(struct seq_file *m, void *v)
342{
343	struct ip_rt_acct *dst, *src;
344	unsigned int i, j;
345
346	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347	if (!dst)
348		return -ENOMEM;
349
350	for_each_possible_cpu(i) {
351		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352		for (j = 0; j < 256; j++) {
353			dst[j].o_bytes   += src[j].o_bytes;
354			dst[j].o_packets += src[j].o_packets;
355			dst[j].i_bytes   += src[j].i_bytes;
356			dst[j].i_packets += src[j].i_packets;
357		}
358	}
359
360	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361	kfree(dst);
362	return 0;
363}
364#endif
365
366static int __net_init ip_rt_do_proc_init(struct net *net)
367{
368	struct proc_dir_entry *pde;
369
370	pde = proc_create("rt_cache", 0444, net->proc_net,
371			  &rt_cache_proc_ops);
372	if (!pde)
373		goto err1;
374
375	pde = proc_create("rt_cache", 0444,
376			  net->proc_net_stat, &rt_cpu_proc_ops);
377	if (!pde)
378		goto err2;
379
380#ifdef CONFIG_IP_ROUTE_CLASSID
381	pde = proc_create_single("rt_acct", 0, net->proc_net,
382			rt_acct_proc_show);
383	if (!pde)
384		goto err3;
385#endif
386	return 0;
387
388#ifdef CONFIG_IP_ROUTE_CLASSID
389err3:
390	remove_proc_entry("rt_cache", net->proc_net_stat);
391#endif
392err2:
393	remove_proc_entry("rt_cache", net->proc_net);
394err1:
395	return -ENOMEM;
396}
397
398static void __net_exit ip_rt_do_proc_exit(struct net *net)
399{
400	remove_proc_entry("rt_cache", net->proc_net_stat);
401	remove_proc_entry("rt_cache", net->proc_net);
402#ifdef CONFIG_IP_ROUTE_CLASSID
403	remove_proc_entry("rt_acct", net->proc_net);
404#endif
405}
406
407static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
408	.init = ip_rt_do_proc_init,
409	.exit = ip_rt_do_proc_exit,
410};
411
412static int __init ip_rt_proc_init(void)
413{
414	return register_pernet_subsys(&ip_rt_proc_ops);
415}
416
417#else
418static inline int ip_rt_proc_init(void)
419{
420	return 0;
421}
422#endif /* CONFIG_PROC_FS */
423
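/* Editorial note (illustrative, not part of the original source): with
 * CONFIG_PROC_FS the entries registered above appear as
 *
 *	/proc/net/rt_cache	- header line only (the cache itself is gone)
 *	/proc/net/stat/rt_cache	- one hex row of counters per possible CPU
 *	/proc/net/rt_acct	- per-class traffic accounting
 *				  (CONFIG_IP_ROUTE_CLASSID only)
 *
 * e.g. "cat /proc/net/stat/rt_cache" dumps the rt_cache_stat fields
 * printed by rt_cpu_seq_show().
 */
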
424static inline bool rt_is_expired(const struct rtable *rth)
425{
426	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
427}
428
429void rt_cache_flush(struct net *net)
430{
431	rt_genid_bump_ipv4(net);
432}
433
434static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
435					   struct sk_buff *skb,
436					   const void *daddr)
437{
438	const struct rtable *rt = container_of(dst, struct rtable, dst);
439	struct net_device *dev = dst->dev;
440	struct neighbour *n;
441
442	rcu_read_lock_bh();
443
444	if (likely(rt->rt_gw_family == AF_INET)) {
445		n = ip_neigh_gw4(dev, rt->rt_gw4);
446	} else if (rt->rt_gw_family == AF_INET6) {
447		n = ip_neigh_gw6(dev, &rt->rt_gw6);
448	} else {
449		__be32 pkey;
450
451		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
452		n = ip_neigh_gw4(dev, pkey);
453	}
454
455	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
456		n = NULL;
457
458	rcu_read_unlock_bh();
459
460	return n;
461}
462
463static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
464{
465	const struct rtable *rt = container_of(dst, struct rtable, dst);
466	struct net_device *dev = dst->dev;
467	const __be32 *pkey = daddr;
468
469	if (rt->rt_gw_family == AF_INET) {
470		pkey = (const __be32 *)&rt->rt_gw4;
471	} else if (rt->rt_gw_family == AF_INET6) {
472		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
473	} else if (!daddr ||
474		 (rt->rt_flags &
475		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
476		return;
477	}
478	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
479}
480
481/* Hash tables of size 2048..262144 depending on RAM size.
482 * Each bucket uses 8 bytes.
483 */
484static u32 ip_idents_mask __read_mostly;
485static atomic_t *ip_idents __read_mostly;
486static u32 *ip_tstamps __read_mostly;
487
488/* In order to protect privacy, we add a perturbation to identifiers
489 * if one generator is seldom used. This makes it hard for an attacker
490 * to infer how many packets were sent between two points in time.
491 */
492u32 ip_idents_reserve(u32 hash, int segs)
493{
494	u32 bucket, old, now = (u32)jiffies;
495	atomic_t *p_id;
496	u32 *p_tstamp;
497	u32 delta = 0;
498
499	bucket = hash & ip_idents_mask;
500	p_tstamp = ip_tstamps + bucket;
501	p_id = ip_idents + bucket;
502	old = READ_ONCE(*p_tstamp);
503
504	if (old != now && cmpxchg(p_tstamp, old, now) == old)
505		delta = prandom_u32_max(now - old);
506
507	/* If UBSAN reports an error here, please make sure your compiler
508	 * supports -fno-strict-overflow before reporting it: the report was
509	 * a bug in UBSAN, and it has been fixed in GCC 8.
510	 */
511	return atomic_add_return(segs + delta, p_id) - segs;
512}
513EXPORT_SYMBOL(ip_idents_reserve);
514
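/* Editorial sketch (not part of the original source): the random delta is
 * only injected when a bucket has been idle.  If an observer samples a
 * host's IP ID before and after such an idle gap,
 *
 *	id_before = ip_idents_reserve(hash, 1);
 *	// ... target emits an unknown number of packets, bucket goes idle ...
 *	id_after = ip_idents_reserve(hash, 1);
 *
 * then id_after - id_before is the packet count plus a random value up to
 * the idle time in jiffies, so the count cannot be recovered from the IDs.
 */
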
515void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
516{
517	u32 hash, id;
518
519	/* Note the following code is not safe, but this is okay. */
520	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
521		get_random_bytes(&net->ipv4.ip_id_key,
522				 sizeof(net->ipv4.ip_id_key));
523
524	hash = siphash_3u32((__force u32)iph->daddr,
525			    (__force u32)iph->saddr,
526			    iph->protocol,
527			    &net->ipv4.ip_id_key);
528	id = ip_idents_reserve(hash, segs);
529	iph->id = htons(id);
530}
531EXPORT_SYMBOL(__ip_select_ident);
532
533static void ip_rt_fix_tos(struct flowi4 *fl4)
534{
535	__u8 tos = RT_FL_TOS(fl4);
536
537	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
538	fl4->flowi4_scope = tos & RTO_ONLINK ?
539			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
540}
541
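/* Editorial note (not part of the original source): sockets with
 * SO_DONTROUTE set end up with RTO_ONLINK ORed into the TOS via
 * RT_CONN_FLAGS() (see __build_flow_key() below); ip_rt_fix_tos() then
 * folds that bit into the flow scope and keeps only the IPTOS_RT_MASK
 * bits:
 *
 *	fl4.flowi4_tos = RT_TOS(tos) | RTO_ONLINK;
 *	ip_rt_fix_tos(&fl4);	// scope = RT_SCOPE_LINK, ONLINK bit dropped
 */
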
542static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
543			     const struct sock *sk,
544			     const struct iphdr *iph,
545			     int oif, u8 tos,
546			     u8 prot, u32 mark, int flow_flags)
547{
548	if (sk) {
549		const struct inet_sock *inet = inet_sk(sk);
550
551		oif = sk->sk_bound_dev_if;
552		mark = sk->sk_mark;
553		tos = RT_CONN_FLAGS(sk);
554		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
555	}
556	flowi4_init_output(fl4, oif, mark, tos,
557			   RT_SCOPE_UNIVERSE, prot,
558			   flow_flags,
559			   iph->daddr, iph->saddr, 0, 0,
560			   sock_net_uid(net, sk));
561}
562
563static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
564			       const struct sock *sk)
565{
566	const struct net *net = dev_net(skb->dev);
567	const struct iphdr *iph = ip_hdr(skb);
568	int oif = skb->dev->ifindex;
569	u8 tos = RT_TOS(iph->tos);
570	u8 prot = iph->protocol;
571	u32 mark = skb->mark;
572
573	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
574}
575
576static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
577{
578	const struct inet_sock *inet = inet_sk(sk);
579	const struct ip_options_rcu *inet_opt;
580	__be32 daddr = inet->inet_daddr;
581
582	rcu_read_lock();
583	inet_opt = rcu_dereference(inet->inet_opt);
584	if (inet_opt && inet_opt->opt.srr)
585		daddr = inet_opt->opt.faddr;
586	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
587			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
588			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
589			   inet_sk_flowi_flags(sk),
590			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
591	rcu_read_unlock();
592}
593
594static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
595				 const struct sk_buff *skb)
596{
597	if (skb)
598		build_skb_flow_key(fl4, skb, sk);
599	else
600		build_sk_flow_key(fl4, sk);
601}
602
603static DEFINE_SPINLOCK(fnhe_lock);
604
605static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
606{
607	struct rtable *rt;
608
609	rt = rcu_dereference(fnhe->fnhe_rth_input);
610	if (rt) {
611		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
612		dst_dev_put(&rt->dst);
613		dst_release(&rt->dst);
614	}
615	rt = rcu_dereference(fnhe->fnhe_rth_output);
616	if (rt) {
617		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
618		dst_dev_put(&rt->dst);
619		dst_release(&rt->dst);
620	}
621}
622
623static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
624{
625	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
626	struct fib_nh_exception *fnhe, *oldest = NULL;
627
628	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
629		fnhe = rcu_dereference_protected(*fnhe_p,
630						 lockdep_is_held(&fnhe_lock));
631		if (!fnhe)
632			break;
633		if (!oldest ||
634		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
635			oldest = fnhe;
636			oldest_p = fnhe_p;
637		}
638	}
639	fnhe_flush_routes(oldest);
640	*oldest_p = oldest->fnhe_next;
641	kfree_rcu(oldest, rcu);
642}
643
644static u32 fnhe_hashfun(__be32 daddr)
645{
646	static siphash_key_t fnhe_hash_key __read_mostly;
647	u64 hval;
648
649	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
650	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
651	return hash_64(hval, FNHE_HASH_SHIFT);
652}
653
654static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
655{
656	rt->rt_pmtu = fnhe->fnhe_pmtu;
657	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
658	rt->dst.expires = fnhe->fnhe_expires;
659
660	if (fnhe->fnhe_gw) {
661		rt->rt_flags |= RTCF_REDIRECTED;
662		rt->rt_uses_gateway = 1;
663		rt->rt_gw_family = AF_INET;
664		rt->rt_gw4 = fnhe->fnhe_gw;
665	}
666}
667
668static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
669				  __be32 gw, u32 pmtu, bool lock,
670				  unsigned long expires)
671{
672	struct fnhe_hash_bucket *hash;
673	struct fib_nh_exception *fnhe;
674	struct rtable *rt;
675	u32 genid, hval;
676	unsigned int i;
677	int depth;
678
679	genid = fnhe_genid(dev_net(nhc->nhc_dev));
680	hval = fnhe_hashfun(daddr);
681
682	spin_lock_bh(&fnhe_lock);
683
684	hash = rcu_dereference(nhc->nhc_exceptions);
685	if (!hash) {
686		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
687		if (!hash)
688			goto out_unlock;
689		rcu_assign_pointer(nhc->nhc_exceptions, hash);
690	}
691
692	hash += hval;
693
694	depth = 0;
695	for (fnhe = rcu_dereference(hash->chain); fnhe;
696	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
697		if (fnhe->fnhe_daddr == daddr)
698			break;
699		depth++;
700	}
701
702	if (fnhe) {
703		if (fnhe->fnhe_genid != genid)
704			fnhe->fnhe_genid = genid;
705		if (gw)
706			fnhe->fnhe_gw = gw;
707		if (pmtu) {
708			fnhe->fnhe_pmtu = pmtu;
709			fnhe->fnhe_mtu_locked = lock;
710		}
711		fnhe->fnhe_expires = max(1UL, expires);
712		/* Update all cached dsts too */
713		rt = rcu_dereference(fnhe->fnhe_rth_input);
714		if (rt)
715			fill_route_from_fnhe(rt, fnhe);
716		rt = rcu_dereference(fnhe->fnhe_rth_output);
717		if (rt)
718			fill_route_from_fnhe(rt, fnhe);
719	} else {
720		/* Randomize max depth to avoid some side channel attacks. */
721		int max_depth = FNHE_RECLAIM_DEPTH +
722				prandom_u32_max(FNHE_RECLAIM_DEPTH);
723
724		while (depth > max_depth) {
725			fnhe_remove_oldest(hash);
726			depth--;
727		}
728
729		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
730		if (!fnhe)
731			goto out_unlock;
732
733		fnhe->fnhe_next = hash->chain;
734
735		fnhe->fnhe_genid = genid;
736		fnhe->fnhe_daddr = daddr;
737		fnhe->fnhe_gw = gw;
738		fnhe->fnhe_pmtu = pmtu;
739		fnhe->fnhe_mtu_locked = lock;
740		fnhe->fnhe_expires = max(1UL, expires);
741
742		rcu_assign_pointer(hash->chain, fnhe);
743
744		/* Exception created; mark the cached routes for the nexthop
745		 * stale, so anyone caching it rechecks if this exception
746		 * applies to them.
747		 */
748		rt = rcu_dereference(nhc->nhc_rth_input);
749		if (rt)
750			rt->dst.obsolete = DST_OBSOLETE_KILL;
751
752		for_each_possible_cpu(i) {
753			struct rtable __rcu **prt;
754			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
755			rt = rcu_dereference(*prt);
756			if (rt)
757				rt->dst.obsolete = DST_OBSOLETE_KILL;
758		}
759	}
760
761	fnhe->fnhe_stamp = jiffies;
762
763out_unlock:
764	spin_unlock_bh(&fnhe_lock);
765}
766
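/* Editorial note (not part of the original source): the two callers of
 * update_or_create_fnhe() in this file are __ip_do_redirect(), which
 * records a learned gateway, and __ip_rt_update_pmtu(), which records a
 * learned path MTU:
 *
 *	update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false,
 *			      jiffies + ip_rt_gc_timeout);	// redirect
 *	update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
 *			      jiffies + ip_rt_mtu_expires);	// PMTU learn
 */
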
767static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
768			     bool kill_route)
769{
770	__be32 new_gw = icmp_hdr(skb)->un.gateway;
771	__be32 old_gw = ip_hdr(skb)->saddr;
772	struct net_device *dev = skb->dev;
773	struct in_device *in_dev;
774	struct fib_result res;
775	struct neighbour *n;
776	struct net *net;
777
778	switch (icmp_hdr(skb)->code & 7) {
779	case ICMP_REDIR_NET:
780	case ICMP_REDIR_NETTOS:
781	case ICMP_REDIR_HOST:
782	case ICMP_REDIR_HOSTTOS:
783		break;
784
785	default:
786		return;
787	}
788
789	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
790		return;
791
792	in_dev = __in_dev_get_rcu(dev);
793	if (!in_dev)
794		return;
795
796	net = dev_net(dev);
797	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
798	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
799	    ipv4_is_zeronet(new_gw))
800		goto reject_redirect;
801
802	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
803		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
804			goto reject_redirect;
805		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
806			goto reject_redirect;
807	} else {
808		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
809			goto reject_redirect;
810	}
811
812	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
813	if (!n)
814		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
815	if (!IS_ERR(n)) {
816		if (!(n->nud_state & NUD_VALID)) {
817			neigh_event_send(n, NULL);
818		} else {
819			if (fib_lookup(net, fl4, &res, 0) == 0) {
820				struct fib_nh_common *nhc;
821
822				fib_select_path(net, &res, fl4, skb);
823				nhc = FIB_RES_NHC(res);
824				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
825						0, false,
826						jiffies + ip_rt_gc_timeout);
827			}
828			if (kill_route)
829				rt->dst.obsolete = DST_OBSOLETE_KILL;
830			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
831		}
832		neigh_release(n);
833	}
834	return;
835
836reject_redirect:
837#ifdef CONFIG_IP_ROUTE_VERBOSE
838	if (IN_DEV_LOG_MARTIANS(in_dev)) {
839		const struct iphdr *iph = (const struct iphdr *) skb->data;
840		__be32 daddr = iph->daddr;
841		__be32 saddr = iph->saddr;
842
843		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
844				     "  Advised path = %pI4 -> %pI4\n",
845				     &old_gw, dev->name, &new_gw,
846				     &saddr, &daddr);
847	}
848#endif
849	;
850}
851
852static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
853{
854	struct rtable *rt;
855	struct flowi4 fl4;
856	const struct iphdr *iph = (const struct iphdr *) skb->data;
857	struct net *net = dev_net(skb->dev);
858	int oif = skb->dev->ifindex;
859	u8 tos = RT_TOS(iph->tos);
860	u8 prot = iph->protocol;
861	u32 mark = skb->mark;
862
863	rt = (struct rtable *) dst;
864
865	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
866	ip_rt_fix_tos(&fl4);
867	__ip_do_redirect(rt, skb, &fl4, true);
868}
869
870static void ipv4_negative_advice(struct sock *sk,
871				 struct dst_entry *dst)
872{
873	struct rtable *rt = (struct rtable *)dst;
874
875	if ((dst->obsolete > 0) ||
876	    (rt->rt_flags & RTCF_REDIRECTED) ||
877	    rt->dst.expires)
878		sk_dst_reset(sk);
879}
880
881/*
882 * Algorithm:
883 *	1. The first ip_rt_redirect_number redirects are sent
884 *	   with exponential backoff, then we stop sending them at all,
885 *	   assuming that the host ignores our redirects.
886 *	2. If we did not see packets requiring redirects
887 *	   during ip_rt_redirect_silence, we assume that the host
888 *	   has forgotten the redirected route and we start sending redirects again.
889 *
890 * This algorithm is much cheaper and more intelligent than dumb load limiting
891 * in icmp.c.
892 *
893 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
894 * and "frag. need" (breaks PMTU discovery) in icmp.c.
895 */
896
897void ip_rt_send_redirect(struct sk_buff *skb)
898{
899	struct rtable *rt = skb_rtable(skb);
900	struct in_device *in_dev;
901	struct inet_peer *peer;
902	struct net *net;
903	int log_martians;
904	int vif;
905
906	rcu_read_lock();
907	in_dev = __in_dev_get_rcu(rt->dst.dev);
908	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
909		rcu_read_unlock();
910		return;
911	}
912	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
913	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
914	rcu_read_unlock();
915
916	net = dev_net(rt->dst.dev);
917	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
918	if (!peer) {
919		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
920			  rt_nexthop(rt, ip_hdr(skb)->daddr));
921		return;
922	}
923
924	/* No redirected packets during ip_rt_redirect_silence;
925	 * reset the algorithm.
926	 */
927	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
928		peer->rate_tokens = 0;
929		peer->n_redirects = 0;
930	}
931
932	/* Too many ignored redirects; do not send anything and
933	 * set peer->rate_last to the last seen redirected packet.
934	 */
935	if (peer->n_redirects >= ip_rt_redirect_number) {
936		peer->rate_last = jiffies;
937		goto out_put_peer;
938	}
939
940	/* Check for load limit; set rate_last to the latest sent
941	 * redirect.
942	 */
943	if (peer->n_redirects == 0 ||
944	    time_after(jiffies,
945		       (peer->rate_last +
946			(ip_rt_redirect_load << peer->n_redirects)))) {
947		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
948
949		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
950		peer->rate_last = jiffies;
951		++peer->n_redirects;
952#ifdef CONFIG_IP_ROUTE_VERBOSE
953		if (log_martians &&
954		    peer->n_redirects == ip_rt_redirect_number)
955			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
956					     &ip_hdr(skb)->saddr, inet_iif(skb),
957					     &ip_hdr(skb)->daddr, &gw);
958#endif
959	}
960out_put_peer:
961	inet_putpeer(peer);
962}
963
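/* Editorial example (not part of the original source), using the defaults
 * above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9,
 * ip_rt_redirect_silence = (HZ/50) << 10, i.e. roughly 20 seconds):
 * the next redirect to a peer is only sent once
 *
 *	time_after(jiffies, rate_last + (ip_rt_redirect_load << n_redirects))
 *
 * holds, so the minimum gap doubles each time (40 ms, 80 ms, 160 ms, ...
 * at HZ=1000).  After 9 seemingly ignored redirects we stop entirely
 * until ~20 s without triggering packets resets the peer's counters.
 */
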
964static int ip_error(struct sk_buff *skb)
965{
966	struct rtable *rt = skb_rtable(skb);
967	struct net_device *dev = skb->dev;
968	struct in_device *in_dev;
969	struct inet_peer *peer;
970	unsigned long now;
971	struct net *net;
972	bool send;
973	int code;
974
975	if (netif_is_l3_master(skb->dev)) {
976		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
977		if (!dev)
978			goto out;
979	}
980
981	in_dev = __in_dev_get_rcu(dev);
982
983	/* IP on this device is disabled. */
984	if (!in_dev)
985		goto out;
986
987	net = dev_net(rt->dst.dev);
988	if (!IN_DEV_FORWARD(in_dev)) {
989		switch (rt->dst.error) {
990		case EHOSTUNREACH:
991			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
992			break;
993
994		case ENETUNREACH:
995			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
996			break;
997		}
998		goto out;
999	}
1000
1001	switch (rt->dst.error) {
1002	case EINVAL:
1003	default:
1004		goto out;
1005	case EHOSTUNREACH:
1006		code = ICMP_HOST_UNREACH;
1007		break;
1008	case ENETUNREACH:
1009		code = ICMP_NET_UNREACH;
1010		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1011		break;
1012	case EACCES:
1013		code = ICMP_PKT_FILTERED;
1014		break;
1015	}
1016
1017	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1018			       l3mdev_master_ifindex(skb->dev), 1);
1019
1020	send = true;
1021	if (peer) {
1022		now = jiffies;
1023		peer->rate_tokens += now - peer->rate_last;
1024		if (peer->rate_tokens > ip_rt_error_burst)
1025			peer->rate_tokens = ip_rt_error_burst;
1026		peer->rate_last = now;
1027		if (peer->rate_tokens >= ip_rt_error_cost)
1028			peer->rate_tokens -= ip_rt_error_cost;
1029		else
1030			send = false;
1031		inet_putpeer(peer);
1032	}
1033	if (send)
1034		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1035
1036out:	kfree_skb(skb);
1037	return 0;
1038}
1039
1040static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1041{
1042	struct dst_entry *dst = &rt->dst;
1043	struct net *net = dev_net(dst->dev);
1044	struct fib_result res;
1045	bool lock = false;
1046	u32 old_mtu;
1047
1048	if (ip_mtu_locked(dst))
1049		return;
1050
1051	old_mtu = ipv4_mtu(dst);
1052	if (old_mtu < mtu)
1053		return;
1054
1055	if (mtu < ip_rt_min_pmtu) {
1056		lock = true;
1057		mtu = min(old_mtu, ip_rt_min_pmtu);
1058	}
1059
1060	if (rt->rt_pmtu == mtu && !lock &&
1061	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1062		return;
1063
1064	rcu_read_lock();
1065	if (fib_lookup(net, fl4, &res, 0) == 0) {
1066		struct fib_nh_common *nhc;
1067
1068		fib_select_path(net, &res, fl4, NULL);
1069		nhc = FIB_RES_NHC(res);
1070		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1071				      jiffies + ip_rt_mtu_expires);
1072	}
1073	rcu_read_unlock();
1074}
1075
1076static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1077			      struct sk_buff *skb, u32 mtu,
1078			      bool confirm_neigh)
1079{
1080	struct rtable *rt = (struct rtable *) dst;
1081	struct flowi4 fl4;
1082
1083	ip_rt_build_flow_key(&fl4, sk, skb);
1084	ip_rt_fix_tos(&fl4);
1085
1086	/* Don't make lookup fail for bridged encapsulations */
1087	if (skb && netif_is_any_bridge_port(skb->dev))
1088		fl4.flowi4_oif = 0;
1089
1090	__ip_rt_update_pmtu(rt, &fl4, mtu);
1091}
1092
1093void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1094		      int oif, u8 protocol)
1095{
1096	const struct iphdr *iph = (const struct iphdr *)skb->data;
1097	struct flowi4 fl4;
1098	struct rtable *rt;
1099	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1100
1101	__build_flow_key(net, &fl4, NULL, iph, oif,
1102			 RT_TOS(iph->tos), protocol, mark, 0);
1103	rt = __ip_route_output_key(net, &fl4);
1104	if (!IS_ERR(rt)) {
1105		__ip_rt_update_pmtu(rt, &fl4, mtu);
1106		ip_rt_put(rt);
1107	}
1108}
1109EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1110
1111static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1112{
1113	const struct iphdr *iph = (const struct iphdr *)skb->data;
1114	struct flowi4 fl4;
1115	struct rtable *rt;
1116
1117	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1118
1119	if (!fl4.flowi4_mark)
1120		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1121
1122	rt = __ip_route_output_key(sock_net(sk), &fl4);
1123	if (!IS_ERR(rt)) {
1124		__ip_rt_update_pmtu(rt, &fl4, mtu);
1125		ip_rt_put(rt);
1126	}
1127}
1128
1129void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1130{
1131	const struct iphdr *iph = (const struct iphdr *)skb->data;
1132	struct flowi4 fl4;
1133	struct rtable *rt;
1134	struct dst_entry *odst = NULL;
1135	bool new = false;
1136	struct net *net = sock_net(sk);
1137
1138	bh_lock_sock(sk);
1139
1140	if (!ip_sk_accept_pmtu(sk))
1141		goto out;
1142
1143	odst = sk_dst_get(sk);
1144
1145	if (sock_owned_by_user(sk) || !odst) {
1146		__ipv4_sk_update_pmtu(skb, sk, mtu);
1147		goto out;
1148	}
1149
1150	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1151
1152	rt = (struct rtable *)odst;
1153	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1154		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1155		if (IS_ERR(rt))
1156			goto out;
1157
1158		new = true;
1159	} else {
1160		ip_rt_fix_tos(&fl4);
1161	}
1162
1163	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1164
1165	if (!dst_check(&rt->dst, 0)) {
1166		if (new)
1167			dst_release(&rt->dst);
1168
1169		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1170		if (IS_ERR(rt))
1171			goto out;
1172
1173		new = true;
1174	}
1175
1176	if (new)
1177		sk_dst_set(sk, &rt->dst);
1178
1179out:
1180	bh_unlock_sock(sk);
1181	dst_release(odst);
1182}
1183EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1184
1185void ipv4_redirect(struct sk_buff *skb, struct net *net,
1186		   int oif, u8 protocol)
1187{
1188	const struct iphdr *iph = (const struct iphdr *)skb->data;
1189	struct flowi4 fl4;
1190	struct rtable *rt;
1191
1192	__build_flow_key(net, &fl4, NULL, iph, oif,
1193			 RT_TOS(iph->tos), protocol, 0, 0);
1194	rt = __ip_route_output_key(net, &fl4);
1195	if (!IS_ERR(rt)) {
1196		__ip_do_redirect(rt, skb, &fl4, false);
1197		ip_rt_put(rt);
1198	}
1199}
1200EXPORT_SYMBOL_GPL(ipv4_redirect);
1201
1202void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1203{
1204	const struct iphdr *iph = (const struct iphdr *)skb->data;
1205	struct flowi4 fl4;
1206	struct rtable *rt;
1207	struct net *net = sock_net(sk);
1208
1209	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1210	rt = __ip_route_output_key(net, &fl4);
1211	if (!IS_ERR(rt)) {
1212		__ip_do_redirect(rt, skb, &fl4, false);
1213		ip_rt_put(rt);
1214	}
1215}
1216EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1217
1218static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1219{
1220	struct rtable *rt = (struct rtable *) dst;
1221
1222	/* All IPV4 dsts are created with ->obsolete set to the value
1223	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1224	 * into this function always.
1225	 *
1226	 * When a PMTU/redirect information update invalidates a route,
1227	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1228	 * DST_OBSOLETE_DEAD.
1229	 */
1230	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1231		return NULL;
1232	return dst;
1233}
1234
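/* Editorial sketch (not part of the original source): this is how cached
 * routes invalidate themselves.  A consumer revalidates before reuse,
 * exactly as ipv4_sk_update_pmtu() does above:
 *
 *	if (!dst_check(&rt->dst, 0))		// dispatches to ipv4_dst_check()
 *		rt = ip_route_output_flow(net, &fl4, sk);	// look up afresh
 */
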
1235static void ipv4_send_dest_unreach(struct sk_buff *skb)
1236{
1237	struct net_device *dev;
1238	struct ip_options opt;
1239	int res;
1240
1241	/* Recompile ip options since IPCB may not be valid anymore.
1242	 * Also check we have a reasonable ipv4 header.
1243	 */
1244	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1245	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1246		return;
1247
1248	memset(&opt, 0, sizeof(opt));
1249	if (ip_hdr(skb)->ihl > 5) {
1250		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1251			return;
1252		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1253
1254		rcu_read_lock();
1255		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1256		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1257		rcu_read_unlock();
1258
1259		if (res)
1260			return;
1261	}
1262	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1263}
1264
1265static void ipv4_link_failure(struct sk_buff *skb)
1266{
1267	struct rtable *rt;
1268
1269	ipv4_send_dest_unreach(skb);
1270
1271	rt = skb_rtable(skb);
1272	if (rt)
1273		dst_set_expires(&rt->dst, 0);
1274}
1275
1276static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1277{
1278	pr_debug("%s: %pI4 -> %pI4, %s\n",
1279		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1280		 skb->dev ? skb->dev->name : "?");
1281	kfree_skb(skb);
1282	WARN_ON(1);
1283	return 0;
1284}
1285
1286/*
1287   We do not cache the source address of the outgoing interface,
1288   because it is used only by the IP RR, TS and SRR options,
1289   so it stays out of the fast path.
1290
1291   BTW remember: "addr" is allowed to be unaligned
1292   in IP options!
1293 */
1294
1295void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1296{
1297	__be32 src;
1298
1299	if (rt_is_output_route(rt))
1300		src = ip_hdr(skb)->saddr;
1301	else {
1302		struct fib_result res;
1303		struct iphdr *iph = ip_hdr(skb);
1304		struct flowi4 fl4 = {
1305			.daddr = iph->daddr,
1306			.saddr = iph->saddr,
1307			.flowi4_tos = RT_TOS(iph->tos),
1308			.flowi4_oif = rt->dst.dev->ifindex,
1309			.flowi4_iif = skb->dev->ifindex,
1310			.flowi4_mark = skb->mark,
1311		};
1312
1313		rcu_read_lock();
1314		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1315			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1316		else
1317			src = inet_select_addr(rt->dst.dev,
1318					       rt_nexthop(rt, iph->daddr),
1319					       RT_SCOPE_UNIVERSE);
1320		rcu_read_unlock();
1321	}
1322	memcpy(addr, &src, 4);
1323}
1324
1325#ifdef CONFIG_IP_ROUTE_CLASSID
1326static void set_class_tag(struct rtable *rt, u32 tag)
1327{
1328	if (!(rt->dst.tclassid & 0xFFFF))
1329		rt->dst.tclassid |= tag & 0xFFFF;
1330	if (!(rt->dst.tclassid & 0xFFFF0000))
1331		rt->dst.tclassid |= tag & 0xFFFF0000;
1332}
1333#endif
1334
1335static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1336{
1337	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1338	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1339				    ip_rt_min_advmss);
1340
1341	return min(advmss, IPV4_MAX_PMTU - header_size);
1342}
1343
1344static unsigned int ipv4_mtu(const struct dst_entry *dst)
1345{
1346	const struct rtable *rt = (const struct rtable *)dst;
1347	unsigned int mtu = rt->rt_pmtu;
1348
1349	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1350		mtu = dst_metric_raw(dst, RTAX_MTU);
1351
1352	if (mtu)
1353		goto out;
1354
1355	mtu = READ_ONCE(dst->dev->mtu);
1356
1357	if (unlikely(ip_mtu_locked(dst))) {
1358		if (rt->rt_uses_gateway && mtu > 576)
1359			mtu = 576;
1360	}
1361
1362out:
1363	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1364
1365	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1366}
1367
1368static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1369{
1370	struct fnhe_hash_bucket *hash;
1371	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1372	u32 hval = fnhe_hashfun(daddr);
1373
1374	spin_lock_bh(&fnhe_lock);
1375
1376	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1377					 lockdep_is_held(&fnhe_lock));
1378	hash += hval;
1379
1380	fnhe_p = &hash->chain;
1381	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1382	while (fnhe) {
1383		if (fnhe->fnhe_daddr == daddr) {
1384			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1385				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1386			/* set fnhe_daddr to 0 to ensure it won't bind with
1387			 * new dsts in rt_bind_exception().
1388			 */
1389			fnhe->fnhe_daddr = 0;
1390			fnhe_flush_routes(fnhe);
1391			kfree_rcu(fnhe, rcu);
1392			break;
1393		}
1394		fnhe_p = &fnhe->fnhe_next;
1395		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1396						 lockdep_is_held(&fnhe_lock));
1397	}
1398
1399	spin_unlock_bh(&fnhe_lock);
1400}
1401
1402static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1403					       __be32 daddr)
1404{
1405	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1406	struct fib_nh_exception *fnhe;
1407	u32 hval;
1408
1409	if (!hash)
1410		return NULL;
1411
1412	hval = fnhe_hashfun(daddr);
1413
1414	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1415	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1416		if (fnhe->fnhe_daddr == daddr) {
1417			if (fnhe->fnhe_expires &&
1418			    time_after(jiffies, fnhe->fnhe_expires)) {
1419				ip_del_fnhe(nhc, daddr);
1420				break;
1421			}
1422			return fnhe;
1423		}
1424	}
1425	return NULL;
1426}
1427
1428/* MTU selection:
1429 * 1. mtu on route is locked - use it
1430 * 2. mtu from nexthop exception
1431 * 3. mtu from egress device
1432 */
1433
1434u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1435{
1436	struct fib_nh_common *nhc = res->nhc;
1437	struct net_device *dev = nhc->nhc_dev;
1438	struct fib_info *fi = res->fi;
1439	u32 mtu = 0;
1440
1441	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1442	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1443		mtu = fi->fib_mtu;
1444
1445	if (likely(!mtu)) {
1446		struct fib_nh_exception *fnhe;
1447
1448		fnhe = find_exception(nhc, daddr);
1449		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1450			mtu = fnhe->fnhe_pmtu;
1451	}
1452
1453	if (likely(!mtu))
1454		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1455
1456	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1457}
1458
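/* Editorial example (not part of the original source): with the default
 * net.ipv4.ip_forward_use_pmtu = 0, no RTAX_MTU lock and no unexpired
 * exception, the selection above collapses to the egress device, i.e.
 *
 *	mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
 *	mtu -= lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
 *
 * while a learned fnhe_pmtu overrides it only until fnhe_expires.
 */
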
1459static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1460			      __be32 daddr, const bool do_cache)
1461{
1462	bool ret = false;
1463
1464	spin_lock_bh(&fnhe_lock);
1465
1466	if (daddr == fnhe->fnhe_daddr) {
1467		struct rtable __rcu **porig;
1468		struct rtable *orig;
1469		int genid = fnhe_genid(dev_net(rt->dst.dev));
1470
1471		if (rt_is_input_route(rt))
1472			porig = &fnhe->fnhe_rth_input;
1473		else
1474			porig = &fnhe->fnhe_rth_output;
1475		orig = rcu_dereference(*porig);
1476
1477		if (fnhe->fnhe_genid != genid) {
1478			fnhe->fnhe_genid = genid;
1479			fnhe->fnhe_gw = 0;
1480			fnhe->fnhe_pmtu = 0;
1481			fnhe->fnhe_expires = 0;
1482			fnhe->fnhe_mtu_locked = false;
1483			fnhe_flush_routes(fnhe);
1484			orig = NULL;
1485		}
1486		fill_route_from_fnhe(rt, fnhe);
1487		if (!rt->rt_gw4) {
1488			rt->rt_gw4 = daddr;
1489			rt->rt_gw_family = AF_INET;
1490		}
1491
1492		if (do_cache) {
1493			dst_hold(&rt->dst);
1494			rcu_assign_pointer(*porig, rt);
1495			if (orig) {
1496				dst_dev_put(&orig->dst);
1497				dst_release(&orig->dst);
1498			}
1499			ret = true;
1500		}
1501
1502		fnhe->fnhe_stamp = jiffies;
1503	}
1504	spin_unlock_bh(&fnhe_lock);
1505
1506	return ret;
1507}
1508
1509static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1510{
1511	struct rtable *orig, *prev, **p;
1512	bool ret = true;
1513
1514	if (rt_is_input_route(rt)) {
1515		p = (struct rtable **)&nhc->nhc_rth_input;
1516	} else {
1517		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1518	}
1519	orig = *p;
1520
1521	/* hold dst before doing cmpxchg() to avoid race condition
1522	 * on this dst
1523	 */
1524	dst_hold(&rt->dst);
1525	prev = cmpxchg(p, orig, rt);
1526	if (prev == orig) {
1527		if (orig) {
1528			rt_add_uncached_list(orig);
1529			dst_release(&orig->dst);
1530		}
1531	} else {
1532		dst_release(&rt->dst);
1533		ret = false;
1534	}
1535
1536	return ret;
1537}
1538
1539struct uncached_list {
1540	spinlock_t		lock;
1541	struct list_head	head;
1542};
1543
1544static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1545
1546void rt_add_uncached_list(struct rtable *rt)
1547{
1548	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1549
1550	rt->rt_uncached_list = ul;
1551
1552	spin_lock_bh(&ul->lock);
1553	list_add_tail(&rt->rt_uncached, &ul->head);
1554	spin_unlock_bh(&ul->lock);
1555}
1556
1557void rt_del_uncached_list(struct rtable *rt)
1558{
1559	if (!list_empty(&rt->rt_uncached)) {
1560		struct uncached_list *ul = rt->rt_uncached_list;
1561
1562		spin_lock_bh(&ul->lock);
1563		list_del(&rt->rt_uncached);
1564		spin_unlock_bh(&ul->lock);
1565	}
1566}
1567
1568static void ipv4_dst_destroy(struct dst_entry *dst)
1569{
1570	struct rtable *rt = (struct rtable *)dst;
1571
1572	ip_dst_metrics_put(dst);
1573	rt_del_uncached_list(rt);
1574}
1575
1576void rt_flush_dev(struct net_device *dev)
1577{
1578	struct rtable *rt;
1579	int cpu;
1580
1581	for_each_possible_cpu(cpu) {
1582		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1583
1584		spin_lock_bh(&ul->lock);
1585		list_for_each_entry(rt, &ul->head, rt_uncached) {
1586			if (rt->dst.dev != dev)
1587				continue;
1588			rt->dst.dev = blackhole_netdev;
1589			dev_hold(rt->dst.dev);
1590			dev_put(dev);
1591		}
1592		spin_unlock_bh(&ul->lock);
1593	}
1594}
1595
1596static bool rt_cache_valid(const struct rtable *rt)
1597{
1598	return	rt &&
1599		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1600		!rt_is_expired(rt);
1601}
1602
1603static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1604			   const struct fib_result *res,
1605			   struct fib_nh_exception *fnhe,
1606			   struct fib_info *fi, u16 type, u32 itag,
1607			   const bool do_cache)
1608{
1609	bool cached = false;
1610
1611	if (fi) {
1612		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1613
1614		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1615			rt->rt_uses_gateway = 1;
1616			rt->rt_gw_family = nhc->nhc_gw_family;
1617			/* only INET and INET6 are supported */
1618			if (likely(nhc->nhc_gw_family == AF_INET))
1619				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1620			else
1621				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1622		}
1623
1624		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1625
1626#ifdef CONFIG_IP_ROUTE_CLASSID
1627		if (nhc->nhc_family == AF_INET) {
1628			struct fib_nh *nh;
1629
1630			nh = container_of(nhc, struct fib_nh, nh_common);
1631			rt->dst.tclassid = nh->nh_tclassid;
1632		}
1633#endif
1634		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1635		if (unlikely(fnhe))
1636			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1637		else if (do_cache)
1638			cached = rt_cache_route(nhc, rt);
1639		if (unlikely(!cached)) {
1640			/* Routes we intend to cache in nexthop exception or
1641			 * FIB nexthop have the DST_NOCACHE bit clear.
1642			 * However, if we are unsuccessful at storing this
1643			 * route into the cache we really need to set it.
1644			 */
1645			if (!rt->rt_gw4) {
1646				rt->rt_gw_family = AF_INET;
1647				rt->rt_gw4 = daddr;
1648			}
1649			rt_add_uncached_list(rt);
1650		}
1651	} else
1652		rt_add_uncached_list(rt);
1653
1654#ifdef CONFIG_IP_ROUTE_CLASSID
1655#ifdef CONFIG_IP_MULTIPLE_TABLES
1656	set_class_tag(rt, res->tclassid);
1657#endif
1658	set_class_tag(rt, itag);
1659#endif
1660}
1661
1662struct rtable *rt_dst_alloc(struct net_device *dev,
1663			    unsigned int flags, u16 type,
1664			    bool nopolicy, bool noxfrm)
1665{
1666	struct rtable *rt;
1667
1668	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1669		       (nopolicy ? DST_NOPOLICY : 0) |
1670		       (noxfrm ? DST_NOXFRM : 0));
1671
1672	if (rt) {
1673		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1674		rt->rt_flags = flags;
1675		rt->rt_type = type;
1676		rt->rt_is_input = 0;
1677		rt->rt_iif = 0;
1678		rt->rt_pmtu = 0;
1679		rt->rt_mtu_locked = 0;
1680		rt->rt_uses_gateway = 0;
1681		rt->rt_gw_family = 0;
1682		rt->rt_gw4 = 0;
1683		INIT_LIST_HEAD(&rt->rt_uncached);
1684
1685		rt->dst.output = ip_output;
1686		if (flags & RTCF_LOCAL)
1687			rt->dst.input = ip_local_deliver;
1688	}
1689
1690	return rt;
1691}
1692EXPORT_SYMBOL(rt_dst_alloc);
1693
1694struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1695{
1696	struct rtable *new_rt;
1697
1698	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1699			   rt->dst.flags);
1700
1701	if (new_rt) {
1702		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1703		new_rt->rt_flags = rt->rt_flags;
1704		new_rt->rt_type = rt->rt_type;
1705		new_rt->rt_is_input = rt->rt_is_input;
1706		new_rt->rt_iif = rt->rt_iif;
1707		new_rt->rt_pmtu = rt->rt_pmtu;
1708		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1709		new_rt->rt_gw_family = rt->rt_gw_family;
1710		if (rt->rt_gw_family == AF_INET)
1711			new_rt->rt_gw4 = rt->rt_gw4;
1712		else if (rt->rt_gw_family == AF_INET6)
1713			new_rt->rt_gw6 = rt->rt_gw6;
1714		INIT_LIST_HEAD(&new_rt->rt_uncached);
1715
1716		new_rt->dst.input = rt->dst.input;
1717		new_rt->dst.output = rt->dst.output;
1718		new_rt->dst.error = rt->dst.error;
1719		new_rt->dst.lastuse = jiffies;
1720		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1721	}
1722	return new_rt;
1723}
1724EXPORT_SYMBOL(rt_dst_clone);
1725
1726/* called in rcu_read_lock() section */
1727int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1728			  u8 tos, struct net_device *dev,
1729			  struct in_device *in_dev, u32 *itag)
1730{
1731	int err;
1732
1733	/* Primary sanity checks. */
1734	if (!in_dev)
1735		return -EINVAL;
1736
1737	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1738	    skb->protocol != htons(ETH_P_IP))
1739		return -EINVAL;
1740
1741	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1742		return -EINVAL;
1743
1744	if (ipv4_is_zeronet(saddr)) {
1745		if (!ipv4_is_local_multicast(daddr) &&
1746		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1747			return -EINVAL;
1748	} else {
1749		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1750					  in_dev, itag);
1751		if (err < 0)
1752			return err;
1753	}
1754	return 0;
1755}
1756
1757/* called in rcu_read_lock() section */
1758static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1759			     u8 tos, struct net_device *dev, int our)
1760{
1761	struct in_device *in_dev = __in_dev_get_rcu(dev);
1762	unsigned int flags = RTCF_MULTICAST;
1763	struct rtable *rth;
1764	bool no_policy;
1765	u32 itag = 0;
1766	int err;
1767
1768	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1769	if (err)
1770		return err;
1771
1772	if (our)
1773		flags |= RTCF_LOCAL;
1774
1775	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1776	if (no_policy)
1777		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1778
1779	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1780			   no_policy, false);
1781	if (!rth)
1782		return -ENOBUFS;
1783
1784#ifdef CONFIG_IP_ROUTE_CLASSID
1785	rth->dst.tclassid = itag;
1786#endif
1787	rth->dst.output = ip_rt_bug;
1788	rth->rt_is_input = 1;
1789
1790#ifdef CONFIG_IP_MROUTE
1791	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1792		rth->dst.input = ip_mr_input;
1793#endif
1794	RT_CACHE_STAT_INC(in_slow_mc);
1795
1796	skb_dst_drop(skb);
1797	skb_dst_set(skb, &rth->dst);
1798	return 0;
1799}
1800
1801
1802static void ip_handle_martian_source(struct net_device *dev,
1803				     struct in_device *in_dev,
1804				     struct sk_buff *skb,
1805				     __be32 daddr,
1806				     __be32 saddr)
1807{
1808	RT_CACHE_STAT_INC(in_martian_src);
1809#ifdef CONFIG_IP_ROUTE_VERBOSE
1810	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1811		/*
1812		 *	RFC1812 recommendation, if source is martian,
1813		 *	the only hint is MAC header.
1814		 */
1815		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1816			&daddr, &saddr, dev->name);
1817		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1818			print_hex_dump(KERN_WARNING, "ll header: ",
1819				       DUMP_PREFIX_OFFSET, 16, 1,
1820				       skb_mac_header(skb),
1821				       dev->hard_header_len, false);
1822		}
1823	}
1824#endif
1825}
1826
1827/* called in rcu_read_lock() section */
1828static int __mkroute_input(struct sk_buff *skb,
1829			   const struct fib_result *res,
1830			   struct in_device *in_dev,
1831			   __be32 daddr, __be32 saddr, u32 tos)
1832{
1833	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1834	struct net_device *dev = nhc->nhc_dev;
1835	struct fib_nh_exception *fnhe;
1836	struct rtable *rth;
1837	int err;
1838	struct in_device *out_dev;
1839	bool do_cache, no_policy;
1840	u32 itag = 0;
1841
1842	/* get a working reference to the output device */
1843	out_dev = __in_dev_get_rcu(dev);
1844	if (!out_dev) {
1845		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1846		return -EINVAL;
1847	}
1848
1849	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1850				  in_dev->dev, in_dev, &itag);
1851	if (err < 0) {
1852		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1853					 saddr);
1854
1855		goto cleanup;
1856	}
1857
1858	do_cache = res->fi && !itag;
1859	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1860	    skb->protocol == htons(ETH_P_IP)) {
1861		__be32 gw;
1862
1863		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1864		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1865		    inet_addr_onlink(out_dev, saddr, gw))
1866			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1867	}
1868
1869	if (skb->protocol != htons(ETH_P_IP)) {
1870		/* Not IP (i.e. ARP). Do not create a route if it is
1871		 * invalid for proxy arp. DNAT routes are always valid.
1872		 *
1873		 * The proxy arp feature has been extended to allow ARP
1874		 * replies back on the same interface, to support
1875		 * Private VLAN switch technologies. See arp.c.
1876		 */
1877		if (out_dev == in_dev &&
1878		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1879			err = -EINVAL;
1880			goto cleanup;
1881		}
1882	}
1883
1884	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1885	if (no_policy)
1886		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1887
1888	fnhe = find_exception(nhc, daddr);
1889	if (do_cache) {
1890		if (fnhe)
1891			rth = rcu_dereference(fnhe->fnhe_rth_input);
1892		else
1893			rth = rcu_dereference(nhc->nhc_rth_input);
1894		if (rt_cache_valid(rth)) {
1895			skb_dst_set_noref(skb, &rth->dst);
1896			goto out;
1897		}
1898	}
1899
1900	rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1901			   IN_DEV_ORCONF(out_dev, NOXFRM));
1902	if (!rth) {
1903		err = -ENOBUFS;
1904		goto cleanup;
1905	}
1906
1907	rth->rt_is_input = 1;
1908	RT_CACHE_STAT_INC(in_slow_tot);
1909
1910	rth->dst.input = ip_forward;
1911
1912	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1913		       do_cache);
1914	lwtunnel_set_redirect(&rth->dst);
1915	skb_dst_set(skb, &rth->dst);
1916out:
1917	err = 0;
1918 cleanup:
1919	return err;
1920}
1921
1922#ifdef CONFIG_IP_ROUTE_MULTIPATH
1923/* To make ICMP packets follow the right flow, the multipath hash is
1924 * calculated from the inner IP addresses.
1925 */
1926static void ip_multipath_l3_keys(const struct sk_buff *skb,
1927				 struct flow_keys *hash_keys)
1928{
1929	const struct iphdr *outer_iph = ip_hdr(skb);
1930	const struct iphdr *key_iph = outer_iph;
1931	const struct iphdr *inner_iph;
1932	const struct icmphdr *icmph;
1933	struct iphdr _inner_iph;
1934	struct icmphdr _icmph;
1935
1936	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1937		goto out;
1938
1939	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1940		goto out;
1941
1942	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1943				   &_icmph);
1944	if (!icmph)
1945		goto out;
1946
1947	if (!icmp_is_err(icmph->type))
1948		goto out;
1949
1950	inner_iph = skb_header_pointer(skb,
1951				       outer_iph->ihl * 4 + sizeof(_icmph),
1952				       sizeof(_inner_iph), &_inner_iph);
1953	if (!inner_iph)
1954		goto out;
1955
1956	key_iph = inner_iph;
1957out:
1958	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1959	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1960}
1961
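/* Editorial note (not part of the original source): an ICMP error carries
 * the offending datagram's IP header as payload, so hashing on that
 * embedded header makes the error hash like the flow it refers to rather
 * than like an unrelated router-to-host packet.  Non-error ICMP,
 * non-initial fragments and headers that cannot be pulled fall back to
 * the outer header.
 */
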
1962/* if skb is set it will be used and fl4 can be NULL */
1963int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1964		       const struct sk_buff *skb, struct flow_keys *flkeys)
1965{
1966	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1967	struct flow_keys hash_keys;
1968	u32 mhash;
1969
1970	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1971	case 0:
1972		memset(&hash_keys, 0, sizeof(hash_keys));
1973		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1974		if (skb) {
1975			ip_multipath_l3_keys(skb, &hash_keys);
1976		} else {
1977			hash_keys.addrs.v4addrs.src = fl4->saddr;
1978			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1979		}
1980		break;
1981	case 1:
1982		/* skb is currently provided only when forwarding */
1983		if (skb) {
1984			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1985			struct flow_keys keys;
1986
1987			/* short-circuit if we already have L4 hash present */
1988			if (skb->l4_hash)
1989				return skb_get_hash_raw(skb) >> 1;
1990
1991			memset(&hash_keys, 0, sizeof(hash_keys));
1992
1993			if (!flkeys) {
1994				skb_flow_dissect_flow_keys(skb, &keys, flag);
1995				flkeys = &keys;
1996			}
1997
1998			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1999			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2000			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2001			hash_keys.ports.src = flkeys->ports.src;
2002			hash_keys.ports.dst = flkeys->ports.dst;
2003			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2004		} else {
2005			memset(&hash_keys, 0, sizeof(hash_keys));
2006			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2007			hash_keys.addrs.v4addrs.src = fl4->saddr;
2008			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2009			hash_keys.ports.src = fl4->fl4_sport;
2010			hash_keys.ports.dst = fl4->fl4_dport;
2011			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2012		}
2013		break;
2014	case 2:
2015		memset(&hash_keys, 0, sizeof(hash_keys));
2016		/* skb is currently provided only when forwarding */
2017		if (skb) {
2018			struct flow_keys keys;
2019
2020			skb_flow_dissect_flow_keys(skb, &keys, 0);
2021			/* Inner can be v4 or v6 */
2022			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2023				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2025				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2026			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2027				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2028				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2029				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2030				hash_keys.tags.flow_label = keys.tags.flow_label;
2031				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2032			} else {
2033				/* Same as case 0 */
2034				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2035				ip_multipath_l3_keys(skb, &hash_keys);
2036			}
2037		} else {
2038			/* Same as case 0 */
2039			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2040			hash_keys.addrs.v4addrs.src = fl4->saddr;
2041			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2042		}
2043		break;
2044	}
2045	mhash = flow_hash_from_keys(&hash_keys);
2046
2047	if (multipath_hash)
2048		mhash = jhash_2words(mhash, multipath_hash, 0);
2049
2050	return mhash >> 1;
2051}
2052#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2053
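/* Select the nexthop (using the multipath hash when the route has more
 * than one path) and create the input route cache entry.
 */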
2054static int ip_mkroute_input(struct sk_buff *skb,
2055			    struct fib_result *res,
2056			    struct in_device *in_dev,
2057			    __be32 daddr, __be32 saddr, u32 tos,
2058			    struct flow_keys *hkeys)
2059{
2060#ifdef CONFIG_IP_ROUTE_MULTIPATH
2061	if (res->fi && fib_info_num_path(res->fi) > 1) {
2062		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2063
2064		fib_select_multipath(res, h);
2065		IPCB(skb)->flags |= IPSKB_MULTIPATH;
2066	}
2067#endif
2068
2069	/* create a routing cache entry */
2070	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2071}
2072
2073/* Implements all the saddr-related checks as ip_route_input_slow(),
2074 * assuming daddr is valid and the destination is not a local broadcast one.
2075 * Uses the provided hint instead of performing a route lookup.
2076 */
2077int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2078		      u8 tos, struct net_device *dev,
2079		      const struct sk_buff *hint)
2080{
2081	struct in_device *in_dev = __in_dev_get_rcu(dev);
2082	struct rtable *rt = skb_rtable(hint);
2083	struct net *net = dev_net(dev);
2084	int err = -EINVAL;
2085	u32 tag = 0;
2086
2087	if (!in_dev)
2088		return -EINVAL;
2089
2090	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2091		goto martian_source;
2092
2093	if (ipv4_is_zeronet(saddr))
2094		goto martian_source;
2095
2096	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2097		goto martian_source;
2098
2099	if (rt->rt_type != RTN_LOCAL)
2100		goto skip_validate_source;
2101
2102	tos &= IPTOS_RT_MASK;
2103	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2104	if (err < 0)
2105		goto martian_source;
2106
2107skip_validate_source:
2108	skb_dst_copy(skb, hint);
2109	return 0;
2110
2111martian_source:
2112	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2113	return err;
2114}
2115
2116/* get device for dst_alloc with local routes */
2117static struct net_device *ip_rt_get_dev(struct net *net,
2118					const struct fib_result *res)
2119{
2120	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2121	struct net_device *dev = NULL;
2122
2123	if (nhc)
2124		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2125
2126	return dev ? : net->loopback_dev;
2127}
2128
2129/*
2130 *	NOTE. We drop all packets that have local source
2131 *	addresses, because every properly looped-back packet must
2132 *	already have the correct destination attached by the output routine.
2133 *	Changes to the enforced policies must also be applied to
2134 *	ip_route_use_hint().
2135 *
2136 *	Such an approach solves two big problems:
2137 *	1. Non-simplex devices are handled properly.
2138 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2139 *	Called with rcu_read_lock().
2140 */
2141
2142static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2143			       u8 tos, struct net_device *dev,
2144			       struct fib_result *res)
2145{
2146	struct in_device *in_dev = __in_dev_get_rcu(dev);
2147	struct flow_keys *flkeys = NULL, _flkeys;
2148	struct net    *net = dev_net(dev);
2149	struct ip_tunnel_info *tun_info;
2150	int		err = -EINVAL;
2151	unsigned int	flags = 0;
2152	u32		itag = 0;
2153	struct rtable	*rth;
2154	struct flowi4	fl4;
2155	bool do_cache = true;
2156	bool no_policy;
2157
2158	/* IP on this device is disabled. */
2159
2160	if (!in_dev)
2161		goto out;
2162
2163	/* Check for the weirdest martians, which cannot be detected
2164	   by fib_lookup.
2165	 */
2166
2167	tun_info = skb_tunnel_info(skb);
2168	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2169		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2170	else
2171		fl4.flowi4_tun_key.tun_id = 0;
2172	skb_dst_drop(skb);
2173
2174	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2175		goto martian_source;
2176
2177	res->fi = NULL;
2178	res->table = NULL;
2179	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2180		goto brd_input;
2181
2182	/* Accept zero addresses only for limited broadcast;
2183	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2184	 */
2185	if (ipv4_is_zeronet(saddr))
2186		goto martian_source;
2187
2188	if (ipv4_is_zeronet(daddr))
2189		goto martian_destination;
2190
2191	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2192	 * and calls it at most once when daddr and/or saddr are loopback addresses.
2193	 */
2194	if (ipv4_is_loopback(daddr)) {
2195		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2196			goto martian_destination;
2197	} else if (ipv4_is_loopback(saddr)) {
2198		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2199			goto martian_source;
2200	}
2201
2202	/*
2203	 *	Now we are ready to route the packet.
2204	 */
2205	fl4.flowi4_oif = 0;
2206	fl4.flowi4_iif = dev->ifindex;
2207	fl4.flowi4_mark = skb->mark;
2208	fl4.flowi4_tos = tos;
2209	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2210	fl4.flowi4_flags = 0;
2211	fl4.daddr = daddr;
2212	fl4.saddr = saddr;
2213	fl4.flowi4_uid = sock_net_uid(net, NULL);
2214	fl4.flowi4_multipath_hash = 0;
2215
2216	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2217		flkeys = &_flkeys;
2218	} else {
2219		fl4.flowi4_proto = 0;
2220		fl4.fl4_sport = 0;
2221		fl4.fl4_dport = 0;
2222	}
2223
2224	err = fib_lookup(net, &fl4, res, 0);
2225	if (err != 0) {
2226		if (!IN_DEV_FORWARD(in_dev))
2227			err = -EHOSTUNREACH;
2228		goto no_route;
2229	}
2230
2231	if (res->type == RTN_BROADCAST) {
2232		if (IN_DEV_BFORWARD(in_dev))
2233			goto make_route;
2234		/* do not cache if bc_forwarding is enabled */
2235		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2236			do_cache = false;
2237		goto brd_input;
2238	}
2239
2240	if (res->type == RTN_LOCAL) {
2241		err = fib_validate_source(skb, saddr, daddr, tos,
2242					  0, dev, in_dev, &itag);
2243		if (err < 0)
2244			goto martian_source;
2245		goto local_input;
2246	}
2247
2248	if (!IN_DEV_FORWARD(in_dev)) {
2249		err = -EHOSTUNREACH;
2250		goto no_route;
2251	}
2252	if (res->type != RTN_UNICAST)
2253		goto martian_destination;
2254
2255make_route:
2256	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2257out:	return err;
2258
2259brd_input:
2260	if (skb->protocol != htons(ETH_P_IP))
2261		goto e_inval;
2262
2263	if (!ipv4_is_zeronet(saddr)) {
2264		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2265					  in_dev, &itag);
2266		if (err < 0)
2267			goto martian_source;
2268	}
2269	flags |= RTCF_BROADCAST;
2270	res->type = RTN_BROADCAST;
2271	RT_CACHE_STAT_INC(in_brd);
2272
2273local_input:
2274	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2275	if (no_policy)
2276		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2277
2278	do_cache &= res->fi && !itag;
2279	if (do_cache) {
2280		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2281
2282		rth = rcu_dereference(nhc->nhc_rth_input);
2283		if (rt_cache_valid(rth)) {
2284			skb_dst_set_noref(skb, &rth->dst);
2285			err = 0;
2286			goto out;
2287		}
2288	}
2289
2290	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2291			   flags | RTCF_LOCAL, res->type,
2292			   no_policy, false);
2293	if (!rth)
2294		goto e_nobufs;
2295
2296	rth->dst.output = ip_rt_bug;
2297#ifdef CONFIG_IP_ROUTE_CLASSID
2298	rth->dst.tclassid = itag;
2299#endif
2300	rth->rt_is_input = 1;
2301
2302	RT_CACHE_STAT_INC(in_slow_tot);
2303	if (res->type == RTN_UNREACHABLE) {
2304		rth->dst.input = ip_error;
2305		rth->dst.error = -err;
2306		rth->rt_flags &= ~RTCF_LOCAL;
2307	}
2308
2309	if (do_cache) {
2310		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2311
2312		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2313		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2314			WARN_ON(rth->dst.input == lwtunnel_input);
2315			rth->dst.lwtstate->orig_input = rth->dst.input;
2316			rth->dst.input = lwtunnel_input;
2317		}
2318
2319		if (unlikely(!rt_cache_route(nhc, rth)))
2320			rt_add_uncached_list(rth);
2321	}
2322	skb_dst_set(skb, &rth->dst);
2323	err = 0;
2324	goto out;
2325
2326no_route:
2327	RT_CACHE_STAT_INC(in_no_route);
2328	res->type = RTN_UNREACHABLE;
2329	res->fi = NULL;
2330	res->table = NULL;
2331	goto local_input;
2332
2333	/*
2334	 *	Do not cache martian addresses: they should be logged (RFC1812)
2335	 */
2336martian_destination:
2337	RT_CACHE_STAT_INC(in_martian_dst);
2338#ifdef CONFIG_IP_ROUTE_VERBOSE
2339	if (IN_DEV_LOG_MARTIANS(in_dev))
2340		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2341				     &daddr, &saddr, dev->name);
2342#endif
2343
2344e_inval:
2345	err = -EINVAL;
2346	goto out;
2347
2348e_nobufs:
2349	err = -ENOBUFS;
2350	goto out;
2351
2352martian_source:
2353	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2354	goto out;
2355}
2356
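/* Resolve the input route for an skb: take rcu_read_lock, mask the tos
 * with IPTOS_RT_MASK and defer to ip_route_input_rcu().
 */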
2357int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2358			 u8 tos, struct net_device *dev)
2359{
2360	struct fib_result res;
2361	int err;
2362
2363	tos &= IPTOS_RT_MASK;
2364	rcu_read_lock();
2365	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2366	rcu_read_unlock();
2367
2368	return err;
2369}
2370EXPORT_SYMBOL(ip_route_input_noref);
2371
2372/* called with rcu_read_lock held */
2373int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2374		       u8 tos, struct net_device *dev, struct fib_result *res)
2375{
2376	/* Multicast recognition logic was moved from the route cache to here.
2377	   The problem was that too many Ethernet cards have broken/missing
2378	   hardware multicast filters :-( As a result, a host on a multicast
2379	   network acquires a lot of useless route cache entries, e.g. for
2380	   SDR messages from all over the world. Now we try to get rid of them.
2381	   Really, provided the software IP multicast filter is organized
2382	   reasonably (at least, hashed), it does not result in a slowdown
2383	   compared with route cache reject entries.
2384	   Note that multicast routers are not affected, because a
2385	   route cache entry is created eventually.
2386	 */
2387	if (ipv4_is_multicast(daddr)) {
2388		struct in_device *in_dev = __in_dev_get_rcu(dev);
2389		int our = 0;
2390		int err = -EINVAL;
2391
2392		if (!in_dev)
2393			return err;
2394		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2395				      ip_hdr(skb)->protocol);
2396
2397		/* check l3 master if no match yet */
2398		if (!our && netif_is_l3_slave(dev)) {
2399			struct in_device *l3_in_dev;
2400
2401			l3_in_dev = __in_dev_get_rcu(skb->dev);
2402			if (l3_in_dev)
2403				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2404						      ip_hdr(skb)->protocol);
2405		}
2406
2407		if (our
2408#ifdef CONFIG_IP_MROUTE
2409			||
2410		    (!ipv4_is_local_multicast(daddr) &&
2411		     IN_DEV_MFORWARD(in_dev))
2412#endif
2413		   ) {
2414			err = ip_route_input_mc(skb, daddr, saddr,
2415						tos, dev, our);
2416		}
2417		return err;
2418	}
2419
2420	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2421}
2422
2423/* called with rcu_read_lock() */
2424static struct rtable *__mkroute_output(const struct fib_result *res,
2425				       const struct flowi4 *fl4, int orig_oif,
2426				       struct net_device *dev_out,
2427				       unsigned int flags)
2428{
2429	struct fib_info *fi = res->fi;
2430	struct fib_nh_exception *fnhe;
2431	struct in_device *in_dev;
2432	u16 type = res->type;
2433	struct rtable *rth;
2434	bool do_cache;
2435
2436	in_dev = __in_dev_get_rcu(dev_out);
2437	if (!in_dev)
2438		return ERR_PTR(-EINVAL);
2439
2440	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2441		if (ipv4_is_loopback(fl4->saddr) &&
2442		    !(dev_out->flags & IFF_LOOPBACK) &&
2443		    !netif_is_l3_master(dev_out))
2444			return ERR_PTR(-EINVAL);
2445
2446	if (ipv4_is_lbcast(fl4->daddr))
2447		type = RTN_BROADCAST;
2448	else if (ipv4_is_multicast(fl4->daddr))
2449		type = RTN_MULTICAST;
2450	else if (ipv4_is_zeronet(fl4->daddr))
2451		return ERR_PTR(-EINVAL);
2452
2453	if (dev_out->flags & IFF_LOOPBACK)
2454		flags |= RTCF_LOCAL;
2455
2456	do_cache = true;
2457	if (type == RTN_BROADCAST) {
2458		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2459		fi = NULL;
2460	} else if (type == RTN_MULTICAST) {
2461		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2462		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2463				     fl4->flowi4_proto))
2464			flags &= ~RTCF_LOCAL;
2465		else
2466			do_cache = false;
2467		/* If a multicast route does not exist, use the
2468		 * default one, but do not use a gateway in this case.
2469		 * Yes, it is a hack.
2470		 */
2471		if (fi && res->prefixlen < 4)
2472			fi = NULL;
2473	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2474		   (orig_oif != dev_out->ifindex)) {
2475		/* For local routes that require a particular output interface
2476		 * we do not want to cache the result.  Caching the result
2477		 * causes incorrect behaviour when there are multiple source
2478		 * addresses on the interface: if the intended recipient is
2479		 * waiting on that interface for the packet, it won't receive
2480		 * it, because it will be delivered on the loopback interface
2481		 * and the IP_PKTINFO ipi_ifindex will be set to the loopback
2482		 * interface as well.
2483		 */
2484		do_cache = false;
2485	}
2486
2487	fnhe = NULL;
2488	do_cache &= fi != NULL;
2489	if (fi) {
2490		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2491		struct rtable __rcu **prth;
2492
2493		fnhe = find_exception(nhc, fl4->daddr);
2494		if (!do_cache)
2495			goto add;
2496		if (fnhe) {
2497			prth = &fnhe->fnhe_rth_output;
2498		} else {
2499			if (unlikely(fl4->flowi4_flags &
2500				     FLOWI_FLAG_KNOWN_NH &&
2501				     !(nhc->nhc_gw_family &&
2502				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2503				do_cache = false;
2504				goto add;
2505			}
2506			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2507		}
2508		rth = rcu_dereference(*prth);
2509		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2510			return rth;
2511	}
2512
2513add:
2514	rth = rt_dst_alloc(dev_out, flags, type,
2515			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2516			   IN_DEV_ORCONF(in_dev, NOXFRM));
2517	if (!rth)
2518		return ERR_PTR(-ENOBUFS);
2519
2520	rth->rt_iif = orig_oif;
2521
2522	RT_CACHE_STAT_INC(out_slow_tot);
2523
2524	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2525		if (flags & RTCF_LOCAL &&
2526		    !(dev_out->flags & IFF_LOOPBACK)) {
2527			rth->dst.output = ip_mc_output;
2528			RT_CACHE_STAT_INC(out_slow_mc);
2529		}
2530#ifdef CONFIG_IP_MROUTE
2531		if (type == RTN_MULTICAST) {
2532			if (IN_DEV_MFORWARD(in_dev) &&
2533			    !ipv4_is_local_multicast(fl4->daddr)) {
2534				rth->dst.input = ip_mr_input;
2535				rth->dst.output = ip_mc_output;
2536			}
2537		}
2538#endif
2539	}
2540
2541	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2542	lwtunnel_set_redirect(&rth->dst);
2543
2544	return rth;
2545}
2546
2547/*
2548 * Major route resolver routine.
2549 */
2550
2551struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2552					const struct sk_buff *skb)
2553{
2554	struct fib_result res = {
2555		.type		= RTN_UNSPEC,
2556		.fi		= NULL,
2557		.table		= NULL,
2558		.tclassid	= 0,
2559	};
2560	struct rtable *rth;
2561
2562	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2563	ip_rt_fix_tos(fl4);
2564
2565	rcu_read_lock();
2566	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2567	rcu_read_unlock();
2568
2569	return rth;
2570}
2571EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2572
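/* Variant of ip_route_output_key_hash() for callers that already hold
 * rcu_read_lock(), e.g. inet_rtm_getroute().
 */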
2573struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2574					    struct fib_result *res,
2575					    const struct sk_buff *skb)
2576{
2577	struct net_device *dev_out = NULL;
2578	int orig_oif = fl4->flowi4_oif;
2579	unsigned int flags = 0;
2580	struct rtable *rth;
2581	int err;
2582
2583	if (fl4->saddr) {
2584		if (ipv4_is_multicast(fl4->saddr) ||
2585		    ipv4_is_lbcast(fl4->saddr) ||
2586		    ipv4_is_zeronet(fl4->saddr)) {
2587			rth = ERR_PTR(-EINVAL);
2588			goto out;
2589		}
2590
2591		rth = ERR_PTR(-ENETUNREACH);
2592
2593		/* I removed the check for oif == dev_out->oif here.
2594		   It was wrong for two reasons:
2595		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2596		      is assigned to multiple interfaces.
2597		   2. Moreover, we are allowed to send packets with the saddr
2598		      of another iface. --ANK
2599		 */
2600
2601		if (fl4->flowi4_oif == 0 &&
2602		    (ipv4_is_multicast(fl4->daddr) ||
2603		     ipv4_is_lbcast(fl4->daddr))) {
2604			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2605			dev_out = __ip_dev_find(net, fl4->saddr, false);
2606			if (!dev_out)
2607				goto out;
2608
2609			/* Special hack: the user can direct multicasts
2610			   and limited broadcast via the necessary interface
2611			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2612			   This hack is not just for fun, it allows
2613			   vic, vat and friends to work.
2614			   They bind the socket to loopback, set the ttl to zero
2615			   and expect that it will work.
2616			   From the viewpoint of the routing cache they are broken,
2617			   because we are not allowed to build a multicast path
2618			   with a loopback source addr (the routing cache
2619			   cannot know that the ttl is zero, so the packet
2620			   will not leave this host and the route is valid).
2621			   Luckily, this hack is a good workaround.
2622			 */
2623
2624			fl4->flowi4_oif = dev_out->ifindex;
2625			goto make_route;
2626		}
2627
2628		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2629			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2630			if (!__ip_dev_find(net, fl4->saddr, false))
2631				goto out;
2632		}
2633	}
2634
2635
2636	if (fl4->flowi4_oif) {
2637		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2638		rth = ERR_PTR(-ENODEV);
2639		if (!dev_out)
2640			goto out;
2641
2642		/* RACE: Check return value of inet_select_addr instead. */
2643		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2644			rth = ERR_PTR(-ENETUNREACH);
2645			goto out;
2646		}
2647		if (ipv4_is_local_multicast(fl4->daddr) ||
2648		    ipv4_is_lbcast(fl4->daddr) ||
2649		    fl4->flowi4_proto == IPPROTO_IGMP) {
2650			if (!fl4->saddr)
2651				fl4->saddr = inet_select_addr(dev_out, 0,
2652							      RT_SCOPE_LINK);
2653			goto make_route;
2654		}
2655		if (!fl4->saddr) {
2656			if (ipv4_is_multicast(fl4->daddr))
2657				fl4->saddr = inet_select_addr(dev_out, 0,
2658							      fl4->flowi4_scope);
2659			else if (!fl4->daddr)
2660				fl4->saddr = inet_select_addr(dev_out, 0,
2661							      RT_SCOPE_HOST);
2662		}
2663	}
2664
2665	if (!fl4->daddr) {
2666		fl4->daddr = fl4->saddr;
2667		if (!fl4->daddr)
2668			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2669		dev_out = net->loopback_dev;
2670		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2671		res->type = RTN_LOCAL;
2672		flags |= RTCF_LOCAL;
2673		goto make_route;
2674	}
2675
2676	err = fib_lookup(net, fl4, res, 0);
2677	if (err) {
2678		res->fi = NULL;
2679		res->table = NULL;
2680		if (fl4->flowi4_oif &&
2681		    (ipv4_is_multicast(fl4->daddr) ||
2682		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2683			/* Apparently, the routing tables are wrong. Assume
2684			   that the destination is on-link.

2686			   WHY? DW.
2687			   Because we are allowed to send to an iface
2688			   even if it has NO routes and NO assigned
2689			   addresses. When oif is specified, the routing
2690			   tables are looked up with only one purpose:
2691			   to determine whether the destination is gatewayed
2692			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2693			   we send the packet, ignoring both the routing tables
2694			   and the ifaddr state. --ANK


2697			   We could do this even if oif is unknown,
2698			   likely as IPv6 does, but we do not.
2699			 */
2700
2701			if (fl4->saddr == 0)
2702				fl4->saddr = inet_select_addr(dev_out, 0,
2703							      RT_SCOPE_LINK);
2704			res->type = RTN_UNICAST;
2705			goto make_route;
2706		}
2707		rth = ERR_PTR(err);
2708		goto out;
2709	}
2710
2711	if (res->type == RTN_LOCAL) {
2712		if (!fl4->saddr) {
2713			if (res->fi->fib_prefsrc)
2714				fl4->saddr = res->fi->fib_prefsrc;
2715			else
2716				fl4->saddr = fl4->daddr;
2717		}
2718
2719		/* L3 master device is the loopback for that domain */
2720		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2721			net->loopback_dev;
2722
2723		/* make sure orig_oif points to fib result device even
2724		 * though packet rx/tx happens over loopback or l3mdev
2725		 */
2726		orig_oif = FIB_RES_OIF(*res);
2727
2728		fl4->flowi4_oif = dev_out->ifindex;
2729		flags |= RTCF_LOCAL;
2730		goto make_route;
2731	}
2732
2733	fib_select_path(net, res, fl4, skb);
2734
2735	dev_out = FIB_RES_DEV(*res);
2736
2737make_route:
2738	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2739
2740out:
2741	return rth;
2742}
2743
2744static struct dst_ops ipv4_dst_blackhole_ops = {
2745	.family			= AF_INET,
2746	.default_advmss		= ipv4_default_advmss,
2747	.neigh_lookup		= ipv4_neigh_lookup,
2748	.check			= dst_blackhole_check,
2749	.cow_metrics		= dst_blackhole_cow_metrics,
2750	.update_pmtu		= dst_blackhole_update_pmtu,
2751	.redirect		= dst_blackhole_redirect,
2752	.mtu			= dst_blackhole_mtu,
2753};
2754
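/* Clone the relevant fields of dst_orig into a blackhole route whose
 * input and output handlers simply discard packets.
 */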
2755struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2756{
2757	struct rtable *ort = (struct rtable *) dst_orig;
2758	struct rtable *rt;
2759
2760	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2761	if (rt) {
2762		struct dst_entry *new = &rt->dst;
2763
2764		new->__use = 1;
2765		new->input = dst_discard;
2766		new->output = dst_discard_out;
2767
2768		new->dev = net->loopback_dev;
2769		if (new->dev)
2770			dev_hold(new->dev);
2771
2772		rt->rt_is_input = ort->rt_is_input;
2773		rt->rt_iif = ort->rt_iif;
2774		rt->rt_pmtu = ort->rt_pmtu;
2775		rt->rt_mtu_locked = ort->rt_mtu_locked;
2776
2777		rt->rt_genid = rt_genid_ipv4(net);
2778		rt->rt_flags = ort->rt_flags;
2779		rt->rt_type = ort->rt_type;
2780		rt->rt_uses_gateway = ort->rt_uses_gateway;
2781		rt->rt_gw_family = ort->rt_gw_family;
2782		if (rt->rt_gw_family == AF_INET)
2783			rt->rt_gw4 = ort->rt_gw4;
2784		else if (rt->rt_gw_family == AF_INET6)
2785			rt->rt_gw6 = ort->rt_gw6;
2786
2787		INIT_LIST_HEAD(&rt->rt_uncached);
2788	}
2789
2790	dst_release(dst_orig);
2791
2792	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2793}
2794
2795struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2796				    const struct sock *sk)
2797{
2798	struct rtable *rt = __ip_route_output_key(net, flp4);
2799
2800	if (IS_ERR(rt))
2801		return rt;
2802
2803	if (flp4->flowi4_proto) {
2804		flp4->flowi4_oif = rt->dst.dev->ifindex;
2805		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2806							flowi4_to_flowi(flp4),
2807							sk, 0);
2808	}
2809
2810	return rt;
2811}
2812EXPORT_SYMBOL_GPL(ip_route_output_flow);
2813
2814struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2815				      struct net_device *dev,
2816				      struct net *net, __be32 *saddr,
2817				      const struct ip_tunnel_info *info,
2818				      u8 protocol, bool use_cache)
2819{
2820#ifdef CONFIG_DST_CACHE
2821	struct dst_cache *dst_cache;
2822#endif
2823	struct rtable *rt = NULL;
2824	struct flowi4 fl4;
2825	__u8 tos;
2826
2827#ifdef CONFIG_DST_CACHE
2828	dst_cache = (struct dst_cache *)&info->dst_cache;
2829	if (use_cache) {
2830		rt = dst_cache_get_ip4(dst_cache, saddr);
2831		if (rt)
2832			return rt;
2833	}
2834#endif
2835	memset(&fl4, 0, sizeof(fl4));
2836	fl4.flowi4_mark = skb->mark;
2837	fl4.flowi4_proto = protocol;
2838	fl4.daddr = info->key.u.ipv4.dst;
2839	fl4.saddr = info->key.u.ipv4.src;
2840	tos = info->key.tos;
2841	fl4.flowi4_tos = RT_TOS(tos);
2842
2843	rt = ip_route_output_key(net, &fl4);
2844	if (IS_ERR(rt)) {
2845		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2846		return ERR_PTR(-ENETUNREACH);
2847	}
2848	if (rt->dst.dev == dev) { /* is this necessary? */
2849		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2850		ip_rt_put(rt);
2851		return ERR_PTR(-ELOOP);
2852	}
2853#ifdef CONFIG_DST_CACHE
2854	if (use_cache)
2855		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2856#endif
2857	*saddr = fl4.saddr;
2858	return rt;
2859}
2860EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2861
2862/* called with rcu_read_lock held */
2863static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2864			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2865			struct sk_buff *skb, u32 portid, u32 seq,
2866			unsigned int flags)
2867{
2868	struct rtmsg *r;
2869	struct nlmsghdr *nlh;
2870	unsigned long expires = 0;
2871	u32 error;
2872	u32 metrics[RTAX_MAX];
2873
2874	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2875	if (!nlh)
2876		return -EMSGSIZE;
2877
2878	r = nlmsg_data(nlh);
2879	r->rtm_family	 = AF_INET;
2880	r->rtm_dst_len	= 32;
2881	r->rtm_src_len	= 0;
2882	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2883	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2884	if (nla_put_u32(skb, RTA_TABLE, table_id))
2885		goto nla_put_failure;
2886	r->rtm_type	= rt->rt_type;
2887	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2888	r->rtm_protocol = RTPROT_UNSPEC;
2889	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2890	if (rt->rt_flags & RTCF_NOTIFY)
2891		r->rtm_flags |= RTM_F_NOTIFY;
2892	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2893		r->rtm_flags |= RTCF_DOREDIRECT;
2894
2895	if (nla_put_in_addr(skb, RTA_DST, dst))
2896		goto nla_put_failure;
2897	if (src) {
2898		r->rtm_src_len = 32;
2899		if (nla_put_in_addr(skb, RTA_SRC, src))
2900			goto nla_put_failure;
2901	}
2902	if (rt->dst.dev &&
2903	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2904		goto nla_put_failure;
2905#ifdef CONFIG_IP_ROUTE_CLASSID
2906	if (rt->dst.tclassid &&
2907	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2908		goto nla_put_failure;
2909#endif
2910	if (fl4 && !rt_is_input_route(rt) &&
2911	    fl4->saddr != src) {
2912		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2913			goto nla_put_failure;
2914	}
2915	if (rt->rt_uses_gateway) {
2916		if (rt->rt_gw_family == AF_INET &&
2917		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2918			goto nla_put_failure;
2919		} else if (rt->rt_gw_family == AF_INET6) {
2920			int alen = sizeof(struct in6_addr);
2921			struct nlattr *nla;
2922			struct rtvia *via;
2923
2924			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2925			if (!nla)
2926				goto nla_put_failure;
2927
2928			via = nla_data(nla);
2929			via->rtvia_family = AF_INET6;
2930			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2931		}
2932	}
2933
2934	expires = rt->dst.expires;
2935	if (expires) {
2936		unsigned long now = jiffies;
2937
2938		if (time_before(now, expires))
2939			expires -= now;
2940		else
2941			expires = 0;
2942	}
2943
2944	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2945	if (rt->rt_pmtu && expires)
2946		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2947	if (rt->rt_mtu_locked && expires)
2948		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2949	if (rtnetlink_put_metrics(skb, metrics) < 0)
2950		goto nla_put_failure;
2951
2952	if (fl4) {
2953		if (fl4->flowi4_mark &&
2954		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2955			goto nla_put_failure;
2956
2957		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2958		    nla_put_u32(skb, RTA_UID,
2959				from_kuid_munged(current_user_ns(),
2960						 fl4->flowi4_uid)))
2961			goto nla_put_failure;
2962
2963		if (rt_is_input_route(rt)) {
2964#ifdef CONFIG_IP_MROUTE
2965			if (ipv4_is_multicast(dst) &&
2966			    !ipv4_is_local_multicast(dst) &&
2967			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2968				int err = ipmr_get_route(net, skb,
2969							 fl4->saddr, fl4->daddr,
2970							 r, portid);
2971
2972				if (err <= 0) {
2973					if (err == 0)
2974						return 0;
2975					goto nla_put_failure;
2976				}
2977			} else
2978#endif
2979				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2980					goto nla_put_failure;
2981		}
2982	}
2983
2984	error = rt->dst.error;
2985
2986	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2987		goto nla_put_failure;
2988
2989	nlmsg_end(skb, nlh);
2990	return 0;
2991
2992nla_put_failure:
2993	nlmsg_cancel(skb, nlh);
2994	return -EMSGSIZE;
2995}
2996
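/* Walk one nexthop exception hash table and emit a route entry for every
 * exception that matches the current genid and has not expired.
 */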
2997static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2998			    struct netlink_callback *cb, u32 table_id,
2999			    struct fnhe_hash_bucket *bucket, int genid,
3000			    int *fa_index, int fa_start, unsigned int flags)
3001{
3002	int i;
3003
3004	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3005		struct fib_nh_exception *fnhe;
3006
3007		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3008		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3009			struct rtable *rt;
3010			int err;
3011
3012			if (*fa_index < fa_start)
3013				goto next;
3014
3015			if (fnhe->fnhe_genid != genid)
3016				goto next;
3017
3018			if (fnhe->fnhe_expires &&
3019			    time_after(jiffies, fnhe->fnhe_expires))
3020				goto next;
3021
3022			rt = rcu_dereference(fnhe->fnhe_rth_input);
3023			if (!rt)
3024				rt = rcu_dereference(fnhe->fnhe_rth_output);
3025			if (!rt)
3026				goto next;
3027
3028			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3029					   table_id, NULL, skb,
3030					   NETLINK_CB(cb->skb).portid,
3031					   cb->nlh->nlmsg_seq, flags);
3032			if (err)
3033				return err;
3034next:
3035			(*fa_index)++;
3036		}
3037	}
3038
3039	return 0;
3040}
3041
3042int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3043		       u32 table_id, struct fib_info *fi,
3044		       int *fa_index, int fa_start, unsigned int flags)
3045{
3046	struct net *net = sock_net(cb->skb->sk);
3047	int nhsel, genid = fnhe_genid(net);
3048
3049	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3050		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3051		struct fnhe_hash_bucket *bucket;
3052		int err;
3053
3054		if (nhc->nhc_flags & RTNH_F_DEAD)
3055			continue;
3056
3057		rcu_read_lock();
3058		bucket = rcu_dereference(nhc->nhc_exceptions);
3059		err = 0;
3060		if (bucket)
3061			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3062					       genid, fa_index, fa_start,
3063					       flags);
3064		rcu_read_unlock();
3065		if (err)
3066			return err;
3067	}
3068
3069	return 0;
3070}
3071
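/* Build a dummy skb carrying the requested IP header plus an optional
 * UDP/TCP/ICMP header, so that an RTM_GETROUTE request can be pushed
 * through the real routing path.
 */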
3072static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3073						   u8 ip_proto, __be16 sport,
3074						   __be16 dport)
3075{
3076	struct sk_buff *skb;
3077	struct iphdr *iph;
3078
3079	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3080	if (!skb)
3081		return NULL;
3082
3083	/* Reserve room for dummy headers; this skb can pass
3084	 * through a good chunk of the routing engine.
3085	 */
3086	skb_reset_mac_header(skb);
3087	skb_reset_network_header(skb);
3088	skb->protocol = htons(ETH_P_IP);
3089	iph = skb_put(skb, sizeof(struct iphdr));
3090	iph->protocol = ip_proto;
3091	iph->saddr = src;
3092	iph->daddr = dst;
3093	iph->version = 0x4;
3094	iph->frag_off = 0;
3095	iph->ihl = 0x5;
3096	skb_set_transport_header(skb, skb->len);
3097
3098	switch (iph->protocol) {
3099	case IPPROTO_UDP: {
3100		struct udphdr *udph;
3101
3102		udph = skb_put_zero(skb, sizeof(struct udphdr));
3103		udph->source = sport;
3104		udph->dest = dport;
3105		udph->len = htons(sizeof(struct udphdr));
3106		udph->check = 0;
3107		break;
3108	}
3109	case IPPROTO_TCP: {
3110		struct tcphdr *tcph;
3111
3112		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3113		tcph->source	= sport;
3114		tcph->dest	= dport;
3115		tcph->doff	= sizeof(struct tcphdr) / 4;
3116		tcph->rst = 1;
3117		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3118					    src, dst, 0);
3119		break;
3120	}
3121	case IPPROTO_ICMP: {
3122		struct icmphdr *icmph;
3123
3124		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3125		icmph->type = ICMP_ECHO;
3126		icmph->code = 0;
3127	}
3128	}
3129
3130	return skb;
3131}
3132
3133static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3134				       const struct nlmsghdr *nlh,
3135				       struct nlattr **tb,
3136				       struct netlink_ext_ack *extack)
3137{
3138	struct rtmsg *rtm;
3139	int i, err;
3140
3141	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3142		NL_SET_ERR_MSG(extack,
3143			       "ipv4: Invalid header for route get request");
3144		return -EINVAL;
3145	}
3146
3147	if (!netlink_strict_get_check(skb))
3148		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3149					      rtm_ipv4_policy, extack);
3150
3151	rtm = nlmsg_data(nlh);
3152	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3153	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3154	    rtm->rtm_table || rtm->rtm_protocol ||
3155	    rtm->rtm_scope || rtm->rtm_type) {
3156		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3157		return -EINVAL;
3158	}
3159
3160	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3161			       RTM_F_LOOKUP_TABLE |
3162			       RTM_F_FIB_MATCH)) {
3163		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3164		return -EINVAL;
3165	}
3166
3167	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3168					    rtm_ipv4_policy, extack);
3169	if (err)
3170		return err;
3171
3172	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3173	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3174		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3175		return -EINVAL;
3176	}
3177
3178	for (i = 0; i <= RTA_MAX; i++) {
3179		if (!tb[i])
3180			continue;
3181
3182		switch (i) {
3183		case RTA_IIF:
3184		case RTA_OIF:
3185		case RTA_SRC:
3186		case RTA_DST:
3187		case RTA_IP_PROTO:
3188		case RTA_SPORT:
3189		case RTA_DPORT:
3190		case RTA_MARK:
3191		case RTA_UID:
3192			break;
3193		default:
3194			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3195			return -EINVAL;
3196		}
3197	}
3198
3199	return 0;
3200}
3201
3202static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3203			     struct netlink_ext_ack *extack)
3204{
3205	struct net *net = sock_net(in_skb->sk);
3206	struct nlattr *tb[RTA_MAX+1];
3207	u32 table_id = RT_TABLE_MAIN;
3208	__be16 sport = 0, dport = 0;
3209	struct fib_result res = {};
3210	u8 ip_proto = IPPROTO_UDP;
3211	struct rtable *rt = NULL;
3212	struct sk_buff *skb;
3213	struct rtmsg *rtm;
3214	struct flowi4 fl4 = {};
3215	__be32 dst = 0;
3216	__be32 src = 0;
3217	kuid_t uid;
3218	u32 iif;
3219	int err;
3220	int mark;
3221
3222	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3223	if (err < 0)
3224		return err;
3225
3226	rtm = nlmsg_data(nlh);
3227	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3228	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3229	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3230	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3231	if (tb[RTA_UID])
3232		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3233	else
3234		uid = (iif ? INVALID_UID : current_uid());
3235
3236	if (tb[RTA_IP_PROTO]) {
3237		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3238						  &ip_proto, AF_INET, extack);
3239		if (err)
3240			return err;
3241	}
3242
3243	if (tb[RTA_SPORT])
3244		sport = nla_get_be16(tb[RTA_SPORT]);
3245
3246	if (tb[RTA_DPORT])
3247		dport = nla_get_be16(tb[RTA_DPORT]);
3248
3249	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3250	if (!skb)
3251		return -ENOBUFS;
3252
3253	fl4.daddr = dst;
3254	fl4.saddr = src;
3255	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3256	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3257	fl4.flowi4_mark = mark;
3258	fl4.flowi4_uid = uid;
3259	if (sport)
3260		fl4.fl4_sport = sport;
3261	if (dport)
3262		fl4.fl4_dport = dport;
3263	fl4.flowi4_proto = ip_proto;
3264
3265	rcu_read_lock();
3266
3267	if (iif) {
3268		struct net_device *dev;
3269
3270		dev = dev_get_by_index_rcu(net, iif);
3271		if (!dev) {
3272			err = -ENODEV;
3273			goto errout_rcu;
3274		}
3275
3276		fl4.flowi4_iif = iif; /* for rt_fill_info */
3277		skb->dev	= dev;
3278		skb->mark	= mark;
3279		err = ip_route_input_rcu(skb, dst, src,
3280					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3281					 &res);
3282
3283		rt = skb_rtable(skb);
3284		if (err == 0 && rt->dst.error)
3285			err = -rt->dst.error;
3286	} else {
3287		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3288		skb->dev = net->loopback_dev;
3289		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3290		err = 0;
3291		if (IS_ERR(rt))
3292			err = PTR_ERR(rt);
3293		else
3294			skb_dst_set(skb, &rt->dst);
3295	}
3296
3297	if (err)
3298		goto errout_rcu;
3299
3300	if (rtm->rtm_flags & RTM_F_NOTIFY)
3301		rt->rt_flags |= RTCF_NOTIFY;
3302
3303	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3304		table_id = res.table ? res.table->tb_id : 0;
3305
3306	/* reset skb for netlink reply msg */
3307	skb_trim(skb, 0);
3308	skb_reset_network_header(skb);
3309	skb_reset_transport_header(skb);
3310	skb_reset_mac_header(skb);
3311
3312	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3313		struct fib_rt_info fri;
3314
3315		if (!res.fi) {
3316			err = fib_props[res.type].error;
3317			if (!err)
3318				err = -EHOSTUNREACH;
3319			goto errout_rcu;
3320		}
3321		fri.fi = res.fi;
3322		fri.tb_id = table_id;
3323		fri.dst = res.prefix;
3324		fri.dst_len = res.prefixlen;
3325		fri.tos = fl4.flowi4_tos;
3326		fri.type = rt->rt_type;
3327		fri.offload = 0;
3328		fri.trap = 0;
3329		if (res.fa_head) {
3330			struct fib_alias *fa;
3331
3332			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3333				u8 slen = 32 - fri.dst_len;
3334
3335				if (fa->fa_slen == slen &&
3336				    fa->tb_id == fri.tb_id &&
3337				    fa->fa_tos == fri.tos &&
3338				    fa->fa_info == res.fi &&
3339				    fa->fa_type == fri.type) {
3340					fri.offload = fa->offload;
3341					fri.trap = fa->trap;
3342					break;
3343				}
3344			}
3345		}
3346		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3347				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3348	} else {
3349		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3350				   NETLINK_CB(in_skb).portid,
3351				   nlh->nlmsg_seq, 0);
3352	}
3353	if (err < 0)
3354		goto errout_rcu;
3355
3356	rcu_read_unlock();
3357
3358	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3359
3360errout_free:
3361	return err;
3362errout_rcu:
3363	rcu_read_unlock();
3364	kfree_skb(skb);
3365	goto errout_free;
3366}
3367
3368void ip_rt_multicast_event(struct in_device *in_dev)
3369{
3370	rt_cache_flush(dev_net(in_dev->dev));
3371}
3372
3373#ifdef CONFIG_SYSCTL
3374static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3375static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3376static int ip_rt_gc_elasticity __read_mostly	= 8;
3377static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3378
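/* Handler for the write-only "flush" sysctl: any write flushes the
 * routing cache and bumps the fnhe genid, invalidating cached exceptions.
 */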
3379static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3380		void *buffer, size_t *lenp, loff_t *ppos)
3381{
3382	struct net *net = (struct net *)__ctl->extra1;
3383
3384	if (write) {
3385		rt_cache_flush(net);
3386		fnhe_genid_bump(net);
3387		return 0;
3388	}
3389
3390	return -EINVAL;
3391}
3392
3393static struct ctl_table ipv4_route_table[] = {
3394	{
3395		.procname	= "gc_thresh",
3396		.data		= &ipv4_dst_ops.gc_thresh,
3397		.maxlen		= sizeof(int),
3398		.mode		= 0644,
3399		.proc_handler	= proc_dointvec,
3400	},
3401	{
3402		.procname	= "max_size",
3403		.data		= &ip_rt_max_size,
3404		.maxlen		= sizeof(int),
3405		.mode		= 0644,
3406		.proc_handler	= proc_dointvec,
3407	},
3408	{
3409		/*  Deprecated. Use gc_min_interval_ms */
3410
3411		.procname	= "gc_min_interval",
3412		.data		= &ip_rt_gc_min_interval,
3413		.maxlen		= sizeof(int),
3414		.mode		= 0644,
3415		.proc_handler	= proc_dointvec_jiffies,
3416	},
3417	{
3418		.procname	= "gc_min_interval_ms",
3419		.data		= &ip_rt_gc_min_interval,
3420		.maxlen		= sizeof(int),
3421		.mode		= 0644,
3422		.proc_handler	= proc_dointvec_ms_jiffies,
3423	},
3424	{
3425		.procname	= "gc_timeout",
3426		.data		= &ip_rt_gc_timeout,
3427		.maxlen		= sizeof(int),
3428		.mode		= 0644,
3429		.proc_handler	= proc_dointvec_jiffies,
3430	},
3431	{
3432		.procname	= "gc_interval",
3433		.data		= &ip_rt_gc_interval,
3434		.maxlen		= sizeof(int),
3435		.mode		= 0644,
3436		.proc_handler	= proc_dointvec_jiffies,
3437	},
3438	{
3439		.procname	= "redirect_load",
3440		.data		= &ip_rt_redirect_load,
3441		.maxlen		= sizeof(int),
3442		.mode		= 0644,
3443		.proc_handler	= proc_dointvec,
3444	},
3445	{
3446		.procname	= "redirect_number",
3447		.data		= &ip_rt_redirect_number,
3448		.maxlen		= sizeof(int),
3449		.mode		= 0644,
3450		.proc_handler	= proc_dointvec,
3451	},
3452	{
3453		.procname	= "redirect_silence",
3454		.data		= &ip_rt_redirect_silence,
3455		.maxlen		= sizeof(int),
3456		.mode		= 0644,
3457		.proc_handler	= proc_dointvec,
3458	},
3459	{
3460		.procname	= "error_cost",
3461		.data		= &ip_rt_error_cost,
3462		.maxlen		= sizeof(int),
3463		.mode		= 0644,
3464		.proc_handler	= proc_dointvec,
3465	},
3466	{
3467		.procname	= "error_burst",
3468		.data		= &ip_rt_error_burst,
3469		.maxlen		= sizeof(int),
3470		.mode		= 0644,
3471		.proc_handler	= proc_dointvec,
3472	},
3473	{
3474		.procname	= "gc_elasticity",
3475		.data		= &ip_rt_gc_elasticity,
3476		.maxlen		= sizeof(int),
3477		.mode		= 0644,
3478		.proc_handler	= proc_dointvec,
3479	},
3480	{
3481		.procname	= "mtu_expires",
3482		.data		= &ip_rt_mtu_expires,
3483		.maxlen		= sizeof(int),
3484		.mode		= 0644,
3485		.proc_handler	= proc_dointvec_jiffies,
3486	},
3487	{
3488		.procname	= "min_pmtu",
3489		.data		= &ip_rt_min_pmtu,
3490		.maxlen		= sizeof(int),
3491		.mode		= 0644,
3492		.proc_handler	= proc_dointvec_minmax,
3493		.extra1		= &ip_min_valid_pmtu,
3494	},
3495	{
3496		.procname	= "min_adv_mss",
3497		.data		= &ip_rt_min_advmss,
3498		.maxlen		= sizeof(int),
3499		.mode		= 0644,
3500		.proc_handler	= proc_dointvec,
3501	},
3502	{ }
3503};
3504
3505static const char ipv4_route_flush_procname[] = "flush";
3506
3507static struct ctl_table ipv4_route_flush_table[] = {
3508	{
3509		.procname	= ipv4_route_flush_procname,
3510		.maxlen		= sizeof(int),
3511		.mode		= 0200,
3512		.proc_handler	= ipv4_sysctl_rtcache_flush,
3513	},
3514	{ },
3515};
3516
3517static __net_init int sysctl_route_net_init(struct net *net)
3518{
3519	struct ctl_table *tbl;
3520
3521	tbl = ipv4_route_flush_table;
3522	if (!net_eq(net, &init_net)) {
3523		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3524		if (!tbl)
3525			goto err_dup;
3526
3527		/* Don't export non-whitelisted sysctls to unprivileged users */
3528		if (net->user_ns != &init_user_ns) {
3529			if (tbl[0].procname != ipv4_route_flush_procname)
3530				tbl[0].procname = NULL;
3531		}
3532	}
3533	tbl[0].extra1 = net;
3534
3535	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3536	if (!net->ipv4.route_hdr)
3537		goto err_reg;
3538	return 0;
3539
3540err_reg:
3541	if (tbl != ipv4_route_flush_table)
3542		kfree(tbl);
3543err_dup:
3544	return -ENOMEM;
3545}
3546
3547static __net_exit void sysctl_route_net_exit(struct net *net)
3548{
3549	struct ctl_table *tbl;
3550
3551	tbl = net->ipv4.route_hdr->ctl_table_arg;
3552	unregister_net_sysctl_table(net->ipv4.route_hdr);
3553	BUG_ON(tbl == ipv4_route_flush_table);
3554	kfree(tbl);
3555}
3556
3557static __net_initdata struct pernet_operations sysctl_route_ops = {
3558	.init = sysctl_route_net_init,
3559	.exit = sysctl_route_net_exit,
3560};
3561#endif
3562
3563static __net_init int rt_genid_init(struct net *net)
3564{
3565	atomic_set(&net->ipv4.rt_genid, 0);
3566	atomic_set(&net->fnhe_genid, 0);
3567	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3568	return 0;
3569}
3570
3571static __net_initdata struct pernet_operations rt_genid_ops = {
3572	.init = rt_genid_init,
3573};
3574
3575static int __net_init ipv4_inetpeer_init(struct net *net)
3576{
3577	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3578
3579	if (!bp)
3580		return -ENOMEM;
3581	inet_peer_base_init(bp);
3582	net->ipv4.peers = bp;
3583	return 0;
3584}
3585
3586static void __net_exit ipv4_inetpeer_exit(struct net *net)
3587{
3588	struct inet_peer_base *bp = net->ipv4.peers;
3589
3590	net->ipv4.peers = NULL;
3591	inetpeer_invalidate_tree(bp);
3592	kfree(bp);
3593}
3594
3595static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3596	.init	=	ipv4_inetpeer_init,
3597	.exit	=	ipv4_inetpeer_exit,
3598};
3599
3600#ifdef CONFIG_IP_ROUTE_CLASSID
3601struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3602#endif /* CONFIG_IP_ROUTE_CLASSID */
3603
3604int __init ip_rt_init(void)
3605{
3606	void *idents_hash;
3607	int cpu;
3608
3609	/* For modern hosts, this will use 2 MB of memory */
3610	idents_hash = alloc_large_system_hash("IP idents",
3611					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3612					      0,
3613					      16, /* one bucket per 64 KB */
3614					      HASH_ZERO,
3615					      NULL,
3616					      &ip_idents_mask,
3617					      2048,
3618					      256*1024);
3619
3620	ip_idents = idents_hash;
3621
3622	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3623
3624	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3625
3626	for_each_possible_cpu(cpu) {
3627		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3628
3629		INIT_LIST_HEAD(&ul->head);
3630		spin_lock_init(&ul->lock);
3631	}
3632#ifdef CONFIG_IP_ROUTE_CLASSID
3633	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3634	if (!ip_rt_acct)
3635		panic("IP: failed to allocate ip_rt_acct\n");
3636#endif
3637
3638	ipv4_dst_ops.kmem_cachep =
3639		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3640				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3641
3642	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3643
3644	if (dst_entries_init(&ipv4_dst_ops) < 0)
3645		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3646
3647	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3648		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3649
3650	ipv4_dst_ops.gc_thresh = ~0;
3651	ip_rt_max_size = INT_MAX;
3652
3653	devinet_init();
3654	ip_fib_init();
3655
3656	if (ip_rt_proc_init())
3657		pr_err("Unable to create route proc files\n");
3658#ifdef CONFIG_XFRM
3659	xfrm_init();
3660	xfrm4_init();
3661#endif
3662	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3663		      RTNL_FLAG_DOIT_UNLOCKED);
3664
3665#ifdef CONFIG_SYSCTL
3666	register_pernet_subsys(&sysctl_route_ops);
3667#endif
3668	register_pernet_subsys(&rt_genid_ops);
3669	register_pernet_subsys(&ipv4_inetpeer_ops);
3670	return 0;
3671}
3672
3673#ifdef CONFIG_SYSCTL
3674/*
3675 * We really need to sanitize the damn ipv4 init order, then all
3676 * this nonsense will go away.
3677 */
3678void __init ip_static_sysctl_init(void)
3679{
3680	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3681}
3682#endif
3683