xref: /kernel/linux/linux-6.6/net/ipv6/ip6_output.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 *	IPv6 output functions
4 *	Linux INET6 implementation
5 *
6 *	Authors:
7 *	Pedro Roque		<roque@di.fc.ul.pt>
8 *
9 *	Based on linux/net/ipv4/ip_output.c
10 *
11 *	Changes:
12 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13 *				extension headers are implemented.
14 *				route changes now work.
15 *				ip6_forward does not confuse sniffers.
16 *				etc.
17 *
18 *      H. von Brand    :       Added missing #include <linux/string.h>
19 *	Imran Patel	:	frag id should be in NBO
20 *      Kazunori MIYAZAWA @USAGI
21 *			:       add ip6_append_data and related functions
22 *				for datagram xmit
23 */
24
25#include <linux/errno.h>
26#include <linux/kernel.h>
27#include <linux/string.h>
28#include <linux/socket.h>
29#include <linux/net.h>
30#include <linux/netdevice.h>
31#include <linux/if_arp.h>
32#include <linux/in6.h>
33#include <linux/tcp.h>
34#include <linux/route.h>
35#include <linux/module.h>
36#include <linux/slab.h>
37
38#include <linux/bpf-cgroup.h>
39#include <linux/netfilter.h>
40#include <linux/netfilter_ipv6.h>
41
42#include <net/sock.h>
43#include <net/snmp.h>
44
45#include <net/gso.h>
46#include <net/ipv6.h>
47#include <net/ndisc.h>
48#include <net/protocol.h>
49#include <net/ip6_route.h>
50#include <net/addrconf.h>
51#include <net/rawv6.h>
52#include <net/icmp.h>
53#include <net/xfrm.h>
54#include <net/checksum.h>
55#include <linux/mroute6.h>
56#include <net/l3mdev.h>
57#include <net/lwtunnel.h>
58#include <net/ip_tunnels.h>
59
60static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61{
62	struct dst_entry *dst = skb_dst(skb);
63	struct net_device *dev = dst->dev;
64	struct inet6_dev *idev = ip6_dst_idev(dst);
65	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66	const struct in6_addr *daddr, *nexthop;
67	struct ipv6hdr *hdr;
68	struct neighbour *neigh;
69	int ret;
70
71	/* Be paranoid, rather than too clever. */
72	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73		skb = skb_expand_head(skb, hh_len);
74		if (!skb) {
75			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76			return -ENOMEM;
77		}
78	}
79
80	hdr = ipv6_hdr(skb);
81	daddr = &hdr->daddr;
82	if (ipv6_addr_is_multicast(daddr)) {
83		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84		    ((mroute6_is_socket(net, skb) &&
85		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88
89			/* Do not check for IFF_ALLMULTI; multicast routing
90			   is not supported in any case.
91			 */
92			if (newskb)
93				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94					net, sk, newskb, NULL, newskb->dev,
95					dev_loopback_xmit);
96
97			if (hdr->hop_limit == 0) {
98				IP6_INC_STATS(net, idev,
99					      IPSTATS_MIB_OUTDISCARDS);
100				kfree_skb(skb);
101				return 0;
102			}
103		}
104
105		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107		    !(dev->flags & IFF_LOOPBACK)) {
108			kfree_skb(skb);
109			return 0;
110		}
111	}
112
113	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114		int res = lwtunnel_xmit(skb);
115
116		if (res != LWTUNNEL_XMIT_CONTINUE)
117			return res;
118	}
119
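	/* Resolve the next hop (the route's gateway, or the destination
	 * itself for an on-link route) and look up - or create - the
	 * matching neighbour entry; neigh_output() may queue the packet
	 * until neighbour discovery completes.
	 */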
120	rcu_read_lock();
121	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123
124	if (unlikely(IS_ERR_OR_NULL(neigh))) {
125		if (unlikely(!neigh))
126			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127		if (IS_ERR(neigh)) {
128			rcu_read_unlock();
129			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131			return -EINVAL;
132		}
133	}
134	sock_confirm_neigh(skb, neigh);
135	ret = neigh_output(neigh, skb, false);
136	rcu_read_unlock();
137	return ret;
138}
139
140static int
141ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142				    struct sk_buff *skb, unsigned int mtu)
143{
144	struct sk_buff *segs, *nskb;
145	netdev_features_t features;
146	int ret = 0;
147
148	/* Please see corresponding comment in ip_finish_output_gso
149	 * describing the cases where GSO segment length exceeds the
150	 * egress MTU.
151	 */
152	features = netif_skb_features(skb);
153	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154	if (IS_ERR_OR_NULL(segs)) {
155		kfree_skb(skb);
156		return -ENOMEM;
157	}
158
159	consume_skb(skb);
160
161	skb_list_walk_safe(segs, segs, nskb) {
162		int err;
163
164		skb_mark_not_on_list(segs);
165		/* Last GSO segment can be smaller than gso_size (and MTU).
166		 * Adding a fragment header would produce an "atomic fragment",
167		 * which is considered harmful (RFC-8021). Avoid that.
168		 */
169		err = segs->len > mtu ?
170			ip6_fragment(net, sk, segs, ip6_finish_output2) :
171			ip6_finish_output2(net, sk, segs);
172		if (err && ret == 0)
173			ret = err;
174	}
175
176	return ret;
177}
178
179static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
180{
181	unsigned int mtu;
182
183#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
184	/* Policy lookup after SNAT yielded a new policy */
185	if (skb_dst(skb)->xfrm) {
186		IP6CB(skb)->flags |= IP6SKB_REROUTED;
187		return dst_output(net, sk, skb);
188	}
189#endif
190
191	mtu = ip6_skb_dst_mtu(skb);
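	/* A GSO packet whose segments would still exceed the route MTU
	 * (and that is not a fake jumbogram) is segmented in software and
	 * each resulting segment is fragmented or sent on its own.
	 */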
192	if (skb_is_gso(skb) &&
193	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
194	    !skb_gso_validate_network_len(skb, mtu))
195		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
196
197	if ((skb->len > mtu && !skb_is_gso(skb)) ||
198	    dst_allfrag(skb_dst(skb)) ||
199	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
200		return ip6_fragment(net, sk, skb, ip6_finish_output2);
201	else
202		return ip6_finish_output2(net, sk, skb);
203}
204
205static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
206{
207	int ret;
208
209	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
210	switch (ret) {
211	case NET_XMIT_SUCCESS:
212	case NET_XMIT_CN:
213		return __ip6_finish_output(net, sk, skb) ? : ret;
214	default:
215		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
216		return ret;
217	}
218}
219
220int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
221{
222	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
223	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
224
225	skb->protocol = htons(ETH_P_IPV6);
226	skb->dev = dev;
227
228	if (unlikely(idev->cnf.disable_ipv6)) {
229		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
230		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
231		return 0;
232	}
233
234	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
235			    net, sk, skb, indev, dev,
236			    ip6_finish_output,
237			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
238}
239EXPORT_SYMBOL(ip6_output);
240
241bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
242{
243	if (!np->autoflowlabel_set)
244		return ip6_default_np_autolabel(net);
245	else
246		return np->autoflowlabel;
247}
248
249/*
250 * xmit an sk_buff (used by TCP, SCTP and DCCP)
251 * Note: the socket lock is not held for SYNACK packets, but the skb might
252 * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
253 * which use proper atomic operations or spinlocks.
254 */
255int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
256	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
257{
258	struct net *net = sock_net(sk);
259	const struct ipv6_pinfo *np = inet6_sk(sk);
260	struct in6_addr *first_hop = &fl6->daddr;
261	struct dst_entry *dst = skb_dst(skb);
262	struct net_device *dev = dst->dev;
263	struct inet6_dev *idev = ip6_dst_idev(dst);
264	struct hop_jumbo_hdr *hop_jumbo;
265	int hoplen = sizeof(*hop_jumbo);
266	unsigned int head_room;
267	struct ipv6hdr *hdr;
268	u8  proto = fl6->flowi6_proto;
269	int seg_len = skb->len;
270	int hlimit = -1;
271	u32 mtu;
272
273	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
274	if (opt)
275		head_room += opt->opt_nflen + opt->opt_flen;
276
277	if (unlikely(head_room > skb_headroom(skb))) {
278		skb = skb_expand_head(skb, head_room);
279		if (!skb) {
280			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
281			return -ENOBUFS;
282		}
283	}
284
285	if (opt) {
286		seg_len += opt->opt_nflen + opt->opt_flen;
287
288		if (opt->opt_flen)
289			ipv6_push_frag_opts(skb, opt, &proto);
290
291		if (opt->opt_nflen)
292			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
293					     &fl6->saddr);
294	}
295
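	/* A payload larger than 64KiB cannot be expressed in the 16-bit
	 * payload_len field: insert a hop-by-hop jumbogram option carrying
	 * the real length and leave payload_len at zero.
	 */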
296	if (unlikely(seg_len > IPV6_MAXPLEN)) {
297		hop_jumbo = skb_push(skb, hoplen);
298
299		hop_jumbo->nexthdr = proto;
300		hop_jumbo->hdrlen = 0;
301		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
302		hop_jumbo->tlv_len = 4;
303		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
304
305		proto = IPPROTO_HOPOPTS;
306		seg_len = 0;
307		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
308	}
309
310	skb_push(skb, sizeof(struct ipv6hdr));
311	skb_reset_network_header(skb);
312	hdr = ipv6_hdr(skb);
313
314	/*
315	 *	Fill in the IPv6 header
316	 */
317	if (np)
318		hlimit = np->hop_limit;
319	if (hlimit < 0)
320		hlimit = ip6_dst_hoplimit(dst);
321
322	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
323				ip6_autoflowlabel(net, np), fl6));
324
325	hdr->payload_len = htons(seg_len);
326	hdr->nexthdr = proto;
327	hdr->hop_limit = hlimit;
328
329	hdr->saddr = fl6->saddr;
330	hdr->daddr = *first_hop;
331
332	skb->protocol = htons(ETH_P_IPV6);
333	skb->priority = priority;
334	skb->mark = mark;
335
336	mtu = dst_mtu(dst);
337	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
338		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
339
340		/* if the egress device is enslaved to an L3 master device, pass
341		 * the skb to its handler for processing
342		 */
343		skb = l3mdev_ip6_out((struct sock *)sk, skb);
344		if (unlikely(!skb))
345			return 0;
346
347		/* hooks should never assume socket lock is held.
348		 * we promote our socket to non const
349		 */
350		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
351			       net, (struct sock *)sk, skb, NULL, dev,
352			       dst_output);
353	}
354
355	skb->dev = dev;
356	/* ipv6_local_error() does not require socket lock,
357	 * we promote our socket to non const
358	 */
359	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
360
361	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
362	kfree_skb(skb);
363	return -EMSGSIZE;
364}
365EXPORT_SYMBOL(ip6_xmit);
366
367static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
368{
369	struct ip6_ra_chain *ra;
370	struct sock *last = NULL;
371
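	/* Deliver the packet to every matching Router Alert listener:
	 * each earlier match gets a clone, the last match consumes the
	 * original skb.
	 */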
372	read_lock(&ip6_ra_lock);
373	for (ra = ip6_ra_chain; ra; ra = ra->next) {
374		struct sock *sk = ra->sk;
375		if (sk && ra->sel == sel &&
376		    (!sk->sk_bound_dev_if ||
377		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
378			struct ipv6_pinfo *np = inet6_sk(sk);
379
380			if (np && np->rtalert_isolate &&
381			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
382				continue;
383			}
384			if (last) {
385				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
386				if (skb2)
387					rawv6_rcv(last, skb2);
388			}
389			last = sk;
390		}
391	}
392
393	if (last) {
394		rawv6_rcv(last, skb);
395		read_unlock(&ip6_ra_lock);
396		return 1;
397	}
398	read_unlock(&ip6_ra_lock);
399	return 0;
400}
401
402static int ip6_forward_proxy_check(struct sk_buff *skb)
403{
404	struct ipv6hdr *hdr = ipv6_hdr(skb);
405	u8 nexthdr = hdr->nexthdr;
406	__be16 frag_off;
407	int offset;
408
409	if (ipv6_ext_hdr(nexthdr)) {
410		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
411		if (offset < 0)
412			return 0;
413	} else
414		offset = sizeof(struct ipv6hdr);
415
416	if (nexthdr == IPPROTO_ICMPV6) {
417		struct icmp6hdr *icmp6;
418
419		if (!pskb_may_pull(skb, (skb_network_header(skb) +
420					 offset + 1 - skb->data)))
421			return 0;
422
423		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
424
425		switch (icmp6->icmp6_type) {
426		case NDISC_ROUTER_SOLICITATION:
427		case NDISC_ROUTER_ADVERTISEMENT:
428		case NDISC_NEIGHBOUR_SOLICITATION:
429		case NDISC_NEIGHBOUR_ADVERTISEMENT:
430		case NDISC_REDIRECT:
431			/* A unicast neighbour discovery message destined to
432			 * the proxied address is passed on to the input
433			 * function.
434			 */
435			return 1;
436		default:
437			break;
438		}
439	}
440
441	/*
442	 * The proxying router can't forward traffic sent to a link-local
443	 * address, so signal the sender and discard the packet. This
444	 * behavior is clarified by the MIPv6 specification.
445	 */
446	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
447		dst_link_failure(skb);
448		return -1;
449	}
450
451	return 0;
452}
453
454static inline int ip6_forward_finish(struct net *net, struct sock *sk,
455				     struct sk_buff *skb)
456{
457	struct dst_entry *dst = skb_dst(skb);
458
459	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
460
461#ifdef CONFIG_NET_SWITCHDEV
462	if (skb->offload_l3_fwd_mark) {
463		consume_skb(skb);
464		return 0;
465	}
466#endif
467
468	skb_clear_tstamp(skb);
469	return dst_output(net, sk, skb);
470}
471
472static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
473{
474	if (skb->len <= mtu)
475		return false;
476
477	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
478	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
479		return true;
480
481	if (skb->ignore_df)
482		return false;
483
484	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
485		return false;
486
487	return true;
488}
489
490int ip6_forward(struct sk_buff *skb)
491{
492	struct dst_entry *dst = skb_dst(skb);
493	struct ipv6hdr *hdr = ipv6_hdr(skb);
494	struct inet6_skb_parm *opt = IP6CB(skb);
495	struct net *net = dev_net(dst->dev);
496	struct inet6_dev *idev;
497	SKB_DR(reason);
498	u32 mtu;
499
500	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
501	if (net->ipv6.devconf_all->forwarding == 0)
502		goto error;
503
504	if (skb->pkt_type != PACKET_HOST)
505		goto drop;
506
507	if (unlikely(skb->sk))
508		goto drop;
509
510	if (skb_warn_if_lro(skb))
511		goto drop;
512
513	if (!net->ipv6.devconf_all->disable_policy &&
514	    (!idev || !idev->cnf.disable_policy) &&
515	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
516		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
517		goto drop;
518	}
519
520	skb_forward_csum(skb);
521
522	/*
523	 *	We DO NOT do any processing on
524	 *	RA packets; we push them to user level AS IS
525	 *	without any WARRANTY that the application will be able
526	 *	to interpret them. The reason is that we
527	 *	cannot do anything clever here.
528	 *
529	 *	We are not the end node, so if the packet contains
530	 *	AH/ESP, we cannot do anything with it.
531	 *	Defragmentation would also be a mistake; RA packets
532	 *	cannot be fragmented, because there is no warranty
533	 *	that different fragments will go along one path. --ANK
534	 */
535	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
536		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
537			return 0;
538	}
539
540	/*
541	 *	check and decrement ttl
542	 */
543	if (hdr->hop_limit <= 1) {
544		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
545		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
546
547		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
548		return -ETIMEDOUT;
549	}
550
551	/* XXX: idev->cnf.proxy_ndp? */
552	if (net->ipv6.devconf_all->proxy_ndp &&
553	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
554		int proxied = ip6_forward_proxy_check(skb);
555		if (proxied > 0) {
556			/* It's tempting to decrease the hop limit
557			 * here by 1, as we do at the end of the
558			 * function too.
559			 *
560			 * But that would be incorrect, as proxying is
561			 * not forwarding.  The ip6_input function
562			 * will handle this packet locally, and it
563			 * depends on the hop limit being unchanged.
564			 *
565			 * One example is the NDP hop limit, which
566			 * always has to stay 255; similar checks exist
567			 * around RA packets, where the user can even
568			 * change the desired limit.
569			 */
570			return ip6_input(skb);
571		} else if (proxied < 0) {
572			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
573			goto drop;
574		}
575	}
576
577	if (!xfrm6_route_forward(skb)) {
578		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
579		SKB_DR_SET(reason, XFRM_POLICY);
580		goto drop;
581	}
582	dst = skb_dst(skb);
583
584	/* IPv6 specs say nothing about it, but it is clear that we cannot
585	   send redirects to source routed frames.
586	   We don't send redirects to frames decapsulated from IPsec.
587	 */
588	if (IP6CB(skb)->iif == dst->dev->ifindex &&
589	    opt->srcrt == 0 && !skb_sec_path(skb)) {
590		struct in6_addr *target = NULL;
591		struct inet_peer *peer;
592		struct rt6_info *rt;
593
594		/*
595		 *	incoming and outgoing devices are the same,
596		 *	so send a redirect.
597		 */
598
599		rt = (struct rt6_info *) dst;
600		if (rt->rt6i_flags & RTF_GATEWAY)
601			target = &rt->rt6i_gateway;
602		else
603			target = &hdr->daddr;
604
605		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
606
607		/* Limit redirects both by destination (here)
608		   and by source (inside ndisc_send_redirect)
609		 */
610		if (inet_peer_xrlim_allow(peer, 1*HZ))
611			ndisc_send_redirect(skb, target);
612		if (peer)
613			inet_putpeer(peer);
614	} else {
615		int addrtype = ipv6_addr_type(&hdr->saddr);
616
617		/* This check is security critical. */
618		if (addrtype == IPV6_ADDR_ANY ||
619		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
620			goto error;
621		if (addrtype & IPV6_ADDR_LINKLOCAL) {
622			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
623				    ICMPV6_NOT_NEIGHBOUR, 0);
624			goto error;
625		}
626	}
627
628	mtu = ip6_dst_mtu_maybe_forward(dst, true);
629	if (mtu < IPV6_MIN_MTU)
630		mtu = IPV6_MIN_MTU;
631
632	if (ip6_pkt_too_big(skb, mtu)) {
633		/* Again, force OUTPUT device used as source address */
634		skb->dev = dst->dev;
635		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
636		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
637		__IP6_INC_STATS(net, ip6_dst_idev(dst),
638				IPSTATS_MIB_FRAGFAILS);
639		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
640		return -EMSGSIZE;
641	}
642
643	if (skb_cow(skb, dst->dev->hard_header_len)) {
644		__IP6_INC_STATS(net, ip6_dst_idev(dst),
645				IPSTATS_MIB_OUTDISCARDS);
646		goto drop;
647	}
648
649	hdr = ipv6_hdr(skb);
650
651	/* Decrementing the hop limit is delayed until after the skb COW */
652
653	hdr->hop_limit--;
654
655	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
656		       net, NULL, skb, skb->dev, dst->dev,
657		       ip6_forward_finish);
658
659error:
660	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
661	SKB_DR_SET(reason, IP_INADDRERRORS);
662drop:
663	kfree_skb_reason(skb, reason);
664	return -EINVAL;
665}
666
667static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
668{
669	to->pkt_type = from->pkt_type;
670	to->priority = from->priority;
671	to->protocol = from->protocol;
672	skb_dst_drop(to);
673	skb_dst_set(to, dst_clone(skb_dst(from)));
674	to->dev = from->dev;
675	to->mark = from->mark;
676
677	skb_copy_hash(to, from);
678
679#ifdef CONFIG_NET_SCHED
680	to->tc_index = from->tc_index;
681#endif
682	nf_copy(to, from);
683	skb_ext_copy(to, from);
684	skb_copy_secmark(to, from);
685}
686
687int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
688		      u8 nexthdr, __be32 frag_id,
689		      struct ip6_fraglist_iter *iter)
690{
691	unsigned int first_len;
692	struct frag_hdr *fh;
693
694	/* BUILD HEADER */
695	*prevhdr = NEXTHDR_FRAGMENT;
696	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
697	if (!iter->tmp_hdr)
698		return -ENOMEM;
699
700	iter->frag = skb_shinfo(skb)->frag_list;
701	skb_frag_list_init(skb);
702
703	iter->offset = 0;
704	iter->hlen = hlen;
705	iter->frag_id = frag_id;
706	iter->nexthdr = nexthdr;
707
708	__skb_pull(skb, hlen);
709	fh = __skb_push(skb, sizeof(struct frag_hdr));
710	__skb_push(skb, hlen);
711	skb_reset_network_header(skb);
712	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
713
714	fh->nexthdr = nexthdr;
715	fh->reserved = 0;
716	fh->frag_off = htons(IP6_MF);
717	fh->identification = frag_id;
718
719	first_len = skb_pagelen(skb);
720	skb->data_len = first_len - skb_headlen(skb);
721	skb->len = first_len;
722	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
723
724	return 0;
725}
726EXPORT_SYMBOL(ip6_fraglist_init);
727
728void ip6_fraglist_prepare(struct sk_buff *skb,
729			  struct ip6_fraglist_iter *iter)
730{
731	struct sk_buff *frag = iter->frag;
732	unsigned int hlen = iter->hlen;
733	struct frag_hdr *fh;
734
735	frag->ip_summed = CHECKSUM_NONE;
736	skb_reset_transport_header(frag);
737	fh = __skb_push(frag, sizeof(struct frag_hdr));
738	__skb_push(frag, hlen);
739	skb_reset_network_header(frag);
740	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
741	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
742	fh->nexthdr = iter->nexthdr;
743	fh->reserved = 0;
744	fh->frag_off = htons(iter->offset);
745	if (frag->next)
746		fh->frag_off |= htons(IP6_MF);
747	fh->identification = iter->frag_id;
748	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
749	ip6_copy_metadata(frag, skb);
750}
751EXPORT_SYMBOL(ip6_fraglist_prepare);
752
753void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
754		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
755		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
756{
757	state->prevhdr = prevhdr;
758	state->nexthdr = nexthdr;
759	state->frag_id = frag_id;
760
761	state->hlen = hlen;
762	state->mtu = mtu;
763
764	state->left = skb->len - hlen;	/* Space per frame */
765	state->ptr = hlen;		/* Where to start from */
766
767	state->hroom = hdr_room;
768	state->troom = needed_tailroom;
769
770	state->offset = 0;
771}
772EXPORT_SYMBOL(ip6_frag_init);
773
774struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
775{
776	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
777	struct sk_buff *frag;
778	struct frag_hdr *fh;
779	unsigned int len;
780
781	len = state->left;
782	/* IF: it doesn't fit, use 'mtu' - the data space left */
783	if (len > state->mtu)
784		len = state->mtu;
785	/* IF: we are not sending up to and including the packet end
786	   then align the next start on an eight byte boundary */
787	if (len < state->left)
788		len &= ~7;
789
790	/* Allocate buffer */
791	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
792			 state->hroom + state->troom, GFP_ATOMIC);
793	if (!frag)
794		return ERR_PTR(-ENOMEM);
795
796	/*
797	 *	Set up data on packet
798	 */
799
800	ip6_copy_metadata(frag, skb);
801	skb_reserve(frag, state->hroom);
802	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
803	skb_reset_network_header(frag);
804	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
805	frag->transport_header = (frag->network_header + state->hlen +
806				  sizeof(struct frag_hdr));
807
808	/*
809	 *	Charge the memory for the fragment to any owner
810	 *	it might possess
811	 */
812	if (skb->sk)
813		skb_set_owner_w(frag, skb->sk);
814
815	/*
816	 *	Copy the packet header into the new buffer.
817	 */
818	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
819
820	fragnexthdr_offset = skb_network_header(frag);
821	fragnexthdr_offset += prevhdr - skb_network_header(skb);
822	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
823
824	/*
825	 *	Build fragment header.
826	 */
827	fh->nexthdr = state->nexthdr;
828	fh->reserved = 0;
829	fh->identification = state->frag_id;
830
831	/*
832	 *	Copy a block of the IP datagram.
833	 */
834	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
835			     len));
836	state->left -= len;
837
838	fh->frag_off = htons(state->offset);
839	if (state->left > 0)
840		fh->frag_off |= htons(IP6_MF);
841	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
842
843	state->ptr += len;
844	state->offset += len;
845
846	return frag;
847}
848EXPORT_SYMBOL(ip6_frag_next);
849
850int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
851		 int (*output)(struct net *, struct sock *, struct sk_buff *))
852{
853	struct sk_buff *frag;
854	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
855	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
856				inet6_sk(skb->sk) : NULL;
857	bool mono_delivery_time = skb->mono_delivery_time;
858	struct ip6_frag_state state;
859	unsigned int mtu, hlen, nexthdr_offset;
860	ktime_t tstamp = skb->tstamp;
861	int hroom, err = 0;
862	__be32 frag_id;
863	u8 *prevhdr, nexthdr = 0;
864
865	err = ip6_find_1stfragopt(skb, &prevhdr);
866	if (err < 0)
867		goto fail;
868	hlen = err;
869	nexthdr = *prevhdr;
870	nexthdr_offset = prevhdr - skb_network_header(skb);
871
872	mtu = ip6_skb_dst_mtu(skb);
873
874	/* We must not fragment if the socket is set to force MTU discovery
875	 * or if the skb is not generated by a local socket.
876	 */
877	if (unlikely(!skb->ignore_df && skb->len > mtu))
878		goto fail_toobig;
879
880	if (IP6CB(skb)->frag_max_size) {
881		if (IP6CB(skb)->frag_max_size > mtu)
882			goto fail_toobig;
883
884		/* don't send fragments larger than what we received */
885		mtu = IP6CB(skb)->frag_max_size;
886		if (mtu < IPV6_MIN_MTU)
887			mtu = IPV6_MIN_MTU;
888	}
889
890	if (np && np->frag_size < mtu) {
891		if (np->frag_size)
892			mtu = np->frag_size;
893	}
894	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
895		goto fail_toobig;
896	mtu -= hlen + sizeof(struct frag_hdr);
897
898	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
899				    &ipv6_hdr(skb)->saddr);
900
901	if (skb->ip_summed == CHECKSUM_PARTIAL &&
902	    (err = skb_checksum_help(skb)))
903		goto fail;
904
905	prevhdr = skb_network_header(skb) + nexthdr_offset;
906	hroom = LL_RESERVED_SPACE(rt->dst.dev);
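	/* Fast path: if the skb already carries a frag_list whose geometry
	 * matches the MTU, reuse those buffers directly as the fragments
	 * instead of copying the data (the slow path below).
	 */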
907	if (skb_has_frag_list(skb)) {
908		unsigned int first_len = skb_pagelen(skb);
909		struct ip6_fraglist_iter iter;
910		struct sk_buff *frag2;
911
912		if (first_len - hlen > mtu ||
913		    ((first_len - hlen) & 7) ||
914		    skb_cloned(skb) ||
915		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
916			goto slow_path;
917
918		skb_walk_frags(skb, frag) {
919			/* Correct geometry. */
920			if (frag->len > mtu ||
921			    ((frag->len & 7) && frag->next) ||
922			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
923				goto slow_path_clean;
924
925			/* Partially cloned skb? */
926			if (skb_shared(frag))
927				goto slow_path_clean;
928
929			BUG_ON(frag->sk);
930			if (skb->sk) {
931				frag->sk = skb->sk;
932				frag->destructor = sock_wfree;
933			}
934			skb->truesize -= frag->truesize;
935		}
936
937		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
938					&iter);
939		if (err < 0)
940			goto fail;
941
942		/* We prevent @rt from being freed. */
943		rcu_read_lock();
944
945		for (;;) {
946			/* Prepare the header of the next frame
947			 * before the previous one goes down. */
948			if (iter.frag)
949				ip6_fraglist_prepare(skb, &iter);
950
951			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
952			err = output(net, sk, skb);
953			if (!err)
954				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
955					      IPSTATS_MIB_FRAGCREATES);
956
957			if (err || !iter.frag)
958				break;
959
960			skb = ip6_fraglist_next(&iter);
961		}
962
963		kfree(iter.tmp_hdr);
964
965		if (err == 0) {
966			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
967				      IPSTATS_MIB_FRAGOKS);
968			rcu_read_unlock();
969			return 0;
970		}
971
972		kfree_skb_list(iter.frag);
973
974		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
975			      IPSTATS_MIB_FRAGFAILS);
976		rcu_read_unlock();
977		return err;
978
979slow_path_clean:
980		skb_walk_frags(skb, frag2) {
981			if (frag2 == frag)
982				break;
983			frag2->sk = NULL;
984			frag2->destructor = NULL;
985			skb->truesize += frag2->truesize;
986		}
987	}
988
989slow_path:
990	/*
991	 *	Fragment the datagram.
992	 */
993
994	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
995		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
996		      &state);
997
998	/*
999	 *	Keep copying data until we run out.
1000	 */
1001
1002	while (state.left > 0) {
1003		frag = ip6_frag_next(skb, &state);
1004		if (IS_ERR(frag)) {
1005			err = PTR_ERR(frag);
1006			goto fail;
1007		}
1008
1009		/*
1010		 *	Put this fragment into the sending queue.
1011		 */
1012		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1013		err = output(net, sk, frag);
1014		if (err)
1015			goto fail;
1016
1017		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1018			      IPSTATS_MIB_FRAGCREATES);
1019	}
1020	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1021		      IPSTATS_MIB_FRAGOKS);
1022	consume_skb(skb);
1023	return err;
1024
1025fail_toobig:
1026	if (skb->sk && dst_allfrag(skb_dst(skb)))
1027		sk_gso_disable(skb->sk);
1028
1029	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1030	err = -EMSGSIZE;
1031
1032fail:
1033	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1034		      IPSTATS_MIB_FRAGFAILS);
1035	kfree_skb(skb);
1036	return err;
1037}
1038
1039static inline int ip6_rt_check(const struct rt6key *rt_key,
1040			       const struct in6_addr *fl_addr,
1041			       const struct in6_addr *addr_cache)
1042{
1043	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1044		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1045}
1046
1047static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1048					  struct dst_entry *dst,
1049					  const struct flowi6 *fl6)
1050{
1051	struct ipv6_pinfo *np = inet6_sk(sk);
1052	struct rt6_info *rt;
1053
1054	if (!dst)
1055		goto out;
1056
1057	if (dst->ops->family != AF_INET6) {
1058		dst_release(dst);
1059		return NULL;
1060	}
1061
1062	rt = (struct rt6_info *)dst;
1063	/* Yes, checking route validity in the not-connected
1064	 * case is not very simple. Take into account
1065	 * that we do not support routing by source, TOS,
1066	 * and MSG_DONTROUTE		--ANK (980726)
1067	 *
1068	 * 1. ip6_rt_check(): If route was host route,
1069	 *    check that cached destination is current.
1070	 *    If it is network route, we still may
1071	 *    check its validity using saved pointer
1072	 *    to the last used address: daddr_cache.
1073	 *    We do not want to save the whole address now
1074	 *    (because the main consumer of this service
1075	 *    is TCP, which does not have this problem),
1076	 *    so the last trick works only on connected
1077	 *    sockets.
1078	 * 2. oif also should be the same.
1079	 */
1080	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1081#ifdef CONFIG_IPV6_SUBTREES
1082	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1083#endif
1084	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1085		dst_release(dst);
1086		dst = NULL;
1087	}
1088
1089out:
1090	return dst;
1091}
1092
1093static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1094			       struct dst_entry **dst, struct flowi6 *fl6)
1095{
1096#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1097	struct neighbour *n;
1098	struct rt6_info *rt;
1099#endif
1100	int err;
1101	int flags = 0;
1102
1103	/* The correct way to handle this would be to do
1104	 * ip6_route_get_saddr, and then ip6_route_output; however,
1105	 * the route-specific preferred source forces the
1106	 * ip6_route_output call _before_ ip6_route_get_saddr.
1107	 *
1108	 * In source specific routing (no src=any default route),
1109	 * ip6_route_output will fail given src=any saddr, though, so
1110	 * that's why we try it again later.
1111	 */
1112	if (ipv6_addr_any(&fl6->saddr)) {
1113		struct fib6_info *from;
1114		struct rt6_info *rt;
1115
1116		*dst = ip6_route_output(net, sk, fl6);
1117		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1118
1119		rcu_read_lock();
1120		from = rt ? rcu_dereference(rt->from) : NULL;
1121		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1122					  sk ? inet6_sk(sk)->srcprefs : 0,
1123					  &fl6->saddr);
1124		rcu_read_unlock();
1125
1126		if (err)
1127			goto out_err_release;
1128
1129		/* If we had an erroneous initial result, pretend it
1130		 * never existed and let the SA-enabled version take
1131		 * over.
1132		 */
1133		if ((*dst)->error) {
1134			dst_release(*dst);
1135			*dst = NULL;
1136		}
1137
1138		if (fl6->flowi6_oif)
1139			flags |= RT6_LOOKUP_F_IFACE;
1140	}
1141
1142	if (!*dst)
1143		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1144
1145	err = (*dst)->error;
1146	if (err)
1147		goto out_err_release;
1148
1149#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1150	/*
1151	 * Here, if the dst entry we've looked up
1152	 * has a neighbour entry that is in the INCOMPLETE
1153	 * state and the src address from the flow is
1154	 * marked as OPTIMISTIC, we release the found
1155	 * dst entry and replace it with the
1156	 * dst entry of the nexthop router.
1157	 */
1158	rt = (struct rt6_info *) *dst;
1159	rcu_read_lock();
1160	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1161				      rt6_nexthop(rt, &fl6->daddr));
1162	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1163	rcu_read_unlock();
1164
1165	if (err) {
1166		struct inet6_ifaddr *ifp;
1167		struct flowi6 fl_gw6;
1168		int redirect;
1169
1170		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1171				      (*dst)->dev, 1);
1172
1173		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1174		if (ifp)
1175			in6_ifa_put(ifp);
1176
1177		if (redirect) {
1178			/*
1179			 * We need to get the dst entry for the
1180			 * default router instead
1181			 */
1182			dst_release(*dst);
1183			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1184			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1185			*dst = ip6_route_output(net, sk, &fl_gw6);
1186			err = (*dst)->error;
1187			if (err)
1188				goto out_err_release;
1189		}
1190	}
1191#endif
1192	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1193	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1194		err = -EAFNOSUPPORT;
1195		goto out_err_release;
1196	}
1197
1198	return 0;
1199
1200out_err_release:
1201	dst_release(*dst);
1202	*dst = NULL;
1203
1204	if (err == -ENETUNREACH)
1205		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1206	return err;
1207}
1208
1209/**
1210 *	ip6_dst_lookup - perform route lookup on flow
1211 *	@net: Network namespace to perform lookup in
1212 *	@sk: socket which provides route info
1213 *	@dst: pointer to dst_entry * for result
1214 *	@fl6: flow to lookup
1215 *
1216 *	This function performs a route lookup on the given flow.
1217 *
1218 *	It returns zero on success, or a standard errno code on error.
1219 */
1220int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1221		   struct flowi6 *fl6)
1222{
1223	*dst = NULL;
1224	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1225}
1226EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1227
1228/**
1229 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1230 *	@net: Network namespace to perform lookup in
1231 *	@sk: socket which provides route info
1232 *	@fl6: flow to lookup
1233 *	@final_dst: final destination address for ipsec lookup
1234 *
1235 *	This function performs a route lookup on the given flow.
1236 *
1237 *	It returns a valid dst pointer on success, or a pointer encoded
1238 *	error code.
1239 */
1240struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1241				      const struct in6_addr *final_dst)
1242{
1243	struct dst_entry *dst = NULL;
1244	int err;
1245
1246	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1247	if (err)
1248		return ERR_PTR(err);
1249	if (final_dst)
1250		fl6->daddr = *final_dst;
1251
1252	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1253}
1254EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1255
1256/**
1257 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1258 *	@sk: socket which provides the dst cache and route info
1259 *	@fl6: flow to lookup
1260 *	@final_dst: final destination address for ipsec lookup
1261 *	@connected: whether @sk is connected or not
1262 *
1263 *	This function performs a route lookup on the given flow with the
1264 *	possibility of using the cached route in the socket if it is valid.
1265 *	It will take the socket dst lock when operating on the dst cache.
1266 *	As a result, this function can only be used in process context.
1267 *
1268 *	In addition, for a connected socket, cache the dst in the socket
1269 *	if the current cache is not valid.
1270 *
1271 *	It returns a valid dst pointer on success, or a pointer encoded
1272 *	error code.
1273 */
1274struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1275					 const struct in6_addr *final_dst,
1276					 bool connected)
1277{
1278	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1279
1280	dst = ip6_sk_dst_check(sk, dst, fl6);
1281	if (dst)
1282		return dst;
1283
1284	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1285	if (connected && !IS_ERR(dst))
1286		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1287
1288	return dst;
1289}
1290EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1291
1292/**
1293 *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1294 *      @skb: Packet for which lookup is done
1295 *      @dev: Tunnel device
1296 *      @net: Network namespace of tunnel device
1297 *      @sock: Socket which provides route info
1298 *      @saddr: Memory to store the src ip address
1299 *      @info: Tunnel information
1300 *      @protocol: IP protocol
1301 *      @use_cache: Flag to enable cache usage
1302 *      This function performs a route lookup on a tunnel
1303 *
1304 *      It returns a valid dst pointer and stores src address to be used in
1305 *      tunnel in param saddr on success, else a pointer encoded error code.
1306 */
1307
1308struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1309					struct net_device *dev,
1310					struct net *net,
1311					struct socket *sock,
1312					struct in6_addr *saddr,
1313					const struct ip_tunnel_info *info,
1314					u8 protocol,
1315					bool use_cache)
1316{
1317	struct dst_entry *dst = NULL;
1318#ifdef CONFIG_DST_CACHE
1319	struct dst_cache *dst_cache;
1320#endif
1321	struct flowi6 fl6;
1322	__u8 prio;
1323
1324#ifdef CONFIG_DST_CACHE
1325	dst_cache = (struct dst_cache *)&info->dst_cache;
1326	if (use_cache) {
1327		dst = dst_cache_get_ip6(dst_cache, saddr);
1328		if (dst)
1329			return dst;
1330	}
1331#endif
1332	memset(&fl6, 0, sizeof(fl6));
1333	fl6.flowi6_mark = skb->mark;
1334	fl6.flowi6_proto = protocol;
1335	fl6.daddr = info->key.u.ipv6.dst;
1336	fl6.saddr = info->key.u.ipv6.src;
1337	prio = info->key.tos;
1338	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1339
1340	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1341					      NULL);
1342	if (IS_ERR(dst)) {
1343		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1344		return ERR_PTR(-ENETUNREACH);
1345	}
1346	if (dst->dev == dev) { /* is this necessary? */
1347		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1348		dst_release(dst);
1349		return ERR_PTR(-ELOOP);
1350	}
1351#ifdef CONFIG_DST_CACHE
1352	if (use_cache)
1353		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1354#endif
1355	*saddr = fl6.saddr;
1356	return dst;
1357}
1358EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1359
1360static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1361					       gfp_t gfp)
1362{
1363	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364}
1365
1366static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1367						gfp_t gfp)
1368{
1369	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1370}
1371
1372static void ip6_append_data_mtu(unsigned int *mtu,
1373				int *maxfraglen,
1374				unsigned int fragheaderlen,
1375				struct sk_buff *skb,
1376				struct rt6_info *rt,
1377				unsigned int orig_mtu)
1378{
1379	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1380		if (!skb) {
1381			/* first fragment, reserve header_len */
1382			*mtu = orig_mtu - rt->dst.header_len;
1383
1384		} else {
1385			/*
1386			 * this fragment is not the first; the header
1387			 * space is regarded as data space.
1388			 */
1389			*mtu = orig_mtu;
1390		}
1391		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1392			      + fragheaderlen - sizeof(struct frag_hdr);
1393	}
1394}
1395
1396static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1397			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1398			  struct rt6_info *rt)
1399{
1400	struct ipv6_pinfo *np = inet6_sk(sk);
1401	unsigned int mtu;
1402	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1403
1404	/* callers pass dst together with a reference, set it first so
1405	 * ip6_cork_release() can put it down even in case of an error.
1406	 */
1407	cork->base.dst = &rt->dst;
1408
1409	/*
1410	 * setup for corking
1411	 */
1412	if (opt) {
1413		if (WARN_ON(v6_cork->opt))
1414			return -EINVAL;
1415
1416		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1417		if (unlikely(!nopt))
1418			return -ENOBUFS;
1419
1420		nopt->tot_len = sizeof(*opt);
1421		nopt->opt_flen = opt->opt_flen;
1422		nopt->opt_nflen = opt->opt_nflen;
1423
1424		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1425		if (opt->dst0opt && !nopt->dst0opt)
1426			return -ENOBUFS;
1427
1428		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1429		if (opt->dst1opt && !nopt->dst1opt)
1430			return -ENOBUFS;
1431
1432		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1433		if (opt->hopopt && !nopt->hopopt)
1434			return -ENOBUFS;
1435
1436		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1437		if (opt->srcrt && !nopt->srcrt)
1438			return -ENOBUFS;
1439
1440		/* need source address above miyazawa*/
1441	}
1442	v6_cork->hop_limit = ipc6->hlimit;
1443	v6_cork->tclass = ipc6->tclass;
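	/* Pick the MTU used while corking: with PMTU discovery in probe
	 * mode the device MTU is used as-is, otherwise the dst (or the
	 * underlying xfrm path) MTU applies; a per-socket IPV6_MTU setting
	 * may shrink it further below.
	 */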
1444	if (rt->dst.flags & DST_XFRM_TUNNEL)
1445		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1446		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1447	else
1448		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1449			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1450	if (np->frag_size < mtu) {
1451		if (np->frag_size)
1452			mtu = np->frag_size;
1453	}
1454	cork->base.fragsize = mtu;
1455	cork->base.gso_size = ipc6->gso_size;
1456	cork->base.tx_flags = 0;
1457	cork->base.mark = ipc6->sockc.mark;
1458	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1459
1460	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1461		cork->base.flags |= IPCORK_ALLFRAG;
1462	cork->base.length = 0;
1463
1464	cork->base.transmit_time = ipc6->sockc.transmit_time;
1465
1466	return 0;
1467}
1468
1469static int __ip6_append_data(struct sock *sk,
1470			     struct sk_buff_head *queue,
1471			     struct inet_cork_full *cork_full,
1472			     struct inet6_cork *v6_cork,
1473			     struct page_frag *pfrag,
1474			     int getfrag(void *from, char *to, int offset,
1475					 int len, int odd, struct sk_buff *skb),
1476			     void *from, size_t length, int transhdrlen,
1477			     unsigned int flags, struct ipcm6_cookie *ipc6)
1478{
1479	struct sk_buff *skb, *skb_prev = NULL;
1480	struct inet_cork *cork = &cork_full->base;
1481	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1482	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1483	struct ubuf_info *uarg = NULL;
1484	int exthdrlen = 0;
1485	int dst_exthdrlen = 0;
1486	int hh_len;
1487	int copy;
1488	int err;
1489	int offset = 0;
1490	bool zc = false;
1491	u32 tskey = 0;
1492	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1493	struct ipv6_txoptions *opt = v6_cork->opt;
1494	int csummode = CHECKSUM_NONE;
1495	unsigned int maxnonfragsize, headersize;
1496	unsigned int wmem_alloc_delta = 0;
1497	bool paged, extra_uref = false;
1498
1499	skb = skb_peek_tail(queue);
1500	if (!skb) {
1501		exthdrlen = opt ? opt->opt_flen : 0;
1502		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1503	}
1504
1505	paged = !!cork->gso_size;
1506	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1507	orig_mtu = mtu;
1508
1509	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1510	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1511		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1512
1513	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1514
1515	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1516			(opt ? opt->opt_nflen : 0);
1517
1518	headersize = sizeof(struct ipv6hdr) +
1519		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1520		     (dst_allfrag(&rt->dst) ?
1521		      sizeof(struct frag_hdr) : 0) +
1522		     rt->rt6i_nfheader_len;
1523
1524	if (mtu <= fragheaderlen ||
1525	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1526		goto emsgsize;
1527
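	/* Largest length a single fragment skb may reach: the payload space
	 * is rounded down to an 8-byte boundary and the fragment header that
	 * ip6_fragment() will add later is accounted for up front.
	 * Purely illustrative example, assuming an mtu of 1500 and a
	 * fragheaderlen of 40 (plain IPv6 header, no extension headers):
	 * ((1500 - 40) & ~7) + 40 - 8 = 1488.
	 */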
1528	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1529		     sizeof(struct frag_hdr);
1530
1531	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1532	 * within the first fragment
1533	 */
1534	if (headersize + transhdrlen > mtu)
1535		goto emsgsize;
1536
1537	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1538	    (sk->sk_protocol == IPPROTO_UDP ||
1539	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1540	     sk->sk_protocol == IPPROTO_RAW)) {
1541		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1542				sizeof(struct ipv6hdr));
1543		goto emsgsize;
1544	}
1545
1546	if (ip6_sk_ignore_df(sk))
1547		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1548	else
1549		maxnonfragsize = mtu;
1550
1551	if (cork->length + length > maxnonfragsize - headersize) {
1552emsgsize:
1553		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1554		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1555		return -EMSGSIZE;
1556	}
1557
1558	/* CHECKSUM_PARTIAL only with no extension headers and when
1559	 * we are not going to fragment
1560	 */
1561	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1562	    headersize == sizeof(struct ipv6hdr) &&
1563	    length <= mtu - headersize &&
1564	    (!(flags & MSG_MORE) || cork->gso_size) &&
1565	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1566		csummode = CHECKSUM_PARTIAL;
1567
1568	if ((flags & MSG_ZEROCOPY) && length) {
1569		struct msghdr *msg = from;
1570
1571		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1572			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1573				return -EINVAL;
1574
1575			/* Leave uarg NULL if can't zerocopy, callers should
1576			 * be able to handle it.
1577			 */
1578			if ((rt->dst.dev->features & NETIF_F_SG) &&
1579			    csummode == CHECKSUM_PARTIAL) {
1580				paged = true;
1581				zc = true;
1582				uarg = msg->msg_ubuf;
1583			}
1584		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1585			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1586			if (!uarg)
1587				return -ENOBUFS;
1588			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1589			if (rt->dst.dev->features & NETIF_F_SG &&
1590			    csummode == CHECKSUM_PARTIAL) {
1591				paged = true;
1592				zc = true;
1593			} else {
1594				uarg_to_msgzc(uarg)->zerocopy = 0;
1595				skb_zcopy_set(skb, uarg, &extra_uref);
1596			}
1597		}
1598	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1599		if (inet_test_bit(HDRINCL, sk))
1600			return -EPERM;
1601		if (rt->dst.dev->features & NETIF_F_SG &&
1602		    getfrag == ip_generic_getfrag)
1603			/* We need an empty buffer to attach stuff to */
1604			paged = true;
1605		else
1606			flags &= ~MSG_SPLICE_PAGES;
1607	}
1608
1609	/*
1610	 * Let's try using as much space as possible.
1611	 * Use MTU if total length of the message fits into the MTU.
1612	 * Otherwise, we need to reserve fragment header and
1613	 * fragment alignment (= 8-15 octets, in total).
1614	 *
1615	 * Note that we may need to "move" the data from the tail
1616	 * of the buffer to the new fragment when we split
1617	 * the message.
1618	 *
1619	 * FIXME: It may be fragmented into multiple chunks
1620	 *        at once if non-fragmentable extension headers
1621	 *        are too large.
1622	 * --yoshfuji
1623	 */
1624
1625	cork->length += length;
1626	if (!skb)
1627		goto alloc_new_skb;
1628
1629	while (length > 0) {
1630		/* Check if the remaining data fits into current packet. */
1631		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1632		if (copy < length)
1633			copy = maxfraglen - skb->len;
1634
1635		if (copy <= 0) {
1636			char *data;
1637			unsigned int datalen;
1638			unsigned int fraglen;
1639			unsigned int fraggap;
1640			unsigned int alloclen, alloc_extra;
1641			unsigned int pagedlen;
1642alloc_new_skb:
1643			/* There's no room in the current skb */
1644			if (skb)
1645				fraggap = skb->len - maxfraglen;
1646			else
1647				fraggap = 0;
1648			/* update mtu and maxfraglen if necessary */
1649			if (!skb || !skb_prev)
1650				ip6_append_data_mtu(&mtu, &maxfraglen,
1651						    fragheaderlen, skb, rt,
1652						    orig_mtu);
1653
1654			skb_prev = skb;
1655
1656			/*
1657			 * If remaining data exceeds the mtu,
1658			 * we know we need more fragment(s).
1659			 */
1660			datalen = length + fraggap;
1661
1662			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1663				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1664			fraglen = datalen + fragheaderlen;
1665			pagedlen = 0;
1666
1667			alloc_extra = hh_len;
1668			alloc_extra += dst_exthdrlen;
1669			alloc_extra += rt->dst.trailer_len;
1670
1671			/* We just reserve space for the fragment header.
1672			 * Note: this may be an overallocation if the message
1673			 * (without MSG_MORE) fits into the MTU.
1674			 */
1675			alloc_extra += sizeof(struct frag_hdr);
1676
1677			if ((flags & MSG_MORE) &&
1678			    !(rt->dst.dev->features&NETIF_F_SG))
1679				alloclen = mtu;
1680			else if (!paged &&
1681				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1682				  !(rt->dst.dev->features & NETIF_F_SG)))
1683				alloclen = fraglen;
1684			else {
1685				alloclen = fragheaderlen + transhdrlen;
1686				pagedlen = datalen - transhdrlen;
1687			}
1688			alloclen += alloc_extra;
1689
1690			if (datalen != length + fraggap) {
1691				/*
1692				 * this is not the last fragment; the trailer
1693				 * space is regarded as data space.
1694				 */
1695				datalen += rt->dst.trailer_len;
1696			}
1697
1698			fraglen = datalen + fragheaderlen;
1699
1700			copy = datalen - transhdrlen - fraggap - pagedlen;
1701			/* [!] NOTE: copy may be negative if pagedlen>0
1702			 * because then the equation reduces to -fraggap.
1703			 */
1704			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1705				err = -EINVAL;
1706				goto error;
1707			}
1708			if (transhdrlen) {
1709				skb = sock_alloc_send_skb(sk, alloclen,
1710						(flags & MSG_DONTWAIT), &err);
1711			} else {
1712				skb = NULL;
1713				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1714				    2 * sk->sk_sndbuf)
1715					skb = alloc_skb(alloclen,
1716							sk->sk_allocation);
1717				if (unlikely(!skb))
1718					err = -ENOBUFS;
1719			}
1720			if (!skb)
1721				goto error;
1722			/*
1723			 *	Fill in the control structures
1724			 */
1725			skb->protocol = htons(ETH_P_IPV6);
1726			skb->ip_summed = csummode;
1727			skb->csum = 0;
1728			/* reserve for fragmentation and ipsec header */
1729			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1730				    dst_exthdrlen);
1731
1732			/*
1733			 *	Find where to start putting bytes
1734			 */
1735			data = skb_put(skb, fraglen - pagedlen);
1736			skb_set_network_header(skb, exthdrlen);
1737			data += fragheaderlen;
1738			skb->transport_header = (skb->network_header +
1739						 fragheaderlen);
1740			if (fraggap) {
1741				skb->csum = skb_copy_and_csum_bits(
1742					skb_prev, maxfraglen,
1743					data + transhdrlen, fraggap);
1744				skb_prev->csum = csum_sub(skb_prev->csum,
1745							  skb->csum);
1746				data += fraggap;
1747				pskb_trim_unique(skb_prev, maxfraglen);
1748			}
1749			if (copy > 0 &&
1750			    getfrag(from, data + transhdrlen, offset,
1751				    copy, fraggap, skb) < 0) {
1752				err = -EFAULT;
1753				kfree_skb(skb);
1754				goto error;
1755			} else if (flags & MSG_SPLICE_PAGES) {
1756				copy = 0;
1757			}
1758
1759			offset += copy;
1760			length -= copy + transhdrlen;
1761			transhdrlen = 0;
1762			exthdrlen = 0;
1763			dst_exthdrlen = 0;
1764
1765			/* Only the initial fragment is time stamped */
1766			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1767			cork->tx_flags = 0;
1768			skb_shinfo(skb)->tskey = tskey;
1769			tskey = 0;
1770			skb_zcopy_set(skb, uarg, &extra_uref);
1771
1772			if ((flags & MSG_CONFIRM) && !skb_prev)
1773				skb_set_dst_pending_confirm(skb, 1);
1774
1775			/*
1776			 * Put the packet on the pending queue
1777			 */
1778			if (!skb->destructor) {
1779				skb->destructor = sock_wfree;
1780				skb->sk = sk;
1781				wmem_alloc_delta += skb->truesize;
1782			}
1783			__skb_queue_tail(queue, skb);
1784			continue;
1785		}
1786
1787		if (copy > length)
1788			copy = length;
1789
1790		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1791		    skb_tailroom(skb) >= copy) {
1792			unsigned int off;
1793
1794			off = skb->len;
1795			if (getfrag(from, skb_put(skb, copy),
1796						offset, copy, off, skb) < 0) {
1797				__skb_trim(skb, off);
1798				err = -EFAULT;
1799				goto error;
1800			}
1801		} else if (flags & MSG_SPLICE_PAGES) {
1802			struct msghdr *msg = from;
1803
1804			err = -EIO;
1805			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1806				goto error;
1807
1808			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1809						   sk->sk_allocation);
1810			if (err < 0)
1811				goto error;
1812			copy = err;
1813			wmem_alloc_delta += copy;
1814		} else if (!zc) {
1815			int i = skb_shinfo(skb)->nr_frags;
1816
1817			err = -ENOMEM;
1818			if (!sk_page_frag_refill(sk, pfrag))
1819				goto error;
1820
1821			skb_zcopy_downgrade_managed(skb);
1822			if (!skb_can_coalesce(skb, i, pfrag->page,
1823					      pfrag->offset)) {
1824				err = -EMSGSIZE;
1825				if (i == MAX_SKB_FRAGS)
1826					goto error;
1827
1828				__skb_fill_page_desc(skb, i, pfrag->page,
1829						     pfrag->offset, 0);
1830				skb_shinfo(skb)->nr_frags = ++i;
1831				get_page(pfrag->page);
1832			}
1833			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1834			if (getfrag(from,
1835				    page_address(pfrag->page) + pfrag->offset,
1836				    offset, copy, skb->len, skb) < 0)
1837				goto error_efault;
1838
1839			pfrag->offset += copy;
1840			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1841			skb->len += copy;
1842			skb->data_len += copy;
1843			skb->truesize += copy;
1844			wmem_alloc_delta += copy;
1845		} else {
1846			err = skb_zerocopy_iter_dgram(skb, from, copy);
1847			if (err < 0)
1848				goto error;
1849		}
1850		offset += copy;
1851		length -= copy;
1852	}
1853
1854	if (wmem_alloc_delta)
1855		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1856	return 0;
1857
1858error_efault:
1859	err = -EFAULT;
1860error:
1861	net_zcopy_put_abort(uarg, extra_uref);
1862	cork->length -= length;
1863	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1864	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1865	return err;
1866}
1867
1868int ip6_append_data(struct sock *sk,
1869		    int getfrag(void *from, char *to, int offset, int len,
1870				int odd, struct sk_buff *skb),
1871		    void *from, size_t length, int transhdrlen,
1872		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1873		    struct rt6_info *rt, unsigned int flags)
1874{
1875	struct inet_sock *inet = inet_sk(sk);
1876	struct ipv6_pinfo *np = inet6_sk(sk);
1877	int exthdrlen;
1878	int err;
1879
1880	if (flags&MSG_PROBE)
1881		return 0;
1882	if (skb_queue_empty(&sk->sk_write_queue)) {
1883		/*
1884		 * setup for corking
1885		 */
1886		dst_hold(&rt->dst);
1887		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1888				     ipc6, rt);
1889		if (err)
1890			return err;
1891
1892		inet->cork.fl.u.ip6 = *fl6;
1893		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1894		length += exthdrlen;
1895		transhdrlen += exthdrlen;
1896	} else {
1897		transhdrlen = 0;
1898	}
1899
1900	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1901				 &np->cork, sk_page_frag(sk), getfrag,
1902				 from, length, transhdrlen, flags, ipc6);
1903}
1904EXPORT_SYMBOL_GPL(ip6_append_data);
1905
1906static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1907{
1908	struct dst_entry *dst = cork->base.dst;
1909
1910	cork->base.dst = NULL;
1911	cork->base.flags &= ~IPCORK_ALLFRAG;
1912	skb_dst_set(skb, dst);
1913}
1914
1915static void ip6_cork_release(struct inet_cork_full *cork,
1916			     struct inet6_cork *v6_cork)
1917{
1918	if (v6_cork->opt) {
1919		struct ipv6_txoptions *opt = v6_cork->opt;
1920
1921		kfree(opt->dst0opt);
1922		kfree(opt->dst1opt);
1923		kfree(opt->hopopt);
1924		kfree(opt->srcrt);
1925		kfree(opt);
1926		v6_cork->opt = NULL;
1927	}
1928
1929	if (cork->base.dst) {
1930		dst_release(cork->base.dst);
1931		cork->base.dst = NULL;
1932		cork->base.flags &= ~IPCORK_ALLFRAG;
1933	}
1934}
1935
1936struct sk_buff *__ip6_make_skb(struct sock *sk,
1937			       struct sk_buff_head *queue,
1938			       struct inet_cork_full *cork,
1939			       struct inet6_cork *v6_cork)
1940{
1941	struct sk_buff *skb, *tmp_skb;
1942	struct sk_buff **tail_skb;
1943	struct in6_addr *final_dst;
1944	struct ipv6_pinfo *np = inet6_sk(sk);
1945	struct net *net = sock_net(sk);
1946	struct ipv6hdr *hdr;
1947	struct ipv6_txoptions *opt = v6_cork->opt;
1948	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1949	struct flowi6 *fl6 = &cork->fl.u.ip6;
1950	unsigned char proto = fl6->flowi6_proto;
1951
1952	skb = __skb_dequeue(queue);
1953	if (!skb)
1954		goto out;
1955	tail_skb = &(skb_shinfo(skb)->frag_list);
1956
1957	/* move skb->data to ip header from ext header */
1958	if (skb->data < skb_network_header(skb))
1959		__skb_pull(skb, skb_network_offset(skb));
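	/* Chain the remaining queued skbs onto the first skb's frag_list so
	 * the whole message goes down as one packet; ip6_fragment() can then
	 * split it along these boundaries on its fast path.
	 */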
1960	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1961		__skb_pull(tmp_skb, skb_network_header_len(skb));
1962		*tail_skb = tmp_skb;
1963		tail_skb = &(tmp_skb->next);
1964		skb->len += tmp_skb->len;
1965		skb->data_len += tmp_skb->len;
1966		skb->truesize += tmp_skb->truesize;
1967		tmp_skb->destructor = NULL;
1968		tmp_skb->sk = NULL;
1969	}
1970
1971	/* Allow local fragmentation. */
1972	skb->ignore_df = ip6_sk_ignore_df(sk);
1973	__skb_pull(skb, skb_network_header_len(skb));
1974
1975	final_dst = &fl6->daddr;
1976	if (opt && opt->opt_flen)
1977		ipv6_push_frag_opts(skb, opt, &proto);
1978	if (opt && opt->opt_nflen)
1979		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1980
1981	skb_push(skb, sizeof(struct ipv6hdr));
1982	skb_reset_network_header(skb);
1983	hdr = ipv6_hdr(skb);
1984
1985	ip6_flow_hdr(hdr, v6_cork->tclass,
1986		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1987					ip6_autoflowlabel(net, np), fl6));
1988	hdr->hop_limit = v6_cork->hop_limit;
1989	hdr->nexthdr = proto;
1990	hdr->saddr = fl6->saddr;
1991	hdr->daddr = *final_dst;
1992
1993	skb->priority = sk->sk_priority;
1994	skb->mark = cork->base.mark;
1995	skb->tstamp = cork->base.transmit_time;
1996
1997	ip6_cork_steal_dst(skb, cork);
1998	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1999	if (proto == IPPROTO_ICMPV6) {
2000		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
2001		u8 icmp6_type;
2002
2003		if (sk->sk_socket->type == SOCK_RAW &&
2004		   !inet_test_bit(HDRINCL, sk))
2005			icmp6_type = fl6->fl6_icmp_type;
2006		else
2007			icmp6_type = icmp6_hdr(skb)->icmp6_type;
2008		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2009		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2010	}
2011
2012	ip6_cork_release(cork, v6_cork);
2013out:
2014	return skb;
2015}
2016
2017int ip6_send_skb(struct sk_buff *skb)
2018{
2019	struct net *net = sock_net(skb->sk);
2020	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2021	int err;
2022
2023	err = ip6_local_out(net, skb->sk, skb);
2024	if (err) {
2025		if (err > 0)
2026			err = net_xmit_errno(err);
2027		if (err)
2028			IP6_INC_STATS(net, rt->rt6i_idev,
2029				      IPSTATS_MIB_OUTDISCARDS);
2030	}
2031
2032	return err;
2033}
2034
2035int ip6_push_pending_frames(struct sock *sk)
2036{
2037	struct sk_buff *skb;
2038
2039	skb = ip6_finish_skb(sk);
2040	if (!skb)
2041		return 0;
2042
2043	return ip6_send_skb(skb);
2044}
2045EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2046
2047static void __ip6_flush_pending_frames(struct sock *sk,
2048				       struct sk_buff_head *queue,
2049				       struct inet_cork_full *cork,
2050				       struct inet6_cork *v6_cork)
2051{
2052	struct sk_buff *skb;
2053
2054	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2055		if (skb_dst(skb))
2056			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2057				      IPSTATS_MIB_OUTDISCARDS);
2058		kfree_skb(skb);
2059	}
2060
2061	ip6_cork_release(cork, v6_cork);
2062}
2063
2064void ip6_flush_pending_frames(struct sock *sk)
2065{
2066	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2067				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2068}
2069EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2070
2071struct sk_buff *ip6_make_skb(struct sock *sk,
2072			     int getfrag(void *from, char *to, int offset,
2073					 int len, int odd, struct sk_buff *skb),
2074			     void *from, size_t length, int transhdrlen,
2075			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2076			     unsigned int flags, struct inet_cork_full *cork)
2077{
2078	struct inet6_cork v6_cork;
2079	struct sk_buff_head queue;
2080	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2081	int err;
2082
2083	if (flags & MSG_PROBE) {
2084		dst_release(&rt->dst);
2085		return NULL;
2086	}
2087
2088	__skb_queue_head_init(&queue);
2089
2090	cork->base.flags = 0;
2091	cork->base.addr = 0;
2092	cork->base.opt = NULL;
2093	v6_cork.opt = NULL;
2094	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2095	if (err) {
2096		ip6_cork_release(cork, &v6_cork);
2097		return ERR_PTR(err);
2098	}
2099	if (ipc6->dontfrag < 0)
2100		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2101
2102	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2103				&current->task_frag, getfrag, from,
2104				length + exthdrlen, transhdrlen + exthdrlen,
2105				flags, ipc6);
2106	if (err) {
2107		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2108		return ERR_PTR(err);
2109	}
2110
2111	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2112}
2113