xref: /kernel/linux/linux-5.10/net/core/skbuff.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 *	Routines having to do with the 'struct sk_buff' memory handlers.
4 *
5 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
6 *			Florian La Roche <rzsfl@rz.uni-sb.de>
7 *
8 *	Fixes:
9 *		Alan Cox	:	Fixed the worst of the load
10 *					balancer bugs.
11 *		Dave Platt	:	Interrupt stacking fix.
12 *	Richard Kooijman	:	Timestamp fixes.
13 *		Alan Cox	:	Changed buffer format.
14 *		Alan Cox	:	destructor hook for AF_UNIX etc.
15 *		Linus Torvalds	:	Better skb_clone.
16 *		Alan Cox	:	Added skb_copy.
17 *		Alan Cox	:	Added all the changed routines Linus
18 *					only put in the headers
19 *		Ray VanTassle	:	Fixed --skb->lock in free
20 *		Alan Cox	:	skb_copy copy arp field
21 *		Andi Kleen	:	slabified it.
22 *		Robert Olsson	:	Removed skb_head_pool
23 *
24 *	NOTE:
25 *		The __skb_ routines should be called with interrupts
26 *	disabled, or you had better be *really* sure that the operation is atomic
27 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
28 *	or via disabling bottom half handlers, etc).
29 */
30
31/*
32 *	The functions in this file will not compile correctly with gcc 2.4.x
33 */
34
35#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
36
37#include <linux/module.h>
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/mm.h>
41#include <linux/interrupt.h>
42#include <linux/in.h>
43#include <linux/inet.h>
44#include <linux/slab.h>
45#include <linux/tcp.h>
46#include <linux/udp.h>
47#include <linux/sctp.h>
48#include <linux/netdevice.h>
49#ifdef CONFIG_NET_CLS_ACT
50#include <net/pkt_sched.h>
51#endif
52#include <linux/string.h>
53#include <linux/skbuff.h>
54#include <linux/splice.h>
55#include <linux/cache.h>
56#include <linux/rtnetlink.h>
57#include <linux/init.h>
58#include <linux/scatterlist.h>
59#include <linux/errqueue.h>
60#include <linux/prefetch.h>
61#include <linux/if_vlan.h>
62#include <linux/mpls.h>
63
64#include <net/protocol.h>
65#include <net/dst.h>
66#include <net/sock.h>
67#include <net/checksum.h>
68#include <net/ip6_checksum.h>
69#include <net/xfrm.h>
70#include <net/mpls.h>
71#include <net/mptcp.h>
72
73#include <linux/uaccess.h>
74#include <trace/events/skb.h>
75#include <linux/highmem.h>
76#include <linux/capability.h>
77#include <linux/user_namespace.h>
78#include <linux/indirect_call_wrapper.h>
79
80#include "datagram.h"
81
82struct kmem_cache *skbuff_head_cache __ro_after_init;
83static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
84#ifdef CONFIG_SKB_EXTENSIONS
85static struct kmem_cache *skbuff_ext_cache __ro_after_init;
86#endif
87int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
88EXPORT_SYMBOL(sysctl_max_skb_frags);
89
90/**
91 *	skb_panic - private function for out-of-line support
92 *	@skb:	buffer
93 *	@sz:	size
94 *	@addr:	address
95 *	@msg:	skb_over_panic or skb_under_panic
96 *
97 *	Out-of-line support for skb_put() and skb_push().
98 *	Called via the wrapper skb_over_panic() or skb_under_panic().
99 *	Keep out of line to prevent kernel bloat.
100 *	__builtin_return_address is not used because it is not always reliable.
101 */
102static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
103		      const char msg[])
104{
105	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
106		 msg, addr, skb->len, sz, skb->head, skb->data,
107		 (unsigned long)skb->tail, (unsigned long)skb->end,
108		 skb->dev ? skb->dev->name : "<NULL>");
109	BUG();
110}
111
112static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
113{
114	skb_panic(skb, sz, addr, __func__);
115}
116
117static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
118{
119	skb_panic(skb, sz, addr, __func__);
120}
121
122/*
123 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
124 * the caller if emergency pfmemalloc reserves are being used. If it is and
125 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
126 * may be used. Otherwise, the packet data may be discarded until enough
127 * memory is free.
128 */
129#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
130	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
131
132static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
133			       unsigned long ip, bool *pfmemalloc)
134{
135	void *obj;
136	bool ret_pfmemalloc = false;
137
138	/*
139	 * Try a regular allocation, when that fails and we're not entitled
140	 * to the reserves, fail.
141	 */
142	obj = kmalloc_node_track_caller(size,
143					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
144					node);
145	if (obj || !(gfp_pfmemalloc_allowed(flags)))
146		goto out;
147
148	/* Try again but now we are using pfmemalloc reserves */
149	ret_pfmemalloc = true;
150	obj = kmalloc_node_track_caller(size, flags, node);
151
152out:
153	if (pfmemalloc)
154		*pfmemalloc = ret_pfmemalloc;
155
156	return obj;
157}
158
159/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
160 *	'private' fields and also do memory statistics to find all the
161 *	[BEEP] leaks.
162 *
163 */
164
165/**
166 *	__alloc_skb	-	allocate a network buffer
167 *	@size: size to allocate
168 *	@gfp_mask: allocation mask
169 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
170 *		instead of head cache and allocate a cloned (child) skb.
171 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
172 *		allocations in case the data is required for writeback
173 *	@node: numa node to allocate memory on
174 *
175 *	Allocate a new &sk_buff. The returned buffer has no headroom and
176 *	tailroom of at least @size bytes. The object has a reference count
177 *	of one. Returns the buffer on success, or %NULL on failure.
178 *
179 *	Buffers may only be allocated from interrupts using a @gfp_mask of
180 *	%GFP_ATOMIC.
181 */
182struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
183			    int flags, int node)
184{
185	struct kmem_cache *cache;
186	struct skb_shared_info *shinfo;
187	struct sk_buff *skb;
188	u8 *data;
189	bool pfmemalloc;
190
191	cache = (flags & SKB_ALLOC_FCLONE)
192		? skbuff_fclone_cache : skbuff_head_cache;
193
194	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
195		gfp_mask |= __GFP_MEMALLOC;
196
197	/* Get the HEAD */
198	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
199	if (!skb)
200		goto out;
201	prefetchw(skb);
202
203	/* We do our best to align skb_shared_info on a separate cache
204	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
205	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
206	 * Both skb->head and skb_shared_info are cache line aligned.
207	 */
208	size = SKB_DATA_ALIGN(size);
209	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
210	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
211	if (!data)
212		goto nodata;
213	/* kmalloc(size) might give us more room than requested.
214	 * Put skb_shared_info exactly at the end of allocated zone,
215	 * to allow max possible filling before reallocation.
216	 */
217	size = SKB_WITH_OVERHEAD(ksize(data));
218	prefetchw(data + size);
219
220	/*
221	 * Only clear those fields we need to clear, not those that we will
222	 * actually initialise below. Hence, don't put any more fields after
223	 * the tail pointer in struct sk_buff!
224	 */
225	memset(skb, 0, offsetof(struct sk_buff, tail));
226	/* Account for allocated memory : skb + skb->head */
227	skb->truesize = SKB_TRUESIZE(size);
228	skb->pfmemalloc = pfmemalloc;
229	refcount_set(&skb->users, 1);
230	skb->head = data;
231	skb->data = data;
232	skb_reset_tail_pointer(skb);
233	skb->end = skb->tail + size;
234	skb->mac_header = (typeof(skb->mac_header))~0U;
235	skb->transport_header = (typeof(skb->transport_header))~0U;
236
237	/* make sure we initialize shinfo sequentially */
238	shinfo = skb_shinfo(skb);
239	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
240	atomic_set(&shinfo->dataref, 1);
241
242	if (flags & SKB_ALLOC_FCLONE) {
243		struct sk_buff_fclones *fclones;
244
245		fclones = container_of(skb, struct sk_buff_fclones, skb1);
246
247		skb->fclone = SKB_FCLONE_ORIG;
248		refcount_set(&fclones->fclone_ref, 1);
249
250		fclones->skb2.fclone = SKB_FCLONE_CLONE;
251	}
252
253	skb_set_kcov_handle(skb, kcov_common_handle());
254
255out:
256	return skb;
257nodata:
258	kmem_cache_free(cache, skb);
259	skb = NULL;
260	goto out;
261}
262EXPORT_SYMBOL(__alloc_skb);
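
/* Illustrative sketch of a typical caller (the example_* helper and the use
 * of NET_SKB_PAD as headroom are assumptions, not taken from an in-tree
 * user): reserve headroom first, then fill the linear area from the tail.
 */
static struct sk_buff * __maybe_unused example_alloc_and_fill(const void *payload,
							       unsigned int len)
{
	struct sk_buff *skb = alloc_skb(NET_SKB_PAD + len, GFP_ATOMIC);

	if (!skb)
		return NULL;
	skb_reserve(skb, NET_SKB_PAD);		/* headroom for later skb_push() */
	skb_put_data(skb, payload, len);	/* copy @len bytes into the tailroom */
	return skb;
}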
263
264/* Caller must provide SKB that is memset cleared */
265static struct sk_buff *__build_skb_around(struct sk_buff *skb,
266					  void *data, unsigned int frag_size)
267{
268	struct skb_shared_info *shinfo;
269	unsigned int size = frag_size ? : ksize(data);
270
271	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
272
273	/* Assumes caller memset cleared SKB */
274	skb->truesize = SKB_TRUESIZE(size);
275	refcount_set(&skb->users, 1);
276	skb->head = data;
277	skb->data = data;
278	skb_reset_tail_pointer(skb);
279	skb->end = skb->tail + size;
280	skb->mac_header = (typeof(skb->mac_header))~0U;
281	skb->transport_header = (typeof(skb->transport_header))~0U;
282
283	/* make sure we initialize shinfo sequentially */
284	shinfo = skb_shinfo(skb);
285	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
286	atomic_set(&shinfo->dataref, 1);
287
288	skb_set_kcov_handle(skb, kcov_common_handle());
289
290	return skb;
291}
292
293/**
294 * __build_skb - build a network buffer
295 * @data: data buffer provided by caller
296 * @frag_size: size of data, or 0 if head was kmalloced
297 *
298 * Allocate a new &sk_buff. The caller provides the space holding the head
299 * and the skb_shared_info. @data must have been allocated by kmalloc() only
300 * if @frag_size is 0; otherwise the data should come from the page allocator
301 * or vmalloc().
302 * The return value is the new skb buffer.
303 * On failure the return is %NULL, and @data is not freed.
304 * Notes :
305 *  Before IO, the driver allocates only the data buffer, into which the NIC
306 *  writes the incoming frame. The driver should add room at the head
307 *  (NET_SKB_PAD) and MUST add room at the tail
308 *  (SKB_DATA_ALIGN(skb_shared_info)). After IO, the driver calls build_skb()
309 *  to allocate the sk_buff and populate it before handing the packet to
310 *  the stack. RX rings contain only data buffers, not full skbs.
311 */
312struct sk_buff *__build_skb(void *data, unsigned int frag_size)
313{
314	struct sk_buff *skb;
315
316	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
317	if (unlikely(!skb))
318		return NULL;
319
320	memset(skb, 0, offsetof(struct sk_buff, tail));
321
322	return __build_skb_around(skb, data, frag_size);
323}
324
325/* build_skb() is a wrapper over __build_skb() that additionally takes care
326 * of skb->head_frag and skb->pfmemalloc.
327 * This means that if @frag_size is not zero, then @data must be backed
328 * by a page fragment, not by kmalloc() or vmalloc().
329 */
330struct sk_buff *build_skb(void *data, unsigned int frag_size)
331{
332	struct sk_buff *skb = __build_skb(data, frag_size);
333
334	if (skb && frag_size) {
335		skb->head_frag = 1;
336		if (page_is_pfmemalloc(virt_to_head_page(data)))
337			skb->pfmemalloc = 1;
338	}
339	return skb;
340}
341EXPORT_SYMBOL(build_skb);
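
/* Illustrative sketch (hypothetical helper, buffer layout assumed): an RX
 * path that received a frame into a page-fragment buffer wraps it with
 * build_skb().  @bufsz is the full fragment size, which must already include
 * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) of tailroom.
 */
static struct sk_buff * __maybe_unused example_wrap_rx_frag(void *buf,
							     unsigned int bufsz,
							     unsigned int pktlen)
{
	struct sk_buff *skb = build_skb(buf, bufsz);

	if (unlikely(!skb)) {
		skb_free_frag(buf);	/* on failure, @buf is not freed for us */
		return NULL;
	}
	skb_reserve(skb, NET_SKB_PAD);	/* headroom the driver left in the buffer */
	skb_put(skb, pktlen);		/* frame length reported by the device */
	return skb;
}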
342
343/**
344 * build_skb_around - build a network buffer around provided skb
345 * @skb: sk_buff provided by the caller, must be memset cleared
346 * @data: data buffer provided by caller
347 * @frag_size: size of data, or 0 if head was kmalloced
348 */
349struct sk_buff *build_skb_around(struct sk_buff *skb,
350				 void *data, unsigned int frag_size)
351{
352	if (unlikely(!skb))
353		return NULL;
354
355	skb = __build_skb_around(skb, data, frag_size);
356
357	if (skb && frag_size) {
358		skb->head_frag = 1;
359		if (page_is_pfmemalloc(virt_to_head_page(data)))
360			skb->pfmemalloc = 1;
361	}
362	return skb;
363}
364EXPORT_SYMBOL(build_skb_around);
365
366#define NAPI_SKB_CACHE_SIZE	64
367
368struct napi_alloc_cache {
369	struct page_frag_cache page;
370	unsigned int skb_count;
371	void *skb_cache[NAPI_SKB_CACHE_SIZE];
372};
373
374static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
375static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
376
377static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
378{
379	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
380
381	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
382}
383
384void *napi_alloc_frag(unsigned int fragsz)
385{
386	fragsz = SKB_DATA_ALIGN(fragsz);
387
388	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
389}
390EXPORT_SYMBOL(napi_alloc_frag);
391
392/**
393 * netdev_alloc_frag - allocate a page fragment
394 * @fragsz: fragment size
395 *
396 * Allocates a frag from a page for receive buffer.
397 * Uses GFP_ATOMIC allocations.
398 */
399void *netdev_alloc_frag(unsigned int fragsz)
400{
401	struct page_frag_cache *nc;
402	void *data;
403
404	fragsz = SKB_DATA_ALIGN(fragsz);
405	if (in_irq() || irqs_disabled()) {
406		nc = this_cpu_ptr(&netdev_alloc_cache);
407		data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
408	} else {
409		local_bh_disable();
410		data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
411		local_bh_enable();
412	}
413	return data;
414}
415EXPORT_SYMBOL(netdev_alloc_frag);
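
/* Illustrative sketch of how such a fragment is commonly sized for a later
 * build_skb() call: headroom plus the largest expected frame, aligned, plus
 * room for the skb_shared_info placed at the end.  example_alloc_rx_buffer()
 * and the use of @mtu as the frame size are assumptions for the sketch.
 */
static void * __maybe_unused example_alloc_rx_buffer(unsigned int mtu)
{
	unsigned int bufsz;

	bufsz = SKB_DATA_ALIGN(NET_SKB_PAD + NET_IP_ALIGN + mtu) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	return netdev_alloc_frag(bufsz);
}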
416
417/**
418 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
419 *	@dev: network device to receive on
420 *	@len: length to allocate
421 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
422 *
423 *	Allocate a new &sk_buff and assign it a usage count of one. The
424 *	buffer has NET_SKB_PAD headroom built in. Users should allocate
425 *	the headroom they think they need without accounting for the
426 *	built-in space. The built-in space is used for optimisations.
427 *
428 *	%NULL is returned if there is no free memory.
429 */
430struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
431				   gfp_t gfp_mask)
432{
433	struct page_frag_cache *nc;
434	struct sk_buff *skb;
435	bool pfmemalloc;
436	void *data;
437
438	len += NET_SKB_PAD;
439
440	/* If requested length is either too small or too big,
441	 * we use kmalloc() for skb->head allocation.
442	 */
443	if (len <= SKB_WITH_OVERHEAD(1024) ||
444	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
445	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
446		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
447		if (!skb)
448			goto skb_fail;
449		goto skb_success;
450	}
451
452	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
453	len = SKB_DATA_ALIGN(len);
454
455	if (sk_memalloc_socks())
456		gfp_mask |= __GFP_MEMALLOC;
457
458	if (in_irq() || irqs_disabled()) {
459		nc = this_cpu_ptr(&netdev_alloc_cache);
460		data = page_frag_alloc(nc, len, gfp_mask);
461		pfmemalloc = nc->pfmemalloc;
462	} else {
463		local_bh_disable();
464		nc = this_cpu_ptr(&napi_alloc_cache.page);
465		data = page_frag_alloc(nc, len, gfp_mask);
466		pfmemalloc = nc->pfmemalloc;
467		local_bh_enable();
468	}
469
470	if (unlikely(!data))
471		return NULL;
472
473	skb = __build_skb(data, len);
474	if (unlikely(!skb)) {
475		skb_free_frag(data);
476		return NULL;
477	}
478
479	if (pfmemalloc)
480		skb->pfmemalloc = 1;
481	skb->head_frag = 1;
482
483skb_success:
484	skb_reserve(skb, NET_SKB_PAD);
485	skb->dev = dev;
486
487skb_fail:
488	return skb;
489}
490EXPORT_SYMBOL(__netdev_alloc_skb);
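
/* Illustrative sketch (hypothetical "copybreak" receive helper): small frames
 * are copied out of the hardware buffer into a freshly allocated skb so the
 * DMA buffer can be recycled immediately.
 */
static struct sk_buff * __maybe_unused example_rx_copybreak(struct net_device *dev,
							     const void *hw_buf,
							     unsigned int pktlen)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, pktlen);

	if (unlikely(!skb))
		return NULL;			/* caller keeps and recycles hw_buf */
	skb_put_data(skb, hw_buf, pktlen);	/* copy the received frame */
	return skb;
}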
491
492/**
493 *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
494 *	@napi: napi instance this buffer was allocated for
495 *	@len: length to allocate
496 *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
497 *
498 *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
499 *	attempt to allocate the head from a special reserved region used
500 *	only for NAPI Rx allocation.  By doing this we can save several
501 *	CPU cycles by avoiding having to disable and re-enable IRQs.
502 *
503 *	%NULL is returned if there is no free memory.
504 */
505struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
506				 gfp_t gfp_mask)
507{
508	struct napi_alloc_cache *nc;
509	struct sk_buff *skb;
510	void *data;
511
512	len += NET_SKB_PAD + NET_IP_ALIGN;
513
514	/* If requested length is either too small or too big,
515	 * we use kmalloc() for skb->head allocation.
516	 */
517	if (len <= SKB_WITH_OVERHEAD(1024) ||
518	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
519	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
520		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
521		if (!skb)
522			goto skb_fail;
523		goto skb_success;
524	}
525
526	nc = this_cpu_ptr(&napi_alloc_cache);
527	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
528	len = SKB_DATA_ALIGN(len);
529
530	if (sk_memalloc_socks())
531		gfp_mask |= __GFP_MEMALLOC;
532
533	data = page_frag_alloc(&nc->page, len, gfp_mask);
534	if (unlikely(!data))
535		return NULL;
536
537	skb = __build_skb(data, len);
538	if (unlikely(!skb)) {
539		skb_free_frag(data);
540		return NULL;
541	}
542
543	if (nc->page.pfmemalloc)
544		skb->pfmemalloc = 1;
545	skb->head_frag = 1;
546
547skb_success:
548	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
549	skb->dev = napi->dev;
550
551skb_fail:
552	return skb;
553}
554EXPORT_SYMBOL(__napi_alloc_skb);
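
/* Illustrative sketch: the NAPI poll-loop variant of the helper above.
 * Allocating through napi_alloc_skb() draws the head from the per-CPU NAPI
 * fragment cache without an IRQ save/restore.  Identifiers are hypothetical.
 */
static struct sk_buff * __maybe_unused example_napi_rx(struct napi_struct *napi,
							const void *hw_buf,
							unsigned int pktlen)
{
	struct sk_buff *skb = napi_alloc_skb(napi, pktlen);

	if (unlikely(!skb))
		return NULL;
	skb_put_data(skb, hw_buf, pktlen);
	return skb;
}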
555
556void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
557		     int size, unsigned int truesize)
558{
559	skb_fill_page_desc(skb, i, page, off, size);
560	skb->len += size;
561	skb->data_len += size;
562	skb->truesize += truesize;
563}
564EXPORT_SYMBOL(skb_add_rx_frag);
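
/* Illustrative sketch: attach a received page fragment to the skb instead of
 * copying it.  Passing PAGE_SIZE as the truesize contribution assumes the
 * driver dedicated a whole page to this fragment; real callers pass whatever
 * they actually allocated.
 */
static void __maybe_unused example_add_frag(struct sk_buff *skb, struct page *page,
					    unsigned int offset, unsigned int size)
{
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, size,
			PAGE_SIZE);
}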
565
566void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
567			  unsigned int truesize)
568{
569	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
570
571	skb_frag_size_add(frag, size);
572	skb->len += size;
573	skb->data_len += size;
574	skb->truesize += truesize;
575}
576EXPORT_SYMBOL(skb_coalesce_rx_frag);
577
578static void skb_drop_list(struct sk_buff **listp)
579{
580	kfree_skb_list(*listp);
581	*listp = NULL;
582}
583
584static inline void skb_drop_fraglist(struct sk_buff *skb)
585{
586	skb_drop_list(&skb_shinfo(skb)->frag_list);
587}
588
589static void skb_clone_fraglist(struct sk_buff *skb)
590{
591	struct sk_buff *list;
592
593	skb_walk_frags(skb, list)
594		skb_get(list);
595}
596
597static void skb_free_head(struct sk_buff *skb)
598{
599	unsigned char *head = skb->head;
600
601	if (skb->head_frag)
602		skb_free_frag(head);
603	else
604		kfree(head);
605}
606
607static void skb_release_data(struct sk_buff *skb)
608{
609	struct skb_shared_info *shinfo = skb_shinfo(skb);
610	int i;
611
612	if (skb->cloned &&
613	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
614			      &shinfo->dataref))
615		return;
616
617	for (i = 0; i < shinfo->nr_frags; i++)
618		__skb_frag_unref(&shinfo->frags[i]);
619
620	if (shinfo->frag_list)
621		kfree_skb_list(shinfo->frag_list);
622
623	skb_zcopy_clear(skb, true);
624	skb_free_head(skb);
625}
626
627/*
628 *	Free an skbuff's memory without cleaning its state.
629 */
630static void kfree_skbmem(struct sk_buff *skb)
631{
632	struct sk_buff_fclones *fclones;
633
634	switch (skb->fclone) {
635	case SKB_FCLONE_UNAVAILABLE:
636		kmem_cache_free(skbuff_head_cache, skb);
637		return;
638
639	case SKB_FCLONE_ORIG:
640		fclones = container_of(skb, struct sk_buff_fclones, skb1);
641
642		/* We usually free the clone (TX completion) before the original skb.
643		 * This test would have no chance to be true for the clone, while
644		 * here, for the original, branch prediction will be good.
645		 */
646		if (refcount_read(&fclones->fclone_ref) == 1)
647			goto fastpath;
648		break;
649
650	default: /* SKB_FCLONE_CLONE */
651		fclones = container_of(skb, struct sk_buff_fclones, skb2);
652		break;
653	}
654	if (!refcount_dec_and_test(&fclones->fclone_ref))
655		return;
656fastpath:
657	kmem_cache_free(skbuff_fclone_cache, fclones);
658}
659
660void skb_release_head_state(struct sk_buff *skb)
661{
662	skb_dst_drop(skb);
663	if (skb->destructor) {
664		WARN_ON(in_irq());
665		skb->destructor(skb);
666	}
667#if IS_ENABLED(CONFIG_NF_CONNTRACK)
668	nf_conntrack_put(skb_nfct(skb));
669#endif
670	skb_ext_put(skb);
671}
672
673/* Free everything but the sk_buff shell. */
674static void skb_release_all(struct sk_buff *skb)
675{
676	skb_release_head_state(skb);
677	if (likely(skb->head))
678		skb_release_data(skb);
679}
680
681/**
682 *	__kfree_skb - private function
683 *	@skb: buffer
684 *
685 *	Free an sk_buff. Release anything attached to the buffer.
686 *	Clean the state. This is an internal helper function. Users should
687 *	always call kfree_skb
688 */
689
690void __kfree_skb(struct sk_buff *skb)
691{
692	skb_release_all(skb);
693	kfree_skbmem(skb);
694}
695EXPORT_SYMBOL(__kfree_skb);
696
697/**
698 *	kfree_skb - free an sk_buff
699 *	@skb: buffer to free
700 *
701 *	Drop a reference to the buffer and free it if the usage count has
702 *	hit zero.
703 */
704void kfree_skb(struct sk_buff *skb)
705{
706	if (!skb_unref(skb))
707		return;
708
709	trace_kfree_skb(skb, __builtin_return_address(0));
710	__kfree_skb(skb);
711}
712EXPORT_SYMBOL(kfree_skb);
713
714void kfree_skb_list(struct sk_buff *segs)
715{
716	while (segs) {
717		struct sk_buff *next = segs->next;
718
719		kfree_skb(segs);
720		segs = next;
721	}
722}
723EXPORT_SYMBOL(kfree_skb_list);
724
725/* Dump skb information and contents.
726 *
727 * Must only be called from net_ratelimit()-ed paths.
728 *
729 * Dumps whole packets if full_pkt, only headers otherwise.
730 */
731void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
732{
733	struct skb_shared_info *sh = skb_shinfo(skb);
734	struct net_device *dev = skb->dev;
735	struct sock *sk = skb->sk;
736	struct sk_buff *list_skb;
737	bool has_mac, has_trans;
738	int headroom, tailroom;
739	int i, len, seg_len;
740
741	if (full_pkt)
742		len = skb->len;
743	else
744		len = min_t(int, skb->len, MAX_HEADER + 128);
745
746	headroom = skb_headroom(skb);
747	tailroom = skb_tailroom(skb);
748
749	has_mac = skb_mac_header_was_set(skb);
750	has_trans = skb_transport_header_was_set(skb);
751
752	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
753	       "mac=(%d,%d) net=(%d,%d) trans=%d\n"
754	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
755	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
756	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
757	       level, skb->len, headroom, skb_headlen(skb), tailroom,
758	       has_mac ? skb->mac_header : -1,
759	       has_mac ? skb_mac_header_len(skb) : -1,
760	       skb->network_header,
761	       has_trans ? skb_network_header_len(skb) : -1,
762	       has_trans ? skb->transport_header : -1,
763	       sh->tx_flags, sh->nr_frags,
764	       sh->gso_size, sh->gso_type, sh->gso_segs,
765	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
766	       skb->csum_valid, skb->csum_level,
767	       skb->hash, skb->sw_hash, skb->l4_hash,
768	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
769
770	if (dev)
771		printk("%sdev name=%s feat=%pNF\n",
772		       level, dev->name, &dev->features);
773	if (sk)
774		printk("%ssk family=%hu type=%u proto=%u\n",
775		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);
776
777	if (full_pkt && headroom)
778		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
779			       16, 1, skb->head, headroom, false);
780
781	seg_len = min_t(int, skb_headlen(skb), len);
782	if (seg_len)
783		print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
784			       16, 1, skb->data, seg_len, false);
785	len -= seg_len;
786
787	if (full_pkt && tailroom)
788		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
789			       16, 1, skb_tail_pointer(skb), tailroom, false);
790
791	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
792		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
793		u32 p_off, p_len, copied;
794		struct page *p;
795		u8 *vaddr;
796
797		skb_frag_foreach_page(frag, skb_frag_off(frag),
798				      skb_frag_size(frag), p, p_off, p_len,
799				      copied) {
800			seg_len = min_t(int, p_len, len);
801			vaddr = kmap_atomic(p);
802			print_hex_dump(level, "skb frag:     ",
803				       DUMP_PREFIX_OFFSET,
804				       16, 1, vaddr + p_off, seg_len, false);
805			kunmap_atomic(vaddr);
806			len -= seg_len;
807			if (!len)
808				break;
809		}
810	}
811
812	if (full_pkt && skb_has_frag_list(skb)) {
813		printk("skb fraglist:\n");
814		skb_walk_frags(skb, list_skb)
815			skb_dump(level, list_skb, true);
816	}
817}
818EXPORT_SYMBOL(skb_dump);
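
/* Illustrative sketch of a caller honouring the rate-limit requirement noted
 * above; the helper name is hypothetical.
 */
static void __maybe_unused example_dump_suspect_skb(const struct sk_buff *skb)
{
	if (net_ratelimit())
		skb_dump(KERN_WARNING, skb, false);	/* headers only */
}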
819
820/**
821 *	skb_tx_error - report an sk_buff xmit error
822 *	@skb: buffer that triggered an error
823 *
824 *	Report xmit error if a device callback is tracking this skb.
825 *	skb must be freed afterwards.
826 */
827void skb_tx_error(struct sk_buff *skb)
828{
829	skb_zcopy_clear(skb, true);
830}
831EXPORT_SYMBOL(skb_tx_error);
832
833#ifdef CONFIG_TRACEPOINTS
834/**
835 *	consume_skb - free an skbuff
836 *	@skb: buffer to free
837 *
838 *	Drop a ref to the buffer and free it if the usage count has hit zero.
839 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
840 *	is being dropped after a failure and notes that in its tracepoint.
841 */
842void consume_skb(struct sk_buff *skb)
843{
844	if (!skb_unref(skb))
845		return;
846
847	trace_consume_skb(skb);
848	__kfree_skb(skb);
849}
850EXPORT_SYMBOL(consume_skb);
851#endif
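
/* Illustrative sketch of the kfree_skb()/consume_skb() distinction described
 * above: consume_skb() for a normally completed packet, kfree_skb() for a
 * drop, so the respective tracepoints stay meaningful.  The completion
 * handler shown here is hypothetical.
 */
static void __maybe_unused example_tx_complete(struct sk_buff *skb, bool sent_ok)
{
	if (sent_ok)
		consume_skb(skb);	/* successful transmission, not a drop */
	else
		kfree_skb(skb);		/* counted and traced as a drop */
}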
852
853/**
854 *	__consume_stateless_skb - free an skbuff, assuming it is stateless
855 *	@skb: buffer to free
856 *
857 *	Like consume_skb(), but this variant assumes that this is the last
858 *	skb reference and all the head states have been already dropped
859 */
860void __consume_stateless_skb(struct sk_buff *skb)
861{
862	trace_consume_skb(skb);
863	skb_release_data(skb);
864	kfree_skbmem(skb);
865}
866
867void __kfree_skb_flush(void)
868{
869	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
870
871	/* flush skb_cache if it contains any objects */
872	if (nc->skb_count) {
873		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
874				     nc->skb_cache);
875		nc->skb_count = 0;
876	}
877}
878
879static inline void _kfree_skb_defer(struct sk_buff *skb)
880{
881	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
882
883	/* drop skb->head and call any destructors for packet */
884	skb_release_all(skb);
885
886	/* record skb to CPU local list */
887	nc->skb_cache[nc->skb_count++] = skb;
888
889#ifdef CONFIG_SLUB
890	/* SLUB writes into objects when freeing */
891	prefetchw(skb);
892#endif
893
894	/* flush skb_cache if it is filled */
895	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
896		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
897				     nc->skb_cache);
898		nc->skb_count = 0;
899	}
900}
901void __kfree_skb_defer(struct sk_buff *skb)
902{
903	_kfree_skb_defer(skb);
904}
905
906void napi_consume_skb(struct sk_buff *skb, int budget)
907{
908	/* Zero budget indicates a non-NAPI context called us, like netpoll */
909	if (unlikely(!budget)) {
910		dev_consume_skb_any(skb);
911		return;
912	}
913
914	if (!skb_unref(skb))
915		return;
916
917	/* if we reach here, the SKB is ready to be freed */
918	trace_consume_skb(skb);
919
920	/* if SKB is a clone, don't handle this case */
921	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
922		__kfree_skb(skb);
923		return;
924	}
925
926	_kfree_skb_defer(skb);
927}
928EXPORT_SYMBOL(napi_consume_skb);
929
930/* Make sure a field is enclosed inside headers_start/headers_end section */
931#define CHECK_SKB_FIELD(field) \
932	BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
933		     offsetof(struct sk_buff, headers_start));	\
934	BUILD_BUG_ON(offsetof(struct sk_buff, field) >		\
935		     offsetof(struct sk_buff, headers_end));	\
936
937static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
938{
939	new->tstamp		= old->tstamp;
940	/* We do not copy old->sk */
941	new->dev		= old->dev;
942	memcpy(new->cb, old->cb, sizeof(old->cb));
943	skb_dst_copy(new, old);
944	__skb_ext_copy(new, old);
945	__nf_copy(new, old, false);
946
947	/* Note : this field could be in headers_start/headers_end section
948	 * It is not yet because we do not want to have a 16 bit hole
949	 */
950	new->queue_mapping = old->queue_mapping;
951
952	memcpy(&new->headers_start, &old->headers_start,
953	       offsetof(struct sk_buff, headers_end) -
954	       offsetof(struct sk_buff, headers_start));
955	CHECK_SKB_FIELD(protocol);
956	CHECK_SKB_FIELD(csum);
957	CHECK_SKB_FIELD(hash);
958	CHECK_SKB_FIELD(priority);
959	CHECK_SKB_FIELD(skb_iif);
960	CHECK_SKB_FIELD(vlan_proto);
961	CHECK_SKB_FIELD(vlan_tci);
962	CHECK_SKB_FIELD(transport_header);
963	CHECK_SKB_FIELD(network_header);
964	CHECK_SKB_FIELD(mac_header);
965	CHECK_SKB_FIELD(inner_protocol);
966	CHECK_SKB_FIELD(inner_transport_header);
967	CHECK_SKB_FIELD(inner_network_header);
968	CHECK_SKB_FIELD(inner_mac_header);
969	CHECK_SKB_FIELD(mark);
970#ifdef CONFIG_NETWORK_SECMARK
971	CHECK_SKB_FIELD(secmark);
972#endif
973#ifdef CONFIG_NET_RX_BUSY_POLL
974	CHECK_SKB_FIELD(napi_id);
975#endif
976#ifdef CONFIG_XPS
977	CHECK_SKB_FIELD(sender_cpu);
978#endif
979#ifdef CONFIG_NET_SCHED
980	CHECK_SKB_FIELD(tc_index);
981#endif
982
983}
984
985/*
986 * You should not add any new code to this function.  Add it to
987 * __copy_skb_header above instead.
988 */
989static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
990{
991#define C(x) n->x = skb->x
992
993	n->next = n->prev = NULL;
994	n->sk = NULL;
995	__copy_skb_header(n, skb);
996
997	C(len);
998	C(data_len);
999	C(mac_len);
1000	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
1001	n->cloned = 1;
1002	n->nohdr = 0;
1003	n->peeked = 0;
1004	C(pfmemalloc);
1005	n->destructor = NULL;
1006	C(tail);
1007	C(end);
1008	C(head);
1009	C(head_frag);
1010	C(data);
1011	C(truesize);
1012	refcount_set(&n->users, 1);
1013
1014	atomic_inc(&(skb_shinfo(skb)->dataref));
1015	skb->cloned = 1;
1016
1017	return n;
1018#undef C
1019}
1020
1021/**
1022 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
1023 * @first: first sk_buff of the msg
1024 */
1025struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
1026{
1027	struct sk_buff *n;
1028
1029	n = alloc_skb(0, GFP_ATOMIC);
1030	if (!n)
1031		return NULL;
1032
1033	n->len = first->len;
1034	n->data_len = first->len;
1035	n->truesize = first->truesize;
1036
1037	skb_shinfo(n)->frag_list = first;
1038
1039	__copy_skb_header(n, first);
1040	n->destructor = NULL;
1041
1042	return n;
1043}
1044EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
1045
1046/**
1047 *	skb_morph	-	morph one skb into another
1048 *	@dst: the skb to receive the contents
1049 *	@src: the skb to supply the contents
1050 *
1051 *	This is identical to skb_clone except that the target skb is
1052 *	supplied by the user.
1053 *
1054 *	The target skb is returned upon exit.
1055 */
1056struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
1057{
1058	skb_release_all(dst);
1059	return __skb_clone(dst, src);
1060}
1061EXPORT_SYMBOL_GPL(skb_morph);
1062
1063int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
1064{
1065	unsigned long max_pg, num_pg, new_pg, old_pg;
1066	struct user_struct *user;
1067
1068	if (capable(CAP_IPC_LOCK) || !size)
1069		return 0;
1070
1071	num_pg = (size >> PAGE_SHIFT) + 2;	/* worst case */
1072	max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1073	user = mmp->user ? : current_user();
1074
1075	do {
1076		old_pg = atomic_long_read(&user->locked_vm);
1077		new_pg = old_pg + num_pg;
1078		if (new_pg > max_pg)
1079			return -ENOBUFS;
1080	} while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
1081		 old_pg);
1082
1083	if (!mmp->user) {
1084		mmp->user = get_uid(user);
1085		mmp->num_pg = num_pg;
1086	} else {
1087		mmp->num_pg += num_pg;
1088	}
1089
1090	return 0;
1091}
1092EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
1093
1094void mm_unaccount_pinned_pages(struct mmpin *mmp)
1095{
1096	if (mmp->user) {
1097		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
1098		free_uid(mmp->user);
1099	}
1100}
1101EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
1102
1103struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
1104{
1105	struct ubuf_info *uarg;
1106	struct sk_buff *skb;
1107
1108	WARN_ON_ONCE(!in_task());
1109
1110	skb = sock_omalloc(sk, 0, GFP_KERNEL);
1111	if (!skb)
1112		return NULL;
1113
1114	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
1115	uarg = (void *)skb->cb;
1116	uarg->mmp.user = NULL;
1117
1118	if (mm_account_pinned_pages(&uarg->mmp, size)) {
1119		kfree_skb(skb);
1120		return NULL;
1121	}
1122
1123	uarg->callback = sock_zerocopy_callback;
1124	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
1125	uarg->len = 1;
1126	uarg->bytelen = size;
1127	uarg->zerocopy = 1;
1128	refcount_set(&uarg->refcnt, 1);
1129	sock_hold(sk);
1130
1131	return uarg;
1132}
1133EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
1134
1135static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
1136{
1137	return container_of((void *)uarg, struct sk_buff, cb);
1138}
1139
1140struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
1141					struct ubuf_info *uarg)
1142{
1143	if (uarg) {
1144		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
1145		u32 bytelen, next;
1146
1147		/* realloc only when socket is locked (TCP, UDP cork),
1148		 * so uarg->len and sk_zckey access is serialized
1149		 */
1150		if (!sock_owned_by_user(sk)) {
1151			WARN_ON_ONCE(1);
1152			return NULL;
1153		}
1154
1155		bytelen = uarg->bytelen + size;
1156		if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
1157			/* TCP can create new skb to attach new uarg */
1158			if (sk->sk_type == SOCK_STREAM)
1159				goto new_alloc;
1160			return NULL;
1161		}
1162
1163		next = (u32)atomic_read(&sk->sk_zckey);
1164		if ((u32)(uarg->id + uarg->len) == next) {
1165			if (mm_account_pinned_pages(&uarg->mmp, size))
1166				return NULL;
1167			uarg->len++;
1168			uarg->bytelen = bytelen;
1169			atomic_set(&sk->sk_zckey, ++next);
1170
1171			/* no extra ref when appending to datagram (MSG_MORE) */
1172			if (sk->sk_type == SOCK_STREAM)
1173				sock_zerocopy_get(uarg);
1174
1175			return uarg;
1176		}
1177	}
1178
1179new_alloc:
1180	return sock_zerocopy_alloc(sk, size);
1181}
1182EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
1183
1184static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
1185{
1186	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
1187	u32 old_lo, old_hi;
1188	u64 sum_len;
1189
1190	old_lo = serr->ee.ee_info;
1191	old_hi = serr->ee.ee_data;
1192	sum_len = old_hi - old_lo + 1ULL + len;
1193
1194	if (sum_len >= (1ULL << 32))
1195		return false;
1196
1197	if (lo != old_hi + 1)
1198		return false;
1199
1200	serr->ee.ee_data += len;
1201	return true;
1202}
1203
1204void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
1205{
1206	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
1207	struct sock_exterr_skb *serr;
1208	struct sock *sk = skb->sk;
1209	struct sk_buff_head *q;
1210	unsigned long flags;
1211	u32 lo, hi;
1212	u16 len;
1213
1214	mm_unaccount_pinned_pages(&uarg->mmp);
1215
1216	/* if !len, there was only 1 call, and it was aborted
1217	 * so do not queue a completion notification
1218	 */
1219	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
1220		goto release;
1221
1222	len = uarg->len;
1223	lo = uarg->id;
1224	hi = uarg->id + len - 1;
1225
1226	serr = SKB_EXT_ERR(skb);
1227	memset(serr, 0, sizeof(*serr));
1228	serr->ee.ee_errno = 0;
1229	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
1230	serr->ee.ee_data = hi;
1231	serr->ee.ee_info = lo;
1232	if (!success)
1233		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
1234
1235	q = &sk->sk_error_queue;
1236	spin_lock_irqsave(&q->lock, flags);
1237	tail = skb_peek_tail(q);
1238	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
1239	    !skb_zerocopy_notify_extend(tail, lo, len)) {
1240		__skb_queue_tail(q, skb);
1241		skb = NULL;
1242	}
1243	spin_unlock_irqrestore(&q->lock, flags);
1244
1245	sk->sk_error_report(sk);
1246
1247release:
1248	consume_skb(skb);
1249	sock_put(sk);
1250}
1251EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
1252
1253void sock_zerocopy_put(struct ubuf_info *uarg)
1254{
1255	if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
1256		if (uarg->callback)
1257			uarg->callback(uarg, uarg->zerocopy);
1258		else
1259			consume_skb(skb_from_uarg(uarg));
1260	}
1261}
1262EXPORT_SYMBOL_GPL(sock_zerocopy_put);
1263
1264void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
1265{
1266	if (uarg) {
1267		struct sock *sk = skb_from_uarg(uarg)->sk;
1268
1269		atomic_dec(&sk->sk_zckey);
1270		uarg->len--;
1271
1272		if (have_uref)
1273			sock_zerocopy_put(uarg);
1274	}
1275}
1276EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
1277
1278int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
1279{
1280	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
1281}
1282EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
1283
1284int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
1285			     struct msghdr *msg, int len,
1286			     struct ubuf_info *uarg)
1287{
1288	struct ubuf_info *orig_uarg = skb_zcopy(skb);
1289	struct iov_iter orig_iter = msg->msg_iter;
1290	int err, orig_len = skb->len;
1291
1292	/* An skb can only point to one uarg. This edge case happens when
1293	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
1294	 */
1295	if (orig_uarg && uarg != orig_uarg)
1296		return -EEXIST;
1297
1298	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
1299	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
1300		struct sock *save_sk = skb->sk;
1301
1302		/* Streams do not free skb on error. Reset to prev state. */
1303		msg->msg_iter = orig_iter;
1304		skb->sk = sk;
1305		___pskb_trim(skb, orig_len);
1306		skb->sk = save_sk;
1307		return err;
1308	}
1309
1310	skb_zcopy_set(skb, uarg, NULL);
1311	return skb->len - orig_len;
1312}
1313EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
1314
1315static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
1316			      gfp_t gfp_mask)
1317{
1318	if (skb_zcopy(orig)) {
1319		if (skb_zcopy(nskb)) {
1320			/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
1321			if (!gfp_mask) {
1322				WARN_ON_ONCE(1);
1323				return -ENOMEM;
1324			}
1325			if (skb_uarg(nskb) == skb_uarg(orig))
1326				return 0;
1327			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
1328				return -EIO;
1329		}
1330		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
1331	}
1332	return 0;
1333}
1334
1335/**
1336 *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
1337 *	@skb: the skb to modify
1338 *	@gfp_mask: allocation priority
1339 *
1340 *	This must be called on an SKBTX_DEV_ZEROCOPY skb.  It will copy all
1341 *	frags into kernel memory and drop the references to the userspace
1342 *	pages.
1343 *
1344 *	If this function is called from an interrupt, @gfp_mask must be
1345 *	%GFP_ATOMIC.
1346 *
1347 *	Returns 0 on success or a negative error code on failure
1348 *	to allocate kernel memory to copy to.
1349 */
1350int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
1351{
1352	int num_frags = skb_shinfo(skb)->nr_frags;
1353	struct page *page, *head = NULL;
1354	int i, new_frags;
1355	u32 d_off;
1356
1357	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
1358		return -EINVAL;
1359
1360	if (!num_frags)
1361		goto release;
1362
1363	new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1364	for (i = 0; i < new_frags; i++) {
1365		page = alloc_page(gfp_mask);
1366		if (!page) {
1367			while (head) {
1368				struct page *next = (struct page *)page_private(head);
1369				put_page(head);
1370				head = next;
1371			}
1372			return -ENOMEM;
1373		}
1374		set_page_private(page, (unsigned long)head);
1375		head = page;
1376	}
1377
1378	page = head;
1379	d_off = 0;
1380	for (i = 0; i < num_frags; i++) {
1381		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
1382		u32 p_off, p_len, copied;
1383		struct page *p;
1384		u8 *vaddr;
1385
1386		skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
1387				      p, p_off, p_len, copied) {
1388			u32 copy, done = 0;
1389			vaddr = kmap_atomic(p);
1390
1391			while (done < p_len) {
1392				if (d_off == PAGE_SIZE) {
1393					d_off = 0;
1394					page = (struct page *)page_private(page);
1395				}
1396				copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
1397				memcpy(page_address(page) + d_off,
1398				       vaddr + p_off + done, copy);
1399				done += copy;
1400				d_off += copy;
1401			}
1402			kunmap_atomic(vaddr);
1403		}
1404	}
1405
1406	/* skb frags release userspace buffers */
1407	for (i = 0; i < num_frags; i++)
1408		skb_frag_unref(skb, i);
1409
1410	/* skb frags point to kernel buffers */
1411	for (i = 0; i < new_frags - 1; i++) {
1412		__skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
1413		head = (struct page *)page_private(head);
1414	}
1415	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
1416	skb_shinfo(skb)->nr_frags = new_frags;
1417
1418release:
1419	skb_zcopy_clear(skb, false);
1420	return 0;
1421}
1422EXPORT_SYMBOL_GPL(skb_copy_ubufs);
1423
1424/**
1425 *	skb_clone	-	duplicate an sk_buff
1426 *	@skb: buffer to clone
1427 *	@gfp_mask: allocation priority
1428 *
1429 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
1430 *	copies share the same packet data but not structure. The new
1431 *	buffer has a reference count of 1. If the allocation fails the
1432 *	function returns %NULL otherwise the new buffer is returned.
1433 *
1434 *	If this function is called from an interrupt, @gfp_mask must be
1435 *	%GFP_ATOMIC.
1436 */
1437
1438struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
1439{
1440	struct sk_buff_fclones *fclones = container_of(skb,
1441						       struct sk_buff_fclones,
1442						       skb1);
1443	struct sk_buff *n;
1444
1445	if (skb_orphan_frags(skb, gfp_mask))
1446		return NULL;
1447
1448	if (skb->fclone == SKB_FCLONE_ORIG &&
1449	    refcount_read(&fclones->fclone_ref) == 1) {
1450		n = &fclones->skb2;
1451		refcount_set(&fclones->fclone_ref, 2);
1452	} else {
1453		if (skb_pfmemalloc(skb))
1454			gfp_mask |= __GFP_MEMALLOC;
1455
1456		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
1457		if (!n)
1458			return NULL;
1459
1460		n->fclone = SKB_FCLONE_UNAVAILABLE;
1461	}
1462
1463	return __skb_clone(n, skb);
1464}
1465EXPORT_SYMBOL(skb_clone);
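
/* Illustrative sketch: keep a clone while handing the original to another
 * consumer.  Both skbs share the payload, so neither side may modify the data
 * without first making it private (e.g. via pskb_expand_head()).  The helper
 * name is hypothetical.
 */
static struct sk_buff * __maybe_unused example_keep_reference_copy(struct sk_buff *skb)
{
	struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

	/* the caller transmits @skb and holds @clone for possible retransmit */
	return clone;
}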
1466
1467void skb_headers_offset_update(struct sk_buff *skb, int off)
1468{
1469	/* Only adjust this if it actually is csum_start rather than csum */
1470	if (skb->ip_summed == CHECKSUM_PARTIAL)
1471		skb->csum_start += off;
1472	/* {transport,network,mac}_header and tail are relative to skb->head */
1473	skb->transport_header += off;
1474	skb->network_header   += off;
1475	if (skb_mac_header_was_set(skb))
1476		skb->mac_header += off;
1477	skb->inner_transport_header += off;
1478	skb->inner_network_header += off;
1479	skb->inner_mac_header += off;
1480}
1481EXPORT_SYMBOL(skb_headers_offset_update);
1482
1483void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
1484{
1485	__copy_skb_header(new, old);
1486
1487	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
1488	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
1489	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
1490}
1491EXPORT_SYMBOL(skb_copy_header);
1492
1493static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
1494{
1495	if (skb_pfmemalloc(skb))
1496		return SKB_ALLOC_RX;
1497	return 0;
1498}
1499
1500/**
1501 *	skb_copy	-	create private copy of an sk_buff
1502 *	@skb: buffer to copy
1503 *	@gfp_mask: allocation priority
1504 *
1505 *	Make a copy of both an &sk_buff and its data. This is used when the
1506 *	caller wishes to modify the data and needs a private copy of the
1507 *	data to alter. Returns %NULL on failure or the pointer to the buffer
1508 *	on success. The returned buffer has a reference count of 1.
1509 *
1510 *	As a by-product, this function converts a non-linear &sk_buff into a
1511 *	linear one, so that the &sk_buff becomes completely private and the
1512 *	caller is allowed to modify all the data of the returned buffer. This
1513 *	means that this function is not recommended for use in circumstances
1514 *	when only the header is going to be modified. Use pskb_copy() instead.
1515 */
1516
1517struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
1518{
1519	struct sk_buff *n;
1520	unsigned int size;
1521	int headerlen;
1522
1523	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
1524		return NULL;
1525
1526	headerlen = skb_headroom(skb);
1527	size = skb_end_offset(skb) + skb->data_len;
1528	n = __alloc_skb(size, gfp_mask,
1529			skb_alloc_rx_flag(skb), NUMA_NO_NODE);
1530	if (!n)
1531		return NULL;
1532
1533	/* Set the data pointer */
1534	skb_reserve(n, headerlen);
1535	/* Set the tail pointer and length */
1536	skb_put(n, skb->len);
1537
1538	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
1539
1540	skb_copy_header(n, skb);
1541	return n;
1542}
1543EXPORT_SYMBOL(skb_copy);
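
/* Illustrative sketch of the choice spelled out above: skb_copy() only when
 * payload bytes must be rewritten; when just the headers will change,
 * pskb_copy() keeps the (possibly large) fragments shared.  The
 * @rewrite_payload flag is hypothetical.
 */
static struct sk_buff * __maybe_unused example_private_copy(struct sk_buff *skb,
							    bool rewrite_payload)
{
	if (rewrite_payload)
		return skb_copy(skb, GFP_ATOMIC);	/* linearizes everything */

	return pskb_copy(skb, GFP_ATOMIC);		/* private head, shared frags */
}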
1544
1545/**
1546 *	__pskb_copy_fclone	-  create copy of an sk_buff with private head.
1547 *	@skb: buffer to copy
1548 *	@headroom: headroom of new skb
1549 *	@gfp_mask: allocation priority
1550 *	@fclone: if true allocate the copy of the skb from the fclone
1551 *	cache instead of the head cache; it is recommended to set this
1552 *	to true for the cases where the copy will likely be cloned
1553 *
1554 *	Make a copy of both an &sk_buff and part of its data, located
1555 *	in header. Fragmented data remain shared. This is used when
1556 *	the caller wishes to modify only header of &sk_buff and needs
1557 *	private copy of the header to alter. Returns %NULL on failure
1558 *	or the pointer to the buffer on success.
1559 *	The returned buffer has a reference count of 1.
1560 */
1561
1562struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
1563				   gfp_t gfp_mask, bool fclone)
1564{
1565	unsigned int size = skb_headlen(skb) + headroom;
1566	int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
1567	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
1568
1569	if (!n)
1570		goto out;
1571
1572	/* Set the data pointer */
1573	skb_reserve(n, headroom);
1574	/* Set the tail pointer and length */
1575	skb_put(n, skb_headlen(skb));
1576	/* Copy the bytes */
1577	skb_copy_from_linear_data(skb, n->data, n->len);
1578
1579	n->truesize += skb->data_len;
1580	n->data_len  = skb->data_len;
1581	n->len	     = skb->len;
1582
1583	if (skb_shinfo(skb)->nr_frags) {
1584		int i;
1585
1586		if (skb_orphan_frags(skb, gfp_mask) ||
1587		    skb_zerocopy_clone(n, skb, gfp_mask)) {
1588			kfree_skb(n);
1589			n = NULL;
1590			goto out;
1591		}
1592		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1593			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
1594			skb_frag_ref(skb, i);
1595		}
1596		skb_shinfo(n)->nr_frags = i;
1597	}
1598
1599	if (skb_has_frag_list(skb)) {
1600		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
1601		skb_clone_fraglist(n);
1602	}
1603
1604	skb_copy_header(n, skb);
1605out:
1606	return n;
1607}
1608EXPORT_SYMBOL(__pskb_copy_fclone);
1609
1610/**
1611 *	pskb_expand_head - reallocate header of &sk_buff
1612 *	@skb: buffer to reallocate
1613 *	@nhead: room to add at head
1614 *	@ntail: room to add at tail
1615 *	@gfp_mask: allocation priority
1616 *
1617 *	Expands (or creates an identical copy, if @nhead and @ntail are zero)
1618 *	the header of @skb. The &sk_buff itself is not changed and MUST have a
1619 *	reference count of 1. Returns zero on success or a negative error code
1620 *	if expansion failed; in the latter case, the &sk_buff is not changed.
1621 *
1622 *	All the pointers pointing into skb header may change and must be
1623 *	reloaded after call to this function.
1624 */
1625
1626int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
1627		     gfp_t gfp_mask)
1628{
1629	int i, osize = skb_end_offset(skb);
1630	int size = osize + nhead + ntail;
1631	long off;
1632	u8 *data;
1633
1634	BUG_ON(nhead < 0);
1635
1636	BUG_ON(skb_shared(skb));
1637
1638	size = SKB_DATA_ALIGN(size);
1639
1640	if (skb_pfmemalloc(skb))
1641		gfp_mask |= __GFP_MEMALLOC;
1642	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
1643			       gfp_mask, NUMA_NO_NODE, NULL);
1644	if (!data)
1645		goto nodata;
1646	size = SKB_WITH_OVERHEAD(ksize(data));
1647
1648	/* Copy only real data... and, alas, header. This should be
1649	 * optimized for the cases when header is void.
1650	 */
1651	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
1652
1653	memcpy((struct skb_shared_info *)(data + size),
1654	       skb_shinfo(skb),
1655	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
1656
1657	/*
1658	 * if shinfo is shared we must drop the old head gracefully, but if it
1659	 * is not we can just drop the old head and let the existing refcount
1660	 * be since all we did is relocate the values
1661	 */
1662	if (skb_cloned(skb)) {
1663		if (skb_orphan_frags(skb, gfp_mask))
1664			goto nofrags;
1665		if (skb_zcopy(skb))
1666			refcount_inc(&skb_uarg(skb)->refcnt);
1667		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1668			skb_frag_ref(skb, i);
1669
1670		if (skb_has_frag_list(skb))
1671			skb_clone_fraglist(skb);
1672
1673		skb_release_data(skb);
1674	} else {
1675		skb_free_head(skb);
1676	}
1677	off = (data + nhead) - skb->head;
1678
1679	skb->head     = data;
1680	skb->head_frag = 0;
1681	skb->data    += off;
1682#ifdef NET_SKBUFF_DATA_USES_OFFSET
1683	skb->end      = size;
1684	off           = nhead;
1685#else
1686	skb->end      = skb->head + size;
1687#endif
1688	skb->tail	      += off;
1689	skb_headers_offset_update(skb, nhead);
1690	skb->cloned   = 0;
1691	skb->hdr_len  = 0;
1692	skb->nohdr    = 0;
1693	atomic_set(&skb_shinfo(skb)->dataref, 1);
1694
1695	skb_metadata_clear(skb);
1696
1697	/* It is not generally safe to change skb->truesize.
1698	 * For the moment, we only really care about the rx path, or the case
1699	 * when the skb is orphaned (not attached to a socket).
1700	 */
1701	if (!skb->sk || skb->destructor == sock_edemux)
1702		skb->truesize += size - osize;
1703
1704	return 0;
1705
1706nofrags:
1707	kfree(data);
1708nodata:
1709	return -ENOMEM;
1710}
1711EXPORT_SYMBOL(pskb_expand_head);
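
/* Illustrative sketch, close to what skb_cow()/skb_cow_head() do: make sure
 * @skb has at least @headroom bytes of headroom and a private, writable head,
 * reallocating via pskb_expand_head() when needed.  Every pointer into
 * skb->head must be re-derived after a successful call.
 */
static int __maybe_unused example_ensure_writable_headroom(struct sk_buff *skb,
							    unsigned int headroom)
{
	unsigned int delta = 0;

	if (headroom > skb_headroom(skb))
		delta = headroom - skb_headroom(skb);

	if (!delta && !skb_cloned(skb))
		return 0;

	return pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC);
}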
1712
1713/* Make private copy of skb with writable head and some headroom */
1714
1715struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
1716{
1717	struct sk_buff *skb2;
1718	int delta = headroom - skb_headroom(skb);
1719
1720	if (delta <= 0)
1721		skb2 = pskb_copy(skb, GFP_ATOMIC);
1722	else {
1723		skb2 = skb_clone(skb, GFP_ATOMIC);
1724		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
1725					     GFP_ATOMIC)) {
1726			kfree_skb(skb2);
1727			skb2 = NULL;
1728		}
1729	}
1730	return skb2;
1731}
1732EXPORT_SYMBOL(skb_realloc_headroom);
1733
1734/**
1735 *	skb_copy_expand	-	copy and expand sk_buff
1736 *	@skb: buffer to copy
1737 *	@newheadroom: new free bytes at head
1738 *	@newtailroom: new free bytes at tail
1739 *	@gfp_mask: allocation priority
1740 *
1741 *	Make a copy of both an &sk_buff and its data and while doing so
1742 *	allocate additional space.
1743 *
1744 *	This is used when the caller wishes to modify the data and needs a
1745 *	private copy of the data to alter as well as more space for new fields.
1746 *	Returns %NULL on failure or the pointer to the buffer
1747 *	on success. The returned buffer has a reference count of 1.
1748 *
1749 *	You must pass %GFP_ATOMIC as the allocation priority if this function
1750 *	is called from an interrupt.
1751 */
1752struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
1753				int newheadroom, int newtailroom,
1754				gfp_t gfp_mask)
1755{
1756	/*
1757	 *	Allocate the copy buffer
1758	 */
1759	int head_copy_len, head_copy_off;
1760	struct sk_buff *n;
1761	int oldheadroom;
1762
1763	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
1764		return NULL;
1765
1766	oldheadroom = skb_headroom(skb);
1767	n = __alloc_skb(newheadroom + skb->len + newtailroom,
1768			gfp_mask, skb_alloc_rx_flag(skb),
1769			NUMA_NO_NODE);
1770	if (!n)
1771		return NULL;
1772
1773	skb_reserve(n, newheadroom);
1774
1775	/* Set the tail pointer and length */
1776	skb_put(n, skb->len);
1777
1778	head_copy_len = oldheadroom;
1779	head_copy_off = 0;
1780	if (newheadroom <= head_copy_len)
1781		head_copy_len = newheadroom;
1782	else
1783		head_copy_off = newheadroom - head_copy_len;
1784
1785	/* Copy the linear header and data. */
1786	BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
1787			     skb->len + head_copy_len));
1788
1789	skb_copy_header(n, skb);
1790
1791	skb_headers_offset_update(n, newheadroom - oldheadroom);
1792
1793	return n;
1794}
1795EXPORT_SYMBOL(skb_copy_expand);
1796
1797/**
1798 *	__skb_pad		-	zero pad the tail of an skb
1799 *	@skb: buffer to pad
1800 *	@pad: space to pad
1801 *	@free_on_error: free buffer on error
1802 *
1803 *	Ensure that a buffer is followed by a padding area that is zero
1804 *	filled. Used by network drivers which may DMA or transfer data
1805 *	beyond the buffer end onto the wire.
1806 *
1807 *	May return error in out of memory cases. The skb is freed on error
1808 *	if @free_on_error is true.
1809 */
1810
1811int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
1812{
1813	int err;
1814	int ntail;
1815
1816	/* If the skbuff is non-linear, tailroom is always zero. */
1817	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
1818		memset(skb->data+skb->len, 0, pad);
1819		return 0;
1820	}
1821
1822	ntail = skb->data_len + pad - (skb->end - skb->tail);
1823	if (likely(skb_cloned(skb) || ntail > 0)) {
1824		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
1825		if (unlikely(err))
1826			goto free_skb;
1827	}
1828
1829	/* FIXME: The use of this function with non-linear skb's really needs
1830	 * to be audited.
1831	 */
1832	err = skb_linearize(skb);
1833	if (unlikely(err))
1834		goto free_skb;
1835
1836	memset(skb->data + skb->len, 0, pad);
1837	return 0;
1838
1839free_skb:
1840	if (free_on_error)
1841		kfree_skb(skb);
1842	return err;
1843}
1844EXPORT_SYMBOL(__skb_pad);
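
/* Illustrative sketch: Ethernet drivers commonly pad short frames to the
 * 60-byte minimum before DMA.  skb_put_padto() builds on __skb_pad() and
 * frees the skb on failure, so the caller only checks the return value.
 */
static int __maybe_unused example_pad_short_frame(struct sk_buff *skb)
{
	return skb_put_padto(skb, 60);	/* 60 == ETH_ZLEN; skb freed on error */
}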
1845
1846/**
1847 *	pskb_put - add data to the tail of a potentially fragmented buffer
1848 *	@skb: start of the buffer to use
1849 *	@tail: tail fragment of the buffer to use
1850 *	@len: amount of data to add
1851 *
1852 *	This function extends the used data area of the potentially
1853 *	fragmented buffer. @tail must be the last fragment of @skb -- or
1854 *	@skb itself. If this would exceed the total buffer size the kernel
1855 *	will panic. A pointer to the first byte of the extra data is
1856 *	returned.
1857 */
1858
1859void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
1860{
1861	if (tail != skb) {
1862		skb->data_len += len;
1863		skb->len += len;
1864	}
1865	return skb_put(tail, len);
1866}
1867EXPORT_SYMBOL_GPL(pskb_put);
1868
1869/**
1870 *	skb_put - add data to a buffer
1871 *	@skb: buffer to use
1872 *	@len: amount of data to add
1873 *
1874 *	This function extends the used data area of the buffer. If this would
1875 *	exceed the total buffer size the kernel will panic. A pointer to the
1876 *	first byte of the extra data is returned.
1877 */
1878void *skb_put(struct sk_buff *skb, unsigned int len)
1879{
1880	void *tmp = skb_tail_pointer(skb);
1881	SKB_LINEAR_ASSERT(skb);
1882	skb->tail += len;
1883	skb->len  += len;
1884	if (unlikely(skb->tail > skb->end))
1885		skb_over_panic(skb, len, __builtin_return_address(0));
1886	return tmp;
1887}
1888EXPORT_SYMBOL(skb_put);
1889
1890/**
1891 *	skb_push - add data to the start of a buffer
1892 *	@skb: buffer to use
1893 *	@len: amount of data to add
1894 *
1895 *	This function extends the used data area of the buffer at the buffer
1896 *	start. If this would exceed the total buffer headroom the kernel will
1897 *	panic. A pointer to the first byte of the extra data is returned.
1898 */
1899void *skb_push(struct sk_buff *skb, unsigned int len)
1900{
1901	skb->data -= len;
1902	skb->len  += len;
1903	if (unlikely(skb->data < skb->head))
1904		skb_under_panic(skb, len, __builtin_return_address(0));
1905	return skb->data;
1906}
1907EXPORT_SYMBOL(skb_push);
1908
1909/**
1910 *	skb_pull - remove data from the start of a buffer
1911 *	@skb: buffer to use
1912 *	@len: amount of data to remove
1913 *
1914 *	This function removes data from the start of a buffer, returning
1915 *	the memory to the headroom. A pointer to the next data in the buffer
1916 *	is returned. Once the data has been pulled future pushes will overwrite
1917 *	the old data.
1918 */
1919void *skb_pull(struct sk_buff *skb, unsigned int len)
1920{
1921	return skb_pull_inline(skb, len);
1922}
1923EXPORT_SYMBOL(skb_pull);
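/* Example (illustrative sketch, not from this file): on receive, a parsed
 * header is typically stripped with skb_pull(); for possibly non-linear data
 * pskb_may_pull() must succeed first:
 *
 *	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
 *		goto drop;
 *	uh = (struct udphdr *)skb->data;
 *	// ... validate the header ...
 *	skb_pull(skb, sizeof(struct udphdr));	// advance to the payload
 */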
1924
1925/**
1926 *	skb_trim - remove end from a buffer
1927 *	@skb: buffer to alter
1928 *	@len: new length
1929 *
1930 *	Cut the length of a buffer down by removing data from the tail. If
1931 *	the buffer is already under the length specified it is not modified.
1932 *	The skb must be linear.
1933 */
1934void skb_trim(struct sk_buff *skb, unsigned int len)
1935{
1936	if (skb->len > len)
1937		__skb_trim(skb, len);
1938}
1939EXPORT_SYMBOL(skb_trim);
1940
1941/* Trims skb to length len. It can change skb pointers.
1942 */
1943
1944int ___pskb_trim(struct sk_buff *skb, unsigned int len)
1945{
1946	struct sk_buff **fragp;
1947	struct sk_buff *frag;
1948	int offset = skb_headlen(skb);
1949	int nfrags = skb_shinfo(skb)->nr_frags;
1950	int i;
1951	int err;
1952
1953	if (skb_cloned(skb) &&
1954	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
1955		return err;
1956
1957	i = 0;
1958	if (offset >= len)
1959		goto drop_pages;
1960
1961	for (; i < nfrags; i++) {
1962		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1963
1964		if (end < len) {
1965			offset = end;
1966			continue;
1967		}
1968
1969		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
1970
1971drop_pages:
1972		skb_shinfo(skb)->nr_frags = i;
1973
1974		for (; i < nfrags; i++)
1975			skb_frag_unref(skb, i);
1976
1977		if (skb_has_frag_list(skb))
1978			skb_drop_fraglist(skb);
1979		goto done;
1980	}
1981
1982	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
1983	     fragp = &frag->next) {
1984		int end = offset + frag->len;
1985
1986		if (skb_shared(frag)) {
1987			struct sk_buff *nfrag;
1988
1989			nfrag = skb_clone(frag, GFP_ATOMIC);
1990			if (unlikely(!nfrag))
1991				return -ENOMEM;
1992
1993			nfrag->next = frag->next;
1994			consume_skb(frag);
1995			frag = nfrag;
1996			*fragp = frag;
1997		}
1998
1999		if (end < len) {
2000			offset = end;
2001			continue;
2002		}
2003
2004		if (end > len &&
2005		    unlikely((err = pskb_trim(frag, len - offset))))
2006			return err;
2007
2008		if (frag->next)
2009			skb_drop_list(&frag->next);
2010		break;
2011	}
2012
2013done:
2014	if (len > skb_headlen(skb)) {
2015		skb->data_len -= skb->len - len;
2016		skb->len       = len;
2017	} else {
2018		skb->len       = len;
2019		skb->data_len  = 0;
2020		skb_set_tail_pointer(skb, len);
2021	}
2022
2023	if (!skb->sk || skb->destructor == sock_edemux)
2024		skb_condense(skb);
2025	return 0;
2026}
2027EXPORT_SYMBOL(___pskb_trim);
2028
2029/* Note: use pskb_trim_rcsum() instead of calling this directly
2030 */
2031int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
2032{
2033	if (skb->ip_summed == CHECKSUM_COMPLETE) {
2034		int delta = skb->len - len;
2035
2036		skb->csum = csum_block_sub(skb->csum,
2037					   skb_checksum(skb, len, delta, 0),
2038					   len);
2039	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
2040		int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
2041		int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
2042
2043		if (offset + sizeof(__sum16) > hdlen)
2044			return -EINVAL;
2045	}
2046	return __pskb_trim(skb, len);
2047}
2048EXPORT_SYMBOL(pskb_trim_rcsum_slow);
2049
2050/**
2051 *	__pskb_pull_tail - advance tail of skb header
2052 *	@skb: buffer to reallocate
2053 *	@delta: number of bytes to advance tail
2054 *
2055 *	The function only makes sense on a fragmented &sk_buff;
2056 *	it expands the header, moving its tail forward and copying necessary
2057 *	data from the fragmented part.
2058 *
2059 *	&sk_buff MUST have reference count of 1.
2060 *
2061 *	Returns %NULL (and the &sk_buff does not change) if the pull failed,
2062 *	or the value of the new tail of the skb on success.
2063 *
2064 *	All the pointers pointing into skb header may change and must be
2065 *	reloaded after call to this function.
2066 */
2067
2068/* Moves tail of skb head forward, copying data from fragmented part,
2069 * when it is necessary.
2070 * 1. It may fail due to malloc failure.
2071 * 2. It may change skb pointers.
2072 *
2073 * It is pretty complicated. Luckily, it is called only in exceptional cases.
2074 */
2075void *__pskb_pull_tail(struct sk_buff *skb, int delta)
2076{
2077	/* If skb does not have enough free space at the tail, get a new one
2078	 * plus 128 bytes for future expansions. If we have enough
2079	 * room at the tail, reallocate without expansion only if skb is cloned.
2080	 */
2081	int i, k, eat = (skb->tail + delta) - skb->end;
2082
2083	if (eat > 0 || skb_cloned(skb)) {
2084		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
2085				     GFP_ATOMIC))
2086			return NULL;
2087	}
2088
2089	BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
2090			     skb_tail_pointer(skb), delta));
2091
2092	/* Optimization: no fragments, no reason to pre-estimate
2093	 * the size of pulled pages. Superb.
2094	 */
2095	if (!skb_has_frag_list(skb))
2096		goto pull_pages;
2097
2098	/* Estimate size of pulled pages. */
2099	eat = delta;
2100	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2101		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2102
2103		if (size >= eat)
2104			goto pull_pages;
2105		eat -= size;
2106	}
2107
2108	/* If we need to update the frag list, we are in trouble.
2109	 * Certainly, it is possible to add an offset to the skb data,
2110	 * but taking into account that pulling is expected to
2111	 * be a very rare operation, it is worth fighting against
2112	 * further bloating of the skb head and crucifying ourselves here instead.
2113	 * Pure masochism, indeed. 8)8)
2114	 */
2115	if (eat) {
2116		struct sk_buff *list = skb_shinfo(skb)->frag_list;
2117		struct sk_buff *clone = NULL;
2118		struct sk_buff *insp = NULL;
2119
2120		do {
2121			if (list->len <= eat) {
2122				/* Eaten as whole. */
2123				eat -= list->len;
2124				list = list->next;
2125				insp = list;
2126			} else {
2127				/* Eaten partially. */
2128				if (skb_is_gso(skb) && !list->head_frag &&
2129				    skb_headlen(list))
2130					skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2131
2132				if (skb_shared(list)) {
2133					/* Sucks! We need to fork list. :-( */
2134					clone = skb_clone(list, GFP_ATOMIC);
2135					if (!clone)
2136						return NULL;
2137					insp = list->next;
2138					list = clone;
2139				} else {
2140					/* This may be pulled without
2141					 * problems. */
2142					insp = list;
2143				}
2144				if (!pskb_pull(list, eat)) {
2145					kfree_skb(clone);
2146					return NULL;
2147				}
2148				break;
2149			}
2150		} while (eat);
2151
2152		/* Free pulled out fragments. */
2153		while ((list = skb_shinfo(skb)->frag_list) != insp) {
2154			skb_shinfo(skb)->frag_list = list->next;
2155			consume_skb(list);
2156		}
2157		/* And insert new clone at head. */
2158		if (clone) {
2159			clone->next = list;
2160			skb_shinfo(skb)->frag_list = clone;
2161		}
2162	}
2163	/* Success! Now we may commit changes to skb data. */
2164
2165pull_pages:
2166	eat = delta;
2167	k = 0;
2168	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2169		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2170
2171		if (size <= eat) {
2172			skb_frag_unref(skb, i);
2173			eat -= size;
2174		} else {
2175			skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
2176
2177			*frag = skb_shinfo(skb)->frags[i];
2178			if (eat) {
2179				skb_frag_off_add(frag, eat);
2180				skb_frag_size_sub(frag, eat);
2181				if (!i)
2182					goto end;
2183				eat = 0;
2184			}
2185			k++;
2186		}
2187	}
2188	skb_shinfo(skb)->nr_frags = k;
2189
2190end:
2191	skb->tail     += delta;
2192	skb->data_len -= delta;
2193
2194	if (!skb->data_len)
2195		skb_zcopy_clear(skb, false);
2196
2197	return skb_tail_pointer(skb);
2198}
2199EXPORT_SYMBOL(__pskb_pull_tail);
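/* Example (illustrative sketch, not from this file): callers rarely use
 * __pskb_pull_tail() directly; pskb_may_pull() falls back to it only when
 * the requested bytes are not yet in the linear area ("hdr_len" is
 * hypothetical):
 *
 *	if (!pskb_may_pull(skb, hdr_len))
 *		goto drop;
 *	// skb->data[0..hdr_len) is now linear, but any previously cached
 *	// pointers into the header must be reloaded.
 */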
2200
2201/**
2202 *	skb_copy_bits - copy bits from skb to kernel buffer
2203 *	@skb: source skb
2204 *	@offset: offset in source
2205 *	@to: destination buffer
2206 *	@len: number of bytes to copy
2207 *
2208 *	Copy the specified number of bytes from the source skb to the
2209 *	destination buffer.
2210 *
2211 *	CAUTION ! :
2212 *		If its prototype is ever changed,
2213 *		check arch/{*}/net/{*}.S files,
2214 *		since it is called from BPF assembly code.
2215 */
2216int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
2217{
2218	int start = skb_headlen(skb);
2219	struct sk_buff *frag_iter;
2220	int i, copy;
2221
2222	if (offset > (int)skb->len - len)
2223		goto fault;
2224
2225	/* Copy header. */
2226	if ((copy = start - offset) > 0) {
2227		if (copy > len)
2228			copy = len;
2229		skb_copy_from_linear_data_offset(skb, offset, to, copy);
2230		if ((len -= copy) == 0)
2231			return 0;
2232		offset += copy;
2233		to     += copy;
2234	}
2235
2236	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2237		int end;
2238		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
2239
2240		WARN_ON(start > offset + len);
2241
2242		end = start + skb_frag_size(f);
2243		if ((copy = end - offset) > 0) {
2244			u32 p_off, p_len, copied;
2245			struct page *p;
2246			u8 *vaddr;
2247
2248			if (copy > len)
2249				copy = len;
2250
2251			skb_frag_foreach_page(f,
2252					      skb_frag_off(f) + offset - start,
2253					      copy, p, p_off, p_len, copied) {
2254				vaddr = kmap_atomic(p);
2255				memcpy(to + copied, vaddr + p_off, p_len);
2256				kunmap_atomic(vaddr);
2257			}
2258
2259			if ((len -= copy) == 0)
2260				return 0;
2261			offset += copy;
2262			to     += copy;
2263		}
2264		start = end;
2265	}
2266
2267	skb_walk_frags(skb, frag_iter) {
2268		int end;
2269
2270		WARN_ON(start > offset + len);
2271
2272		end = start + frag_iter->len;
2273		if ((copy = end - offset) > 0) {
2274			if (copy > len)
2275				copy = len;
2276			if (skb_copy_bits(frag_iter, offset - start, to, copy))
2277				goto fault;
2278			if ((len -= copy) == 0)
2279				return 0;
2280			offset += copy;
2281			to     += copy;
2282		}
2283		start = end;
2284	}
2285
2286	if (!len)
2287		return 0;
2288
2289fault:
2290	return -EFAULT;
2291}
2292EXPORT_SYMBOL(skb_copy_bits);
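/* Example (illustrative sketch, not from this file): copying a possibly
 * fragmented header into a local buffer without linearizing the skb:
 *
 *	struct tcphdr th;
 *
 *	if (skb_copy_bits(skb, skb_transport_offset(skb), &th, sizeof(th)) < 0)
 *		goto drop;	// offset/len exceeded skb->len
 */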
2293
2294/*
2295 * Callback from splice_to_pipe(), if we need to release some pages
2296 * at the end of the spd in case we errored out while filling the pipe.
2297 */
2298static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
2299{
2300	put_page(spd->pages[i]);
2301}
2302
2303static struct page *linear_to_page(struct page *page, unsigned int *len,
2304				   unsigned int *offset,
2305				   struct sock *sk)
2306{
2307	struct page_frag *pfrag = sk_page_frag(sk);
2308
2309	if (!sk_page_frag_refill(sk, pfrag))
2310		return NULL;
2311
2312	*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
2313
2314	memcpy(page_address(pfrag->page) + pfrag->offset,
2315	       page_address(page) + *offset, *len);
2316	*offset = pfrag->offset;
2317	pfrag->offset += *len;
2318
2319	return pfrag->page;
2320}
2321
2322static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
2323			     struct page *page,
2324			     unsigned int offset)
2325{
2326	return	spd->nr_pages &&
2327		spd->pages[spd->nr_pages - 1] == page &&
2328		(spd->partial[spd->nr_pages - 1].offset +
2329		 spd->partial[spd->nr_pages - 1].len == offset);
2330}
2331
2332/*
2333 * Fill page/offset/length into spd, if it can hold more pages.
2334 */
2335static bool spd_fill_page(struct splice_pipe_desc *spd,
2336			  struct pipe_inode_info *pipe, struct page *page,
2337			  unsigned int *len, unsigned int offset,
2338			  bool linear,
2339			  struct sock *sk)
2340{
2341	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
2342		return true;
2343
2344	if (linear) {
2345		page = linear_to_page(page, len, &offset, sk);
2346		if (!page)
2347			return true;
2348	}
2349	if (spd_can_coalesce(spd, page, offset)) {
2350		spd->partial[spd->nr_pages - 1].len += *len;
2351		return false;
2352	}
2353	get_page(page);
2354	spd->pages[spd->nr_pages] = page;
2355	spd->partial[spd->nr_pages].len = *len;
2356	spd->partial[spd->nr_pages].offset = offset;
2357	spd->nr_pages++;
2358
2359	return false;
2360}
2361
2362static bool __splice_segment(struct page *page, unsigned int poff,
2363			     unsigned int plen, unsigned int *off,
2364			     unsigned int *len,
2365			     struct splice_pipe_desc *spd, bool linear,
2366			     struct sock *sk,
2367			     struct pipe_inode_info *pipe)
2368{
2369	if (!*len)
2370		return true;
2371
2372	/* skip this segment if already processed */
2373	if (*off >= plen) {
2374		*off -= plen;
2375		return false;
2376	}
2377
2378	/* ignore any bits we already processed */
2379	poff += *off;
2380	plen -= *off;
2381	*off = 0;
2382
2383	do {
2384		unsigned int flen = min(*len, plen);
2385
2386		if (spd_fill_page(spd, pipe, page, &flen, poff,
2387				  linear, sk))
2388			return true;
2389		poff += flen;
2390		plen -= flen;
2391		*len -= flen;
2392	} while (*len && plen);
2393
2394	return false;
2395}
2396
2397/*
2398 * Map linear and fragment data from the skb to spd. It reports true if the
2399 * pipe is full or if we already spliced the requested length.
2400 */
2401static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
2402			      unsigned int *offset, unsigned int *len,
2403			      struct splice_pipe_desc *spd, struct sock *sk)
2404{
2405	int seg;
2406	struct sk_buff *iter;
2407
2408	/* map the linear part :
2409	 * If skb->head_frag is set, this 'linear' part is backed by a
2410	 * fragment, and if the head is not shared with any clones then
2411	 * we can avoid a copy since we own the head portion of this page.
2412	 */
2413	if (__splice_segment(virt_to_page(skb->data),
2414			     (unsigned long) skb->data & (PAGE_SIZE - 1),
2415			     skb_headlen(skb),
2416			     offset, len, spd,
2417			     skb_head_is_locked(skb),
2418			     sk, pipe))
2419		return true;
2420
2421	/*
2422	 * then map the fragments
2423	 */
2424	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
2425		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
2426
2427		if (__splice_segment(skb_frag_page(f),
2428				     skb_frag_off(f), skb_frag_size(f),
2429				     offset, len, spd, false, sk, pipe))
2430			return true;
2431	}
2432
2433	skb_walk_frags(skb, iter) {
2434		if (*offset >= iter->len) {
2435			*offset -= iter->len;
2436			continue;
2437		}
2438		/* __skb_splice_bits() only fails if the output has no room
2439		 * left, so no point in going over the frag_list for the error
2440		 * case.
2441		 */
2442		if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
2443			return true;
2444	}
2445
2446	return false;
2447}
2448
2449/*
2450 * Map data from the skb to a pipe. Should handle both the linear part,
2451 * the fragments, and the frag list.
2452 */
2453int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
2454		    struct pipe_inode_info *pipe, unsigned int tlen,
2455		    unsigned int flags)
2456{
2457	struct partial_page partial[MAX_SKB_FRAGS];
2458	struct page *pages[MAX_SKB_FRAGS];
2459	struct splice_pipe_desc spd = {
2460		.pages = pages,
2461		.partial = partial,
2462		.nr_pages_max = MAX_SKB_FRAGS,
2463		.ops = &nosteal_pipe_buf_ops,
2464		.spd_release = sock_spd_release,
2465	};
2466	int ret = 0;
2467
2468	__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
2469
2470	if (spd.nr_pages)
2471		ret = splice_to_pipe(pipe, &spd);
2472
2473	return ret;
2474}
2475EXPORT_SYMBOL_GPL(skb_splice_bits);
2476
2477/* Send skb data on a socket. Socket must be locked. */
2478int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
2479			 int len)
2480{
2481	unsigned int orig_len = len;
2482	struct sk_buff *head = skb;
2483	unsigned short fragidx;
2484	int slen, ret;
2485
2486do_frag_list:
2487
2488	/* Deal with head data */
2489	while (offset < skb_headlen(skb) && len) {
2490		struct kvec kv;
2491		struct msghdr msg;
2492
2493		slen = min_t(int, len, skb_headlen(skb) - offset);
2494		kv.iov_base = skb->data + offset;
2495		kv.iov_len = slen;
2496		memset(&msg, 0, sizeof(msg));
2497		msg.msg_flags = MSG_DONTWAIT;
2498
2499		ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
2500		if (ret <= 0)
2501			goto error;
2502
2503		offset += ret;
2504		len -= ret;
2505	}
2506
2507	/* All the data was skb head? */
2508	if (!len)
2509		goto out;
2510
2511	/* Make offset relative to start of frags */
2512	offset -= skb_headlen(skb);
2513
2514	/* Find where we are in frag list */
2515	for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
2516		skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];
2517
2518		if (offset < skb_frag_size(frag))
2519			break;
2520
2521		offset -= skb_frag_size(frag);
2522	}
2523
2524	for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
2525		skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];
2526
2527		slen = min_t(size_t, len, skb_frag_size(frag) - offset);
2528
2529		while (slen) {
2530			ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
2531						     skb_frag_off(frag) + offset,
2532						     slen, MSG_DONTWAIT);
2533			if (ret <= 0)
2534				goto error;
2535
2536			len -= ret;
2537			offset += ret;
2538			slen -= ret;
2539		}
2540
2541		offset = 0;
2542	}
2543
2544	if (len) {
2545		/* Process any frag lists */
2546
2547		if (skb == head) {
2548			if (skb_has_frag_list(skb)) {
2549				skb = skb_shinfo(skb)->frag_list;
2550				goto do_frag_list;
2551			}
2552		} else if (skb->next) {
2553			skb = skb->next;
2554			goto do_frag_list;
2555		}
2556	}
2557
2558out:
2559	return orig_len - len;
2560
2561error:
2562	return orig_len == len ? ret : orig_len - len;
2563}
2564EXPORT_SYMBOL_GPL(skb_send_sock_locked);
2565
2566/**
2567 *	skb_store_bits - store bits from kernel buffer to skb
2568 *	@skb: destination buffer
2569 *	@offset: offset in destination
2570 *	@from: source buffer
2571 *	@len: number of bytes to copy
2572 *
2573 *	Copy the specified number of bytes from the source buffer to the
2574 *	destination skb.  This function handles all the messy bits of
2575 *	traversing fragment lists and such.
2576 */
2577
2578int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
2579{
2580	int start = skb_headlen(skb);
2581	struct sk_buff *frag_iter;
2582	int i, copy;
2583
2584	if (offset > (int)skb->len - len)
2585		goto fault;
2586
2587	if ((copy = start - offset) > 0) {
2588		if (copy > len)
2589			copy = len;
2590		skb_copy_to_linear_data_offset(skb, offset, from, copy);
2591		if ((len -= copy) == 0)
2592			return 0;
2593		offset += copy;
2594		from += copy;
2595	}
2596
2597	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2598		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2599		int end;
2600
2601		WARN_ON(start > offset + len);
2602
2603		end = start + skb_frag_size(frag);
2604		if ((copy = end - offset) > 0) {
2605			u32 p_off, p_len, copied;
2606			struct page *p;
2607			u8 *vaddr;
2608
2609			if (copy > len)
2610				copy = len;
2611
2612			skb_frag_foreach_page(frag,
2613					      skb_frag_off(frag) + offset - start,
2614					      copy, p, p_off, p_len, copied) {
2615				vaddr = kmap_atomic(p);
2616				memcpy(vaddr + p_off, from + copied, p_len);
2617				kunmap_atomic(vaddr);
2618			}
2619
2620			if ((len -= copy) == 0)
2621				return 0;
2622			offset += copy;
2623			from += copy;
2624		}
2625		start = end;
2626	}
2627
2628	skb_walk_frags(skb, frag_iter) {
2629		int end;
2630
2631		WARN_ON(start > offset + len);
2632
2633		end = start + frag_iter->len;
2634		if ((copy = end - offset) > 0) {
2635			if (copy > len)
2636				copy = len;
2637			if (skb_store_bits(frag_iter, offset - start,
2638					   from, copy))
2639				goto fault;
2640			if ((len -= copy) == 0)
2641				return 0;
2642			offset += copy;
2643			from += copy;
2644		}
2645		start = end;
2646	}
2647	if (!len)
2648		return 0;
2649
2650fault:
2651	return -EFAULT;
2652}
2653EXPORT_SYMBOL(skb_store_bits);
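/* Example (illustrative sketch, not from this file): the mirror of
 * skb_copy_bits() - writing bytes at an offset that may span the linear
 * area, page frags and the frag list. "off" is hypothetical and the caller
 * is assumed to own a writable, unshared skb:
 *
 *	__be32 val = htonl(1);
 *
 *	if (skb_store_bits(skb, off, &val, sizeof(val)) < 0)
 *		goto drop;	// offset/len exceeded skb->len
 */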
2654
2655/* Checksum skb data. */
2656__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
2657		      __wsum csum, const struct skb_checksum_ops *ops)
2658{
2659	int start = skb_headlen(skb);
2660	int i, copy = start - offset;
2661	struct sk_buff *frag_iter;
2662	int pos = 0;
2663
2664	/* Checksum header. */
2665	if (copy > 0) {
2666		if (copy > len)
2667			copy = len;
2668		csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
2669				       skb->data + offset, copy, csum);
2670		if ((len -= copy) == 0)
2671			return csum;
2672		offset += copy;
2673		pos	= copy;
2674	}
2675
2676	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2677		int end;
2678		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2679
2680		WARN_ON(start > offset + len);
2681
2682		end = start + skb_frag_size(frag);
2683		if ((copy = end - offset) > 0) {
2684			u32 p_off, p_len, copied;
2685			struct page *p;
2686			__wsum csum2;
2687			u8 *vaddr;
2688
2689			if (copy > len)
2690				copy = len;
2691
2692			skb_frag_foreach_page(frag,
2693					      skb_frag_off(frag) + offset - start,
2694					      copy, p, p_off, p_len, copied) {
2695				vaddr = kmap_atomic(p);
2696				csum2 = INDIRECT_CALL_1(ops->update,
2697							csum_partial_ext,
2698							vaddr + p_off, p_len, 0);
2699				kunmap_atomic(vaddr);
2700				csum = INDIRECT_CALL_1(ops->combine,
2701						       csum_block_add_ext, csum,
2702						       csum2, pos, p_len);
2703				pos += p_len;
2704			}
2705
2706			if (!(len -= copy))
2707				return csum;
2708			offset += copy;
2709		}
2710		start = end;
2711	}
2712
2713	skb_walk_frags(skb, frag_iter) {
2714		int end;
2715
2716		WARN_ON(start > offset + len);
2717
2718		end = start + frag_iter->len;
2719		if ((copy = end - offset) > 0) {
2720			__wsum csum2;
2721			if (copy > len)
2722				copy = len;
2723			csum2 = __skb_checksum(frag_iter, offset - start,
2724					       copy, 0, ops);
2725			csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
2726					       csum, csum2, pos, copy);
2727			if ((len -= copy) == 0)
2728				return csum;
2729			offset += copy;
2730			pos    += copy;
2731		}
2732		start = end;
2733	}
2734	BUG_ON(len);
2735
2736	return csum;
2737}
2738EXPORT_SYMBOL(__skb_checksum);
2739
2740__wsum skb_checksum(const struct sk_buff *skb, int offset,
2741		    int len, __wsum csum)
2742{
2743	const struct skb_checksum_ops ops = {
2744		.update  = csum_partial_ext,
2745		.combine = csum_block_add_ext,
2746	};
2747
2748	return __skb_checksum(skb, offset, len, csum, &ops);
2749}
2750EXPORT_SYMBOL(skb_checksum);
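/* Example (illustrative sketch, not from this file): software checksumming
 * of a whole packet, folded down to the 16-bit Internet checksum form:
 *
 *	__wsum csum = skb_checksum(skb, 0, skb->len, 0);
 *	__sum16 folded = csum_fold(csum);
 *
 *	// folded == 0 means "valid" when the checksum field itself was
 *	// included in the covered range.
 */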
2751
2752/* Both of the above in one bottle. */
2753
2754__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
2755				    u8 *to, int len)
2756{
2757	int start = skb_headlen(skb);
2758	int i, copy = start - offset;
2759	struct sk_buff *frag_iter;
2760	int pos = 0;
2761	__wsum csum = 0;
2762
2763	/* Copy header. */
2764	if (copy > 0) {
2765		if (copy > len)
2766			copy = len;
2767		csum = csum_partial_copy_nocheck(skb->data + offset, to,
2768						 copy);
2769		if ((len -= copy) == 0)
2770			return csum;
2771		offset += copy;
2772		to     += copy;
2773		pos	= copy;
2774	}
2775
2776	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2777		int end;
2778
2779		WARN_ON(start > offset + len);
2780
2781		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
2782		if ((copy = end - offset) > 0) {
2783			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2784			u32 p_off, p_len, copied;
2785			struct page *p;
2786			__wsum csum2;
2787			u8 *vaddr;
2788
2789			if (copy > len)
2790				copy = len;
2791
2792			skb_frag_foreach_page(frag,
2793					      skb_frag_off(frag) + offset - start,
2794					      copy, p, p_off, p_len, copied) {
2795				vaddr = kmap_atomic(p);
2796				csum2 = csum_partial_copy_nocheck(vaddr + p_off,
2797								  to + copied,
2798								  p_len);
2799				kunmap_atomic(vaddr);
2800				csum = csum_block_add(csum, csum2, pos);
2801				pos += p_len;
2802			}
2803
2804			if (!(len -= copy))
2805				return csum;
2806			offset += copy;
2807			to     += copy;
2808		}
2809		start = end;
2810	}
2811
2812	skb_walk_frags(skb, frag_iter) {
2813		__wsum csum2;
2814		int end;
2815
2816		WARN_ON(start > offset + len);
2817
2818		end = start + frag_iter->len;
2819		if ((copy = end - offset) > 0) {
2820			if (copy > len)
2821				copy = len;
2822			csum2 = skb_copy_and_csum_bits(frag_iter,
2823						       offset - start,
2824						       to, copy);
2825			csum = csum_block_add(csum, csum2, pos);
2826			if ((len -= copy) == 0)
2827				return csum;
2828			offset += copy;
2829			to     += copy;
2830			pos    += copy;
2831		}
2832		start = end;
2833	}
2834	BUG_ON(len);
2835	return csum;
2836}
2837EXPORT_SYMBOL(skb_copy_and_csum_bits);
2838
2839__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
2840{
2841	__sum16 sum;
2842
2843	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
2844	/* See comments in __skb_checksum_complete(). */
2845	if (likely(!sum)) {
2846		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
2847		    !skb->csum_complete_sw)
2848			netdev_rx_csum_fault(skb->dev, skb);
2849	}
2850	if (!skb_shared(skb))
2851		skb->csum_valid = !sum;
2852	return sum;
2853}
2854EXPORT_SYMBOL(__skb_checksum_complete_head);
2855
2856/* This function assumes skb->csum already holds pseudo header's checksum,
2857 * which has been changed from the hardware checksum, for example, by
2858 * __skb_checksum_validate_complete(). And, the original skb->csum must
2859 * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
2860 *
2861 * It returns non-zero if the recomputed checksum is still invalid, otherwise
2862 * zero. The new checksum is stored back into skb->csum unless the skb is
2863 * shared.
2864 */
2865__sum16 __skb_checksum_complete(struct sk_buff *skb)
2866{
2867	__wsum csum;
2868	__sum16 sum;
2869
2870	csum = skb_checksum(skb, 0, skb->len, 0);
2871
2872	sum = csum_fold(csum_add(skb->csum, csum));
2873	/* This check is inverted, because we already knew the hardware
2874	 * checksum is invalid before calling this function. So, if the
2875	 * re-computed checksum is valid instead, then we have a mismatch
2876	 * between the original skb->csum and skb_checksum(). This means either
2877 * the original hardware checksum is incorrect or we screwed up skb->csum
2878	 * when moving skb->data around.
2879	 */
2880	if (likely(!sum)) {
2881		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
2882		    !skb->csum_complete_sw)
2883			netdev_rx_csum_fault(skb->dev, skb);
2884	}
2885
2886	if (!skb_shared(skb)) {
2887		/* Save full packet checksum */
2888		skb->csum = csum;
2889		skb->ip_summed = CHECKSUM_COMPLETE;
2890		skb->csum_complete_sw = 1;
2891		skb->csum_valid = !sum;
2892	}
2893
2894	return sum;
2895}
2896EXPORT_SYMBOL(__skb_checksum_complete);
2897
2898static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
2899{
2900	net_warn_ratelimited(
2901		"%s: attempt to compute crc32c without libcrc32c.ko\n",
2902		__func__);
2903	return 0;
2904}
2905
2906static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
2907				       int offset, int len)
2908{
2909	net_warn_ratelimited(
2910		"%s: attempt to compute crc32c without libcrc32c.ko\n",
2911		__func__);
2912	return 0;
2913}
2914
2915static const struct skb_checksum_ops default_crc32c_ops = {
2916	.update  = warn_crc32c_csum_update,
2917	.combine = warn_crc32c_csum_combine,
2918};
2919
2920const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
2921	&default_crc32c_ops;
2922EXPORT_SYMBOL(crc32c_csum_stub);
2923
2924/**
2925 *	skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
2926 *	@from: source buffer
2927 *
2928 *	Calculates the amount of linear headroom needed in the 'to' skb passed
2929 *	into skb_zerocopy().
2930 */
2931unsigned int
2932skb_zerocopy_headlen(const struct sk_buff *from)
2933{
2934	unsigned int hlen = 0;
2935
2936	if (!from->head_frag ||
2937	    skb_headlen(from) < L1_CACHE_BYTES ||
2938	    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
2939		hlen = skb_headlen(from);
2940		if (!hlen)
2941			hlen = from->len;
2942	}
2943
2944	if (skb_has_frag_list(from))
2945		hlen = from->len;
2946
2947	return hlen;
2948}
2949EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
2950
2951/**
2952 *	skb_zerocopy - Zero copy skb to skb
2953 *	@to: destination buffer
2954 *	@from: source buffer
2955 *	@len: number of bytes to copy from source buffer
2956 *	@hlen: size of linear headroom in destination buffer
2957 *
2958 *	Copies up to @len bytes from @from to @to by creating references
2959 *	to the frags in the source buffer.
2960 *
2961 *	The @hlen as calculated by skb_zerocopy_headlen() specifies the
2962 *	headroom in the @to buffer.
2963 *
2964 *	Return value:
2965 *	0: everything is OK
2966 *	-ENOMEM: couldn't orphan frags of @from due to lack of memory
2967 *	-EFAULT: skb_copy_bits() found some problem with skb geometry
2968 */
2969int
2970skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
2971{
2972	int i, j = 0;
2973	int plen = 0; /* length of skb->head fragment */
2974	int ret;
2975	struct page *page;
2976	unsigned int offset;
2977
2978	BUG_ON(!from->head_frag && !hlen);
2979
2980	/* don't bother with small payloads */
2981	if (len <= skb_tailroom(to))
2982		return skb_copy_bits(from, 0, skb_put(to, len), len);
2983
2984	if (hlen) {
2985		ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
2986		if (unlikely(ret))
2987			return ret;
2988		len -= hlen;
2989	} else {
2990		plen = min_t(int, skb_headlen(from), len);
2991		if (plen) {
2992			page = virt_to_head_page(from->head);
2993			offset = from->data - (unsigned char *)page_address(page);
2994			__skb_fill_page_desc(to, 0, page, offset, plen);
2995			get_page(page);
2996			j = 1;
2997			len -= plen;
2998		}
2999	}
3000
3001	to->truesize += len + plen;
3002	to->len += len + plen;
3003	to->data_len += len + plen;
3004
3005	if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
3006		skb_tx_error(from);
3007		return -ENOMEM;
3008	}
3009	skb_zerocopy_clone(to, from, GFP_ATOMIC);
3010
3011	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
3012		int size;
3013
3014		if (!len)
3015			break;
3016		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
3017		size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
3018					len);
3019		skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
3020		len -= size;
3021		skb_frag_ref(to, j);
3022		j++;
3023	}
3024	skb_shinfo(to)->nr_frags = j;
3025
3026	return 0;
3027}
3028EXPORT_SYMBOL_GPL(skb_zerocopy);
3029
3030void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
3031{
3032	__wsum csum;
3033	long csstart;
3034
3035	if (skb->ip_summed == CHECKSUM_PARTIAL)
3036		csstart = skb_checksum_start_offset(skb);
3037	else
3038		csstart = skb_headlen(skb);
3039
3040	BUG_ON(csstart > skb_headlen(skb));
3041
3042	skb_copy_from_linear_data(skb, to, csstart);
3043
3044	csum = 0;
3045	if (csstart != skb->len)
3046		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
3047					      skb->len - csstart);
3048
3049	if (skb->ip_summed == CHECKSUM_PARTIAL) {
3050		long csstuff = csstart + skb->csum_offset;
3051
3052		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
3053	}
3054}
3055EXPORT_SYMBOL(skb_copy_and_csum_dev);
3056
3057/**
3058 *	skb_dequeue - remove from the head of the queue
3059 *	@list: list to dequeue from
3060 *
3061 *	Remove the head of the list. The list lock is taken so the function
3062 *	may be used safely with other locking list functions. The head item is
3063 *	returned or %NULL if the list is empty.
3064 */
3065
3066struct sk_buff *skb_dequeue(struct sk_buff_head *list)
3067{
3068	unsigned long flags;
3069	struct sk_buff *result;
3070
3071	spin_lock_irqsave(&list->lock, flags);
3072	result = __skb_dequeue(list);
3073	spin_unlock_irqrestore(&list->lock, flags);
3074	return result;
3075}
3076EXPORT_SYMBOL(skb_dequeue);
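/* Example (illustrative sketch, not from this file): a typical consumer
 * drains an &sk_buff_head with skb_dequeue(); "rxq" is hypothetical:
 *
 *	struct sk_buff *skb;
 *
 *	while ((skb = skb_dequeue(&rxq)) != NULL) {
 *		// ... process the buffer ...
 *		consume_skb(skb);
 *	}
 */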
3077
3078/**
3079 *	skb_dequeue_tail - remove from the tail of the queue
3080 *	@list: list to dequeue from
3081 *
3082 *	Remove the tail of the list. The list lock is taken so the function
3083 *	may be used safely with other locking list functions. The tail item is
3084 *	returned or %NULL if the list is empty.
3085 */
3086struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
3087{
3088	unsigned long flags;
3089	struct sk_buff *result;
3090
3091	spin_lock_irqsave(&list->lock, flags);
3092	result = __skb_dequeue_tail(list);
3093	spin_unlock_irqrestore(&list->lock, flags);
3094	return result;
3095}
3096EXPORT_SYMBOL(skb_dequeue_tail);
3097
3098/**
3099 *	skb_queue_purge - empty a list
3100 *	@list: list to empty
3101 *
3102 *	Delete all buffers on an &sk_buff list. Each buffer is removed from
3103 *	the list and one reference dropped. This function takes the list
3104 *	lock and is atomic with respect to other list locking functions.
3105 */
3106void skb_queue_purge(struct sk_buff_head *list)
3107{
3108	struct sk_buff *skb;
3109	while ((skb = skb_dequeue(list)) != NULL)
3110		kfree_skb(skb);
3111}
3112EXPORT_SYMBOL(skb_queue_purge);
3113
3114/**
3115 *	skb_rbtree_purge - empty a skb rbtree
3116 *	@root: root of the rbtree to empty
3117 *	Return value: the sum of truesizes of all purged skbs.
3118 *
3119 *	Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
3120 *	the list and one reference dropped. This function does not take
3121 *	any lock. Synchronization should be handled by the caller (e.g., TCP
3122 *	out-of-order queue is protected by the socket lock).
3123 */
3124unsigned int skb_rbtree_purge(struct rb_root *root)
3125{
3126	struct rb_node *p = rb_first(root);
3127	unsigned int sum = 0;
3128
3129	while (p) {
3130		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
3131
3132		p = rb_next(p);
3133		rb_erase(&skb->rbnode, root);
3134		sum += skb->truesize;
3135		kfree_skb(skb);
3136	}
3137	return sum;
3138}
3139
3140/**
3141 *	skb_queue_head - queue a buffer at the list head
3142 *	@list: list to use
3143 *	@newsk: buffer to queue
3144 *
3145 *	Queue a buffer at the start of the list. This function takes the
3146 *	list lock and can be used safely with other locking &sk_buff
3147 *	functions.
3148 *
3149 *	A buffer cannot be placed on two lists at the same time.
3150 */
3151void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
3152{
3153	unsigned long flags;
3154
3155	spin_lock_irqsave(&list->lock, flags);
3156	__skb_queue_head(list, newsk);
3157	spin_unlock_irqrestore(&list->lock, flags);
3158}
3159EXPORT_SYMBOL(skb_queue_head);
3160
3161/**
3162 *	skb_queue_tail - queue a buffer at the list tail
3163 *	@list: list to use
3164 *	@newsk: buffer to queue
3165 *
3166 *	Queue a buffer at the tail of the list. This function takes the
3167 *	list lock and can be used safely with other locking &sk_buff
3168 *	functions.
3169 *
3170 *	A buffer cannot be placed on two lists at the same time.
3171 */
3172void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
3173{
3174	unsigned long flags;
3175
3176	spin_lock_irqsave(&list->lock, flags);
3177	__skb_queue_tail(list, newsk);
3178	spin_unlock_irqrestore(&list->lock, flags);
3179}
3180EXPORT_SYMBOL(skb_queue_tail);
3181
3182/**
3183 *	skb_unlink	-	remove a buffer from a list
3184 *	@skb: buffer to remove
3185 *	@list: list to use
3186 *
3187 *	Remove a packet from a list. The list locks are taken and this
3188 *	function is atomic with respect to other list locking calls.
3189 *
3190 *	You must know what list the SKB is on.
3191 */
3192void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
3193{
3194	unsigned long flags;
3195
3196	spin_lock_irqsave(&list->lock, flags);
3197	__skb_unlink(skb, list);
3198	spin_unlock_irqrestore(&list->lock, flags);
3199}
3200EXPORT_SYMBOL(skb_unlink);
3201
3202/**
3203 *	skb_append	-	append a buffer
3204 *	@old: buffer to insert after
3205 *	@newsk: buffer to insert
3206 *	@list: list to use
3207 *
3208 *	Place a packet after a given packet in a list. The list locks are taken
3209 *	and this function is atomic with respect to other list locking calls.
3210 *	A buffer cannot be placed on two lists at the same time.
3211 */
3212void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
3213{
3214	unsigned long flags;
3215
3216	spin_lock_irqsave(&list->lock, flags);
3217	__skb_queue_after(list, old, newsk);
3218	spin_unlock_irqrestore(&list->lock, flags);
3219}
3220EXPORT_SYMBOL(skb_append);
3221
3222static inline void skb_split_inside_header(struct sk_buff *skb,
3223					   struct sk_buff* skb1,
3224					   const u32 len, const int pos)
3225{
3226	int i;
3227
3228	skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
3229					 pos - len);
3230	/* And move data appendix as is. */
3231	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
3232		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
3233
3234	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
3235	skb_shinfo(skb)->nr_frags  = 0;
3236	skb1->data_len		   = skb->data_len;
3237	skb1->len		   += skb1->data_len;
3238	skb->data_len		   = 0;
3239	skb->len		   = len;
3240	skb_set_tail_pointer(skb, len);
3241}
3242
3243static inline void skb_split_no_header(struct sk_buff *skb,
3244				       struct sk_buff* skb1,
3245				       const u32 len, int pos)
3246{
3247	int i, k = 0;
3248	const int nfrags = skb_shinfo(skb)->nr_frags;
3249
3250	skb_shinfo(skb)->nr_frags = 0;
3251	skb1->len		  = skb1->data_len = skb->len - len;
3252	skb->len		  = len;
3253	skb->data_len		  = len - pos;
3254
3255	for (i = 0; i < nfrags; i++) {
3256		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
3257
3258		if (pos + size > len) {
3259			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
3260
3261			if (pos < len) {
3262				/* Split frag.
3263				 * We have two options in this case:
3264				 * 1. Move the whole frag to the second
3265				 *    part, if possible. E.g. this approach
3266				 *    is mandatory for TUX, where splitting
3267				 *    is expensive.
3268				 * 2. Split accurately. This is what we do here.
3269				 */
3270				skb_frag_ref(skb, i);
3271				skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
3272				skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
3273				skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
3274				skb_shinfo(skb)->nr_frags++;
3275			}
3276			k++;
3277		} else
3278			skb_shinfo(skb)->nr_frags++;
3279		pos += size;
3280	}
3281	skb_shinfo(skb1)->nr_frags = k;
3282}
3283
3284/**
3285 * skb_split - Split fragmented skb to two parts at length len.
3286 * @skb: the buffer to split
3287 * @skb1: the buffer to receive the second part
3288 * @len: new length for skb
3289 */
3290void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
3291{
3292	int pos = skb_headlen(skb);
3293
3294	skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
3295				      SKBTX_SHARED_FRAG;
3296	skb_zerocopy_clone(skb1, skb, 0);
3297	if (len < pos)	/* Split line is inside header. */
3298		skb_split_inside_header(skb, skb1, len, pos);
3299	else		/* Second chunk has no header, nothing to copy. */
3300		skb_split_no_header(skb, skb1, len, pos);
3301}
3302EXPORT_SYMBOL(skb_split);
3303
3304/* Shifting from/to a cloned skb is a no-go.
3305 *
3306 * Caller cannot keep skb_shinfo related pointers past calling here!
3307 */
3308static int skb_prepare_for_shift(struct sk_buff *skb)
3309{
3310	int ret = 0;
3311
3312	if (skb_cloned(skb)) {
3313		/* Save and restore truesize: pskb_expand_head() may reallocate
3314		 * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we
3315		 * cannot change truesize at this point.
3316		 */
3317		unsigned int save_truesize = skb->truesize;
3318
3319		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
3320		skb->truesize = save_truesize;
3321	}
3322	return ret;
3323}
3324
3325/**
3326 * skb_shift - Shifts paged data partially from skb to another
3327 * @tgt: buffer into which tail data gets added
3328 * @skb: buffer from which the paged data comes from
3329 * @shiftlen: shift up to this many bytes
3330 *
3331 * Attempts to shift up to @shiftlen worth of bytes, which may be less than
3332 * the length of the skb, from @skb to @tgt. Returns the number of bytes shifted.
3333 * It's up to the caller to free @skb if everything was shifted.
3334 *
3335 * If @tgt runs out of frags, the whole operation is aborted.
3336 *
3337 * Skb cannot include anything else but paged data while tgt is allowed
3338 * to have non-paged data as well.
3339 *
3340 * TODO: full sized shift could be optimized but that would need
3341 * specialized skb free'er to handle frags without up-to-date nr_frags.
3342 */
3343int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
3344{
3345	int from, to, merge, todo;
3346	skb_frag_t *fragfrom, *fragto;
3347
3348	BUG_ON(shiftlen > skb->len);
3349
3350	if (skb_headlen(skb))
3351		return 0;
3352	if (skb_zcopy(tgt) || skb_zcopy(skb))
3353		return 0;
3354
3355	todo = shiftlen;
3356	from = 0;
3357	to = skb_shinfo(tgt)->nr_frags;
3358	fragfrom = &skb_shinfo(skb)->frags[from];
3359
3360	/* Actual merge is delayed until the point when we know we can
3361	 * commit all, so that we don't have to undo partial changes
3362	 */
3363	if (!to ||
3364	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
3365			      skb_frag_off(fragfrom))) {
3366		merge = -1;
3367	} else {
3368		merge = to - 1;
3369
3370		todo -= skb_frag_size(fragfrom);
3371		if (todo < 0) {
3372			if (skb_prepare_for_shift(skb) ||
3373			    skb_prepare_for_shift(tgt))
3374				return 0;
3375
3376			/* All previous frag pointers might be stale! */
3377			fragfrom = &skb_shinfo(skb)->frags[from];
3378			fragto = &skb_shinfo(tgt)->frags[merge];
3379
3380			skb_frag_size_add(fragto, shiftlen);
3381			skb_frag_size_sub(fragfrom, shiftlen);
3382			skb_frag_off_add(fragfrom, shiftlen);
3383
3384			goto onlymerged;
3385		}
3386
3387		from++;
3388	}
3389
3390	/* Skip full, not-fitting skb to avoid expensive operations */
3391	if ((shiftlen == skb->len) &&
3392	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
3393		return 0;
3394
3395	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
3396		return 0;
3397
3398	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
3399		if (to == MAX_SKB_FRAGS)
3400			return 0;
3401
3402		fragfrom = &skb_shinfo(skb)->frags[from];
3403		fragto = &skb_shinfo(tgt)->frags[to];
3404
3405		if (todo >= skb_frag_size(fragfrom)) {
3406			*fragto = *fragfrom;
3407			todo -= skb_frag_size(fragfrom);
3408			from++;
3409			to++;
3410
3411		} else {
3412			__skb_frag_ref(fragfrom);
3413			skb_frag_page_copy(fragto, fragfrom);
3414			skb_frag_off_copy(fragto, fragfrom);
3415			skb_frag_size_set(fragto, todo);
3416
3417			skb_frag_off_add(fragfrom, todo);
3418			skb_frag_size_sub(fragfrom, todo);
3419			todo = 0;
3420
3421			to++;
3422			break;
3423		}
3424	}
3425
3426	/* Ready to "commit" this state change to tgt */
3427	skb_shinfo(tgt)->nr_frags = to;
3428
3429	if (merge >= 0) {
3430		fragfrom = &skb_shinfo(skb)->frags[0];
3431		fragto = &skb_shinfo(tgt)->frags[merge];
3432
3433		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
3434		__skb_frag_unref(fragfrom);
3435	}
3436
3437	/* Reposition in the original skb */
3438	to = 0;
3439	while (from < skb_shinfo(skb)->nr_frags)
3440		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
3441	skb_shinfo(skb)->nr_frags = to;
3442
3443	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
3444
3445onlymerged:
3446	/* Most likely the tgt won't ever need its checksum anymore; skb, on
3447	 * the other hand, might need it if it needs to be resent.
3448	 */
3449	tgt->ip_summed = CHECKSUM_PARTIAL;
3450	skb->ip_summed = CHECKSUM_PARTIAL;
3451
3452	/* Yak, is it really working this way? Some helper please? */
3453	skb->len -= shiftlen;
3454	skb->data_len -= shiftlen;
3455	skb->truesize -= shiftlen;
3456	tgt->len += shiftlen;
3457	tgt->data_len += shiftlen;
3458	tgt->truesize += shiftlen;
3459
3460	return shiftlen;
3461}
3462
3463/**
3464 * skb_prepare_seq_read - Prepare a sequential read of skb data
3465 * @skb: the buffer to read
3466 * @from: lower offset of data to be read
3467 * @to: upper offset of data to be read
3468 * @st: state variable
3469 *
3470 * Initializes the specified state variable. Must be called before
3471 * invoking skb_seq_read() for the first time.
3472 */
3473void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
3474			  unsigned int to, struct skb_seq_state *st)
3475{
3476	st->lower_offset = from;
3477	st->upper_offset = to;
3478	st->root_skb = st->cur_skb = skb;
3479	st->frag_idx = st->stepped_offset = 0;
3480	st->frag_data = NULL;
3481}
3482EXPORT_SYMBOL(skb_prepare_seq_read);
3483
3484/**
3485 * skb_seq_read - Sequentially read skb data
3486 * @consumed: number of bytes consumed by the caller so far
3487 * @data: destination pointer for data to be returned
3488 * @st: state variable
3489 *
3490 * Reads a block of skb data at @consumed relative to the
3491 * lower offset specified to skb_prepare_seq_read(). Assigns
3492 * the head of the data block to @data and returns the length
3493 * of the block or 0 if the end of the skb data or the upper
3494 * offset has been reached.
3495 *
3496 * The caller is not required to consume all of the data
3497 * returned, i.e. @consumed is typically set to the number
3498 * of bytes already consumed and the next call to
3499 * skb_seq_read() will return the remaining part of the block.
3500 *
3501 * Note 1: The size of each block of data returned can be arbitrary,
3502 *       this limitation is the cost for zerocopy sequential
3503 *       reads of potentially non linear data.
3504 *
3505 * Note 2: Fragment lists within fragments are not implemented
3506 *       at the moment, state->root_skb could be replaced with
3507 *       a stack for this purpose.
3508 */
3509unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
3510			  struct skb_seq_state *st)
3511{
3512	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
3513	skb_frag_t *frag;
3514
3515	if (unlikely(abs_offset >= st->upper_offset)) {
3516		if (st->frag_data) {
3517			kunmap_atomic(st->frag_data);
3518			st->frag_data = NULL;
3519		}
3520		return 0;
3521	}
3522
3523next_skb:
3524	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
3525
3526	if (abs_offset < block_limit && !st->frag_data) {
3527		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
3528		return block_limit - abs_offset;
3529	}
3530
3531	if (st->frag_idx == 0 && !st->frag_data)
3532		st->stepped_offset += skb_headlen(st->cur_skb);
3533
3534	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
3535		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
3536		block_limit = skb_frag_size(frag) + st->stepped_offset;
3537
3538		if (abs_offset < block_limit) {
3539			if (!st->frag_data)
3540				st->frag_data = kmap_atomic(skb_frag_page(frag));
3541
3542			*data = (u8 *) st->frag_data + skb_frag_off(frag) +
3543				(abs_offset - st->stepped_offset);
3544
3545			return block_limit - abs_offset;
3546		}
3547
3548		if (st->frag_data) {
3549			kunmap_atomic(st->frag_data);
3550			st->frag_data = NULL;
3551		}
3552
3553		st->frag_idx++;
3554		st->stepped_offset += skb_frag_size(frag);
3555	}
3556
3557	if (st->frag_data) {
3558		kunmap_atomic(st->frag_data);
3559		st->frag_data = NULL;
3560	}
3561
3562	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
3563		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
3564		st->frag_idx = 0;
3565		goto next_skb;
3566	} else if (st->cur_skb->next) {
3567		st->cur_skb = st->cur_skb->next;
3568		st->frag_idx = 0;
3569		goto next_skb;
3570	}
3571
3572	return 0;
3573}
3574EXPORT_SYMBOL(skb_seq_read);
3575
3576/**
3577 * skb_abort_seq_read - Abort a sequential read of skb data
3578 * @st: state variable
3579 *
3580 * Must be called if skb_seq_read() was not called until it
3581 * returned 0.
3582 */
3583void skb_abort_seq_read(struct skb_seq_state *st)
3584{
3585	if (st->frag_data)
3586		kunmap_atomic(st->frag_data);
3587}
3588EXPORT_SYMBOL(skb_abort_seq_read);
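/* Example (illustrative sketch, not from this file): the sequential read API
 * is used as a loop over variable sized blocks; a caller that stops early
 * must call skb_abort_seq_read():
 *
 *	struct skb_seq_state st;
 *	const u8 *data;
 *	unsigned int consumed = 0, len;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		// ... scan data[0..len) ...
 *		consumed += len;
 *	}
 *	// no skb_abort_seq_read() needed here: skb_seq_read() returned 0
 */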
3589
3590#define TS_SKB_CB(state)	((struct skb_seq_state *) &((state)->cb))
3591
3592static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
3593					  struct ts_config *conf,
3594					  struct ts_state *state)
3595{
3596	return skb_seq_read(offset, text, TS_SKB_CB(state));
3597}
3598
3599static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
3600{
3601	skb_abort_seq_read(TS_SKB_CB(state));
3602}
3603
3604/**
3605 * skb_find_text - Find a text pattern in skb data
3606 * @skb: the buffer to look in
3607 * @from: search offset
3608 * @to: search limit
3609 * @config: textsearch configuration
3610 *
3611 * Finds a pattern in the skb data according to the specified
3612 * textsearch configuration. Use textsearch_next() to retrieve
3613 * subsequent occurrences of the pattern. Returns the offset
3614 * to the first occurrence or UINT_MAX if no match was found.
3615 */
3616unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
3617			   unsigned int to, struct ts_config *config)
3618{
3619	struct ts_state state;
3620	unsigned int ret;
3621
3622	config->get_next_block = skb_ts_get_next_block;
3623	config->finish = skb_ts_finish;
3624
3625	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
3626
3627	ret = textsearch_find(config, &state);
3628	return (ret <= to - from ? ret : UINT_MAX);
3629}
3630EXPORT_SYMBOL(skb_find_text);
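/* Example (illustrative sketch, not from this file): searching an skb for a
 * byte pattern with the textsearch infrastructure; the pattern and the
 * choice of the "kmp" algorithm are hypothetical:
 *
 *	struct ts_config *conf;
 *	unsigned int pos;
 *
 *	conf = textsearch_prepare("kmp", "needle", 6, GFP_KERNEL, TS_AUTOLOAD);
 *	if (IS_ERR(conf))
 *		return PTR_ERR(conf);
 *	pos = skb_find_text(skb, 0, skb->len, conf);	// UINT_MAX if not found
 *	textsearch_destroy(conf);
 */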
3631
3632int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
3633			 int offset, size_t size)
3634{
3635	int i = skb_shinfo(skb)->nr_frags;
3636
3637	if (skb_can_coalesce(skb, i, page, offset)) {
3638		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
3639	} else if (i < MAX_SKB_FRAGS) {
3640		get_page(page);
3641		skb_fill_page_desc(skb, i, page, offset, size);
3642	} else {
3643		return -EMSGSIZE;
3644	}
3645
3646	return 0;
3647}
3648EXPORT_SYMBOL_GPL(skb_append_pagefrags);
3649
3650/**
3651 *	skb_pull_rcsum - pull skb and update receive checksum
3652 *	@skb: buffer to update
3653 *	@len: length of data pulled
3654 *
3655 *	This function performs an skb_pull on the packet and updates
3656 *	the CHECKSUM_COMPLETE checksum.  It should be used on
3657 *	receive path processing instead of skb_pull unless you know
3658 *	that the checksum difference is zero (e.g., a valid IP header)
3659 *	or you are setting ip_summed to CHECKSUM_NONE.
3660 */
3661void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
3662{
3663	unsigned char *data = skb->data;
3664
3665	BUG_ON(len > skb->len);
3666	__skb_pull(skb, len);
3667	skb_postpull_rcsum(skb, data, len);
3668	return skb->data;
3669}
3670EXPORT_SYMBOL_GPL(skb_pull_rcsum);
3671
3672static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
3673{
3674	skb_frag_t head_frag;
3675	struct page *page;
3676
3677	page = virt_to_head_page(frag_skb->head);
3678	__skb_frag_set_page(&head_frag, page);
3679	skb_frag_off_set(&head_frag, frag_skb->data -
3680			 (unsigned char *)page_address(page));
3681	skb_frag_size_set(&head_frag, skb_headlen(frag_skb));
3682	return head_frag;
3683}
3684
3685struct sk_buff *skb_segment_list(struct sk_buff *skb,
3686				 netdev_features_t features,
3687				 unsigned int offset)
3688{
3689	struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
3690	unsigned int tnl_hlen = skb_tnl_header_len(skb);
3691	unsigned int delta_truesize = 0;
3692	unsigned int delta_len = 0;
3693	struct sk_buff *tail = NULL;
3694	struct sk_buff *nskb, *tmp;
3695	int err;
3696
3697	skb_push(skb, -skb_network_offset(skb) + offset);
3698
3699	/* Ensure the head is writeable before touching the shared info */
3700	err = skb_unclone(skb, GFP_ATOMIC);
3701	if (err)
3702		goto err_linearize;
3703
3704	skb_shinfo(skb)->frag_list = NULL;
3705
3706	while (list_skb) {
3707		nskb = list_skb;
3708		list_skb = list_skb->next;
3709
3710		err = 0;
3711		delta_truesize += nskb->truesize;
3712		if (skb_shared(nskb)) {
3713			tmp = skb_clone(nskb, GFP_ATOMIC);
3714			if (tmp) {
3715				consume_skb(nskb);
3716				nskb = tmp;
3717				err = skb_unclone(nskb, GFP_ATOMIC);
3718			} else {
3719				err = -ENOMEM;
3720			}
3721		}
3722
3723		if (!tail)
3724			skb->next = nskb;
3725		else
3726			tail->next = nskb;
3727
3728		if (unlikely(err)) {
3729			nskb->next = list_skb;
3730			goto err_linearize;
3731		}
3732
3733		tail = nskb;
3734
3735		delta_len += nskb->len;
3736
3737		skb_push(nskb, -skb_network_offset(nskb) + offset);
3738
3739		skb_release_head_state(nskb);
3740		 __copy_skb_header(nskb, skb);
3741
3742		skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
3743		skb_copy_from_linear_data_offset(skb, -tnl_hlen,
3744						 nskb->data - tnl_hlen,
3745						 offset + tnl_hlen);
3746
3747		if (skb_needs_linearize(nskb, features) &&
3748		    __skb_linearize(nskb))
3749			goto err_linearize;
3750	}
3751
3752	skb->truesize = skb->truesize - delta_truesize;
3753	skb->data_len = skb->data_len - delta_len;
3754	skb->len = skb->len - delta_len;
3755
3756	skb_gso_reset(skb);
3757
3758	skb->prev = tail;
3759
3760	if (skb_needs_linearize(skb, features) &&
3761	    __skb_linearize(skb))
3762		goto err_linearize;
3763
3764	skb_get(skb);
3765
3766	return skb;
3767
3768err_linearize:
3769	kfree_skb_list(skb->next);
3770	skb->next = NULL;
3771	return ERR_PTR(-ENOMEM);
3772}
3773EXPORT_SYMBOL_GPL(skb_segment_list);
3774
3775int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
3776{
3777	if (unlikely(p->len + skb->len >= 65536))
3778		return -E2BIG;
3779
3780	if (NAPI_GRO_CB(p)->last == p)
3781		skb_shinfo(p)->frag_list = skb;
3782	else
3783		NAPI_GRO_CB(p)->last->next = skb;
3784
3785	skb_pull(skb, skb_gro_offset(skb));
3786
3787	NAPI_GRO_CB(p)->last = skb;
3788	NAPI_GRO_CB(p)->count++;
3789	p->data_len += skb->len;
3790	p->truesize += skb->truesize;
3791	p->len += skb->len;
3792
3793	NAPI_GRO_CB(skb)->same_flow = 1;
3794
3795	return 0;
3796}
3797
3798/**
3799 *	skb_segment - Perform protocol segmentation on skb.
3800 *	@head_skb: buffer to segment
3801 *	@features: features for the output path (see dev->features)
3802 *
3803 *	This function performs segmentation on the given skb.  It returns
3804 *	a pointer to the first in a list of new skbs for the segments.
3805 *	In case of error it returns ERR_PTR(err).
3806 */
3807struct sk_buff *skb_segment(struct sk_buff *head_skb,
3808			    netdev_features_t features)
3809{
3810	struct sk_buff *segs = NULL;
3811	struct sk_buff *tail = NULL;
3812	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
3813	unsigned int mss = skb_shinfo(head_skb)->gso_size;
3814	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
3815	unsigned int offset = doffset;
3816	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
3817	unsigned int partial_segs = 0;
3818	unsigned int headroom;
3819	unsigned int len = head_skb->len;
3820	struct sk_buff *frag_skb;
3821	skb_frag_t *frag;
3822	__be16 proto;
3823	bool csum, sg;
3824	int err = -ENOMEM;
3825	int i = 0;
3826	int nfrags, pos;
3827
3828	if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
3829	    mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
3830		struct sk_buff *check_skb;
3831
3832		for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
3833			if (skb_headlen(check_skb) && !check_skb->head_frag) {
3834				/* gso_size is untrusted, and we have a frag_list with
3835				 * a linear non head_frag item.
3836				 *
3837				 * If head_skb's headlen does not fit requested gso_size,
3838				 * it means that the frag_list members do NOT terminate
3839				 * on exact gso_size boundaries. Hence we cannot perform
3840				 * skb_frag_t page sharing. Therefore we must fallback to
3841				 * copying the frag_list skbs; we do so by disabling SG.
3842				 */
3843				features &= ~NETIF_F_SG;
3844				break;
3845			}
3846		}
3847	}
3848
3849	__skb_push(head_skb, doffset);
3850	proto = skb_network_protocol(head_skb, NULL);
3851	if (unlikely(!proto))
3852		return ERR_PTR(-EINVAL);
3853
3854	sg = !!(features & NETIF_F_SG);
3855	csum = !!can_checksum_protocol(features, proto);
3856
3857	if (sg && csum && (mss != GSO_BY_FRAGS))  {
3858		if (!(features & NETIF_F_GSO_PARTIAL)) {
3859			struct sk_buff *iter;
3860			unsigned int frag_len;
3861
3862			if (!list_skb ||
3863			    !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
3864				goto normal;
3865
3866			/* If we get here then all the required
3867			 * GSO features except frag_list are supported.
3868			 * Try to split the SKB into multiple GSO SKBs
3869			 * with no frag_list.
3870			 * Currently we can do that only when the buffers don't
3871			 * have a linear part and all the buffers except
3872			 * the last are of the same length.
3873			 */
3874			frag_len = list_skb->len;
3875			skb_walk_frags(head_skb, iter) {
3876				if (frag_len != iter->len && iter->next)
3877					goto normal;
3878				if (skb_headlen(iter) && !iter->head_frag)
3879					goto normal;
3880
3881				len -= iter->len;
3882			}
3883
3884			if (len != frag_len)
3885				goto normal;
3886		}
3887
3888		/* GSO partial only requires that we trim off any excess that
3889		 * doesn't fit into an MSS sized block, so take care of that
3890		 * now.
3891		 * Cap len to not accidentally hit GSO_BY_FRAGS.
3892		 */
3893		partial_segs = min(len, GSO_BY_FRAGS - 1U) / mss;
3894		if (partial_segs > 1)
3895			mss *= partial_segs;
3896		else
3897			partial_segs = 0;
3898	}
3899
3900normal:
3901	headroom = skb_headroom(head_skb);
3902	pos = skb_headlen(head_skb);
3903
3904	if (skb_orphan_frags(head_skb, GFP_ATOMIC))
3905		return ERR_PTR(-ENOMEM);
3906
3907	nfrags = skb_shinfo(head_skb)->nr_frags;
3908	frag = skb_shinfo(head_skb)->frags;
3909	frag_skb = head_skb;
3910
3911	do {
3912		struct sk_buff *nskb;
3913		skb_frag_t *nskb_frag;
3914		int hsize;
3915		int size;
3916
3917		if (unlikely(mss == GSO_BY_FRAGS)) {
3918			len = list_skb->len;
3919		} else {
3920			len = head_skb->len - offset;
3921			if (len > mss)
3922				len = mss;
3923		}
3924
3925		hsize = skb_headlen(head_skb) - offset;
3926		if (hsize < 0)
3927			hsize = 0;
3928		if (hsize > len || !sg)
3929			hsize = len;
3930
3931		if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
3932		    (skb_headlen(list_skb) == len || sg)) {
3933			BUG_ON(skb_headlen(list_skb) > len);
3934
3935			nskb = skb_clone(list_skb, GFP_ATOMIC);
3936			if (unlikely(!nskb))
3937				goto err;
3938
3939			i = 0;
3940			nfrags = skb_shinfo(list_skb)->nr_frags;
3941			frag = skb_shinfo(list_skb)->frags;
3942			frag_skb = list_skb;
3943			pos += skb_headlen(list_skb);
3944
3945			while (pos < offset + len) {
3946				BUG_ON(i >= nfrags);
3947
3948				size = skb_frag_size(frag);
3949				if (pos + size > offset + len)
3950					break;
3951
3952				i++;
3953				pos += size;
3954				frag++;
3955			}
3956
3957			list_skb = list_skb->next;
3958
3959			if (unlikely(pskb_trim(nskb, len))) {
3960				kfree_skb(nskb);
3961				goto err;
3962			}
3963
3964			hsize = skb_end_offset(nskb);
3965			if (skb_cow_head(nskb, doffset + headroom)) {
3966				kfree_skb(nskb);
3967				goto err;
3968			}
3969
3970			nskb->truesize += skb_end_offset(nskb) - hsize;
3971			skb_release_head_state(nskb);
3972			__skb_push(nskb, doffset);
3973		} else {
3974			nskb = __alloc_skb(hsize + doffset + headroom,
3975					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
3976					   NUMA_NO_NODE);
3977
3978			if (unlikely(!nskb))
3979				goto err;
3980
3981			skb_reserve(nskb, headroom);
3982			__skb_put(nskb, doffset);
3983		}
3984
3985		if (segs)
3986			tail->next = nskb;
3987		else
3988			segs = nskb;
3989		tail = nskb;
3990
3991		__copy_skb_header(nskb, head_skb);
3992
3993		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
3994		skb_reset_mac_len(nskb);
3995
3996		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
3997						 nskb->data - tnl_hlen,
3998						 doffset + tnl_hlen);
3999
4000		if (nskb->len == len + doffset)
4001			goto perform_csum_check;
4002
4003		if (!sg) {
4004			if (!csum) {
4005				if (!nskb->remcsum_offload)
4006					nskb->ip_summed = CHECKSUM_NONE;
4007				SKB_GSO_CB(nskb)->csum =
4008					skb_copy_and_csum_bits(head_skb, offset,
4009							       skb_put(nskb,
4010								       len),
4011							       len);
4012				SKB_GSO_CB(nskb)->csum_start =
4013					skb_headroom(nskb) + doffset;
4014			} else {
4015				if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
4016					goto err;
4017			}
4018			continue;
4019		}
4020
4021		nskb_frag = skb_shinfo(nskb)->frags;
4022
4023		skb_copy_from_linear_data_offset(head_skb, offset,
4024						 skb_put(nskb, hsize), hsize);
4025
4026		skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
4027					      SKBTX_SHARED_FRAG;
4028
4029		if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
4030			goto err;
4031
4032		while (pos < offset + len) {
4033			if (i >= nfrags) {
4034				if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
4035				    skb_zerocopy_clone(nskb, list_skb,
4036						       GFP_ATOMIC))
4037					goto err;
4038
4039				i = 0;
4040				nfrags = skb_shinfo(list_skb)->nr_frags;
4041				frag = skb_shinfo(list_skb)->frags;
4042				frag_skb = list_skb;
4043				if (!skb_headlen(list_skb)) {
4044					BUG_ON(!nfrags);
4045				} else {
4046					BUG_ON(!list_skb->head_frag);
4047
4048					/* to make room for head_frag. */
4049					i--;
4050					frag--;
4051				}
4052
4053				list_skb = list_skb->next;
4054			}
4055
4056			if (unlikely(skb_shinfo(nskb)->nr_frags >=
4057				     MAX_SKB_FRAGS)) {
4058				net_warn_ratelimited(
4059					"skb_segment: too many frags: %u %u\n",
4060					pos, mss);
4061				err = -EINVAL;
4062				goto err;
4063			}
4064
4065			*nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
4066			__skb_frag_ref(nskb_frag);
4067			size = skb_frag_size(nskb_frag);
4068
4069			if (pos < offset) {
4070				skb_frag_off_add(nskb_frag, offset - pos);
4071				skb_frag_size_sub(nskb_frag, offset - pos);
4072			}
4073
4074			skb_shinfo(nskb)->nr_frags++;
4075
4076			if (pos + size <= offset + len) {
4077				i++;
4078				frag++;
4079				pos += size;
4080			} else {
4081				skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
4082				goto skip_fraglist;
4083			}
4084
4085			nskb_frag++;
4086		}
4087
4088skip_fraglist:
4089		nskb->data_len = len - hsize;
4090		nskb->len += nskb->data_len;
4091		nskb->truesize += nskb->data_len;
4092
4093perform_csum_check:
4094		if (!csum) {
4095			if (skb_has_shared_frag(nskb) &&
4096			    __skb_linearize(nskb))
4097				goto err;
4098
4099			if (!nskb->remcsum_offload)
4100				nskb->ip_summed = CHECKSUM_NONE;
4101			SKB_GSO_CB(nskb)->csum =
4102				skb_checksum(nskb, doffset,
4103					     nskb->len - doffset, 0);
4104			SKB_GSO_CB(nskb)->csum_start =
4105				skb_headroom(nskb) + doffset;
4106		}
4107	} while ((offset += len) < head_skb->len);
4108
4109	/* Some callers want to get the end of the list.
4110	 * Put it in segs->prev to avoid walking the list.
4111	 * (see validate_xmit_skb_list() for example)
4112	 */
4113	segs->prev = tail;
4114
4115	if (partial_segs) {
4116		struct sk_buff *iter;
4117		int type = skb_shinfo(head_skb)->gso_type;
4118		unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
4119
4120		/* Update type to add partial and then remove dodgy if set */
4121		type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
4122		type &= ~SKB_GSO_DODGY;
4123
4124		/* Update GSO info and prepare to start updating headers on
4125		 * our way back down the stack of protocols.
4126		 */
4127		for (iter = segs; iter; iter = iter->next) {
4128			skb_shinfo(iter)->gso_size = gso_size;
4129			skb_shinfo(iter)->gso_segs = partial_segs;
4130			skb_shinfo(iter)->gso_type = type;
4131			SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
4132		}
4133
4134		if (tail->len - doffset <= gso_size)
4135			skb_shinfo(tail)->gso_size = 0;
4136		else if (tail != segs)
4137			skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
4138	}
4139
4140	/* Following permits correct backpressure, for protocols
4141	 * using skb_set_owner_w().
4142	 * Idea is to transfer ownership from head_skb to last segment.
4143	 */
4144	if (head_skb->destructor == sock_wfree) {
4145		swap(tail->truesize, head_skb->truesize);
4146		swap(tail->destructor, head_skb->destructor);
4147		swap(tail->sk, head_skb->sk);
4148	}
4149	return segs;
4150
4151err:
4152	kfree_skb_list(segs);
4153	return ERR_PTR(err);
4154}
4155EXPORT_SYMBOL_GPL(skb_segment);
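
/* Illustrative sketch, not part of the original file: one hypothetical way a
 * transmit path might use skb_segment().  The helper name example_xmit_gso()
 * and the xmit callback are assumptions for illustration only; real callers
 * go through the core GSO handling in net/core/dev.c.
 */
static int __maybe_unused example_xmit_gso(struct sk_buff *skb,
					   netdev_features_t features,
					   int (*xmit)(struct sk_buff *))
{
	struct sk_buff *segs, *seg, *next;

	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);

	/* the original GSO skb has been copied into the segments */
	consume_skb(skb);

	skb_list_walk_safe(segs, seg, next) {
		skb_mark_not_on_list(seg);
		xmit(seg);
	}
	return 0;
}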
4156
4157int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
4158{
4159	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
4160	unsigned int offset = skb_gro_offset(skb);
4161	unsigned int headlen = skb_headlen(skb);
4162	unsigned int len = skb_gro_len(skb);
4163	unsigned int delta_truesize;
4164	struct sk_buff *lp;
4165
4166	if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
4167		return -E2BIG;
4168
4169	lp = NAPI_GRO_CB(p)->last;
4170	pinfo = skb_shinfo(lp);
4171
4172	if (headlen <= offset) {
4173		skb_frag_t *frag;
4174		skb_frag_t *frag2;
4175		int i = skbinfo->nr_frags;
4176		int nr_frags = pinfo->nr_frags + i;
4177
4178		if (nr_frags > MAX_SKB_FRAGS)
4179			goto merge;
4180
4181		offset -= headlen;
4182		pinfo->nr_frags = nr_frags;
4183		skbinfo->nr_frags = 0;
4184
4185		frag = pinfo->frags + nr_frags;
4186		frag2 = skbinfo->frags + i;
4187		do {
4188			*--frag = *--frag2;
4189		} while (--i);
4190
4191		skb_frag_off_add(frag, offset);
4192		skb_frag_size_sub(frag, offset);
4193
4194		/* all fragments' truesize: remove (head size + sk_buff) */
4195		delta_truesize = skb->truesize -
4196				 SKB_TRUESIZE(skb_end_offset(skb));
4197
4198		skb->truesize -= skb->data_len;
4199		skb->len -= skb->data_len;
4200		skb->data_len = 0;
4201
4202		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
4203		goto done;
4204	} else if (skb->head_frag) {
4205		int nr_frags = pinfo->nr_frags;
4206		skb_frag_t *frag = pinfo->frags + nr_frags;
4207		struct page *page = virt_to_head_page(skb->head);
4208		unsigned int first_size = headlen - offset;
4209		unsigned int first_offset;
4210
4211		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
4212			goto merge;
4213
4214		first_offset = skb->data -
4215			       (unsigned char *)page_address(page) +
4216			       offset;
4217
4218		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
4219
4220		__skb_frag_set_page(frag, page);
4221		skb_frag_off_set(frag, first_offset);
4222		skb_frag_size_set(frag, first_size);
4223
4224		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
4225		/* We don't need to clear skbinfo->nr_frags here */
4226
4227		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
4228		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
4229		goto done;
4230	}
4231
4232merge:
4233	delta_truesize = skb->truesize;
4234	if (offset > headlen) {
4235		unsigned int eat = offset - headlen;
4236
4237		skb_frag_off_add(&skbinfo->frags[0], eat);
4238		skb_frag_size_sub(&skbinfo->frags[0], eat);
4239		skb->data_len -= eat;
4240		skb->len -= eat;
4241		offset = headlen;
4242	}
4243
4244	__skb_pull(skb, offset);
4245
4246	if (NAPI_GRO_CB(p)->last == p)
4247		skb_shinfo(p)->frag_list = skb;
4248	else
4249		NAPI_GRO_CB(p)->last->next = skb;
4250	NAPI_GRO_CB(p)->last = skb;
4251	__skb_header_release(skb);
4252	lp = p;
4253
4254done:
4255	NAPI_GRO_CB(p)->count++;
4256	p->data_len += len;
4257	p->truesize += delta_truesize;
4258	p->len += len;
4259	if (lp != p) {
4260		lp->data_len += len;
4261		lp->truesize += delta_truesize;
4262		lp->len += len;
4263	}
4264	NAPI_GRO_CB(skb)->same_flow = 1;
4265	return 0;
4266}
4267
4268#ifdef CONFIG_SKB_EXTENSIONS
4269#define SKB_EXT_ALIGN_VALUE	8
4270#define SKB_EXT_CHUNKSIZEOF(x)	(ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
4271
4272static const u8 skb_ext_type_len[] = {
4273#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
4274	[SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
4275#endif
4276#ifdef CONFIG_XFRM
4277	[SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
4278#endif
4279#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
4280	[TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
4281#endif
4282#if IS_ENABLED(CONFIG_MPTCP)
4283	[SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
4284#endif
4285};
4286
4287static __always_inline unsigned int skb_ext_total_length(void)
4288{
4289	return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
4290#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
4291		skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
4292#endif
4293#ifdef CONFIG_XFRM
4294		skb_ext_type_len[SKB_EXT_SEC_PATH] +
4295#endif
4296#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
4297		skb_ext_type_len[TC_SKB_EXT] +
4298#endif
4299#if IS_ENABLED(CONFIG_MPTCP)
4300		skb_ext_type_len[SKB_EXT_MPTCP] +
4301#endif
4302		0;
4303}
4304
4305static void skb_extensions_init(void)
4306{
4307	BUILD_BUG_ON(SKB_EXT_NUM >= 8);
4308	BUILD_BUG_ON(skb_ext_total_length() > 255);
4309
4310	skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
4311					     SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
4312					     0,
4313					     SLAB_HWCACHE_ALIGN|SLAB_PANIC,
4314					     NULL);
4315}
4316#else
4317static void skb_extensions_init(void) {}
4318#endif
4319
4320void __init skb_init(void)
4321{
4322	skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
4323					      sizeof(struct sk_buff),
4324					      0,
4325					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
4326					      offsetof(struct sk_buff, cb),
4327					      sizeof_field(struct sk_buff, cb),
4328					      NULL);
4329	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
4330						sizeof(struct sk_buff_fclones),
4331						0,
4332						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
4333						NULL);
4334	skb_extensions_init();
4335}
4336
4337static int
4338__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
4339	       unsigned int recursion_level)
4340{
4341	int start = skb_headlen(skb);
4342	int i, copy = start - offset;
4343	struct sk_buff *frag_iter;
4344	int elt = 0;
4345
4346	if (unlikely(recursion_level >= 24))
4347		return -EMSGSIZE;
4348
4349	if (copy > 0) {
4350		if (copy > len)
4351			copy = len;
4352		sg_set_buf(sg, skb->data + offset, copy);
4353		elt++;
4354		if ((len -= copy) == 0)
4355			return elt;
4356		offset += copy;
4357	}
4358
4359	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
4360		int end;
4361
4362		WARN_ON(start > offset + len);
4363
4364		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
4365		if ((copy = end - offset) > 0) {
4366			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
4367			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
4368				return -EMSGSIZE;
4369
4370			if (copy > len)
4371				copy = len;
4372			sg_set_page(&sg[elt], skb_frag_page(frag), copy,
4373				    skb_frag_off(frag) + offset - start);
4374			elt++;
4375			if (!(len -= copy))
4376				return elt;
4377			offset += copy;
4378		}
4379		start = end;
4380	}
4381
4382	skb_walk_frags(skb, frag_iter) {
4383		int end, ret;
4384
4385		WARN_ON(start > offset + len);
4386
4387		end = start + frag_iter->len;
4388		if ((copy = end - offset) > 0) {
4389			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
4390				return -EMSGSIZE;
4391
4392			if (copy > len)
4393				copy = len;
4394			ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
4395					      copy, recursion_level + 1);
4396			if (unlikely(ret < 0))
4397				return ret;
4398			elt += ret;
4399			if ((len -= copy) == 0)
4400				return elt;
4401			offset += copy;
4402		}
4403		start = end;
4404	}
4405	BUG_ON(len);
4406	return elt;
4407}
4408
4409/**
4410 *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer
4411 *	@skb: Socket buffer containing the buffers to be mapped
4412 *	@sg: The scatter-gather list to map into
4413 *	@offset: The offset into the buffer's contents to start mapping
4414 *	@len: Length of buffer space to be mapped
4415 *
4416 *	Fill the specified scatter-gather list with mappings/pointers into a
4417 *	region of the buffer space attached to a socket buffer. Returns either
4418 *	the number of scatterlist items used, or -EMSGSIZE if the contents
4419 *	could not fit.
4420 */
4421int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
4422{
4423	int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
4424
4425	if (nsg <= 0)
4426		return nsg;
4427
4428	sg_mark_end(&sg[nsg - 1]);
4429
4430	return nsg;
4431}
4432EXPORT_SYMBOL_GPL(skb_to_sgvec);
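
/* Illustrative sketch, not part of the original file: map an skb into a
 * caller-provided scatterlist, e.g. before handing it to a crypto or DMA
 * API.  example_map_skb() is an assumed name used only for illustration.
 */
static int __maybe_unused example_map_skb(struct sk_buff *skb,
					  struct scatterlist *sg, int nents)
{
	int nsg;

	sg_init_table(sg, nents);
	nsg = skb_to_sgvec(skb, sg, 0, skb->len);
	if (nsg < 0)
		return nsg;	/* -EMSGSIZE if the skb does not fit in nents */

	/* sg[0 .. nsg-1] now reference the skb head and its page fragments */
	return nsg;
}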
4433
4434/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
4435 * given sglist without marking the sg which contains the last skb data as the
4436 * end. So the caller can manipulate the sg list at will when padding new data
4437 * after the first call, without calling sg_unmark_end to extend the sg list.
4438 *
4439 * Scenario to use skb_to_sgvec_nomark:
4440 * 1. sg_init_table
4441 * 2. skb_to_sgvec_nomark(payload1)
4442 * 3. skb_to_sgvec_nomark(payload2)
4443 *
4444 * This is equivalent to:
4445 * 1. sg_init_table
4446 * 2. skb_to_sgvec(payload1)
4447 * 3. sg_unmark_end
4448 * 4. skb_to_sgvec(payload2)
4449 *
4450 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
4451 * is preferable.
4452 */
4453int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
4454			int offset, int len)
4455{
4456	return __skb_to_sgvec(skb, sg, offset, len, 0);
4457}
4458EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
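
/* Illustrative sketch, not part of the original file: build one scatterlist
 * from two ranges of the same skb, marking the end only once, mirroring the
 * scenario described above.  example_map_two_ranges() and the split point
 * are assumptions for illustration only.
 */
static int __maybe_unused example_map_two_ranges(struct sk_buff *skb,
						 struct scatterlist *sg,
						 int nents, int split)
{
	int n1, n2;

	sg_init_table(sg, nents);

	/* bytes [0, split) of the skb */
	n1 = skb_to_sgvec_nomark(skb, sg, 0, split);
	if (n1 < 0)
		return n1;

	/* bytes [split, skb->len) of the skb */
	n2 = skb_to_sgvec_nomark(skb, sg + n1, split, skb->len - split);
	if (n2 < 0)
		return n2;

	sg_mark_end(&sg[n1 + n2 - 1]);
	return n1 + n2;
}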
4459
4460
4461
4462/**
4463 *	skb_cow_data - Check that a socket buffer's data buffers are writable
4464 *	@skb: The socket buffer to check.
4465 *	@tailbits: Amount of trailing space to be added
4466 *	@trailer: Returned pointer to the skb where the @tailbits space begins
4467 *
4468 *	Make sure that the data buffers attached to a socket buffer are
4469 *	writable. If they are not, private copies are made of the data buffers
4470 *	and the socket buffer is set to use these instead.
4471 *
4472 *	If @tailbits is given, make sure that there is space to write @tailbits
4473 *	bytes of data beyond current end of socket buffer.  @trailer will be
4474 *	set to point to the skb in which this space begins.
4475 *
4476 *	The number of scatterlist elements required to completely map the
4477 *	COW'd and extended socket buffer will be returned.
4478 */
4479int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
4480{
4481	int copyflag;
4482	int elt;
4483	struct sk_buff *skb1, **skb_p;
4484
4485	/* If skb is cloned or its head is paged, reallocate
4486	 * head pulling out all the pages (pages are considered not writable
4487	 * at the moment even if they are anonymous).
4488	 */
4489	if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
4490	    !__pskb_pull_tail(skb, __skb_pagelen(skb)))
4491		return -ENOMEM;
4492
4493	/* Easy case. Most of packets will go this way. */
4494	if (!skb_has_frag_list(skb)) {
4495		/* A little trouble: not enough space for the trailer.
4496		 * This should not happen when the stack is tuned to generate
4497		 * good frames. OK, on a miss we reallocate and reserve even more
4498		 * space; 128 bytes is fair. */
4499
4500		if (skb_tailroom(skb) < tailbits &&
4501		    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
4502			return -ENOMEM;
4503
4504		/* Voila! */
4505		*trailer = skb;
4506		return 1;
4507	}
4508
4509	/* Misery. We are in trouble, going to mince the fragments... */
4510
4511	elt = 1;
4512	skb_p = &skb_shinfo(skb)->frag_list;
4513	copyflag = 0;
4514
4515	while ((skb1 = *skb_p) != NULL) {
4516		int ntail = 0;
4517
4518		/* The fragment was partially pulled by someone;
4519		 * this can happen on input. Copy it and everything
4520		 * after it. */
4521
4522		if (skb_shared(skb1))
4523			copyflag = 1;
4524
4525		/* If the skb is the last, worry about trailer. */
4526
4527		if (skb1->next == NULL && tailbits) {
4528			if (skb_shinfo(skb1)->nr_frags ||
4529			    skb_has_frag_list(skb1) ||
4530			    skb_tailroom(skb1) < tailbits)
4531				ntail = tailbits + 128;
4532		}
4533
4534		if (copyflag ||
4535		    skb_cloned(skb1) ||
4536		    ntail ||
4537		    skb_shinfo(skb1)->nr_frags ||
4538		    skb_has_frag_list(skb1)) {
4539			struct sk_buff *skb2;
4540
4541			/* Fuck, we are miserable poor guys... */
4542			if (ntail == 0)
4543				skb2 = skb_copy(skb1, GFP_ATOMIC);
4544			else
4545				skb2 = skb_copy_expand(skb1,
4546						       skb_headroom(skb1),
4547						       ntail,
4548						       GFP_ATOMIC);
4549			if (unlikely(skb2 == NULL))
4550				return -ENOMEM;
4551
4552			if (skb1->sk)
4553				skb_set_owner_w(skb2, skb1->sk);
4554
4555			/* Looking around. Are we still alive?
4556			 * OK, link new skb, drop old one */
4557
4558			skb2->next = skb1->next;
4559			*skb_p = skb2;
4560			kfree_skb(skb1);
4561			skb1 = skb2;
4562		}
4563		elt++;
4564		*trailer = skb1;
4565		skb_p = &skb1->next;
4566	}
4567
4568	return elt;
4569}
4570EXPORT_SYMBOL_GPL(skb_cow_data);
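
/* Illustrative sketch, not part of the original file: the ESP-style pattern
 * of making an skb fully writable and reserving room for a trailer before
 * mapping it.  example_prepare_trailer() is an assumed name; the real users
 * of this pattern live in the IPsec code.
 */
static int __maybe_unused example_prepare_trailer(struct sk_buff *skb,
						  int trailer_len)
{
	struct sk_buff *trailer;
	int nsg;

	nsg = skb_cow_data(skb, trailer_len, &trailer);
	if (nsg < 0)
		return nsg;

	/* trailer now has at least trailer_len bytes of tailroom */
	pskb_put(skb, trailer, trailer_len);

	/* nsg scatterlist entries are enough to map the whole skb */
	return nsg;
}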
4571
4572static void sock_rmem_free(struct sk_buff *skb)
4573{
4574	struct sock *sk = skb->sk;
4575
4576	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
4577}
4578
4579static void skb_set_err_queue(struct sk_buff *skb)
4580{
4581	/* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
4582	 * So, it is safe to (mis)use it to mark skbs on the error queue.
4583	 */
4584	skb->pkt_type = PACKET_OUTGOING;
4585	BUILD_BUG_ON(PACKET_OUTGOING == 0);
4586}
4587
4588/*
4589 * Note: We don't mem charge error packets (no sk_forward_alloc changes)
4590 */
4591int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
4592{
4593	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
4594	    (unsigned int)READ_ONCE(sk->sk_rcvbuf))
4595		return -ENOMEM;
4596
4597	skb_orphan(skb);
4598	skb->sk = sk;
4599	skb->destructor = sock_rmem_free;
4600	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
4601	skb_set_err_queue(skb);
4602
4603	/* before exiting rcu section, make sure dst is refcounted */
4604	skb_dst_force(skb);
4605
4606	skb_queue_tail(&sk->sk_error_queue, skb);
4607	if (!sock_flag(sk, SOCK_DEAD))
4608		sk->sk_error_report(sk);
4609	return 0;
4610}
4611EXPORT_SYMBOL(sock_queue_err_skb);
4612
4613static bool is_icmp_err_skb(const struct sk_buff *skb)
4614{
4615	return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
4616		       SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
4617}
4618
4619struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
4620{
4621	struct sk_buff_head *q = &sk->sk_error_queue;
4622	struct sk_buff *skb, *skb_next = NULL;
4623	bool icmp_next = false;
4624	unsigned long flags;
4625
4626	spin_lock_irqsave(&q->lock, flags);
4627	skb = __skb_dequeue(q);
4628	if (skb && (skb_next = skb_peek(q))) {
4629		icmp_next = is_icmp_err_skb(skb_next);
4630		if (icmp_next)
4631			sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
4632	}
4633	spin_unlock_irqrestore(&q->lock, flags);
4634
4635	if (is_icmp_err_skb(skb) && !icmp_next)
4636		sk->sk_err = 0;
4637
4638	if (skb_next)
4639		sk->sk_error_report(sk);
4640
4641	return skb;
4642}
4643EXPORT_SYMBOL(sock_dequeue_err_skb);
4644
4645/**
4646 * skb_clone_sk - create clone of skb, and take reference to socket
4647 * @skb: the skb to clone
4648 *
4649 * This function creates a clone of a buffer that holds a reference on
4650 * sk_refcnt.  Buffers created via this function are meant to be
4651 * returned using sock_queue_err_skb, or free via kfree_skb.
4652 *
4653 * When passing buffers allocated with this function to sock_queue_err_skb
4654 * it is necessary to wrap the call with sock_hold/sock_put in order to
4655 * prevent the socket from being released prior to being enqueued on
4656 * the sk_error_queue.
4657 */
4658struct sk_buff *skb_clone_sk(struct sk_buff *skb)
4659{
4660	struct sock *sk = skb->sk;
4661	struct sk_buff *clone;
4662
4663	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
4664		return NULL;
4665
4666	clone = skb_clone(skb, GFP_ATOMIC);
4667	if (!clone) {
4668		sock_put(sk);
4669		return NULL;
4670	}
4671
4672	clone->sk = sk;
4673	clone->destructor = sock_efree;
4674
4675	return clone;
4676}
4677EXPORT_SYMBOL(skb_clone_sk);
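
/* Illustrative sketch, not part of the original file: clone an skb together
 * with a socket reference and queue it on the error queue, wrapping the call
 * in sock_hold()/sock_put() as the comment above prescribes.  The helper
 * name example_queue_clone_on_errqueue() is an assumption for illustration.
 */
static void __maybe_unused example_queue_clone_on_errqueue(struct sk_buff *skb)
{
	struct sk_buff *clone;
	struct sock *sk;

	clone = skb_clone_sk(skb);
	if (!clone)
		return;

	sk = clone->sk;
	sock_hold(sk);
	if (sock_queue_err_skb(sk, clone))
		kfree_skb(clone);	/* drops the clone's socket reference */
	sock_put(sk);
}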
4678
4679static void __skb_complete_tx_timestamp(struct sk_buff *skb,
4680					struct sock *sk,
4681					int tstype,
4682					bool opt_stats)
4683{
4684	struct sock_exterr_skb *serr;
4685	int err;
4686
4687	BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
4688
4689	serr = SKB_EXT_ERR(skb);
4690	memset(serr, 0, sizeof(*serr));
4691	serr->ee.ee_errno = ENOMSG;
4692	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
4693	serr->ee.ee_info = tstype;
4694	serr->opt_stats = opt_stats;
4695	serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
4696	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
4697		serr->ee.ee_data = skb_shinfo(skb)->tskey;
4698		if (sk->sk_protocol == IPPROTO_TCP &&
4699		    sk->sk_type == SOCK_STREAM)
4700			serr->ee.ee_data -= sk->sk_tskey;
4701	}
4702
4703	err = sock_queue_err_skb(sk, skb);
4704
4705	if (err)
4706		kfree_skb(skb);
4707}
4708
4709static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
4710{
4711	bool ret;
4712
4713	if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
4714		return true;
4715
4716	read_lock_bh(&sk->sk_callback_lock);
4717	ret = sk->sk_socket && sk->sk_socket->file &&
4718	      file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
4719	read_unlock_bh(&sk->sk_callback_lock);
4720	return ret;
4721}
4722
4723void skb_complete_tx_timestamp(struct sk_buff *skb,
4724			       struct skb_shared_hwtstamps *hwtstamps)
4725{
4726	struct sock *sk = skb->sk;
4727
4728	if (!skb_may_tx_timestamp(sk, false))
4729		goto err;
4730
4731	/* Take a reference to prevent skb_orphan() from freeing the socket,
4732	 * but only if the socket refcount is not zero.
4733	 */
4734	if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
4735		*skb_hwtstamps(skb) = *hwtstamps;
4736		__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
4737		sock_put(sk);
4738		return;
4739	}
4740
4741err:
4742	kfree_skb(skb);
4743}
4744EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
4745
4746void __skb_tstamp_tx(struct sk_buff *orig_skb,
4747		     struct skb_shared_hwtstamps *hwtstamps,
4748		     struct sock *sk, int tstype)
4749{
4750	struct sk_buff *skb;
4751	bool tsonly, opt_stats = false;
4752
4753	if (!sk)
4754		return;
4755
4756	if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
4757	    skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
4758		return;
4759
4760	tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
4761	if (!skb_may_tx_timestamp(sk, tsonly))
4762		return;
4763
4764	if (tsonly) {
4765#ifdef CONFIG_INET
4766		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
4767		    sk->sk_protocol == IPPROTO_TCP &&
4768		    sk->sk_type == SOCK_STREAM) {
4769			skb = tcp_get_timestamping_opt_stats(sk, orig_skb);
4770			opt_stats = true;
4771		} else
4772#endif
4773			skb = alloc_skb(0, GFP_ATOMIC);
4774	} else {
4775		skb = skb_clone(orig_skb, GFP_ATOMIC);
4776
4777		if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
4778			kfree_skb(skb);
4779			return;
4780		}
4781	}
4782	if (!skb)
4783		return;
4784
4785	if (tsonly) {
4786		skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
4787					     SKBTX_ANY_TSTAMP;
4788		skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
4789	}
4790
4791	if (hwtstamps)
4792		*skb_hwtstamps(skb) = *hwtstamps;
4793	else
4794		skb->tstamp = ktime_get_real();
4795
4796	__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
4797}
4798EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
4799
4800void skb_tstamp_tx(struct sk_buff *orig_skb,
4801		   struct skb_shared_hwtstamps *hwtstamps)
4802{
4803	return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
4804			       SCM_TSTAMP_SND);
4805}
4806EXPORT_SYMBOL_GPL(skb_tstamp_tx);
4807
4808void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
4809{
4810	struct sock *sk = skb->sk;
4811	struct sock_exterr_skb *serr;
4812	int err = 1;
4813
4814	skb->wifi_acked_valid = 1;
4815	skb->wifi_acked = acked;
4816
4817	serr = SKB_EXT_ERR(skb);
4818	memset(serr, 0, sizeof(*serr));
4819	serr->ee.ee_errno = ENOMSG;
4820	serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
4821
4822	/* Take a reference to prevent skb_orphan() from freeing the socket,
4823	 * but only if the socket refcount is not zero.
4824	 */
4825	if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
4826		err = sock_queue_err_skb(sk, skb);
4827		sock_put(sk);
4828	}
4829	if (err)
4830		kfree_skb(skb);
4831}
4832EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
4833
4834/**
4835 * skb_partial_csum_set - set up and verify partial csum values for packet
4836 * @skb: the skb to set
4837 * @start: the number of bytes after skb->data to start checksumming.
4838 * @off: the offset from start to place the checksum.
4839 *
4840 * For untrusted partially-checksummed packets, we need to make sure the values
4841 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
4842 *
4843 * This function checks and sets those values and skb->ip_summed: if this
4844 * returns false you should drop the packet.
4845 */
4846bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
4847{
4848	u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
4849	u32 csum_start = skb_headroom(skb) + (u32)start;
4850
4851	if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
4852		net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
4853				     start, off, skb_headroom(skb), skb_headlen(skb));
4854		return false;
4855	}
4856	skb->ip_summed = CHECKSUM_PARTIAL;
4857	skb->csum_start = csum_start;
4858	skb->csum_offset = off;
4859	skb_set_transport_header(skb, start);
4860	return true;
4861}
4862EXPORT_SYMBOL_GPL(skb_partial_csum_set);
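
/* Illustrative sketch, not part of the original file: validate checksum
 * metadata received from an untrusted source (virtio-style) and drop the
 * packet if it is inconsistent.  example_apply_partial_csum() is an assumed
 * name used only for illustration.
 */
static bool __maybe_unused example_apply_partial_csum(struct sk_buff *skb,
						      u16 csum_start,
						      u16 csum_offset)
{
	if (!skb_partial_csum_set(skb, csum_start, csum_offset)) {
		kfree_skb(skb);		/* bogus offsets: drop the packet */
		return false;
	}

	/* skb->ip_summed is now CHECKSUM_PARTIAL with validated offsets */
	return true;
}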
4863
4864static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
4865			       unsigned int max)
4866{
4867	if (skb_headlen(skb) >= len)
4868		return 0;
4869
4870	/* If we need to pull up, then pull up to the max, so we
4871	 * won't need to do it again.
4872	 */
4873	if (max > skb->len)
4874		max = skb->len;
4875
4876	if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
4877		return -ENOMEM;
4878
4879	if (skb_headlen(skb) < len)
4880		return -EPROTO;
4881
4882	return 0;
4883}
4884
4885#define MAX_TCP_HDR_LEN (15 * 4)
4886
4887static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
4888				      typeof(IPPROTO_IP) proto,
4889				      unsigned int off)
4890{
4891	int err;
4892
4893	switch (proto) {
4894	case IPPROTO_TCP:
4895		err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
4896					  off + MAX_TCP_HDR_LEN);
4897		if (!err && !skb_partial_csum_set(skb, off,
4898						  offsetof(struct tcphdr,
4899							   check)))
4900			err = -EPROTO;
4901		return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
4902
4903	case IPPROTO_UDP:
4904		err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
4905					  off + sizeof(struct udphdr));
4906		if (!err && !skb_partial_csum_set(skb, off,
4907						  offsetof(struct udphdr,
4908							   check)))
4909			err = -EPROTO;
4910		return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
4911	}
4912
4913	return ERR_PTR(-EPROTO);
4914}
4915
4916/* This value should be large enough to cover a tagged ethernet header plus
4917 * maximally sized IP and TCP or UDP headers.
4918 */
4919#define MAX_IP_HDR_LEN 128
4920
4921static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
4922{
4923	unsigned int off;
4924	bool fragment;
4925	__sum16 *csum;
4926	int err;
4927
4928	fragment = false;
4929
4930	err = skb_maybe_pull_tail(skb,
4931				  sizeof(struct iphdr),
4932				  MAX_IP_HDR_LEN);
4933	if (err < 0)
4934		goto out;
4935
4936	if (ip_is_fragment(ip_hdr(skb)))
4937		fragment = true;
4938
4939	off = ip_hdrlen(skb);
4940
4941	err = -EPROTO;
4942
4943	if (fragment)
4944		goto out;
4945
4946	csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
4947	if (IS_ERR(csum))
4948		return PTR_ERR(csum);
4949
4950	if (recalculate)
4951		*csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
4952					   ip_hdr(skb)->daddr,
4953					   skb->len - off,
4954					   ip_hdr(skb)->protocol, 0);
4955	err = 0;
4956
4957out:
4958	return err;
4959}
4960
4961/* This value should be large enough to cover a tagged ethernet header plus
4962 * an IPv6 header, all options, and a maximal TCP or UDP header.
4963 */
4964#define MAX_IPV6_HDR_LEN 256
4965
4966#define OPT_HDR(type, skb, off) \
4967	(type *)(skb_network_header(skb) + (off))
4968
4969static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
4970{
4971	int err;
4972	u8 nexthdr;
4973	unsigned int off;
4974	unsigned int len;
4975	bool fragment;
4976	bool done;
4977	__sum16 *csum;
4978
4979	fragment = false;
4980	done = false;
4981
4982	off = sizeof(struct ipv6hdr);
4983
4984	err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
4985	if (err < 0)
4986		goto out;
4987
4988	nexthdr = ipv6_hdr(skb)->nexthdr;
4989
4990	len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
4991	while (off <= len && !done) {
4992		switch (nexthdr) {
4993		case IPPROTO_DSTOPTS:
4994		case IPPROTO_HOPOPTS:
4995		case IPPROTO_ROUTING: {
4996			struct ipv6_opt_hdr *hp;
4997
4998			err = skb_maybe_pull_tail(skb,
4999						  off +
5000						  sizeof(struct ipv6_opt_hdr),
5001						  MAX_IPV6_HDR_LEN);
5002			if (err < 0)
5003				goto out;
5004
5005			hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
5006			nexthdr = hp->nexthdr;
5007			off += ipv6_optlen(hp);
5008			break;
5009		}
5010		case IPPROTO_AH: {
5011			struct ip_auth_hdr *hp;
5012
5013			err = skb_maybe_pull_tail(skb,
5014						  off +
5015						  sizeof(struct ip_auth_hdr),
5016						  MAX_IPV6_HDR_LEN);
5017			if (err < 0)
5018				goto out;
5019
5020			hp = OPT_HDR(struct ip_auth_hdr, skb, off);
5021			nexthdr = hp->nexthdr;
5022			off += ipv6_authlen(hp);
5023			break;
5024		}
5025		case IPPROTO_FRAGMENT: {
5026			struct frag_hdr *hp;
5027
5028			err = skb_maybe_pull_tail(skb,
5029						  off +
5030						  sizeof(struct frag_hdr),
5031						  MAX_IPV6_HDR_LEN);
5032			if (err < 0)
5033				goto out;
5034
5035			hp = OPT_HDR(struct frag_hdr, skb, off);
5036
5037			if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
5038				fragment = true;
5039
5040			nexthdr = hp->nexthdr;
5041			off += sizeof(struct frag_hdr);
5042			break;
5043		}
5044		default:
5045			done = true;
5046			break;
5047		}
5048	}
5049
5050	err = -EPROTO;
5051
5052	if (!done || fragment)
5053		goto out;
5054
5055	csum = skb_checksum_setup_ip(skb, nexthdr, off);
5056	if (IS_ERR(csum))
5057		return PTR_ERR(csum);
5058
5059	if (recalculate)
5060		*csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
5061					 &ipv6_hdr(skb)->daddr,
5062					 skb->len - off, nexthdr, 0);
5063	err = 0;
5064
5065out:
5066	return err;
5067}
5068
5069/**
5070 * skb_checksum_setup - set up partial checksum offset
5071 * @skb: the skb to set up
5072 * @recalculate: if true the pseudo-header checksum will be recalculated
5073 */
5074int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
5075{
5076	int err;
5077
5078	switch (skb->protocol) {
5079	case htons(ETH_P_IP):
5080		err = skb_checksum_setup_ipv4(skb, recalculate);
5081		break;
5082
5083	case htons(ETH_P_IPV6):
5084		err = skb_checksum_setup_ipv6(skb, recalculate);
5085		break;
5086
5087	default:
5088		err = -EPROTO;
5089		break;
5090	}
5091
5092	return err;
5093}
5094EXPORT_SYMBOL(skb_checksum_setup);
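
/* Illustrative sketch, not part of the original file: a backend receiving
 * partially-checksummed packets from a guest might recompute the
 * pseudo-header checksum before injecting them into the stack.  The helper
 * name example_fixup_guest_csum() is an assumption for illustration.
 */
static int __maybe_unused example_fixup_guest_csum(struct sk_buff *skb)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	/* recalculate the TCP/UDP pseudo-header checksum as well */
	return skb_checksum_setup(skb, true);
}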
5095
5096/**
5097 * skb_checksum_maybe_trim - maybe trims the given skb
5098 * @skb: the skb to check
5099 * @transport_len: the data length beyond the network header
5100 *
5101 * Checks whether the given skb has data beyond the given transport length.
5102 * If so, returns a cloned skb trimmed to this transport length.
5103 * Otherwise returns the provided skb. Returns NULL in error cases
5104 * (e.g. transport_len exceeds skb length or out-of-memory).
5105 *
5106 * Caller needs to set the skb transport header and free any returned skb if it
5107 * differs from the provided skb.
5108 */
5109static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
5110					       unsigned int transport_len)
5111{
5112	struct sk_buff *skb_chk;
5113	unsigned int len = skb_transport_offset(skb) + transport_len;
5114	int ret;
5115
5116	if (skb->len < len)
5117		return NULL;
5118	else if (skb->len == len)
5119		return skb;
5120
5121	skb_chk = skb_clone(skb, GFP_ATOMIC);
5122	if (!skb_chk)
5123		return NULL;
5124
5125	ret = pskb_trim_rcsum(skb_chk, len);
5126	if (ret) {
5127		kfree_skb(skb_chk);
5128		return NULL;
5129	}
5130
5131	return skb_chk;
5132}
5133
5134/**
5135 * skb_checksum_trimmed - validate checksum of an skb
5136 * @skb: the skb to check
5137 * @transport_len: the data length beyond the network header
5138 * @skb_chkf: checksum function to use
5139 *
5140 * Applies the given checksum function skb_chkf to the provided skb.
5141 * Returns a checked and maybe trimmed skb. Returns NULL on error.
5142 *
5143 * If the skb has data beyond the given transport length, then a
5144 * trimmed & cloned skb is checked and returned.
5145 *
5146 * Caller needs to set the skb transport header and free any returned skb if it
5147 * differs from the provided skb.
5148 */
5149struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
5150				     unsigned int transport_len,
5151				     __sum16(*skb_chkf)(struct sk_buff *skb))
5152{
5153	struct sk_buff *skb_chk;
5154	unsigned int offset = skb_transport_offset(skb);
5155	__sum16 ret;
5156
5157	skb_chk = skb_checksum_maybe_trim(skb, transport_len);
5158	if (!skb_chk)
5159		goto err;
5160
5161	if (!pskb_may_pull(skb_chk, offset))
5162		goto err;
5163
5164	skb_pull_rcsum(skb_chk, offset);
5165	ret = skb_chkf(skb_chk);
5166	skb_push_rcsum(skb_chk, offset);
5167
5168	if (ret)
5169		goto err;
5170
5171	return skb_chk;
5172
5173err:
5174	if (skb_chk && skb_chk != skb)
5175		kfree_skb(skb_chk);
5176
5177	return NULL;
5178
5179}
5180EXPORT_SYMBOL(skb_checksum_trimmed);
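
/* Illustrative sketch, not part of the original file: validate the checksum
 * over a fixed-length transport payload, releasing any trimmed clone as the
 * comment above requires.  example_csum()/example_validate() are assumed
 * names; a real caller would pass a protocol-specific checksum routine.
 */
static __sum16 example_csum(struct sk_buff *skb)
{
	return skb_checksum_simple_validate(skb);
}

static bool __maybe_unused example_validate(struct sk_buff *skb,
					    unsigned int transport_len)
{
	struct sk_buff *skb_chk;

	skb_chk = skb_checksum_trimmed(skb, transport_len, example_csum);
	if (!skb_chk)
		return false;

	if (skb_chk != skb)
		kfree_skb(skb_chk);	/* a trimmed clone was checked */
	return true;
}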
5181
5182void __skb_warn_lro_forwarding(const struct sk_buff *skb)
5183{
5184	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
5185			     skb->dev->name);
5186}
5187EXPORT_SYMBOL(__skb_warn_lro_forwarding);
5188
5189void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
5190{
5191	if (head_stolen) {
5192		skb_release_head_state(skb);
5193		kmem_cache_free(skbuff_head_cache, skb);
5194	} else {
5195		__kfree_skb(skb);
5196	}
5197}
5198EXPORT_SYMBOL(kfree_skb_partial);
5199
5200/**
5201 * skb_try_coalesce - try to merge skb to prior one
5202 * @to: prior buffer
5203 * @from: buffer to add
5204 * @fragstolen: pointer to boolean
5205 * @delta_truesize: how much more was allocated than was requested
5206 */
5207bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
5208		      bool *fragstolen, int *delta_truesize)
5209{
5210	struct skb_shared_info *to_shinfo, *from_shinfo;
5211	int i, delta, len = from->len;
5212
5213	*fragstolen = false;
5214
5215	if (skb_cloned(to))
5216		return false;
5217
5218	if (len <= skb_tailroom(to)) {
5219		if (len)
5220			BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
5221		*delta_truesize = 0;
5222		return true;
5223	}
5224
5225	to_shinfo = skb_shinfo(to);
5226	from_shinfo = skb_shinfo(from);
5227	if (to_shinfo->frag_list || from_shinfo->frag_list)
5228		return false;
5229	if (skb_zcopy(to) || skb_zcopy(from))
5230		return false;
5231
5232	if (skb_headlen(from) != 0) {
5233		struct page *page;
5234		unsigned int offset;
5235
5236		if (to_shinfo->nr_frags +
5237		    from_shinfo->nr_frags >= MAX_SKB_FRAGS)
5238			return false;
5239
5240		if (skb_head_is_locked(from))
5241			return false;
5242
5243		delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
5244
5245		page = virt_to_head_page(from->head);
5246		offset = from->data - (unsigned char *)page_address(page);
5247
5248		skb_fill_page_desc(to, to_shinfo->nr_frags,
5249				   page, offset, skb_headlen(from));
5250		*fragstolen = true;
5251	} else {
5252		if (to_shinfo->nr_frags +
5253		    from_shinfo->nr_frags > MAX_SKB_FRAGS)
5254			return false;
5255
5256		delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
5257	}
5258
5259	WARN_ON_ONCE(delta < len);
5260
5261	memcpy(to_shinfo->frags + to_shinfo->nr_frags,
5262	       from_shinfo->frags,
5263	       from_shinfo->nr_frags * sizeof(skb_frag_t));
5264	to_shinfo->nr_frags += from_shinfo->nr_frags;
5265
5266	if (!skb_cloned(from))
5267		from_shinfo->nr_frags = 0;
5268
5269	/* if the skb is not cloned this does nothing
5270	 * since we set nr_frags to 0.
5271	 */
5272	for (i = 0; i < from_shinfo->nr_frags; i++)
5273		__skb_frag_ref(&from_shinfo->frags[i]);
5274
5275	to->truesize += delta;
5276	to->len += len;
5277	to->data_len += len;
5278
5279	*delta_truesize = delta;
5280	return true;
5281}
5282EXPORT_SYMBOL(skb_try_coalesce);
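
/* Illustrative sketch, not part of the original file: try to merge a newly
 * arrived skb into the previous one on a receive queue, charging the extra
 * truesize to the socket and releasing the source skb, similar in spirit to
 * what TCP's coalescing does.  example_coalesce() is an assumed name.
 */
static bool __maybe_unused example_coalesce(struct sock *sk,
					    struct sk_buff *to,
					    struct sk_buff *from)
{
	bool fragstolen;
	int delta;

	if (!skb_try_coalesce(to, from, &fragstolen, &delta))
		return false;

	/* account the additional memory now attached to @to */
	atomic_add(delta, &sk->sk_rmem_alloc);
	kfree_skb_partial(from, fragstolen);
	return true;
}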
5283
5284/**
5285 * skb_scrub_packet - scrub an skb
5286 *
5287 * @skb: buffer to clean
5288 * @xnet: packet is crossing netns
5289 *
5290 * skb_scrub_packet can be used after encapsulating or decapsulting a packet
5291 * into/from a tunnel. Some information has to be cleared during these
5292 * operations.
5293 * skb_scrub_packet can also be used to clean an skb before injecting it into
5294 * another namespace (@xnet == true). We have to clear all information in the
5295 * skb that could impact namespace isolation.
5296 */
5297void skb_scrub_packet(struct sk_buff *skb, bool xnet)
5298{
5299	skb->pkt_type = PACKET_HOST;
5300	skb->skb_iif = 0;
5301	skb->ignore_df = 0;
5302	skb_dst_drop(skb);
5303	skb_ext_reset(skb);
5304	nf_reset_ct(skb);
5305	nf_reset_trace(skb);
5306
5307#ifdef CONFIG_NET_SWITCHDEV
5308	skb->offload_fwd_mark = 0;
5309	skb->offload_l3_fwd_mark = 0;
5310#endif
5311
5312	if (!xnet)
5313		return;
5314
5315	ipvs_reset(skb);
5316	skb->mark = 0;
5317	skb->tstamp = 0;
5318}
5319EXPORT_SYMBOL_GPL(skb_scrub_packet);
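
/* Illustrative sketch, not part of the original file: scrub a packet when
 * forwarding it into a device that may live in another network namespace.
 * example_cross_netns() is an assumed name used only for illustration.
 */
static void __maybe_unused example_cross_netns(struct sk_buff *skb,
					       struct net_device *out_dev)
{
	bool xnet = !net_eq(dev_net(skb->dev), dev_net(out_dev));

	skb_scrub_packet(skb, xnet);
	skb->dev = out_dev;
}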
5320
5321/**
5322 * skb_gso_transport_seglen - Return length of individual segments of a gso packet
5323 *
5324 * @skb: GSO skb
5325 *
5326 * skb_gso_transport_seglen is used to determine the real size of the
5327 * individual segments, including Layer4 headers (TCP/UDP).
5328 *
5329 * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
5330 */
5331static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
5332{
5333	const struct skb_shared_info *shinfo = skb_shinfo(skb);
5334	unsigned int thlen = 0;
5335
5336	if (skb->encapsulation) {
5337		thlen = skb_inner_transport_header(skb) -
5338			skb_transport_header(skb);
5339
5340		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
5341			thlen += inner_tcp_hdrlen(skb);
5342	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
5343		thlen = tcp_hdrlen(skb);
5344	} else if (unlikely(skb_is_gso_sctp(skb))) {
5345		thlen = sizeof(struct sctphdr);
5346	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
5347		thlen = sizeof(struct udphdr);
5348	}
5349	/* UFO sets gso_size to the size of the fragmentation
5350	 * payload, i.e. the size of the L4 (UDP) header is already
5351	 * accounted for.
5352	 */
5353	return thlen + shinfo->gso_size;
5354}
5355
5356/**
5357 * skb_gso_network_seglen - Return length of individual segments of a gso packet
5358 *
5359 * @skb: GSO skb
5360 *
5361 * skb_gso_network_seglen is used to determine the real size of the
5362 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
5363 *
5364 * The MAC/L2 header is not accounted for.
5365 */
5366static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
5367{
5368	unsigned int hdr_len = skb_transport_header(skb) -
5369			       skb_network_header(skb);
5370
5371	return hdr_len + skb_gso_transport_seglen(skb);
5372}
5373
5374/**
5375 * skb_gso_mac_seglen - Return length of individual segments of a gso packet
5376 *
5377 * @skb: GSO skb
5378 *
5379 * skb_gso_mac_seglen is used to determine the real size of the
5380 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
5381 * headers (TCP/UDP).
5382 */
5383static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
5384{
5385	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
5386
5387	return hdr_len + skb_gso_transport_seglen(skb);
5388}
5389
5390/**
5391 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
5392 *
5393 * There are a couple of instances where we have a GSO skb, and we
5394 * want to determine what size it would be after it is segmented.
5395 *
5396 * We might want to check:
5397 * -    L3+L4+payload size (e.g. IP forwarding)
5398 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
5399 *
5400 * This is a helper to do that correctly considering GSO_BY_FRAGS.
5401 *
5402 * @skb: GSO skb
5403 *
5404 * @seg_len: The segmented length (from skb_gso_*_seglen). In the
5405 *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
5406 *
5407 * @max_len: The maximum permissible length.
5408 *
5409 * Returns true if the segmented length <= max length.
5410 */
5411static inline bool skb_gso_size_check(const struct sk_buff *skb,
5412				      unsigned int seg_len,
5413				      unsigned int max_len) {
5414	const struct skb_shared_info *shinfo = skb_shinfo(skb);
5415	const struct sk_buff *iter;
5416
5417	if (shinfo->gso_size != GSO_BY_FRAGS)
5418		return seg_len <= max_len;
5419
5420	/* Undo this so we can re-use header sizes */
5421	seg_len -= GSO_BY_FRAGS;
5422
5423	skb_walk_frags(skb, iter) {
5424		if (seg_len + skb_headlen(iter) > max_len)
5425			return false;
5426	}
5427
5428	return true;
5429}
5430
5431/**
5432 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
5433 *
5434 * @skb: GSO skb
5435 * @mtu: MTU to validate against
5436 *
5437 * skb_gso_validate_network_len validates if a given skb will fit a
5438 * wanted MTU once split. It considers L3 headers, L4 headers, and the
5439 * payload.
5440 */
5441bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
5442{
5443	return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
5444}
5445EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
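
/* Illustrative sketch, not part of the original file: decide whether a
 * packet can be forwarded over a link with the given MTU without
 * fragmentation, the way an IP forwarding path might.  example_fits_mtu()
 * is an assumed name used only for illustration.
 */
static bool __maybe_unused example_fits_mtu(const struct sk_buff *skb,
					    unsigned int mtu)
{
	/* non-GSO: assumes skb->len is the network-layer length here */
	if (!skb_is_gso(skb))
		return skb->len <= mtu;

	/* check the L3+L4+payload size of each segment after splitting */
	return skb_gso_validate_network_len(skb, mtu);
}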
5446
5447/**
5448 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
5449 *
5450 * @skb: GSO skb
5451 * @len: length to validate against
5452 *
5453 * skb_gso_validate_mac_len validates if a given skb will fit a wanted
5454 * length once split, including L2, L3 and L4 headers and the payload.
5455 */
5456bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
5457{
5458	return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
5459}
5460EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
5461
5462static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
5463{
5464	int mac_len, meta_len;
5465	void *meta;
5466
5467	if (skb_cow(skb, skb_headroom(skb)) < 0) {
5468		kfree_skb(skb);
5469		return NULL;
5470	}
5471
5472	mac_len = skb->data - skb_mac_header(skb);
5473	if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
5474		memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
5475			mac_len - VLAN_HLEN - ETH_TLEN);
5476	}
5477
5478	meta_len = skb_metadata_len(skb);
5479	if (meta_len) {
5480		meta = skb_metadata_end(skb) - meta_len;
5481		memmove(meta + VLAN_HLEN, meta, meta_len);
5482	}
5483
5484	skb->mac_header += VLAN_HLEN;
5485	return skb;
5486}
5487
5488struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
5489{
5490	struct vlan_hdr *vhdr;
5491	u16 vlan_tci;
5492
5493	if (unlikely(skb_vlan_tag_present(skb))) {
5494		/* vlan_tci is already set up so leave this for another time */
5495		return skb;
5496	}
5497
5498	skb = skb_share_check(skb, GFP_ATOMIC);
5499	if (unlikely(!skb))
5500		goto err_free;
5501	/* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
5502	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
5503		goto err_free;
5504
5505	vhdr = (struct vlan_hdr *)skb->data;
5506	vlan_tci = ntohs(vhdr->h_vlan_TCI);
5507	__vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
5508
5509	skb_pull_rcsum(skb, VLAN_HLEN);
5510	vlan_set_encap_proto(skb, vhdr);
5511
5512	skb = skb_reorder_vlan_header(skb);
5513	if (unlikely(!skb))
5514		goto err_free;
5515
5516	skb_reset_network_header(skb);
5517	skb_reset_transport_header(skb);
5518	skb_reset_mac_len(skb);
5519
5520	return skb;
5521
5522err_free:
5523	kfree_skb(skb);
5524	return NULL;
5525}
5526EXPORT_SYMBOL(skb_vlan_untag);
5527
5528int skb_ensure_writable(struct sk_buff *skb, int write_len)
5529{
5530	if (!pskb_may_pull(skb, write_len))
5531		return -ENOMEM;
5532
5533	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
5534		return 0;
5535
5536	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
5537}
5538EXPORT_SYMBOL(skb_ensure_writable);
5539
5540/* Remove the VLAN header from the packet and update the csum accordingly.
5541 * Expects a non-skb_vlan_tag_present skb with a vlan tag payload.
5542 */
5543int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
5544{
5545	struct vlan_hdr *vhdr;
5546	int offset = skb->data - skb_mac_header(skb);
5547	int err;
5548
5549	if (WARN_ONCE(offset,
5550		      "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
5551		      offset)) {
5552		return -EINVAL;
5553	}
5554
5555	err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
5556	if (unlikely(err))
5557		return err;
5558
5559	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
5560
5561	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
5562	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
5563
5564	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
5565	__skb_pull(skb, VLAN_HLEN);
5566
5567	vlan_set_encap_proto(skb, vhdr);
5568	skb->mac_header += VLAN_HLEN;
5569
5570	if (skb_network_offset(skb) < ETH_HLEN)
5571		skb_set_network_header(skb, ETH_HLEN);
5572
5573	skb_reset_mac_len(skb);
5574
5575	return err;
5576}
5577EXPORT_SYMBOL(__skb_vlan_pop);
5578
5579/* Pop a vlan tag either from hwaccel or from payload.
5580 * Expects skb->data at mac header.
5581 */
5582int skb_vlan_pop(struct sk_buff *skb)
5583{
5584	u16 vlan_tci;
5585	__be16 vlan_proto;
5586	int err;
5587
5588	if (likely(skb_vlan_tag_present(skb))) {
5589		__vlan_hwaccel_clear_tag(skb);
5590	} else {
5591		if (unlikely(!eth_type_vlan(skb->protocol)))
5592			return 0;
5593
5594		err = __skb_vlan_pop(skb, &vlan_tci);
5595		if (err)
5596			return err;
5597	}
5598	/* move next vlan tag to hw accel tag */
5599	if (likely(!eth_type_vlan(skb->protocol)))
5600		return 0;
5601
5602	vlan_proto = skb->protocol;
5603	err = __skb_vlan_pop(skb, &vlan_tci);
5604	if (unlikely(err))
5605		return err;
5606
5607	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
5608	return 0;
5609}
5610EXPORT_SYMBOL(skb_vlan_pop);
5611
5612/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
5613 * Expects skb->data at mac header.
5614 */
5615int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
5616{
5617	if (skb_vlan_tag_present(skb)) {
5618		int offset = skb->data - skb_mac_header(skb);
5619		int err;
5620
5621		if (WARN_ONCE(offset,
5622			      "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
5623			      offset)) {
5624			return -EINVAL;
5625		}
5626
5627		err = __vlan_insert_tag(skb, skb->vlan_proto,
5628					skb_vlan_tag_get(skb));
5629		if (err)
5630			return err;
5631
5632		skb->protocol = skb->vlan_proto;
5633		skb->mac_len += VLAN_HLEN;
5634
5635		skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
5636	}
5637	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
5638	return 0;
5639}
5640EXPORT_SYMBOL(skb_vlan_push);
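
/* Illustrative sketch, not part of the original file: rewrite the outer
 * 802.1Q tag of a packet by popping the current tag and pushing a new one.
 * Expects skb->data at the mac header; example_rewrite_vlan() is an assumed
 * name used only for illustration.
 */
static int __maybe_unused example_rewrite_vlan(struct sk_buff *skb, u16 new_vid)
{
	int err;

	err = skb_vlan_pop(skb);
	if (err)
		return err;

	return skb_vlan_push(skb, htons(ETH_P_8021Q), new_vid);
}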
5641
5642/**
5643 * skb_eth_pop() - Drop the Ethernet header at the head of a packet
5644 *
5645 * @skb: Socket buffer to modify
5646 *
5647 * Drop the Ethernet header of @skb.
5648 *
5649 * Expects that skb->data points to the mac header and that no VLAN tags are
5650 * present.
5651 *
5652 * Returns 0 on success, -errno otherwise.
5653 */
5654int skb_eth_pop(struct sk_buff *skb)
5655{
5656	if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
5657	    skb_network_offset(skb) < ETH_HLEN)
5658		return -EPROTO;
5659
5660	skb_pull_rcsum(skb, ETH_HLEN);
5661	skb_reset_mac_header(skb);
5662	skb_reset_mac_len(skb);
5663
5664	return 0;
5665}
5666EXPORT_SYMBOL(skb_eth_pop);
5667
5668/**
5669 * skb_eth_push() - Add a new Ethernet header at the head of a packet
5670 *
5671 * @skb: Socket buffer to modify
5672 * @dst: Destination MAC address of the new header
5673 * @src: Source MAC address of the new header
5674 *
5675 * Prepend @skb with a new Ethernet header.
5676 *
5677 * Expects that skb->data points to the mac header, which must be empty.
5678 *
5679 * Returns 0 on success, -errno otherwise.
5680 */
5681int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
5682		 const unsigned char *src)
5683{
5684	struct ethhdr *eth;
5685	int err;
5686
5687	if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
5688		return -EPROTO;
5689
5690	err = skb_cow_head(skb, sizeof(*eth));
5691	if (err < 0)
5692		return err;
5693
5694	skb_push(skb, sizeof(*eth));
5695	skb_reset_mac_header(skb);
5696	skb_reset_mac_len(skb);
5697
5698	eth = eth_hdr(skb);
5699	ether_addr_copy(eth->h_dest, dst);
5700	ether_addr_copy(eth->h_source, src);
5701	eth->h_proto = skb->protocol;
5702
5703	skb_postpush_rcsum(skb, eth, sizeof(*eth));
5704
5705	return 0;
5706}
5707EXPORT_SYMBOL(skb_eth_push);
5708
5709/* Update the ethertype of hdr and the skb csum value if required. */
5710static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
5711			     __be16 ethertype)
5712{
5713	if (skb->ip_summed == CHECKSUM_COMPLETE) {
5714		__be16 diff[] = { ~hdr->h_proto, ethertype };
5715
5716		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
5717	}
5718
5719	hdr->h_proto = ethertype;
5720}
5721
5722/**
5723 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
5724 *                   the packet
5725 *
5726 * @skb: buffer
5727 * @mpls_lse: MPLS label stack entry to push
5728 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
5729 * @mac_len: length of the MAC header
5730 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
5731 *            ethernet
5732 *
5733 * Expects skb->data at mac header.
5734 *
5735 * Returns 0 on success, -errno otherwise.
5736 */
5737int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
5738		  int mac_len, bool ethernet)
5739{
5740	struct mpls_shim_hdr *lse;
5741	int err;
5742
5743	if (unlikely(!eth_p_mpls(mpls_proto)))
5744		return -EINVAL;
5745
5746	/* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
5747	if (skb->encapsulation)
5748		return -EINVAL;
5749
5750	err = skb_cow_head(skb, MPLS_HLEN);
5751	if (unlikely(err))
5752		return err;
5753
5754	if (!skb->inner_protocol) {
5755		skb_set_inner_network_header(skb, skb_network_offset(skb));
5756		skb_set_inner_protocol(skb, skb->protocol);
5757	}
5758
5759	skb_push(skb, MPLS_HLEN);
5760	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
5761		mac_len);
5762	skb_reset_mac_header(skb);
5763	skb_set_network_header(skb, mac_len);
5764	skb_reset_mac_len(skb);
5765
5766	lse = mpls_hdr(skb);
5767	lse->label_stack_entry = mpls_lse;
5768	skb_postpush_rcsum(skb, lse, MPLS_HLEN);
5769
5770	if (ethernet && mac_len >= ETH_HLEN)
5771		skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
5772	skb->protocol = mpls_proto;
5773
5774	return 0;
5775}
5776EXPORT_SYMBOL_GPL(skb_mpls_push);
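
/* Illustrative sketch, not part of the original file: impose a single MPLS
 * label (TTL 64, bottom-of-stack bit set) on an Ethernet frame, roughly what
 * an OVS-style push_mpls action does.  example_push_label() and the fixed
 * TTL are assumptions for illustration only.
 */
static int __maybe_unused example_push_label(struct sk_buff *skb, u32 label)
{
	u32 lse = ((label << MPLS_LS_LABEL_SHIFT) & MPLS_LS_LABEL_MASK) |
		  (64 << MPLS_LS_TTL_SHIFT) |
		  MPLS_LS_S_MASK;	/* single label: bottom of stack */

	return skb_mpls_push(skb, cpu_to_be32(lse), htons(ETH_P_MPLS_UC),
			     skb->mac_len, true);
}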
5777
5778/**
5779 * skb_mpls_pop() - pop the outermost MPLS header
5780 *
5781 * @skb: buffer
5782 * @next_proto: ethertype of header after popped MPLS header
5783 * @mac_len: length of the MAC header
5784 * @ethernet: flag to indicate if the packet is ethernet
5785 *
5786 * Expects skb->data at mac header.
5787 *
5788 * Returns 0 on success, -errno otherwise.
5789 */
5790int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
5791		 bool ethernet)
5792{
5793	int err;
5794
5795	if (unlikely(!eth_p_mpls(skb->protocol)))
5796		return 0;
5797
5798	err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
5799	if (unlikely(err))
5800		return err;
5801
5802	skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
5803	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
5804		mac_len);
5805
5806	__skb_pull(skb, MPLS_HLEN);
5807	skb_reset_mac_header(skb);
5808	skb_set_network_header(skb, mac_len);
5809
5810	if (ethernet && mac_len >= ETH_HLEN) {
5811		struct ethhdr *hdr;
5812
5813		/* use mpls_hdr() to get ethertype to account for VLANs. */
5814		hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
5815		skb_mod_eth_type(skb, hdr, next_proto);
5816	}
5817	skb->protocol = next_proto;
5818
5819	return 0;
5820}
5821EXPORT_SYMBOL_GPL(skb_mpls_pop);
5822
5823/**
5824 * skb_mpls_update_lse() - modify outermost MPLS header and update csum
5825 *
5826 * @skb: buffer
5827 * @mpls_lse: new MPLS label stack entry to update to
5828 *
5829 * Expects skb->data at mac header.
5830 *
5831 * Returns 0 on success, -errno otherwise.
5832 */
5833int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
5834{
5835	int err;
5836
5837	if (unlikely(!eth_p_mpls(skb->protocol)))
5838		return -EINVAL;
5839
5840	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
5841	if (unlikely(err))
5842		return err;
5843
5844	if (skb->ip_summed == CHECKSUM_COMPLETE) {
5845		__be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
5846
5847		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
5848	}
5849
5850	mpls_hdr(skb)->label_stack_entry = mpls_lse;
5851
5852	return 0;
5853}
5854EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
5855
5856/**
5857 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
5858 *
5859 * @skb: buffer
5860 *
5861 * Expects skb->data at mac header.
5862 *
5863 * Returns 0 on success, -errno otherwise.
5864 */
5865int skb_mpls_dec_ttl(struct sk_buff *skb)
5866{
5867	u32 lse;
5868	u8 ttl;
5869
5870	if (unlikely(!eth_p_mpls(skb->protocol)))
5871		return -EINVAL;
5872
5873	if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
5874		return -ENOMEM;
5875
5876	lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
5877	ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
5878	if (!--ttl)
5879		return -EINVAL;
5880
5881	lse &= ~MPLS_LS_TTL_MASK;
5882	lse |= ttl << MPLS_LS_TTL_SHIFT;
5883
5884	return skb_mpls_update_lse(skb, cpu_to_be32(lse));
5885}
5886EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
5887
5888/**
5889 * alloc_skb_with_frags - allocate skb with page frags
5890 *
5891 * @header_len: size of linear part
5892 * @data_len: needed length in frags
5893 * @max_page_order: max page order desired.
5894 * @errcode: pointer to error code if any
5895 * @gfp_mask: allocation mask
5896 *
5897 * This can be used to allocate a paged skb, given a maximal order for frags.
5898 */
5899struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
5900				     unsigned long data_len,
5901				     int max_page_order,
5902				     int *errcode,
5903				     gfp_t gfp_mask)
5904{
5905	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
5906	unsigned long chunk;
5907	struct sk_buff *skb;
5908	struct page *page;
5909	int i;
5910
5911	*errcode = -EMSGSIZE;
5912	/* Note this test could be relaxed, if we succeeded in allocating
5913	 * high order pages...
5914	 */
5915	if (npages > MAX_SKB_FRAGS)
5916		return NULL;
5917
5918	*errcode = -ENOBUFS;
5919	skb = alloc_skb(header_len, gfp_mask);
5920	if (!skb)
5921		return NULL;
5922
5923	skb->truesize += npages << PAGE_SHIFT;
5924
5925	for (i = 0; npages > 0; i++) {
5926		int order = max_page_order;
5927
5928		while (order) {
5929			if (npages >= 1 << order) {
5930				page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
5931						   __GFP_COMP |
5932						   __GFP_NOWARN,
5933						   order);
5934				if (page)
5935					goto fill_page;
5936				/* Do not retry other high order allocations */
5937				order = 1;
5938				max_page_order = 0;
5939			}
5940			order--;
5941		}
5942		page = alloc_page(gfp_mask);
5943		if (!page)
5944			goto failure;
5945fill_page:
5946		chunk = min_t(unsigned long, data_len,
5947			      PAGE_SIZE << order);
5948		skb_fill_page_desc(skb, i, page, 0, chunk);
5949		data_len -= chunk;
5950		npages -= 1 << order;
5951	}
5952	return skb;
5953
5954failure:
5955	kfree_skb(skb);
5956	return NULL;
5957}
5958EXPORT_SYMBOL(alloc_skb_with_frags);
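/*
 * A minimal usage sketch (added for illustration; the helper and its sizes
 * are hypothetical): allocate a mostly-paged 64 KB skb with a small linear
 * header area, the way callers such as sock_alloc_send_pskb() use this
 * helper. Note that alloc_skb_with_frags() fills the frag descriptors and
 * truesize but leaves skb->len/skb->data_len accounting to the caller.
 */
#if 0	/* example only, not compiled */
static struct sk_buff *example_big_skb(void)
{
	struct sk_buff *skb;
	int err;

	skb = alloc_skb_with_frags(128, 64 * 1024, PAGE_ALLOC_COSTLY_ORDER,
				   &err, GFP_KERNEL);
	if (!skb)
		return NULL;	/* err holds -EMSGSIZE or -ENOBUFS */

	return skb;
}
#endif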
5959
5960/* carve out the first off bytes from skb when off < headlen */
5961static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
5962				    const int headlen, gfp_t gfp_mask)
5963{
5964	int i;
5965	int size = skb_end_offset(skb);
5966	int new_hlen = headlen - off;
5967	u8 *data;
5968
5969	size = SKB_DATA_ALIGN(size);
5970
5971	if (skb_pfmemalloc(skb))
5972		gfp_mask |= __GFP_MEMALLOC;
5973	data = kmalloc_reserve(size +
5974			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
5975			       gfp_mask, NUMA_NO_NODE, NULL);
5976	if (!data)
5977		return -ENOMEM;
5978
5979	size = SKB_WITH_OVERHEAD(ksize(data));
5980
5981	/* Copy real data, and all frags */
5982	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
5983	skb->len -= off;
5984
5985	memcpy((struct skb_shared_info *)(data + size),
5986	       skb_shinfo(skb),
5987	       offsetof(struct skb_shared_info,
5988			frags[skb_shinfo(skb)->nr_frags]));
5989	if (skb_cloned(skb)) {
5990		/* drop the old head gracefully */
5991		if (skb_orphan_frags(skb, gfp_mask)) {
5992			kfree(data);
5993			return -ENOMEM;
5994		}
5995		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
5996			skb_frag_ref(skb, i);
5997		if (skb_has_frag_list(skb))
5998			skb_clone_fraglist(skb);
5999		skb_release_data(skb);
6000	} else {
6001		/* we can reuse the existing refcount - all we did was
6002		 * relocate values
6003		 */
6004		skb_free_head(skb);
6005	}
6006
6007	skb->head = data;
6008	skb->data = data;
6009	skb->head_frag = 0;
6010#ifdef NET_SKBUFF_DATA_USES_OFFSET
6011	skb->end = size;
6012#else
6013	skb->end = skb->head + size;
6014#endif
6015	skb_set_tail_pointer(skb, skb_headlen(skb));
6016	skb_headers_offset_update(skb, 0);
6017	skb->cloned = 0;
6018	skb->hdr_len = 0;
6019	skb->nohdr = 0;
6020	atomic_set(&skb_shinfo(skb)->dataref, 1);
6021
6022	return 0;
6023}
6024
6025static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
6026
6027/* carve out the first eat bytes from skb's frag_list. May recurse into
6028 * pskb_carve()
6029 */
6030static int pskb_carve_frag_list(struct sk_buff *skb,
6031				struct skb_shared_info *shinfo, int eat,
6032				gfp_t gfp_mask)
6033{
6034	struct sk_buff *list = shinfo->frag_list;
6035	struct sk_buff *clone = NULL;
6036	struct sk_buff *insp = NULL;
6037
6038	do {
6039		if (!list) {
6040			pr_err("Not enough bytes to eat. Want %d\n", eat);
6041			return -EFAULT;
6042		}
6043		if (list->len <= eat) {
6044			/* Eaten as whole. */
6045			eat -= list->len;
6046			list = list->next;
6047			insp = list;
6048		} else {
6049			/* Eaten partially. */
6050			if (skb_shared(list)) {
6051				clone = skb_clone(list, gfp_mask);
6052				if (!clone)
6053					return -ENOMEM;
6054				insp = list->next;
6055				list = clone;
6056			} else {
6057				/* This may be pulled without problems. */
6058				insp = list;
6059			}
6060			if (pskb_carve(list, eat, gfp_mask) < 0) {
6061				kfree_skb(clone);
6062				return -ENOMEM;
6063			}
6064			break;
6065		}
6066	} while (eat);
6067
6068	/* Free pulled out fragments. */
6069	while ((list = shinfo->frag_list) != insp) {
6070		shinfo->frag_list = list->next;
6071		consume_skb(list);
6072	}
6073	/* And insert new clone at head. */
6074	if (clone) {
6075		clone->next = list;
6076		shinfo->frag_list = clone;
6077	}
6078	return 0;
6079}
6080
6081/* carve off the first off bytes from skb; the split line (off) is in the
6082 * non-linear part of the skb
6083 */
6084static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
6085				       int pos, gfp_t gfp_mask)
6086{
6087	int i, k = 0;
6088	int size = skb_end_offset(skb);
6089	u8 *data;
6090	const int nfrags = skb_shinfo(skb)->nr_frags;
6091	struct skb_shared_info *shinfo;
6092
6093	size = SKB_DATA_ALIGN(size);
6094
6095	if (skb_pfmemalloc(skb))
6096		gfp_mask |= __GFP_MEMALLOC;
6097	data = kmalloc_reserve(size +
6098			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
6099			       gfp_mask, NUMA_NO_NODE, NULL);
6100	if (!data)
6101		return -ENOMEM;
6102
6103	size = SKB_WITH_OVERHEAD(ksize(data));
6104
6105	memcpy((struct skb_shared_info *)(data + size),
6106	       skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
6107	if (skb_orphan_frags(skb, gfp_mask)) {
6108		kfree(data);
6109		return -ENOMEM;
6110	}
6111	shinfo = (struct skb_shared_info *)(data + size);
6112	for (i = 0; i < nfrags; i++) {
6113		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
6114
6115		if (pos + fsize > off) {
6116			shinfo->frags[k] = skb_shinfo(skb)->frags[i];
6117
6118			if (pos < off) {
6119				/* Split frag.
6120				 * We have two options in this case:
6121				 * 1. Move the whole frag to the second
6122				 *    part, if possible; e.g. this is
6123				 *    mandatory for TUX, where splitting
6124				 *    is expensive.
6125				 * 2. Split accurately; that is what we do here.
6126				 */
6127				skb_frag_off_add(&shinfo->frags[0], off - pos);
6128				skb_frag_size_sub(&shinfo->frags[0], off - pos);
6129			}
6130			skb_frag_ref(skb, i);
6131			k++;
6132		}
6133		pos += fsize;
6134	}
6135	shinfo->nr_frags = k;
6136	if (skb_has_frag_list(skb))
6137		skb_clone_fraglist(skb);
6138
6139	/* split line is in frag list */
6140	if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
6141		/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
6142		if (skb_has_frag_list(skb))
6143			kfree_skb_list(skb_shinfo(skb)->frag_list);
6144		kfree(data);
6145		return -ENOMEM;
6146	}
6147	skb_release_data(skb);
6148
6149	skb->head = data;
6150	skb->head_frag = 0;
6151	skb->data = data;
6152#ifdef NET_SKBUFF_DATA_USES_OFFSET
6153	skb->end = size;
6154#else
6155	skb->end = skb->head + size;
6156#endif
6157	skb_reset_tail_pointer(skb);
6158	skb_headers_offset_update(skb, 0);
6159	skb->cloned   = 0;
6160	skb->hdr_len  = 0;
6161	skb->nohdr    = 0;
6162	skb->len -= off;
6163	skb->data_len = skb->len;
6164	atomic_set(&skb_shinfo(skb)->dataref, 1);
6165	return 0;
6166}
6167
6168/* remove len bytes from the beginning of the skb */
6169static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
6170{
6171	int headlen = skb_headlen(skb);
6172
6173	if (len < headlen)
6174		return pskb_carve_inside_header(skb, len, headlen, gfp);
6175	else
6176		return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
6177}
6178
6179/* Extract to_copy bytes starting at off from skb, and return them in
6180 * a new skb
6181 */
6182struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
6183			     int to_copy, gfp_t gfp)
6184{
6185	struct sk_buff  *clone = skb_clone(skb, gfp);
6186
6187	if (!clone)
6188		return NULL;
6189
6190	if (pskb_carve(clone, off, gfp) < 0 ||
6191	    pskb_trim(clone, to_copy)) {
6192		kfree_skb(clone);
6193		return NULL;
6194	}
6195	return clone;
6196}
6197EXPORT_SYMBOL(pskb_extract);
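/*
 * A minimal usage sketch (added for illustration; the helper and offsets are
 * hypothetical): carve the byte range [off, off + to_copy) of a received
 * packet into a fresh skb while leaving the original untouched
 * (pskb_extract() operates on a clone internally).
 */
#if 0	/* example only, not compiled */
static struct sk_buff *example_slice(struct sk_buff *skb)
{
	return pskb_extract(skb, 100, 1400, GFP_ATOMIC);
}
#endif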
6198
6199/**
6200 * skb_condense - try to get rid of fragments/frag_list if possible
6201 * @skb: buffer
6202 *
6203 * Can be used to save memory before skb is added to a busy queue.
6204 * If packet has bytes in frags and enough tail room in skb->head,
6205 * pull all of them, so that we can free the frags right now and adjust
6206 * truesize.
6207 * Notes:
6208 *	We do not reallocate skb->head, thus this cannot fail.
6209 *	Caller must re-evaluate skb->truesize if needed.
6210 */
6211void skb_condense(struct sk_buff *skb)
6212{
6213	if (skb->data_len) {
6214		if (skb->data_len > skb->end - skb->tail ||
6215		    skb_cloned(skb))
6216			return;
6217
6218		/* Nice, we can free page frag(s) right now */
6219		__pskb_pull_tail(skb, skb->data_len);
6220	}
6221	/* At this point, skb->truesize might be overestimated,
6222	 * because the skb had fragments, and fragments do not
6223	 * report their truesize.
6224	 * When we pulled their content into skb->head, the
6225	 * fragments were freed, but __pskb_pull_tail() could not
6226	 * possibly adjust skb->truesize, not knowing their truesize.
6227	 */
6228	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6229}
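/*
 * A minimal usage sketch (added for illustration, not a quote of any in-tree
 * caller): since skb_condense() may shrink skb->truesize, a caller that has
 * already charged the skb to a socket (e.g. via skb_set_owner_r()) would need
 * to return the difference to its memory accounting. The helper below is
 * hypothetical.
 */
#if 0	/* example only, not compiled */
static void example_condense_charged(struct sock *sk, struct sk_buff *skb)
{
	int delta = skb->truesize;

	skb_condense(skb);
	delta -= skb->truesize;		/* truesize change, typically >= 0 */

	if (delta)
		atomic_sub(delta, &sk->sk_rmem_alloc);
}
#endif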
6230
6231#ifdef CONFIG_SKB_EXTENSIONS
6232static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
6233{
6234	return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
6235}
6236
6237/**
6238 * __skb_ext_alloc - allocate a new skb extensions storage
6239 *
6240 * @flags: See kmalloc().
6241 *
6242 * Returns the newly allocated pointer. The pointer can later be attached to a
6243 * skb via __skb_ext_set().
6244 * Note: the caller must handle the skb_ext as opaque data.
6245 */
6246struct skb_ext *__skb_ext_alloc(gfp_t flags)
6247{
6248	struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
6249
6250	if (new) {
6251		memset(new->offset, 0, sizeof(new->offset));
6252		refcount_set(&new->refcnt, 1);
6253	}
6254
6255	return new;
6256}
6257
6258static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
6259					 unsigned int old_active)
6260{
6261	struct skb_ext *new;
6262
6263	if (refcount_read(&old->refcnt) == 1)
6264		return old;
6265
6266	new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
6267	if (!new)
6268		return NULL;
6269
6270	memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
6271	refcount_set(&new->refcnt, 1);
6272
6273#ifdef CONFIG_XFRM
6274	if (old_active & (1 << SKB_EXT_SEC_PATH)) {
6275		struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
6276		unsigned int i;
6277
6278		for (i = 0; i < sp->len; i++)
6279			xfrm_state_hold(sp->xvec[i]);
6280	}
6281#endif
6282	__skb_ext_put(old);
6283	return new;
6284}
6285
6286/**
6287 * __skb_ext_set - attach the specified extension storage to this skb
6288 * @skb: buffer
6289 * @id: extension id
6290 * @ext: extension storage previously allocated via __skb_ext_alloc()
6291 *
6292 * Existing extensions, if any, are cleared.
6293 *
6294 * Returns the pointer to the extension.
6295 */
6296void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
6297		    struct skb_ext *ext)
6298{
6299	unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
6300
6301	skb_ext_put(skb);
6302	newlen = newoff + skb_ext_type_len[id];
6303	ext->chunks = newlen;
6304	ext->offset[id] = newoff;
6305	skb->extensions = ext;
6306	skb->active_extensions = 1 << id;
6307	return skb_ext_get_ptr(ext, id);
6308}
6309
6310/**
6311 * skb_ext_add - allocate space for given extension, COW if needed
6312 * @skb: buffer
6313 * @id: extension to allocate space for
6314 *
6315 * Allocates enough space for the given extension.
6316 * If the extension is already present, a pointer to that extension
6317 * is returned.
6318 *
6319 * If the skb was cloned, COW applies and the returned memory can be
6320 * modified without changing the extension space of cloned buffers.
6321 *
6322 * Returns pointer to the extension or NULL on allocation failure.
6323 */
6324void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
6325{
6326	struct skb_ext *new, *old = NULL;
6327	unsigned int newlen, newoff;
6328
6329	if (skb->active_extensions) {
6330		old = skb->extensions;
6331
6332		new = skb_ext_maybe_cow(old, skb->active_extensions);
6333		if (!new)
6334			return NULL;
6335
6336		if (__skb_ext_exist(new, id))
6337			goto set_active;
6338
6339		newoff = new->chunks;
6340	} else {
6341		newoff = SKB_EXT_CHUNKSIZEOF(*new);
6342
6343		new = __skb_ext_alloc(GFP_ATOMIC);
6344		if (!new)
6345			return NULL;
6346	}
6347
6348	newlen = newoff + skb_ext_type_len[id];
6349	new->chunks = newlen;
6350	new->offset[id] = newoff;
6351set_active:
6352	skb->extensions = new;
6353	skb->active_extensions |= 1 << id;
6354	return skb_ext_get_ptr(new, id);
6355}
6356EXPORT_SYMBOL(skb_ext_add);
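/*
 * A minimal usage sketch (added for illustration; the helper name is
 * hypothetical) of the usual add-or-reuse pattern around skb_ext_add(),
 * modelled loosely on secpath_set() in the xfrm code: looking the extension
 * up first lets the caller tell a freshly allocated (and therefore
 * uninitialised) extension apart from a reused one.
 */
#if 0	/* example only, not compiled */
#ifdef CONFIG_XFRM
static struct sec_path *example_get_secpath(struct sk_buff *skb)
{
	struct sec_path *existing = skb_ext_find(skb, SKB_EXT_SEC_PATH);
	struct sec_path *sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);

	if (!sp)
		return NULL;
	if (!existing) {
		/* brand new storage: initialise before use */
		sp->len = 0;
		sp->olen = 0;
	}
	return sp;
}
#endif /* CONFIG_XFRM */
#endif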
6357
6358#ifdef CONFIG_XFRM
6359static void skb_ext_put_sp(struct sec_path *sp)
6360{
6361	unsigned int i;
6362
6363	for (i = 0; i < sp->len; i++)
6364		xfrm_state_put(sp->xvec[i]);
6365}
6366#endif
6367
6368void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
6369{
6370	struct skb_ext *ext = skb->extensions;
6371
6372	skb->active_extensions &= ~(1 << id);
6373	if (skb->active_extensions == 0) {
6374		skb->extensions = NULL;
6375		__skb_ext_put(ext);
6376#ifdef CONFIG_XFRM
6377	} else if (id == SKB_EXT_SEC_PATH &&
6378		   refcount_read(&ext->refcnt) == 1) {
6379		struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
6380
6381		skb_ext_put_sp(sp);
6382		sp->len = 0;
6383#endif
6384	}
6385}
6386EXPORT_SYMBOL(__skb_ext_del);
6387
6388void __skb_ext_put(struct skb_ext *ext)
6389{
6390	/* If this is the last reference, nothing can increment
6391	 * it after the check passes.  Avoids one atomic op.
6392	 */
6393	if (refcount_read(&ext->refcnt) == 1)
6394		goto free_now;
6395
6396	if (!refcount_dec_and_test(&ext->refcnt))
6397		return;
6398free_now:
6399#ifdef CONFIG_XFRM
6400	if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
6401		skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
6402#endif
6403
6404	kmem_cache_free(skbuff_ext_cache, ext);
6405}
6406EXPORT_SYMBOL(__skb_ext_put);
6407#endif /* CONFIG_SKB_EXTENSIONS */
6408