// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Routines having to do with the 'struct sk_buff' memory handlers.
 *
 * Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *		Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 * Fixes:
 *	Alan Cox	:	Fixed the worst of the load
 *				balancer bugs.
 *	Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman :	Timestamp fixes.
 *	Alan Cox	:	Changed buffer format.
 *	Alan Cox	:	destructor hook for AF_UNIX etc.
 *	Linus Torvalds	:	Better skb_clone.
 *	Alan Cox	:	Added skb_copy.
 *	Alan Cox	:	Added all the changed routines Linus
 *				only put in the headers
 *	Ray VanTassle	:	Fixed --skb->lock in free
 *	Alan Cox	:	skb_copy copy arp field
 *	Andi Kleen	:	slabified it.
 *	Robert Olsson	:	Removed skb_head_pool
 *
 * NOTE:
 *	The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>

#include "datagram.h"

struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
	__kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)

static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
			       unsigned long ip, bool *pfmemalloc)
{
	void *obj;
	bool ret_pfmemalloc = false;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;
	bool pfmemalloc;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_head_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
	if (!data)
		goto nodata;
	/* kmalloc(size) might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	size = SKB_WITH_OVERHEAD(ksize(data));
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* Account for allocated memory : skb + skb->head */
	skb->truesize = SKB_TRUESIZE(size);
	skb->pfmemalloc = pfmemalloc;
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);

		fclones->skb2.fclone = SKB_FCLONE_CLONE;
	}

	skb_set_kcov_handle(skb, kcov_common_handle());

out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
EXPORT_SYMBOL(__alloc_skb);
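
/*
 * Usage sketch (editor's illustrative example, not part of the original
 * file): a typical caller reserves headroom up front and then fills the
 * linear area. The 128-byte payload size below is arbitrary.
 *
 *	struct sk_buff *skb = alloc_skb(NET_SKB_PAD + 128, GFP_ATOMIC);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, NET_SKB_PAD);		// headroom for later skb_push()
 *	memset(skb_put(skb, 128), 0, 128);	// append 128 zeroed bytes
 */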

/* Caller must provide SKB that is memset cleared */
static struct sk_buff *__build_skb_around(struct sk_buff *skb,
					  void *data, unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	unsigned int size = frag_size ? : ksize(data);

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());

	return skb;
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc()
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));

	return __build_skb_around(skb, data, frag_size);
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc()
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	skb = __build_skb_around(skb, data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);
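
/*
 * Usage sketch (editor's illustrative example, not part of the original
 * file): a driver RX path that sized its receive buffer with room for
 * NET_SKB_PAD and skb_shared_info can hand the buffer to build_skb() once
 * DMA completes. The rx_buf/buf_len/pkt_len names are hypothetical.
 *
 *	unsigned int truesize = SKB_DATA_ALIGN(NET_SKB_PAD + buf_len) +
 *				SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	struct sk_buff *skb = build_skb(rx_buf, truesize);
 *
 *	if (likely(skb)) {
 *		skb_reserve(skb, NET_SKB_PAD);
 *		__skb_put(skb, pkt_len);
 *	}
 */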

#define NAPI_SKB_CACHE_SIZE	64

struct napi_alloc_cache {
	struct page_frag_cache page;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}

void *napi_alloc_frag(unsigned int fragsz)
{
	fragsz = SKB_DATA_ALIGN(fragsz);

	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
}
EXPORT_SYMBOL(napi_alloc_frag);

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 */
void *netdev_alloc_frag(unsigned int fragsz)
{
	struct page_frag_cache *nc;
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_irq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
	} else {
		local_bh_disable();
		data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(netdev_alloc_frag);

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_irq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	void *data;

	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);
	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	data = page_frag_alloc(&nc->page, len, gfp_mask);
	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (nc->page.pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);
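
/*
 * Usage sketch (editor's illustrative example, not part of the original
 * file): a NAPI poll handler building a header-split skb, attaching the
 * payload page as a fragment. All names are hypothetical.
 *
 *	struct sk_buff *skb = napi_alloc_skb(napi, hdr_len);
 *
 *	if (!skb)
 *		return;
 *	skb_put_data(skb, hdr, hdr_len);
 *	skb_add_rx_frag(skb, 0, payload_page, payload_off,
 *			payload_len, PAGE_SIZE);
 *	napi_gro_receive(napi, skb);
 */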

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag)
		skb_free_frag(head);
	else
		kfree(head);
}

static void skb_release_data(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		return;

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i]);

	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);

	skb_zcopy_clear(skb, true);
	skb_free_head(skb);
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		WARN_ON(in_irq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 * kfree_skb - free an sk_buff
 * @skb: buffer to free
 *
 * Drop a reference to the buffer and free it if the usage count has
 * hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_kfree_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);

void kfree_skb_list(struct sk_buff *segs)
{
	while (segs) {
		struct sk_buff *next = segs->next;

		kfree_skb(segs);
		segs = next;
	}
}
EXPORT_SYMBOL(kfree_skb_list);
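
/*
 * Usage sketch (editor's illustrative note, not part of the original file):
 * kfree_skb() is meant for drops (it fires the kfree_skb tracepoint),
 * while consume_skb(), defined further below, is for packets that were
 * processed successfully.
 *
 *	if (unlikely(err))
 *		kfree_skb(skb);		// dropped: shows up in drop tracing
 *	else
 *		consume_skb(skb);	// consumed normally, not a drop
 */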

/* Dump skb information and contents.
 *
 * Must only be called from net_ratelimit()-ed paths.
 *
 * Dumps whole packets if full_pkt, only headers otherwise.
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
	struct skb_shared_info *sh = skb_shinfo(skb);
	struct net_device *dev = skb->dev;
	struct sock *sk = skb->sk;
	struct sk_buff *list_skb;
	bool has_mac, has_trans;
	int headroom, tailroom;
	int i, len, seg_len;

	if (full_pkt)
		len = skb->len;
	else
		len = min_t(int, skb->len, MAX_HEADER + 128);

	headroom = skb_headroom(skb);
	tailroom = skb_tailroom(skb);

	has_mac = skb_mac_header_was_set(skb);
	has_trans = skb_transport_header_was_set(skb);

	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
	       "mac=(%d,%d) net=(%d,%d) trans=%d\n"
	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
	       level, skb->len, headroom, skb_headlen(skb), tailroom,
	       has_mac ? skb->mac_header : -1,
	       has_mac ? skb_mac_header_len(skb) : -1,
	       skb->network_header,
	       has_trans ? skb_network_header_len(skb) : -1,
	       has_trans ? skb->transport_header : -1,
	       sh->tx_flags, sh->nr_frags,
	       sh->gso_size, sh->gso_type, sh->gso_segs,
	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
	       skb->csum_valid, skb->csum_level,
	       skb->hash, skb->sw_hash, skb->l4_hash,
	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);

	if (dev)
		printk("%sdev name=%s feat=%pNF\n",
		       level, dev->name, &dev->features);
	if (sk)
		printk("%ssk family=%hu type=%u proto=%u\n",
		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

	if (full_pkt && headroom)
		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->head, headroom, false);

	seg_len = min_t(int, skb_headlen(skb), len);
	if (seg_len)
		print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->data, seg_len, false);
	len -= seg_len;

	if (full_pkt && tailroom)
		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb_tail_pointer(skb), tailroom, false);

	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		skb_frag_foreach_page(frag, skb_frag_off(frag),
				      skb_frag_size(frag), p, p_off, p_len,
				      copied) {
			seg_len = min_t(int, p_len, len);
			vaddr = kmap_atomic(p);
			print_hex_dump(level, "skb frag: ",
				       DUMP_PREFIX_OFFSET,
				       16, 1, vaddr + p_off, seg_len, false);
			kunmap_atomic(vaddr);
			len -= seg_len;
			if (!len)
				break;
		}
	}

	if (full_pkt && skb_has_frag_list(skb)) {
		printk("skb fraglist:\n");
		skb_walk_frags(skb, list_skb)
			skb_dump(level, list_skb, true);
	}
}
EXPORT_SYMBOL(skb_dump);
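
/*
 * Usage sketch (editor's illustrative example, not part of the original
 * file): callers are expected to rate limit skb_dump() themselves,
 * typically like this:
 *
 *	if (net_ratelimit())
 *		skb_dump(KERN_ERR, skb, false);
 */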

/**
 * skb_tx_error - report an sk_buff xmit error
 * @skb: buffer that triggered an error
 *
 * Report xmit error if a device callback is tracking this skb.
 * skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
	skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 * consume_skb - free an skbuff
 * @skb: buffer to free
 *
 * Drop a ref to the buffer and free it if the usage count has hit zero
 * Functions identically to kfree_skb, but kfree_skb assumes that the frame
 * is being dropped after a failure and notes that
 */
void consume_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_consume_skb(skb);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 * __consume_stateless_skb - free an skbuff, assuming it is stateless
 * @skb: buffer to free
 *
 * Like consume_skb(), but this variant assumes that this is the last
 * skb reference and all the head states have been already dropped
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
	trace_consume_skb(skb);
	skb_release_data(skb);
	kfree_skbmem(skb);
}

void __kfree_skb_flush(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	/* flush skb_cache if containing objects */
	if (nc->skb_count) {
		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
				     nc->skb_cache);
		nc->skb_count = 0;
	}
}

static inline void _kfree_skb_defer(struct sk_buff *skb)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	/* drop skb->head and call any destructors for packet */
	skb_release_all(skb);

	/* record skb to CPU local list */
	nc->skb_cache[nc->skb_count++] = skb;

#ifdef CONFIG_SLUB
	/* SLUB writes into objects when freeing */
	prefetchw(skb);
#endif

	/* flush skb_cache if it is filled */
	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
				     nc->skb_cache);
		nc->skb_count = 0;
	}
}
void __kfree_skb_defer(struct sk_buff *skb)
{
	_kfree_skb_defer(skb);
}

void napi_consume_skb(struct sk_buff *skb, int budget)
{
	/* Zero budget indicates a non-NAPI context called us, like netpoll */
	if (unlikely(!budget)) {
		dev_consume_skb_any(skb);
		return;
	}

	if (!skb_unref(skb))
		return;

	/* if reaching here SKB is ready to free */
	trace_consume_skb(skb);

	/* if SKB is a clone, don't handle this case */
	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
		__kfree_skb(skb);
		return;
	}

	_kfree_skb_defer(skb);
}
EXPORT_SYMBOL(napi_consume_skb);

/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
		     offsetof(struct sk_buff, headers_start)); \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
		     offsetof(struct sk_buff, headers_end)); \

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp = old->tstamp;
	/* We do not copy old->sk */
	new->dev = old->dev;
	memcpy(new->cb, old->cb, sizeof(old->cb));
	skb_dst_copy(new, old);
	__skb_ext_copy(new, old);
	__nf_copy(new, old, false);

	/* Note : this field could be in headers_start/headers_end section
	 * It is not yet because we do not want to have a 16 bit hole
	 */
	new->queue_mapping = old->queue_mapping;

	memcpy(&new->headers_start, &old->headers_start,
	       offsetof(struct sk_buff, headers_end) -
	       offsetof(struct sk_buff, headers_start));
	CHECK_SKB_FIELD(protocol);
	CHECK_SKB_FIELD(csum);
	CHECK_SKB_FIELD(hash);
	CHECK_SKB_FIELD(priority);
	CHECK_SKB_FIELD(skb_iif);
	CHECK_SKB_FIELD(vlan_proto);
	CHECK_SKB_FIELD(vlan_tci);
	CHECK_SKB_FIELD(transport_header);
	CHECK_SKB_FIELD(network_header);
	CHECK_SKB_FIELD(mac_header);
	CHECK_SKB_FIELD(inner_protocol);
	CHECK_SKB_FIELD(inner_transport_header);
	CHECK_SKB_FIELD(inner_network_header);
	CHECK_SKB_FIELD(inner_mac_header);
	CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
	CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	CHECK_SKB_FIELD(napi_id);
#endif
#ifdef CONFIG_XPS
	CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
	CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * You should not add any new code to this function. Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->peeked = 0;
	C(pfmemalloc);
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	refcount_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
	struct sk_buff *n;

	n = alloc_skb(0, GFP_ATOMIC);
	if (!n)
		return NULL;

	n->len = first->len;
	n->data_len = first->len;
	n->truesize = first->truesize;

	skb_shinfo(n)->frag_list = first;

	__copy_skb_header(n, first);
	n->destructor = NULL;

	return n;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 * skb_morph - morph one skb into another
 * @dst: the skb to receive the contents
 * @src: the skb to supply the contents
 *
 * This is identical to skb_clone except that the target skb is
 * supplied by the user.
 *
 * The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
	unsigned long max_pg, num_pg, new_pg, old_pg;
	struct user_struct *user;

	if (capable(CAP_IPC_LOCK) || !size)
		return 0;

	num_pg = (size >> PAGE_SHIFT) + 2;	/* worst case */
	max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	user = mmp->user ? : current_user();

	do {
		old_pg = atomic_long_read(&user->locked_vm);
		new_pg = old_pg + num_pg;
		if (new_pg > max_pg)
			return -ENOBUFS;
	} while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
		 old_pg);

	if (!mmp->user) {
		mmp->user = get_uid(user);
		mmp->num_pg = num_pg;
	} else {
		mmp->num_pg += num_pg;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
	if (mmp->user) {
		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
		free_uid(mmp->user);
	}
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
{
	struct ubuf_info *uarg;
	struct sk_buff *skb;

	WARN_ON_ONCE(!in_task());

	skb = sock_omalloc(sk, 0, GFP_KERNEL);
	if (!skb)
		return NULL;

	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
	uarg = (void *)skb->cb;
	uarg->mmp.user = NULL;

	if (mm_account_pinned_pages(&uarg->mmp, size)) {
		kfree_skb(skb);
		return NULL;
	}

	uarg->callback = sock_zerocopy_callback;
	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
	uarg->len = 1;
	uarg->bytelen = size;
	uarg->zerocopy = 1;
	refcount_set(&uarg->refcnt, 1);
	sock_hold(sk);

	return uarg;
}
EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);

static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
{
	return container_of((void *)uarg, struct sk_buff, cb);
}

struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
					struct ubuf_info *uarg)
{
	if (uarg) {
		const u32 byte_limit = 1 << 19;	/* limit to a few TSO */
		u32 bytelen, next;

		/* realloc only when socket is locked (TCP, UDP cork),
		 * so uarg->len and sk_zckey access is serialized
		 */
		if (!sock_owned_by_user(sk)) {
			WARN_ON_ONCE(1);
			return NULL;
		}

		bytelen = uarg->bytelen + size;
		if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
			/* TCP can create new skb to attach new uarg */
			if (sk->sk_type == SOCK_STREAM)
				goto new_alloc;
			return NULL;
		}

		next = (u32)atomic_read(&sk->sk_zckey);
		if ((u32)(uarg->id + uarg->len) == next) {
			if (mm_account_pinned_pages(&uarg->mmp, size))
				return NULL;
			uarg->len++;
			uarg->bytelen = bytelen;
			atomic_set(&sk->sk_zckey, ++next);

			/* no extra ref when appending to datagram (MSG_MORE) */
			if (sk->sk_type == SOCK_STREAM)
				sock_zerocopy_get(uarg);

			return uarg;
		}
	}

new_alloc:
	return sock_zerocopy_alloc(sk, size);
}
EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);

static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
	u32 old_lo, old_hi;
	u64 sum_len;

	old_lo = serr->ee.ee_info;
	old_hi = serr->ee.ee_data;
	sum_len = old_hi - old_lo + 1ULL + len;

	if (sum_len >= (1ULL << 32))
		return false;

	if (lo != old_hi + 1)
		return false;

	serr->ee.ee_data += len;
	return true;
}

void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
{
	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
	struct sock_exterr_skb *serr;
	struct sock *sk = skb->sk;
	struct sk_buff_head *q;
	unsigned long flags;
	u32 lo, hi;
	u16 len;

	mm_unaccount_pinned_pages(&uarg->mmp);

	/* if !len, there was only 1 call, and it was aborted
	 * so do not queue a completion notification
	 */
	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
		goto release;

	len = uarg->len;
	lo = uarg->id;
	hi = uarg->id + len - 1;

	serr = SKB_EXT_ERR(skb);
	memset(serr, 0, sizeof(*serr));
	serr->ee.ee_errno = 0;
	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
	serr->ee.ee_data = hi;
	serr->ee.ee_info = lo;
	if (!success)
		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

	q = &sk->sk_error_queue;
	spin_lock_irqsave(&q->lock, flags);
	tail = skb_peek_tail(q);
	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
	    !skb_zerocopy_notify_extend(tail, lo, len)) {
		__skb_queue_tail(q, skb);
		skb = NULL;
	}
	spin_unlock_irqrestore(&q->lock, flags);

	sk->sk_error_report(sk);

release:
	consume_skb(skb);
	sock_put(sk);
}
EXPORT_SYMBOL_GPL(sock_zerocopy_callback);

void sock_zerocopy_put(struct ubuf_info *uarg)
{
	if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
		if (uarg->callback)
			uarg->callback(uarg, uarg->zerocopy);
		else
			consume_skb(skb_from_uarg(uarg));
	}
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put);

void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
	if (uarg) {
		struct sock *sk = skb_from_uarg(uarg)->sk;

		atomic_dec(&sk->sk_zckey);
		uarg->len--;

		if (have_uref)
			sock_zerocopy_put(uarg);
	}
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);

int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
{
	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg)
{
	struct ubuf_info *orig_uarg = skb_zcopy(skb);
	struct iov_iter orig_iter = msg->msg_iter;
	int err, orig_len = skb->len;

	/* An skb can only point to one uarg. This edge case happens when
	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
	 */
	if (orig_uarg && uarg != orig_uarg)
		return -EEXIST;

	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
		struct sock *save_sk = skb->sk;

		/* Streams do not free skb on error. Reset to prev state. */
		msg->msg_iter = orig_iter;
		skb->sk = sk;
		___pskb_trim(skb, orig_len);
		skb->sk = save_sk;
		return err;
	}

	skb_zcopy_set(skb, uarg, NULL);
	return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
			      gfp_t gfp_mask)
{
	if (skb_zcopy(orig)) {
		if (skb_zcopy(nskb)) {
			/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
			if (!gfp_mask) {
				WARN_ON_ONCE(1);
				return -ENOMEM;
			}
			if (skb_uarg(nskb) == skb_uarg(orig))
				return 0;
			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
				return -EIO;
		}
		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
	}
	return 0;
}

/**
 * skb_copy_ubufs - copy userspace skb frags buffers to kernel
 * @skb: the skb to modify
 * @gfp_mask: allocation priority
 *
 * This must be called on SKBTX_DEV_ZEROCOPY skb.
 * It will copy all frags into kernel and drop the reference
 * to userspace pages.
 *
 * If this function is called from an interrupt gfp_mask() must be
 * %GFP_ATOMIC.
 *
 * Returns 0 on success or a negative error code on failure
 * to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	int i, new_frags;
	u32 d_off;

	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
		return -EINVAL;

	if (!num_frags)
		goto release;

	new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < new_frags; i++) {
		page = alloc_page(gfp_mask);
		if (!page) {
			while (head) {
				struct page *next = (struct page *)page_private(head);
				put_page(head);
				head = next;
			}
			return -ENOMEM;
		}
		set_page_private(page, (unsigned long)head);
		head = page;
	}

	page = head;
	d_off = 0;
	for (i = 0; i < num_frags; i++) {
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
				      p, p_off, p_len, copied) {
			u32 copy, done = 0;
			vaddr = kmap_atomic(p);

			while (done < p_len) {
				if (d_off == PAGE_SIZE) {
					d_off = 0;
					page = (struct page *)page_private(page);
				}
				copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
				memcpy(page_address(page) + d_off,
				       vaddr + p_off + done, copy);
				done += copy;
				d_off += copy;
			}
			kunmap_atomic(vaddr);
		}
	}

	/* skb frags release userspace buffers */
	for (i = 0; i < num_frags; i++)
		skb_frag_unref(skb, i);

	/* skb frags point to kernel buffers */
	for (i = 0; i < new_frags - 1; i++) {
		__skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
		head = (struct page *)page_private(head);
	}
	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
	skb_shinfo(skb)->nr_frags = new_frags;

release:
	skb_zcopy_clear(skb, false);
	return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 * skb_clone - duplicate an sk_buff
 * @skb: buffer to clone
 * @gfp_mask: allocation priority
 *
 * Duplicate an &sk_buff. The new one is not owned by a socket. Both
 * copies share the same packet data but not structure. The new
 * buffer has a reference count of 1. If the allocation fails the
 * function returns %NULL otherwise the new buffer is returned.
 *
 * If this function is called from an interrupt gfp_mask() must be
 * %GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff_fclones *fclones = container_of(skb,
						       struct sk_buff_fclones,
						       skb1);
	struct sk_buff *n;

	if (skb_orphan_frags(skb, gfp_mask))
		return NULL;

	if (skb->fclone == SKB_FCLONE_ORIG &&
	    refcount_read(&fclones->fclone_ref) == 1) {
		n = &fclones->skb2;
		refcount_set(&fclones->fclone_ref, 2);
	} else {
		if (skb_pfmemalloc(skb))
			gfp_mask |= __GFP_MEMALLOC;

		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;

		n->fclone = SKB_FCLONE_UNAVAILABLE;
	}

	return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);
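
/*
 * Usage sketch (editor's illustrative example, not part of the original
 * file): keeping a private reference to packet data while handing the
 * original skb to another layer. Only the struct sk_buff is duplicated;
 * the data itself is shared. queue_for_retransmit() is hypothetical.
 *
 *	struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
 *
 *	if (clone)
 *		queue_for_retransmit(clone);
 *	dev_queue_xmit(skb);
 */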

void skb_headers_offset_update(struct sk_buff *skb, int off)
{
	/* Only adjust this if it actually is csum_start rather than csum */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		skb->csum_start += off;
	/* {transport,network,mac}_header and tail are relative to skb->head */
	skb->transport_header += off;
	skb->network_header += off;
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
	skb->inner_transport_header += off;
	skb->inner_network_header += off;
	skb->inner_mac_header += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);

void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
	__copy_skb_header(new, old);

	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
EXPORT_SYMBOL(skb_copy_header);

static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
	if (skb_pfmemalloc(skb))
		return SKB_ALLOC_RX;
	return 0;
}

/**
 * skb_copy - create private copy of an sk_buff
 * @skb: buffer to copy
 * @gfp_mask: allocation priority
 *
 * Make a copy of both an &sk_buff and its data. This is used when the
 * caller wishes to modify the data and needs a private copy of the
 * data to alter. Returns %NULL on failure or the pointer to the buffer
 * on success. The returned buffer has a reference count of 1.
 *
 * As by-product this function converts non-linear &sk_buff to linear
 * one, so that &sk_buff becomes completely private and caller is allowed
 * to modify all the data of returned buffer. This means that this
 * function is not recommended for use in circumstances when only
 * header is going to be modified. Use pskb_copy() instead.
 */

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff *n;
	unsigned int size;
	int headerlen;

	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
		return NULL;

	headerlen = skb_headroom(skb);
	size = skb_end_offset(skb) + skb->data_len;
	n = __alloc_skb(size, gfp_mask,
			skb_alloc_rx_flag(skb), NUMA_NO_NODE);
	if (!n)
		return NULL;

	/* Set the data pointer */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));

	skb_copy_header(n, skb);
	return n;
}
EXPORT_SYMBOL(skb_copy);

/**
 * __pskb_copy_fclone - create copy of an sk_buff with private head.
 * @skb: buffer to copy
 * @headroom: headroom of new skb
 * @gfp_mask: allocation priority
 * @fclone: if true allocate the copy of the skb from the fclone
 *	cache instead of the head cache; it is recommended to set this
 *	to true for the cases where the copy will likely be cloned
 *
 * Make a copy of both an &sk_buff and part of its data, located
 * in header. Fragmented data remain shared. This is used when
 * the caller wishes to modify only header of &sk_buff and needs
 * private copy of the header to alter. Returns %NULL on failure
 * or the pointer to the buffer on success.
 * The returned buffer has a reference count of 1.
 */

struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
				   gfp_t gfp_mask, bool fclone)
{
	unsigned int size = skb_headlen(skb) + headroom;
	int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);

	if (!n)
		goto out;

	/* Set the data pointer */
	skb_reserve(n, headroom);
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
	skb_copy_from_linear_data(skb, n->data, n->len);

	n->truesize += skb->data_len;
	n->data_len = skb->data_len;
	n->len = skb->len;

	if (skb_shinfo(skb)->nr_frags) {
		int i;

		if (skb_orphan_frags(skb, gfp_mask) ||
		    skb_zerocopy_clone(n, skb, gfp_mask)) {
			kfree_skb(n);
			n = NULL;
			goto out;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
			skb_frag_ref(skb, i);
		}
		skb_shinfo(n)->nr_frags = i;
	}

	if (skb_has_frag_list(skb)) {
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
		skb_clone_fraglist(n);
	}

	skb_copy_header(n, skb);
out:
	return n;
}
EXPORT_SYMBOL(__pskb_copy_fclone);

/**
 * pskb_expand_head - reallocate header of &sk_buff
 * @skb: buffer to reallocate
 * @nhead: room to add at head
 * @ntail: room to add at tail
 * @gfp_mask: allocation priority
 *
 * Expands (or creates identical copy, if @nhead and @ntail are zero)
 * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
 * reference count of 1. Returns zero in the case of success or error,
 * if expansion failed. In the last case, &sk_buff is not changed.
 *
 * All the pointers pointing into skb header may change and must be
 * reloaded after call to this function.
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		     gfp_t gfp_mask)
{
	int i, osize = skb_end_offset(skb);
	int size = osize + nhead + ntail;
	long off;
	u8 *data;

	BUG_ON(nhead < 0);

	BUG_ON(skb_shared(skb));

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		goto nodata;
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void.
	 */
	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

	/*
	 * if shinfo is shared we must drop the old head gracefully, but if it
	 * is not we can just drop the old head and let the existing refcount
	 * be since all we did is relocate the values
	 */
	if (skb_cloned(skb)) {
		if (skb_orphan_frags(skb, gfp_mask))
			goto nofrags;
		if (skb_zcopy(skb))
			refcount_inc(&skb_uarg(skb)->refcnt);
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);

		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb);
	} else {
		skb_free_head(skb);
	}
	off = (data + nhead) - skb->head;

	skb->head = data;
	skb->head_frag = 0;
	skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end = size;
	off = nhead;
#else
	skb->end = skb->head + size;
#endif
	skb->tail += off;
	skb_headers_offset_update(skb, nhead);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);

	skb_metadata_clear(skb);

	/* It is not generally safe to change skb->truesize.
	 * For the moment, we really care of rx path, or
	 * when skb is orphaned (not attached to a socket).
	 */
	if (!skb->sk || skb->destructor == sock_edemux)
		skb->truesize += size - osize;

	return 0;

nofrags:
	kfree(data);
nodata:
	return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make private copy of skb with writable head and some headroom */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	struct sk_buff *skb2;
	int delta = headroom - skb_headroom(skb);

	if (delta <= 0)
		skb2 = pskb_copy(skb, GFP_ATOMIC);
	else {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
					     GFP_ATOMIC)) {
			kfree_skb(skb2);
			skb2 = NULL;
		}
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);
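
/*
 * Usage sketch (editor's illustrative example, not part of the original
 * file): making room for an extra encapsulation header when the existing
 * headroom may be too small; this is roughly what the skb_cow_head()
 * helper wraps. All header pointers must be re-read afterwards, as the
 * kernel-doc above notes. encap_len/hdr/drop are hypothetical.
 *
 *	if (skb_headroom(skb) < encap_len &&
 *	    pskb_expand_head(skb, SKB_DATA_ALIGN(encap_len), 0, GFP_ATOMIC))
 *		goto drop;
 *	hdr = skb_push(skb, encap_len);		// now guaranteed to fit
 */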

/**
 * skb_copy_expand - copy and expand sk_buff
 * @skb: buffer to copy
 * @newheadroom: new free bytes at head
 * @newtailroom: new free bytes at tail
 * @gfp_mask: allocation priority
 *
 * Make a copy of both an &sk_buff and its data and while doing so
 * allocate additional space.
 *
 * This is used when the caller wishes to modify the data and needs a
 * private copy of the data to alter as well as more space for new fields.
 * Returns %NULL on failure or the pointer to the buffer
 * on success. The returned buffer has a reference count of 1.
 *
 * You must pass %GFP_ATOMIC as the allocation priority if this function
 * is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
				int newheadroom, int newtailroom,
				gfp_t gfp_mask)
{
	/*
	 *	Allocate the copy buffer
	 */
	int head_copy_len, head_copy_off;
	struct sk_buff *n;
	int oldheadroom;

	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
		return NULL;

	oldheadroom = skb_headroom(skb);
	n = __alloc_skb(newheadroom + skb->len + newtailroom,
			gfp_mask, skb_alloc_rx_flag(skb),
			NUMA_NO_NODE);
	if (!n)
		return NULL;

	skb_reserve(n, newheadroom);

	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	head_copy_len = oldheadroom;
	head_copy_off = 0;
	if (newheadroom <= head_copy_len)
		head_copy_len = newheadroom;
	else
		head_copy_off = newheadroom - head_copy_len;

	/* Copy the linear header and data. */
	BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
			     skb->len + head_copy_len));

	skb_copy_header(n, skb);

	skb_headers_offset_update(n, newheadroom - oldheadroom);

	return n;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 * __skb_pad - zero pad the tail of an skb
 * @skb: buffer to pad
 * @pad: space to pad
 * @free_on_error: free buffer on error
 *
 * Ensure that a buffer is followed by a padding area that is zero
 * filled. Used by network drivers which may DMA or transfer data
 * beyond the buffer end onto the wire.
 *
 * May return error in out of memory cases. The skb is freed on error
 * if @free_on_error is true.
 */

int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
	int err;
	int ntail;

	/* If the skbuff is non-linear, tailroom is always zero. */
	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
		memset(skb->data + skb->len, 0, pad);
		return 0;
	}

	ntail = skb->data_len + pad - (skb->end - skb->tail);
	if (likely(skb_cloned(skb) || ntail > 0)) {
		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
		if (unlikely(err))
			goto free_skb;
	}

	/* FIXME: The use of this function with non-linear skb's really needs
	 * to be audited.
	 */
	err = skb_linearize(skb);
	if (unlikely(err))
		goto free_skb;

	memset(skb->data + skb->len, 0, pad);
	return 0;

free_skb:
	if (free_on_error)
		kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL(__skb_pad);

/**
 * pskb_put - add data to the tail of a potentially fragmented buffer
 * @skb: start of the buffer to use
 * @tail: tail fragment of the buffer to use
 * @len: amount of data to add
 *
 * This function extends the used data area of the potentially
 * fragmented buffer. @tail must be the last fragment of @skb -- or
 * @skb itself. If this would exceed the total buffer size the kernel
 * will panic. A pointer to the first byte of the extra data is
 * returned.
 */

void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
	if (tail != skb) {
		skb->data_len += len;
		skb->len += len;
	}
	return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);

/**
 * skb_put - add data to a buffer
 * @skb: buffer to use
 * @len: amount of data to add
 *
 * This function extends the used data area of the buffer. If this would
 * exceed the total buffer size the kernel will panic. A pointer to the
 * first byte of the extra data is returned.
 */
void *skb_put(struct sk_buff *skb, unsigned int len)
{
	void *tmp = skb_tail_pointer(skb);
	SKB_LINEAR_ASSERT(skb);
	skb->tail += len;
	skb->len += len;
	if (unlikely(skb->tail > skb->end))
		skb_over_panic(skb, len, __builtin_return_address(0));
	return tmp;
}
EXPORT_SYMBOL(skb_put);

/**
 * skb_push - add data to the start of a buffer
 * @skb: buffer to use
 * @len: amount of data to add
 *
 * This function extends the used data area of the buffer at the buffer
 * start. If this would exceed the total buffer headroom the kernel will
 * panic. A pointer to the first byte of the extra data is returned.
 */
void *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->data -= len;
	skb->len += len;
	if (unlikely(skb->data < skb->head))
		skb_under_panic(skb, len, __builtin_return_address(0));
	return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 * skb_pull - remove data from the start of a buffer
 * @skb: buffer to use
 * @len: amount of data to remove
 *
 * This function removes data from the start of a buffer, returning
 * the memory to the headroom. A pointer to the next data in the buffer
 * is returned. Once the data has been pulled future pushes will overwrite
 * the old data.
 */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
	return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);

/**
 * skb_trim - remove end from a buffer
 * @skb: buffer to alter
 * @len: new length
 *
 * Cut the length of a buffer down by removing data from the tail. If
 * the buffer is already under the length specified it is not modified.
 * The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	if (skb->len > len)
		__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);
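
/*
 * Usage sketch (editor's illustrative example, not part of the original
 * file): the canonical way to build a frame is to reserve the maximum
 * headroom first, skb_put() the payload, then skb_push() each header from
 * the innermost outwards. Sizes and names below are arbitrary.
 *
 *	struct sk_buff *skb = alloc_skb(hdr_room + payload_len, GFP_KERNEL);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, hdr_room);			// leave headroom
 *	skb_put_data(skb, payload, payload_len);	// append payload
 *	memcpy(skb_push(skb, sizeof(struct udphdr)), &uh, sizeof(uh));
 */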
1942 */ 1943 1944int ___pskb_trim(struct sk_buff *skb, unsigned int len) 1945{ 1946 struct sk_buff **fragp; 1947 struct sk_buff *frag; 1948 int offset = skb_headlen(skb); 1949 int nfrags = skb_shinfo(skb)->nr_frags; 1950 int i; 1951 int err; 1952 1953 if (skb_cloned(skb) && 1954 unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) 1955 return err; 1956 1957 i = 0; 1958 if (offset >= len) 1959 goto drop_pages; 1960 1961 for (; i < nfrags; i++) { 1962 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); 1963 1964 if (end < len) { 1965 offset = end; 1966 continue; 1967 } 1968 1969 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); 1970 1971drop_pages: 1972 skb_shinfo(skb)->nr_frags = i; 1973 1974 for (; i < nfrags; i++) 1975 skb_frag_unref(skb, i); 1976 1977 if (skb_has_frag_list(skb)) 1978 skb_drop_fraglist(skb); 1979 goto done; 1980 } 1981 1982 for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); 1983 fragp = &frag->next) { 1984 int end = offset + frag->len; 1985 1986 if (skb_shared(frag)) { 1987 struct sk_buff *nfrag; 1988 1989 nfrag = skb_clone(frag, GFP_ATOMIC); 1990 if (unlikely(!nfrag)) 1991 return -ENOMEM; 1992 1993 nfrag->next = frag->next; 1994 consume_skb(frag); 1995 frag = nfrag; 1996 *fragp = frag; 1997 } 1998 1999 if (end < len) { 2000 offset = end; 2001 continue; 2002 } 2003 2004 if (end > len && 2005 unlikely((err = pskb_trim(frag, len - offset)))) 2006 return err; 2007 2008 if (frag->next) 2009 skb_drop_list(&frag->next); 2010 break; 2011 } 2012 2013done: 2014 if (len > skb_headlen(skb)) { 2015 skb->data_len -= skb->len - len; 2016 skb->len = len; 2017 } else { 2018 skb->len = len; 2019 skb->data_len = 0; 2020 skb_set_tail_pointer(skb, len); 2021 } 2022 2023 if (!skb->sk || skb->destructor == sock_edemux) 2024 skb_condense(skb); 2025 return 0; 2026} 2027EXPORT_SYMBOL(___pskb_trim); 2028 2029/* Note : use pskb_trim_rcsum() instead of calling this directly 2030 */ 2031int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) 2032{ 2033 if (skb->ip_summed == CHECKSUM_COMPLETE) { 2034 int delta = skb->len - len; 2035 2036 skb->csum = csum_block_sub(skb->csum, 2037 skb_checksum(skb, len, delta, 0), 2038 len); 2039 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { 2040 int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; 2041 int offset = skb_checksum_start_offset(skb) + skb->csum_offset; 2042 2043 if (offset + sizeof(__sum16) > hdlen) 2044 return -EINVAL; 2045 } 2046 return __pskb_trim(skb, len); 2047} 2048EXPORT_SYMBOL(pskb_trim_rcsum_slow); 2049 2050/** 2051 * __pskb_pull_tail - advance tail of skb header 2052 * @skb: buffer to reallocate 2053 * @delta: number of bytes to advance tail 2054 * 2055 * The function makes a sense only on a fragmented &sk_buff, 2056 * it expands header moving its tail forward and copying necessary 2057 * data from fragmented part. 2058 * 2059 * &sk_buff MUST have reference count of 1. 2060 * 2061 * Returns %NULL (and &sk_buff does not change) if pull failed 2062 * or value of new tail of skb in the case of success. 2063 * 2064 * All the pointers pointing into skb header may change and must be 2065 * reloaded after call to this function. 2066 */ 2067 2068/* Moves tail of skb head forward, copying data from fragmented part, 2069 * when it is necessary. 2070 * 1. It may fail due to malloc failure. 2071 * 2. It may change skb pointers. 2072 * 2073 * It is pretty complicated. Luckily, it is called only in exceptional cases. 
2074 */ 2075void *__pskb_pull_tail(struct sk_buff *skb, int delta) 2076{ 2077 /* If skb has not enough free space at tail, get new one 2078 * plus 128 bytes for future expansions. If we have enough 2079 * room at tail, reallocate without expansion only if skb is cloned. 2080 */ 2081 int i, k, eat = (skb->tail + delta) - skb->end; 2082 2083 if (eat > 0 || skb_cloned(skb)) { 2084 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, 2085 GFP_ATOMIC)) 2086 return NULL; 2087 } 2088 2089 BUG_ON(skb_copy_bits(skb, skb_headlen(skb), 2090 skb_tail_pointer(skb), delta)); 2091 2092 /* Optimization: no fragments, no reasons to preestimate 2093 * size of pulled pages. Superb. 2094 */ 2095 if (!skb_has_frag_list(skb)) 2096 goto pull_pages; 2097 2098 /* Estimate size of pulled pages. */ 2099 eat = delta; 2100 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2101 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2102 2103 if (size >= eat) 2104 goto pull_pages; 2105 eat -= size; 2106 } 2107 2108 /* If we need update frag list, we are in troubles. 2109 * Certainly, it is possible to add an offset to skb data, 2110 * but taking into account that pulling is expected to 2111 * be very rare operation, it is worth to fight against 2112 * further bloating skb head and crucify ourselves here instead. 2113 * Pure masohism, indeed. 8)8) 2114 */ 2115 if (eat) { 2116 struct sk_buff *list = skb_shinfo(skb)->frag_list; 2117 struct sk_buff *clone = NULL; 2118 struct sk_buff *insp = NULL; 2119 2120 do { 2121 if (list->len <= eat) { 2122 /* Eaten as whole. */ 2123 eat -= list->len; 2124 list = list->next; 2125 insp = list; 2126 } else { 2127 /* Eaten partially. */ 2128 if (skb_is_gso(skb) && !list->head_frag && 2129 skb_headlen(list)) 2130 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 2131 2132 if (skb_shared(list)) { 2133 /* Sucks! We need to fork list. :-( */ 2134 clone = skb_clone(list, GFP_ATOMIC); 2135 if (!clone) 2136 return NULL; 2137 insp = list->next; 2138 list = clone; 2139 } else { 2140 /* This may be pulled without 2141 * problems. */ 2142 insp = list; 2143 } 2144 if (!pskb_pull(list, eat)) { 2145 kfree_skb(clone); 2146 return NULL; 2147 } 2148 break; 2149 } 2150 } while (eat); 2151 2152 /* Free pulled out fragments. */ 2153 while ((list = skb_shinfo(skb)->frag_list) != insp) { 2154 skb_shinfo(skb)->frag_list = list->next; 2155 consume_skb(list); 2156 } 2157 /* And insert new clone at head. */ 2158 if (clone) { 2159 clone->next = list; 2160 skb_shinfo(skb)->frag_list = clone; 2161 } 2162 } 2163 /* Success! Now we may commit changes to skb data. 
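	 * The pull_pages loop below drops page frags that were consumed
	 * entirely, trims the first partially consumed frag, and compacts
	 * the frag array; only then are skb->tail and skb->data_len
	 * adjusted by delta.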
*/ 2164 2165pull_pages: 2166 eat = delta; 2167 k = 0; 2168 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2169 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2170 2171 if (size <= eat) { 2172 skb_frag_unref(skb, i); 2173 eat -= size; 2174 } else { 2175 skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; 2176 2177 *frag = skb_shinfo(skb)->frags[i]; 2178 if (eat) { 2179 skb_frag_off_add(frag, eat); 2180 skb_frag_size_sub(frag, eat); 2181 if (!i) 2182 goto end; 2183 eat = 0; 2184 } 2185 k++; 2186 } 2187 } 2188 skb_shinfo(skb)->nr_frags = k; 2189 2190end: 2191 skb->tail += delta; 2192 skb->data_len -= delta; 2193 2194 if (!skb->data_len) 2195 skb_zcopy_clear(skb, false); 2196 2197 return skb_tail_pointer(skb); 2198} 2199EXPORT_SYMBOL(__pskb_pull_tail); 2200 2201/** 2202 * skb_copy_bits - copy bits from skb to kernel buffer 2203 * @skb: source skb 2204 * @offset: offset in source 2205 * @to: destination buffer 2206 * @len: number of bytes to copy 2207 * 2208 * Copy the specified number of bytes from the source skb to the 2209 * destination buffer. 2210 * 2211 * CAUTION ! : 2212 * If its prototype is ever changed, 2213 * check arch/{*}/net/{*}.S files, 2214 * since it is called from BPF assembly code. 2215 */ 2216int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) 2217{ 2218 int start = skb_headlen(skb); 2219 struct sk_buff *frag_iter; 2220 int i, copy; 2221 2222 if (offset > (int)skb->len - len) 2223 goto fault; 2224 2225 /* Copy header. */ 2226 if ((copy = start - offset) > 0) { 2227 if (copy > len) 2228 copy = len; 2229 skb_copy_from_linear_data_offset(skb, offset, to, copy); 2230 if ((len -= copy) == 0) 2231 return 0; 2232 offset += copy; 2233 to += copy; 2234 } 2235 2236 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2237 int end; 2238 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 2239 2240 WARN_ON(start > offset + len); 2241 2242 end = start + skb_frag_size(f); 2243 if ((copy = end - offset) > 0) { 2244 u32 p_off, p_len, copied; 2245 struct page *p; 2246 u8 *vaddr; 2247 2248 if (copy > len) 2249 copy = len; 2250 2251 skb_frag_foreach_page(f, 2252 skb_frag_off(f) + offset - start, 2253 copy, p, p_off, p_len, copied) { 2254 vaddr = kmap_atomic(p); 2255 memcpy(to + copied, vaddr + p_off, p_len); 2256 kunmap_atomic(vaddr); 2257 } 2258 2259 if ((len -= copy) == 0) 2260 return 0; 2261 offset += copy; 2262 to += copy; 2263 } 2264 start = end; 2265 } 2266 2267 skb_walk_frags(skb, frag_iter) { 2268 int end; 2269 2270 WARN_ON(start > offset + len); 2271 2272 end = start + frag_iter->len; 2273 if ((copy = end - offset) > 0) { 2274 if (copy > len) 2275 copy = len; 2276 if (skb_copy_bits(frag_iter, offset - start, to, copy)) 2277 goto fault; 2278 if ((len -= copy) == 0) 2279 return 0; 2280 offset += copy; 2281 to += copy; 2282 } 2283 start = end; 2284 } 2285 2286 if (!len) 2287 return 0; 2288 2289fault: 2290 return -EFAULT; 2291} 2292EXPORT_SYMBOL(skb_copy_bits); 2293 2294/* 2295 * Callback from splice_to_pipe(), if we need to release some pages 2296 * at the end of the spd in case we error'ed out in filling the pipe. 
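 * Each page stored in the spd was grabbed with get_page() in
 * spd_fill_page(), so releasing it here is simply a matter of dropping
 * that extra reference again.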
2297 */ 2298static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 2299{ 2300 put_page(spd->pages[i]); 2301} 2302 2303static struct page *linear_to_page(struct page *page, unsigned int *len, 2304 unsigned int *offset, 2305 struct sock *sk) 2306{ 2307 struct page_frag *pfrag = sk_page_frag(sk); 2308 2309 if (!sk_page_frag_refill(sk, pfrag)) 2310 return NULL; 2311 2312 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); 2313 2314 memcpy(page_address(pfrag->page) + pfrag->offset, 2315 page_address(page) + *offset, *len); 2316 *offset = pfrag->offset; 2317 pfrag->offset += *len; 2318 2319 return pfrag->page; 2320} 2321 2322static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 2323 struct page *page, 2324 unsigned int offset) 2325{ 2326 return spd->nr_pages && 2327 spd->pages[spd->nr_pages - 1] == page && 2328 (spd->partial[spd->nr_pages - 1].offset + 2329 spd->partial[spd->nr_pages - 1].len == offset); 2330} 2331 2332/* 2333 * Fill page/offset/length into spd, if it can hold more pages. 2334 */ 2335static bool spd_fill_page(struct splice_pipe_desc *spd, 2336 struct pipe_inode_info *pipe, struct page *page, 2337 unsigned int *len, unsigned int offset, 2338 bool linear, 2339 struct sock *sk) 2340{ 2341 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) 2342 return true; 2343 2344 if (linear) { 2345 page = linear_to_page(page, len, &offset, sk); 2346 if (!page) 2347 return true; 2348 } 2349 if (spd_can_coalesce(spd, page, offset)) { 2350 spd->partial[spd->nr_pages - 1].len += *len; 2351 return false; 2352 } 2353 get_page(page); 2354 spd->pages[spd->nr_pages] = page; 2355 spd->partial[spd->nr_pages].len = *len; 2356 spd->partial[spd->nr_pages].offset = offset; 2357 spd->nr_pages++; 2358 2359 return false; 2360} 2361 2362static bool __splice_segment(struct page *page, unsigned int poff, 2363 unsigned int plen, unsigned int *off, 2364 unsigned int *len, 2365 struct splice_pipe_desc *spd, bool linear, 2366 struct sock *sk, 2367 struct pipe_inode_info *pipe) 2368{ 2369 if (!*len) 2370 return true; 2371 2372 /* skip this segment if already processed */ 2373 if (*off >= plen) { 2374 *off -= plen; 2375 return false; 2376 } 2377 2378 /* ignore any bits we already processed */ 2379 poff += *off; 2380 plen -= *off; 2381 *off = 0; 2382 2383 do { 2384 unsigned int flen = min(*len, plen); 2385 2386 if (spd_fill_page(spd, pipe, page, &flen, poff, 2387 linear, sk)) 2388 return true; 2389 poff += flen; 2390 plen -= flen; 2391 *len -= flen; 2392 } while (*len && plen); 2393 2394 return false; 2395} 2396 2397/* 2398 * Map linear and fragment data from the skb to spd. It reports true if the 2399 * pipe is full or if we already spliced the requested length. 2400 */ 2401static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 2402 unsigned int *offset, unsigned int *len, 2403 struct splice_pipe_desc *spd, struct sock *sk) 2404{ 2405 int seg; 2406 struct sk_buff *iter; 2407 2408 /* map the linear part : 2409 * If skb->head_frag is set, this 'linear' part is backed by a 2410 * fragment, and if the head is not shared with any clones then 2411 * we can avoid a copy since we own the head portion of this page. 
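	 * Otherwise skb_head_is_locked() is true and the head is treated as
	 * "linear", which makes linear_to_page() bounce those bytes through
	 * the socket's page frag instead of referencing skb->head directly.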
2412 */ 2413 if (__splice_segment(virt_to_page(skb->data), 2414 (unsigned long) skb->data & (PAGE_SIZE - 1), 2415 skb_headlen(skb), 2416 offset, len, spd, 2417 skb_head_is_locked(skb), 2418 sk, pipe)) 2419 return true; 2420 2421 /* 2422 * then map the fragments 2423 */ 2424 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 2425 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 2426 2427 if (__splice_segment(skb_frag_page(f), 2428 skb_frag_off(f), skb_frag_size(f), 2429 offset, len, spd, false, sk, pipe)) 2430 return true; 2431 } 2432 2433 skb_walk_frags(skb, iter) { 2434 if (*offset >= iter->len) { 2435 *offset -= iter->len; 2436 continue; 2437 } 2438 /* __skb_splice_bits() only fails if the output has no room 2439 * left, so no point in going over the frag_list for the error 2440 * case. 2441 */ 2442 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) 2443 return true; 2444 } 2445 2446 return false; 2447} 2448 2449/* 2450 * Map data from the skb to a pipe. Should handle both the linear part, 2451 * the fragments, and the frag list. 2452 */ 2453int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, 2454 struct pipe_inode_info *pipe, unsigned int tlen, 2455 unsigned int flags) 2456{ 2457 struct partial_page partial[MAX_SKB_FRAGS]; 2458 struct page *pages[MAX_SKB_FRAGS]; 2459 struct splice_pipe_desc spd = { 2460 .pages = pages, 2461 .partial = partial, 2462 .nr_pages_max = MAX_SKB_FRAGS, 2463 .ops = &nosteal_pipe_buf_ops, 2464 .spd_release = sock_spd_release, 2465 }; 2466 int ret = 0; 2467 2468 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); 2469 2470 if (spd.nr_pages) 2471 ret = splice_to_pipe(pipe, &spd); 2472 2473 return ret; 2474} 2475EXPORT_SYMBOL_GPL(skb_splice_bits); 2476 2477/* Send skb data on a socket. Socket must be locked. */ 2478int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 2479 int len) 2480{ 2481 unsigned int orig_len = len; 2482 struct sk_buff *head = skb; 2483 unsigned short fragidx; 2484 int slen, ret; 2485 2486do_frag_list: 2487 2488 /* Deal with head data */ 2489 while (offset < skb_headlen(skb) && len) { 2490 struct kvec kv; 2491 struct msghdr msg; 2492 2493 slen = min_t(int, len, skb_headlen(skb) - offset); 2494 kv.iov_base = skb->data + offset; 2495 kv.iov_len = slen; 2496 memset(&msg, 0, sizeof(msg)); 2497 msg.msg_flags = MSG_DONTWAIT; 2498 2499 ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); 2500 if (ret <= 0) 2501 goto error; 2502 2503 offset += ret; 2504 len -= ret; 2505 } 2506 2507 /* All the data was skb head? 
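	 * If so we are done; otherwise fall through and walk the page
	 * frags, and after those any frag_list skbs, resuming the send
	 * from the current offset.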
*/ 2508 if (!len) 2509 goto out; 2510 2511 /* Make offset relative to start of frags */ 2512 offset -= skb_headlen(skb); 2513 2514 /* Find where we are in frag list */ 2515 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2516 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2517 2518 if (offset < skb_frag_size(frag)) 2519 break; 2520 2521 offset -= skb_frag_size(frag); 2522 } 2523 2524 for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2525 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2526 2527 slen = min_t(size_t, len, skb_frag_size(frag) - offset); 2528 2529 while (slen) { 2530 ret = kernel_sendpage_locked(sk, skb_frag_page(frag), 2531 skb_frag_off(frag) + offset, 2532 slen, MSG_DONTWAIT); 2533 if (ret <= 0) 2534 goto error; 2535 2536 len -= ret; 2537 offset += ret; 2538 slen -= ret; 2539 } 2540 2541 offset = 0; 2542 } 2543 2544 if (len) { 2545 /* Process any frag lists */ 2546 2547 if (skb == head) { 2548 if (skb_has_frag_list(skb)) { 2549 skb = skb_shinfo(skb)->frag_list; 2550 goto do_frag_list; 2551 } 2552 } else if (skb->next) { 2553 skb = skb->next; 2554 goto do_frag_list; 2555 } 2556 } 2557 2558out: 2559 return orig_len - len; 2560 2561error: 2562 return orig_len == len ? ret : orig_len - len; 2563} 2564EXPORT_SYMBOL_GPL(skb_send_sock_locked); 2565 2566/** 2567 * skb_store_bits - store bits from kernel buffer to skb 2568 * @skb: destination buffer 2569 * @offset: offset in destination 2570 * @from: source buffer 2571 * @len: number of bytes to copy 2572 * 2573 * Copy the specified number of bytes from the source buffer to the 2574 * destination skb. This function handles all the messy bits of 2575 * traversing fragment lists and such. 2576 */ 2577 2578int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) 2579{ 2580 int start = skb_headlen(skb); 2581 struct sk_buff *frag_iter; 2582 int i, copy; 2583 2584 if (offset > (int)skb->len - len) 2585 goto fault; 2586 2587 if ((copy = start - offset) > 0) { 2588 if (copy > len) 2589 copy = len; 2590 skb_copy_to_linear_data_offset(skb, offset, from, copy); 2591 if ((len -= copy) == 0) 2592 return 0; 2593 offset += copy; 2594 from += copy; 2595 } 2596 2597 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2598 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2599 int end; 2600 2601 WARN_ON(start > offset + len); 2602 2603 end = start + skb_frag_size(frag); 2604 if ((copy = end - offset) > 0) { 2605 u32 p_off, p_len, copied; 2606 struct page *p; 2607 u8 *vaddr; 2608 2609 if (copy > len) 2610 copy = len; 2611 2612 skb_frag_foreach_page(frag, 2613 skb_frag_off(frag) + offset - start, 2614 copy, p, p_off, p_len, copied) { 2615 vaddr = kmap_atomic(p); 2616 memcpy(vaddr + p_off, from + copied, p_len); 2617 kunmap_atomic(vaddr); 2618 } 2619 2620 if ((len -= copy) == 0) 2621 return 0; 2622 offset += copy; 2623 from += copy; 2624 } 2625 start = end; 2626 } 2627 2628 skb_walk_frags(skb, frag_iter) { 2629 int end; 2630 2631 WARN_ON(start > offset + len); 2632 2633 end = start + frag_iter->len; 2634 if ((copy = end - offset) > 0) { 2635 if (copy > len) 2636 copy = len; 2637 if (skb_store_bits(frag_iter, offset - start, 2638 from, copy)) 2639 goto fault; 2640 if ((len -= copy) == 0) 2641 return 0; 2642 offset += copy; 2643 from += copy; 2644 } 2645 start = end; 2646 } 2647 if (!len) 2648 return 0; 2649 2650fault: 2651 return -EFAULT; 2652} 2653EXPORT_SYMBOL(skb_store_bits); 2654 2655/* Checksum skb data. 
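 *
 * Illustrative usage sketch (not part of the original source): computing
 * the checksum over a whole packet and folding it down to 16 bits:
 *
 *	__wsum csum = skb_checksum(skb, 0, skb->len, 0);
 *	__sum16 folded = csum_fold(csum);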
*/ 2656__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, 2657 __wsum csum, const struct skb_checksum_ops *ops) 2658{ 2659 int start = skb_headlen(skb); 2660 int i, copy = start - offset; 2661 struct sk_buff *frag_iter; 2662 int pos = 0; 2663 2664 /* Checksum header. */ 2665 if (copy > 0) { 2666 if (copy > len) 2667 copy = len; 2668 csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, 2669 skb->data + offset, copy, csum); 2670 if ((len -= copy) == 0) 2671 return csum; 2672 offset += copy; 2673 pos = copy; 2674 } 2675 2676 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2677 int end; 2678 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2679 2680 WARN_ON(start > offset + len); 2681 2682 end = start + skb_frag_size(frag); 2683 if ((copy = end - offset) > 0) { 2684 u32 p_off, p_len, copied; 2685 struct page *p; 2686 __wsum csum2; 2687 u8 *vaddr; 2688 2689 if (copy > len) 2690 copy = len; 2691 2692 skb_frag_foreach_page(frag, 2693 skb_frag_off(frag) + offset - start, 2694 copy, p, p_off, p_len, copied) { 2695 vaddr = kmap_atomic(p); 2696 csum2 = INDIRECT_CALL_1(ops->update, 2697 csum_partial_ext, 2698 vaddr + p_off, p_len, 0); 2699 kunmap_atomic(vaddr); 2700 csum = INDIRECT_CALL_1(ops->combine, 2701 csum_block_add_ext, csum, 2702 csum2, pos, p_len); 2703 pos += p_len; 2704 } 2705 2706 if (!(len -= copy)) 2707 return csum; 2708 offset += copy; 2709 } 2710 start = end; 2711 } 2712 2713 skb_walk_frags(skb, frag_iter) { 2714 int end; 2715 2716 WARN_ON(start > offset + len); 2717 2718 end = start + frag_iter->len; 2719 if ((copy = end - offset) > 0) { 2720 __wsum csum2; 2721 if (copy > len) 2722 copy = len; 2723 csum2 = __skb_checksum(frag_iter, offset - start, 2724 copy, 0, ops); 2725 csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, 2726 csum, csum2, pos, copy); 2727 if ((len -= copy) == 0) 2728 return csum; 2729 offset += copy; 2730 pos += copy; 2731 } 2732 start = end; 2733 } 2734 BUG_ON(len); 2735 2736 return csum; 2737} 2738EXPORT_SYMBOL(__skb_checksum); 2739 2740__wsum skb_checksum(const struct sk_buff *skb, int offset, 2741 int len, __wsum csum) 2742{ 2743 const struct skb_checksum_ops ops = { 2744 .update = csum_partial_ext, 2745 .combine = csum_block_add_ext, 2746 }; 2747 2748 return __skb_checksum(skb, offset, len, csum, &ops); 2749} 2750EXPORT_SYMBOL(skb_checksum); 2751 2752/* Both of above in one bottle. */ 2753 2754__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, 2755 u8 *to, int len) 2756{ 2757 int start = skb_headlen(skb); 2758 int i, copy = start - offset; 2759 struct sk_buff *frag_iter; 2760 int pos = 0; 2761 __wsum csum = 0; 2762 2763 /* Copy header. 
*/ 2764 if (copy > 0) { 2765 if (copy > len) 2766 copy = len; 2767 csum = csum_partial_copy_nocheck(skb->data + offset, to, 2768 copy); 2769 if ((len -= copy) == 0) 2770 return csum; 2771 offset += copy; 2772 to += copy; 2773 pos = copy; 2774 } 2775 2776 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2777 int end; 2778 2779 WARN_ON(start > offset + len); 2780 2781 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 2782 if ((copy = end - offset) > 0) { 2783 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2784 u32 p_off, p_len, copied; 2785 struct page *p; 2786 __wsum csum2; 2787 u8 *vaddr; 2788 2789 if (copy > len) 2790 copy = len; 2791 2792 skb_frag_foreach_page(frag, 2793 skb_frag_off(frag) + offset - start, 2794 copy, p, p_off, p_len, copied) { 2795 vaddr = kmap_atomic(p); 2796 csum2 = csum_partial_copy_nocheck(vaddr + p_off, 2797 to + copied, 2798 p_len); 2799 kunmap_atomic(vaddr); 2800 csum = csum_block_add(csum, csum2, pos); 2801 pos += p_len; 2802 } 2803 2804 if (!(len -= copy)) 2805 return csum; 2806 offset += copy; 2807 to += copy; 2808 } 2809 start = end; 2810 } 2811 2812 skb_walk_frags(skb, frag_iter) { 2813 __wsum csum2; 2814 int end; 2815 2816 WARN_ON(start > offset + len); 2817 2818 end = start + frag_iter->len; 2819 if ((copy = end - offset) > 0) { 2820 if (copy > len) 2821 copy = len; 2822 csum2 = skb_copy_and_csum_bits(frag_iter, 2823 offset - start, 2824 to, copy); 2825 csum = csum_block_add(csum, csum2, pos); 2826 if ((len -= copy) == 0) 2827 return csum; 2828 offset += copy; 2829 to += copy; 2830 pos += copy; 2831 } 2832 start = end; 2833 } 2834 BUG_ON(len); 2835 return csum; 2836} 2837EXPORT_SYMBOL(skb_copy_and_csum_bits); 2838 2839__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) 2840{ 2841 __sum16 sum; 2842 2843 sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); 2844 /* See comments in __skb_checksum_complete(). */ 2845 if (likely(!sum)) { 2846 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 2847 !skb->csum_complete_sw) 2848 netdev_rx_csum_fault(skb->dev, skb); 2849 } 2850 if (!skb_shared(skb)) 2851 skb->csum_valid = !sum; 2852 return sum; 2853} 2854EXPORT_SYMBOL(__skb_checksum_complete_head); 2855 2856/* This function assumes skb->csum already holds pseudo header's checksum, 2857 * which has been changed from the hardware checksum, for example, by 2858 * __skb_checksum_validate_complete(). And, the original skb->csum must 2859 * have been validated unsuccessfully for CHECKSUM_COMPLETE case. 2860 * 2861 * It returns non-zero if the recomputed checksum is still invalid, otherwise 2862 * zero. The new checksum is stored back into skb->csum unless the skb is 2863 * shared. 2864 */ 2865__sum16 __skb_checksum_complete(struct sk_buff *skb) 2866{ 2867 __wsum csum; 2868 __sum16 sum; 2869 2870 csum = skb_checksum(skb, 0, skb->len, 0); 2871 2872 sum = csum_fold(csum_add(skb->csum, csum)); 2873 /* This check is inverted, because we already knew the hardware 2874 * checksum is invalid before calling this function. So, if the 2875 * re-computed checksum is valid instead, then we have a mismatch 2876 * between the original skb->csum and skb_checksum(). This means either 2877 * the original hardware checksum is incorrect or we screw up skb->csum 2878 * when moving skb->data around. 
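 *
 * Illustrative note (not part of the original source): protocol handlers
 * normally use the skb_checksum_complete() wrapper and treat a non-zero
 * result as a checksum failure, e.g.:
 *
 *	if (skb_checksum_complete(skb))
 *		goto csum_error;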
2879 */ 2880 if (likely(!sum)) { 2881 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 2882 !skb->csum_complete_sw) 2883 netdev_rx_csum_fault(skb->dev, skb); 2884 } 2885 2886 if (!skb_shared(skb)) { 2887 /* Save full packet checksum */ 2888 skb->csum = csum; 2889 skb->ip_summed = CHECKSUM_COMPLETE; 2890 skb->csum_complete_sw = 1; 2891 skb->csum_valid = !sum; 2892 } 2893 2894 return sum; 2895} 2896EXPORT_SYMBOL(__skb_checksum_complete); 2897 2898static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) 2899{ 2900 net_warn_ratelimited( 2901 "%s: attempt to compute crc32c without libcrc32c.ko\n", 2902 __func__); 2903 return 0; 2904} 2905 2906static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, 2907 int offset, int len) 2908{ 2909 net_warn_ratelimited( 2910 "%s: attempt to compute crc32c without libcrc32c.ko\n", 2911 __func__); 2912 return 0; 2913} 2914 2915static const struct skb_checksum_ops default_crc32c_ops = { 2916 .update = warn_crc32c_csum_update, 2917 .combine = warn_crc32c_csum_combine, 2918}; 2919 2920const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = 2921 &default_crc32c_ops; 2922EXPORT_SYMBOL(crc32c_csum_stub); 2923 2924 /** 2925 * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() 2926 * @from: source buffer 2927 * 2928 * Calculates the amount of linear headroom needed in the 'to' skb passed 2929 * into skb_zerocopy(). 2930 */ 2931unsigned int 2932skb_zerocopy_headlen(const struct sk_buff *from) 2933{ 2934 unsigned int hlen = 0; 2935 2936 if (!from->head_frag || 2937 skb_headlen(from) < L1_CACHE_BYTES || 2938 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { 2939 hlen = skb_headlen(from); 2940 if (!hlen) 2941 hlen = from->len; 2942 } 2943 2944 if (skb_has_frag_list(from)) 2945 hlen = from->len; 2946 2947 return hlen; 2948} 2949EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); 2950 2951/** 2952 * skb_zerocopy - Zero copy skb to skb 2953 * @to: destination buffer 2954 * @from: source buffer 2955 * @len: number of bytes to copy from source buffer 2956 * @hlen: size of linear headroom in destination buffer 2957 * 2958 * Copies up to `len` bytes from `from` to `to` by creating references 2959 * to the frags in the source buffer. 2960 * 2961 * The `hlen` as calculated by skb_zerocopy_headlen() specifies the 2962 * headroom in the `to` buffer. 
2963 * 2964 * Return value: 2965 * 0: everything is OK 2966 * -ENOMEM: couldn't orphan frags of @from due to lack of memory 2967 * -EFAULT: skb_copy_bits() found some problem with skb geometry 2968 */ 2969int 2970skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) 2971{ 2972 int i, j = 0; 2973 int plen = 0; /* length of skb->head fragment */ 2974 int ret; 2975 struct page *page; 2976 unsigned int offset; 2977 2978 BUG_ON(!from->head_frag && !hlen); 2979 2980 /* dont bother with small payloads */ 2981 if (len <= skb_tailroom(to)) 2982 return skb_copy_bits(from, 0, skb_put(to, len), len); 2983 2984 if (hlen) { 2985 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); 2986 if (unlikely(ret)) 2987 return ret; 2988 len -= hlen; 2989 } else { 2990 plen = min_t(int, skb_headlen(from), len); 2991 if (plen) { 2992 page = virt_to_head_page(from->head); 2993 offset = from->data - (unsigned char *)page_address(page); 2994 __skb_fill_page_desc(to, 0, page, offset, plen); 2995 get_page(page); 2996 j = 1; 2997 len -= plen; 2998 } 2999 } 3000 3001 to->truesize += len + plen; 3002 to->len += len + plen; 3003 to->data_len += len + plen; 3004 3005 if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { 3006 skb_tx_error(from); 3007 return -ENOMEM; 3008 } 3009 skb_zerocopy_clone(to, from, GFP_ATOMIC); 3010 3011 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { 3012 int size; 3013 3014 if (!len) 3015 break; 3016 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; 3017 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), 3018 len); 3019 skb_frag_size_set(&skb_shinfo(to)->frags[j], size); 3020 len -= size; 3021 skb_frag_ref(to, j); 3022 j++; 3023 } 3024 skb_shinfo(to)->nr_frags = j; 3025 3026 return 0; 3027} 3028EXPORT_SYMBOL_GPL(skb_zerocopy); 3029 3030void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) 3031{ 3032 __wsum csum; 3033 long csstart; 3034 3035 if (skb->ip_summed == CHECKSUM_PARTIAL) 3036 csstart = skb_checksum_start_offset(skb); 3037 else 3038 csstart = skb_headlen(skb); 3039 3040 BUG_ON(csstart > skb_headlen(skb)); 3041 3042 skb_copy_from_linear_data(skb, to, csstart); 3043 3044 csum = 0; 3045 if (csstart != skb->len) 3046 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, 3047 skb->len - csstart); 3048 3049 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3050 long csstuff = csstart + skb->csum_offset; 3051 3052 *((__sum16 *)(to + csstuff)) = csum_fold(csum); 3053 } 3054} 3055EXPORT_SYMBOL(skb_copy_and_csum_dev); 3056 3057/** 3058 * skb_dequeue - remove from the head of the queue 3059 * @list: list to dequeue from 3060 * 3061 * Remove the head of the list. The list lock is taken so the function 3062 * may be used safely with other locking list functions. The head item is 3063 * returned or %NULL if the list is empty. 3064 */ 3065 3066struct sk_buff *skb_dequeue(struct sk_buff_head *list) 3067{ 3068 unsigned long flags; 3069 struct sk_buff *result; 3070 3071 spin_lock_irqsave(&list->lock, flags); 3072 result = __skb_dequeue(list); 3073 spin_unlock_irqrestore(&list->lock, flags); 3074 return result; 3075} 3076EXPORT_SYMBOL(skb_dequeue); 3077 3078/** 3079 * skb_dequeue_tail - remove from the tail of the queue 3080 * @list: list to dequeue from 3081 * 3082 * Remove the tail of the list. The list lock is taken so the function 3083 * may be used safely with other locking list functions. The tail item is 3084 * returned or %NULL if the list is empty. 
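 *
 * Illustrative example (not part of the original source): draining a
 * caller-owned queue from the back, freeing each buffer; here queue is a
 * hypothetical struct sk_buff_head owned by the caller:
 *
 *	struct sk_buff *skb;
 *
 *	while ((skb = skb_dequeue_tail(&queue)) != NULL)
 *		kfree_skb(skb);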
3085 */ 3086struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) 3087{ 3088 unsigned long flags; 3089 struct sk_buff *result; 3090 3091 spin_lock_irqsave(&list->lock, flags); 3092 result = __skb_dequeue_tail(list); 3093 spin_unlock_irqrestore(&list->lock, flags); 3094 return result; 3095} 3096EXPORT_SYMBOL(skb_dequeue_tail); 3097 3098/** 3099 * skb_queue_purge - empty a list 3100 * @list: list to empty 3101 * 3102 * Delete all buffers on an &sk_buff list. Each buffer is removed from 3103 * the list and one reference dropped. This function takes the list 3104 * lock and is atomic with respect to other list locking functions. 3105 */ 3106void skb_queue_purge(struct sk_buff_head *list) 3107{ 3108 struct sk_buff *skb; 3109 while ((skb = skb_dequeue(list)) != NULL) 3110 kfree_skb(skb); 3111} 3112EXPORT_SYMBOL(skb_queue_purge); 3113 3114/** 3115 * skb_rbtree_purge - empty a skb rbtree 3116 * @root: root of the rbtree to empty 3117 * Return value: the sum of truesizes of all purged skbs. 3118 * 3119 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from 3120 * the list and one reference dropped. This function does not take 3121 * any lock. Synchronization should be handled by the caller (e.g., TCP 3122 * out-of-order queue is protected by the socket lock). 3123 */ 3124unsigned int skb_rbtree_purge(struct rb_root *root) 3125{ 3126 struct rb_node *p = rb_first(root); 3127 unsigned int sum = 0; 3128 3129 while (p) { 3130 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 3131 3132 p = rb_next(p); 3133 rb_erase(&skb->rbnode, root); 3134 sum += skb->truesize; 3135 kfree_skb(skb); 3136 } 3137 return sum; 3138} 3139 3140/** 3141 * skb_queue_head - queue a buffer at the list head 3142 * @list: list to use 3143 * @newsk: buffer to queue 3144 * 3145 * Queue a buffer at the start of the list. This function takes the 3146 * list lock and can be used safely with other locking &sk_buff functions 3147 * safely. 3148 * 3149 * A buffer cannot be placed on two lists at the same time. 3150 */ 3151void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) 3152{ 3153 unsigned long flags; 3154 3155 spin_lock_irqsave(&list->lock, flags); 3156 __skb_queue_head(list, newsk); 3157 spin_unlock_irqrestore(&list->lock, flags); 3158} 3159EXPORT_SYMBOL(skb_queue_head); 3160 3161/** 3162 * skb_queue_tail - queue a buffer at the list tail 3163 * @list: list to use 3164 * @newsk: buffer to queue 3165 * 3166 * Queue a buffer at the tail of the list. This function takes the 3167 * list lock and can be used safely with other locking &sk_buff functions 3168 * safely. 3169 * 3170 * A buffer cannot be placed on two lists at the same time. 3171 */ 3172void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) 3173{ 3174 unsigned long flags; 3175 3176 spin_lock_irqsave(&list->lock, flags); 3177 __skb_queue_tail(list, newsk); 3178 spin_unlock_irqrestore(&list->lock, flags); 3179} 3180EXPORT_SYMBOL(skb_queue_tail); 3181 3182/** 3183 * skb_unlink - remove a buffer from a list 3184 * @skb: buffer to remove 3185 * @list: list to use 3186 * 3187 * Remove a packet from a list. The list locks are taken and this 3188 * function is atomic with respect to other list locked calls 3189 * 3190 * You must know what list the SKB is on. 
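 *
 * Illustrative example (not part of the original source): dropping one
 * specific buffer from a private queue; my_queue is a hypothetical
 * caller-owned struct sk_buff_head:
 *
 *	skb_unlink(skb, &my_queue);
 *	kfree_skb(skb);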
3191 */ 3192void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) 3193{ 3194 unsigned long flags; 3195 3196 spin_lock_irqsave(&list->lock, flags); 3197 __skb_unlink(skb, list); 3198 spin_unlock_irqrestore(&list->lock, flags); 3199} 3200EXPORT_SYMBOL(skb_unlink); 3201 3202/** 3203 * skb_append - append a buffer 3204 * @old: buffer to insert after 3205 * @newsk: buffer to insert 3206 * @list: list to use 3207 * 3208 * Place a packet after a given packet in a list. The list locks are taken 3209 * and this function is atomic with respect to other list locked calls. 3210 * A buffer cannot be placed on two lists at the same time. 3211 */ 3212void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 3213{ 3214 unsigned long flags; 3215 3216 spin_lock_irqsave(&list->lock, flags); 3217 __skb_queue_after(list, old, newsk); 3218 spin_unlock_irqrestore(&list->lock, flags); 3219} 3220EXPORT_SYMBOL(skb_append); 3221 3222static inline void skb_split_inside_header(struct sk_buff *skb, 3223 struct sk_buff* skb1, 3224 const u32 len, const int pos) 3225{ 3226 int i; 3227 3228 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 3229 pos - len); 3230 /* And move data appendix as is. */ 3231 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 3232 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 3233 3234 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 3235 skb_shinfo(skb)->nr_frags = 0; 3236 skb1->data_len = skb->data_len; 3237 skb1->len += skb1->data_len; 3238 skb->data_len = 0; 3239 skb->len = len; 3240 skb_set_tail_pointer(skb, len); 3241} 3242 3243static inline void skb_split_no_header(struct sk_buff *skb, 3244 struct sk_buff* skb1, 3245 const u32 len, int pos) 3246{ 3247 int i, k = 0; 3248 const int nfrags = skb_shinfo(skb)->nr_frags; 3249 3250 skb_shinfo(skb)->nr_frags = 0; 3251 skb1->len = skb1->data_len = skb->len - len; 3252 skb->len = len; 3253 skb->data_len = len - pos; 3254 3255 for (i = 0; i < nfrags; i++) { 3256 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 3257 3258 if (pos + size > len) { 3259 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 3260 3261 if (pos < len) { 3262 /* Split frag. 3263 * We have two variants in this case: 3264 * 1. Move all the frag to the second 3265 * part, if it is possible. F.e. 3266 * this approach is mandatory for TUX, 3267 * where splitting is expensive. 3268 * 2. Split is accurately. We make this. 3269 */ 3270 skb_frag_ref(skb, i); 3271 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); 3272 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 3273 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 3274 skb_shinfo(skb)->nr_frags++; 3275 } 3276 k++; 3277 } else 3278 skb_shinfo(skb)->nr_frags++; 3279 pos += size; 3280 } 3281 skb_shinfo(skb1)->nr_frags = k; 3282} 3283 3284/** 3285 * skb_split - Split fragmented skb to two parts at length len. 3286 * @skb: the buffer to split 3287 * @skb1: the buffer to receive the second part 3288 * @len: new length for skb 3289 */ 3290void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 3291{ 3292 int pos = skb_headlen(skb); 3293 3294 skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & 3295 SKBTX_SHARED_FRAG; 3296 skb_zerocopy_clone(skb1, skb, 0); 3297 if (len < pos) /* Split line is inside header. */ 3298 skb_split_inside_header(skb, skb1, len, pos); 3299 else /* Second chunk has no header, nothing to copy. 
*/ 3300 skb_split_no_header(skb, skb1, len, pos); 3301} 3302EXPORT_SYMBOL(skb_split); 3303 3304/* Shifting from/to a cloned skb is a no-go. 3305 * 3306 * Caller cannot keep skb_shinfo related pointers past calling here! 3307 */ 3308static int skb_prepare_for_shift(struct sk_buff *skb) 3309{ 3310 int ret = 0; 3311 3312 if (skb_cloned(skb)) { 3313 /* Save and restore truesize: pskb_expand_head() may reallocate 3314 * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we 3315 * cannot change truesize at this point. 3316 */ 3317 unsigned int save_truesize = skb->truesize; 3318 3319 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 3320 skb->truesize = save_truesize; 3321 } 3322 return ret; 3323} 3324 3325/** 3326 * skb_shift - Shifts paged data partially from skb to another 3327 * @tgt: buffer into which tail data gets added 3328 * @skb: buffer from which the paged data comes from 3329 * @shiftlen: shift up to this many bytes 3330 * 3331 * Attempts to shift up to shiftlen worth of bytes, which may be less than 3332 * the length of the skb, from skb to tgt. Returns number bytes shifted. 3333 * It's up to caller to free skb if everything was shifted. 3334 * 3335 * If @tgt runs out of frags, the whole operation is aborted. 3336 * 3337 * Skb cannot include anything else but paged data while tgt is allowed 3338 * to have non-paged data as well. 3339 * 3340 * TODO: full sized shift could be optimized but that would need 3341 * specialized skb free'er to handle frags without up-to-date nr_frags. 3342 */ 3343int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) 3344{ 3345 int from, to, merge, todo; 3346 skb_frag_t *fragfrom, *fragto; 3347 3348 BUG_ON(shiftlen > skb->len); 3349 3350 if (skb_headlen(skb)) 3351 return 0; 3352 if (skb_zcopy(tgt) || skb_zcopy(skb)) 3353 return 0; 3354 3355 todo = shiftlen; 3356 from = 0; 3357 to = skb_shinfo(tgt)->nr_frags; 3358 fragfrom = &skb_shinfo(skb)->frags[from]; 3359 3360 /* Actual merge is delayed until the point when we know we can 3361 * commit all, so that we don't have to undo partial changes 3362 */ 3363 if (!to || 3364 !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), 3365 skb_frag_off(fragfrom))) { 3366 merge = -1; 3367 } else { 3368 merge = to - 1; 3369 3370 todo -= skb_frag_size(fragfrom); 3371 if (todo < 0) { 3372 if (skb_prepare_for_shift(skb) || 3373 skb_prepare_for_shift(tgt)) 3374 return 0; 3375 3376 /* All previous frag pointers might be stale! 
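			 * pskb_expand_head() may have moved skb_shared_info
			 * to freshly allocated memory, so fragfrom and
			 * fragto are reloaded from the (possibly new) frag
			 * arrays right below before being dereferenced again.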
*/ 3377 fragfrom = &skb_shinfo(skb)->frags[from]; 3378 fragto = &skb_shinfo(tgt)->frags[merge]; 3379 3380 skb_frag_size_add(fragto, shiftlen); 3381 skb_frag_size_sub(fragfrom, shiftlen); 3382 skb_frag_off_add(fragfrom, shiftlen); 3383 3384 goto onlymerged; 3385 } 3386 3387 from++; 3388 } 3389 3390 /* Skip full, not-fitting skb to avoid expensive operations */ 3391 if ((shiftlen == skb->len) && 3392 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 3393 return 0; 3394 3395 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 3396 return 0; 3397 3398 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 3399 if (to == MAX_SKB_FRAGS) 3400 return 0; 3401 3402 fragfrom = &skb_shinfo(skb)->frags[from]; 3403 fragto = &skb_shinfo(tgt)->frags[to]; 3404 3405 if (todo >= skb_frag_size(fragfrom)) { 3406 *fragto = *fragfrom; 3407 todo -= skb_frag_size(fragfrom); 3408 from++; 3409 to++; 3410 3411 } else { 3412 __skb_frag_ref(fragfrom); 3413 skb_frag_page_copy(fragto, fragfrom); 3414 skb_frag_off_copy(fragto, fragfrom); 3415 skb_frag_size_set(fragto, todo); 3416 3417 skb_frag_off_add(fragfrom, todo); 3418 skb_frag_size_sub(fragfrom, todo); 3419 todo = 0; 3420 3421 to++; 3422 break; 3423 } 3424 } 3425 3426 /* Ready to "commit" this state change to tgt */ 3427 skb_shinfo(tgt)->nr_frags = to; 3428 3429 if (merge >= 0) { 3430 fragfrom = &skb_shinfo(skb)->frags[0]; 3431 fragto = &skb_shinfo(tgt)->frags[merge]; 3432 3433 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 3434 __skb_frag_unref(fragfrom); 3435 } 3436 3437 /* Reposition in the original skb */ 3438 to = 0; 3439 while (from < skb_shinfo(skb)->nr_frags) 3440 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 3441 skb_shinfo(skb)->nr_frags = to; 3442 3443 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 3444 3445onlymerged: 3446 /* Most likely the tgt won't ever need its checksum anymore, skb on 3447 * the other hand might need it if it needs to be resent 3448 */ 3449 tgt->ip_summed = CHECKSUM_PARTIAL; 3450 skb->ip_summed = CHECKSUM_PARTIAL; 3451 3452 /* Yak, is it really working this way? Some helper please? */ 3453 skb->len -= shiftlen; 3454 skb->data_len -= shiftlen; 3455 skb->truesize -= shiftlen; 3456 tgt->len += shiftlen; 3457 tgt->data_len += shiftlen; 3458 tgt->truesize += shiftlen; 3459 3460 return shiftlen; 3461} 3462 3463/** 3464 * skb_prepare_seq_read - Prepare a sequential read of skb data 3465 * @skb: the buffer to read 3466 * @from: lower offset of data to be read 3467 * @to: upper offset of data to be read 3468 * @st: state variable 3469 * 3470 * Initializes the specified state variable. Must be called before 3471 * invoking skb_seq_read() for the first time. 3472 */ 3473void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 3474 unsigned int to, struct skb_seq_state *st) 3475{ 3476 st->lower_offset = from; 3477 st->upper_offset = to; 3478 st->root_skb = st->cur_skb = skb; 3479 st->frag_idx = st->stepped_offset = 0; 3480 st->frag_data = NULL; 3481} 3482EXPORT_SYMBOL(skb_prepare_seq_read); 3483 3484/** 3485 * skb_seq_read - Sequentially read skb data 3486 * @consumed: number of bytes consumed by the caller so far 3487 * @data: destination pointer for data to be returned 3488 * @st: state variable 3489 * 3490 * Reads a block of skb data at @consumed relative to the 3491 * lower offset specified to skb_prepare_seq_read(). Assigns 3492 * the head of the data block to @data and returns the length 3493 * of the block or 0 if the end of the skb data or the upper 3494 * offset has been reached. 
3495 * 3496 * The caller is not required to consume all of the data 3497 * returned, i.e. @consumed is typically set to the number 3498 * of bytes already consumed and the next call to 3499 * skb_seq_read() will return the remaining part of the block. 3500 * 3501 * Note 1: The size of each block of data returned can be arbitrary, 3502 * this limitation is the cost for zerocopy sequential 3503 * reads of potentially non linear data. 3504 * 3505 * Note 2: Fragment lists within fragments are not implemented 3506 * at the moment, state->root_skb could be replaced with 3507 * a stack for this purpose. 3508 */ 3509unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 3510 struct skb_seq_state *st) 3511{ 3512 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 3513 skb_frag_t *frag; 3514 3515 if (unlikely(abs_offset >= st->upper_offset)) { 3516 if (st->frag_data) { 3517 kunmap_atomic(st->frag_data); 3518 st->frag_data = NULL; 3519 } 3520 return 0; 3521 } 3522 3523next_skb: 3524 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 3525 3526 if (abs_offset < block_limit && !st->frag_data) { 3527 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 3528 return block_limit - abs_offset; 3529 } 3530 3531 if (st->frag_idx == 0 && !st->frag_data) 3532 st->stepped_offset += skb_headlen(st->cur_skb); 3533 3534 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 3535 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 3536 block_limit = skb_frag_size(frag) + st->stepped_offset; 3537 3538 if (abs_offset < block_limit) { 3539 if (!st->frag_data) 3540 st->frag_data = kmap_atomic(skb_frag_page(frag)); 3541 3542 *data = (u8 *) st->frag_data + skb_frag_off(frag) + 3543 (abs_offset - st->stepped_offset); 3544 3545 return block_limit - abs_offset; 3546 } 3547 3548 if (st->frag_data) { 3549 kunmap_atomic(st->frag_data); 3550 st->frag_data = NULL; 3551 } 3552 3553 st->frag_idx++; 3554 st->stepped_offset += skb_frag_size(frag); 3555 } 3556 3557 if (st->frag_data) { 3558 kunmap_atomic(st->frag_data); 3559 st->frag_data = NULL; 3560 } 3561 3562 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 3563 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 3564 st->frag_idx = 0; 3565 goto next_skb; 3566 } else if (st->cur_skb->next) { 3567 st->cur_skb = st->cur_skb->next; 3568 st->frag_idx = 0; 3569 goto next_skb; 3570 } 3571 3572 return 0; 3573} 3574EXPORT_SYMBOL(skb_seq_read); 3575 3576/** 3577 * skb_abort_seq_read - Abort a sequential read of skb data 3578 * @st: state variable 3579 * 3580 * Must be called if skb_seq_read() was not called until it 3581 * returned 0. 
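 *
 * Illustrative example (not part of the original source): a sequential
 * scan that may bail out early; found_marker() stands in for a
 * hypothetical caller-defined predicate:
 *
 *	struct skb_seq_state st;
 *	unsigned int consumed = 0, len;
 *	const u8 *data;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		if (found_marker(data, len)) {
 *			skb_abort_seq_read(&st);
 *			break;
 *		}
 *		consumed += len;
 *	}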
3582 */ 3583void skb_abort_seq_read(struct skb_seq_state *st) 3584{ 3585 if (st->frag_data) 3586 kunmap_atomic(st->frag_data); 3587} 3588EXPORT_SYMBOL(skb_abort_seq_read); 3589 3590#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 3591 3592static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 3593 struct ts_config *conf, 3594 struct ts_state *state) 3595{ 3596 return skb_seq_read(offset, text, TS_SKB_CB(state)); 3597} 3598 3599static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 3600{ 3601 skb_abort_seq_read(TS_SKB_CB(state)); 3602} 3603 3604/** 3605 * skb_find_text - Find a text pattern in skb data 3606 * @skb: the buffer to look in 3607 * @from: search offset 3608 * @to: search limit 3609 * @config: textsearch configuration 3610 * 3611 * Finds a pattern in the skb data according to the specified 3612 * textsearch configuration. Use textsearch_next() to retrieve 3613 * subsequent occurrences of the pattern. Returns the offset 3614 * to the first occurrence or UINT_MAX if no match was found. 3615 */ 3616unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 3617 unsigned int to, struct ts_config *config) 3618{ 3619 struct ts_state state; 3620 unsigned int ret; 3621 3622 config->get_next_block = skb_ts_get_next_block; 3623 config->finish = skb_ts_finish; 3624 3625 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); 3626 3627 ret = textsearch_find(config, &state); 3628 return (ret <= to - from ? ret : UINT_MAX); 3629} 3630EXPORT_SYMBOL(skb_find_text); 3631 3632int skb_append_pagefrags(struct sk_buff *skb, struct page *page, 3633 int offset, size_t size) 3634{ 3635 int i = skb_shinfo(skb)->nr_frags; 3636 3637 if (skb_can_coalesce(skb, i, page, offset)) { 3638 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 3639 } else if (i < MAX_SKB_FRAGS) { 3640 get_page(page); 3641 skb_fill_page_desc(skb, i, page, offset, size); 3642 } else { 3643 return -EMSGSIZE; 3644 } 3645 3646 return 0; 3647} 3648EXPORT_SYMBOL_GPL(skb_append_pagefrags); 3649 3650/** 3651 * skb_pull_rcsum - pull skb and update receive checksum 3652 * @skb: buffer to update 3653 * @len: length of data pulled 3654 * 3655 * This function performs an skb_pull on the packet and updates 3656 * the CHECKSUM_COMPLETE checksum. It should be used on 3657 * receive path processing instead of skb_pull unless you know 3658 * that the checksum difference is zero (e.g., a valid IP header) 3659 * or you are setting ip_summed to CHECKSUM_NONE. 
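 *
 * Illustrative example (not part of the original source): stripping an
 * outer header of hdr_len bytes (a caller-supplied length) on receive
 * while keeping a CHECKSUM_COMPLETE value consistent:
 *
 *	if (!pskb_may_pull(skb, hdr_len))
 *		goto drop;
 *	skb_pull_rcsum(skb, hdr_len);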
3660 */ 3661void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 3662{ 3663 unsigned char *data = skb->data; 3664 3665 BUG_ON(len > skb->len); 3666 __skb_pull(skb, len); 3667 skb_postpull_rcsum(skb, data, len); 3668 return skb->data; 3669} 3670EXPORT_SYMBOL_GPL(skb_pull_rcsum); 3671 3672static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) 3673{ 3674 skb_frag_t head_frag; 3675 struct page *page; 3676 3677 page = virt_to_head_page(frag_skb->head); 3678 __skb_frag_set_page(&head_frag, page); 3679 skb_frag_off_set(&head_frag, frag_skb->data - 3680 (unsigned char *)page_address(page)); 3681 skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); 3682 return head_frag; 3683} 3684 3685struct sk_buff *skb_segment_list(struct sk_buff *skb, 3686 netdev_features_t features, 3687 unsigned int offset) 3688{ 3689 struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; 3690 unsigned int tnl_hlen = skb_tnl_header_len(skb); 3691 unsigned int delta_truesize = 0; 3692 unsigned int delta_len = 0; 3693 struct sk_buff *tail = NULL; 3694 struct sk_buff *nskb, *tmp; 3695 int err; 3696 3697 skb_push(skb, -skb_network_offset(skb) + offset); 3698 3699 /* Ensure the head is writeable before touching the shared info */ 3700 err = skb_unclone(skb, GFP_ATOMIC); 3701 if (err) 3702 goto err_linearize; 3703 3704 skb_shinfo(skb)->frag_list = NULL; 3705 3706 while (list_skb) { 3707 nskb = list_skb; 3708 list_skb = list_skb->next; 3709 3710 err = 0; 3711 delta_truesize += nskb->truesize; 3712 if (skb_shared(nskb)) { 3713 tmp = skb_clone(nskb, GFP_ATOMIC); 3714 if (tmp) { 3715 consume_skb(nskb); 3716 nskb = tmp; 3717 err = skb_unclone(nskb, GFP_ATOMIC); 3718 } else { 3719 err = -ENOMEM; 3720 } 3721 } 3722 3723 if (!tail) 3724 skb->next = nskb; 3725 else 3726 tail->next = nskb; 3727 3728 if (unlikely(err)) { 3729 nskb->next = list_skb; 3730 goto err_linearize; 3731 } 3732 3733 tail = nskb; 3734 3735 delta_len += nskb->len; 3736 3737 skb_push(nskb, -skb_network_offset(nskb) + offset); 3738 3739 skb_release_head_state(nskb); 3740 __copy_skb_header(nskb, skb); 3741 3742 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); 3743 skb_copy_from_linear_data_offset(skb, -tnl_hlen, 3744 nskb->data - tnl_hlen, 3745 offset + tnl_hlen); 3746 3747 if (skb_needs_linearize(nskb, features) && 3748 __skb_linearize(nskb)) 3749 goto err_linearize; 3750 } 3751 3752 skb->truesize = skb->truesize - delta_truesize; 3753 skb->data_len = skb->data_len - delta_len; 3754 skb->len = skb->len - delta_len; 3755 3756 skb_gso_reset(skb); 3757 3758 skb->prev = tail; 3759 3760 if (skb_needs_linearize(skb, features) && 3761 __skb_linearize(skb)) 3762 goto err_linearize; 3763 3764 skb_get(skb); 3765 3766 return skb; 3767 3768err_linearize: 3769 kfree_skb_list(skb->next); 3770 skb->next = NULL; 3771 return ERR_PTR(-ENOMEM); 3772} 3773EXPORT_SYMBOL_GPL(skb_segment_list); 3774 3775int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) 3776{ 3777 if (unlikely(p->len + skb->len >= 65536)) 3778 return -E2BIG; 3779 3780 if (NAPI_GRO_CB(p)->last == p) 3781 skb_shinfo(p)->frag_list = skb; 3782 else 3783 NAPI_GRO_CB(p)->last->next = skb; 3784 3785 skb_pull(skb, skb_gro_offset(skb)); 3786 3787 NAPI_GRO_CB(p)->last = skb; 3788 NAPI_GRO_CB(p)->count++; 3789 p->data_len += skb->len; 3790 p->truesize += skb->truesize; 3791 p->len += skb->len; 3792 3793 NAPI_GRO_CB(skb)->same_flow = 1; 3794 3795 return 0; 3796} 3797 3798/** 3799 * skb_segment - Perform protocol segmentation on skb. 
3800 * @head_skb: buffer to segment 3801 * @features: features for the output path (see dev->features) 3802 * 3803 * This function performs segmentation on the given skb. It returns 3804 * a pointer to the first in a list of new skbs for the segments. 3805 * In case of error it returns ERR_PTR(err). 3806 */ 3807struct sk_buff *skb_segment(struct sk_buff *head_skb, 3808 netdev_features_t features) 3809{ 3810 struct sk_buff *segs = NULL; 3811 struct sk_buff *tail = NULL; 3812 struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; 3813 unsigned int mss = skb_shinfo(head_skb)->gso_size; 3814 unsigned int doffset = head_skb->data - skb_mac_header(head_skb); 3815 unsigned int offset = doffset; 3816 unsigned int tnl_hlen = skb_tnl_header_len(head_skb); 3817 unsigned int partial_segs = 0; 3818 unsigned int headroom; 3819 unsigned int len = head_skb->len; 3820 struct sk_buff *frag_skb; 3821 skb_frag_t *frag; 3822 __be16 proto; 3823 bool csum, sg; 3824 int err = -ENOMEM; 3825 int i = 0; 3826 int nfrags, pos; 3827 3828 if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && 3829 mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { 3830 struct sk_buff *check_skb; 3831 3832 for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { 3833 if (skb_headlen(check_skb) && !check_skb->head_frag) { 3834 /* gso_size is untrusted, and we have a frag_list with 3835 * a linear non head_frag item. 3836 * 3837 * If head_skb's headlen does not fit requested gso_size, 3838 * it means that the frag_list members do NOT terminate 3839 * on exact gso_size boundaries. Hence we cannot perform 3840 * skb_frag_t page sharing. Therefore we must fallback to 3841 * copying the frag_list skbs; we do so by disabling SG. 3842 */ 3843 features &= ~NETIF_F_SG; 3844 break; 3845 } 3846 } 3847 } 3848 3849 __skb_push(head_skb, doffset); 3850 proto = skb_network_protocol(head_skb, NULL); 3851 if (unlikely(!proto)) 3852 return ERR_PTR(-EINVAL); 3853 3854 sg = !!(features & NETIF_F_SG); 3855 csum = !!can_checksum_protocol(features, proto); 3856 3857 if (sg && csum && (mss != GSO_BY_FRAGS)) { 3858 if (!(features & NETIF_F_GSO_PARTIAL)) { 3859 struct sk_buff *iter; 3860 unsigned int frag_len; 3861 3862 if (!list_skb || 3863 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 3864 goto normal; 3865 3866 /* If we get here then all the required 3867 * GSO features except frag_list are supported. 3868 * Try to split the SKB to multiple GSO SKBs 3869 * with no frag_list. 3870 * Currently we can do that only when the buffers don't 3871 * have a linear part and all the buffers except 3872 * the last are of the same length. 3873 */ 3874 frag_len = list_skb->len; 3875 skb_walk_frags(head_skb, iter) { 3876 if (frag_len != iter->len && iter->next) 3877 goto normal; 3878 if (skb_headlen(iter) && !iter->head_frag) 3879 goto normal; 3880 3881 len -= iter->len; 3882 } 3883 3884 if (len != frag_len) 3885 goto normal; 3886 } 3887 3888 /* GSO partial only requires that we trim off any excess that 3889 * doesn't fit into an MSS sized block, so take care of that 3890 * now. 3891 * Cap len to not accidentally hit GSO_BY_FRAGS. 
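	 * The computation below counts how many full MSS-sized blocks fit
	 * into len; if more than one fits, mss is scaled up so a single
	 * "partial" super-segment covers them all, otherwise partial_segs
	 * stays 0 and the normal one-MSS-per-segment path is taken.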
3892 */ 3893 partial_segs = min(len, GSO_BY_FRAGS - 1U) / mss; 3894 if (partial_segs > 1) 3895 mss *= partial_segs; 3896 else 3897 partial_segs = 0; 3898 } 3899 3900normal: 3901 headroom = skb_headroom(head_skb); 3902 pos = skb_headlen(head_skb); 3903 3904 if (skb_orphan_frags(head_skb, GFP_ATOMIC)) 3905 return ERR_PTR(-ENOMEM); 3906 3907 nfrags = skb_shinfo(head_skb)->nr_frags; 3908 frag = skb_shinfo(head_skb)->frags; 3909 frag_skb = head_skb; 3910 3911 do { 3912 struct sk_buff *nskb; 3913 skb_frag_t *nskb_frag; 3914 int hsize; 3915 int size; 3916 3917 if (unlikely(mss == GSO_BY_FRAGS)) { 3918 len = list_skb->len; 3919 } else { 3920 len = head_skb->len - offset; 3921 if (len > mss) 3922 len = mss; 3923 } 3924 3925 hsize = skb_headlen(head_skb) - offset; 3926 if (hsize < 0) 3927 hsize = 0; 3928 if (hsize > len || !sg) 3929 hsize = len; 3930 3931 if (!hsize && i >= nfrags && skb_headlen(list_skb) && 3932 (skb_headlen(list_skb) == len || sg)) { 3933 BUG_ON(skb_headlen(list_skb) > len); 3934 3935 nskb = skb_clone(list_skb, GFP_ATOMIC); 3936 if (unlikely(!nskb)) 3937 goto err; 3938 3939 i = 0; 3940 nfrags = skb_shinfo(list_skb)->nr_frags; 3941 frag = skb_shinfo(list_skb)->frags; 3942 frag_skb = list_skb; 3943 pos += skb_headlen(list_skb); 3944 3945 while (pos < offset + len) { 3946 BUG_ON(i >= nfrags); 3947 3948 size = skb_frag_size(frag); 3949 if (pos + size > offset + len) 3950 break; 3951 3952 i++; 3953 pos += size; 3954 frag++; 3955 } 3956 3957 list_skb = list_skb->next; 3958 3959 if (unlikely(pskb_trim(nskb, len))) { 3960 kfree_skb(nskb); 3961 goto err; 3962 } 3963 3964 hsize = skb_end_offset(nskb); 3965 if (skb_cow_head(nskb, doffset + headroom)) { 3966 kfree_skb(nskb); 3967 goto err; 3968 } 3969 3970 nskb->truesize += skb_end_offset(nskb) - hsize; 3971 skb_release_head_state(nskb); 3972 __skb_push(nskb, doffset); 3973 } else { 3974 nskb = __alloc_skb(hsize + doffset + headroom, 3975 GFP_ATOMIC, skb_alloc_rx_flag(head_skb), 3976 NUMA_NO_NODE); 3977 3978 if (unlikely(!nskb)) 3979 goto err; 3980 3981 skb_reserve(nskb, headroom); 3982 __skb_put(nskb, doffset); 3983 } 3984 3985 if (segs) 3986 tail->next = nskb; 3987 else 3988 segs = nskb; 3989 tail = nskb; 3990 3991 __copy_skb_header(nskb, head_skb); 3992 3993 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); 3994 skb_reset_mac_len(nskb); 3995 3996 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, 3997 nskb->data - tnl_hlen, 3998 doffset + tnl_hlen); 3999 4000 if (nskb->len == len + doffset) 4001 goto perform_csum_check; 4002 4003 if (!sg) { 4004 if (!csum) { 4005 if (!nskb->remcsum_offload) 4006 nskb->ip_summed = CHECKSUM_NONE; 4007 SKB_GSO_CB(nskb)->csum = 4008 skb_copy_and_csum_bits(head_skb, offset, 4009 skb_put(nskb, 4010 len), 4011 len); 4012 SKB_GSO_CB(nskb)->csum_start = 4013 skb_headroom(nskb) + doffset; 4014 } else { 4015 if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) 4016 goto err; 4017 } 4018 continue; 4019 } 4020 4021 nskb_frag = skb_shinfo(nskb)->frags; 4022 4023 skb_copy_from_linear_data_offset(head_skb, offset, 4024 skb_put(nskb, hsize), hsize); 4025 4026 skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & 4027 SKBTX_SHARED_FRAG; 4028 4029 if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) 4030 goto err; 4031 4032 while (pos < offset + len) { 4033 if (i >= nfrags) { 4034 if (skb_orphan_frags(list_skb, GFP_ATOMIC) || 4035 skb_zerocopy_clone(nskb, list_skb, 4036 GFP_ATOMIC)) 4037 goto err; 4038 4039 i = 0; 4040 nfrags = skb_shinfo(list_skb)->nr_frags; 4041 frag = 
skb_shinfo(list_skb)->frags; 4042 frag_skb = list_skb; 4043 if (!skb_headlen(list_skb)) { 4044 BUG_ON(!nfrags); 4045 } else { 4046 BUG_ON(!list_skb->head_frag); 4047 4048 /* to make room for head_frag. */ 4049 i--; 4050 frag--; 4051 } 4052 4053 list_skb = list_skb->next; 4054 } 4055 4056 if (unlikely(skb_shinfo(nskb)->nr_frags >= 4057 MAX_SKB_FRAGS)) { 4058 net_warn_ratelimited( 4059 "skb_segment: too many frags: %u %u\n", 4060 pos, mss); 4061 err = -EINVAL; 4062 goto err; 4063 } 4064 4065 *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; 4066 __skb_frag_ref(nskb_frag); 4067 size = skb_frag_size(nskb_frag); 4068 4069 if (pos < offset) { 4070 skb_frag_off_add(nskb_frag, offset - pos); 4071 skb_frag_size_sub(nskb_frag, offset - pos); 4072 } 4073 4074 skb_shinfo(nskb)->nr_frags++; 4075 4076 if (pos + size <= offset + len) { 4077 i++; 4078 frag++; 4079 pos += size; 4080 } else { 4081 skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); 4082 goto skip_fraglist; 4083 } 4084 4085 nskb_frag++; 4086 } 4087 4088skip_fraglist: 4089 nskb->data_len = len - hsize; 4090 nskb->len += nskb->data_len; 4091 nskb->truesize += nskb->data_len; 4092 4093perform_csum_check: 4094 if (!csum) { 4095 if (skb_has_shared_frag(nskb) && 4096 __skb_linearize(nskb)) 4097 goto err; 4098 4099 if (!nskb->remcsum_offload) 4100 nskb->ip_summed = CHECKSUM_NONE; 4101 SKB_GSO_CB(nskb)->csum = 4102 skb_checksum(nskb, doffset, 4103 nskb->len - doffset, 0); 4104 SKB_GSO_CB(nskb)->csum_start = 4105 skb_headroom(nskb) + doffset; 4106 } 4107 } while ((offset += len) < head_skb->len); 4108 4109 /* Some callers want to get the end of the list. 4110 * Put it in segs->prev to avoid walking the list. 4111 * (see validate_xmit_skb_list() for example) 4112 */ 4113 segs->prev = tail; 4114 4115 if (partial_segs) { 4116 struct sk_buff *iter; 4117 int type = skb_shinfo(head_skb)->gso_type; 4118 unsigned short gso_size = skb_shinfo(head_skb)->gso_size; 4119 4120 /* Update type to add partial and then remove dodgy if set */ 4121 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; 4122 type &= ~SKB_GSO_DODGY; 4123 4124 /* Update GSO info and prepare to start updating headers on 4125 * our way back down the stack of protocols. 4126 */ 4127 for (iter = segs; iter; iter = iter->next) { 4128 skb_shinfo(iter)->gso_size = gso_size; 4129 skb_shinfo(iter)->gso_segs = partial_segs; 4130 skb_shinfo(iter)->gso_type = type; 4131 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; 4132 } 4133 4134 if (tail->len - doffset <= gso_size) 4135 skb_shinfo(tail)->gso_size = 0; 4136 else if (tail != segs) 4137 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); 4138 } 4139 4140 /* Following permits correct backpressure, for protocols 4141 * using skb_set_owner_w(). 4142 * Idea is to tranfert ownership from head_skb to last segment. 
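	 * In practice the swap below moves the truesize charge, the
	 * destructor and the socket reference onto the final segment, so
	 * the sender's wmem accounting is released only when that last
	 * segment is actually freed.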
4143 */ 4144 if (head_skb->destructor == sock_wfree) { 4145 swap(tail->truesize, head_skb->truesize); 4146 swap(tail->destructor, head_skb->destructor); 4147 swap(tail->sk, head_skb->sk); 4148 } 4149 return segs; 4150 4151err: 4152 kfree_skb_list(segs); 4153 return ERR_PTR(err); 4154} 4155EXPORT_SYMBOL_GPL(skb_segment); 4156 4157int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) 4158{ 4159 struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); 4160 unsigned int offset = skb_gro_offset(skb); 4161 unsigned int headlen = skb_headlen(skb); 4162 unsigned int len = skb_gro_len(skb); 4163 unsigned int delta_truesize; 4164 struct sk_buff *lp; 4165 4166 if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) 4167 return -E2BIG; 4168 4169 lp = NAPI_GRO_CB(p)->last; 4170 pinfo = skb_shinfo(lp); 4171 4172 if (headlen <= offset) { 4173 skb_frag_t *frag; 4174 skb_frag_t *frag2; 4175 int i = skbinfo->nr_frags; 4176 int nr_frags = pinfo->nr_frags + i; 4177 4178 if (nr_frags > MAX_SKB_FRAGS) 4179 goto merge; 4180 4181 offset -= headlen; 4182 pinfo->nr_frags = nr_frags; 4183 skbinfo->nr_frags = 0; 4184 4185 frag = pinfo->frags + nr_frags; 4186 frag2 = skbinfo->frags + i; 4187 do { 4188 *--frag = *--frag2; 4189 } while (--i); 4190 4191 skb_frag_off_add(frag, offset); 4192 skb_frag_size_sub(frag, offset); 4193 4194 /* all fragments truesize : remove (head size + sk_buff) */ 4195 delta_truesize = skb->truesize - 4196 SKB_TRUESIZE(skb_end_offset(skb)); 4197 4198 skb->truesize -= skb->data_len; 4199 skb->len -= skb->data_len; 4200 skb->data_len = 0; 4201 4202 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; 4203 goto done; 4204 } else if (skb->head_frag) { 4205 int nr_frags = pinfo->nr_frags; 4206 skb_frag_t *frag = pinfo->frags + nr_frags; 4207 struct page *page = virt_to_head_page(skb->head); 4208 unsigned int first_size = headlen - offset; 4209 unsigned int first_offset; 4210 4211 if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) 4212 goto merge; 4213 4214 first_offset = skb->data - 4215 (unsigned char *)page_address(page) + 4216 offset; 4217 4218 pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; 4219 4220 __skb_frag_set_page(frag, page); 4221 skb_frag_off_set(frag, first_offset); 4222 skb_frag_size_set(frag, first_size); 4223 4224 memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); 4225 /* We dont need to clear skbinfo->nr_frags here */ 4226 4227 delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 4228 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; 4229 goto done; 4230 } 4231 4232merge: 4233 delta_truesize = skb->truesize; 4234 if (offset > headlen) { 4235 unsigned int eat = offset - headlen; 4236 4237 skb_frag_off_add(&skbinfo->frags[0], eat); 4238 skb_frag_size_sub(&skbinfo->frags[0], eat); 4239 skb->data_len -= eat; 4240 skb->len -= eat; 4241 offset = headlen; 4242 } 4243 4244 __skb_pull(skb, offset); 4245 4246 if (NAPI_GRO_CB(p)->last == p) 4247 skb_shinfo(p)->frag_list = skb; 4248 else 4249 NAPI_GRO_CB(p)->last->next = skb; 4250 NAPI_GRO_CB(p)->last = skb; 4251 __skb_header_release(skb); 4252 lp = p; 4253 4254done: 4255 NAPI_GRO_CB(p)->count++; 4256 p->data_len += len; 4257 p->truesize += delta_truesize; 4258 p->len += len; 4259 if (lp != p) { 4260 lp->data_len += len; 4261 lp->truesize += delta_truesize; 4262 lp->len += len; 4263 } 4264 NAPI_GRO_CB(skb)->same_flow = 1; 4265 return 0; 4266} 4267 4268#ifdef CONFIG_SKB_EXTENSIONS 4269#define SKB_EXT_ALIGN_VALUE 8 4270#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / 
SKB_EXT_ALIGN_VALUE) 4271 4272static const u8 skb_ext_type_len[] = { 4273#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4274 [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), 4275#endif 4276#ifdef CONFIG_XFRM 4277 [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), 4278#endif 4279#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4280 [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), 4281#endif 4282#if IS_ENABLED(CONFIG_MPTCP) 4283 [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), 4284#endif 4285}; 4286 4287static __always_inline unsigned int skb_ext_total_length(void) 4288{ 4289 return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + 4290#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4291 skb_ext_type_len[SKB_EXT_BRIDGE_NF] + 4292#endif 4293#ifdef CONFIG_XFRM 4294 skb_ext_type_len[SKB_EXT_SEC_PATH] + 4295#endif 4296#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4297 skb_ext_type_len[TC_SKB_EXT] + 4298#endif 4299#if IS_ENABLED(CONFIG_MPTCP) 4300 skb_ext_type_len[SKB_EXT_MPTCP] + 4301#endif 4302 0; 4303} 4304 4305static void skb_extensions_init(void) 4306{ 4307 BUILD_BUG_ON(SKB_EXT_NUM >= 8); 4308 BUILD_BUG_ON(skb_ext_total_length() > 255); 4309 4310 skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", 4311 SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 4312 0, 4313 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4314 NULL); 4315} 4316#else 4317static void skb_extensions_init(void) {} 4318#endif 4319 4320void __init skb_init(void) 4321{ 4322 skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", 4323 sizeof(struct sk_buff), 4324 0, 4325 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4326 offsetof(struct sk_buff, cb), 4327 sizeof_field(struct sk_buff, cb), 4328 NULL); 4329 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 4330 sizeof(struct sk_buff_fclones), 4331 0, 4332 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4333 NULL); 4334 skb_extensions_init(); 4335} 4336 4337static int 4338__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, 4339 unsigned int recursion_level) 4340{ 4341 int start = skb_headlen(skb); 4342 int i, copy = start - offset; 4343 struct sk_buff *frag_iter; 4344 int elt = 0; 4345 4346 if (unlikely(recursion_level >= 24)) 4347 return -EMSGSIZE; 4348 4349 if (copy > 0) { 4350 if (copy > len) 4351 copy = len; 4352 sg_set_buf(sg, skb->data + offset, copy); 4353 elt++; 4354 if ((len -= copy) == 0) 4355 return elt; 4356 offset += copy; 4357 } 4358 4359 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 4360 int end; 4361 4362 WARN_ON(start > offset + len); 4363 4364 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 4365 if ((copy = end - offset) > 0) { 4366 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 4367 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4368 return -EMSGSIZE; 4369 4370 if (copy > len) 4371 copy = len; 4372 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 4373 skb_frag_off(frag) + offset - start); 4374 elt++; 4375 if (!(len -= copy)) 4376 return elt; 4377 offset += copy; 4378 } 4379 start = end; 4380 } 4381 4382 skb_walk_frags(skb, frag_iter) { 4383 int end, ret; 4384 4385 WARN_ON(start > offset + len); 4386 4387 end = start + frag_iter->len; 4388 if ((copy = end - offset) > 0) { 4389 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4390 return -EMSGSIZE; 4391 4392 if (copy > len) 4393 copy = len; 4394 ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, 4395 copy, recursion_level + 1); 4396 if (unlikely(ret < 0)) 4397 return ret; 4398 elt += ret; 4399 if ((len -= copy) == 0) 4400 return elt; 4401 offset += copy; 4402 } 4403 start = end; 4404 } 4405 
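	/* A non-zero remainder here means the caller asked for a range that
	 * extends beyond the data actually present in the skb and its frag_list.
	 */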
BUG_ON(len); 4406 return elt; 4407} 4408 4409/** 4410 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer 4411 * @skb: Socket buffer containing the buffers to be mapped 4412 * @sg: The scatter-gather list to map into 4413 * @offset: The offset into the buffer's contents to start mapping 4414 * @len: Length of buffer space to be mapped 4415 * 4416 * Fill the specified scatter-gather list with mappings/pointers into a 4417 * region of the buffer space attached to a socket buffer. Returns either 4418 * the number of scatterlist items used, or -EMSGSIZE if the contents 4419 * could not fit. 4420 */ 4421int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 4422{ 4423 int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); 4424 4425 if (nsg <= 0) 4426 return nsg; 4427 4428 sg_mark_end(&sg[nsg - 1]); 4429 4430 return nsg; 4431} 4432EXPORT_SYMBOL_GPL(skb_to_sgvec); 4433 4434/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the 4435 * given sglist without marking the sg entry that contains the last skb data 4436 * as the end. So the caller can manipulate the sg list as needed when appending 4437 * new data after the first call, without calling sg_unmark_end to expand the sg list. 4438 * 4439 * Scenario to use skb_to_sgvec_nomark: 4440 * 1. sg_init_table 4441 * 2. skb_to_sgvec_nomark(payload1) 4442 * 3. skb_to_sgvec_nomark(payload2) 4443 * 4444 * This is equivalent to: 4445 * 1. sg_init_table 4446 * 2. skb_to_sgvec(payload1) 4447 * 3. sg_unmark_end 4448 * 4. skb_to_sgvec(payload2) 4449 * 4450 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark 4451 * is preferable. 4452 */ 4453int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, 4454 int offset, int len) 4455{ 4456 return __skb_to_sgvec(skb, sg, offset, len, 0); 4457} 4458EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); 4459 4460 4461 4462/** 4463 * skb_cow_data - Check that a socket buffer's data buffers are writable 4464 * @skb: The socket buffer to check. 4465 * @tailbits: Amount of trailing space to be added 4466 * @trailer: Returned pointer to the skb where the @tailbits space begins 4467 * 4468 * Make sure that the data buffers attached to a socket buffer are 4469 * writable. If they are not, private copies are made of the data buffers 4470 * and the socket buffer is set to use these instead. 4471 * 4472 * If @tailbits is given, make sure that there is space to write @tailbits 4473 * bytes of data beyond the current end of the socket buffer. @trailer will be 4474 * set to point to the skb in which this space begins. 4475 * 4476 * The number of scatterlist elements required to completely map the 4477 * COW'd and extended socket buffer will be returned. 4478 */ 4479int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 4480{ 4481 int copyflag; 4482 int elt; 4483 struct sk_buff *skb1, **skb_p; 4484 4485 /* If skb is cloned or its head is paged, reallocate 4486 * head pulling out all the pages (pages are considered not writable 4487 * at the moment even if they are anonymous). 4488 */ 4489 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 4490 !__pskb_pull_tail(skb, __skb_pagelen(skb))) 4491 return -ENOMEM; 4492 4493 /* Easy case. Most packets will go this way. */ 4494 if (!skb_has_frag_list(skb)) { 4495 /* A bit of trouble: not enough space for the trailer. 4496 * This should not happen when the stack is tuned to generate 4497 * good frames. OK, on miss we reallocate and reserve even more 4498 * space, 128 bytes is fair. 
*/ 4499 4500 if (skb_tailroom(skb) < tailbits && 4501 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 4502 return -ENOMEM; 4503 4504 /* Voila! */ 4505 *trailer = skb; 4506 return 1; 4507 } 4508 4509 /* Misery. We are in troubles, going to mincer fragments... */ 4510 4511 elt = 1; 4512 skb_p = &skb_shinfo(skb)->frag_list; 4513 copyflag = 0; 4514 4515 while ((skb1 = *skb_p) != NULL) { 4516 int ntail = 0; 4517 4518 /* The fragment is partially pulled by someone, 4519 * this can happen on input. Copy it and everything 4520 * after it. */ 4521 4522 if (skb_shared(skb1)) 4523 copyflag = 1; 4524 4525 /* If the skb is the last, worry about trailer. */ 4526 4527 if (skb1->next == NULL && tailbits) { 4528 if (skb_shinfo(skb1)->nr_frags || 4529 skb_has_frag_list(skb1) || 4530 skb_tailroom(skb1) < tailbits) 4531 ntail = tailbits + 128; 4532 } 4533 4534 if (copyflag || 4535 skb_cloned(skb1) || 4536 ntail || 4537 skb_shinfo(skb1)->nr_frags || 4538 skb_has_frag_list(skb1)) { 4539 struct sk_buff *skb2; 4540 4541 /* Fuck, we are miserable poor guys... */ 4542 if (ntail == 0) 4543 skb2 = skb_copy(skb1, GFP_ATOMIC); 4544 else 4545 skb2 = skb_copy_expand(skb1, 4546 skb_headroom(skb1), 4547 ntail, 4548 GFP_ATOMIC); 4549 if (unlikely(skb2 == NULL)) 4550 return -ENOMEM; 4551 4552 if (skb1->sk) 4553 skb_set_owner_w(skb2, skb1->sk); 4554 4555 /* Looking around. Are we still alive? 4556 * OK, link new skb, drop old one */ 4557 4558 skb2->next = skb1->next; 4559 *skb_p = skb2; 4560 kfree_skb(skb1); 4561 skb1 = skb2; 4562 } 4563 elt++; 4564 *trailer = skb1; 4565 skb_p = &skb1->next; 4566 } 4567 4568 return elt; 4569} 4570EXPORT_SYMBOL_GPL(skb_cow_data); 4571 4572static void sock_rmem_free(struct sk_buff *skb) 4573{ 4574 struct sock *sk = skb->sk; 4575 4576 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 4577} 4578 4579static void skb_set_err_queue(struct sk_buff *skb) 4580{ 4581 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. 4582 * So, it is safe to (mis)use it to mark skbs on the error queue. 
4583 */ 4584 skb->pkt_type = PACKET_OUTGOING; 4585 BUILD_BUG_ON(PACKET_OUTGOING == 0); 4586} 4587 4588/* 4589 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 4590 */ 4591int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 4592{ 4593 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 4594 (unsigned int)READ_ONCE(sk->sk_rcvbuf)) 4595 return -ENOMEM; 4596 4597 skb_orphan(skb); 4598 skb->sk = sk; 4599 skb->destructor = sock_rmem_free; 4600 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 4601 skb_set_err_queue(skb); 4602 4603 /* before exiting rcu section, make sure dst is refcounted */ 4604 skb_dst_force(skb); 4605 4606 skb_queue_tail(&sk->sk_error_queue, skb); 4607 if (!sock_flag(sk, SOCK_DEAD)) 4608 sk->sk_error_report(sk); 4609 return 0; 4610} 4611EXPORT_SYMBOL(sock_queue_err_skb); 4612 4613static bool is_icmp_err_skb(const struct sk_buff *skb) 4614{ 4615 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || 4616 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); 4617} 4618 4619struct sk_buff *sock_dequeue_err_skb(struct sock *sk) 4620{ 4621 struct sk_buff_head *q = &sk->sk_error_queue; 4622 struct sk_buff *skb, *skb_next = NULL; 4623 bool icmp_next = false; 4624 unsigned long flags; 4625 4626 spin_lock_irqsave(&q->lock, flags); 4627 skb = __skb_dequeue(q); 4628 if (skb && (skb_next = skb_peek(q))) { 4629 icmp_next = is_icmp_err_skb(skb_next); 4630 if (icmp_next) 4631 sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; 4632 } 4633 spin_unlock_irqrestore(&q->lock, flags); 4634 4635 if (is_icmp_err_skb(skb) && !icmp_next) 4636 sk->sk_err = 0; 4637 4638 if (skb_next) 4639 sk->sk_error_report(sk); 4640 4641 return skb; 4642} 4643EXPORT_SYMBOL(sock_dequeue_err_skb); 4644 4645/** 4646 * skb_clone_sk - create clone of skb, and take reference to socket 4647 * @skb: the skb to clone 4648 * 4649 * This function creates a clone of a buffer that holds a reference on 4650 * sk_refcnt. Buffers created via this function are meant to be 4651 * returned using sock_queue_err_skb, or free via kfree_skb. 4652 * 4653 * When passing buffers allocated with this function to sock_queue_err_skb 4654 * it is necessary to wrap the call with sock_hold/sock_put in order to 4655 * prevent the socket from being released prior to being enqueued on 4656 * the sk_error_queue. 4657 */ 4658struct sk_buff *skb_clone_sk(struct sk_buff *skb) 4659{ 4660 struct sock *sk = skb->sk; 4661 struct sk_buff *clone; 4662 4663 if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) 4664 return NULL; 4665 4666 clone = skb_clone(skb, GFP_ATOMIC); 4667 if (!clone) { 4668 sock_put(sk); 4669 return NULL; 4670 } 4671 4672 clone->sk = sk; 4673 clone->destructor = sock_efree; 4674 4675 return clone; 4676} 4677EXPORT_SYMBOL(skb_clone_sk); 4678 4679static void __skb_complete_tx_timestamp(struct sk_buff *skb, 4680 struct sock *sk, 4681 int tstype, 4682 bool opt_stats) 4683{ 4684 struct sock_exterr_skb *serr; 4685 int err; 4686 4687 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); 4688 4689 serr = SKB_EXT_ERR(skb); 4690 memset(serr, 0, sizeof(*serr)); 4691 serr->ee.ee_errno = ENOMSG; 4692 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 4693 serr->ee.ee_info = tstype; 4694 serr->opt_stats = opt_stats; 4695 serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; 4696 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { 4697 serr->ee.ee_data = skb_shinfo(skb)->tskey; 4698 if (sk->sk_protocol == IPPROTO_TCP && 4699 sk->sk_type == SOCK_STREAM) 4700 serr->ee.ee_data -= sk->sk_tskey; 4701 } 4702 4703 err = sock_queue_err_skb(sk, skb); 4704 4705 if (err) 4706 kfree_skb(skb); 4707} 4708 4709static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) 4710{ 4711 bool ret; 4712 4713 if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) 4714 return true; 4715 4716 read_lock_bh(&sk->sk_callback_lock); 4717 ret = sk->sk_socket && sk->sk_socket->file && 4718 file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); 4719 read_unlock_bh(&sk->sk_callback_lock); 4720 return ret; 4721} 4722 4723void skb_complete_tx_timestamp(struct sk_buff *skb, 4724 struct skb_shared_hwtstamps *hwtstamps) 4725{ 4726 struct sock *sk = skb->sk; 4727 4728 if (!skb_may_tx_timestamp(sk, false)) 4729 goto err; 4730 4731 /* Take a reference to prevent skb_orphan() from freeing the socket, 4732 * but only if the socket refcount is not zero. 4733 */ 4734 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 4735 *skb_hwtstamps(skb) = *hwtstamps; 4736 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); 4737 sock_put(sk); 4738 return; 4739 } 4740 4741err: 4742 kfree_skb(skb); 4743} 4744EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 4745 4746void __skb_tstamp_tx(struct sk_buff *orig_skb, 4747 struct skb_shared_hwtstamps *hwtstamps, 4748 struct sock *sk, int tstype) 4749{ 4750 struct sk_buff *skb; 4751 bool tsonly, opt_stats = false; 4752 4753 if (!sk) 4754 return; 4755 4756 if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && 4757 skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) 4758 return; 4759 4760 tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; 4761 if (!skb_may_tx_timestamp(sk, tsonly)) 4762 return; 4763 4764 if (tsonly) { 4765#ifdef CONFIG_INET 4766 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && 4767 sk->sk_protocol == IPPROTO_TCP && 4768 sk->sk_type == SOCK_STREAM) { 4769 skb = tcp_get_timestamping_opt_stats(sk, orig_skb); 4770 opt_stats = true; 4771 } else 4772#endif 4773 skb = alloc_skb(0, GFP_ATOMIC); 4774 } else { 4775 skb = skb_clone(orig_skb, GFP_ATOMIC); 4776 4777 if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { 4778 kfree_skb(skb); 4779 return; 4780 } 4781 } 4782 if (!skb) 4783 return; 4784 4785 if (tsonly) { 4786 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & 4787 SKBTX_ANY_TSTAMP; 4788 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; 4789 } 4790 4791 if (hwtstamps) 4792 *skb_hwtstamps(skb) = *hwtstamps; 4793 else 4794 skb->tstamp = ktime_get_real(); 4795 4796 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); 4797} 4798EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 4799 4800void skb_tstamp_tx(struct sk_buff *orig_skb, 4801 struct skb_shared_hwtstamps *hwtstamps) 4802{ 4803 return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, 4804 SCM_TSTAMP_SND); 4805} 4806EXPORT_SYMBOL_GPL(skb_tstamp_tx); 4807 4808void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 4809{ 4810 struct sock *sk = skb->sk; 4811 struct sock_exterr_skb *serr; 4812 int err = 1; 4813 4814 skb->wifi_acked_valid = 1; 4815 skb->wifi_acked = acked; 4816 4817 serr = SKB_EXT_ERR(skb); 4818 memset(serr, 0, sizeof(*serr)); 4819 serr->ee.ee_errno = ENOMSG; 4820 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 4821 4822 /* Take a reference to prevent skb_orphan() from freeing the socket, 4823 * but only if the socket refcount is not 
zero. 4824 */ 4825 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 4826 err = sock_queue_err_skb(sk, skb); 4827 sock_put(sk); 4828 } 4829 if (err) 4830 kfree_skb(skb); 4831} 4832EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 4833 4834/** 4835 * skb_partial_csum_set - set up and verify partial csum values for packet 4836 * @skb: the skb to set 4837 * @start: the number of bytes after skb->data to start checksumming. 4838 * @off: the offset from start to place the checksum. 4839 * 4840 * For untrusted partially-checksummed packets, we need to make sure the values 4841 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 4842 * 4843 * This function checks and sets those values and skb->ip_summed: if this 4844 * returns false you should drop the packet. 4845 */ 4846bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 4847{ 4848 u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); 4849 u32 csum_start = skb_headroom(skb) + (u32)start; 4850 4851 if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { 4852 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", 4853 start, off, skb_headroom(skb), skb_headlen(skb)); 4854 return false; 4855 } 4856 skb->ip_summed = CHECKSUM_PARTIAL; 4857 skb->csum_start = csum_start; 4858 skb->csum_offset = off; 4859 skb_set_transport_header(skb, start); 4860 return true; 4861} 4862EXPORT_SYMBOL_GPL(skb_partial_csum_set); 4863 4864static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, 4865 unsigned int max) 4866{ 4867 if (skb_headlen(skb) >= len) 4868 return 0; 4869 4870 /* If we need to pullup then pullup to the max, so we 4871 * won't need to do it again. 4872 */ 4873 if (max > skb->len) 4874 max = skb->len; 4875 4876 if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) 4877 return -ENOMEM; 4878 4879 if (skb_headlen(skb) < len) 4880 return -EPROTO; 4881 4882 return 0; 4883} 4884 4885#define MAX_TCP_HDR_LEN (15 * 4) 4886 4887static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, 4888 typeof(IPPROTO_IP) proto, 4889 unsigned int off) 4890{ 4891 int err; 4892 4893 switch (proto) { 4894 case IPPROTO_TCP: 4895 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), 4896 off + MAX_TCP_HDR_LEN); 4897 if (!err && !skb_partial_csum_set(skb, off, 4898 offsetof(struct tcphdr, 4899 check))) 4900 err = -EPROTO; 4901 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; 4902 4903 case IPPROTO_UDP: 4904 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), 4905 off + sizeof(struct udphdr)); 4906 if (!err && !skb_partial_csum_set(skb, off, 4907 offsetof(struct udphdr, 4908 check))) 4909 err = -EPROTO; 4910 return err ? ERR_PTR(err) : &udp_hdr(skb)->check; 4911 } 4912 4913 return ERR_PTR(-EPROTO); 4914} 4915 4916/* This value should be large enough to cover a tagged ethernet header plus 4917 * maximally sized IP and TCP or UDP headers. 
4918 */ 4919#define MAX_IP_HDR_LEN 128 4920 4921static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) 4922{ 4923 unsigned int off; 4924 bool fragment; 4925 __sum16 *csum; 4926 int err; 4927 4928 fragment = false; 4929 4930 err = skb_maybe_pull_tail(skb, 4931 sizeof(struct iphdr), 4932 MAX_IP_HDR_LEN); 4933 if (err < 0) 4934 goto out; 4935 4936 if (ip_is_fragment(ip_hdr(skb))) 4937 fragment = true; 4938 4939 off = ip_hdrlen(skb); 4940 4941 err = -EPROTO; 4942 4943 if (fragment) 4944 goto out; 4945 4946 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); 4947 if (IS_ERR(csum)) 4948 return PTR_ERR(csum); 4949 4950 if (recalculate) 4951 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, 4952 ip_hdr(skb)->daddr, 4953 skb->len - off, 4954 ip_hdr(skb)->protocol, 0); 4955 err = 0; 4956 4957out: 4958 return err; 4959} 4960 4961/* This value should be large enough to cover a tagged ethernet header plus 4962 * an IPv6 header, all options, and a maximal TCP or UDP header. 4963 */ 4964#define MAX_IPV6_HDR_LEN 256 4965 4966#define OPT_HDR(type, skb, off) \ 4967 (type *)(skb_network_header(skb) + (off)) 4968 4969static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) 4970{ 4971 int err; 4972 u8 nexthdr; 4973 unsigned int off; 4974 unsigned int len; 4975 bool fragment; 4976 bool done; 4977 __sum16 *csum; 4978 4979 fragment = false; 4980 done = false; 4981 4982 off = sizeof(struct ipv6hdr); 4983 4984 err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); 4985 if (err < 0) 4986 goto out; 4987 4988 nexthdr = ipv6_hdr(skb)->nexthdr; 4989 4990 len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); 4991 while (off <= len && !done) { 4992 switch (nexthdr) { 4993 case IPPROTO_DSTOPTS: 4994 case IPPROTO_HOPOPTS: 4995 case IPPROTO_ROUTING: { 4996 struct ipv6_opt_hdr *hp; 4997 4998 err = skb_maybe_pull_tail(skb, 4999 off + 5000 sizeof(struct ipv6_opt_hdr), 5001 MAX_IPV6_HDR_LEN); 5002 if (err < 0) 5003 goto out; 5004 5005 hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); 5006 nexthdr = hp->nexthdr; 5007 off += ipv6_optlen(hp); 5008 break; 5009 } 5010 case IPPROTO_AH: { 5011 struct ip_auth_hdr *hp; 5012 5013 err = skb_maybe_pull_tail(skb, 5014 off + 5015 sizeof(struct ip_auth_hdr), 5016 MAX_IPV6_HDR_LEN); 5017 if (err < 0) 5018 goto out; 5019 5020 hp = OPT_HDR(struct ip_auth_hdr, skb, off); 5021 nexthdr = hp->nexthdr; 5022 off += ipv6_authlen(hp); 5023 break; 5024 } 5025 case IPPROTO_FRAGMENT: { 5026 struct frag_hdr *hp; 5027 5028 err = skb_maybe_pull_tail(skb, 5029 off + 5030 sizeof(struct frag_hdr), 5031 MAX_IPV6_HDR_LEN); 5032 if (err < 0) 5033 goto out; 5034 5035 hp = OPT_HDR(struct frag_hdr, skb, off); 5036 5037 if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) 5038 fragment = true; 5039 5040 nexthdr = hp->nexthdr; 5041 off += sizeof(struct frag_hdr); 5042 break; 5043 } 5044 default: 5045 done = true; 5046 break; 5047 } 5048 } 5049 5050 err = -EPROTO; 5051 5052 if (!done || fragment) 5053 goto out; 5054 5055 csum = skb_checksum_setup_ip(skb, nexthdr, off); 5056 if (IS_ERR(csum)) 5057 return PTR_ERR(csum); 5058 5059 if (recalculate) 5060 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 5061 &ipv6_hdr(skb)->daddr, 5062 skb->len - off, nexthdr, 0); 5063 err = 0; 5064 5065out: 5066 return err; 5067} 5068 5069/** 5070 * skb_checksum_setup - set up partial checksum offset 5071 * @skb: the skb to set up 5072 * @recalculate: if true the pseudo-header checksum will be recalculated 5073 */ 5074int skb_checksum_setup(struct sk_buff *skb, bool recalculate) 5075{ 5076 int err; 
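	/* Dispatch on the L3 protocol; packets that are neither IPv4 nor IPv6
	 * cannot have a partial checksum set up here and fall through to the
	 * -EPROTO default below.
	 */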
5077 5078 switch (skb->protocol) { 5079 case htons(ETH_P_IP): 5080 err = skb_checksum_setup_ipv4(skb, recalculate); 5081 break; 5082 5083 case htons(ETH_P_IPV6): 5084 err = skb_checksum_setup_ipv6(skb, recalculate); 5085 break; 5086 5087 default: 5088 err = -EPROTO; 5089 break; 5090 } 5091 5092 return err; 5093} 5094EXPORT_SYMBOL(skb_checksum_setup); 5095 5096/** 5097 * skb_checksum_maybe_trim - maybe trims the given skb 5098 * @skb: the skb to check 5099 * @transport_len: the data length beyond the network header 5100 * 5101 * Checks whether the given skb has data beyond the given transport length. 5102 * If so, returns a cloned skb trimmed to this transport length. 5103 * Otherwise returns the provided skb. Returns NULL in error cases 5104 * (e.g. transport_len exceeds skb length or out-of-memory). 5105 * 5106 * Caller needs to set the skb transport header and free any returned skb if it 5107 * differs from the provided skb. 5108 */ 5109static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, 5110 unsigned int transport_len) 5111{ 5112 struct sk_buff *skb_chk; 5113 unsigned int len = skb_transport_offset(skb) + transport_len; 5114 int ret; 5115 5116 if (skb->len < len) 5117 return NULL; 5118 else if (skb->len == len) 5119 return skb; 5120 5121 skb_chk = skb_clone(skb, GFP_ATOMIC); 5122 if (!skb_chk) 5123 return NULL; 5124 5125 ret = pskb_trim_rcsum(skb_chk, len); 5126 if (ret) { 5127 kfree_skb(skb_chk); 5128 return NULL; 5129 } 5130 5131 return skb_chk; 5132} 5133 5134/** 5135 * skb_checksum_trimmed - validate checksum of an skb 5136 * @skb: the skb to check 5137 * @transport_len: the data length beyond the network header 5138 * @skb_chkf: checksum function to use 5139 * 5140 * Applies the given checksum function skb_chkf to the provided skb. 5141 * Returns a checked and maybe trimmed skb. Returns NULL on error. 5142 * 5143 * If the skb has data beyond the given transport length, then a 5144 * trimmed & cloned skb is checked and returned. 5145 * 5146 * Caller needs to set the skb transport header and free any returned skb if it 5147 * differs from the provided skb. 
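 *
 * A minimal usage sketch; my_csum_check stands in for a caller-provided
 * checksum callback (e.g. an IGMP/MLD checksum helper) and the transport
 * header is assumed to be set already:
 *
 *	skb_chk = skb_checksum_trimmed(skb, transport_len, my_csum_check);
 *	if (!skb_chk)
 *		goto drop;
 *	...
 *	if (skb_chk != skb)
 *		kfree_skb(skb_chk);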
5148 */ 5149struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, 5150 unsigned int transport_len, 5151 __sum16(*skb_chkf)(struct sk_buff *skb)) 5152{ 5153 struct sk_buff *skb_chk; 5154 unsigned int offset = skb_transport_offset(skb); 5155 __sum16 ret; 5156 5157 skb_chk = skb_checksum_maybe_trim(skb, transport_len); 5158 if (!skb_chk) 5159 goto err; 5160 5161 if (!pskb_may_pull(skb_chk, offset)) 5162 goto err; 5163 5164 skb_pull_rcsum(skb_chk, offset); 5165 ret = skb_chkf(skb_chk); 5166 skb_push_rcsum(skb_chk, offset); 5167 5168 if (ret) 5169 goto err; 5170 5171 return skb_chk; 5172 5173err: 5174 if (skb_chk && skb_chk != skb) 5175 kfree_skb(skb_chk); 5176 5177 return NULL; 5178 5179} 5180EXPORT_SYMBOL(skb_checksum_trimmed); 5181 5182void __skb_warn_lro_forwarding(const struct sk_buff *skb) 5183{ 5184 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", 5185 skb->dev->name); 5186} 5187EXPORT_SYMBOL(__skb_warn_lro_forwarding); 5188 5189void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) 5190{ 5191 if (head_stolen) { 5192 skb_release_head_state(skb); 5193 kmem_cache_free(skbuff_head_cache, skb); 5194 } else { 5195 __kfree_skb(skb); 5196 } 5197} 5198EXPORT_SYMBOL(kfree_skb_partial); 5199 5200/** 5201 * skb_try_coalesce - try to merge skb to prior one 5202 * @to: prior buffer 5203 * @from: buffer to add 5204 * @fragstolen: pointer to boolean 5205 * @delta_truesize: how much more was allocated than was requested 5206 */ 5207bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, 5208 bool *fragstolen, int *delta_truesize) 5209{ 5210 struct skb_shared_info *to_shinfo, *from_shinfo; 5211 int i, delta, len = from->len; 5212 5213 *fragstolen = false; 5214 5215 if (skb_cloned(to)) 5216 return false; 5217 5218 if (len <= skb_tailroom(to)) { 5219 if (len) 5220 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 5221 *delta_truesize = 0; 5222 return true; 5223 } 5224 5225 to_shinfo = skb_shinfo(to); 5226 from_shinfo = skb_shinfo(from); 5227 if (to_shinfo->frag_list || from_shinfo->frag_list) 5228 return false; 5229 if (skb_zcopy(to) || skb_zcopy(from)) 5230 return false; 5231 5232 if (skb_headlen(from) != 0) { 5233 struct page *page; 5234 unsigned int offset; 5235 5236 if (to_shinfo->nr_frags + 5237 from_shinfo->nr_frags >= MAX_SKB_FRAGS) 5238 return false; 5239 5240 if (skb_head_is_locked(from)) 5241 return false; 5242 5243 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 5244 5245 page = virt_to_head_page(from->head); 5246 offset = from->data - (unsigned char *)page_address(page); 5247 5248 skb_fill_page_desc(to, to_shinfo->nr_frags, 5249 page, offset, skb_headlen(from)); 5250 *fragstolen = true; 5251 } else { 5252 if (to_shinfo->nr_frags + 5253 from_shinfo->nr_frags > MAX_SKB_FRAGS) 5254 return false; 5255 5256 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); 5257 } 5258 5259 WARN_ON_ONCE(delta < len); 5260 5261 memcpy(to_shinfo->frags + to_shinfo->nr_frags, 5262 from_shinfo->frags, 5263 from_shinfo->nr_frags * sizeof(skb_frag_t)); 5264 to_shinfo->nr_frags += from_shinfo->nr_frags; 5265 5266 if (!skb_cloned(from)) 5267 from_shinfo->nr_frags = 0; 5268 5269 /* if the skb is not cloned this does nothing 5270 * since we set nr_frags to 0. 
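 * If 'from' is cloned, its own shinfo still references the fragments, so
 * each fragment now has two users and needs the extra reference taken in
 * the loop below.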
5271 */ 5272 for (i = 0; i < from_shinfo->nr_frags; i++) 5273 __skb_frag_ref(&from_shinfo->frags[i]); 5274 5275 to->truesize += delta; 5276 to->len += len; 5277 to->data_len += len; 5278 5279 *delta_truesize = delta; 5280 return true; 5281} 5282EXPORT_SYMBOL(skb_try_coalesce); 5283 5284/** 5285 * skb_scrub_packet - scrub an skb 5286 * 5287 * @skb: buffer to clean 5288 * @xnet: packet is crossing netns 5289 * 5290 * skb_scrub_packet can be used after encapsulating or decapsulating a packet 5291 * into/from a tunnel. Some information has to be cleared during these 5292 * operations. 5293 * skb_scrub_packet can also be used to clean an skb before injecting it into 5294 * another namespace (@xnet == true). We have to clear all information in the 5295 * skb that could impact namespace isolation. 5296 */ 5297void skb_scrub_packet(struct sk_buff *skb, bool xnet) 5298{ 5299 skb->pkt_type = PACKET_HOST; 5300 skb->skb_iif = 0; 5301 skb->ignore_df = 0; 5302 skb_dst_drop(skb); 5303 skb_ext_reset(skb); 5304 nf_reset_ct(skb); 5305 nf_reset_trace(skb); 5306 5307#ifdef CONFIG_NET_SWITCHDEV 5308 skb->offload_fwd_mark = 0; 5309 skb->offload_l3_fwd_mark = 0; 5310#endif 5311 5312 if (!xnet) 5313 return; 5314 5315 ipvs_reset(skb); 5316 skb->mark = 0; 5317 skb->tstamp = 0; 5318} 5319EXPORT_SYMBOL_GPL(skb_scrub_packet); 5320 5321/** 5322 * skb_gso_transport_seglen - Return length of individual segments of a gso packet 5323 * 5324 * @skb: GSO skb 5325 * 5326 * skb_gso_transport_seglen is used to determine the real size of the 5327 * individual segments, including Layer4 headers (TCP/UDP). 5328 * 5329 * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 5330 */ 5331static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) 5332{ 5333 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5334 unsigned int thlen = 0; 5335 5336 if (skb->encapsulation) { 5337 thlen = skb_inner_transport_header(skb) - 5338 skb_transport_header(skb); 5339 5340 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 5341 thlen += inner_tcp_hdrlen(skb); 5342 } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { 5343 thlen = tcp_hdrlen(skb); 5344 } else if (unlikely(skb_is_gso_sctp(skb))) { 5345 thlen = sizeof(struct sctphdr); 5346 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { 5347 thlen = sizeof(struct udphdr); 5348 } 5349 /* UFO sets gso_size to the size of the fragmentation 5350 * payload, i.e. the size of the L4 (UDP) header is already 5351 * accounted for. 5352 */ 5353 return thlen + shinfo->gso_size; 5354} 5355 5356/** 5357 * skb_gso_network_seglen - Return length of individual segments of a gso packet 5358 * 5359 * @skb: GSO skb 5360 * 5361 * skb_gso_network_seglen is used to determine the real size of the 5362 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). 5363 * 5364 * The MAC/L2 header is not accounted for. 5365 */ 5366static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) 5367{ 5368 unsigned int hdr_len = skb_transport_header(skb) - 5369 skb_network_header(skb); 5370 5371 return hdr_len + skb_gso_transport_seglen(skb); 5372} 5373 5374/** 5375 * skb_gso_mac_seglen - Return length of individual segments of a gso packet 5376 * 5377 * @skb: GSO skb 5378 * 5379 * skb_gso_mac_seglen is used to determine the real size of the 5380 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 5381 * headers (TCP/UDP). 
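 *
 * For example, a non-encapsulated TCPv4 GSO skb with a 14 byte Ethernet
 * header, a 20 byte IP header, a 20 byte TCP header and gso_size 1448
 * yields 14 + 20 + 20 + 1448 = 1502 bytes per segment.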
5382 */ 5383static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) 5384{ 5385 unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 5386 5387 return hdr_len + skb_gso_transport_seglen(skb); 5388} 5389 5390/** 5391 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS 5392 * 5393 * There are a couple of instances where we have a GSO skb, and we 5394 * want to determine what size it would be after it is segmented. 5395 * 5396 * We might want to check: 5397 * - L3+L4+payload size (e.g. IP forwarding) 5398 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) 5399 * 5400 * This is a helper to do that correctly considering GSO_BY_FRAGS. 5401 * 5402 * @skb: GSO skb 5403 * 5404 * @seg_len: The segmented length (from skb_gso_*_seglen). In the 5405 * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. 5406 * 5407 * @max_len: The maximum permissible length. 5408 * 5409 * Returns true if the segmented length <= max length. 5410 */ 5411static inline bool skb_gso_size_check(const struct sk_buff *skb, 5412 unsigned int seg_len, 5413 unsigned int max_len) { 5414 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5415 const struct sk_buff *iter; 5416 5417 if (shinfo->gso_size != GSO_BY_FRAGS) 5418 return seg_len <= max_len; 5419 5420 /* Undo this so we can re-use header sizes */ 5421 seg_len -= GSO_BY_FRAGS; 5422 5423 skb_walk_frags(skb, iter) { 5424 if (seg_len + skb_headlen(iter) > max_len) 5425 return false; 5426 } 5427 5428 return true; 5429} 5430 5431/** 5432 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? 5433 * 5434 * @skb: GSO skb 5435 * @mtu: MTU to validate against 5436 * 5437 * skb_gso_validate_network_len validates if a given skb will fit a 5438 * wanted MTU once split. It considers L3 headers, L4 headers, and the 5439 * payload. 5440 */ 5441bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) 5442{ 5443 return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); 5444} 5445EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); 5446 5447/** 5448 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? 5449 * 5450 * @skb: GSO skb 5451 * @len: length to validate against 5452 * 5453 * skb_gso_validate_mac_len validates if a given skb will fit a wanted 5454 * length once split, including L2, L3 and L4 headers and the payload. 
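 *
 * A minimal sketch of the typical check (max_frame_len is whatever limit
 * the caller enforces, e.g. a queue or hardware frame size):
 *
 *	if (skb_is_gso(skb) &&
 *	    !skb_gso_validate_mac_len(skb, max_frame_len))
 *		goto drop;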
5455 */ 5456bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) 5457{ 5458 return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); 5459} 5460EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); 5461 5462static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) 5463{ 5464 int mac_len, meta_len; 5465 void *meta; 5466 5467 if (skb_cow(skb, skb_headroom(skb)) < 0) { 5468 kfree_skb(skb); 5469 return NULL; 5470 } 5471 5472 mac_len = skb->data - skb_mac_header(skb); 5473 if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { 5474 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), 5475 mac_len - VLAN_HLEN - ETH_TLEN); 5476 } 5477 5478 meta_len = skb_metadata_len(skb); 5479 if (meta_len) { 5480 meta = skb_metadata_end(skb) - meta_len; 5481 memmove(meta + VLAN_HLEN, meta, meta_len); 5482 } 5483 5484 skb->mac_header += VLAN_HLEN; 5485 return skb; 5486} 5487 5488struct sk_buff *skb_vlan_untag(struct sk_buff *skb) 5489{ 5490 struct vlan_hdr *vhdr; 5491 u16 vlan_tci; 5492 5493 if (unlikely(skb_vlan_tag_present(skb))) { 5494 /* vlan_tci is already set-up so leave this for another time */ 5495 return skb; 5496 } 5497 5498 skb = skb_share_check(skb, GFP_ATOMIC); 5499 if (unlikely(!skb)) 5500 goto err_free; 5501 /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ 5502 if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) 5503 goto err_free; 5504 5505 vhdr = (struct vlan_hdr *)skb->data; 5506 vlan_tci = ntohs(vhdr->h_vlan_TCI); 5507 __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); 5508 5509 skb_pull_rcsum(skb, VLAN_HLEN); 5510 vlan_set_encap_proto(skb, vhdr); 5511 5512 skb = skb_reorder_vlan_header(skb); 5513 if (unlikely(!skb)) 5514 goto err_free; 5515 5516 skb_reset_network_header(skb); 5517 skb_reset_transport_header(skb); 5518 skb_reset_mac_len(skb); 5519 5520 return skb; 5521 5522err_free: 5523 kfree_skb(skb); 5524 return NULL; 5525} 5526EXPORT_SYMBOL(skb_vlan_untag); 5527 5528int skb_ensure_writable(struct sk_buff *skb, int write_len) 5529{ 5530 if (!pskb_may_pull(skb, write_len)) 5531 return -ENOMEM; 5532 5533 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 5534 return 0; 5535 5536 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 5537} 5538EXPORT_SYMBOL(skb_ensure_writable); 5539 5540/* remove VLAN header from packet and update csum accordingly. 5541 * expects a non skb_vlan_tag_present skb with a vlan tag payload 5542 */ 5543int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) 5544{ 5545 struct vlan_hdr *vhdr; 5546 int offset = skb->data - skb_mac_header(skb); 5547 int err; 5548 5549 if (WARN_ONCE(offset, 5550 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", 5551 offset)) { 5552 return -EINVAL; 5553 } 5554 5555 err = skb_ensure_writable(skb, VLAN_ETH_HLEN); 5556 if (unlikely(err)) 5557 return err; 5558 5559 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5560 5561 vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); 5562 *vlan_tci = ntohs(vhdr->h_vlan_TCI); 5563 5564 memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); 5565 __skb_pull(skb, VLAN_HLEN); 5566 5567 vlan_set_encap_proto(skb, vhdr); 5568 skb->mac_header += VLAN_HLEN; 5569 5570 if (skb_network_offset(skb) < ETH_HLEN) 5571 skb_set_network_header(skb, ETH_HLEN); 5572 5573 skb_reset_mac_len(skb); 5574 5575 return err; 5576} 5577EXPORT_SYMBOL(__skb_vlan_pop); 5578 5579/* Pop a vlan tag either from hwaccel or from payload. 5580 * Expects skb->data at mac header. 
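 * If another VLAN header follows the one being popped, it is pulled out of
 * the payload and moved into the hw-accel tag.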
5581 */ 5582int skb_vlan_pop(struct sk_buff *skb) 5583{ 5584 u16 vlan_tci; 5585 __be16 vlan_proto; 5586 int err; 5587 5588 if (likely(skb_vlan_tag_present(skb))) { 5589 __vlan_hwaccel_clear_tag(skb); 5590 } else { 5591 if (unlikely(!eth_type_vlan(skb->protocol))) 5592 return 0; 5593 5594 err = __skb_vlan_pop(skb, &vlan_tci); 5595 if (err) 5596 return err; 5597 } 5598 /* move next vlan tag to hw accel tag */ 5599 if (likely(!eth_type_vlan(skb->protocol))) 5600 return 0; 5601 5602 vlan_proto = skb->protocol; 5603 err = __skb_vlan_pop(skb, &vlan_tci); 5604 if (unlikely(err)) 5605 return err; 5606 5607 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5608 return 0; 5609} 5610EXPORT_SYMBOL(skb_vlan_pop); 5611 5612/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). 5613 * Expects skb->data at mac header. 5614 */ 5615int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 5616{ 5617 if (skb_vlan_tag_present(skb)) { 5618 int offset = skb->data - skb_mac_header(skb); 5619 int err; 5620 5621 if (WARN_ONCE(offset, 5622 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", 5623 offset)) { 5624 return -EINVAL; 5625 } 5626 5627 err = __vlan_insert_tag(skb, skb->vlan_proto, 5628 skb_vlan_tag_get(skb)); 5629 if (err) 5630 return err; 5631 5632 skb->protocol = skb->vlan_proto; 5633 skb->mac_len += VLAN_HLEN; 5634 5635 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5636 } 5637 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5638 return 0; 5639} 5640EXPORT_SYMBOL(skb_vlan_push); 5641 5642/** 5643 * skb_eth_pop() - Drop the Ethernet header at the head of a packet 5644 * 5645 * @skb: Socket buffer to modify 5646 * 5647 * Drop the Ethernet header of @skb. 5648 * 5649 * Expects that skb->data points to the mac header and that no VLAN tags are 5650 * present. 5651 * 5652 * Returns 0 on success, -errno otherwise. 5653 */ 5654int skb_eth_pop(struct sk_buff *skb) 5655{ 5656 if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || 5657 skb_network_offset(skb) < ETH_HLEN) 5658 return -EPROTO; 5659 5660 skb_pull_rcsum(skb, ETH_HLEN); 5661 skb_reset_mac_header(skb); 5662 skb_reset_mac_len(skb); 5663 5664 return 0; 5665} 5666EXPORT_SYMBOL(skb_eth_pop); 5667 5668/** 5669 * skb_eth_push() - Add a new Ethernet header at the head of a packet 5670 * 5671 * @skb: Socket buffer to modify 5672 * @dst: Destination MAC address of the new header 5673 * @src: Source MAC address of the new header 5674 * 5675 * Prepend @skb with a new Ethernet header. 5676 * 5677 * Expects that skb->data points to the mac header, which must be empty. 5678 * 5679 * Returns 0 on success, -errno otherwise. 5680 */ 5681int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, 5682 const unsigned char *src) 5683{ 5684 struct ethhdr *eth; 5685 int err; 5686 5687 if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) 5688 return -EPROTO; 5689 5690 err = skb_cow_head(skb, sizeof(*eth)); 5691 if (err < 0) 5692 return err; 5693 5694 skb_push(skb, sizeof(*eth)); 5695 skb_reset_mac_header(skb); 5696 skb_reset_mac_len(skb); 5697 5698 eth = eth_hdr(skb); 5699 ether_addr_copy(eth->h_dest, dst); 5700 ether_addr_copy(eth->h_source, src); 5701 eth->h_proto = skb->protocol; 5702 5703 skb_postpush_rcsum(skb, eth, sizeof(*eth)); 5704 5705 return 0; 5706} 5707EXPORT_SYMBOL(skb_eth_push); 5708 5709/* Update the ethertype of hdr and the skb csum value if required. 
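 * For CHECKSUM_COMPLETE skbs the old ethertype is folded out of skb->csum
 * and the new one is folded in, keeping the cached checksum consistent with
 * the rewritten header.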
*/ 5710static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, 5711 __be16 ethertype) 5712{ 5713 if (skb->ip_summed == CHECKSUM_COMPLETE) { 5714 __be16 diff[] = { ~hdr->h_proto, ethertype }; 5715 5716 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 5717 } 5718 5719 hdr->h_proto = ethertype; 5720} 5721 5722/** 5723 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of 5724 * the packet 5725 * 5726 * @skb: buffer 5727 * @mpls_lse: MPLS label stack entry to push 5728 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) 5729 * @mac_len: length of the MAC header 5730 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is 5731 * ethernet 5732 * 5733 * Expects skb->data at mac header. 5734 * 5735 * Returns 0 on success, -errno otherwise. 5736 */ 5737int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, 5738 int mac_len, bool ethernet) 5739{ 5740 struct mpls_shim_hdr *lse; 5741 int err; 5742 5743 if (unlikely(!eth_p_mpls(mpls_proto))) 5744 return -EINVAL; 5745 5746 /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ 5747 if (skb->encapsulation) 5748 return -EINVAL; 5749 5750 err = skb_cow_head(skb, MPLS_HLEN); 5751 if (unlikely(err)) 5752 return err; 5753 5754 if (!skb->inner_protocol) { 5755 skb_set_inner_network_header(skb, skb_network_offset(skb)); 5756 skb_set_inner_protocol(skb, skb->protocol); 5757 } 5758 5759 skb_push(skb, MPLS_HLEN); 5760 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), 5761 mac_len); 5762 skb_reset_mac_header(skb); 5763 skb_set_network_header(skb, mac_len); 5764 skb_reset_mac_len(skb); 5765 5766 lse = mpls_hdr(skb); 5767 lse->label_stack_entry = mpls_lse; 5768 skb_postpush_rcsum(skb, lse, MPLS_HLEN); 5769 5770 if (ethernet && mac_len >= ETH_HLEN) 5771 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); 5772 skb->protocol = mpls_proto; 5773 5774 return 0; 5775} 5776EXPORT_SYMBOL_GPL(skb_mpls_push); 5777 5778/** 5779 * skb_mpls_pop() - pop the outermost MPLS header 5780 * 5781 * @skb: buffer 5782 * @next_proto: ethertype of header after popped MPLS header 5783 * @mac_len: length of the MAC header 5784 * @ethernet: flag to indicate if the packet is ethernet 5785 * 5786 * Expects skb->data at mac header. 5787 * 5788 * Returns 0 on success, -errno otherwise. 5789 */ 5790int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, 5791 bool ethernet) 5792{ 5793 int err; 5794 5795 if (unlikely(!eth_p_mpls(skb->protocol))) 5796 return 0; 5797 5798 err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); 5799 if (unlikely(err)) 5800 return err; 5801 5802 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); 5803 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), 5804 mac_len); 5805 5806 __skb_pull(skb, MPLS_HLEN); 5807 skb_reset_mac_header(skb); 5808 skb_set_network_header(skb, mac_len); 5809 5810 if (ethernet && mac_len >= ETH_HLEN) { 5811 struct ethhdr *hdr; 5812 5813 /* use mpls_hdr() to get ethertype to account for VLANs. */ 5814 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); 5815 skb_mod_eth_type(skb, hdr, next_proto); 5816 } 5817 skb->protocol = next_proto; 5818 5819 return 0; 5820} 5821EXPORT_SYMBOL_GPL(skb_mpls_pop); 5822 5823/** 5824 * skb_mpls_update_lse() - modify outermost MPLS header and update csum 5825 * 5826 * @skb: buffer 5827 * @mpls_lse: new MPLS label stack entry to update to 5828 * 5829 * Expects skb->data at mac header. 5830 * 5831 * Returns 0 on success, -errno otherwise. 
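 *
 * A minimal sketch of rewriting just the label field, assuming the MPLS
 * header is already in the linear area and new_label holds the caller's
 * 20-bit label value:
 *
 *	lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
 *	lse &= ~MPLS_LS_LABEL_MASK;
 *	lse |= new_label << MPLS_LS_LABEL_SHIFT;
 *	err = skb_mpls_update_lse(skb, cpu_to_be32(lse));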
5832 */ 5833int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) 5834{ 5835 int err; 5836 5837 if (unlikely(!eth_p_mpls(skb->protocol))) 5838 return -EINVAL; 5839 5840 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); 5841 if (unlikely(err)) 5842 return err; 5843 5844 if (skb->ip_summed == CHECKSUM_COMPLETE) { 5845 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; 5846 5847 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 5848 } 5849 5850 mpls_hdr(skb)->label_stack_entry = mpls_lse; 5851 5852 return 0; 5853} 5854EXPORT_SYMBOL_GPL(skb_mpls_update_lse); 5855 5856/** 5857 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header 5858 * 5859 * @skb: buffer 5860 * 5861 * Expects skb->data at mac header. 5862 * 5863 * Returns 0 on success, -errno otherwise. 5864 */ 5865int skb_mpls_dec_ttl(struct sk_buff *skb) 5866{ 5867 u32 lse; 5868 u8 ttl; 5869 5870 if (unlikely(!eth_p_mpls(skb->protocol))) 5871 return -EINVAL; 5872 5873 if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) 5874 return -ENOMEM; 5875 5876 lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); 5877 ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; 5878 if (!--ttl) 5879 return -EINVAL; 5880 5881 lse &= ~MPLS_LS_TTL_MASK; 5882 lse |= ttl << MPLS_LS_TTL_SHIFT; 5883 5884 return skb_mpls_update_lse(skb, cpu_to_be32(lse)); 5885} 5886EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); 5887 5888/** 5889 * alloc_skb_with_frags - allocate skb with page frags 5890 * 5891 * @header_len: size of linear part 5892 * @data_len: needed length in frags 5893 * @max_page_order: max page order desired. 5894 * @errcode: pointer to error code if any 5895 * @gfp_mask: allocation mask 5896 * 5897 * This can be used to allocate a paged skb, given a maximal order for frags. 5898 */ 5899struct sk_buff *alloc_skb_with_frags(unsigned long header_len, 5900 unsigned long data_len, 5901 int max_page_order, 5902 int *errcode, 5903 gfp_t gfp_mask) 5904{ 5905 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; 5906 unsigned long chunk; 5907 struct sk_buff *skb; 5908 struct page *page; 5909 int i; 5910 5911 *errcode = -EMSGSIZE; 5912 /* Note this test could be relaxed, if we succeed to allocate 5913 * high order pages... 
5914 */ 5915 if (npages > MAX_SKB_FRAGS) 5916 return NULL; 5917 5918 *errcode = -ENOBUFS; 5919 skb = alloc_skb(header_len, gfp_mask); 5920 if (!skb) 5921 return NULL; 5922 5923 skb->truesize += npages << PAGE_SHIFT; 5924 5925 for (i = 0; npages > 0; i++) { 5926 int order = max_page_order; 5927 5928 while (order) { 5929 if (npages >= 1 << order) { 5930 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | 5931 __GFP_COMP | 5932 __GFP_NOWARN, 5933 order); 5934 if (page) 5935 goto fill_page; 5936 /* Do not retry other high order allocations */ 5937 order = 1; 5938 max_page_order = 0; 5939 } 5940 order--; 5941 } 5942 page = alloc_page(gfp_mask); 5943 if (!page) 5944 goto failure; 5945fill_page: 5946 chunk = min_t(unsigned long, data_len, 5947 PAGE_SIZE << order); 5948 skb_fill_page_desc(skb, i, page, 0, chunk); 5949 data_len -= chunk; 5950 npages -= 1 << order; 5951 } 5952 return skb; 5953 5954failure: 5955 kfree_skb(skb); 5956 return NULL; 5957} 5958EXPORT_SYMBOL(alloc_skb_with_frags); 5959 5960/* carve out the first off bytes from skb when off < headlen */ 5961static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, 5962 const int headlen, gfp_t gfp_mask) 5963{ 5964 int i; 5965 int size = skb_end_offset(skb); 5966 int new_hlen = headlen - off; 5967 u8 *data; 5968 5969 size = SKB_DATA_ALIGN(size); 5970 5971 if (skb_pfmemalloc(skb)) 5972 gfp_mask |= __GFP_MEMALLOC; 5973 data = kmalloc_reserve(size + 5974 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 5975 gfp_mask, NUMA_NO_NODE, NULL); 5976 if (!data) 5977 return -ENOMEM; 5978 5979 size = SKB_WITH_OVERHEAD(ksize(data)); 5980 5981 /* Copy real data, and all frags */ 5982 skb_copy_from_linear_data_offset(skb, off, data, new_hlen); 5983 skb->len -= off; 5984 5985 memcpy((struct skb_shared_info *)(data + size), 5986 skb_shinfo(skb), 5987 offsetof(struct skb_shared_info, 5988 frags[skb_shinfo(skb)->nr_frags])); 5989 if (skb_cloned(skb)) { 5990 /* drop the old head gracefully */ 5991 if (skb_orphan_frags(skb, gfp_mask)) { 5992 kfree(data); 5993 return -ENOMEM; 5994 } 5995 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 5996 skb_frag_ref(skb, i); 5997 if (skb_has_frag_list(skb)) 5998 skb_clone_fraglist(skb); 5999 skb_release_data(skb); 6000 } else { 6001 /* we can reuse existing recount- all we did was 6002 * relocate values 6003 */ 6004 skb_free_head(skb); 6005 } 6006 6007 skb->head = data; 6008 skb->data = data; 6009 skb->head_frag = 0; 6010#ifdef NET_SKBUFF_DATA_USES_OFFSET 6011 skb->end = size; 6012#else 6013 skb->end = skb->head + size; 6014#endif 6015 skb_set_tail_pointer(skb, skb_headlen(skb)); 6016 skb_headers_offset_update(skb, 0); 6017 skb->cloned = 0; 6018 skb->hdr_len = 0; 6019 skb->nohdr = 0; 6020 atomic_set(&skb_shinfo(skb)->dataref, 1); 6021 6022 return 0; 6023} 6024 6025static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); 6026 6027/* carve out the first eat bytes from skb's frag_list. May recurse into 6028 * pskb_carve() 6029 */ 6030static int pskb_carve_frag_list(struct sk_buff *skb, 6031 struct skb_shared_info *shinfo, int eat, 6032 gfp_t gfp_mask) 6033{ 6034 struct sk_buff *list = shinfo->frag_list; 6035 struct sk_buff *clone = NULL; 6036 struct sk_buff *insp = NULL; 6037 6038 do { 6039 if (!list) { 6040 pr_err("Not enough bytes to eat. Want %d\n", eat); 6041 return -EFAULT; 6042 } 6043 if (list->len <= eat) { 6044 /* Eaten as whole. */ 6045 eat -= list->len; 6046 list = list->next; 6047 insp = list; 6048 } else { 6049 /* Eaten partially. 
*/ 6050 if (skb_shared(list)) { 6051 clone = skb_clone(list, gfp_mask); 6052 if (!clone) 6053 return -ENOMEM; 6054 insp = list->next; 6055 list = clone; 6056 } else { 6057 /* This may be pulled without problems. */ 6058 insp = list; 6059 } 6060 if (pskb_carve(list, eat, gfp_mask) < 0) { 6061 kfree_skb(clone); 6062 return -ENOMEM; 6063 } 6064 break; 6065 } 6066 } while (eat); 6067 6068 /* Free pulled out fragments. */ 6069 while ((list = shinfo->frag_list) != insp) { 6070 shinfo->frag_list = list->next; 6071 consume_skb(list); 6072 } 6073 /* And insert new clone at head. */ 6074 if (clone) { 6075 clone->next = list; 6076 shinfo->frag_list = clone; 6077 } 6078 return 0; 6079} 6080 6081/* carve off first len bytes from skb. Split line (off) is in the 6082 * non-linear part of skb 6083 */ 6084static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, 6085 int pos, gfp_t gfp_mask) 6086{ 6087 int i, k = 0; 6088 int size = skb_end_offset(skb); 6089 u8 *data; 6090 const int nfrags = skb_shinfo(skb)->nr_frags; 6091 struct skb_shared_info *shinfo; 6092 6093 size = SKB_DATA_ALIGN(size); 6094 6095 if (skb_pfmemalloc(skb)) 6096 gfp_mask |= __GFP_MEMALLOC; 6097 data = kmalloc_reserve(size + 6098 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 6099 gfp_mask, NUMA_NO_NODE, NULL); 6100 if (!data) 6101 return -ENOMEM; 6102 6103 size = SKB_WITH_OVERHEAD(ksize(data)); 6104 6105 memcpy((struct skb_shared_info *)(data + size), 6106 skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); 6107 if (skb_orphan_frags(skb, gfp_mask)) { 6108 kfree(data); 6109 return -ENOMEM; 6110 } 6111 shinfo = (struct skb_shared_info *)(data + size); 6112 for (i = 0; i < nfrags; i++) { 6113 int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); 6114 6115 if (pos + fsize > off) { 6116 shinfo->frags[k] = skb_shinfo(skb)->frags[i]; 6117 6118 if (pos < off) { 6119 /* Split frag. 6120 * We have two variants in this case: 6121 * 1. Move all the frag to the second 6122 * part, if it is possible. F.e. 6123 * this approach is mandatory for TUX, 6124 * where splitting is expensive. 6125 * 2. Split is accurately. We make this. 6126 */ 6127 skb_frag_off_add(&shinfo->frags[0], off - pos); 6128 skb_frag_size_sub(&shinfo->frags[0], off - pos); 6129 } 6130 skb_frag_ref(skb, i); 6131 k++; 6132 } 6133 pos += fsize; 6134 } 6135 shinfo->nr_frags = k; 6136 if (skb_has_frag_list(skb)) 6137 skb_clone_fraglist(skb); 6138 6139 /* split line is in frag list */ 6140 if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { 6141 /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. 
*/ 6142 if (skb_has_frag_list(skb)) 6143 kfree_skb_list(skb_shinfo(skb)->frag_list); 6144 kfree(data); 6145 return -ENOMEM; 6146 } 6147 skb_release_data(skb); 6148 6149 skb->head = data; 6150 skb->head_frag = 0; 6151 skb->data = data; 6152#ifdef NET_SKBUFF_DATA_USES_OFFSET 6153 skb->end = size; 6154#else 6155 skb->end = skb->head + size; 6156#endif 6157 skb_reset_tail_pointer(skb); 6158 skb_headers_offset_update(skb, 0); 6159 skb->cloned = 0; 6160 skb->hdr_len = 0; 6161 skb->nohdr = 0; 6162 skb->len -= off; 6163 skb->data_len = skb->len; 6164 atomic_set(&skb_shinfo(skb)->dataref, 1); 6165 return 0; 6166} 6167 6168/* remove len bytes from the beginning of the skb */ 6169static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) 6170{ 6171 int headlen = skb_headlen(skb); 6172 6173 if (len < headlen) 6174 return pskb_carve_inside_header(skb, len, headlen, gfp); 6175 else 6176 return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); 6177} 6178 6179/* Extract to_copy bytes starting at off from skb, and return them in 6180 * a new skb 6181 */ 6182struct sk_buff *pskb_extract(struct sk_buff *skb, int off, 6183 int to_copy, gfp_t gfp) 6184{ 6185 struct sk_buff *clone = skb_clone(skb, gfp); 6186 6187 if (!clone) 6188 return NULL; 6189 6190 if (pskb_carve(clone, off, gfp) < 0 || 6191 pskb_trim(clone, to_copy)) { 6192 kfree_skb(clone); 6193 return NULL; 6194 } 6195 return clone; 6196} 6197EXPORT_SYMBOL(pskb_extract); 6198 6199/** 6200 * skb_condense - try to get rid of fragments/frag_list if possible 6201 * @skb: buffer 6202 * 6203 * Can be used to save memory before the skb is added to a busy queue. 6204 * If the packet has bytes in frags and enough tail room in skb->head, 6205 * pull all of them, so that we can free the frags right now and adjust 6206 * truesize. 6207 * Notes: 6208 * We do not reallocate skb->head, thus this cannot fail. 6209 * Caller must re-evaluate skb->truesize if needed. 6210 */ 6211void skb_condense(struct sk_buff *skb) 6212{ 6213 if (skb->data_len) { 6214 if (skb->data_len > skb->end - skb->tail || 6215 skb_cloned(skb)) 6216 return; 6217 6218 /* Nice, we can free page frag(s) right now */ 6219 __pskb_pull_tail(skb, skb->data_len); 6220 } 6221 /* At this point, skb->truesize might be overestimated, 6222 * because skb had a fragment, and fragments do not tell 6223 * their truesize. 6224 * When we pulled its content into skb->head, the fragment 6225 * was freed, but __pskb_pull_tail() could not possibly 6226 * adjust skb->truesize, not knowing the frag truesize. 6227 */ 6228 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 6229} 6230 6231#ifdef CONFIG_SKB_EXTENSIONS 6232static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) 6233{ 6234 return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); 6235} 6236 6237/** 6238 * __skb_ext_alloc - allocate a new skb extensions storage 6239 * 6240 * @flags: See kmalloc(). 6241 * 6242 * Returns the newly allocated pointer. The pointer can later be attached to an 6243 * skb via __skb_ext_set(). 6244 * Note: caller must handle the skb_ext as opaque data. 
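 *
 * A minimal sketch of pre-allocating storage and attaching it later
 * (SKB_EXT_SEC_PATH is just an example id, available with CONFIG_XFRM):
 *
 *	struct skb_ext *ext = __skb_ext_alloc(GFP_ATOMIC);
 *
 *	if (!ext)
 *		return -ENOMEM;
 *	sp = __skb_ext_set(skb, SKB_EXT_SEC_PATH, ext);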
#ifdef CONFIG_SKB_EXTENSIONS
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
{
	return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
}

/**
 * __skb_ext_alloc - allocate new skb extension storage
 *
 * @flags: See kmalloc().
 *
 * Returns the newly allocated pointer. The pointer can later be attached
 * to an skb via __skb_ext_set().
 * Note: caller must handle the skb_ext as opaque data.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
	struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);

	if (new) {
		memset(new->offset, 0, sizeof(new->offset));
		refcount_set(&new->refcnt, 1);
	}

	return new;
}

static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
					 unsigned int old_active)
{
	struct skb_ext *new;

	if (refcount_read(&old->refcnt) == 1)
		return old;

	new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
	if (!new)
		return NULL;

	memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
	refcount_set(&new->refcnt, 1);

#ifdef CONFIG_XFRM
	if (old_active & (1 << SKB_EXT_SEC_PATH)) {
		struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
		unsigned int i;

		for (i = 0; i < sp->len; i++)
			xfrm_state_hold(sp->xvec[i]);
	}
#endif
	__skb_ext_put(old);
	return new;
}

/**
 * __skb_ext_set - attach the specified extension storage to this skb
 * @skb: buffer
 * @id: extension id
 * @ext: extension storage previously allocated via __skb_ext_alloc()
 *
 * Existing extensions, if any, are cleared.
 *
 * Returns the pointer to the extension.
 */
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
		    struct skb_ext *ext)
{
	unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);

	skb_ext_put(skb);
	newlen = newoff + skb_ext_type_len[id];
	ext->chunks = newlen;
	ext->offset[id] = newoff;
	skb->extensions = ext;
	skb->active_extensions = 1 << id;
	return skb_ext_get_ptr(ext, id);
}

/**
 * skb_ext_add - allocate space for given extension, COW if needed
 * @skb: buffer
 * @id: extension to allocate space for
 *
 * Allocates enough space for the given extension.
 * If the extension is already present, a pointer to that extension
 * is returned.
 *
 * If the skb was cloned, COW applies and the returned memory can be
 * modified without changing the extension space of cloned buffers.
 *
 * Returns pointer to the extension or NULL on allocation failure.
 */
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
{
	struct skb_ext *new, *old = NULL;
	unsigned int newlen, newoff;

	if (skb->active_extensions) {
		old = skb->extensions;

		new = skb_ext_maybe_cow(old, skb->active_extensions);
		if (!new)
			return NULL;

		if (__skb_ext_exist(new, id))
			goto set_active;

		newoff = new->chunks;
	} else {
		newoff = SKB_EXT_CHUNKSIZEOF(*new);

		new = __skb_ext_alloc(GFP_ATOMIC);
		if (!new)
			return NULL;
	}

	newlen = newoff + skb_ext_type_len[id];
	new->chunks = newlen;
	new->offset[id] = newoff;
set_active:
	skb->extensions = new;
	skb->active_extensions |= 1 << id;
	return skb_ext_get_ptr(new, id);
}
EXPORT_SYMBOL(skb_ext_add);
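
/* Hedged usage sketch, not part of this file: a typical producer and
 * consumer of extension space, assuming CONFIG_XFRM is set so that
 * SKB_EXT_SEC_PATH is a valid id.
 *
 * Producer (may allocate or COW, so it can fail):
 *
 *	struct sec_path *sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);
 *
 *	if (!sp)
 *		return -ENOMEM;
 *	(newly allocated extension space is not zeroed; callers initialise
 *	 the fields they use)
 *
 * Consumer, later on the same skb:
 *
 *	struct sec_path *sp = skb_ext_find(skb, SKB_EXT_SEC_PATH);
 *
 *	if (sp)
 *		... read sp ...
 *
 * skb_ext_find() lives in <linux/skbuff.h>. The pointer returned by
 * skb_ext_add() is only valid until the next operation that may COW,
 * grow or release the extension area.
 */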
#ifdef CONFIG_XFRM
static void skb_ext_put_sp(struct sec_path *sp)
{
	unsigned int i;

	for (i = 0; i < sp->len; i++)
		xfrm_state_put(sp->xvec[i]);
}
#endif

void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
	struct skb_ext *ext = skb->extensions;

	skb->active_extensions &= ~(1 << id);
	if (skb->active_extensions == 0) {
		skb->extensions = NULL;
		__skb_ext_put(ext);
#ifdef CONFIG_XFRM
	} else if (id == SKB_EXT_SEC_PATH &&
		   refcount_read(&ext->refcnt) == 1) {
		struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);

		skb_ext_put_sp(sp);
		sp->len = 0;
#endif
	}
}
EXPORT_SYMBOL(__skb_ext_del);

void __skb_ext_put(struct skb_ext *ext)
{
	/* If this is the last reference, nothing can increment it
	 * after the check passes.  Avoids one atomic op.
	 */
	if (refcount_read(&ext->refcnt) == 1)
		goto free_now;

	if (!refcount_dec_and_test(&ext->refcnt))
		return;
free_now:
#ifdef CONFIG_XFRM
	if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
		skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif

	kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
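
/* Hedged usage note, not part of this file: the teardown side of the
 * extension API is normally reached through the <linux/skbuff.h> wrappers
 * rather than the __-prefixed helpers above, e.g.:
 *
 *	skb_ext_del(skb, SKB_EXT_SEC_PATH);	drop one id; the storage is
 *						freed once no ids remain
 *	skb_ext_reset(skb);			drop every extension at once
 *
 * Both wrappers end up in __skb_ext_del() / __skb_ext_put() above.
 */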