// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 *		Authors:	Pavel Emelyanov <xemul@openvz.org>
 *				Started as consolidation of ipv4/ip_fragment.c,
 *				ipv6/reassembly. and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	struct sk_buff		*next_frag;
	int			frag_run_len;
};

#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))

static void fragcb_clear(struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
				   struct sk_buff *skb)
{
	fragcb_clear(skb);

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
	fragcb_clear(skb);

	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	q->fragments_tail = skb;
	q->last_run_head = skb;
}

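/* Illustration (added; the offsets and sizes below are hypothetical): with
 * in-order fragments covering bytes 0-1199, 1200-2399 and 2400-3599,
 * followed by one covering 4800-5999, the queue holds two runs:
 *
 *	run 1: skb(0) -> skb(1200) -> skb(2400)	(head frag_run_len = 3600)
 *	run 2: skb(4800)			(head frag_run_len = 1200)
 *
 * Only the head skb of each run is linked into q->rb_fragments; the rest of
 * a run is chained through FRAG_CB(skb)->next_frag by
 * fragrun_append_to_last(), which also accumulates frag_run_len on the head.
 */
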
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

int inet_frags_init(struct inet_frags *f)
{
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	refcount_set(&f->refcnt, 1);
	init_completion(&f->completion);
	return 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_fini(struct inet_frags *f)
{
	if (refcount_dec_and_test(&f->refcnt))
		complete(&f->completion);

	wait_for_completion(&f->completion);

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);

/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;
	int count;

	count = del_timer_sync(&fq->timer) ? 1 : 0;

	spin_lock_bh(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		count++;
	} else if (fq->flags & INET_FRAG_HASH_DEAD) {
		count++;
	}
	spin_unlock_bh(&fq->lock);

	if (refcount_sub_and_test(count, &fq->refcnt))
		inet_frag_destroy(fq);
}

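/* Deferred fqdir teardown: fqdir_exit() queues this work, which evicts any
 * remaining queues via inet_frags_free_cb(), waits for in-flight
 * inet_frag_destroy_rcu() callbacks, drops the inet_frags reference taken in
 * fqdir_init(), and finally frees the fqdir itself.
 */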
static void fqdir_work_fn(struct work_struct *work)
{
	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
	struct inet_frags *f = fqdir->f;

	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);

	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
	 * have completed, since they need to dereference fqdir.
	 * Would it not be nice to have kfree_rcu_barrier() ? :)
	 */
	rcu_barrier();

	if (refcount_dec_and_test(&f->refcnt))
		complete(&f->completion);

	kfree(fqdir);
}

int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
	int res;

	if (!fqdir)
		return -ENOMEM;
	fqdir->f = f;
	fqdir->net = net;
	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
	if (res < 0) {
		kfree(fqdir);
		return res;
	}
	refcount_inc(&f->refcnt);
	*fqdirp = fqdir;
	return 0;
}
EXPORT_SYMBOL(fqdir_init);

void fqdir_exit(struct fqdir *fqdir)
{
	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
	queue_work(system_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);

void inet_frag_kill(struct inet_frag_queue *fq)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct fqdir *fqdir = fq->fqdir;

		fq->flags |= INET_FRAG_COMPLETE;
		rcu_read_lock();
		/* The RCU read lock provides a memory barrier
		 * guaranteeing that if fqdir->dead is false then
		 * the hash table destruction will not start until
		 * after we unlock. Paired with fqdir_pre_exit().
		 */
		if (!READ_ONCE(fqdir->dead)) {
			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
					       fqdir->f->rhash_params);
			refcount_dec(&fq->refcnt);
		} else {
			fq->flags |= INET_FRAG_HASH_DEAD;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
						 rcu);
	struct inet_frags *f = q->fqdir->f;

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}

unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
	struct rb_node *p = rb_first(root);
	unsigned int sum = 0;

	while (p) {
		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

		p = rb_next(p);
		rb_erase(&skb->rbnode, root);
		while (skb) {
			struct sk_buff *next = FRAG_CB(skb)->next_frag;

			sum += skb->truesize;
			kfree_skb(skb);
			skb = next;
		}
	}
	return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

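/* Final teardown of one queue: purge its fragment tree, return the charged
 * memory to the fqdir accounting, and free the queue from an RCU callback so
 * that concurrent lockless lookups may still safely dereference it. Callers
 * must already have marked the queue INET_FRAG_COMPLETE and stopped its
 * timer (see the WARN_ONs below).
 */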
void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct fqdir *fqdir;
	unsigned int sum, sum_truesize = 0;
	struct inet_frags *f;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fqdir = q->fqdir;
	f = fqdir->f;
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);

	sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->fqdir = fqdir;
	f->constructor(q, arg);
	add_frag_mem_limit(fqdir, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
	refcount_set(&q->refcnt, 3);

	return q;
}

static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = fqdir->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(fqdir, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + fqdir->timeout);

	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
	/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
	long high_thresh = READ_ONCE(fqdir->high_thresh);
	struct inet_frag_queue *fq = NULL, *prev;

	if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
		return NULL;

	rcu_read_lock();

	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
	if (!prev)
		fq = inet_frag_create(fqdir, key, &prev);
	if (!IS_ERR_OR_NULL(prev)) {
		fq = prev;
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
	}
	rcu_read_unlock();
	return fq;
}
EXPORT_SYMBOL(inet_frag_find);

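/* Insert one fragment, covering bytes [offset, end), into the queue's
 * rb-tree of runs. Returns IPFRAG_OK on success, IPFRAG_DUP if the range
 * falls entirely within an existing run (the skb can simply be dropped), or
 * IPFRAG_OVERLAP if it partially overlaps queued data, in which case the
 * whole queue is expected to be discarded per the RFC 5722 note below.
 */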
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID : 3089
	 *                          When reassembling an IPv6 datagram, if
	 *   one or more its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments) MUST be silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (last->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < last->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		if (offset == last->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			curr_run_end = curr->ip_defrag_offset +
					FRAG_CB(curr)->frag_run_len;
			if (end <= curr->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= curr->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	skb->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);

void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	struct sk_buff **nextp;
	int delta;

	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			return NULL;
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(head->ip_defrag_offset != 0);

	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		return NULL;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->fqdir, delta);

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			return NULL;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->fqdir, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);

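/* Second half of reassembly, paired with inet_frag_reasm_prepare(): prepare
 * turns the first fragment into the head skb and returns the location where
 * further fragments should be chained (reasm_data); this function then walks
 * the remaining runs in offset order and links (or, with try_coalesce,
 * coalesces) each fragment into the head, updating length, truesize and
 * checksum state along the way.
 */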
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data, bool try_coalesce)
{
	struct sk_buff **nextp = (struct sk_buff **)reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;
	int sum_truesize;

	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);

	sum_truesize = head->truesize;
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
			bool stolen;
			int delta;

			sum_truesize += fp->truesize;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);

			if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
							     &delta)) {
				kfree_skb_partial(fp, stolen);
			} else {
				fp->prev = NULL;
				memset(&fp->rbnode, 0, sizeof(fp->rbnode));
				fp->sk = NULL;

				head->data_len += fp->len;
				head->len += fp->len;
				head->truesize += fp->truesize;

				*nextp = fp;
				nextp = &fp->next;
			}

			fp = next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->fqdir, sum_truesize);

	*nextp = NULL;
	skb_mark_not_on_list(head);
	head->prev = NULL;
	head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head, *skb;

	head = skb_rb_first(&q->rb_fragments);
	if (!head)
		return NULL;
	skb = FRAG_CB(head)->next_frag;
	if (skb)
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
	else
		rb_erase(&head->rbnode, &q->rb_fragments);
	memset(&head->rbnode, 0, sizeof(head->rbnode));
	barrier();

	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->fqdir, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);

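/*
 * Illustrative call sequence (added sketch, not part of the original file;
 * the my_* names are hypothetical and all error handling is omitted):
 *
 *	// protocol setup: my_frags supplies constructor/destructor/frag_expire
 *	inet_frags_init(&my_frags);
 *
 *	// per-namespace setup
 *	fqdir_init(&my_fqdir, &my_frags, net);
 *
 *	// receive path, for each fragment skb
 *	q = inet_frag_find(my_fqdir, &key);
 *	err = inet_frag_queue_insert(q, skb, offset, end);
 *	...
 *	// once all fragments have arrived
 *	inet_frag_kill(q);		// take the queue out of the hash first
 *	reasm_data = inet_frag_reasm_prepare(q, skb, parent);
 *	inet_frag_reasm_finish(q, head, reasm_data, true);
 *	// callers also drop the reference taken by inet_frag_find()
 *
 *	// per-namespace and protocol teardown
 *	fqdir_exit(my_fqdir);
 *	inet_frags_fini(&my_frags);
 */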