18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * inet fragments management 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Authors: Pavel Emelyanov <xemul@openvz.org> 68c2ecf20Sopenharmony_ci * Started as consolidation of ipv4/ip_fragment.c, 78c2ecf20Sopenharmony_ci * ipv6/reassembly. and ipv6 nf conntrack reassembly 88c2ecf20Sopenharmony_ci */ 98c2ecf20Sopenharmony_ci 108c2ecf20Sopenharmony_ci#include <linux/list.h> 118c2ecf20Sopenharmony_ci#include <linux/spinlock.h> 128c2ecf20Sopenharmony_ci#include <linux/module.h> 138c2ecf20Sopenharmony_ci#include <linux/timer.h> 148c2ecf20Sopenharmony_ci#include <linux/mm.h> 158c2ecf20Sopenharmony_ci#include <linux/random.h> 168c2ecf20Sopenharmony_ci#include <linux/skbuff.h> 178c2ecf20Sopenharmony_ci#include <linux/rtnetlink.h> 188c2ecf20Sopenharmony_ci#include <linux/slab.h> 198c2ecf20Sopenharmony_ci#include <linux/rhashtable.h> 208c2ecf20Sopenharmony_ci 218c2ecf20Sopenharmony_ci#include <net/sock.h> 228c2ecf20Sopenharmony_ci#include <net/inet_frag.h> 238c2ecf20Sopenharmony_ci#include <net/inet_ecn.h> 248c2ecf20Sopenharmony_ci#include <net/ip.h> 258c2ecf20Sopenharmony_ci#include <net/ipv6.h> 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_ci/* Use skb->cb to track consecutive/adjacent fragments coming at 288c2ecf20Sopenharmony_ci * the end of the queue. Nodes in the rb-tree queue will 298c2ecf20Sopenharmony_ci * contain "runs" of one or more adjacent fragments. 308c2ecf20Sopenharmony_ci * 318c2ecf20Sopenharmony_ci * Invariants: 328c2ecf20Sopenharmony_ci * - next_frag is NULL at the tail of a "run"; 338c2ecf20Sopenharmony_ci * - the head of a "run" has the sum of all fragment lengths in frag_run_len. 348c2ecf20Sopenharmony_ci */ 358c2ecf20Sopenharmony_cistruct ipfrag_skb_cb { 368c2ecf20Sopenharmony_ci union { 378c2ecf20Sopenharmony_ci struct inet_skb_parm h4; 388c2ecf20Sopenharmony_ci struct inet6_skb_parm h6; 398c2ecf20Sopenharmony_ci }; 408c2ecf20Sopenharmony_ci struct sk_buff *next_frag; 418c2ecf20Sopenharmony_ci int frag_run_len; 428c2ecf20Sopenharmony_ci}; 438c2ecf20Sopenharmony_ci 448c2ecf20Sopenharmony_ci#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_cistatic void fragcb_clear(struct sk_buff *skb) 478c2ecf20Sopenharmony_ci{ 488c2ecf20Sopenharmony_ci RB_CLEAR_NODE(&skb->rbnode); 498c2ecf20Sopenharmony_ci FRAG_CB(skb)->next_frag = NULL; 508c2ecf20Sopenharmony_ci FRAG_CB(skb)->frag_run_len = skb->len; 518c2ecf20Sopenharmony_ci} 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci/* Append skb to the last "run". */ 548c2ecf20Sopenharmony_cistatic void fragrun_append_to_last(struct inet_frag_queue *q, 558c2ecf20Sopenharmony_ci struct sk_buff *skb) 568c2ecf20Sopenharmony_ci{ 578c2ecf20Sopenharmony_ci fragcb_clear(skb); 588c2ecf20Sopenharmony_ci 598c2ecf20Sopenharmony_ci FRAG_CB(q->last_run_head)->frag_run_len += skb->len; 608c2ecf20Sopenharmony_ci FRAG_CB(q->fragments_tail)->next_frag = skb; 618c2ecf20Sopenharmony_ci q->fragments_tail = skb; 628c2ecf20Sopenharmony_ci} 638c2ecf20Sopenharmony_ci 648c2ecf20Sopenharmony_ci/* Create a new "run" with the skb. */ 658c2ecf20Sopenharmony_cistatic void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) 668c2ecf20Sopenharmony_ci{ 678c2ecf20Sopenharmony_ci BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); 688c2ecf20Sopenharmony_ci fragcb_clear(skb); 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_ci if (q->last_run_head) 718c2ecf20Sopenharmony_ci rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, 728c2ecf20Sopenharmony_ci &q->last_run_head->rbnode.rb_right); 738c2ecf20Sopenharmony_ci else 748c2ecf20Sopenharmony_ci rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); 758c2ecf20Sopenharmony_ci rb_insert_color(&skb->rbnode, &q->rb_fragments); 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci q->fragments_tail = skb; 788c2ecf20Sopenharmony_ci q->last_run_head = skb; 798c2ecf20Sopenharmony_ci} 808c2ecf20Sopenharmony_ci 818c2ecf20Sopenharmony_ci/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements 828c2ecf20Sopenharmony_ci * Value : 0xff if frame should be dropped. 838c2ecf20Sopenharmony_ci * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field 848c2ecf20Sopenharmony_ci */ 858c2ecf20Sopenharmony_ciconst u8 ip_frag_ecn_table[16] = { 868c2ecf20Sopenharmony_ci /* at least one fragment had CE, and others ECT_0 or ECT_1 */ 878c2ecf20Sopenharmony_ci [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, 888c2ecf20Sopenharmony_ci [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, 898c2ecf20Sopenharmony_ci [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, 908c2ecf20Sopenharmony_ci 918c2ecf20Sopenharmony_ci /* invalid combinations : drop frame */ 928c2ecf20Sopenharmony_ci [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, 938c2ecf20Sopenharmony_ci [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, 948c2ecf20Sopenharmony_ci [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, 958c2ecf20Sopenharmony_ci [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, 968c2ecf20Sopenharmony_ci [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, 978c2ecf20Sopenharmony_ci [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, 988c2ecf20Sopenharmony_ci [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, 998c2ecf20Sopenharmony_ci}; 1008c2ecf20Sopenharmony_ciEXPORT_SYMBOL(ip_frag_ecn_table); 1018c2ecf20Sopenharmony_ci 1028c2ecf20Sopenharmony_ciint inet_frags_init(struct inet_frags *f) 1038c2ecf20Sopenharmony_ci{ 1048c2ecf20Sopenharmony_ci f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, 1058c2ecf20Sopenharmony_ci NULL); 1068c2ecf20Sopenharmony_ci if (!f->frags_cachep) 1078c2ecf20Sopenharmony_ci return -ENOMEM; 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci refcount_set(&f->refcnt, 1); 1108c2ecf20Sopenharmony_ci init_completion(&f->completion); 1118c2ecf20Sopenharmony_ci return 0; 1128c2ecf20Sopenharmony_ci} 1138c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frags_init); 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_civoid inet_frags_fini(struct inet_frags *f) 1168c2ecf20Sopenharmony_ci{ 1178c2ecf20Sopenharmony_ci if (refcount_dec_and_test(&f->refcnt)) 1188c2ecf20Sopenharmony_ci complete(&f->completion); 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci wait_for_completion(&f->completion); 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ci kmem_cache_destroy(f->frags_cachep); 1238c2ecf20Sopenharmony_ci f->frags_cachep = NULL; 1248c2ecf20Sopenharmony_ci} 1258c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frags_fini); 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci/* called from rhashtable_free_and_destroy() at netns_frags dismantle */ 1288c2ecf20Sopenharmony_cistatic void inet_frags_free_cb(void *ptr, void *arg) 1298c2ecf20Sopenharmony_ci{ 1308c2ecf20Sopenharmony_ci struct inet_frag_queue *fq = ptr; 1318c2ecf20Sopenharmony_ci int count; 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_ci count = del_timer_sync(&fq->timer) ? 1 : 0; 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_ci spin_lock_bh(&fq->lock); 1368c2ecf20Sopenharmony_ci if (!(fq->flags & INET_FRAG_COMPLETE)) { 1378c2ecf20Sopenharmony_ci fq->flags |= INET_FRAG_COMPLETE; 1388c2ecf20Sopenharmony_ci count++; 1398c2ecf20Sopenharmony_ci } else if (fq->flags & INET_FRAG_HASH_DEAD) { 1408c2ecf20Sopenharmony_ci count++; 1418c2ecf20Sopenharmony_ci } 1428c2ecf20Sopenharmony_ci spin_unlock_bh(&fq->lock); 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ci if (refcount_sub_and_test(count, &fq->refcnt)) 1458c2ecf20Sopenharmony_ci inet_frag_destroy(fq); 1468c2ecf20Sopenharmony_ci} 1478c2ecf20Sopenharmony_ci 1488c2ecf20Sopenharmony_cistatic void fqdir_work_fn(struct work_struct *work) 1498c2ecf20Sopenharmony_ci{ 1508c2ecf20Sopenharmony_ci struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); 1518c2ecf20Sopenharmony_ci struct inet_frags *f = fqdir->f; 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); 1548c2ecf20Sopenharmony_ci 1558c2ecf20Sopenharmony_ci /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) 1568c2ecf20Sopenharmony_ci * have completed, since they need to dereference fqdir. 1578c2ecf20Sopenharmony_ci * Would it not be nice to have kfree_rcu_barrier() ? :) 1588c2ecf20Sopenharmony_ci */ 1598c2ecf20Sopenharmony_ci rcu_barrier(); 1608c2ecf20Sopenharmony_ci 1618c2ecf20Sopenharmony_ci if (refcount_dec_and_test(&f->refcnt)) 1628c2ecf20Sopenharmony_ci complete(&f->completion); 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_ci kfree(fqdir); 1658c2ecf20Sopenharmony_ci} 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ciint fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) 1688c2ecf20Sopenharmony_ci{ 1698c2ecf20Sopenharmony_ci struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); 1708c2ecf20Sopenharmony_ci int res; 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_ci if (!fqdir) 1738c2ecf20Sopenharmony_ci return -ENOMEM; 1748c2ecf20Sopenharmony_ci fqdir->f = f; 1758c2ecf20Sopenharmony_ci fqdir->net = net; 1768c2ecf20Sopenharmony_ci res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); 1778c2ecf20Sopenharmony_ci if (res < 0) { 1788c2ecf20Sopenharmony_ci kfree(fqdir); 1798c2ecf20Sopenharmony_ci return res; 1808c2ecf20Sopenharmony_ci } 1818c2ecf20Sopenharmony_ci refcount_inc(&f->refcnt); 1828c2ecf20Sopenharmony_ci *fqdirp = fqdir; 1838c2ecf20Sopenharmony_ci return 0; 1848c2ecf20Sopenharmony_ci} 1858c2ecf20Sopenharmony_ciEXPORT_SYMBOL(fqdir_init); 1868c2ecf20Sopenharmony_ci 1878c2ecf20Sopenharmony_civoid fqdir_exit(struct fqdir *fqdir) 1888c2ecf20Sopenharmony_ci{ 1898c2ecf20Sopenharmony_ci INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); 1908c2ecf20Sopenharmony_ci queue_work(system_wq, &fqdir->destroy_work); 1918c2ecf20Sopenharmony_ci} 1928c2ecf20Sopenharmony_ciEXPORT_SYMBOL(fqdir_exit); 1938c2ecf20Sopenharmony_ci 1948c2ecf20Sopenharmony_civoid inet_frag_kill(struct inet_frag_queue *fq) 1958c2ecf20Sopenharmony_ci{ 1968c2ecf20Sopenharmony_ci if (del_timer(&fq->timer)) 1978c2ecf20Sopenharmony_ci refcount_dec(&fq->refcnt); 1988c2ecf20Sopenharmony_ci 1998c2ecf20Sopenharmony_ci if (!(fq->flags & INET_FRAG_COMPLETE)) { 2008c2ecf20Sopenharmony_ci struct fqdir *fqdir = fq->fqdir; 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci fq->flags |= INET_FRAG_COMPLETE; 2038c2ecf20Sopenharmony_ci rcu_read_lock(); 2048c2ecf20Sopenharmony_ci /* The RCU read lock provides a memory barrier 2058c2ecf20Sopenharmony_ci * guaranteeing that if fqdir->dead is false then 2068c2ecf20Sopenharmony_ci * the hash table destruction will not start until 2078c2ecf20Sopenharmony_ci * after we unlock. Paired with fqdir_pre_exit(). 2088c2ecf20Sopenharmony_ci */ 2098c2ecf20Sopenharmony_ci if (!READ_ONCE(fqdir->dead)) { 2108c2ecf20Sopenharmony_ci rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, 2118c2ecf20Sopenharmony_ci fqdir->f->rhash_params); 2128c2ecf20Sopenharmony_ci refcount_dec(&fq->refcnt); 2138c2ecf20Sopenharmony_ci } else { 2148c2ecf20Sopenharmony_ci fq->flags |= INET_FRAG_HASH_DEAD; 2158c2ecf20Sopenharmony_ci } 2168c2ecf20Sopenharmony_ci rcu_read_unlock(); 2178c2ecf20Sopenharmony_ci } 2188c2ecf20Sopenharmony_ci} 2198c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_kill); 2208c2ecf20Sopenharmony_ci 2218c2ecf20Sopenharmony_cistatic void inet_frag_destroy_rcu(struct rcu_head *head) 2228c2ecf20Sopenharmony_ci{ 2238c2ecf20Sopenharmony_ci struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, 2248c2ecf20Sopenharmony_ci rcu); 2258c2ecf20Sopenharmony_ci struct inet_frags *f = q->fqdir->f; 2268c2ecf20Sopenharmony_ci 2278c2ecf20Sopenharmony_ci if (f->destructor) 2288c2ecf20Sopenharmony_ci f->destructor(q); 2298c2ecf20Sopenharmony_ci kmem_cache_free(f->frags_cachep, q); 2308c2ecf20Sopenharmony_ci} 2318c2ecf20Sopenharmony_ci 2328c2ecf20Sopenharmony_ciunsigned int inet_frag_rbtree_purge(struct rb_root *root) 2338c2ecf20Sopenharmony_ci{ 2348c2ecf20Sopenharmony_ci struct rb_node *p = rb_first(root); 2358c2ecf20Sopenharmony_ci unsigned int sum = 0; 2368c2ecf20Sopenharmony_ci 2378c2ecf20Sopenharmony_ci while (p) { 2388c2ecf20Sopenharmony_ci struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 2398c2ecf20Sopenharmony_ci 2408c2ecf20Sopenharmony_ci p = rb_next(p); 2418c2ecf20Sopenharmony_ci rb_erase(&skb->rbnode, root); 2428c2ecf20Sopenharmony_ci while (skb) { 2438c2ecf20Sopenharmony_ci struct sk_buff *next = FRAG_CB(skb)->next_frag; 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_ci sum += skb->truesize; 2468c2ecf20Sopenharmony_ci kfree_skb(skb); 2478c2ecf20Sopenharmony_ci skb = next; 2488c2ecf20Sopenharmony_ci } 2498c2ecf20Sopenharmony_ci } 2508c2ecf20Sopenharmony_ci return sum; 2518c2ecf20Sopenharmony_ci} 2528c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_rbtree_purge); 2538c2ecf20Sopenharmony_ci 2548c2ecf20Sopenharmony_civoid inet_frag_destroy(struct inet_frag_queue *q) 2558c2ecf20Sopenharmony_ci{ 2568c2ecf20Sopenharmony_ci struct fqdir *fqdir; 2578c2ecf20Sopenharmony_ci unsigned int sum, sum_truesize = 0; 2588c2ecf20Sopenharmony_ci struct inet_frags *f; 2598c2ecf20Sopenharmony_ci 2608c2ecf20Sopenharmony_ci WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); 2618c2ecf20Sopenharmony_ci WARN_ON(del_timer(&q->timer) != 0); 2628c2ecf20Sopenharmony_ci 2638c2ecf20Sopenharmony_ci /* Release all fragment data. */ 2648c2ecf20Sopenharmony_ci fqdir = q->fqdir; 2658c2ecf20Sopenharmony_ci f = fqdir->f; 2668c2ecf20Sopenharmony_ci sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); 2678c2ecf20Sopenharmony_ci sum = sum_truesize + f->qsize; 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_ci call_rcu(&q->rcu, inet_frag_destroy_rcu); 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_ci sub_frag_mem_limit(fqdir, sum); 2728c2ecf20Sopenharmony_ci} 2738c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_destroy); 2748c2ecf20Sopenharmony_ci 2758c2ecf20Sopenharmony_cistatic struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, 2768c2ecf20Sopenharmony_ci struct inet_frags *f, 2778c2ecf20Sopenharmony_ci void *arg) 2788c2ecf20Sopenharmony_ci{ 2798c2ecf20Sopenharmony_ci struct inet_frag_queue *q; 2808c2ecf20Sopenharmony_ci 2818c2ecf20Sopenharmony_ci q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); 2828c2ecf20Sopenharmony_ci if (!q) 2838c2ecf20Sopenharmony_ci return NULL; 2848c2ecf20Sopenharmony_ci 2858c2ecf20Sopenharmony_ci q->fqdir = fqdir; 2868c2ecf20Sopenharmony_ci f->constructor(q, arg); 2878c2ecf20Sopenharmony_ci add_frag_mem_limit(fqdir, f->qsize); 2888c2ecf20Sopenharmony_ci 2898c2ecf20Sopenharmony_ci timer_setup(&q->timer, f->frag_expire, 0); 2908c2ecf20Sopenharmony_ci spin_lock_init(&q->lock); 2918c2ecf20Sopenharmony_ci refcount_set(&q->refcnt, 3); 2928c2ecf20Sopenharmony_ci 2938c2ecf20Sopenharmony_ci return q; 2948c2ecf20Sopenharmony_ci} 2958c2ecf20Sopenharmony_ci 2968c2ecf20Sopenharmony_cistatic struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, 2978c2ecf20Sopenharmony_ci void *arg, 2988c2ecf20Sopenharmony_ci struct inet_frag_queue **prev) 2998c2ecf20Sopenharmony_ci{ 3008c2ecf20Sopenharmony_ci struct inet_frags *f = fqdir->f; 3018c2ecf20Sopenharmony_ci struct inet_frag_queue *q; 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci q = inet_frag_alloc(fqdir, f, arg); 3048c2ecf20Sopenharmony_ci if (!q) { 3058c2ecf20Sopenharmony_ci *prev = ERR_PTR(-ENOMEM); 3068c2ecf20Sopenharmony_ci return NULL; 3078c2ecf20Sopenharmony_ci } 3088c2ecf20Sopenharmony_ci mod_timer(&q->timer, jiffies + fqdir->timeout); 3098c2ecf20Sopenharmony_ci 3108c2ecf20Sopenharmony_ci *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, 3118c2ecf20Sopenharmony_ci &q->node, f->rhash_params); 3128c2ecf20Sopenharmony_ci if (*prev) { 3138c2ecf20Sopenharmony_ci q->flags |= INET_FRAG_COMPLETE; 3148c2ecf20Sopenharmony_ci inet_frag_kill(q); 3158c2ecf20Sopenharmony_ci inet_frag_destroy(q); 3168c2ecf20Sopenharmony_ci return NULL; 3178c2ecf20Sopenharmony_ci } 3188c2ecf20Sopenharmony_ci return q; 3198c2ecf20Sopenharmony_ci} 3208c2ecf20Sopenharmony_ci 3218c2ecf20Sopenharmony_ci/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ 3228c2ecf20Sopenharmony_cistruct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) 3238c2ecf20Sopenharmony_ci{ 3248c2ecf20Sopenharmony_ci /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ 3258c2ecf20Sopenharmony_ci long high_thresh = READ_ONCE(fqdir->high_thresh); 3268c2ecf20Sopenharmony_ci struct inet_frag_queue *fq = NULL, *prev; 3278c2ecf20Sopenharmony_ci 3288c2ecf20Sopenharmony_ci if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) 3298c2ecf20Sopenharmony_ci return NULL; 3308c2ecf20Sopenharmony_ci 3318c2ecf20Sopenharmony_ci rcu_read_lock(); 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_ci prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); 3348c2ecf20Sopenharmony_ci if (!prev) 3358c2ecf20Sopenharmony_ci fq = inet_frag_create(fqdir, key, &prev); 3368c2ecf20Sopenharmony_ci if (!IS_ERR_OR_NULL(prev)) { 3378c2ecf20Sopenharmony_ci fq = prev; 3388c2ecf20Sopenharmony_ci if (!refcount_inc_not_zero(&fq->refcnt)) 3398c2ecf20Sopenharmony_ci fq = NULL; 3408c2ecf20Sopenharmony_ci } 3418c2ecf20Sopenharmony_ci rcu_read_unlock(); 3428c2ecf20Sopenharmony_ci return fq; 3438c2ecf20Sopenharmony_ci} 3448c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_find); 3458c2ecf20Sopenharmony_ci 3468c2ecf20Sopenharmony_ciint inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, 3478c2ecf20Sopenharmony_ci int offset, int end) 3488c2ecf20Sopenharmony_ci{ 3498c2ecf20Sopenharmony_ci struct sk_buff *last = q->fragments_tail; 3508c2ecf20Sopenharmony_ci 3518c2ecf20Sopenharmony_ci /* RFC5722, Section 4, amended by Errata ID : 3089 3528c2ecf20Sopenharmony_ci * When reassembling an IPv6 datagram, if 3538c2ecf20Sopenharmony_ci * one or more its constituent fragments is determined to be an 3548c2ecf20Sopenharmony_ci * overlapping fragment, the entire datagram (and any constituent 3558c2ecf20Sopenharmony_ci * fragments) MUST be silently discarded. 3568c2ecf20Sopenharmony_ci * 3578c2ecf20Sopenharmony_ci * Duplicates, however, should be ignored (i.e. skb dropped, but the 3588c2ecf20Sopenharmony_ci * queue/fragments kept for later reassembly). 3598c2ecf20Sopenharmony_ci */ 3608c2ecf20Sopenharmony_ci if (!last) 3618c2ecf20Sopenharmony_ci fragrun_create(q, skb); /* First fragment. */ 3628c2ecf20Sopenharmony_ci else if (last->ip_defrag_offset + last->len < end) { 3638c2ecf20Sopenharmony_ci /* This is the common case: skb goes to the end. */ 3648c2ecf20Sopenharmony_ci /* Detect and discard overlaps. */ 3658c2ecf20Sopenharmony_ci if (offset < last->ip_defrag_offset + last->len) 3668c2ecf20Sopenharmony_ci return IPFRAG_OVERLAP; 3678c2ecf20Sopenharmony_ci if (offset == last->ip_defrag_offset + last->len) 3688c2ecf20Sopenharmony_ci fragrun_append_to_last(q, skb); 3698c2ecf20Sopenharmony_ci else 3708c2ecf20Sopenharmony_ci fragrun_create(q, skb); 3718c2ecf20Sopenharmony_ci } else { 3728c2ecf20Sopenharmony_ci /* Binary search. Note that skb can become the first fragment, 3738c2ecf20Sopenharmony_ci * but not the last (covered above). 3748c2ecf20Sopenharmony_ci */ 3758c2ecf20Sopenharmony_ci struct rb_node **rbn, *parent; 3768c2ecf20Sopenharmony_ci 3778c2ecf20Sopenharmony_ci rbn = &q->rb_fragments.rb_node; 3788c2ecf20Sopenharmony_ci do { 3798c2ecf20Sopenharmony_ci struct sk_buff *curr; 3808c2ecf20Sopenharmony_ci int curr_run_end; 3818c2ecf20Sopenharmony_ci 3828c2ecf20Sopenharmony_ci parent = *rbn; 3838c2ecf20Sopenharmony_ci curr = rb_to_skb(parent); 3848c2ecf20Sopenharmony_ci curr_run_end = curr->ip_defrag_offset + 3858c2ecf20Sopenharmony_ci FRAG_CB(curr)->frag_run_len; 3868c2ecf20Sopenharmony_ci if (end <= curr->ip_defrag_offset) 3878c2ecf20Sopenharmony_ci rbn = &parent->rb_left; 3888c2ecf20Sopenharmony_ci else if (offset >= curr_run_end) 3898c2ecf20Sopenharmony_ci rbn = &parent->rb_right; 3908c2ecf20Sopenharmony_ci else if (offset >= curr->ip_defrag_offset && 3918c2ecf20Sopenharmony_ci end <= curr_run_end) 3928c2ecf20Sopenharmony_ci return IPFRAG_DUP; 3938c2ecf20Sopenharmony_ci else 3948c2ecf20Sopenharmony_ci return IPFRAG_OVERLAP; 3958c2ecf20Sopenharmony_ci } while (*rbn); 3968c2ecf20Sopenharmony_ci /* Here we have parent properly set, and rbn pointing to 3978c2ecf20Sopenharmony_ci * one of its NULL left/right children. Insert skb. 3988c2ecf20Sopenharmony_ci */ 3998c2ecf20Sopenharmony_ci fragcb_clear(skb); 4008c2ecf20Sopenharmony_ci rb_link_node(&skb->rbnode, parent, rbn); 4018c2ecf20Sopenharmony_ci rb_insert_color(&skb->rbnode, &q->rb_fragments); 4028c2ecf20Sopenharmony_ci } 4038c2ecf20Sopenharmony_ci 4048c2ecf20Sopenharmony_ci skb->ip_defrag_offset = offset; 4058c2ecf20Sopenharmony_ci 4068c2ecf20Sopenharmony_ci return IPFRAG_OK; 4078c2ecf20Sopenharmony_ci} 4088c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_queue_insert); 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_civoid *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, 4118c2ecf20Sopenharmony_ci struct sk_buff *parent) 4128c2ecf20Sopenharmony_ci{ 4138c2ecf20Sopenharmony_ci struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); 4148c2ecf20Sopenharmony_ci struct sk_buff **nextp; 4158c2ecf20Sopenharmony_ci int delta; 4168c2ecf20Sopenharmony_ci 4178c2ecf20Sopenharmony_ci if (head != skb) { 4188c2ecf20Sopenharmony_ci fp = skb_clone(skb, GFP_ATOMIC); 4198c2ecf20Sopenharmony_ci if (!fp) 4208c2ecf20Sopenharmony_ci return NULL; 4218c2ecf20Sopenharmony_ci FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; 4228c2ecf20Sopenharmony_ci if (RB_EMPTY_NODE(&skb->rbnode)) 4238c2ecf20Sopenharmony_ci FRAG_CB(parent)->next_frag = fp; 4248c2ecf20Sopenharmony_ci else 4258c2ecf20Sopenharmony_ci rb_replace_node(&skb->rbnode, &fp->rbnode, 4268c2ecf20Sopenharmony_ci &q->rb_fragments); 4278c2ecf20Sopenharmony_ci if (q->fragments_tail == skb) 4288c2ecf20Sopenharmony_ci q->fragments_tail = fp; 4298c2ecf20Sopenharmony_ci skb_morph(skb, head); 4308c2ecf20Sopenharmony_ci FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; 4318c2ecf20Sopenharmony_ci rb_replace_node(&head->rbnode, &skb->rbnode, 4328c2ecf20Sopenharmony_ci &q->rb_fragments); 4338c2ecf20Sopenharmony_ci consume_skb(head); 4348c2ecf20Sopenharmony_ci head = skb; 4358c2ecf20Sopenharmony_ci } 4368c2ecf20Sopenharmony_ci WARN_ON(head->ip_defrag_offset != 0); 4378c2ecf20Sopenharmony_ci 4388c2ecf20Sopenharmony_ci delta = -head->truesize; 4398c2ecf20Sopenharmony_ci 4408c2ecf20Sopenharmony_ci /* Head of list must not be cloned. */ 4418c2ecf20Sopenharmony_ci if (skb_unclone(head, GFP_ATOMIC)) 4428c2ecf20Sopenharmony_ci return NULL; 4438c2ecf20Sopenharmony_ci 4448c2ecf20Sopenharmony_ci delta += head->truesize; 4458c2ecf20Sopenharmony_ci if (delta) 4468c2ecf20Sopenharmony_ci add_frag_mem_limit(q->fqdir, delta); 4478c2ecf20Sopenharmony_ci 4488c2ecf20Sopenharmony_ci /* If the first fragment is fragmented itself, we split 4498c2ecf20Sopenharmony_ci * it to two chunks: the first with data and paged part 4508c2ecf20Sopenharmony_ci * and the second, holding only fragments. 4518c2ecf20Sopenharmony_ci */ 4528c2ecf20Sopenharmony_ci if (skb_has_frag_list(head)) { 4538c2ecf20Sopenharmony_ci struct sk_buff *clone; 4548c2ecf20Sopenharmony_ci int i, plen = 0; 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci clone = alloc_skb(0, GFP_ATOMIC); 4578c2ecf20Sopenharmony_ci if (!clone) 4588c2ecf20Sopenharmony_ci return NULL; 4598c2ecf20Sopenharmony_ci skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; 4608c2ecf20Sopenharmony_ci skb_frag_list_init(head); 4618c2ecf20Sopenharmony_ci for (i = 0; i < skb_shinfo(head)->nr_frags; i++) 4628c2ecf20Sopenharmony_ci plen += skb_frag_size(&skb_shinfo(head)->frags[i]); 4638c2ecf20Sopenharmony_ci clone->data_len = head->data_len - plen; 4648c2ecf20Sopenharmony_ci clone->len = clone->data_len; 4658c2ecf20Sopenharmony_ci head->truesize += clone->truesize; 4668c2ecf20Sopenharmony_ci clone->csum = 0; 4678c2ecf20Sopenharmony_ci clone->ip_summed = head->ip_summed; 4688c2ecf20Sopenharmony_ci add_frag_mem_limit(q->fqdir, clone->truesize); 4698c2ecf20Sopenharmony_ci skb_shinfo(head)->frag_list = clone; 4708c2ecf20Sopenharmony_ci nextp = &clone->next; 4718c2ecf20Sopenharmony_ci } else { 4728c2ecf20Sopenharmony_ci nextp = &skb_shinfo(head)->frag_list; 4738c2ecf20Sopenharmony_ci } 4748c2ecf20Sopenharmony_ci 4758c2ecf20Sopenharmony_ci return nextp; 4768c2ecf20Sopenharmony_ci} 4778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_reasm_prepare); 4788c2ecf20Sopenharmony_ci 4798c2ecf20Sopenharmony_civoid inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, 4808c2ecf20Sopenharmony_ci void *reasm_data, bool try_coalesce) 4818c2ecf20Sopenharmony_ci{ 4828c2ecf20Sopenharmony_ci struct sk_buff **nextp = (struct sk_buff **)reasm_data; 4838c2ecf20Sopenharmony_ci struct rb_node *rbn; 4848c2ecf20Sopenharmony_ci struct sk_buff *fp; 4858c2ecf20Sopenharmony_ci int sum_truesize; 4868c2ecf20Sopenharmony_ci 4878c2ecf20Sopenharmony_ci skb_push(head, head->data - skb_network_header(head)); 4888c2ecf20Sopenharmony_ci 4898c2ecf20Sopenharmony_ci /* Traverse the tree in order, to build frag_list. */ 4908c2ecf20Sopenharmony_ci fp = FRAG_CB(head)->next_frag; 4918c2ecf20Sopenharmony_ci rbn = rb_next(&head->rbnode); 4928c2ecf20Sopenharmony_ci rb_erase(&head->rbnode, &q->rb_fragments); 4938c2ecf20Sopenharmony_ci 4948c2ecf20Sopenharmony_ci sum_truesize = head->truesize; 4958c2ecf20Sopenharmony_ci while (rbn || fp) { 4968c2ecf20Sopenharmony_ci /* fp points to the next sk_buff in the current run; 4978c2ecf20Sopenharmony_ci * rbn points to the next run. 4988c2ecf20Sopenharmony_ci */ 4998c2ecf20Sopenharmony_ci /* Go through the current run. */ 5008c2ecf20Sopenharmony_ci while (fp) { 5018c2ecf20Sopenharmony_ci struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; 5028c2ecf20Sopenharmony_ci bool stolen; 5038c2ecf20Sopenharmony_ci int delta; 5048c2ecf20Sopenharmony_ci 5058c2ecf20Sopenharmony_ci sum_truesize += fp->truesize; 5068c2ecf20Sopenharmony_ci if (head->ip_summed != fp->ip_summed) 5078c2ecf20Sopenharmony_ci head->ip_summed = CHECKSUM_NONE; 5088c2ecf20Sopenharmony_ci else if (head->ip_summed == CHECKSUM_COMPLETE) 5098c2ecf20Sopenharmony_ci head->csum = csum_add(head->csum, fp->csum); 5108c2ecf20Sopenharmony_ci 5118c2ecf20Sopenharmony_ci if (try_coalesce && skb_try_coalesce(head, fp, &stolen, 5128c2ecf20Sopenharmony_ci &delta)) { 5138c2ecf20Sopenharmony_ci kfree_skb_partial(fp, stolen); 5148c2ecf20Sopenharmony_ci } else { 5158c2ecf20Sopenharmony_ci fp->prev = NULL; 5168c2ecf20Sopenharmony_ci memset(&fp->rbnode, 0, sizeof(fp->rbnode)); 5178c2ecf20Sopenharmony_ci fp->sk = NULL; 5188c2ecf20Sopenharmony_ci 5198c2ecf20Sopenharmony_ci head->data_len += fp->len; 5208c2ecf20Sopenharmony_ci head->len += fp->len; 5218c2ecf20Sopenharmony_ci head->truesize += fp->truesize; 5228c2ecf20Sopenharmony_ci 5238c2ecf20Sopenharmony_ci *nextp = fp; 5248c2ecf20Sopenharmony_ci nextp = &fp->next; 5258c2ecf20Sopenharmony_ci } 5268c2ecf20Sopenharmony_ci 5278c2ecf20Sopenharmony_ci fp = next_frag; 5288c2ecf20Sopenharmony_ci } 5298c2ecf20Sopenharmony_ci /* Move to the next run. */ 5308c2ecf20Sopenharmony_ci if (rbn) { 5318c2ecf20Sopenharmony_ci struct rb_node *rbnext = rb_next(rbn); 5328c2ecf20Sopenharmony_ci 5338c2ecf20Sopenharmony_ci fp = rb_to_skb(rbn); 5348c2ecf20Sopenharmony_ci rb_erase(rbn, &q->rb_fragments); 5358c2ecf20Sopenharmony_ci rbn = rbnext; 5368c2ecf20Sopenharmony_ci } 5378c2ecf20Sopenharmony_ci } 5388c2ecf20Sopenharmony_ci sub_frag_mem_limit(q->fqdir, sum_truesize); 5398c2ecf20Sopenharmony_ci 5408c2ecf20Sopenharmony_ci *nextp = NULL; 5418c2ecf20Sopenharmony_ci skb_mark_not_on_list(head); 5428c2ecf20Sopenharmony_ci head->prev = NULL; 5438c2ecf20Sopenharmony_ci head->tstamp = q->stamp; 5448c2ecf20Sopenharmony_ci} 5458c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_reasm_finish); 5468c2ecf20Sopenharmony_ci 5478c2ecf20Sopenharmony_cistruct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) 5488c2ecf20Sopenharmony_ci{ 5498c2ecf20Sopenharmony_ci struct sk_buff *head, *skb; 5508c2ecf20Sopenharmony_ci 5518c2ecf20Sopenharmony_ci head = skb_rb_first(&q->rb_fragments); 5528c2ecf20Sopenharmony_ci if (!head) 5538c2ecf20Sopenharmony_ci return NULL; 5548c2ecf20Sopenharmony_ci skb = FRAG_CB(head)->next_frag; 5558c2ecf20Sopenharmony_ci if (skb) 5568c2ecf20Sopenharmony_ci rb_replace_node(&head->rbnode, &skb->rbnode, 5578c2ecf20Sopenharmony_ci &q->rb_fragments); 5588c2ecf20Sopenharmony_ci else 5598c2ecf20Sopenharmony_ci rb_erase(&head->rbnode, &q->rb_fragments); 5608c2ecf20Sopenharmony_ci memset(&head->rbnode, 0, sizeof(head->rbnode)); 5618c2ecf20Sopenharmony_ci barrier(); 5628c2ecf20Sopenharmony_ci 5638c2ecf20Sopenharmony_ci if (head == q->fragments_tail) 5648c2ecf20Sopenharmony_ci q->fragments_tail = NULL; 5658c2ecf20Sopenharmony_ci 5668c2ecf20Sopenharmony_ci sub_frag_mem_limit(q->fqdir, head->truesize); 5678c2ecf20Sopenharmony_ci 5688c2ecf20Sopenharmony_ci return head; 5698c2ecf20Sopenharmony_ci} 5708c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_pull_head); 571