18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * inet fragments management
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
68c2ecf20Sopenharmony_ci *				Started as consolidation of ipv4/ip_fragment.c,
 *				ipv6/reassembly.c and ipv6 nf conntrack reassembly
88c2ecf20Sopenharmony_ci */
98c2ecf20Sopenharmony_ci
108c2ecf20Sopenharmony_ci#include <linux/list.h>
118c2ecf20Sopenharmony_ci#include <linux/spinlock.h>
128c2ecf20Sopenharmony_ci#include <linux/module.h>
138c2ecf20Sopenharmony_ci#include <linux/timer.h>
148c2ecf20Sopenharmony_ci#include <linux/mm.h>
158c2ecf20Sopenharmony_ci#include <linux/random.h>
168c2ecf20Sopenharmony_ci#include <linux/skbuff.h>
178c2ecf20Sopenharmony_ci#include <linux/rtnetlink.h>
188c2ecf20Sopenharmony_ci#include <linux/slab.h>
198c2ecf20Sopenharmony_ci#include <linux/rhashtable.h>
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ci#include <net/sock.h>
228c2ecf20Sopenharmony_ci#include <net/inet_frag.h>
238c2ecf20Sopenharmony_ci#include <net/inet_ecn.h>
248c2ecf20Sopenharmony_ci#include <net/ip.h>
258c2ecf20Sopenharmony_ci#include <net/ipv6.h>
268c2ecf20Sopenharmony_ci
278c2ecf20Sopenharmony_ci/* Use skb->cb to track consecutive/adjacent fragments coming at
288c2ecf20Sopenharmony_ci * the end of the queue. Nodes in the rb-tree queue will
298c2ecf20Sopenharmony_ci * contain "runs" of one or more adjacent fragments.
308c2ecf20Sopenharmony_ci *
318c2ecf20Sopenharmony_ci * Invariants:
328c2ecf20Sopenharmony_ci * - next_frag is NULL at the tail of a "run";
338c2ecf20Sopenharmony_ci * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
348c2ecf20Sopenharmony_ci */
358c2ecf20Sopenharmony_cistruct ipfrag_skb_cb {
368c2ecf20Sopenharmony_ci	union {
378c2ecf20Sopenharmony_ci		struct inet_skb_parm	h4;
388c2ecf20Sopenharmony_ci		struct inet6_skb_parm	h6;
398c2ecf20Sopenharmony_ci	};
408c2ecf20Sopenharmony_ci	struct sk_buff		*next_frag;
418c2ecf20Sopenharmony_ci	int			frag_run_len;
428c2ecf20Sopenharmony_ci};
438c2ecf20Sopenharmony_ci
448c2ecf20Sopenharmony_ci#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_cistatic void fragcb_clear(struct sk_buff *skb)
478c2ecf20Sopenharmony_ci{
488c2ecf20Sopenharmony_ci	RB_CLEAR_NODE(&skb->rbnode);
498c2ecf20Sopenharmony_ci	FRAG_CB(skb)->next_frag = NULL;
508c2ecf20Sopenharmony_ci	FRAG_CB(skb)->frag_run_len = skb->len;
518c2ecf20Sopenharmony_ci}
528c2ecf20Sopenharmony_ci
538c2ecf20Sopenharmony_ci/* Append skb to the last "run". */
548c2ecf20Sopenharmony_cistatic void fragrun_append_to_last(struct inet_frag_queue *q,
558c2ecf20Sopenharmony_ci				   struct sk_buff *skb)
568c2ecf20Sopenharmony_ci{
578c2ecf20Sopenharmony_ci	fragcb_clear(skb);
588c2ecf20Sopenharmony_ci
598c2ecf20Sopenharmony_ci	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
608c2ecf20Sopenharmony_ci	FRAG_CB(q->fragments_tail)->next_frag = skb;
618c2ecf20Sopenharmony_ci	q->fragments_tail = skb;
628c2ecf20Sopenharmony_ci}
638c2ecf20Sopenharmony_ci
648c2ecf20Sopenharmony_ci/* Create a new "run" with the skb. */
658c2ecf20Sopenharmony_cistatic void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
668c2ecf20Sopenharmony_ci{
678c2ecf20Sopenharmony_ci	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
688c2ecf20Sopenharmony_ci	fragcb_clear(skb);
698c2ecf20Sopenharmony_ci
708c2ecf20Sopenharmony_ci	if (q->last_run_head)
718c2ecf20Sopenharmony_ci		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
728c2ecf20Sopenharmony_ci			     &q->last_run_head->rbnode.rb_right);
738c2ecf20Sopenharmony_ci	else
748c2ecf20Sopenharmony_ci		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
758c2ecf20Sopenharmony_ci	rb_insert_color(&skb->rbnode, &q->rb_fragments);
768c2ecf20Sopenharmony_ci
778c2ecf20Sopenharmony_ci	q->fragments_tail = skb;
788c2ecf20Sopenharmony_ci	q->last_run_head = skb;
798c2ecf20Sopenharmony_ci}
808c2ecf20Sopenharmony_ci
818c2ecf20Sopenharmony_ci/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
828c2ecf20Sopenharmony_ci * Value : 0xff if frame should be dropped.
838c2ecf20Sopenharmony_ci *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
848c2ecf20Sopenharmony_ci */
858c2ecf20Sopenharmony_ciconst u8 ip_frag_ecn_table[16] = {
868c2ecf20Sopenharmony_ci	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
878c2ecf20Sopenharmony_ci	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
888c2ecf20Sopenharmony_ci	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
898c2ecf20Sopenharmony_ci	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,
908c2ecf20Sopenharmony_ci
918c2ecf20Sopenharmony_ci	/* invalid combinations : drop frame */
928c2ecf20Sopenharmony_ci	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
938c2ecf20Sopenharmony_ci	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
948c2ecf20Sopenharmony_ci	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
958c2ecf20Sopenharmony_ci	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
968c2ecf20Sopenharmony_ci	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
978c2ecf20Sopenharmony_ci	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
988c2ecf20Sopenharmony_ci	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
998c2ecf20Sopenharmony_ci};
1008c2ecf20Sopenharmony_ciEXPORT_SYMBOL(ip_frag_ecn_table);
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_ciint inet_frags_init(struct inet_frags *f)
1038c2ecf20Sopenharmony_ci{
1048c2ecf20Sopenharmony_ci	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
1058c2ecf20Sopenharmony_ci					    NULL);
1068c2ecf20Sopenharmony_ci	if (!f->frags_cachep)
1078c2ecf20Sopenharmony_ci		return -ENOMEM;
1088c2ecf20Sopenharmony_ci
1098c2ecf20Sopenharmony_ci	refcount_set(&f->refcnt, 1);
1108c2ecf20Sopenharmony_ci	init_completion(&f->completion);
1118c2ecf20Sopenharmony_ci	return 0;
1128c2ecf20Sopenharmony_ci}
1138c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frags_init);
1148c2ecf20Sopenharmony_ci
1158c2ecf20Sopenharmony_civoid inet_frags_fini(struct inet_frags *f)
1168c2ecf20Sopenharmony_ci{
1178c2ecf20Sopenharmony_ci	if (refcount_dec_and_test(&f->refcnt))
1188c2ecf20Sopenharmony_ci		complete(&f->completion);
1198c2ecf20Sopenharmony_ci
1208c2ecf20Sopenharmony_ci	wait_for_completion(&f->completion);
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_ci	kmem_cache_destroy(f->frags_cachep);
1238c2ecf20Sopenharmony_ci	f->frags_cachep = NULL;
1248c2ecf20Sopenharmony_ci}
1258c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frags_fini);
1268c2ecf20Sopenharmony_ci
1278c2ecf20Sopenharmony_ci/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
1288c2ecf20Sopenharmony_cistatic void inet_frags_free_cb(void *ptr, void *arg)
1298c2ecf20Sopenharmony_ci{
1308c2ecf20Sopenharmony_ci	struct inet_frag_queue *fq = ptr;
1318c2ecf20Sopenharmony_ci	int count;
1328c2ecf20Sopenharmony_ci
1338c2ecf20Sopenharmony_ci	count = del_timer_sync(&fq->timer) ? 1 : 0;
1348c2ecf20Sopenharmony_ci
1358c2ecf20Sopenharmony_ci	spin_lock_bh(&fq->lock);
1368c2ecf20Sopenharmony_ci	if (!(fq->flags & INET_FRAG_COMPLETE)) {
1378c2ecf20Sopenharmony_ci		fq->flags |= INET_FRAG_COMPLETE;
1388c2ecf20Sopenharmony_ci		count++;
1398c2ecf20Sopenharmony_ci	} else if (fq->flags & INET_FRAG_HASH_DEAD) {
1408c2ecf20Sopenharmony_ci		count++;
1418c2ecf20Sopenharmony_ci	}
1428c2ecf20Sopenharmony_ci	spin_unlock_bh(&fq->lock);
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci	if (refcount_sub_and_test(count, &fq->refcnt))
1458c2ecf20Sopenharmony_ci		inet_frag_destroy(fq);
1468c2ecf20Sopenharmony_ci}
1478c2ecf20Sopenharmony_ci
1488c2ecf20Sopenharmony_cistatic void fqdir_work_fn(struct work_struct *work)
1498c2ecf20Sopenharmony_ci{
1508c2ecf20Sopenharmony_ci	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
1518c2ecf20Sopenharmony_ci	struct inet_frags *f = fqdir->f;
1528c2ecf20Sopenharmony_ci
1538c2ecf20Sopenharmony_ci	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_ci	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
1568c2ecf20Sopenharmony_ci	 * have completed, since they need to dereference fqdir.
1578c2ecf20Sopenharmony_ci	 * Would it not be nice to have kfree_rcu_barrier() ? :)
1588c2ecf20Sopenharmony_ci	 */
1598c2ecf20Sopenharmony_ci	rcu_barrier();
1608c2ecf20Sopenharmony_ci
1618c2ecf20Sopenharmony_ci	if (refcount_dec_and_test(&f->refcnt))
1628c2ecf20Sopenharmony_ci		complete(&f->completion);
1638c2ecf20Sopenharmony_ci
1648c2ecf20Sopenharmony_ci	kfree(fqdir);
1658c2ecf20Sopenharmony_ci}
1668c2ecf20Sopenharmony_ci
1678c2ecf20Sopenharmony_ciint fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
1688c2ecf20Sopenharmony_ci{
1698c2ecf20Sopenharmony_ci	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
1708c2ecf20Sopenharmony_ci	int res;
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_ci	if (!fqdir)
1738c2ecf20Sopenharmony_ci		return -ENOMEM;
1748c2ecf20Sopenharmony_ci	fqdir->f = f;
1758c2ecf20Sopenharmony_ci	fqdir->net = net;
1768c2ecf20Sopenharmony_ci	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
1778c2ecf20Sopenharmony_ci	if (res < 0) {
1788c2ecf20Sopenharmony_ci		kfree(fqdir);
1798c2ecf20Sopenharmony_ci		return res;
1808c2ecf20Sopenharmony_ci	}
1818c2ecf20Sopenharmony_ci	refcount_inc(&f->refcnt);
1828c2ecf20Sopenharmony_ci	*fqdirp = fqdir;
1838c2ecf20Sopenharmony_ci	return 0;
1848c2ecf20Sopenharmony_ci}
1858c2ecf20Sopenharmony_ciEXPORT_SYMBOL(fqdir_init);
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_civoid fqdir_exit(struct fqdir *fqdir)
1888c2ecf20Sopenharmony_ci{
1898c2ecf20Sopenharmony_ci	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
1908c2ecf20Sopenharmony_ci	queue_work(system_wq, &fqdir->destroy_work);
1918c2ecf20Sopenharmony_ci}
1928c2ecf20Sopenharmony_ciEXPORT_SYMBOL(fqdir_exit);
1938c2ecf20Sopenharmony_ci
1948c2ecf20Sopenharmony_civoid inet_frag_kill(struct inet_frag_queue *fq)
1958c2ecf20Sopenharmony_ci{
1968c2ecf20Sopenharmony_ci	if (del_timer(&fq->timer))
1978c2ecf20Sopenharmony_ci		refcount_dec(&fq->refcnt);
1988c2ecf20Sopenharmony_ci
1998c2ecf20Sopenharmony_ci	if (!(fq->flags & INET_FRAG_COMPLETE)) {
2008c2ecf20Sopenharmony_ci		struct fqdir *fqdir = fq->fqdir;
2018c2ecf20Sopenharmony_ci
2028c2ecf20Sopenharmony_ci		fq->flags |= INET_FRAG_COMPLETE;
2038c2ecf20Sopenharmony_ci		rcu_read_lock();
2048c2ecf20Sopenharmony_ci		/* The RCU read lock provides a memory barrier
2058c2ecf20Sopenharmony_ci		 * guaranteeing that if fqdir->dead is false then
2068c2ecf20Sopenharmony_ci		 * the hash table destruction will not start until
2078c2ecf20Sopenharmony_ci		 * after we unlock.  Paired with fqdir_pre_exit().
2088c2ecf20Sopenharmony_ci		 */
2098c2ecf20Sopenharmony_ci		if (!READ_ONCE(fqdir->dead)) {
2108c2ecf20Sopenharmony_ci			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
2118c2ecf20Sopenharmony_ci					       fqdir->f->rhash_params);
2128c2ecf20Sopenharmony_ci			refcount_dec(&fq->refcnt);
2138c2ecf20Sopenharmony_ci		} else {
2148c2ecf20Sopenharmony_ci			fq->flags |= INET_FRAG_HASH_DEAD;
2158c2ecf20Sopenharmony_ci		}
2168c2ecf20Sopenharmony_ci		rcu_read_unlock();
2178c2ecf20Sopenharmony_ci	}
2188c2ecf20Sopenharmony_ci}
2198c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_kill);
2208c2ecf20Sopenharmony_ci
2218c2ecf20Sopenharmony_cistatic void inet_frag_destroy_rcu(struct rcu_head *head)
2228c2ecf20Sopenharmony_ci{
2238c2ecf20Sopenharmony_ci	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
2248c2ecf20Sopenharmony_ci						 rcu);
2258c2ecf20Sopenharmony_ci	struct inet_frags *f = q->fqdir->f;
2268c2ecf20Sopenharmony_ci
2278c2ecf20Sopenharmony_ci	if (f->destructor)
2288c2ecf20Sopenharmony_ci		f->destructor(q);
2298c2ecf20Sopenharmony_ci	kmem_cache_free(f->frags_cachep, q);
2308c2ecf20Sopenharmony_ci}
2318c2ecf20Sopenharmony_ci
2328c2ecf20Sopenharmony_ciunsigned int inet_frag_rbtree_purge(struct rb_root *root)
2338c2ecf20Sopenharmony_ci{
2348c2ecf20Sopenharmony_ci	struct rb_node *p = rb_first(root);
2358c2ecf20Sopenharmony_ci	unsigned int sum = 0;
2368c2ecf20Sopenharmony_ci
2378c2ecf20Sopenharmony_ci	while (p) {
2388c2ecf20Sopenharmony_ci		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
2398c2ecf20Sopenharmony_ci
2408c2ecf20Sopenharmony_ci		p = rb_next(p);
2418c2ecf20Sopenharmony_ci		rb_erase(&skb->rbnode, root);
2428c2ecf20Sopenharmony_ci		while (skb) {
2438c2ecf20Sopenharmony_ci			struct sk_buff *next = FRAG_CB(skb)->next_frag;
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_ci			sum += skb->truesize;
2468c2ecf20Sopenharmony_ci			kfree_skb(skb);
2478c2ecf20Sopenharmony_ci			skb = next;
2488c2ecf20Sopenharmony_ci		}
2498c2ecf20Sopenharmony_ci	}
2508c2ecf20Sopenharmony_ci	return sum;
2518c2ecf20Sopenharmony_ci}
2528c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_rbtree_purge);
2538c2ecf20Sopenharmony_ci
2548c2ecf20Sopenharmony_civoid inet_frag_destroy(struct inet_frag_queue *q)
2558c2ecf20Sopenharmony_ci{
2568c2ecf20Sopenharmony_ci	struct fqdir *fqdir;
2578c2ecf20Sopenharmony_ci	unsigned int sum, sum_truesize = 0;
2588c2ecf20Sopenharmony_ci	struct inet_frags *f;
2598c2ecf20Sopenharmony_ci
2608c2ecf20Sopenharmony_ci	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
2618c2ecf20Sopenharmony_ci	WARN_ON(del_timer(&q->timer) != 0);
2628c2ecf20Sopenharmony_ci
2638c2ecf20Sopenharmony_ci	/* Release all fragment data. */
2648c2ecf20Sopenharmony_ci	fqdir = q->fqdir;
2658c2ecf20Sopenharmony_ci	f = fqdir->f;
2668c2ecf20Sopenharmony_ci	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
2678c2ecf20Sopenharmony_ci	sum = sum_truesize + f->qsize;
2688c2ecf20Sopenharmony_ci
2698c2ecf20Sopenharmony_ci	call_rcu(&q->rcu, inet_frag_destroy_rcu);
2708c2ecf20Sopenharmony_ci
2718c2ecf20Sopenharmony_ci	sub_frag_mem_limit(fqdir, sum);
2728c2ecf20Sopenharmony_ci}
2738c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_destroy);
2748c2ecf20Sopenharmony_ci
2758c2ecf20Sopenharmony_cistatic struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
2768c2ecf20Sopenharmony_ci					       struct inet_frags *f,
2778c2ecf20Sopenharmony_ci					       void *arg)
2788c2ecf20Sopenharmony_ci{
2798c2ecf20Sopenharmony_ci	struct inet_frag_queue *q;
2808c2ecf20Sopenharmony_ci
2818c2ecf20Sopenharmony_ci	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
2828c2ecf20Sopenharmony_ci	if (!q)
2838c2ecf20Sopenharmony_ci		return NULL;
2848c2ecf20Sopenharmony_ci
2858c2ecf20Sopenharmony_ci	q->fqdir = fqdir;
2868c2ecf20Sopenharmony_ci	f->constructor(q, arg);
2878c2ecf20Sopenharmony_ci	add_frag_mem_limit(fqdir, f->qsize);
2888c2ecf20Sopenharmony_ci
2898c2ecf20Sopenharmony_ci	timer_setup(&q->timer, f->frag_expire, 0);
2908c2ecf20Sopenharmony_ci	spin_lock_init(&q->lock);
2918c2ecf20Sopenharmony_ci	refcount_set(&q->refcnt, 3);
2928c2ecf20Sopenharmony_ci
2938c2ecf20Sopenharmony_ci	return q;
2948c2ecf20Sopenharmony_ci}
2958c2ecf20Sopenharmony_ci
2968c2ecf20Sopenharmony_cistatic struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
2978c2ecf20Sopenharmony_ci						void *arg,
2988c2ecf20Sopenharmony_ci						struct inet_frag_queue **prev)
2998c2ecf20Sopenharmony_ci{
3008c2ecf20Sopenharmony_ci	struct inet_frags *f = fqdir->f;
3018c2ecf20Sopenharmony_ci	struct inet_frag_queue *q;
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci	q = inet_frag_alloc(fqdir, f, arg);
3048c2ecf20Sopenharmony_ci	if (!q) {
3058c2ecf20Sopenharmony_ci		*prev = ERR_PTR(-ENOMEM);
3068c2ecf20Sopenharmony_ci		return NULL;
3078c2ecf20Sopenharmony_ci	}
3088c2ecf20Sopenharmony_ci	mod_timer(&q->timer, jiffies + fqdir->timeout);
3098c2ecf20Sopenharmony_ci
3108c2ecf20Sopenharmony_ci	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
3118c2ecf20Sopenharmony_ci						 &q->node, f->rhash_params);
3128c2ecf20Sopenharmony_ci	if (*prev) {
3138c2ecf20Sopenharmony_ci		q->flags |= INET_FRAG_COMPLETE;
3148c2ecf20Sopenharmony_ci		inet_frag_kill(q);
3158c2ecf20Sopenharmony_ci		inet_frag_destroy(q);
3168c2ecf20Sopenharmony_ci		return NULL;
3178c2ecf20Sopenharmony_ci	}
3188c2ecf20Sopenharmony_ci	return q;
3198c2ecf20Sopenharmony_ci}
3208c2ecf20Sopenharmony_ci
3218c2ecf20Sopenharmony_ci/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
3228c2ecf20Sopenharmony_cistruct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
3238c2ecf20Sopenharmony_ci{
3248c2ecf20Sopenharmony_ci	/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
3258c2ecf20Sopenharmony_ci	long high_thresh = READ_ONCE(fqdir->high_thresh);
3268c2ecf20Sopenharmony_ci	struct inet_frag_queue *fq = NULL, *prev;
3278c2ecf20Sopenharmony_ci
3288c2ecf20Sopenharmony_ci	if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
3298c2ecf20Sopenharmony_ci		return NULL;
3308c2ecf20Sopenharmony_ci
3318c2ecf20Sopenharmony_ci	rcu_read_lock();
3328c2ecf20Sopenharmony_ci
3338c2ecf20Sopenharmony_ci	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
3348c2ecf20Sopenharmony_ci	if (!prev)
3358c2ecf20Sopenharmony_ci		fq = inet_frag_create(fqdir, key, &prev);
3368c2ecf20Sopenharmony_ci	if (!IS_ERR_OR_NULL(prev)) {
3378c2ecf20Sopenharmony_ci		fq = prev;
3388c2ecf20Sopenharmony_ci		if (!refcount_inc_not_zero(&fq->refcnt))
3398c2ecf20Sopenharmony_ci			fq = NULL;
3408c2ecf20Sopenharmony_ci	}
3418c2ecf20Sopenharmony_ci	rcu_read_unlock();
3428c2ecf20Sopenharmony_ci	return fq;
3438c2ecf20Sopenharmony_ci}
3448c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_find);
3458c2ecf20Sopenharmony_ci
3468c2ecf20Sopenharmony_ciint inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
3478c2ecf20Sopenharmony_ci			   int offset, int end)
3488c2ecf20Sopenharmony_ci{
3498c2ecf20Sopenharmony_ci	struct sk_buff *last = q->fragments_tail;
3508c2ecf20Sopenharmony_ci
3518c2ecf20Sopenharmony_ci	/* RFC5722, Section 4, amended by Errata ID : 3089
3528c2ecf20Sopenharmony_ci	 *                          When reassembling an IPv6 datagram, if
3538c2ecf20Sopenharmony_ci	 *   one or more its constituent fragments is determined to be an
3548c2ecf20Sopenharmony_ci	 *   overlapping fragment, the entire datagram (and any constituent
3558c2ecf20Sopenharmony_ci	 *   fragments) MUST be silently discarded.
3568c2ecf20Sopenharmony_ci	 *
3578c2ecf20Sopenharmony_ci	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
3588c2ecf20Sopenharmony_ci	 * queue/fragments kept for later reassembly).
3598c2ecf20Sopenharmony_ci	 */
3608c2ecf20Sopenharmony_ci	if (!last)
3618c2ecf20Sopenharmony_ci		fragrun_create(q, skb);  /* First fragment. */
3628c2ecf20Sopenharmony_ci	else if (last->ip_defrag_offset + last->len < end) {
3638c2ecf20Sopenharmony_ci		/* This is the common case: skb goes to the end. */
3648c2ecf20Sopenharmony_ci		/* Detect and discard overlaps. */
3658c2ecf20Sopenharmony_ci		if (offset < last->ip_defrag_offset + last->len)
3668c2ecf20Sopenharmony_ci			return IPFRAG_OVERLAP;
3678c2ecf20Sopenharmony_ci		if (offset == last->ip_defrag_offset + last->len)
3688c2ecf20Sopenharmony_ci			fragrun_append_to_last(q, skb);
3698c2ecf20Sopenharmony_ci		else
3708c2ecf20Sopenharmony_ci			fragrun_create(q, skb);
3718c2ecf20Sopenharmony_ci	} else {
3728c2ecf20Sopenharmony_ci		/* Binary search. Note that skb can become the first fragment,
3738c2ecf20Sopenharmony_ci		 * but not the last (covered above).
3748c2ecf20Sopenharmony_ci		 */
3758c2ecf20Sopenharmony_ci		struct rb_node **rbn, *parent;
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci		rbn = &q->rb_fragments.rb_node;
3788c2ecf20Sopenharmony_ci		do {
3798c2ecf20Sopenharmony_ci			struct sk_buff *curr;
3808c2ecf20Sopenharmony_ci			int curr_run_end;
3818c2ecf20Sopenharmony_ci
3828c2ecf20Sopenharmony_ci			parent = *rbn;
3838c2ecf20Sopenharmony_ci			curr = rb_to_skb(parent);
3848c2ecf20Sopenharmony_ci			curr_run_end = curr->ip_defrag_offset +
3858c2ecf20Sopenharmony_ci					FRAG_CB(curr)->frag_run_len;
3868c2ecf20Sopenharmony_ci			if (end <= curr->ip_defrag_offset)
3878c2ecf20Sopenharmony_ci				rbn = &parent->rb_left;
3888c2ecf20Sopenharmony_ci			else if (offset >= curr_run_end)
3898c2ecf20Sopenharmony_ci				rbn = &parent->rb_right;
3908c2ecf20Sopenharmony_ci			else if (offset >= curr->ip_defrag_offset &&
3918c2ecf20Sopenharmony_ci				 end <= curr_run_end)
3928c2ecf20Sopenharmony_ci				return IPFRAG_DUP;
3938c2ecf20Sopenharmony_ci			else
3948c2ecf20Sopenharmony_ci				return IPFRAG_OVERLAP;
3958c2ecf20Sopenharmony_ci		} while (*rbn);
3968c2ecf20Sopenharmony_ci		/* Here we have parent properly set, and rbn pointing to
3978c2ecf20Sopenharmony_ci		 * one of its NULL left/right children. Insert skb.
3988c2ecf20Sopenharmony_ci		 */
3998c2ecf20Sopenharmony_ci		fragcb_clear(skb);
4008c2ecf20Sopenharmony_ci		rb_link_node(&skb->rbnode, parent, rbn);
4018c2ecf20Sopenharmony_ci		rb_insert_color(&skb->rbnode, &q->rb_fragments);
4028c2ecf20Sopenharmony_ci	}
4038c2ecf20Sopenharmony_ci
4048c2ecf20Sopenharmony_ci	skb->ip_defrag_offset = offset;
4058c2ecf20Sopenharmony_ci
4068c2ecf20Sopenharmony_ci	return IPFRAG_OK;
4078c2ecf20Sopenharmony_ci}
4088c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_queue_insert);
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_civoid *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
4118c2ecf20Sopenharmony_ci			      struct sk_buff *parent)
4128c2ecf20Sopenharmony_ci{
4138c2ecf20Sopenharmony_ci	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
4148c2ecf20Sopenharmony_ci	struct sk_buff **nextp;
4158c2ecf20Sopenharmony_ci	int delta;
4168c2ecf20Sopenharmony_ci
4178c2ecf20Sopenharmony_ci	if (head != skb) {
4188c2ecf20Sopenharmony_ci		fp = skb_clone(skb, GFP_ATOMIC);
4198c2ecf20Sopenharmony_ci		if (!fp)
4208c2ecf20Sopenharmony_ci			return NULL;
4218c2ecf20Sopenharmony_ci		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
4228c2ecf20Sopenharmony_ci		if (RB_EMPTY_NODE(&skb->rbnode))
4238c2ecf20Sopenharmony_ci			FRAG_CB(parent)->next_frag = fp;
4248c2ecf20Sopenharmony_ci		else
4258c2ecf20Sopenharmony_ci			rb_replace_node(&skb->rbnode, &fp->rbnode,
4268c2ecf20Sopenharmony_ci					&q->rb_fragments);
4278c2ecf20Sopenharmony_ci		if (q->fragments_tail == skb)
4288c2ecf20Sopenharmony_ci			q->fragments_tail = fp;
4298c2ecf20Sopenharmony_ci		skb_morph(skb, head);
4308c2ecf20Sopenharmony_ci		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
4318c2ecf20Sopenharmony_ci		rb_replace_node(&head->rbnode, &skb->rbnode,
4328c2ecf20Sopenharmony_ci				&q->rb_fragments);
4338c2ecf20Sopenharmony_ci		consume_skb(head);
4348c2ecf20Sopenharmony_ci		head = skb;
4358c2ecf20Sopenharmony_ci	}
4368c2ecf20Sopenharmony_ci	WARN_ON(head->ip_defrag_offset != 0);
4378c2ecf20Sopenharmony_ci
4388c2ecf20Sopenharmony_ci	delta = -head->truesize;
4398c2ecf20Sopenharmony_ci
4408c2ecf20Sopenharmony_ci	/* Head of list must not be cloned. */
4418c2ecf20Sopenharmony_ci	if (skb_unclone(head, GFP_ATOMIC))
4428c2ecf20Sopenharmony_ci		return NULL;
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_ci	delta += head->truesize;
4458c2ecf20Sopenharmony_ci	if (delta)
4468c2ecf20Sopenharmony_ci		add_frag_mem_limit(q->fqdir, delta);
4478c2ecf20Sopenharmony_ci
4488c2ecf20Sopenharmony_ci	/* If the first fragment is fragmented itself, we split
4498c2ecf20Sopenharmony_ci	 * it to two chunks: the first with data and paged part
4508c2ecf20Sopenharmony_ci	 * and the second, holding only fragments.
4518c2ecf20Sopenharmony_ci	 */
4528c2ecf20Sopenharmony_ci	if (skb_has_frag_list(head)) {
4538c2ecf20Sopenharmony_ci		struct sk_buff *clone;
4548c2ecf20Sopenharmony_ci		int i, plen = 0;
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci		clone = alloc_skb(0, GFP_ATOMIC);
4578c2ecf20Sopenharmony_ci		if (!clone)
4588c2ecf20Sopenharmony_ci			return NULL;
4598c2ecf20Sopenharmony_ci		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
4608c2ecf20Sopenharmony_ci		skb_frag_list_init(head);
4618c2ecf20Sopenharmony_ci		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
4628c2ecf20Sopenharmony_ci			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
4638c2ecf20Sopenharmony_ci		clone->data_len = head->data_len - plen;
4648c2ecf20Sopenharmony_ci		clone->len = clone->data_len;
4658c2ecf20Sopenharmony_ci		head->truesize += clone->truesize;
4668c2ecf20Sopenharmony_ci		clone->csum = 0;
4678c2ecf20Sopenharmony_ci		clone->ip_summed = head->ip_summed;
4688c2ecf20Sopenharmony_ci		add_frag_mem_limit(q->fqdir, clone->truesize);
4698c2ecf20Sopenharmony_ci		skb_shinfo(head)->frag_list = clone;
4708c2ecf20Sopenharmony_ci		nextp = &clone->next;
4718c2ecf20Sopenharmony_ci	} else {
4728c2ecf20Sopenharmony_ci		nextp = &skb_shinfo(head)->frag_list;
4738c2ecf20Sopenharmony_ci	}
4748c2ecf20Sopenharmony_ci
4758c2ecf20Sopenharmony_ci	return nextp;
4768c2ecf20Sopenharmony_ci}
4778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_reasm_prepare);
4788c2ecf20Sopenharmony_ci
4798c2ecf20Sopenharmony_civoid inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
4808c2ecf20Sopenharmony_ci			    void *reasm_data, bool try_coalesce)
4818c2ecf20Sopenharmony_ci{
4828c2ecf20Sopenharmony_ci	struct sk_buff **nextp = (struct sk_buff **)reasm_data;
4838c2ecf20Sopenharmony_ci	struct rb_node *rbn;
4848c2ecf20Sopenharmony_ci	struct sk_buff *fp;
4858c2ecf20Sopenharmony_ci	int sum_truesize;
4868c2ecf20Sopenharmony_ci
4878c2ecf20Sopenharmony_ci	skb_push(head, head->data - skb_network_header(head));
4888c2ecf20Sopenharmony_ci
4898c2ecf20Sopenharmony_ci	/* Traverse the tree in order, to build frag_list. */
4908c2ecf20Sopenharmony_ci	fp = FRAG_CB(head)->next_frag;
4918c2ecf20Sopenharmony_ci	rbn = rb_next(&head->rbnode);
4928c2ecf20Sopenharmony_ci	rb_erase(&head->rbnode, &q->rb_fragments);
4938c2ecf20Sopenharmony_ci
4948c2ecf20Sopenharmony_ci	sum_truesize = head->truesize;
4958c2ecf20Sopenharmony_ci	while (rbn || fp) {
4968c2ecf20Sopenharmony_ci		/* fp points to the next sk_buff in the current run;
4978c2ecf20Sopenharmony_ci		 * rbn points to the next run.
4988c2ecf20Sopenharmony_ci		 */
4998c2ecf20Sopenharmony_ci		/* Go through the current run. */
5008c2ecf20Sopenharmony_ci		while (fp) {
5018c2ecf20Sopenharmony_ci			struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
5028c2ecf20Sopenharmony_ci			bool stolen;
5038c2ecf20Sopenharmony_ci			int delta;
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_ci			sum_truesize += fp->truesize;
5068c2ecf20Sopenharmony_ci			if (head->ip_summed != fp->ip_summed)
5078c2ecf20Sopenharmony_ci				head->ip_summed = CHECKSUM_NONE;
5088c2ecf20Sopenharmony_ci			else if (head->ip_summed == CHECKSUM_COMPLETE)
5098c2ecf20Sopenharmony_ci				head->csum = csum_add(head->csum, fp->csum);
5108c2ecf20Sopenharmony_ci
5118c2ecf20Sopenharmony_ci			if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
5128c2ecf20Sopenharmony_ci							     &delta)) {
5138c2ecf20Sopenharmony_ci				kfree_skb_partial(fp, stolen);
5148c2ecf20Sopenharmony_ci			} else {
5158c2ecf20Sopenharmony_ci				fp->prev = NULL;
5168c2ecf20Sopenharmony_ci				memset(&fp->rbnode, 0, sizeof(fp->rbnode));
5178c2ecf20Sopenharmony_ci				fp->sk = NULL;
5188c2ecf20Sopenharmony_ci
5198c2ecf20Sopenharmony_ci				head->data_len += fp->len;
5208c2ecf20Sopenharmony_ci				head->len += fp->len;
5218c2ecf20Sopenharmony_ci				head->truesize += fp->truesize;
5228c2ecf20Sopenharmony_ci
5238c2ecf20Sopenharmony_ci				*nextp = fp;
5248c2ecf20Sopenharmony_ci				nextp = &fp->next;
5258c2ecf20Sopenharmony_ci			}
5268c2ecf20Sopenharmony_ci
5278c2ecf20Sopenharmony_ci			fp = next_frag;
5288c2ecf20Sopenharmony_ci		}
5298c2ecf20Sopenharmony_ci		/* Move to the next run. */
5308c2ecf20Sopenharmony_ci		if (rbn) {
5318c2ecf20Sopenharmony_ci			struct rb_node *rbnext = rb_next(rbn);
5328c2ecf20Sopenharmony_ci
5338c2ecf20Sopenharmony_ci			fp = rb_to_skb(rbn);
5348c2ecf20Sopenharmony_ci			rb_erase(rbn, &q->rb_fragments);
5358c2ecf20Sopenharmony_ci			rbn = rbnext;
5368c2ecf20Sopenharmony_ci		}
5378c2ecf20Sopenharmony_ci	}
5388c2ecf20Sopenharmony_ci	sub_frag_mem_limit(q->fqdir, sum_truesize);
5398c2ecf20Sopenharmony_ci
5408c2ecf20Sopenharmony_ci	*nextp = NULL;
5418c2ecf20Sopenharmony_ci	skb_mark_not_on_list(head);
5428c2ecf20Sopenharmony_ci	head->prev = NULL;
5438c2ecf20Sopenharmony_ci	head->tstamp = q->stamp;
5448c2ecf20Sopenharmony_ci}
5458c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_reasm_finish);
5468c2ecf20Sopenharmony_ci
5478c2ecf20Sopenharmony_cistruct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
5488c2ecf20Sopenharmony_ci{
5498c2ecf20Sopenharmony_ci	struct sk_buff *head, *skb;
5508c2ecf20Sopenharmony_ci
5518c2ecf20Sopenharmony_ci	head = skb_rb_first(&q->rb_fragments);
5528c2ecf20Sopenharmony_ci	if (!head)
5538c2ecf20Sopenharmony_ci		return NULL;
5548c2ecf20Sopenharmony_ci	skb = FRAG_CB(head)->next_frag;
5558c2ecf20Sopenharmony_ci	if (skb)
5568c2ecf20Sopenharmony_ci		rb_replace_node(&head->rbnode, &skb->rbnode,
5578c2ecf20Sopenharmony_ci				&q->rb_fragments);
5588c2ecf20Sopenharmony_ci	else
5598c2ecf20Sopenharmony_ci		rb_erase(&head->rbnode, &q->rb_fragments);
5608c2ecf20Sopenharmony_ci	memset(&head->rbnode, 0, sizeof(head->rbnode));
5618c2ecf20Sopenharmony_ci	barrier();
5628c2ecf20Sopenharmony_ci
5638c2ecf20Sopenharmony_ci	if (head == q->fragments_tail)
5648c2ecf20Sopenharmony_ci		q->fragments_tail = NULL;
5658c2ecf20Sopenharmony_ci
5668c2ecf20Sopenharmony_ci	sub_frag_mem_limit(q->fqdir, head->truesize);
5678c2ecf20Sopenharmony_ci
5688c2ecf20Sopenharmony_ci	return head;
5698c2ecf20Sopenharmony_ci}
5708c2ecf20Sopenharmony_ciEXPORT_SYMBOL(inet_frag_pull_head);
571