162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * inet fragments management
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
662306a36Sopenharmony_ci *				Started as consolidation of ipv4/ip_fragment.c,
 *				ipv6/reassembly.c and ipv6 nf conntrack reassembly
862306a36Sopenharmony_ci */
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#include <linux/list.h>
1162306a36Sopenharmony_ci#include <linux/spinlock.h>
1262306a36Sopenharmony_ci#include <linux/module.h>
1362306a36Sopenharmony_ci#include <linux/timer.h>
1462306a36Sopenharmony_ci#include <linux/mm.h>
1562306a36Sopenharmony_ci#include <linux/random.h>
1662306a36Sopenharmony_ci#include <linux/skbuff.h>
1762306a36Sopenharmony_ci#include <linux/rtnetlink.h>
1862306a36Sopenharmony_ci#include <linux/slab.h>
1962306a36Sopenharmony_ci#include <linux/rhashtable.h>
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ci#include <net/sock.h>
2262306a36Sopenharmony_ci#include <net/inet_frag.h>
2362306a36Sopenharmony_ci#include <net/inet_ecn.h>
2462306a36Sopenharmony_ci#include <net/ip.h>
2562306a36Sopenharmony_ci#include <net/ipv6.h>
2662306a36Sopenharmony_ci
/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	/* Protocol control-block data; the fragment bookkeeping is placed
	 * after it.
	 * NOTE(review): presumably this must stay first so that the IPv4/IPv6
	 * views of skb->cb remain valid - confirm against the users.
	 */
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	/* Next fragment in the same run, NULL at the tail of the run. */
	struct sk_buff		*next_frag;
	/* On the head of a run: total length of the run. A freshly cleared
	 * skb starts with its own length, see fragcb_clear().
	 */
	int			frag_run_len;
};

/* View skb->cb as a struct ipfrag_skb_cb. */
#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))
4562306a36Sopenharmony_ci
4662306a36Sopenharmony_cistatic void fragcb_clear(struct sk_buff *skb)
4762306a36Sopenharmony_ci{
4862306a36Sopenharmony_ci	RB_CLEAR_NODE(&skb->rbnode);
4962306a36Sopenharmony_ci	FRAG_CB(skb)->next_frag = NULL;
5062306a36Sopenharmony_ci	FRAG_CB(skb)->frag_run_len = skb->len;
5162306a36Sopenharmony_ci}
5262306a36Sopenharmony_ci
5362306a36Sopenharmony_ci/* Append skb to the last "run". */
5462306a36Sopenharmony_cistatic void fragrun_append_to_last(struct inet_frag_queue *q,
5562306a36Sopenharmony_ci				   struct sk_buff *skb)
5662306a36Sopenharmony_ci{
5762306a36Sopenharmony_ci	fragcb_clear(skb);
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
6062306a36Sopenharmony_ci	FRAG_CB(q->fragments_tail)->next_frag = skb;
6162306a36Sopenharmony_ci	q->fragments_tail = skb;
6262306a36Sopenharmony_ci}
6362306a36Sopenharmony_ci
6462306a36Sopenharmony_ci/* Create a new "run" with the skb. */
6562306a36Sopenharmony_cistatic void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
6662306a36Sopenharmony_ci{
6762306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
6862306a36Sopenharmony_ci	fragcb_clear(skb);
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci	if (q->last_run_head)
7162306a36Sopenharmony_ci		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
7262306a36Sopenharmony_ci			     &q->last_run_head->rbnode.rb_right);
7362306a36Sopenharmony_ci	else
7462306a36Sopenharmony_ci		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
7562306a36Sopenharmony_ci	rb_insert_color(&skb->rbnode, &q->rb_fragments);
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_ci	q->fragments_tail = skb;
7862306a36Sopenharmony_ci	q->last_run_head = skb;
7962306a36Sopenharmony_ci}
8062306a36Sopenharmony_ci
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 *
 * The table is indexed by the OR of the IPFRAG_ECN_* bits seen across the
 * fragments. Indexes without an explicit initializer below are
 * zero-initialized, i.e. nothing is ORed into the reassembled header.
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame
	 * (every listed combination that includes IPFRAG_ECN_NOT_ECT)
	 */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
10162306a36Sopenharmony_ci
10262306a36Sopenharmony_ciint inet_frags_init(struct inet_frags *f)
10362306a36Sopenharmony_ci{
10462306a36Sopenharmony_ci	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
10562306a36Sopenharmony_ci					    NULL);
10662306a36Sopenharmony_ci	if (!f->frags_cachep)
10762306a36Sopenharmony_ci		return -ENOMEM;
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci	refcount_set(&f->refcnt, 1);
11062306a36Sopenharmony_ci	init_completion(&f->completion);
11162306a36Sopenharmony_ci	return 0;
11262306a36Sopenharmony_ci}
11362306a36Sopenharmony_ciEXPORT_SYMBOL(inet_frags_init);
11462306a36Sopenharmony_ci
/* Tear down the per-protocol fragment state set up by inet_frags_init().
 *
 * Drops the reference taken in inet_frags_init(). Whoever drops the last
 * reference (here or in fqdir_free_fn()) signals f->completion, so the
 * wait below returns only once every fqdir that took a reference in
 * fqdir_init() has been freed. Only then is the slab cache destroyed.
 */
void inet_frags_fini(struct inet_frags *f)
{
	if (refcount_dec_and_test(&f->refcnt))
		complete(&f->completion);

	/* Blocks until the last reference holder has called complete(). */
	wait_for_completion(&f->completion);

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
12862306a36Sopenharmony_cistatic void inet_frags_free_cb(void *ptr, void *arg)
12962306a36Sopenharmony_ci{
13062306a36Sopenharmony_ci	struct inet_frag_queue *fq = ptr;
13162306a36Sopenharmony_ci	int count;
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_ci	count = del_timer_sync(&fq->timer) ? 1 : 0;
13462306a36Sopenharmony_ci
13562306a36Sopenharmony_ci	spin_lock_bh(&fq->lock);
13662306a36Sopenharmony_ci	fq->flags |= INET_FRAG_DROP;
13762306a36Sopenharmony_ci	if (!(fq->flags & INET_FRAG_COMPLETE)) {
13862306a36Sopenharmony_ci		fq->flags |= INET_FRAG_COMPLETE;
13962306a36Sopenharmony_ci		count++;
14062306a36Sopenharmony_ci	} else if (fq->flags & INET_FRAG_HASH_DEAD) {
14162306a36Sopenharmony_ci		count++;
14262306a36Sopenharmony_ci	}
14362306a36Sopenharmony_ci	spin_unlock_bh(&fq->lock);
14462306a36Sopenharmony_ci
14562306a36Sopenharmony_ci	if (refcount_sub_and_test(count, &fq->refcnt))
14662306a36Sopenharmony_ci		inet_frag_destroy(fq);
14762306a36Sopenharmony_ci}
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_cistatic LLIST_HEAD(fqdir_free_list);
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_cistatic void fqdir_free_fn(struct work_struct *work)
15262306a36Sopenharmony_ci{
15362306a36Sopenharmony_ci	struct llist_node *kill_list;
15462306a36Sopenharmony_ci	struct fqdir *fqdir, *tmp;
15562306a36Sopenharmony_ci	struct inet_frags *f;
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_ci	/* Atomically snapshot the list of fqdirs to free */
15862306a36Sopenharmony_ci	kill_list = llist_del_all(&fqdir_free_list);
15962306a36Sopenharmony_ci
16062306a36Sopenharmony_ci	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
16162306a36Sopenharmony_ci	 * have completed, since they need to dereference fqdir.
16262306a36Sopenharmony_ci	 * Would it not be nice to have kfree_rcu_barrier() ? :)
16362306a36Sopenharmony_ci	 */
16462306a36Sopenharmony_ci	rcu_barrier();
16562306a36Sopenharmony_ci
16662306a36Sopenharmony_ci	llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
16762306a36Sopenharmony_ci		f = fqdir->f;
16862306a36Sopenharmony_ci		if (refcount_dec_and_test(&f->refcnt))
16962306a36Sopenharmony_ci			complete(&f->completion);
17062306a36Sopenharmony_ci
17162306a36Sopenharmony_ci		kfree(fqdir);
17262306a36Sopenharmony_ci	}
17362306a36Sopenharmony_ci}
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_cistatic DECLARE_WORK(fqdir_free_work, fqdir_free_fn);
17662306a36Sopenharmony_ci
17762306a36Sopenharmony_cistatic void fqdir_work_fn(struct work_struct *work)
17862306a36Sopenharmony_ci{
17962306a36Sopenharmony_ci	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
18262306a36Sopenharmony_ci
18362306a36Sopenharmony_ci	if (llist_add(&fqdir->free_list, &fqdir_free_list))
18462306a36Sopenharmony_ci		queue_work(system_wq, &fqdir_free_work);
18562306a36Sopenharmony_ci}
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ciint fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
18862306a36Sopenharmony_ci{
18962306a36Sopenharmony_ci	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
19062306a36Sopenharmony_ci	int res;
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci	if (!fqdir)
19362306a36Sopenharmony_ci		return -ENOMEM;
19462306a36Sopenharmony_ci	fqdir->f = f;
19562306a36Sopenharmony_ci	fqdir->net = net;
19662306a36Sopenharmony_ci	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
19762306a36Sopenharmony_ci	if (res < 0) {
19862306a36Sopenharmony_ci		kfree(fqdir);
19962306a36Sopenharmony_ci		return res;
20062306a36Sopenharmony_ci	}
20162306a36Sopenharmony_ci	refcount_inc(&f->refcnt);
20262306a36Sopenharmony_ci	*fqdirp = fqdir;
20362306a36Sopenharmony_ci	return 0;
20462306a36Sopenharmony_ci}
20562306a36Sopenharmony_ciEXPORT_SYMBOL(fqdir_init);
20662306a36Sopenharmony_ci
/* Workqueue on which fqdir_exit() queues the per-fqdir destroy work. */
static struct workqueue_struct *inet_frag_wq;

/* Create inet_frag_wq at init time. Failure to create it is treated as
 * fatal (panic), so this only ever returns 0.
 */
static int __init inet_frag_wq_init(void)
{
	inet_frag_wq = create_workqueue("inet_frag_wq");
	if (!inet_frag_wq)
		panic("Could not create inet frag workq");
	return 0;
}

pure_initcall(inet_frag_wq_init);
21862306a36Sopenharmony_ci
/* Start asynchronous teardown of @fqdir.
 *
 * Nothing is freed here: fqdir_work_fn() is queued on inet_frag_wq and
 * performs the hash table destruction, after which the fqdir itself is
 * freed from fqdir_free_fn().
 */
void fqdir_exit(struct fqdir *fqdir)
{
	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
	queue_work(inet_frag_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);
22562306a36Sopenharmony_ci
22662306a36Sopenharmony_civoid inet_frag_kill(struct inet_frag_queue *fq)
22762306a36Sopenharmony_ci{
22862306a36Sopenharmony_ci	if (del_timer(&fq->timer))
22962306a36Sopenharmony_ci		refcount_dec(&fq->refcnt);
23062306a36Sopenharmony_ci
23162306a36Sopenharmony_ci	if (!(fq->flags & INET_FRAG_COMPLETE)) {
23262306a36Sopenharmony_ci		struct fqdir *fqdir = fq->fqdir;
23362306a36Sopenharmony_ci
23462306a36Sopenharmony_ci		fq->flags |= INET_FRAG_COMPLETE;
23562306a36Sopenharmony_ci		rcu_read_lock();
23662306a36Sopenharmony_ci		/* The RCU read lock provides a memory barrier
23762306a36Sopenharmony_ci		 * guaranteeing that if fqdir->dead is false then
23862306a36Sopenharmony_ci		 * the hash table destruction will not start until
23962306a36Sopenharmony_ci		 * after we unlock.  Paired with fqdir_pre_exit().
24062306a36Sopenharmony_ci		 */
24162306a36Sopenharmony_ci		if (!READ_ONCE(fqdir->dead)) {
24262306a36Sopenharmony_ci			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
24362306a36Sopenharmony_ci					       fqdir->f->rhash_params);
24462306a36Sopenharmony_ci			refcount_dec(&fq->refcnt);
24562306a36Sopenharmony_ci		} else {
24662306a36Sopenharmony_ci			fq->flags |= INET_FRAG_HASH_DEAD;
24762306a36Sopenharmony_ci		}
24862306a36Sopenharmony_ci		rcu_read_unlock();
24962306a36Sopenharmony_ci	}
25062306a36Sopenharmony_ci}
25162306a36Sopenharmony_ciEXPORT_SYMBOL(inet_frag_kill);
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_cistatic void inet_frag_destroy_rcu(struct rcu_head *head)
25462306a36Sopenharmony_ci{
25562306a36Sopenharmony_ci	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
25662306a36Sopenharmony_ci						 rcu);
25762306a36Sopenharmony_ci	struct inet_frags *f = q->fqdir->f;
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci	if (f->destructor)
26062306a36Sopenharmony_ci		f->destructor(q);
26162306a36Sopenharmony_ci	kmem_cache_free(f->frags_cachep, q);
26262306a36Sopenharmony_ci}
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ciunsigned int inet_frag_rbtree_purge(struct rb_root *root,
26562306a36Sopenharmony_ci				    enum skb_drop_reason reason)
26662306a36Sopenharmony_ci{
26762306a36Sopenharmony_ci	struct rb_node *p = rb_first(root);
26862306a36Sopenharmony_ci	unsigned int sum = 0;
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_ci	while (p) {
27162306a36Sopenharmony_ci		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_ci		p = rb_next(p);
27462306a36Sopenharmony_ci		rb_erase(&skb->rbnode, root);
27562306a36Sopenharmony_ci		while (skb) {
27662306a36Sopenharmony_ci			struct sk_buff *next = FRAG_CB(skb)->next_frag;
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci			sum += skb->truesize;
27962306a36Sopenharmony_ci			kfree_skb_reason(skb, reason);
28062306a36Sopenharmony_ci			skb = next;
28162306a36Sopenharmony_ci		}
28262306a36Sopenharmony_ci	}
28362306a36Sopenharmony_ci	return sum;
28462306a36Sopenharmony_ci}
28562306a36Sopenharmony_ciEXPORT_SYMBOL(inet_frag_rbtree_purge);
28662306a36Sopenharmony_ci
28762306a36Sopenharmony_civoid inet_frag_destroy(struct inet_frag_queue *q)
28862306a36Sopenharmony_ci{
28962306a36Sopenharmony_ci	unsigned int sum, sum_truesize = 0;
29062306a36Sopenharmony_ci	enum skb_drop_reason reason;
29162306a36Sopenharmony_ci	struct inet_frags *f;
29262306a36Sopenharmony_ci	struct fqdir *fqdir;
29362306a36Sopenharmony_ci
29462306a36Sopenharmony_ci	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
29562306a36Sopenharmony_ci	reason = (q->flags & INET_FRAG_DROP) ?
29662306a36Sopenharmony_ci			SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
29762306a36Sopenharmony_ci			SKB_CONSUMED;
29862306a36Sopenharmony_ci	WARN_ON(del_timer(&q->timer) != 0);
29962306a36Sopenharmony_ci
30062306a36Sopenharmony_ci	/* Release all fragment data. */
30162306a36Sopenharmony_ci	fqdir = q->fqdir;
30262306a36Sopenharmony_ci	f = fqdir->f;
30362306a36Sopenharmony_ci	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
30462306a36Sopenharmony_ci	sum = sum_truesize + f->qsize;
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_ci	call_rcu(&q->rcu, inet_frag_destroy_rcu);
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci	sub_frag_mem_limit(fqdir, sum);
30962306a36Sopenharmony_ci}
31062306a36Sopenharmony_ciEXPORT_SYMBOL(inet_frag_destroy);
31162306a36Sopenharmony_ci
/* Allocate and initialise a new, zeroed queue for @fqdir.
 *
 * @arg is passed through to the protocol constructor. The queue object is
 * charged to the fqdir memory accounting (f->qsize), and its timer is set
 * up with the protocol expiry handler but not armed.
 *
 * Returns the new queue, or NULL if the atomic allocation fails.
 */
static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->fqdir = fqdir;
	f->constructor(q, arg);
	add_frag_mem_limit(fqdir, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
	/* NOTE(review): presumably one reference each for the expiry timer,
	 * the hash table and the caller; inet_frag_kill() drops one when it
	 * cancels the timer and one when it unhashes the queue - confirm.
	 */
	refcount_set(&q->refcnt, 3);

	return q;
}
33262306a36Sopenharmony_ci
/* Allocate a queue for key @arg, arm its expiry timer and insert it in the
 * fqdir hash table.
 *
 * Returns the new queue with *@prev set to NULL on success. Otherwise
 * returns NULL and *@prev holds either ERR_PTR(-ENOMEM) (allocation
 * failure) or the non-NULL value returned by
 * rhashtable_lookup_get_insert_key(), in which case the freshly allocated
 * queue has already been torn down here.
 */
static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = fqdir->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(fqdir, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + fqdir->timeout);

	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		/* q was not inserted. Marking it complete first makes
		 * inet_frag_kill() skip the hash table removal and only
		 * deal with the timer; q is then destroyed directly.
		 */
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_ci/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
35962306a36Sopenharmony_cistruct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
36062306a36Sopenharmony_ci{
36162306a36Sopenharmony_ci	/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
36262306a36Sopenharmony_ci	long high_thresh = READ_ONCE(fqdir->high_thresh);
36362306a36Sopenharmony_ci	struct inet_frag_queue *fq = NULL, *prev;
36462306a36Sopenharmony_ci
36562306a36Sopenharmony_ci	if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
36662306a36Sopenharmony_ci		return NULL;
36762306a36Sopenharmony_ci
36862306a36Sopenharmony_ci	rcu_read_lock();
36962306a36Sopenharmony_ci
37062306a36Sopenharmony_ci	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
37162306a36Sopenharmony_ci	if (!prev)
37262306a36Sopenharmony_ci		fq = inet_frag_create(fqdir, key, &prev);
37362306a36Sopenharmony_ci	if (!IS_ERR_OR_NULL(prev)) {
37462306a36Sopenharmony_ci		fq = prev;
37562306a36Sopenharmony_ci		if (!refcount_inc_not_zero(&fq->refcnt))
37662306a36Sopenharmony_ci			fq = NULL;
37762306a36Sopenharmony_ci	}
37862306a36Sopenharmony_ci	rcu_read_unlock();
37962306a36Sopenharmony_ci	return fq;
38062306a36Sopenharmony_ci}
38162306a36Sopenharmony_ciEXPORT_SYMBOL(inet_frag_find);
38262306a36Sopenharmony_ci
/* Insert @skb, covering bytes [@offset, @end) of the datagram, into @q.
 *
 * Returns IPFRAG_OK when the skb was linked into the queue (its
 * ip_defrag_offset is then set to @offset), IPFRAG_DUP when it lies
 * entirely inside an existing run, and IPFRAG_OVERLAP when it partially
 * overlaps queued data. In the latter two cases @skb is not linked and
 * is not freed here.
 */
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID : 3089
	 *                          When reassembling an IPv6 datagram, if
	 *   one or more its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments) MUST be silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (last->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < last->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		/* Exactly adjacent to the tail: extend the last run;
		 * otherwise there is a gap and a new run is started.
		 */
		if (offset == last->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			/* Each tree node is a run head spanning
			 * [ip_defrag_offset, curr_run_end).
			 */
			curr_run_end = curr->ip_defrag_offset +
					FRAG_CB(curr)->frag_run_len;
			if (end <= curr->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= curr->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	skb->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
44662306a36Sopenharmony_ci
/* Prepare @q for reassembly, with @skb ending up as the head of the
 * reassembled datagram.
 *
 * If @skb is not already the first fragment in the tree, a clone takes
 * @skb's place in the queue and @skb is morphed into the current head,
 * which is then released. @parent is only used when @skb is not itself a
 * tree node: it is the skb whose next_frag is redirected to the clone.
 *
 * Returns an opaque cookie for inet_frag_reasm_finish() (the location
 * where the next fragment has to be chained), or NULL when an allocation
 * fails.
 */
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	struct sk_buff **nextp;
	int delta;

	if (head != skb) {
		/* fp takes over skb's position in the queue. */
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			return NULL;
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		/* skb is either inside a run (patch the predecessor's
		 * next_frag) or a run head (swap the tree node).
		 */
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;
		/* skb now becomes the head: it takes head's contents, run
		 * link and tree position, and the old head is dropped.
		 */
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(head->ip_defrag_offset != 0);

	/* Account for any truesize change caused by skb_unclone(). */
	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		return NULL;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->fqdir, delta);

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			return NULL;
		/* Move head's frag_list over to the new, data-less skb. */
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		/* plen: bytes held in head's page frags; the rest of
		 * head->data_len is what moved to clone.
		 */
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->fqdir, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		/* Further fragments are chained after clone. */
		nextp = &clone->next;
	} else {
		/* Further fragments start head's frag_list. */
		nextp = &skb_shinfo(head)->frag_list;
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
51562306a36Sopenharmony_ci
/* Chain every remaining fragment of @q onto @head and finalise @head.
 *
 * @reasm_data is the cookie returned by inet_frag_reasm_prepare(). When
 * @try_coalesce is true, fragments are merged into @head with
 * skb_try_coalesce() where that succeeds; all others are linked through
 * the chain that starts at the cookie. The truesize of @head and of all
 * fragments is removed from the fqdir memory accounting, and @head
 * receives the queue's timestamp.
 */
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data, bool try_coalesce)
{
	struct sk_buff **nextp = reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;
	int sum_truesize;

	/* Extend head's data area back to its network header. */
	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);

	sum_truesize = head->truesize;
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			/* Read the link first: fp may be freed below. */
			struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
			bool stolen;
			int delta;

			sum_truesize += fp->truesize;
			/* Mixed checksum states cannot be combined; matching
			 * CHECKSUM_COMPLETE values are folded together.
			 */
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);

			if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
							     &delta)) {
				kfree_skb_partial(fp, stolen);
			} else {
				/* Not coalesced: reset the fields used
				 * while queued and append fp to the chain.
				 */
				fp->prev = NULL;
				memset(&fp->rbnode, 0, sizeof(fp->rbnode));
				fp->sk = NULL;

				head->data_len += fp->len;
				head->len += fp->len;
				head->truesize += fp->truesize;

				*nextp = fp;
				nextp = &fp->next;
			}

			fp = next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->fqdir, sum_truesize);

	/* Terminate the chain and detach head from any list. */
	*nextp = NULL;
	skb_mark_not_on_list(head);
	head->prev = NULL;
	head->tstamp = q->stamp;
	head->mono_delivery_time = q->mono_delivery_time;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
58462306a36Sopenharmony_ci
/* Detach and return the first fragment of @q, or NULL if the queue holds
 * no fragment.
 *
 * When the fragment heads a run with further fragments, the next fragment
 * of the run replaces it in the tree; otherwise its node is erased. The
 * fragment's truesize is removed from the fqdir memory accounting.
 */
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head, *skb;

	head = skb_rb_first(&q->rb_fragments);
	if (!head)
		return NULL;
	skb = FRAG_CB(head)->next_frag;
	if (skb)
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
	else
		rb_erase(&head->rbnode, &q->rb_fragments);
	memset(&head->rbnode, 0, sizeof(head->rbnode));
	/* NOTE(review): compiler barrier after wiping rbnode; the reason is
	 * not evident from this file - confirm before touching.
	 */
	barrier();

	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->fqdir, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
609