162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * INETPEER - A storage for permanent information about peers 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * This source is covered by the GNU GPL, the same as all kernel sources. 562306a36Sopenharmony_ci * 662306a36Sopenharmony_ci * Authors: Andrey V. Savochkin <saw@msu.ru> 762306a36Sopenharmony_ci */ 862306a36Sopenharmony_ci 962306a36Sopenharmony_ci#include <linux/cache.h> 1062306a36Sopenharmony_ci#include <linux/module.h> 1162306a36Sopenharmony_ci#include <linux/types.h> 1262306a36Sopenharmony_ci#include <linux/slab.h> 1362306a36Sopenharmony_ci#include <linux/interrupt.h> 1462306a36Sopenharmony_ci#include <linux/spinlock.h> 1562306a36Sopenharmony_ci#include <linux/random.h> 1662306a36Sopenharmony_ci#include <linux/timer.h> 1762306a36Sopenharmony_ci#include <linux/time.h> 1862306a36Sopenharmony_ci#include <linux/kernel.h> 1962306a36Sopenharmony_ci#include <linux/mm.h> 2062306a36Sopenharmony_ci#include <linux/net.h> 2162306a36Sopenharmony_ci#include <linux/workqueue.h> 2262306a36Sopenharmony_ci#include <net/ip.h> 2362306a36Sopenharmony_ci#include <net/inetpeer.h> 2462306a36Sopenharmony_ci#include <net/secure_seq.h> 2562306a36Sopenharmony_ci 2662306a36Sopenharmony_ci/* 2762306a36Sopenharmony_ci * Theory of operations. 2862306a36Sopenharmony_ci * We keep one entry for each peer IP address. The nodes contains long-living 2962306a36Sopenharmony_ci * information about the peer which doesn't depend on routes. 3062306a36Sopenharmony_ci * 3162306a36Sopenharmony_ci * Nodes are removed only when reference counter goes to 0. 3262306a36Sopenharmony_ci * When it's happened the node may be removed when a sufficient amount of 3362306a36Sopenharmony_ci * time has been passed since its last use. The less-recently-used entry can 3462306a36Sopenharmony_ci * also be removed if the pool is overloaded i.e. if the total amount of 3562306a36Sopenharmony_ci * entries is greater-or-equal than the threshold. 3662306a36Sopenharmony_ci * 3762306a36Sopenharmony_ci * Node pool is organised as an RB tree. 3862306a36Sopenharmony_ci * Such an implementation has been chosen not just for fun. It's a way to 3962306a36Sopenharmony_ci * prevent easy and efficient DoS attacks by creating hash collisions. A huge 4062306a36Sopenharmony_ci * amount of long living nodes in a single hash slot would significantly delay 4162306a36Sopenharmony_ci * lookups performed with disabled BHs. 4262306a36Sopenharmony_ci * 4362306a36Sopenharmony_ci * Serialisation issues. 4462306a36Sopenharmony_ci * 1. Nodes may appear in the tree only with the pool lock held. 4562306a36Sopenharmony_ci * 2. Nodes may disappear from the tree only with the pool lock held 4662306a36Sopenharmony_ci * AND reference count being 0. 4762306a36Sopenharmony_ci * 3. Global variable peer_total is modified under the pool lock. 4862306a36Sopenharmony_ci * 4. struct inet_peer fields modification: 4962306a36Sopenharmony_ci * rb_node: pool lock 5062306a36Sopenharmony_ci * refcnt: atomically against modifications on other CPU; 5162306a36Sopenharmony_ci * usually under some other lock to prevent node disappearing 5262306a36Sopenharmony_ci * daddr: unchangeable 5362306a36Sopenharmony_ci */ 5462306a36Sopenharmony_ci 5562306a36Sopenharmony_cistatic struct kmem_cache *peer_cachep __ro_after_init; 5662306a36Sopenharmony_ci 5762306a36Sopenharmony_civoid inet_peer_base_init(struct inet_peer_base *bp) 5862306a36Sopenharmony_ci{ 5962306a36Sopenharmony_ci bp->rb_root = RB_ROOT; 6062306a36Sopenharmony_ci seqlock_init(&bp->lock); 6162306a36Sopenharmony_ci bp->total = 0; 6262306a36Sopenharmony_ci} 6362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(inet_peer_base_init); 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci#define PEER_MAX_GC 32 6662306a36Sopenharmony_ci 6762306a36Sopenharmony_ci/* Exported for sysctl_net_ipv4. */ 6862306a36Sopenharmony_ciint inet_peer_threshold __read_mostly; /* start to throw entries more 6962306a36Sopenharmony_ci * aggressively at this stage */ 7062306a36Sopenharmony_ciint inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ 7162306a36Sopenharmony_ciint inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ 7262306a36Sopenharmony_ci 7362306a36Sopenharmony_ci/* Called from ip_output.c:ip_init */ 7462306a36Sopenharmony_civoid __init inet_initpeers(void) 7562306a36Sopenharmony_ci{ 7662306a36Sopenharmony_ci u64 nr_entries; 7762306a36Sopenharmony_ci 7862306a36Sopenharmony_ci /* 1% of physical memory */ 7962306a36Sopenharmony_ci nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT, 8062306a36Sopenharmony_ci 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer))); 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_ci inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_ci peer_cachep = kmem_cache_create("inet_peer_cache", 8562306a36Sopenharmony_ci sizeof(struct inet_peer), 8662306a36Sopenharmony_ci 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, 8762306a36Sopenharmony_ci NULL); 8862306a36Sopenharmony_ci} 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci/* Called with rcu_read_lock() or base->lock held */ 9162306a36Sopenharmony_cistatic struct inet_peer *lookup(const struct inetpeer_addr *daddr, 9262306a36Sopenharmony_ci struct inet_peer_base *base, 9362306a36Sopenharmony_ci unsigned int seq, 9462306a36Sopenharmony_ci struct inet_peer *gc_stack[], 9562306a36Sopenharmony_ci unsigned int *gc_cnt, 9662306a36Sopenharmony_ci struct rb_node **parent_p, 9762306a36Sopenharmony_ci struct rb_node ***pp_p) 9862306a36Sopenharmony_ci{ 9962306a36Sopenharmony_ci struct rb_node **pp, *parent, *next; 10062306a36Sopenharmony_ci struct inet_peer *p; 10162306a36Sopenharmony_ci 10262306a36Sopenharmony_ci pp = &base->rb_root.rb_node; 10362306a36Sopenharmony_ci parent = NULL; 10462306a36Sopenharmony_ci while (1) { 10562306a36Sopenharmony_ci int cmp; 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci next = rcu_dereference_raw(*pp); 10862306a36Sopenharmony_ci if (!next) 10962306a36Sopenharmony_ci break; 11062306a36Sopenharmony_ci parent = next; 11162306a36Sopenharmony_ci p = rb_entry(parent, struct inet_peer, rb_node); 11262306a36Sopenharmony_ci cmp = inetpeer_addr_cmp(daddr, &p->daddr); 11362306a36Sopenharmony_ci if (cmp == 0) { 11462306a36Sopenharmony_ci if (!refcount_inc_not_zero(&p->refcnt)) 11562306a36Sopenharmony_ci break; 11662306a36Sopenharmony_ci return p; 11762306a36Sopenharmony_ci } 11862306a36Sopenharmony_ci if (gc_stack) { 11962306a36Sopenharmony_ci if (*gc_cnt < PEER_MAX_GC) 12062306a36Sopenharmony_ci gc_stack[(*gc_cnt)++] = p; 12162306a36Sopenharmony_ci } else if (unlikely(read_seqretry(&base->lock, seq))) { 12262306a36Sopenharmony_ci break; 12362306a36Sopenharmony_ci } 12462306a36Sopenharmony_ci if (cmp == -1) 12562306a36Sopenharmony_ci pp = &next->rb_left; 12662306a36Sopenharmony_ci else 12762306a36Sopenharmony_ci pp = &next->rb_right; 12862306a36Sopenharmony_ci } 12962306a36Sopenharmony_ci *parent_p = parent; 13062306a36Sopenharmony_ci *pp_p = pp; 13162306a36Sopenharmony_ci return NULL; 13262306a36Sopenharmony_ci} 13362306a36Sopenharmony_ci 13462306a36Sopenharmony_cistatic void inetpeer_free_rcu(struct rcu_head *head) 13562306a36Sopenharmony_ci{ 13662306a36Sopenharmony_ci kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); 13762306a36Sopenharmony_ci} 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci/* perform garbage collect on all items stacked during a lookup */ 14062306a36Sopenharmony_cistatic void inet_peer_gc(struct inet_peer_base *base, 14162306a36Sopenharmony_ci struct inet_peer *gc_stack[], 14262306a36Sopenharmony_ci unsigned int gc_cnt) 14362306a36Sopenharmony_ci{ 14462306a36Sopenharmony_ci int peer_threshold, peer_maxttl, peer_minttl; 14562306a36Sopenharmony_ci struct inet_peer *p; 14662306a36Sopenharmony_ci __u32 delta, ttl; 14762306a36Sopenharmony_ci int i; 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci peer_threshold = READ_ONCE(inet_peer_threshold); 15062306a36Sopenharmony_ci peer_maxttl = READ_ONCE(inet_peer_maxttl); 15162306a36Sopenharmony_ci peer_minttl = READ_ONCE(inet_peer_minttl); 15262306a36Sopenharmony_ci 15362306a36Sopenharmony_ci if (base->total >= peer_threshold) 15462306a36Sopenharmony_ci ttl = 0; /* be aggressive */ 15562306a36Sopenharmony_ci else 15662306a36Sopenharmony_ci ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ * 15762306a36Sopenharmony_ci base->total / peer_threshold * HZ; 15862306a36Sopenharmony_ci for (i = 0; i < gc_cnt; i++) { 15962306a36Sopenharmony_ci p = gc_stack[i]; 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci /* The READ_ONCE() pairs with the WRITE_ONCE() 16262306a36Sopenharmony_ci * in inet_putpeer() 16362306a36Sopenharmony_ci */ 16462306a36Sopenharmony_ci delta = (__u32)jiffies - READ_ONCE(p->dtime); 16562306a36Sopenharmony_ci 16662306a36Sopenharmony_ci if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) 16762306a36Sopenharmony_ci gc_stack[i] = NULL; 16862306a36Sopenharmony_ci } 16962306a36Sopenharmony_ci for (i = 0; i < gc_cnt; i++) { 17062306a36Sopenharmony_ci p = gc_stack[i]; 17162306a36Sopenharmony_ci if (p) { 17262306a36Sopenharmony_ci rb_erase(&p->rb_node, &base->rb_root); 17362306a36Sopenharmony_ci base->total--; 17462306a36Sopenharmony_ci call_rcu(&p->rcu, inetpeer_free_rcu); 17562306a36Sopenharmony_ci } 17662306a36Sopenharmony_ci } 17762306a36Sopenharmony_ci} 17862306a36Sopenharmony_ci 17962306a36Sopenharmony_cistruct inet_peer *inet_getpeer(struct inet_peer_base *base, 18062306a36Sopenharmony_ci const struct inetpeer_addr *daddr, 18162306a36Sopenharmony_ci int create) 18262306a36Sopenharmony_ci{ 18362306a36Sopenharmony_ci struct inet_peer *p, *gc_stack[PEER_MAX_GC]; 18462306a36Sopenharmony_ci struct rb_node **pp, *parent; 18562306a36Sopenharmony_ci unsigned int gc_cnt, seq; 18662306a36Sopenharmony_ci int invalidated; 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci /* Attempt a lockless lookup first. 18962306a36Sopenharmony_ci * Because of a concurrent writer, we might not find an existing entry. 19062306a36Sopenharmony_ci */ 19162306a36Sopenharmony_ci rcu_read_lock(); 19262306a36Sopenharmony_ci seq = read_seqbegin(&base->lock); 19362306a36Sopenharmony_ci p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); 19462306a36Sopenharmony_ci invalidated = read_seqretry(&base->lock, seq); 19562306a36Sopenharmony_ci rcu_read_unlock(); 19662306a36Sopenharmony_ci 19762306a36Sopenharmony_ci if (p) 19862306a36Sopenharmony_ci return p; 19962306a36Sopenharmony_ci 20062306a36Sopenharmony_ci /* If no writer did a change during our lookup, we can return early. */ 20162306a36Sopenharmony_ci if (!create && !invalidated) 20262306a36Sopenharmony_ci return NULL; 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci /* retry an exact lookup, taking the lock before. 20562306a36Sopenharmony_ci * At least, nodes should be hot in our cache. 20662306a36Sopenharmony_ci */ 20762306a36Sopenharmony_ci parent = NULL; 20862306a36Sopenharmony_ci write_seqlock_bh(&base->lock); 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci gc_cnt = 0; 21162306a36Sopenharmony_ci p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); 21262306a36Sopenharmony_ci if (!p && create) { 21362306a36Sopenharmony_ci p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); 21462306a36Sopenharmony_ci if (p) { 21562306a36Sopenharmony_ci p->daddr = *daddr; 21662306a36Sopenharmony_ci p->dtime = (__u32)jiffies; 21762306a36Sopenharmony_ci refcount_set(&p->refcnt, 2); 21862306a36Sopenharmony_ci atomic_set(&p->rid, 0); 21962306a36Sopenharmony_ci p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 22062306a36Sopenharmony_ci p->rate_tokens = 0; 22162306a36Sopenharmony_ci p->n_redirects = 0; 22262306a36Sopenharmony_ci /* 60*HZ is arbitrary, but chosen enough high so that the first 22362306a36Sopenharmony_ci * calculation of tokens is at its maximum. 22462306a36Sopenharmony_ci */ 22562306a36Sopenharmony_ci p->rate_last = jiffies - 60*HZ; 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_ci rb_link_node(&p->rb_node, parent, pp); 22862306a36Sopenharmony_ci rb_insert_color(&p->rb_node, &base->rb_root); 22962306a36Sopenharmony_ci base->total++; 23062306a36Sopenharmony_ci } 23162306a36Sopenharmony_ci } 23262306a36Sopenharmony_ci if (gc_cnt) 23362306a36Sopenharmony_ci inet_peer_gc(base, gc_stack, gc_cnt); 23462306a36Sopenharmony_ci write_sequnlock_bh(&base->lock); 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci return p; 23762306a36Sopenharmony_ci} 23862306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(inet_getpeer); 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_civoid inet_putpeer(struct inet_peer *p) 24162306a36Sopenharmony_ci{ 24262306a36Sopenharmony_ci /* The WRITE_ONCE() pairs with itself (we run lockless) 24362306a36Sopenharmony_ci * and the READ_ONCE() in inet_peer_gc() 24462306a36Sopenharmony_ci */ 24562306a36Sopenharmony_ci WRITE_ONCE(p->dtime, (__u32)jiffies); 24662306a36Sopenharmony_ci 24762306a36Sopenharmony_ci if (refcount_dec_and_test(&p->refcnt)) 24862306a36Sopenharmony_ci call_rcu(&p->rcu, inetpeer_free_rcu); 24962306a36Sopenharmony_ci} 25062306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(inet_putpeer); 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci/* 25362306a36Sopenharmony_ci * Check transmit rate limitation for given message. 25462306a36Sopenharmony_ci * The rate information is held in the inet_peer entries now. 25562306a36Sopenharmony_ci * This function is generic and could be used for other purposes 25662306a36Sopenharmony_ci * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. 25762306a36Sopenharmony_ci * 25862306a36Sopenharmony_ci * Note that the same inet_peer fields are modified by functions in 25962306a36Sopenharmony_ci * route.c too, but these work for packet destinations while xrlim_allow 26062306a36Sopenharmony_ci * works for icmp destinations. This means the rate limiting information 26162306a36Sopenharmony_ci * for one "ip object" is shared - and these ICMPs are twice limited: 26262306a36Sopenharmony_ci * by source and by destination. 26362306a36Sopenharmony_ci * 26462306a36Sopenharmony_ci * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate 26562306a36Sopenharmony_ci * SHOULD allow setting of rate limits 26662306a36Sopenharmony_ci * 26762306a36Sopenharmony_ci * Shared between ICMPv4 and ICMPv6. 26862306a36Sopenharmony_ci */ 26962306a36Sopenharmony_ci#define XRLIM_BURST_FACTOR 6 27062306a36Sopenharmony_cibool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) 27162306a36Sopenharmony_ci{ 27262306a36Sopenharmony_ci unsigned long now, token; 27362306a36Sopenharmony_ci bool rc = false; 27462306a36Sopenharmony_ci 27562306a36Sopenharmony_ci if (!peer) 27662306a36Sopenharmony_ci return true; 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci token = peer->rate_tokens; 27962306a36Sopenharmony_ci now = jiffies; 28062306a36Sopenharmony_ci token += now - peer->rate_last; 28162306a36Sopenharmony_ci peer->rate_last = now; 28262306a36Sopenharmony_ci if (token > XRLIM_BURST_FACTOR * timeout) 28362306a36Sopenharmony_ci token = XRLIM_BURST_FACTOR * timeout; 28462306a36Sopenharmony_ci if (token >= timeout) { 28562306a36Sopenharmony_ci token -= timeout; 28662306a36Sopenharmony_ci rc = true; 28762306a36Sopenharmony_ci } 28862306a36Sopenharmony_ci peer->rate_tokens = token; 28962306a36Sopenharmony_ci return rc; 29062306a36Sopenharmony_ci} 29162306a36Sopenharmony_ciEXPORT_SYMBOL(inet_peer_xrlim_allow); 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_civoid inetpeer_invalidate_tree(struct inet_peer_base *base) 29462306a36Sopenharmony_ci{ 29562306a36Sopenharmony_ci struct rb_node *p = rb_first(&base->rb_root); 29662306a36Sopenharmony_ci 29762306a36Sopenharmony_ci while (p) { 29862306a36Sopenharmony_ci struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_ci p = rb_next(p); 30162306a36Sopenharmony_ci rb_erase(&peer->rb_node, &base->rb_root); 30262306a36Sopenharmony_ci inet_putpeer(peer); 30362306a36Sopenharmony_ci cond_resched(); 30462306a36Sopenharmony_ci } 30562306a36Sopenharmony_ci 30662306a36Sopenharmony_ci base->total = 0; 30762306a36Sopenharmony_ci} 30862306a36Sopenharmony_ciEXPORT_SYMBOL(inetpeer_invalidate_tree); 309