1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
4 *		operating system.  INET is implemented using the BSD Socket
5 *		interface as the means of communication with the user level.
6 *
7 * Authors:	Lotsa people, from code originally in tcp
8 */
9
10#ifndef _INET_HASHTABLES_H
11#define _INET_HASHTABLES_H
12
13
14#include <linux/interrupt.h>
15#include <linux/ip.h>
16#include <linux/ipv6.h>
17#include <linux/list.h>
18#include <linux/slab.h>
19#include <linux/socket.h>
20#include <linux/spinlock.h>
21#include <linux/types.h>
22#include <linux/wait.h>
23
24#include <net/inet_connection_sock.h>
25#include <net/inet_sock.h>
26#include <net/sock.h>
27#include <net/route.h>
28#include <net/tcp_states.h>
29#include <net/netns/hash.h>
30
31#include <linux/refcount.h>
32#include <asm/byteorder.h>
33
/* This is for all connections with a full identity, no wildcards.
 * The 'e' prefix stands for "established", but in practice every socket
 * except the LISTEN ones is put here.
 */
struct inet_ehash_bucket {
	/* Head of this bucket's nulls-terminated socket chain. */
	struct hlist_nulls_head chain;
};
41
42/* There are a few simple rules, which allow for local port reuse by
43 * an application.  In essence:
44 *
45 *	1) Sockets bound to different interfaces may share a local port.
46 *	   Failing that, goto test 2.
47 *	2) If all sockets have sk->sk_reuse set, and none of them are in
48 *	   TCP_LISTEN state, the port may be shared.
49 *	   Failing that, goto test 3.
50 *	3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
51 *	   address, and none of them are the same, the port may be
52 *	   shared.
53 *	   Failing this, the port cannot be shared.
54 *
55 * The interesting point, is test #2.  This is what an FTP server does
56 * all day.  To optimize this case we use a specific flag bit defined
57 * below.  As we add sockets to a bind bucket list, we perform a
58 * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
59 * As long as all sockets added to a bind bucket pass this test,
60 * the flag bit will be set.
61 * The resulting situation is that tcp_v[46]_verify_bind() can just check
62 * for this flag bit, if it is set and the socket trying to bind has
63 * sk->sk_reuse set, we don't even have to walk the owners list at all,
64 * we return that it is ok to bind this socket to the requested local port.
65 *
66 * Sounds like a lot of work, but it is worth it.  In a more naive
67 * implementation (ie. current FreeBSD etc.) the entire list of ports
68 * must be walked for each data port opened by an ftp server.  Needless
69 * to say, this does not scale at all.  With a couple thousand FTP
70 * users logged onto your box, isn't it nice to know that new data
71 * ports are created in O(1) time?  I thought so. ;-)	-DaveM
72 */
/* NOTE(review): presumably the non-zero values stored in
 * inet_bind_bucket::fastreuseport; the exact difference between ANY and
 * STRICT is not visible in this header -- confirm against the bind code.
 */
#define FASTREUSEPORT_ANY	1
#define FASTREUSEPORT_STRICT	2
75
/* Per-port bind state.  Buckets are chained through @node (see
 * inet_bind_bucket_for_each()) and @owners heads the list of sockets using
 * the port.  The fast* members cache the reuse information discussed in the
 * comment above so that the owners list need not always be walked.
 */
struct inet_bind_bucket {
	possible_net_t		ib_net;		/* read through ib_net() */
	int			l3mdev;		/* presumably the L3 master device index -- TODO confirm */
	unsigned short		port;		/* local port this bucket stands for */
	signed char		fastreuse;	/* presumably the sk_reuse fast-path flag described above */
	signed char		fastreuseport;	/* presumably 0 or a FASTREUSEPORT_* value -- TODO confirm */
	kuid_t			fastuid;	/* cached uid for the fast path -- TODO confirm usage */
#if IS_ENABLED(CONFIG_IPV6)
	struct in6_addr		fast_v6_rcv_saddr;	/* cached IPv6 receive address (IPv6 builds only) */
#endif
	__be32			fast_rcv_saddr;	/* cached IPv4 receive address */
	unsigned short		fast_sk_family;	/* cached socket family */
	bool			fast_ipv6_only;	/* cached "IPv6 only" setting -- TODO confirm source field */
	struct hlist_node	node;		/* link used by inet_bind_bucket_for_each() */
	struct hlist_head	owners;		/* sockets owning this port */
};
92
93static inline struct net *ib_net(struct inet_bind_bucket *ib)
94{
95	return read_pnet(&ib->ib_net);
96}
97
/* Iterate @tb over every inet_bind_bucket linked through ->node on the
 * hlist @head.
 */
#define inet_bind_bucket_for_each(tb, head) \
	hlist_for_each_entry(tb, head, node)
100
/* One slot of the bind hash table (inet_hashinfo::bhash). */
struct inet_bind_hashbucket {
	spinlock_t		lock;	/* presumably serialises access to @chain -- TODO confirm */
	struct hlist_head	chain;	/* list walked with inet_bind_bucket_for_each() */
};
105
/* Sockets can be hashed in the established or in the listening table.
 * Every hash bucket must use a different 'nulls' end-of-chain value:
 * a socket might transition from ESTABLISHED to LISTEN state without an
 * RCU grace period, and a lookup in the ehash table needs to handle this
 * case.
 * NOTE(review): presumably the first nulls value used for listening
 * buckets -- confirm against the table initialisation code.
 */
#define LISTENING_NULLS_BASE (1U << 29)
/* One slot of a listener hash table (listening_hash[] or lhash2). */
struct inet_listen_hashbucket {
	spinlock_t		lock;	/* presumably protects the list below -- TODO confirm */
	unsigned int		count;	/* presumably the number of sockets in this bucket -- TODO confirm */
	/* The same storage viewed either as a plain hlist head or as a
	 * nulls-terminated hlist head.
	 */
	union {
		struct hlist_head	head;
		struct hlist_nulls_head	nulls_head;
	};
};
120
/* This is for listening sockets, thus all sockets which possess wildcards.
 * It sizes inet_hashinfo::listening_hash and must stay a power of two,
 * because inet_lhashfn() masks with INET_LHTABLE_SIZE - 1.
 */
#define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
123
/* Container for the socket hash tables of one protocol: the established
 * table (ehash), the bind table (bhash) and the two listener tables
 * (lhash2 and listening_hash).
 */
struct inet_hashinfo {
	/* This is for sockets with full identity only.  Sockets here will
	 * always be without wildcards and will have the following invariant:
	 *
	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
	 *
	 * ehash is indexed with (hash & ehash_mask), see inet_ehash_bucket();
	 * ehash_locks is indexed with (hash & ehash_locks_mask), see
	 * inet_ehash_lockp().
	 */
	struct inet_ehash_bucket	*ehash;
	spinlock_t			*ehash_locks;
	unsigned int			ehash_mask;
	unsigned int			ehash_locks_mask;

	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 * NOTE(review): bhash_size is presumably the value handed to
	 * inet_bhashfn(), which requires a power of two -- confirm.
	 */
	struct kmem_cache		*bind_bucket_cachep;
	struct inet_bind_hashbucket	*bhash;
	unsigned int			bhash_size;

	/* The 2nd listener table hashed by local port and address;
	 * indexed with (hash & lhash2_mask), see inet_lhash2_bucket().
	 */
	unsigned int			lhash2_mask;
	struct inet_listen_hashbucket	*lhash2;

	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 *
	 * Now align to a new cache line as all the following members
	 * might be often dirty.
	 */
	/* All sockets in TCP_LISTEN state will be in listening_hash.
	 * This is the only table where wildcard'd TCP sockets can
	 * exist.  listening_hash is only hashed by local port number.
	 * If lhash2 is initialized, the same socket will also be hashed
	 * to lhash2 by port and address.
	 */
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
					____cacheline_aligned_in_smp;
};
162
/* RCU walk over @list with @__icsk as the cursor; entries are linked
 * through their icsk_listen_portaddr_node member.
 */
#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
	hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
165
166static inline struct inet_listen_hashbucket *
167inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
168{
169	return &h->lhash2[hash & h->lhash2_mask];
170}
171
172static inline struct inet_ehash_bucket *inet_ehash_bucket(
173	struct inet_hashinfo *hashinfo,
174	unsigned int hash)
175{
176	return &hashinfo->ehash[hash & hashinfo->ehash_mask];
177}
178
179static inline spinlock_t *inet_ehash_lockp(
180	struct inet_hashinfo *hashinfo,
181	unsigned int hash)
182{
183	return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
184}
185
/* Counterpart of inet_ehash_locks_free().
 * NOTE(review): presumably allocates hashinfo->ehash_locks and returns 0 on
 * success or a negative errno -- confirm against the definition.
 */
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);
187
188static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h)
189{
190	kfree(h->lhash2);
191	h->lhash2 = NULL;
192}
193
194static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
195{
196	kvfree(hashinfo->ehash_locks);
197	hashinfo->ehash_locks = NULL;
198}
199
/* Create a bind bucket for port @snum in @net / @l3mdev, using @cachep.
 * NOTE(review): presumably links the new bucket on @head and returns NULL
 * when the allocation fails -- confirm against the definition.
 */
struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
			struct inet_bind_hashbucket *head,
			const unsigned short snum, int l3mdev);
/* Counterpart of inet_bind_bucket_create().
 * NOTE(review): presumably releases @tb back to @cachep only once it has no
 * owners left -- confirm against the definition.
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
			      struct inet_bind_bucket *tb);
206
207static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
208			       const u32 bhash_size)
209{
210	return (lport + net_hash_mix(net)) & (bhash_size - 1);
211}
212
/* NOTE(review): presumably records port @snum on @sk and adds @sk to the
 * owners of @tb -- confirm against the definition.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum);
215
216/* These can have wildcards, don't try too hard. */
217static inline u32 inet_lhashfn(const struct net *net, const unsigned short num)
218{
219	return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
220}
221
222static inline int inet_sk_listen_hashfn(const struct sock *sk)
223{
224	return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num);
225}
226
/* Caller must disable local BH processing.
 * NOTE(review): presumably makes @child share the bound port of @sk and
 * returns 0 or a negative errno -- confirm against the definition.
 */
int __inet_inherit_port(const struct sock *sk, struct sock *child);

/* NOTE(review): presumably releases the local port held by @sk -- confirm. */
void inet_put_port(struct sock *sk);
231
/* Initialisation entry points for struct inet_hashinfo.
 * NOTE(review): the "2" variants presumably set up the lhash2 table
 * (inet_hashinfo2_free_mod() is the visible teardown for the _mod variant);
 * the meaning of @numentries, @scale and the limits is not visible in this
 * header -- confirm against the definitions.
 */
void inet_hashinfo_init(struct inet_hashinfo *h);
void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
			 unsigned long numentries, int scale,
			 unsigned long low_limit,
			 unsigned long high_limit);
int inet_hashinfo2_init_mod(struct inet_hashinfo *h);
238
/* Hash-table insertion and removal entry points for a socket.
 * NOTE(review): the roles of @osk and @found_dup_sk are not visible in this
 * header; presumably @osk is a socket being replaced and *found_dup_sk
 * reports that an equivalent socket was already hashed -- confirm against
 * the definitions.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk,
			 bool *found_dup_sk);
int __inet_hash(struct sock *sk, struct sock *osk);
int inet_hash(struct sock *sk);
void inet_unhash(struct sock *sk);
245
/* Look up a listening socket.  Unlike @sport, which is a __be16, the
 * local port @hnum is an unsigned short; the wrappers in this header
 * obtain it with ntohs(), i.e. it is in host byte order.
 * NOTE(review): presumably returns NULL when no listener matches, and the
 * result is not reference-counted (inet_lookup() takes the reference
 * itself on this path) -- confirm against the definition.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, const __be16 sport,
				    const __be32 daddr,
				    const unsigned short hnum,
				    const int dif, const int sdif);
253
254static inline struct sock *inet_lookup_listener(struct net *net,
255		struct inet_hashinfo *hashinfo,
256		struct sk_buff *skb, int doff,
257		__be32 saddr, __be16 sport,
258		__be32 daddr, __be16 dport, int dif, int sdif)
259{
260	return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
261				      daddr, ntohs(dport), dif, sdif);
262}
263
/* Socket demux engine toys. */
/* What happens here is ugly: there is a pair of adjacent fields in
 * struct inet_sock, __be16 dport followed by __u16 num.  We want to
 * search by the pair, so we combine the keys into a single 32-bit value
 * and compare it with the 32-bit value read from &...->dport.  Let's at
 * least make sure that it's not mixed with anything else...
 * On 64-bit targets we combine the comparison of a pair of adjacent
 * __be32 fields in the same way.
 *
 * INET_COMBINED_PORTS() builds that 32-bit key: @__sport is treated as a
 * __be16 and @__dport as a plain __u32, and which of the two lands in the
 * upper 16 bits depends on the byte order of the target.
 */
#ifdef __BIG_ENDIAN
#define INET_COMBINED_PORTS(__sport, __dport) \
	((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport)))
#else /* __LITTLE_ENDIAN */
#define INET_COMBINED_PORTS(__sport, __dport) \
	((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport)))
#endif
280
/* Declare a const __addrpair called @__name that packs the two __be32
 * addresses into one 64-bit value.  On big-endian targets @__saddr sits in
 * the upper 32 bits, on little-endian targets @__daddr does.  The macro
 * expands to a declaration without a trailing semicolon.
 */
#ifdef __BIG_ENDIAN
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
	const __addrpair __name = (__force __addrpair) ( \
				   (((__force __u64)(__be32)(__saddr)) << 32) | \
				   ((__force __u64)(__be32)(__daddr)))
#else /* __LITTLE_ENDIAN */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
	const __addrpair __name = (__force __addrpair) ( \
				   (((__force __u64)(__be32)(__daddr)) << 32) | \
				   ((__force __u64)(__be32)(__saddr)))
#endif /* __BIG_ENDIAN */
292
293static inline bool INET_MATCH(struct net *net, const struct sock *sk,
294			      const __addrpair cookie, const __portpair ports,
295			      int dif, int sdif)
296{
297	if (!net_eq(sock_net(sk), net) ||
298	    sk->sk_portpair != ports ||
299	    sk->sk_addrpair != cookie)
300	        return false;
301
302	/* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
303	return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
304				    sdif);
305}
306
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
 * not check it for lookups anymore, thanks Alexey. -DaveM
 *
 * Unlike @sport, which is a __be16, the local port @hnum is a u16; the
 * wrappers in this header obtain it with ntohs().
 * NOTE(review): presumably returns NULL when nothing matches and otherwise
 * a socket with a reference already taken (__inet_lookup() reports
 * *refcounted = true for this path) -- confirm against the definition.
 */
struct sock *__inet_lookup_established(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif, const int sdif);
315
316static inline struct sock *
317	inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
318				const __be32 saddr, const __be16 sport,
319				const __be32 daddr, const __be16 dport,
320				const int dif)
321{
322	return __inet_lookup_established(net, hashinfo, saddr, sport, daddr,
323					 ntohs(dport), dif, 0);
324}
325
326static inline struct sock *__inet_lookup(struct net *net,
327					 struct inet_hashinfo *hashinfo,
328					 struct sk_buff *skb, int doff,
329					 const __be32 saddr, const __be16 sport,
330					 const __be32 daddr, const __be16 dport,
331					 const int dif, const int sdif,
332					 bool *refcounted)
333{
334	u16 hnum = ntohs(dport);
335	struct sock *sk;
336
337	sk = __inet_lookup_established(net, hashinfo, saddr, sport,
338				       daddr, hnum, dif, sdif);
339	*refcounted = true;
340	if (sk)
341		return sk;
342	*refcounted = false;
343	return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
344				      sport, daddr, hnum, dif, sdif);
345}
346
347static inline struct sock *inet_lookup(struct net *net,
348				       struct inet_hashinfo *hashinfo,
349				       struct sk_buff *skb, int doff,
350				       const __be32 saddr, const __be16 sport,
351				       const __be32 daddr, const __be16 dport,
352				       const int dif)
353{
354	struct sock *sk;
355	bool refcounted;
356
357	sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
358			   dport, dif, 0, &refcounted);
359
360	if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
361		sk = NULL;
362	return sk;
363}
364
365static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
366					     struct sk_buff *skb,
367					     int doff,
368					     const __be16 sport,
369					     const __be16 dport,
370					     const int sdif,
371					     bool *refcounted)
372{
373	struct sock *sk = skb_steal_sock(skb, refcounted);
374	const struct iphdr *iph = ip_hdr(skb);
375
376	if (sk)
377		return sk;
378
379	return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
380			     doff, iph->saddr, sport,
381			     iph->daddr, dport, inet_iif(skb), sdif,
382			     refcounted);
383}
384
/* NOTE(review): presumably the established-table hash for IPv6, computed
 * over the local (@laddr, @lport) and foreign (@faddr, @fport) endpoints.
 * Note the asymmetric types: @lport is a u16 while @fport is a __be16.
 */
u32 inet6_ehashfn(const struct net *net,
		  const struct in6_addr *laddr, const u16 lport,
		  const struct in6_addr *faddr, const __be16 fport);
388
/* Store @addr as the IPv4 destination address of @sk.  When IPv6 is
 * enabled, sk_v6_daddr is updated as well through
 * ipv6_addr_set_v4mapped(), so both views of the address stay in step.
 */
static inline void sk_daddr_set(struct sock *sk, __be32 addr)
{
	sk->sk_daddr = addr; /* alias of inet_daddr */
#if IS_ENABLED(CONFIG_IPV6)
	ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr);
#endif
}
396
/* Store @addr as the IPv4 receive (local) address of @sk.  When IPv6 is
 * enabled, sk_v6_rcv_saddr is updated as well through
 * ipv6_addr_set_v4mapped(), so both views of the address stay in step.
 */
static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr)
{
	sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */
#if IS_ENABLED(CONFIG_IPV6)
	ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr);
#endif
}
404
/* Connect-time hashing with a caller-supplied uniqueness check.
 * @check_established receives the death row, the socket, a __u16 port and
 * a location for a timewait socket pointer.
 * NOTE(review): presumably picks a local port starting from @port_offset
 * and returns 0 or a negative errno -- confirm against the definition.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
			struct sock *sk, u64 port_offset,
			int (*check_established)(struct inet_timewait_death_row *,
						 struct sock *, __u16,
						 struct inet_timewait_sock **));

/* NOTE(review): presumably the IPv4 front end of __inet_hash_connect() --
 * confirm against the definition.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk);
413#endif /* _INET_HASHTABLES_H */
414