1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 static void sock_inuse_add(struct net *net, int val);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
153  * Test to see if the opener of the socket had when the socket was
154  * created and the current process has the capability @cap in the user
155  * namespace @user_ns.
156  */
sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap)157 bool sk_ns_capable(const struct sock *sk,
158 		   struct user_namespace *user_ns, int cap)
159 {
160 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 		ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
170  * Test to see if the opener of the socket had when the socket was
171  * created and the current process has the capability @cap in all user
172  * namespaces.
173  */
sk_capable(const struct sock *sk, int cap)174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 	return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
185  * Test to see if the opener of the socket had when the socket was created
186  * and the current process has the capability @cap over the network namespace
187  * the socket is a member of.
188  */
sk_net_capable(const struct sock *sk, int cap)189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
194 
195 /*
196  * Each address family might have different locking rules, so we have
197  * one slock key per address family and separate keys for internal and
198  * userspace sockets.
199  */
200 static struct lock_class_key af_family_keys[AF_MAX];
201 static struct lock_class_key af_family_kern_keys[AF_MAX];
202 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 
205 /*
206  * Make lock validator output more readable. (we pre-construct these
207  * strings build-time, so that runtime initialization of socket
208  * locks is fast):
209  */
210 
211 #define _sock_locks(x)						  \
212   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
213   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
214   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
215   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
216   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
217   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
218   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
219   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
220   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
221   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
222   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
223   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
224   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
225   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
226   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
227   x "AF_MAX"
228 
229 static const char *const af_family_key_strings[AF_MAX+1] = {
230 	_sock_locks("sk_lock-")
231 };
232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 	_sock_locks("slock-")
234 };
235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 	_sock_locks("clock-")
237 };
238 
239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 	_sock_locks("k-sk_lock-")
241 };
242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-slock-")
244 };
245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-clock-")
247 };
248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249 	_sock_locks("rlock-")
250 };
251 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
252 	_sock_locks("wlock-")
253 };
254 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
255 	_sock_locks("elock-")
256 };
257 
258 /*
259  * sk_callback_lock and sk queues locking rules are per-address-family,
260  * so split the lock classes by using a per-AF key:
261  */
262 static struct lock_class_key af_callback_keys[AF_MAX];
263 static struct lock_class_key af_rlock_keys[AF_MAX];
264 static struct lock_class_key af_wlock_keys[AF_MAX];
265 static struct lock_class_key af_elock_keys[AF_MAX];
266 static struct lock_class_key af_kern_callback_keys[AF_MAX];
267 
268 /* Run time adjustable parameters. */
269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270 EXPORT_SYMBOL(sysctl_wmem_max);
271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272 EXPORT_SYMBOL(sysctl_rmem_max);
273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275 
276 /* Maximal space eaten by iovec or ancillary data plus some space */
277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278 EXPORT_SYMBOL(sysctl_optmem_max);
279 
280 int sysctl_tstamp_allow_data __read_mostly = 1;
281 
282 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
283 EXPORT_SYMBOL_GPL(memalloc_socks_key);
284 
285 /**
286  * sk_set_memalloc - sets %SOCK_MEMALLOC
287  * @sk: socket to set it on
288  *
289  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290  * It's the responsibility of the admin to adjust min_free_kbytes
291  * to meet the requirements
292  */
sk_set_memalloc(struct sock *sk)293 void sk_set_memalloc(struct sock *sk)
294 {
295 	sock_set_flag(sk, SOCK_MEMALLOC);
296 	sk->sk_allocation |= __GFP_MEMALLOC;
297 	static_branch_inc(&memalloc_socks_key);
298 }
299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
300 
sk_clear_memalloc(struct sock *sk)301 void sk_clear_memalloc(struct sock *sk)
302 {
303 	sock_reset_flag(sk, SOCK_MEMALLOC);
304 	sk->sk_allocation &= ~__GFP_MEMALLOC;
305 	static_branch_dec(&memalloc_socks_key);
306 
307 	/*
308 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 	 * it has rmem allocations due to the last swapfile being deactivated
311 	 * but there is a risk that the socket is unusable due to exceeding
312 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 	 */
314 	sk_mem_reclaim(sk);
315 }
316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
317 
__sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 	int ret;
321 	unsigned int noreclaim_flag;
322 
323 	/* these should have been dropped before queueing */
324 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325 
326 	noreclaim_flag = memalloc_noreclaim_save();
327 	ret = sk->sk_backlog_rcv(sk, skb);
328 	memalloc_noreclaim_restore(noreclaim_flag);
329 
330 	return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333 
sock_get_timeout(long timeo, void *optval, bool old_timeval)334 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335 {
336 	struct __kernel_sock_timeval tv;
337 
338 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
339 		tv.tv_sec = 0;
340 		tv.tv_usec = 0;
341 	} else {
342 		tv.tv_sec = timeo / HZ;
343 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
344 	}
345 
346 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
347 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
348 		*(struct old_timeval32 *)optval = tv32;
349 		return sizeof(tv32);
350 	}
351 
352 	if (old_timeval) {
353 		struct __kernel_old_timeval old_tv;
354 		old_tv.tv_sec = tv.tv_sec;
355 		old_tv.tv_usec = tv.tv_usec;
356 		*(struct __kernel_old_timeval *)optval = old_tv;
357 		return sizeof(old_tv);
358 	}
359 
360 	*(struct __kernel_sock_timeval *)optval = tv;
361 	return sizeof(tv);
362 }
363 
sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, bool old_timeval)364 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
365 			    bool old_timeval)
366 {
367 	struct __kernel_sock_timeval tv;
368 
369 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
370 		struct old_timeval32 tv32;
371 
372 		if (optlen < sizeof(tv32))
373 			return -EINVAL;
374 
375 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
376 			return -EFAULT;
377 		tv.tv_sec = tv32.tv_sec;
378 		tv.tv_usec = tv32.tv_usec;
379 	} else if (old_timeval) {
380 		struct __kernel_old_timeval old_tv;
381 
382 		if (optlen < sizeof(old_tv))
383 			return -EINVAL;
384 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
385 			return -EFAULT;
386 		tv.tv_sec = old_tv.tv_sec;
387 		tv.tv_usec = old_tv.tv_usec;
388 	} else {
389 		if (optlen < sizeof(tv))
390 			return -EINVAL;
391 		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
392 			return -EFAULT;
393 	}
394 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
395 		return -EDOM;
396 
397 	if (tv.tv_sec < 0) {
398 		static int warned __read_mostly;
399 
400 		*timeo_p = 0;
401 		if (warned < 10 && net_ratelimit()) {
402 			warned++;
403 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
404 				__func__, current->comm, task_pid_nr(current));
405 		}
406 		return 0;
407 	}
408 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
409 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
410 		return 0;
411 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
412 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
413 	return 0;
414 }
415 
sock_needs_netstamp(const struct sock *sk)416 static bool sock_needs_netstamp(const struct sock *sk)
417 {
418 	switch (sk->sk_family) {
419 	case AF_UNSPEC:
420 	case AF_UNIX:
421 		return false;
422 	default:
423 		return true;
424 	}
425 }
426 
sock_disable_timestamp(struct sock *sk, unsigned long flags)427 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428 {
429 	if (sk->sk_flags & flags) {
430 		sk->sk_flags &= ~flags;
431 		if (sock_needs_netstamp(sk) &&
432 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
433 			net_disable_timestamp();
434 	}
435 }
436 
437 
__sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)438 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439 {
440 	unsigned long flags;
441 	struct sk_buff_head *list = &sk->sk_receive_queue;
442 
443 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
444 		atomic_inc(&sk->sk_drops);
445 		trace_sock_rcvqueue_full(sk, skb);
446 		return -ENOMEM;
447 	}
448 
449 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
450 		atomic_inc(&sk->sk_drops);
451 		return -ENOBUFS;
452 	}
453 
454 	skb->dev = NULL;
455 	skb_set_owner_r(skb, sk);
456 
457 	/* we escape from rcu protected region, make sure we dont leak
458 	 * a norefcounted dst
459 	 */
460 	skb_dst_force(skb);
461 
462 	spin_lock_irqsave(&list->lock, flags);
463 	sock_skb_set_dropcount(sk, skb);
464 	__skb_queue_tail(list, skb);
465 	spin_unlock_irqrestore(&list->lock, flags);
466 
467 	if (!sock_flag(sk, SOCK_DEAD))
468 		sk->sk_data_ready(sk);
469 	return 0;
470 }
471 EXPORT_SYMBOL(__sock_queue_rcv_skb);
472 
sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)473 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
474 {
475 	int err;
476 
477 	err = sk_filter(sk, skb);
478 	if (err)
479 		return err;
480 
481 	return __sock_queue_rcv_skb(sk, skb);
482 }
483 EXPORT_SYMBOL(sock_queue_rcv_skb);
484 
__sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted)485 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
486 		     const int nested, unsigned int trim_cap, bool refcounted)
487 {
488 	int rc = NET_RX_SUCCESS;
489 
490 	if (sk_filter_trim_cap(sk, skb, trim_cap))
491 		goto discard_and_relse;
492 
493 	skb->dev = NULL;
494 
495 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
496 		atomic_inc(&sk->sk_drops);
497 		goto discard_and_relse;
498 	}
499 	if (nested)
500 		bh_lock_sock_nested(sk);
501 	else
502 		bh_lock_sock(sk);
503 	if (!sock_owned_by_user(sk)) {
504 		/*
505 		 * trylock + unlock semantics:
506 		 */
507 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
508 
509 		rc = sk_backlog_rcv(sk, skb);
510 
511 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
512 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
513 		bh_unlock_sock(sk);
514 		atomic_inc(&sk->sk_drops);
515 		goto discard_and_relse;
516 	}
517 
518 	bh_unlock_sock(sk);
519 out:
520 	if (refcounted)
521 		sock_put(sk);
522 	return rc;
523 discard_and_relse:
524 	kfree_skb(skb);
525 	goto out;
526 }
527 EXPORT_SYMBOL(__sk_receive_skb);
528 
__sk_dst_check(struct sock *sk, u32 cookie)529 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
530 {
531 	struct dst_entry *dst = __sk_dst_get(sk);
532 
533 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
534 		sk_tx_queue_clear(sk);
535 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
536 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
537 		dst_release(dst);
538 		return NULL;
539 	}
540 
541 	return dst;
542 }
543 EXPORT_SYMBOL(__sk_dst_check);
544 
sk_dst_check(struct sock *sk, u32 cookie)545 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
546 {
547 	struct dst_entry *dst = sk_dst_get(sk);
548 
549 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
550 		sk_dst_reset(sk);
551 		dst_release(dst);
552 		return NULL;
553 	}
554 
555 	return dst;
556 }
557 EXPORT_SYMBOL(sk_dst_check);
558 
sock_bindtoindex_locked(struct sock *sk, int ifindex)559 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
560 {
561 	int ret = -ENOPROTOOPT;
562 #ifdef CONFIG_NETDEVICES
563 	struct net *net = sock_net(sk);
564 
565 	/* Sorry... */
566 	ret = -EPERM;
567 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
568 		goto out;
569 
570 	ret = -EINVAL;
571 	if (ifindex < 0)
572 		goto out;
573 
574 	sk->sk_bound_dev_if = ifindex;
575 	if (sk->sk_prot->rehash)
576 		sk->sk_prot->rehash(sk);
577 	sk_dst_reset(sk);
578 
579 	ret = 0;
580 
581 out:
582 #endif
583 
584 	return ret;
585 }
586 
sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)587 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
588 {
589 	int ret;
590 
591 	if (lock_sk)
592 		lock_sock(sk);
593 	ret = sock_bindtoindex_locked(sk, ifindex);
594 	if (lock_sk)
595 		release_sock(sk);
596 
597 	return ret;
598 }
599 EXPORT_SYMBOL(sock_bindtoindex);
600 
sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)601 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
602 {
603 	int ret = -ENOPROTOOPT;
604 #ifdef CONFIG_NETDEVICES
605 	struct net *net = sock_net(sk);
606 	char devname[IFNAMSIZ];
607 	int index;
608 
609 	ret = -EINVAL;
610 	if (optlen < 0)
611 		goto out;
612 
613 	/* Bind this socket to a particular device like "eth0",
614 	 * as specified in the passed interface name. If the
615 	 * name is "" or the option length is zero the socket
616 	 * is not bound.
617 	 */
618 	if (optlen > IFNAMSIZ - 1)
619 		optlen = IFNAMSIZ - 1;
620 	memset(devname, 0, sizeof(devname));
621 
622 	ret = -EFAULT;
623 	if (copy_from_sockptr(devname, optval, optlen))
624 		goto out;
625 
626 	index = 0;
627 	if (devname[0] != '\0') {
628 		struct net_device *dev;
629 
630 		rcu_read_lock();
631 		dev = dev_get_by_name_rcu(net, devname);
632 		if (dev)
633 			index = dev->ifindex;
634 		rcu_read_unlock();
635 		ret = -ENODEV;
636 		if (!dev)
637 			goto out;
638 	}
639 
640 	return sock_bindtoindex(sk, index, true);
641 out:
642 #endif
643 
644 	return ret;
645 }
646 
sock_getbindtodevice(struct sock *sk, char __user *optval, int __user *optlen, int len)647 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
648 				int __user *optlen, int len)
649 {
650 	int ret = -ENOPROTOOPT;
651 #ifdef CONFIG_NETDEVICES
652 	struct net *net = sock_net(sk);
653 	char devname[IFNAMSIZ];
654 
655 	if (sk->sk_bound_dev_if == 0) {
656 		len = 0;
657 		goto zero;
658 	}
659 
660 	ret = -EINVAL;
661 	if (len < IFNAMSIZ)
662 		goto out;
663 
664 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
665 	if (ret)
666 		goto out;
667 
668 	len = strlen(devname) + 1;
669 
670 	ret = -EFAULT;
671 	if (copy_to_user(optval, devname, len))
672 		goto out;
673 
674 zero:
675 	ret = -EFAULT;
676 	if (put_user(len, optlen))
677 		goto out;
678 
679 	ret = 0;
680 
681 out:
682 #endif
683 
684 	return ret;
685 }
686 
sk_mc_loop(struct sock *sk)687 bool sk_mc_loop(struct sock *sk)
688 {
689 	if (dev_recursion_level())
690 		return false;
691 	if (!sk)
692 		return true;
693 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
694 	switch (READ_ONCE(sk->sk_family)) {
695 	case AF_INET:
696 		return inet_sk(sk)->mc_loop;
697 #if IS_ENABLED(CONFIG_IPV6)
698 	case AF_INET6:
699 		return inet6_sk(sk)->mc_loop;
700 #endif
701 	}
702 	WARN_ON_ONCE(1);
703 	return true;
704 }
705 EXPORT_SYMBOL(sk_mc_loop);
706 
sock_set_reuseaddr(struct sock *sk)707 void sock_set_reuseaddr(struct sock *sk)
708 {
709 	lock_sock(sk);
710 	sk->sk_reuse = SK_CAN_REUSE;
711 	release_sock(sk);
712 }
713 EXPORT_SYMBOL(sock_set_reuseaddr);
714 
sock_set_reuseport(struct sock *sk)715 void sock_set_reuseport(struct sock *sk)
716 {
717 	lock_sock(sk);
718 	sk->sk_reuseport = true;
719 	release_sock(sk);
720 }
721 EXPORT_SYMBOL(sock_set_reuseport);
722 
sock_no_linger(struct sock *sk)723 void sock_no_linger(struct sock *sk)
724 {
725 	lock_sock(sk);
726 	sk->sk_lingertime = 0;
727 	sock_set_flag(sk, SOCK_LINGER);
728 	release_sock(sk);
729 }
730 EXPORT_SYMBOL(sock_no_linger);
731 
sock_set_priority(struct sock *sk, u32 priority)732 void sock_set_priority(struct sock *sk, u32 priority)
733 {
734 	lock_sock(sk);
735 	sk->sk_priority = priority;
736 	release_sock(sk);
737 }
738 EXPORT_SYMBOL(sock_set_priority);
739 
sock_set_sndtimeo(struct sock *sk, s64 secs)740 void sock_set_sndtimeo(struct sock *sk, s64 secs)
741 {
742 	lock_sock(sk);
743 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
744 		sk->sk_sndtimeo = secs * HZ;
745 	else
746 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
747 	release_sock(sk);
748 }
749 EXPORT_SYMBOL(sock_set_sndtimeo);
750 
__sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)751 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
752 {
753 	if (val)  {
754 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
755 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
756 		sock_set_flag(sk, SOCK_RCVTSTAMP);
757 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
758 	} else {
759 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
760 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
761 	}
762 }
763 
sock_enable_timestamps(struct sock *sk)764 void sock_enable_timestamps(struct sock *sk)
765 {
766 	lock_sock(sk);
767 	__sock_set_timestamps(sk, true, false, true);
768 	release_sock(sk);
769 }
770 EXPORT_SYMBOL(sock_enable_timestamps);
771 
sock_set_keepalive(struct sock *sk)772 void sock_set_keepalive(struct sock *sk)
773 {
774 	lock_sock(sk);
775 	if (sk->sk_prot->keepalive)
776 		sk->sk_prot->keepalive(sk, true);
777 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
778 	release_sock(sk);
779 }
780 EXPORT_SYMBOL(sock_set_keepalive);
781 
__sock_set_rcvbuf(struct sock *sk, int val)782 static void __sock_set_rcvbuf(struct sock *sk, int val)
783 {
784 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
785 	 * as a negative value.
786 	 */
787 	val = min_t(int, val, INT_MAX / 2);
788 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
789 
790 	/* We double it on the way in to account for "struct sk_buff" etc.
791 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
792 	 * will allow that much actual data to be received on that socket.
793 	 *
794 	 * Applications are unaware that "struct sk_buff" and other overheads
795 	 * allocate from the receive buffer during socket buffer allocation.
796 	 *
797 	 * And after considering the possible alternatives, returning the value
798 	 * we actually used in getsockopt is the most desirable behavior.
799 	 */
800 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
801 }
802 
sock_set_rcvbuf(struct sock *sk, int val)803 void sock_set_rcvbuf(struct sock *sk, int val)
804 {
805 	lock_sock(sk);
806 	__sock_set_rcvbuf(sk, val);
807 	release_sock(sk);
808 }
809 EXPORT_SYMBOL(sock_set_rcvbuf);
810 
__sock_set_mark(struct sock *sk, u32 val)811 static void __sock_set_mark(struct sock *sk, u32 val)
812 {
813 	if (val != sk->sk_mark) {
814 		sk->sk_mark = val;
815 		sk_dst_reset(sk);
816 	}
817 }
818 
sock_set_mark(struct sock *sk, u32 val)819 void sock_set_mark(struct sock *sk, u32 val)
820 {
821 	lock_sock(sk);
822 	__sock_set_mark(sk, val);
823 	release_sock(sk);
824 }
825 EXPORT_SYMBOL(sock_set_mark);
826 
827 /*
828  *	This is meant for all protocols to use and covers goings on
829  *	at the socket level. Everything here is generic.
830  */
831 
sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen)832 int sock_setsockopt(struct socket *sock, int level, int optname,
833 		    sockptr_t optval, unsigned int optlen)
834 {
835 	struct sock_txtime sk_txtime;
836 	struct sock *sk = sock->sk;
837 	int val;
838 	int valbool;
839 	struct linger ling;
840 	int ret = 0;
841 
842 	/*
843 	 *	Options without arguments
844 	 */
845 
846 	if (optname == SO_BINDTODEVICE)
847 		return sock_setbindtodevice(sk, optval, optlen);
848 
849 	if (optlen < sizeof(int))
850 		return -EINVAL;
851 
852 	if (copy_from_sockptr(&val, optval, sizeof(val)))
853 		return -EFAULT;
854 
855 	valbool = val ? 1 : 0;
856 
857 	lock_sock(sk);
858 
859 	switch (optname) {
860 	case SO_DEBUG:
861 		if (val && !capable(CAP_NET_ADMIN))
862 			ret = -EACCES;
863 		else
864 			sock_valbool_flag(sk, SOCK_DBG, valbool);
865 		break;
866 	case SO_REUSEADDR:
867 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
868 		break;
869 	case SO_REUSEPORT:
870 		sk->sk_reuseport = valbool;
871 		break;
872 	case SO_TYPE:
873 	case SO_PROTOCOL:
874 	case SO_DOMAIN:
875 	case SO_ERROR:
876 		ret = -ENOPROTOOPT;
877 		break;
878 	case SO_DONTROUTE:
879 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
880 		sk_dst_reset(sk);
881 		break;
882 	case SO_BROADCAST:
883 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
884 		break;
885 	case SO_SNDBUF:
886 		/* Don't error on this BSD doesn't and if you think
887 		 * about it this is right. Otherwise apps have to
888 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
889 		 * are treated in BSD as hints
890 		 */
891 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
892 set_sndbuf:
893 		/* Ensure val * 2 fits into an int, to prevent max_t()
894 		 * from treating it as a negative value.
895 		 */
896 		val = min_t(int, val, INT_MAX / 2);
897 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
898 		WRITE_ONCE(sk->sk_sndbuf,
899 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
900 		/* Wake up sending tasks if we upped the value. */
901 		sk->sk_write_space(sk);
902 		break;
903 
904 	case SO_SNDBUFFORCE:
905 		if (!capable(CAP_NET_ADMIN)) {
906 			ret = -EPERM;
907 			break;
908 		}
909 
910 		/* No negative values (to prevent underflow, as val will be
911 		 * multiplied by 2).
912 		 */
913 		if (val < 0)
914 			val = 0;
915 		goto set_sndbuf;
916 
917 	case SO_RCVBUF:
918 		/* Don't error on this BSD doesn't and if you think
919 		 * about it this is right. Otherwise apps have to
920 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
921 		 * are treated in BSD as hints
922 		 */
923 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
924 		break;
925 
926 	case SO_RCVBUFFORCE:
927 		if (!capable(CAP_NET_ADMIN)) {
928 			ret = -EPERM;
929 			break;
930 		}
931 
932 		/* No negative values (to prevent underflow, as val will be
933 		 * multiplied by 2).
934 		 */
935 		__sock_set_rcvbuf(sk, max(val, 0));
936 		break;
937 
938 	case SO_KEEPALIVE:
939 		if (sk->sk_prot->keepalive)
940 			sk->sk_prot->keepalive(sk, valbool);
941 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
942 		break;
943 
944 	case SO_OOBINLINE:
945 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
946 		break;
947 
948 	case SO_NO_CHECK:
949 		sk->sk_no_check_tx = valbool;
950 		break;
951 
952 	case SO_PRIORITY:
953 		if ((val >= 0 && val <= 6) ||
954 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
955 			sk->sk_priority = val;
956 		else
957 			ret = -EPERM;
958 		break;
959 
960 	case SO_LINGER:
961 		if (optlen < sizeof(ling)) {
962 			ret = -EINVAL;	/* 1003.1g */
963 			break;
964 		}
965 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
966 			ret = -EFAULT;
967 			break;
968 		}
969 		if (!ling.l_onoff)
970 			sock_reset_flag(sk, SOCK_LINGER);
971 		else {
972 #if (BITS_PER_LONG == 32)
973 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
974 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
975 			else
976 #endif
977 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
978 			sock_set_flag(sk, SOCK_LINGER);
979 		}
980 		break;
981 
982 	case SO_BSDCOMPAT:
983 		break;
984 
985 	case SO_PASSCRED:
986 		if (valbool)
987 			set_bit(SOCK_PASSCRED, &sock->flags);
988 		else
989 			clear_bit(SOCK_PASSCRED, &sock->flags);
990 		break;
991 
992 	case SO_TIMESTAMP_OLD:
993 		__sock_set_timestamps(sk, valbool, false, false);
994 		break;
995 	case SO_TIMESTAMP_NEW:
996 		__sock_set_timestamps(sk, valbool, true, false);
997 		break;
998 	case SO_TIMESTAMPNS_OLD:
999 		__sock_set_timestamps(sk, valbool, false, true);
1000 		break;
1001 	case SO_TIMESTAMPNS_NEW:
1002 		__sock_set_timestamps(sk, valbool, true, true);
1003 		break;
1004 	case SO_TIMESTAMPING_NEW:
1005 	case SO_TIMESTAMPING_OLD:
1006 		if (val & ~SOF_TIMESTAMPING_MASK) {
1007 			ret = -EINVAL;
1008 			break;
1009 		}
1010 
1011 		if (val & SOF_TIMESTAMPING_OPT_ID &&
1012 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1013 			if (sk->sk_protocol == IPPROTO_TCP &&
1014 			    sk->sk_type == SOCK_STREAM) {
1015 				if ((1 << sk->sk_state) &
1016 				    (TCPF_CLOSE | TCPF_LISTEN)) {
1017 					ret = -EINVAL;
1018 					break;
1019 				}
1020 				sk->sk_tskey = tcp_sk(sk)->snd_una;
1021 			} else {
1022 				sk->sk_tskey = 0;
1023 			}
1024 		}
1025 
1026 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
1027 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1028 			ret = -EINVAL;
1029 			break;
1030 		}
1031 
1032 		sk->sk_tsflags = val;
1033 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1034 
1035 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1036 			sock_enable_timestamp(sk,
1037 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
1038 		else
1039 			sock_disable_timestamp(sk,
1040 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1041 		break;
1042 
1043 	case SO_RCVLOWAT:
1044 		if (val < 0)
1045 			val = INT_MAX;
1046 		if (sock->ops->set_rcvlowat)
1047 			ret = sock->ops->set_rcvlowat(sk, val);
1048 		else
1049 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1050 		break;
1051 
1052 	case SO_RCVTIMEO_OLD:
1053 	case SO_RCVTIMEO_NEW:
1054 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1055 				       optlen, optname == SO_RCVTIMEO_OLD);
1056 		break;
1057 
1058 	case SO_SNDTIMEO_OLD:
1059 	case SO_SNDTIMEO_NEW:
1060 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1061 				       optlen, optname == SO_SNDTIMEO_OLD);
1062 		break;
1063 
1064 	case SO_ATTACH_FILTER: {
1065 		struct sock_fprog fprog;
1066 
1067 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1068 		if (!ret)
1069 			ret = sk_attach_filter(&fprog, sk);
1070 		break;
1071 	}
1072 	case SO_ATTACH_BPF:
1073 		ret = -EINVAL;
1074 		if (optlen == sizeof(u32)) {
1075 			u32 ufd;
1076 
1077 			ret = -EFAULT;
1078 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1079 				break;
1080 
1081 			ret = sk_attach_bpf(ufd, sk);
1082 		}
1083 		break;
1084 
1085 	case SO_ATTACH_REUSEPORT_CBPF: {
1086 		struct sock_fprog fprog;
1087 
1088 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1089 		if (!ret)
1090 			ret = sk_reuseport_attach_filter(&fprog, sk);
1091 		break;
1092 	}
1093 	case SO_ATTACH_REUSEPORT_EBPF:
1094 		ret = -EINVAL;
1095 		if (optlen == sizeof(u32)) {
1096 			u32 ufd;
1097 
1098 			ret = -EFAULT;
1099 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1100 				break;
1101 
1102 			ret = sk_reuseport_attach_bpf(ufd, sk);
1103 		}
1104 		break;
1105 
1106 	case SO_DETACH_REUSEPORT_BPF:
1107 		ret = reuseport_detach_prog(sk);
1108 		break;
1109 
1110 	case SO_DETACH_FILTER:
1111 		ret = sk_detach_filter(sk);
1112 		break;
1113 
1114 	case SO_LOCK_FILTER:
1115 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1116 			ret = -EPERM;
1117 		else
1118 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1119 		break;
1120 
1121 	case SO_PASSSEC:
1122 		if (valbool)
1123 			set_bit(SOCK_PASSSEC, &sock->flags);
1124 		else
1125 			clear_bit(SOCK_PASSSEC, &sock->flags);
1126 		break;
1127 	case SO_MARK:
1128 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1129 			ret = -EPERM;
1130 			break;
1131 		}
1132 
1133 		__sock_set_mark(sk, val);
1134 		break;
1135 
1136 	case SO_RXQ_OVFL:
1137 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1138 		break;
1139 
1140 	case SO_WIFI_STATUS:
1141 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1142 		break;
1143 
1144 	case SO_PEEK_OFF:
1145 		if (sock->ops->set_peek_off)
1146 			ret = sock->ops->set_peek_off(sk, val);
1147 		else
1148 			ret = -EOPNOTSUPP;
1149 		break;
1150 
1151 	case SO_NOFCS:
1152 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1153 		break;
1154 
1155 	case SO_SELECT_ERR_QUEUE:
1156 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1157 		break;
1158 
1159 #ifdef CONFIG_NET_RX_BUSY_POLL
1160 	case SO_BUSY_POLL:
1161 		/* allow unprivileged users to decrease the value */
1162 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1163 			ret = -EPERM;
1164 		else {
1165 			if (val < 0)
1166 				ret = -EINVAL;
1167 			else
1168 				WRITE_ONCE(sk->sk_ll_usec, val);
1169 		}
1170 		break;
1171 #endif
1172 
1173 	case SO_MAX_PACING_RATE:
1174 		{
1175 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1176 
1177 		if (sizeof(ulval) != sizeof(val) &&
1178 		    optlen >= sizeof(ulval) &&
1179 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1180 			ret = -EFAULT;
1181 			break;
1182 		}
1183 		if (ulval != ~0UL)
1184 			cmpxchg(&sk->sk_pacing_status,
1185 				SK_PACING_NONE,
1186 				SK_PACING_NEEDED);
1187 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1188 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1189 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1190 		break;
1191 		}
1192 	case SO_INCOMING_CPU:
1193 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1194 		break;
1195 
1196 	case SO_CNX_ADVICE:
1197 		if (val == 1)
1198 			dst_negative_advice(sk);
1199 		break;
1200 
1201 	case SO_ZEROCOPY:
1202 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1203 			if (!((sk->sk_type == SOCK_STREAM &&
1204 			       sk->sk_protocol == IPPROTO_TCP) ||
1205 			      (sk->sk_type == SOCK_DGRAM &&
1206 			       sk->sk_protocol == IPPROTO_UDP)))
1207 				ret = -ENOTSUPP;
1208 		} else if (sk->sk_family != PF_RDS) {
1209 			ret = -ENOTSUPP;
1210 		}
1211 		if (!ret) {
1212 			if (val < 0 || val > 1)
1213 				ret = -EINVAL;
1214 			else
1215 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1216 		}
1217 		break;
1218 
1219 	case SO_TXTIME:
1220 		if (optlen != sizeof(struct sock_txtime)) {
1221 			ret = -EINVAL;
1222 			break;
1223 		} else if (copy_from_sockptr(&sk_txtime, optval,
1224 			   sizeof(struct sock_txtime))) {
1225 			ret = -EFAULT;
1226 			break;
1227 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1228 			ret = -EINVAL;
1229 			break;
1230 		}
1231 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1232 		 * scheduler has enough safe guards.
1233 		 */
1234 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1235 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1236 			ret = -EPERM;
1237 			break;
1238 		}
1239 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1240 		sk->sk_clockid = sk_txtime.clockid;
1241 		sk->sk_txtime_deadline_mode =
1242 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1243 		sk->sk_txtime_report_errors =
1244 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1245 		break;
1246 
1247 	case SO_BINDTOIFINDEX:
1248 		ret = sock_bindtoindex_locked(sk, val);
1249 		break;
1250 
1251 	default:
1252 		ret = -ENOPROTOOPT;
1253 		break;
1254 	}
1255 	release_sock(sk);
1256 	return ret;
1257 }
1258 EXPORT_SYMBOL(sock_setsockopt);
1259 
sk_get_peer_cred(struct sock *sk)1260 static const struct cred *sk_get_peer_cred(struct sock *sk)
1261 {
1262 	const struct cred *cred;
1263 
1264 	spin_lock(&sk->sk_peer_lock);
1265 	cred = get_cred(sk->sk_peer_cred);
1266 	spin_unlock(&sk->sk_peer_lock);
1267 
1268 	return cred;
1269 }
1270 
cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred)1271 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1272 			  struct ucred *ucred)
1273 {
1274 	ucred->pid = pid_vnr(pid);
1275 	ucred->uid = ucred->gid = -1;
1276 	if (cred) {
1277 		struct user_namespace *current_ns = current_user_ns();
1278 
1279 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1280 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1281 	}
1282 }
1283 
groups_to_user(gid_t __user *dst, const struct group_info *src)1284 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1285 {
1286 	struct user_namespace *user_ns = current_user_ns();
1287 	int i;
1288 
1289 	for (i = 0; i < src->ngroups; i++)
1290 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1291 			return -EFAULT;
1292 
1293 	return 0;
1294 }
1295 
sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)1296 int sock_getsockopt(struct socket *sock, int level, int optname,
1297 		    char __user *optval, int __user *optlen)
1298 {
1299 	struct sock *sk = sock->sk;
1300 
1301 	union {
1302 		int val;
1303 		u64 val64;
1304 		unsigned long ulval;
1305 		struct linger ling;
1306 		struct old_timeval32 tm32;
1307 		struct __kernel_old_timeval tm;
1308 		struct  __kernel_sock_timeval stm;
1309 		struct sock_txtime txtime;
1310 	} v;
1311 
1312 	int lv = sizeof(int);
1313 	int len;
1314 
1315 	if (get_user(len, optlen))
1316 		return -EFAULT;
1317 	if (len < 0)
1318 		return -EINVAL;
1319 
1320 	memset(&v, 0, sizeof(v));
1321 
1322 	switch (optname) {
1323 	case SO_DEBUG:
1324 		v.val = sock_flag(sk, SOCK_DBG);
1325 		break;
1326 
1327 	case SO_DONTROUTE:
1328 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1329 		break;
1330 
1331 	case SO_BROADCAST:
1332 		v.val = sock_flag(sk, SOCK_BROADCAST);
1333 		break;
1334 
1335 	case SO_SNDBUF:
1336 		v.val = READ_ONCE(sk->sk_sndbuf);
1337 		break;
1338 
1339 	case SO_RCVBUF:
1340 		v.val = READ_ONCE(sk->sk_rcvbuf);
1341 		break;
1342 
1343 	case SO_REUSEADDR:
1344 		v.val = sk->sk_reuse;
1345 		break;
1346 
1347 	case SO_REUSEPORT:
1348 		v.val = sk->sk_reuseport;
1349 		break;
1350 
1351 	case SO_KEEPALIVE:
1352 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1353 		break;
1354 
1355 	case SO_TYPE:
1356 		v.val = sk->sk_type;
1357 		break;
1358 
1359 	case SO_PROTOCOL:
1360 		v.val = sk->sk_protocol;
1361 		break;
1362 
1363 	case SO_DOMAIN:
1364 		v.val = sk->sk_family;
1365 		break;
1366 
1367 	case SO_ERROR:
1368 		v.val = -sock_error(sk);
1369 		if (v.val == 0)
1370 			v.val = xchg(&sk->sk_err_soft, 0);
1371 		break;
1372 
1373 	case SO_OOBINLINE:
1374 		v.val = sock_flag(sk, SOCK_URGINLINE);
1375 		break;
1376 
1377 	case SO_NO_CHECK:
1378 		v.val = sk->sk_no_check_tx;
1379 		break;
1380 
1381 	case SO_PRIORITY:
1382 		v.val = sk->sk_priority;
1383 		break;
1384 
1385 	case SO_LINGER:
1386 		lv		= sizeof(v.ling);
1387 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1388 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1389 		break;
1390 
1391 	case SO_BSDCOMPAT:
1392 		break;
1393 
1394 	case SO_TIMESTAMP_OLD:
1395 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1396 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1397 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1398 		break;
1399 
1400 	case SO_TIMESTAMPNS_OLD:
1401 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1402 		break;
1403 
1404 	case SO_TIMESTAMP_NEW:
1405 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1406 		break;
1407 
1408 	case SO_TIMESTAMPNS_NEW:
1409 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1410 		break;
1411 
1412 	case SO_TIMESTAMPING_OLD:
1413 		v.val = sk->sk_tsflags;
1414 		break;
1415 
1416 	case SO_RCVTIMEO_OLD:
1417 	case SO_RCVTIMEO_NEW:
1418 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1419 		break;
1420 
1421 	case SO_SNDTIMEO_OLD:
1422 	case SO_SNDTIMEO_NEW:
1423 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1424 		break;
1425 
1426 	case SO_RCVLOWAT:
1427 		v.val = READ_ONCE(sk->sk_rcvlowat);
1428 		break;
1429 
1430 	case SO_SNDLOWAT:
1431 		v.val = 1;
1432 		break;
1433 
1434 	case SO_PASSCRED:
1435 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1436 		break;
1437 
1438 	case SO_PEERCRED:
1439 	{
1440 		struct ucred peercred;
1441 		if (len > sizeof(peercred))
1442 			len = sizeof(peercred);
1443 
1444 		spin_lock(&sk->sk_peer_lock);
1445 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1446 		spin_unlock(&sk->sk_peer_lock);
1447 
1448 		if (copy_to_user(optval, &peercred, len))
1449 			return -EFAULT;
1450 		goto lenout;
1451 	}
1452 
1453 	case SO_PEERGROUPS:
1454 	{
1455 		const struct cred *cred;
1456 		int ret, n;
1457 
1458 		cred = sk_get_peer_cred(sk);
1459 		if (!cred)
1460 			return -ENODATA;
1461 
1462 		n = cred->group_info->ngroups;
1463 		if (len < n * sizeof(gid_t)) {
1464 			len = n * sizeof(gid_t);
1465 			put_cred(cred);
1466 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1467 		}
1468 		len = n * sizeof(gid_t);
1469 
1470 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1471 		put_cred(cred);
1472 		if (ret)
1473 			return ret;
1474 		goto lenout;
1475 	}
1476 
1477 	case SO_PEERNAME:
1478 	{
1479 		char address[128];
1480 
1481 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1482 		if (lv < 0)
1483 			return -ENOTCONN;
1484 		if (lv < len)
1485 			return -EINVAL;
1486 		if (copy_to_user(optval, address, len))
1487 			return -EFAULT;
1488 		goto lenout;
1489 	}
1490 
1491 	/* Dubious BSD thing... Probably nobody even uses it, but
1492 	 * the UNIX standard wants it for whatever reason... -DaveM
1493 	 */
1494 	case SO_ACCEPTCONN:
1495 		v.val = sk->sk_state == TCP_LISTEN;
1496 		break;
1497 
1498 	case SO_PASSSEC:
1499 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1500 		break;
1501 
1502 	case SO_PEERSEC:
1503 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1504 
1505 	case SO_MARK:
1506 		v.val = sk->sk_mark;
1507 		break;
1508 
1509 	case SO_RXQ_OVFL:
1510 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1511 		break;
1512 
1513 	case SO_WIFI_STATUS:
1514 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1515 		break;
1516 
1517 	case SO_PEEK_OFF:
1518 		if (!sock->ops->set_peek_off)
1519 			return -EOPNOTSUPP;
1520 
1521 		v.val = READ_ONCE(sk->sk_peek_off);
1522 		break;
1523 	case SO_NOFCS:
1524 		v.val = sock_flag(sk, SOCK_NOFCS);
1525 		break;
1526 
1527 	case SO_BINDTODEVICE:
1528 		return sock_getbindtodevice(sk, optval, optlen, len);
1529 
1530 	case SO_GET_FILTER:
1531 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1532 		if (len < 0)
1533 			return len;
1534 
1535 		goto lenout;
1536 
1537 	case SO_LOCK_FILTER:
1538 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1539 		break;
1540 
1541 	case SO_BPF_EXTENSIONS:
1542 		v.val = bpf_tell_extensions();
1543 		break;
1544 
1545 	case SO_SELECT_ERR_QUEUE:
1546 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1547 		break;
1548 
1549 #ifdef CONFIG_NET_RX_BUSY_POLL
1550 	case SO_BUSY_POLL:
1551 		v.val = READ_ONCE(sk->sk_ll_usec);
1552 		break;
1553 #endif
1554 
1555 	case SO_MAX_PACING_RATE:
1556 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1557 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1558 			lv = sizeof(v.ulval);
1559 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1560 		} else {
1561 			/* 32bit version */
1562 			v.val = min_t(unsigned long, ~0U,
1563 				      READ_ONCE(sk->sk_max_pacing_rate));
1564 		}
1565 		break;
1566 
1567 	case SO_INCOMING_CPU:
1568 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1569 		break;
1570 
1571 	case SO_MEMINFO:
1572 	{
1573 		u32 meminfo[SK_MEMINFO_VARS];
1574 
1575 		sk_get_meminfo(sk, meminfo);
1576 
1577 		len = min_t(unsigned int, len, sizeof(meminfo));
1578 		if (copy_to_user(optval, &meminfo, len))
1579 			return -EFAULT;
1580 
1581 		goto lenout;
1582 	}
1583 
1584 #ifdef CONFIG_NET_RX_BUSY_POLL
1585 	case SO_INCOMING_NAPI_ID:
1586 		v.val = READ_ONCE(sk->sk_napi_id);
1587 
1588 		/* aggregate non-NAPI IDs down to 0 */
1589 		if (v.val < MIN_NAPI_ID)
1590 			v.val = 0;
1591 
1592 		break;
1593 #endif
1594 
1595 	case SO_COOKIE:
1596 		lv = sizeof(u64);
1597 		if (len < lv)
1598 			return -EINVAL;
1599 		v.val64 = sock_gen_cookie(sk);
1600 		break;
1601 
1602 	case SO_ZEROCOPY:
1603 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1604 		break;
1605 
1606 	case SO_TXTIME:
1607 		lv = sizeof(v.txtime);
1608 		v.txtime.clockid = sk->sk_clockid;
1609 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1610 				  SOF_TXTIME_DEADLINE_MODE : 0;
1611 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1612 				  SOF_TXTIME_REPORT_ERRORS : 0;
1613 		break;
1614 
1615 	case SO_BINDTOIFINDEX:
1616 		v.val = sk->sk_bound_dev_if;
1617 		break;
1618 
1619 	default:
1620 		/* We implement the SO_SNDLOWAT etc to not be settable
1621 		 * (1003.1g 7).
1622 		 */
1623 		return -ENOPROTOOPT;
1624 	}
1625 
1626 	if (len > lv)
1627 		len = lv;
1628 	if (copy_to_user(optval, &v, len))
1629 		return -EFAULT;
1630 lenout:
1631 	if (put_user(len, optlen))
1632 		return -EFAULT;
1633 	return 0;
1634 }
1635 
1636 /*
1637  * Initialize an sk_lock.
1638  *
1639  * (We also register the sk_lock with the lock validator.)
1640  */
sock_lock_init(struct sock *sk)1641 static inline void sock_lock_init(struct sock *sk)
1642 {
1643 	if (sk->sk_kern_sock)
1644 		sock_lock_init_class_and_name(
1645 			sk,
1646 			af_family_kern_slock_key_strings[sk->sk_family],
1647 			af_family_kern_slock_keys + sk->sk_family,
1648 			af_family_kern_key_strings[sk->sk_family],
1649 			af_family_kern_keys + sk->sk_family);
1650 	else
1651 		sock_lock_init_class_and_name(
1652 			sk,
1653 			af_family_slock_key_strings[sk->sk_family],
1654 			af_family_slock_keys + sk->sk_family,
1655 			af_family_key_strings[sk->sk_family],
1656 			af_family_keys + sk->sk_family);
1657 }
1658 
1659 /*
1660  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1661  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1662  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1663  */
sock_copy(struct sock *nsk, const struct sock *osk)1664 static void sock_copy(struct sock *nsk, const struct sock *osk)
1665 {
1666 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1667 #ifdef CONFIG_SECURITY_NETWORK
1668 	void *sptr = nsk->sk_security;
1669 #endif
1670 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1671 
1672 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1673 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1674 
1675 #ifdef CONFIG_SECURITY_NETWORK
1676 	nsk->sk_security = sptr;
1677 	security_sk_clone(osk, nsk);
1678 #endif
1679 }
1680 
sk_prot_alloc(struct proto *prot, gfp_t priority, int family)1681 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1682 		int family)
1683 {
1684 	struct sock *sk;
1685 	struct kmem_cache *slab;
1686 
1687 	slab = prot->slab;
1688 	if (slab != NULL) {
1689 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1690 		if (!sk)
1691 			return sk;
1692 		if (want_init_on_alloc(priority))
1693 			sk_prot_clear_nulls(sk, prot->obj_size);
1694 	} else
1695 		sk = kmalloc(prot->obj_size, priority);
1696 
1697 	if (sk != NULL) {
1698 		if (security_sk_alloc(sk, family, priority))
1699 			goto out_free;
1700 
1701 		if (!try_module_get(prot->owner))
1702 			goto out_free_sec;
1703 		sk_tx_queue_clear(sk);
1704 	}
1705 
1706 	return sk;
1707 
1708 out_free_sec:
1709 	security_sk_free(sk);
1710 out_free:
1711 	if (slab != NULL)
1712 		kmem_cache_free(slab, sk);
1713 	else
1714 		kfree(sk);
1715 	return NULL;
1716 }
1717 
sk_prot_free(struct proto *prot, struct sock *sk)1718 static void sk_prot_free(struct proto *prot, struct sock *sk)
1719 {
1720 	struct kmem_cache *slab;
1721 	struct module *owner;
1722 
1723 	owner = prot->owner;
1724 	slab = prot->slab;
1725 
1726 	cgroup_sk_free(&sk->sk_cgrp_data);
1727 	mem_cgroup_sk_free(sk);
1728 	security_sk_free(sk);
1729 	if (slab != NULL)
1730 		kmem_cache_free(slab, sk);
1731 	else
1732 		kfree(sk);
1733 	module_put(owner);
1734 }
1735 
1736 /**
1737  *	sk_alloc - All socket objects are allocated here
1738  *	@net: the applicable net namespace
1739  *	@family: protocol family
1740  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1741  *	@prot: struct proto associated with this new sock instance
1742  *	@kern: is this to be a kernel socket?
1743  */
sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern)1744 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1745 		      struct proto *prot, int kern)
1746 {
1747 	struct sock *sk;
1748 
1749 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1750 	if (sk) {
1751 		sk->sk_family = family;
1752 		/*
1753 		 * See comment in struct sock definition to understand
1754 		 * why we need sk_prot_creator -acme
1755 		 */
1756 		sk->sk_prot = sk->sk_prot_creator = prot;
1757 		sk->sk_kern_sock = kern;
1758 		sock_lock_init(sk);
1759 		sk->sk_net_refcnt = kern ? 0 : 1;
1760 		if (likely(sk->sk_net_refcnt)) {
1761 			get_net(net);
1762 			sock_inuse_add(net, 1);
1763 		}
1764 
1765 		sock_net_set(sk, net);
1766 		refcount_set(&sk->sk_wmem_alloc, 1);
1767 
1768 		mem_cgroup_sk_alloc(sk);
1769 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1770 		sock_update_classid(&sk->sk_cgrp_data);
1771 		sock_update_netprioidx(&sk->sk_cgrp_data);
1772 		sk_tx_queue_clear(sk);
1773 	}
1774 
1775 	return sk;
1776 }
1777 EXPORT_SYMBOL(sk_alloc);
1778 
1779 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1780  * grace period. This is the case for UDP sockets and TCP listeners.
1781  */
__sk_destruct(struct rcu_head *head)1782 static void __sk_destruct(struct rcu_head *head)
1783 {
1784 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1785 	struct sk_filter *filter;
1786 
1787 	if (sk->sk_destruct)
1788 		sk->sk_destruct(sk);
1789 
1790 	filter = rcu_dereference_check(sk->sk_filter,
1791 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1792 	if (filter) {
1793 		sk_filter_uncharge(sk, filter);
1794 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1795 	}
1796 
1797 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1798 
1799 #ifdef CONFIG_BPF_SYSCALL
1800 	bpf_sk_storage_free(sk);
1801 #endif
1802 
1803 	if (atomic_read(&sk->sk_omem_alloc))
1804 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1805 			 __func__, atomic_read(&sk->sk_omem_alloc));
1806 
1807 	if (sk->sk_frag.page) {
1808 		put_page(sk->sk_frag.page);
1809 		sk->sk_frag.page = NULL;
1810 	}
1811 
1812 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1813 	put_cred(sk->sk_peer_cred);
1814 	put_pid(sk->sk_peer_pid);
1815 
1816 	if (likely(sk->sk_net_refcnt))
1817 		put_net(sock_net(sk));
1818 	sk_prot_free(sk->sk_prot_creator, sk);
1819 }
1820 
sk_destruct(struct sock *sk)1821 void sk_destruct(struct sock *sk)
1822 {
1823 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1824 
1825 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1826 		reuseport_detach_sock(sk);
1827 		use_call_rcu = true;
1828 	}
1829 
1830 	if (use_call_rcu)
1831 		call_rcu(&sk->sk_rcu, __sk_destruct);
1832 	else
1833 		__sk_destruct(&sk->sk_rcu);
1834 }
1835 
__sk_free(struct sock *sk)1836 static void __sk_free(struct sock *sk)
1837 {
1838 	if (likely(sk->sk_net_refcnt))
1839 		sock_inuse_add(sock_net(sk), -1);
1840 
1841 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1842 		sock_diag_broadcast_destroy(sk);
1843 	else
1844 		sk_destruct(sk);
1845 }
1846 
sk_free(struct sock *sk)1847 void sk_free(struct sock *sk)
1848 {
1849 	/*
1850 	 * We subtract one from sk_wmem_alloc and can know if
1851 	 * some packets are still in some tx queue.
1852 	 * If not null, sock_wfree() will call __sk_free(sk) later
1853 	 */
1854 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1855 		__sk_free(sk);
1856 }
1857 EXPORT_SYMBOL(sk_free);
1858 
sk_init_common(struct sock *sk)1859 static void sk_init_common(struct sock *sk)
1860 {
1861 	skb_queue_head_init(&sk->sk_receive_queue);
1862 	skb_queue_head_init(&sk->sk_write_queue);
1863 	skb_queue_head_init(&sk->sk_error_queue);
1864 
1865 	rwlock_init(&sk->sk_callback_lock);
1866 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1867 			af_rlock_keys + sk->sk_family,
1868 			af_family_rlock_key_strings[sk->sk_family]);
1869 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1870 			af_wlock_keys + sk->sk_family,
1871 			af_family_wlock_key_strings[sk->sk_family]);
1872 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1873 			af_elock_keys + sk->sk_family,
1874 			af_family_elock_key_strings[sk->sk_family]);
1875 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1876 			af_callback_keys + sk->sk_family,
1877 			af_family_clock_key_strings[sk->sk_family]);
1878 }
1879 
1880 /**
1881  *	sk_clone_lock - clone a socket, and lock its clone
1882  *	@sk: the socket to clone
1883  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1884  *
1885  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1886  */
sk_clone_lock(const struct sock *sk, const gfp_t priority)1887 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1888 {
1889 	struct proto *prot = READ_ONCE(sk->sk_prot);
1890 	struct sk_filter *filter;
1891 	bool is_charged = true;
1892 	struct sock *newsk;
1893 
1894 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1895 	if (!newsk)
1896 		goto out;
1897 
1898 	sock_copy(newsk, sk);
1899 
1900 	newsk->sk_prot_creator = prot;
1901 
1902 	/* SANITY */
1903 	if (likely(newsk->sk_net_refcnt)) {
1904 		get_net(sock_net(newsk));
1905 		sock_inuse_add(sock_net(newsk), 1);
1906 	}
1907 	sk_node_init(&newsk->sk_node);
1908 	sock_lock_init(newsk);
1909 	bh_lock_sock(newsk);
1910 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1911 	newsk->sk_backlog.len = 0;
1912 
1913 	atomic_set(&newsk->sk_rmem_alloc, 0);
1914 
1915 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
1916 	refcount_set(&newsk->sk_wmem_alloc, 1);
1917 
1918 	atomic_set(&newsk->sk_omem_alloc, 0);
1919 	sk_init_common(newsk);
1920 
1921 	newsk->sk_dst_cache	= NULL;
1922 	newsk->sk_dst_pending_confirm = 0;
1923 	newsk->sk_wmem_queued	= 0;
1924 	newsk->sk_forward_alloc = 0;
1925 	atomic_set(&newsk->sk_drops, 0);
1926 	newsk->sk_send_head	= NULL;
1927 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1928 	atomic_set(&newsk->sk_zckey, 0);
1929 
1930 	sock_reset_flag(newsk, SOCK_DONE);
1931 
1932 	/* sk->sk_memcg will be populated at accept() time */
1933 	newsk->sk_memcg = NULL;
1934 
1935 	cgroup_sk_clone(&newsk->sk_cgrp_data);
1936 
1937 	rcu_read_lock();
1938 	filter = rcu_dereference(sk->sk_filter);
1939 	if (filter != NULL)
1940 		/* though it's an empty new sock, the charging may fail
1941 		 * if sysctl_optmem_max was changed between creation of
1942 		 * original socket and cloning
1943 		 */
1944 		is_charged = sk_filter_charge(newsk, filter);
1945 	RCU_INIT_POINTER(newsk->sk_filter, filter);
1946 	rcu_read_unlock();
1947 
1948 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1949 		/* We need to make sure that we don't uncharge the new
1950 		 * socket if we couldn't charge it in the first place
1951 		 * as otherwise we uncharge the parent's filter.
1952 		 */
1953 		if (!is_charged)
1954 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
1955 		sk_free_unlock_clone(newsk);
1956 		newsk = NULL;
1957 		goto out;
1958 	}
1959 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1960 
1961 	if (bpf_sk_storage_clone(sk, newsk)) {
1962 		sk_free_unlock_clone(newsk);
1963 		newsk = NULL;
1964 		goto out;
1965 	}
1966 
1967 	/* Clear sk_user_data if parent had the pointer tagged
1968 	 * as not suitable for copying when cloning.
1969 	 */
1970 	if (sk_user_data_is_nocopy(newsk))
1971 		newsk->sk_user_data = NULL;
1972 
1973 	newsk->sk_err	   = 0;
1974 	newsk->sk_err_soft = 0;
1975 	newsk->sk_priority = 0;
1976 	newsk->sk_incoming_cpu = raw_smp_processor_id();
1977 
1978 	/* Before updating sk_refcnt, we must commit prior changes to memory
1979 	 * (Documentation/RCU/rculist_nulls.rst for details)
1980 	 */
1981 	smp_wmb();
1982 	refcount_set(&newsk->sk_refcnt, 2);
1983 
1984 	/* Increment the counter in the same struct proto as the master
1985 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1986 	 * is the same as sk->sk_prot->socks, as this field was copied
1987 	 * with memcpy).
1988 	 *
1989 	 * This _changes_ the previous behaviour, where
1990 	 * tcp_create_openreq_child always was incrementing the
1991 	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1992 	 * to be taken into account in all callers. -acme
1993 	 */
1994 	sk_refcnt_debug_inc(newsk);
1995 	sk_set_socket(newsk, NULL);
1996 	sk_tx_queue_clear(newsk);
1997 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
1998 
1999 	if (newsk->sk_prot->sockets_allocated)
2000 		sk_sockets_allocated_inc(newsk);
2001 
2002 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2003 		net_enable_timestamp();
2004 out:
2005 	return newsk;
2006 }
2007 EXPORT_SYMBOL_GPL(sk_clone_lock);
2008 
sk_free_unlock_clone(struct sock *sk)2009 void sk_free_unlock_clone(struct sock *sk)
2010 {
2011 	/* It is still raw copy of parent, so invalidate
2012 	 * destructor and make plain sk_free() */
2013 	sk->sk_destruct = NULL;
2014 	bh_unlock_sock(sk);
2015 	sk_free(sk);
2016 }
2017 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2018 
sk_setup_caps(struct sock *sk, struct dst_entry *dst)2019 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2020 {
2021 	u32 max_segs = 1;
2022 
2023 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2024 	if (sk->sk_route_caps & NETIF_F_GSO)
2025 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2026 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2027 	if (sk_can_gso(sk)) {
2028 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2029 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2030 		} else {
2031 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2032 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2033 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2034 		}
2035 	}
2036 	sk->sk_gso_max_segs = max_segs;
2037 	sk_dst_set(sk, dst);
2038 }
2039 EXPORT_SYMBOL_GPL(sk_setup_caps);
2040 
2041 /*
2042  *	Simple resource managers for sockets.
2043  */
2044 
2045 
2046 /*
2047  * Write buffer destructor automatically called from kfree_skb.
2048  */
sock_wfree(struct sk_buff *skb)2049 void sock_wfree(struct sk_buff *skb)
2050 {
2051 	struct sock *sk = skb->sk;
2052 	unsigned int len = skb->truesize;
2053 
2054 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2055 		/*
2056 		 * Keep a reference on sk_wmem_alloc, this will be released
2057 		 * after sk_write_space() call
2058 		 */
2059 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2060 		sk->sk_write_space(sk);
2061 		len = 1;
2062 	}
2063 	/*
2064 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2065 	 * could not do because of in-flight packets
2066 	 */
2067 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2068 		__sk_free(sk);
2069 }
2070 EXPORT_SYMBOL(sock_wfree);
2071 
2072 /* This variant of sock_wfree() is used by TCP,
2073  * since it sets SOCK_USE_WRITE_QUEUE.
2074  */
__sock_wfree(struct sk_buff *skb)2075 void __sock_wfree(struct sk_buff *skb)
2076 {
2077 	struct sock *sk = skb->sk;
2078 
2079 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2080 		__sk_free(sk);
2081 }
2082 
skb_set_owner_w(struct sk_buff *skb, struct sock *sk)2083 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2084 {
2085 	skb_orphan(skb);
2086 	skb->sk = sk;
2087 #ifdef CONFIG_INET
2088 	if (unlikely(!sk_fullsock(sk))) {
2089 		skb->destructor = sock_edemux;
2090 		sock_hold(sk);
2091 		return;
2092 	}
2093 #endif
2094 	skb->destructor = sock_wfree;
2095 	skb_set_hash_from_sk(skb, sk);
2096 	/*
2097 	 * We used to take a refcount on sk, but following operation
2098 	 * is enough to guarantee sk_free() wont free this sock until
2099 	 * all in-flight packets are completed
2100 	 */
2101 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2102 }
2103 EXPORT_SYMBOL(skb_set_owner_w);
2104 
can_skb_orphan_partial(const struct sk_buff *skb)2105 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2106 {
2107 #ifdef CONFIG_TLS_DEVICE
2108 	/* Drivers depend on in-order delivery for crypto offload,
2109 	 * partial orphan breaks out-of-order-OK logic.
2110 	 */
2111 	if (skb->decrypted)
2112 		return false;
2113 #endif
2114 	return (skb->destructor == sock_wfree ||
2115 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2116 }
2117 
2118 /* This helper is used by netem, as it can hold packets in its
2119  * delay queue. We want to allow the owner socket to send more
2120  * packets, as if they were already TX completed by a typical driver.
2121  * But we also want to keep skb->sk set because some packet schedulers
2122  * rely on it (sch_fq for example).
2123  */
skb_orphan_partial(struct sk_buff *skb)2124 void skb_orphan_partial(struct sk_buff *skb)
2125 {
2126 	if (skb_is_tcp_pure_ack(skb))
2127 		return;
2128 
2129 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2130 		return;
2131 
2132 	skb_orphan(skb);
2133 }
2134 EXPORT_SYMBOL(skb_orphan_partial);
2135 
2136 /*
2137  * Read buffer destructor automatically called from kfree_skb.
2138  */
sock_rfree(struct sk_buff *skb)2139 void sock_rfree(struct sk_buff *skb)
2140 {
2141 	struct sock *sk = skb->sk;
2142 	unsigned int len = skb->truesize;
2143 
2144 	atomic_sub(len, &sk->sk_rmem_alloc);
2145 	sk_mem_uncharge(sk, len);
2146 }
2147 EXPORT_SYMBOL(sock_rfree);
2148 
2149 /*
2150  * Buffer destructor for skbs that are not used directly in read or write
2151  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2152  */
sock_efree(struct sk_buff *skb)2153 void sock_efree(struct sk_buff *skb)
2154 {
2155 	sock_put(skb->sk);
2156 }
2157 EXPORT_SYMBOL(sock_efree);
2158 
2159 /* Buffer destructor for prefetch/receive path where reference count may
2160  * not be held, e.g. for listen sockets.
2161  */
2162 #ifdef CONFIG_INET
sock_pfree(struct sk_buff *skb)2163 void sock_pfree(struct sk_buff *skb)
2164 {
2165 	if (sk_is_refcounted(skb->sk))
2166 		sock_gen_put(skb->sk);
2167 }
2168 EXPORT_SYMBOL(sock_pfree);
2169 #endif /* CONFIG_INET */
2170 
sock_i_uid(struct sock *sk)2171 kuid_t sock_i_uid(struct sock *sk)
2172 {
2173 	kuid_t uid;
2174 
2175 	read_lock_bh(&sk->sk_callback_lock);
2176 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2177 	read_unlock_bh(&sk->sk_callback_lock);
2178 	return uid;
2179 }
2180 EXPORT_SYMBOL(sock_i_uid);
2181 
__sock_i_ino(struct sock *sk)2182 unsigned long __sock_i_ino(struct sock *sk)
2183 {
2184 	unsigned long ino;
2185 
2186 	read_lock(&sk->sk_callback_lock);
2187 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2188 	read_unlock(&sk->sk_callback_lock);
2189 	return ino;
2190 }
2191 EXPORT_SYMBOL(__sock_i_ino);
2192 
sock_i_ino(struct sock *sk)2193 unsigned long sock_i_ino(struct sock *sk)
2194 {
2195 	unsigned long ino;
2196 
2197 	local_bh_disable();
2198 	ino = __sock_i_ino(sk);
2199 	local_bh_enable();
2200 	return ino;
2201 }
2202 EXPORT_SYMBOL(sock_i_ino);
2203 
2204 /*
2205  * Allocate a skb from the socket's send buffer.
2206  */
sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority)2207 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2208 			     gfp_t priority)
2209 {
2210 	if (force ||
2211 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2212 		struct sk_buff *skb = alloc_skb(size, priority);
2213 
2214 		if (skb) {
2215 			skb_set_owner_w(skb, sk);
2216 			return skb;
2217 		}
2218 	}
2219 	return NULL;
2220 }
2221 EXPORT_SYMBOL(sock_wmalloc);
2222 
sock_ofree(struct sk_buff *skb)2223 static void sock_ofree(struct sk_buff *skb)
2224 {
2225 	struct sock *sk = skb->sk;
2226 
2227 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2228 }
2229 
sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority)2230 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2231 			     gfp_t priority)
2232 {
2233 	struct sk_buff *skb;
2234 
2235 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2236 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2237 	    READ_ONCE(sysctl_optmem_max))
2238 		return NULL;
2239 
2240 	skb = alloc_skb(size, priority);
2241 	if (!skb)
2242 		return NULL;
2243 
2244 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2245 	skb->sk = sk;
2246 	skb->destructor = sock_ofree;
2247 	return skb;
2248 }
2249 
2250 /*
2251  * Allocate a memory block from the socket's option memory buffer.
2252  */
sock_kmalloc(struct sock *sk, int size, gfp_t priority)2253 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2254 {
2255 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2256 
2257 	if ((unsigned int)size <= optmem_max &&
2258 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2259 		void *mem;
2260 		/* First do the add, to avoid the race if kmalloc
2261 		 * might sleep.
2262 		 */
2263 		atomic_add(size, &sk->sk_omem_alloc);
2264 		mem = kmalloc(size, priority);
2265 		if (mem)
2266 			return mem;
2267 		atomic_sub(size, &sk->sk_omem_alloc);
2268 	}
2269 	return NULL;
2270 }
2271 EXPORT_SYMBOL(sock_kmalloc);
2272 
2273 /* Free an option memory block. Note, we actually want the inline
2274  * here as this allows gcc to detect the nullify and fold away the
2275  * condition entirely.
2276  */
__sock_kfree_s(struct sock *sk, void *mem, int size, const bool nullify)2277 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2278 				  const bool nullify)
2279 {
2280 	if (WARN_ON_ONCE(!mem))
2281 		return;
2282 	if (nullify)
2283 		kfree_sensitive(mem);
2284 	else
2285 		kfree(mem);
2286 	atomic_sub(size, &sk->sk_omem_alloc);
2287 }
2288 
sock_kfree_s(struct sock *sk, void *mem, int size)2289 void sock_kfree_s(struct sock *sk, void *mem, int size)
2290 {
2291 	__sock_kfree_s(sk, mem, size, false);
2292 }
2293 EXPORT_SYMBOL(sock_kfree_s);
2294 
sock_kzfree_s(struct sock *sk, void *mem, int size)2295 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2296 {
2297 	__sock_kfree_s(sk, mem, size, true);
2298 }
2299 EXPORT_SYMBOL(sock_kzfree_s);
2300 
2301 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2302    I think, these locks should be removed for datagram sockets.
2303  */
sock_wait_for_wmem(struct sock *sk, long timeo)2304 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2305 {
2306 	DEFINE_WAIT(wait);
2307 
2308 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2309 	for (;;) {
2310 		if (!timeo)
2311 			break;
2312 		if (signal_pending(current))
2313 			break;
2314 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2315 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2316 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2317 			break;
2318 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2319 			break;
2320 		if (READ_ONCE(sk->sk_err))
2321 			break;
2322 		timeo = schedule_timeout(timeo);
2323 	}
2324 	finish_wait(sk_sleep(sk), &wait);
2325 	return timeo;
2326 }
2327 
2328 
2329 /*
2330  *	Generic send/receive buffer handlers
2331  */
2332 
sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order)2333 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2334 				     unsigned long data_len, int noblock,
2335 				     int *errcode, int max_page_order)
2336 {
2337 	struct sk_buff *skb;
2338 	long timeo;
2339 	int err;
2340 
2341 	timeo = sock_sndtimeo(sk, noblock);
2342 	for (;;) {
2343 		err = sock_error(sk);
2344 		if (err != 0)
2345 			goto failure;
2346 
2347 		err = -EPIPE;
2348 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2349 			goto failure;
2350 
2351 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2352 			break;
2353 
2354 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2355 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2356 		err = -EAGAIN;
2357 		if (!timeo)
2358 			goto failure;
2359 		if (signal_pending(current))
2360 			goto interrupted;
2361 		timeo = sock_wait_for_wmem(sk, timeo);
2362 	}
2363 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2364 				   errcode, sk->sk_allocation);
2365 	if (skb)
2366 		skb_set_owner_w(skb, sk);
2367 	return skb;
2368 
2369 interrupted:
2370 	err = sock_intr_errno(timeo);
2371 failure:
2372 	*errcode = err;
2373 	return NULL;
2374 }
2375 EXPORT_SYMBOL(sock_alloc_send_pskb);
2376 
sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode)2377 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2378 				    int noblock, int *errcode)
2379 {
2380 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2381 }
2382 EXPORT_SYMBOL(sock_alloc_send_skb);
2383 
__sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, struct sockcm_cookie *sockc)2384 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2385 		     struct sockcm_cookie *sockc)
2386 {
2387 	u32 tsflags;
2388 
2389 	switch (cmsg->cmsg_type) {
2390 	case SO_MARK:
2391 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2392 			return -EPERM;
2393 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2394 			return -EINVAL;
2395 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2396 		break;
2397 	case SO_TIMESTAMPING_OLD:
2398 	case SO_TIMESTAMPING_NEW:
2399 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2400 			return -EINVAL;
2401 
2402 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2403 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2404 			return -EINVAL;
2405 
2406 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2407 		sockc->tsflags |= tsflags;
2408 		break;
2409 	case SCM_TXTIME:
2410 		if (!sock_flag(sk, SOCK_TXTIME))
2411 			return -EINVAL;
2412 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2413 			return -EINVAL;
2414 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2415 		break;
2416 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2417 	case SCM_RIGHTS:
2418 	case SCM_CREDENTIALS:
2419 		break;
2420 	default:
2421 		return -EINVAL;
2422 	}
2423 	return 0;
2424 }
2425 EXPORT_SYMBOL(__sock_cmsg_send);
2426 
sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc)2427 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2428 		   struct sockcm_cookie *sockc)
2429 {
2430 	struct cmsghdr *cmsg;
2431 	int ret;
2432 
2433 	for_each_cmsghdr(cmsg, msg) {
2434 		if (!CMSG_OK(msg, cmsg))
2435 			return -EINVAL;
2436 		if (cmsg->cmsg_level != SOL_SOCKET)
2437 			continue;
2438 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2439 		if (ret)
2440 			return ret;
2441 	}
2442 	return 0;
2443 }
2444 EXPORT_SYMBOL(sock_cmsg_send);
2445 
sk_enter_memory_pressure(struct sock *sk)2446 static void sk_enter_memory_pressure(struct sock *sk)
2447 {
2448 	if (!sk->sk_prot->enter_memory_pressure)
2449 		return;
2450 
2451 	sk->sk_prot->enter_memory_pressure(sk);
2452 }
2453 
sk_leave_memory_pressure(struct sock *sk)2454 static void sk_leave_memory_pressure(struct sock *sk)
2455 {
2456 	if (sk->sk_prot->leave_memory_pressure) {
2457 		sk->sk_prot->leave_memory_pressure(sk);
2458 	} else {
2459 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2460 
2461 		if (memory_pressure && READ_ONCE(*memory_pressure))
2462 			WRITE_ONCE(*memory_pressure, 0);
2463 	}
2464 }
2465 
2466 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2467 
2468 /**
2469  * skb_page_frag_refill - check that a page_frag contains enough room
2470  * @sz: minimum size of the fragment we want to get
2471  * @pfrag: pointer to page_frag
2472  * @gfp: priority for memory allocation
2473  *
2474  * Note: While this allocator tries to use high order pages, there is
2475  * no guarantee that allocations succeed. Therefore, @sz MUST be
2476  * less or equal than PAGE_SIZE.
2477  */
skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)2478 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2479 {
2480 	if (pfrag->page) {
2481 		if (page_ref_count(pfrag->page) == 1) {
2482 			pfrag->offset = 0;
2483 			return true;
2484 		}
2485 		if (pfrag->offset + sz <= pfrag->size)
2486 			return true;
2487 		put_page(pfrag->page);
2488 	}
2489 
2490 	pfrag->offset = 0;
2491 	if (SKB_FRAG_PAGE_ORDER &&
2492 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2493 		/* Avoid direct reclaim but allow kswapd to wake */
2494 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2495 					  __GFP_COMP | __GFP_NOWARN |
2496 					  __GFP_NORETRY,
2497 					  SKB_FRAG_PAGE_ORDER);
2498 		if (likely(pfrag->page)) {
2499 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2500 			return true;
2501 		}
2502 	}
2503 	pfrag->page = alloc_page(gfp);
2504 	if (likely(pfrag->page)) {
2505 		pfrag->size = PAGE_SIZE;
2506 		return true;
2507 	}
2508 	return false;
2509 }
2510 EXPORT_SYMBOL(skb_page_frag_refill);
2511 
sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)2512 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2513 {
2514 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2515 		return true;
2516 
2517 	sk_enter_memory_pressure(sk);
2518 	sk_stream_moderate_sndbuf(sk);
2519 	return false;
2520 }
2521 EXPORT_SYMBOL(sk_page_frag_refill);
2522 
2523 static void __lock_sock(struct sock *sk)
2524 	__releases(&sk->sk_lock.slock)
2525 	__acquires(&sk->sk_lock.slock)
2526 {
2527 	DEFINE_WAIT(wait);
2528 
2529 	for (;;) {
2530 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2531 					TASK_UNINTERRUPTIBLE);
2532 		spin_unlock_bh(&sk->sk_lock.slock);
2533 		schedule();
2534 		spin_lock_bh(&sk->sk_lock.slock);
2535 		if (!sock_owned_by_user(sk))
2536 			break;
2537 	}
2538 	finish_wait(&sk->sk_lock.wq, &wait);
2539 }
2540 
2541 void __release_sock(struct sock *sk)
2542 	__releases(&sk->sk_lock.slock)
2543 	__acquires(&sk->sk_lock.slock)
2544 {
2545 	struct sk_buff *skb, *next;
2546 
2547 	while ((skb = sk->sk_backlog.head) != NULL) {
2548 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2549 
2550 		spin_unlock_bh(&sk->sk_lock.slock);
2551 
2552 		do {
2553 			next = skb->next;
2554 			prefetch(next);
2555 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2556 			skb_mark_not_on_list(skb);
2557 			sk_backlog_rcv(sk, skb);
2558 
2559 			cond_resched();
2560 
2561 			skb = next;
2562 		} while (skb != NULL);
2563 
2564 		spin_lock_bh(&sk->sk_lock.slock);
2565 	}
2566 
2567 	/*
2568 	 * Doing the zeroing here guarantee we can not loop forever
2569 	 * while a wild producer attempts to flood us.
2570 	 */
2571 	sk->sk_backlog.len = 0;
2572 }
2573 
__sk_flush_backlog(struct sock *sk)2574 void __sk_flush_backlog(struct sock *sk)
2575 {
2576 	spin_lock_bh(&sk->sk_lock.slock);
2577 	__release_sock(sk);
2578 	spin_unlock_bh(&sk->sk_lock.slock);
2579 }
2580 
2581 /**
2582  * sk_wait_data - wait for data to arrive at sk_receive_queue
2583  * @sk:    sock to wait on
2584  * @timeo: for how long
2585  * @skb:   last skb seen on sk_receive_queue
2586  *
2587  * Now socket state including sk->sk_err is changed only under lock,
2588  * hence we may omit checks after joining wait queue.
2589  * We check receive queue before schedule() only as optimization;
2590  * it is very likely that release_sock() added new data.
2591  */
sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)2592 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2593 {
2594 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2595 	int rc;
2596 
2597 	add_wait_queue(sk_sleep(sk), &wait);
2598 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2599 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2600 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2601 	remove_wait_queue(sk_sleep(sk), &wait);
2602 	return rc;
2603 }
2604 EXPORT_SYMBOL(sk_wait_data);
2605 
2606 /**
2607  *	__sk_mem_raise_allocated - increase memory_allocated
2608  *	@sk: socket
2609  *	@size: memory size to allocate
2610  *	@amt: pages to allocate
2611  *	@kind: allocation type
2612  *
2613  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2614  */
__sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)2615 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2616 {
2617 	struct proto *prot = sk->sk_prot;
2618 	long allocated = sk_memory_allocated_add(sk, amt);
2619 	bool charged = true;
2620 
2621 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2622 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2623 		goto suppress_allocation;
2624 
2625 	/* Under limit. */
2626 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2627 		sk_leave_memory_pressure(sk);
2628 		return 1;
2629 	}
2630 
2631 	/* Under pressure. */
2632 	if (allocated > sk_prot_mem_limits(sk, 1))
2633 		sk_enter_memory_pressure(sk);
2634 
2635 	/* Over hard limit. */
2636 	if (allocated > sk_prot_mem_limits(sk, 2))
2637 		goto suppress_allocation;
2638 
2639 	/* guarantee minimum buffer size under pressure */
2640 	if (kind == SK_MEM_RECV) {
2641 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2642 			return 1;
2643 
2644 	} else { /* SK_MEM_SEND */
2645 		int wmem0 = sk_get_wmem0(sk, prot);
2646 
2647 		if (sk->sk_type == SOCK_STREAM) {
2648 			if (sk->sk_wmem_queued < wmem0)
2649 				return 1;
2650 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2651 				return 1;
2652 		}
2653 	}
2654 
2655 	if (sk_has_memory_pressure(sk)) {
2656 		u64 alloc;
2657 
2658 		if (!sk_under_memory_pressure(sk))
2659 			return 1;
2660 		alloc = sk_sockets_allocated_read_positive(sk);
2661 		if (sk_prot_mem_limits(sk, 2) > alloc *
2662 		    sk_mem_pages(sk->sk_wmem_queued +
2663 				 atomic_read(&sk->sk_rmem_alloc) +
2664 				 sk->sk_forward_alloc))
2665 			return 1;
2666 	}
2667 
2668 suppress_allocation:
2669 
2670 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2671 		sk_stream_moderate_sndbuf(sk);
2672 
2673 		/* Fail only if socket is _under_ its sndbuf.
2674 		 * In this case we cannot block, so that we have to fail.
2675 		 */
2676 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2677 			return 1;
2678 	}
2679 
2680 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2681 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2682 
2683 	sk_memory_allocated_sub(sk, amt);
2684 
2685 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2686 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2687 
2688 	return 0;
2689 }
2690 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2691 
2692 /**
2693  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2694  *	@sk: socket
2695  *	@size: memory size to allocate
2696  *	@kind: allocation type
2697  *
2698  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2699  *	rmem allocation. This function assumes that protocols which have
2700  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2701  */
__sk_mem_schedule(struct sock *sk, int size, int kind)2702 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2703 {
2704 	int ret, amt = sk_mem_pages(size);
2705 
2706 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2707 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2708 	if (!ret)
2709 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2710 	return ret;
2711 }
2712 EXPORT_SYMBOL(__sk_mem_schedule);
2713 
2714 /**
2715  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2716  *	@sk: socket
2717  *	@amount: number of quanta
2718  *
2719  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2720  */
__sk_mem_reduce_allocated(struct sock *sk, int amount)2721 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2722 {
2723 	sk_memory_allocated_sub(sk, amount);
2724 
2725 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2726 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2727 
2728 	if (sk_under_global_memory_pressure(sk) &&
2729 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2730 		sk_leave_memory_pressure(sk);
2731 }
2732 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2733 
2734 /**
2735  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2736  *	@sk: socket
2737  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2738  */
__sk_mem_reclaim(struct sock *sk, int amount)2739 void __sk_mem_reclaim(struct sock *sk, int amount)
2740 {
2741 	amount >>= SK_MEM_QUANTUM_SHIFT;
2742 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2743 	__sk_mem_reduce_allocated(sk, amount);
2744 }
2745 EXPORT_SYMBOL(__sk_mem_reclaim);
2746 
sk_set_peek_off(struct sock *sk, int val)2747 int sk_set_peek_off(struct sock *sk, int val)
2748 {
2749 	WRITE_ONCE(sk->sk_peek_off, val);
2750 	return 0;
2751 }
2752 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2753 
2754 /*
2755  * Set of default routines for initialising struct proto_ops when
2756  * the protocol does not support a particular function. In certain
2757  * cases where it makes no sense for a protocol to have a "do nothing"
2758  * function, some default processing is provided.
2759  */
2760 
sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)2761 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2762 {
2763 	return -EOPNOTSUPP;
2764 }
2765 EXPORT_SYMBOL(sock_no_bind);
2766 
sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags)2767 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2768 		    int len, int flags)
2769 {
2770 	return -EOPNOTSUPP;
2771 }
2772 EXPORT_SYMBOL(sock_no_connect);
2773 
sock_no_socketpair(struct socket *sock1, struct socket *sock2)2774 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2775 {
2776 	return -EOPNOTSUPP;
2777 }
2778 EXPORT_SYMBOL(sock_no_socketpair);
2779 
sock_no_accept(struct socket *sock, struct socket *newsock, int flags, bool kern)2780 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2781 		   bool kern)
2782 {
2783 	return -EOPNOTSUPP;
2784 }
2785 EXPORT_SYMBOL(sock_no_accept);
2786 
sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer)2787 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2788 		    int peer)
2789 {
2790 	return -EOPNOTSUPP;
2791 }
2792 EXPORT_SYMBOL(sock_no_getname);
2793 
sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)2794 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2795 {
2796 	return -EOPNOTSUPP;
2797 }
2798 EXPORT_SYMBOL(sock_no_ioctl);
2799 
sock_no_listen(struct socket *sock, int backlog)2800 int sock_no_listen(struct socket *sock, int backlog)
2801 {
2802 	return -EOPNOTSUPP;
2803 }
2804 EXPORT_SYMBOL(sock_no_listen);
2805 
sock_no_shutdown(struct socket *sock, int how)2806 int sock_no_shutdown(struct socket *sock, int how)
2807 {
2808 	return -EOPNOTSUPP;
2809 }
2810 EXPORT_SYMBOL(sock_no_shutdown);
2811 
sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)2812 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2813 {
2814 	return -EOPNOTSUPP;
2815 }
2816 EXPORT_SYMBOL(sock_no_sendmsg);
2817 
sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)2818 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2819 {
2820 	return -EOPNOTSUPP;
2821 }
2822 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2823 
sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)2824 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2825 		    int flags)
2826 {
2827 	return -EOPNOTSUPP;
2828 }
2829 EXPORT_SYMBOL(sock_no_recvmsg);
2830 
sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)2831 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2832 {
2833 	/* Mirror missing mmap method error code */
2834 	return -ENODEV;
2835 }
2836 EXPORT_SYMBOL(sock_no_mmap);
2837 
2838 /*
2839  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2840  * various sock-based usage counts.
2841  */
__receive_sock(struct file *file)2842 void __receive_sock(struct file *file)
2843 {
2844 	struct socket *sock;
2845 	int error;
2846 
2847 	/*
2848 	 * The resulting value of "error" is ignored here since we only
2849 	 * need to take action when the file is a socket and testing
2850 	 * "sock" for NULL is sufficient.
2851 	 */
2852 	sock = sock_from_file(file, &error);
2853 	if (sock) {
2854 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2855 		sock_update_classid(&sock->sk->sk_cgrp_data);
2856 	}
2857 }
2858 
sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)2859 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2860 {
2861 	ssize_t res;
2862 	struct msghdr msg = {.msg_flags = flags};
2863 	struct kvec iov;
2864 	char *kaddr = kmap(page);
2865 	iov.iov_base = kaddr + offset;
2866 	iov.iov_len = size;
2867 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2868 	kunmap(page);
2869 	return res;
2870 }
2871 EXPORT_SYMBOL(sock_no_sendpage);
2872 
sock_no_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags)2873 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2874 				int offset, size_t size, int flags)
2875 {
2876 	ssize_t res;
2877 	struct msghdr msg = {.msg_flags = flags};
2878 	struct kvec iov;
2879 	char *kaddr = kmap(page);
2880 
2881 	iov.iov_base = kaddr + offset;
2882 	iov.iov_len = size;
2883 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2884 	kunmap(page);
2885 	return res;
2886 }
2887 EXPORT_SYMBOL(sock_no_sendpage_locked);
2888 
2889 /*
2890  *	Default Socket Callbacks
2891  */
2892 
sock_def_wakeup(struct sock *sk)2893 static void sock_def_wakeup(struct sock *sk)
2894 {
2895 	struct socket_wq *wq;
2896 
2897 	rcu_read_lock();
2898 	wq = rcu_dereference(sk->sk_wq);
2899 	if (skwq_has_sleeper(wq))
2900 		wake_up_interruptible_all(&wq->wait);
2901 	rcu_read_unlock();
2902 }
2903 
sock_def_error_report(struct sock *sk)2904 static void sock_def_error_report(struct sock *sk)
2905 {
2906 	struct socket_wq *wq;
2907 
2908 	rcu_read_lock();
2909 	wq = rcu_dereference(sk->sk_wq);
2910 	if (skwq_has_sleeper(wq))
2911 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2912 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2913 	rcu_read_unlock();
2914 }
2915 
sock_def_readable(struct sock *sk)2916 void sock_def_readable(struct sock *sk)
2917 {
2918 	struct socket_wq *wq;
2919 
2920 	rcu_read_lock();
2921 	wq = rcu_dereference(sk->sk_wq);
2922 	if (skwq_has_sleeper(wq))
2923 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2924 						EPOLLRDNORM | EPOLLRDBAND);
2925 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2926 	rcu_read_unlock();
2927 }
2928 
sock_def_write_space(struct sock *sk)2929 static void sock_def_write_space(struct sock *sk)
2930 {
2931 	struct socket_wq *wq;
2932 
2933 	rcu_read_lock();
2934 
2935 	/* Do not wake up a writer until he can make "significant"
2936 	 * progress.  --DaveM
2937 	 */
2938 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2939 		wq = rcu_dereference(sk->sk_wq);
2940 		if (skwq_has_sleeper(wq))
2941 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2942 						EPOLLWRNORM | EPOLLWRBAND);
2943 
2944 		/* Should agree with poll, otherwise some programs break */
2945 		if (sock_writeable(sk))
2946 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2947 	}
2948 
2949 	rcu_read_unlock();
2950 }
2951 
sock_def_destruct(struct sock *sk)2952 static void sock_def_destruct(struct sock *sk)
2953 {
2954 }
2955 
sk_send_sigurg(struct sock *sk)2956 void sk_send_sigurg(struct sock *sk)
2957 {
2958 	if (sk->sk_socket && sk->sk_socket->file)
2959 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2960 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2961 }
2962 EXPORT_SYMBOL(sk_send_sigurg);
2963 
sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires)2964 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2965 		    unsigned long expires)
2966 {
2967 	if (!mod_timer(timer, expires))
2968 		sock_hold(sk);
2969 }
2970 EXPORT_SYMBOL(sk_reset_timer);
2971 
sk_stop_timer(struct sock *sk, struct timer_list* timer)2972 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2973 {
2974 	if (del_timer(timer))
2975 		__sock_put(sk);
2976 }
2977 EXPORT_SYMBOL(sk_stop_timer);
2978 
sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)2979 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2980 {
2981 	if (del_timer_sync(timer))
2982 		__sock_put(sk);
2983 }
2984 EXPORT_SYMBOL(sk_stop_timer_sync);
2985 
sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)2986 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
2987 {
2988 	sk_init_common(sk);
2989 	sk->sk_send_head	=	NULL;
2990 
2991 	timer_setup(&sk->sk_timer, NULL, 0);
2992 
2993 	sk->sk_allocation	=	GFP_KERNEL;
2994 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
2995 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
2996 	sk->sk_state		=	TCP_CLOSE;
2997 	sk_set_socket(sk, sock);
2998 
2999 	sock_set_flag(sk, SOCK_ZAPPED);
3000 
3001 	if (sock) {
3002 		sk->sk_type	=	sock->type;
3003 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3004 		sock->sk	=	sk;
3005 	} else {
3006 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3007 	}
3008 	sk->sk_uid	=	uid;
3009 
3010 	rwlock_init(&sk->sk_callback_lock);
3011 	if (sk->sk_kern_sock)
3012 		lockdep_set_class_and_name(
3013 			&sk->sk_callback_lock,
3014 			af_kern_callback_keys + sk->sk_family,
3015 			af_family_kern_clock_key_strings[sk->sk_family]);
3016 	else
3017 		lockdep_set_class_and_name(
3018 			&sk->sk_callback_lock,
3019 			af_callback_keys + sk->sk_family,
3020 			af_family_clock_key_strings[sk->sk_family]);
3021 
3022 	sk->sk_state_change	=	sock_def_wakeup;
3023 	sk->sk_data_ready	=	sock_def_readable;
3024 	sk->sk_write_space	=	sock_def_write_space;
3025 	sk->sk_error_report	=	sock_def_error_report;
3026 	sk->sk_destruct		=	sock_def_destruct;
3027 
3028 	sk->sk_frag.page	=	NULL;
3029 	sk->sk_frag.offset	=	0;
3030 	sk->sk_peek_off		=	-1;
3031 
3032 	sk->sk_peer_pid 	=	NULL;
3033 	sk->sk_peer_cred	=	NULL;
3034 	spin_lock_init(&sk->sk_peer_lock);
3035 
3036 	sk->sk_write_pending	=	0;
3037 	sk->sk_rcvlowat		=	1;
3038 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3039 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3040 
3041 	sk->sk_stamp = SK_DEFAULT_STAMP;
3042 #if BITS_PER_LONG==32
3043 	seqlock_init(&sk->sk_stamp_seq);
3044 #endif
3045 	atomic_set(&sk->sk_zckey, 0);
3046 
3047 #ifdef CONFIG_NET_RX_BUSY_POLL
3048 	sk->sk_napi_id		=	0;
3049 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3050 #endif
3051 
3052 	sk->sk_max_pacing_rate = ~0UL;
3053 	sk->sk_pacing_rate = ~0UL;
3054 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3055 	sk->sk_incoming_cpu = -1;
3056 
3057 	sk_rx_queue_clear(sk);
3058 	/*
3059 	 * Before updating sk_refcnt, we must commit prior changes to memory
3060 	 * (Documentation/RCU/rculist_nulls.rst for details)
3061 	 */
3062 	smp_wmb();
3063 	refcount_set(&sk->sk_refcnt, 1);
3064 	atomic_set(&sk->sk_drops, 0);
3065 }
3066 EXPORT_SYMBOL(sock_init_data_uid);
3067 
sock_init_data(struct socket *sock, struct sock *sk)3068 void sock_init_data(struct socket *sock, struct sock *sk)
3069 {
3070 	kuid_t uid = sock ?
3071 		SOCK_INODE(sock)->i_uid :
3072 		make_kuid(sock_net(sk)->user_ns, 0);
3073 
3074 	sock_init_data_uid(sock, sk, uid);
3075 }
3076 EXPORT_SYMBOL(sock_init_data);
3077 
lock_sock_nested(struct sock *sk, int subclass)3078 void lock_sock_nested(struct sock *sk, int subclass)
3079 {
3080 	might_sleep();
3081 	spin_lock_bh(&sk->sk_lock.slock);
3082 	if (sk->sk_lock.owned)
3083 		__lock_sock(sk);
3084 	sk->sk_lock.owned = 1;
3085 	spin_unlock(&sk->sk_lock.slock);
3086 	/*
3087 	 * The sk_lock has mutex_lock() semantics here:
3088 	 */
3089 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3090 	local_bh_enable();
3091 }
3092 EXPORT_SYMBOL(lock_sock_nested);
3093 
release_sock(struct sock *sk)3094 void release_sock(struct sock *sk)
3095 {
3096 	spin_lock_bh(&sk->sk_lock.slock);
3097 	if (sk->sk_backlog.tail)
3098 		__release_sock(sk);
3099 
3100 	/* Warning : release_cb() might need to release sk ownership,
3101 	 * ie call sock_release_ownership(sk) before us.
3102 	 */
3103 	if (sk->sk_prot->release_cb)
3104 		sk->sk_prot->release_cb(sk);
3105 
3106 	sock_release_ownership(sk);
3107 	if (waitqueue_active(&sk->sk_lock.wq))
3108 		wake_up(&sk->sk_lock.wq);
3109 	spin_unlock_bh(&sk->sk_lock.slock);
3110 }
3111 EXPORT_SYMBOL(release_sock);
3112 
3113 /**
3114  * lock_sock_fast - fast version of lock_sock
3115  * @sk: socket
3116  *
3117  * This version should be used for very small section, where process wont block
3118  * return false if fast path is taken:
3119  *
3120  *   sk_lock.slock locked, owned = 0, BH disabled
3121  *
3122  * return true if slow path is taken:
3123  *
3124  *   sk_lock.slock unlocked, owned = 1, BH enabled
3125  */
lock_sock_fast(struct sock *sk)3126 bool lock_sock_fast(struct sock *sk)
3127 {
3128 	might_sleep();
3129 	spin_lock_bh(&sk->sk_lock.slock);
3130 
3131 	if (!sk->sk_lock.owned)
3132 		/*
3133 		 * Note : We must disable BH
3134 		 */
3135 		return false;
3136 
3137 	__lock_sock(sk);
3138 	sk->sk_lock.owned = 1;
3139 	spin_unlock(&sk->sk_lock.slock);
3140 	/*
3141 	 * The sk_lock has mutex_lock() semantics here:
3142 	 */
3143 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3144 	local_bh_enable();
3145 	return true;
3146 }
3147 EXPORT_SYMBOL(lock_sock_fast);
3148 
sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32)3149 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3150 		   bool timeval, bool time32)
3151 {
3152 	struct sock *sk = sock->sk;
3153 	struct timespec64 ts;
3154 
3155 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3156 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3157 	if (ts.tv_sec == -1)
3158 		return -ENOENT;
3159 	if (ts.tv_sec == 0) {
3160 		ktime_t kt = ktime_get_real();
3161 		sock_write_timestamp(sk, kt);
3162 		ts = ktime_to_timespec64(kt);
3163 	}
3164 
3165 	if (timeval)
3166 		ts.tv_nsec /= 1000;
3167 
3168 #ifdef CONFIG_COMPAT_32BIT_TIME
3169 	if (time32)
3170 		return put_old_timespec32(&ts, userstamp);
3171 #endif
3172 #ifdef CONFIG_SPARC64
3173 	/* beware of padding in sparc64 timeval */
3174 	if (timeval && !in_compat_syscall()) {
3175 		struct __kernel_old_timeval __user tv = {
3176 			.tv_sec = ts.tv_sec,
3177 			.tv_usec = ts.tv_nsec,
3178 		};
3179 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3180 			return -EFAULT;
3181 		return 0;
3182 	}
3183 #endif
3184 	return put_timespec64(&ts, userstamp);
3185 }
3186 EXPORT_SYMBOL(sock_gettstamp);
3187 
sock_enable_timestamp(struct sock *sk, enum sock_flags flag)3188 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3189 {
3190 	if (!sock_flag(sk, flag)) {
3191 		unsigned long previous_flags = sk->sk_flags;
3192 
3193 		sock_set_flag(sk, flag);
3194 		/*
3195 		 * we just set one of the two flags which require net
3196 		 * time stamping, but time stamping might have been on
3197 		 * already because of the other one
3198 		 */
3199 		if (sock_needs_netstamp(sk) &&
3200 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3201 			net_enable_timestamp();
3202 	}
3203 }
3204 
sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type)3205 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3206 		       int level, int type)
3207 {
3208 	struct sock_exterr_skb *serr;
3209 	struct sk_buff *skb;
3210 	int copied, err;
3211 
3212 	err = -EAGAIN;
3213 	skb = sock_dequeue_err_skb(sk);
3214 	if (skb == NULL)
3215 		goto out;
3216 
3217 	copied = skb->len;
3218 	if (copied > len) {
3219 		msg->msg_flags |= MSG_TRUNC;
3220 		copied = len;
3221 	}
3222 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3223 	if (err)
3224 		goto out_free_skb;
3225 
3226 	sock_recv_timestamp(msg, sk, skb);
3227 
3228 	serr = SKB_EXT_ERR(skb);
3229 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3230 
3231 	msg->msg_flags |= MSG_ERRQUEUE;
3232 	err = copied;
3233 
3234 out_free_skb:
3235 	kfree_skb(skb);
3236 out:
3237 	return err;
3238 }
3239 EXPORT_SYMBOL(sock_recv_errqueue);
3240 
3241 /*
3242  *	Get a socket option on an socket.
3243  *
3244  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3245  *	asynchronous errors should be reported by getsockopt. We assume
3246  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3247  */
sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)3248 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3249 			   char __user *optval, int __user *optlen)
3250 {
3251 	struct sock *sk = sock->sk;
3252 
3253 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3254 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3255 }
3256 EXPORT_SYMBOL(sock_common_getsockopt);
3257 
sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags)3258 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3259 			int flags)
3260 {
3261 	struct sock *sk = sock->sk;
3262 	int addr_len = 0;
3263 	int err;
3264 
3265 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3266 				   flags & ~MSG_DONTWAIT, &addr_len);
3267 	if (err >= 0)
3268 		msg->msg_namelen = addr_len;
3269 	return err;
3270 }
3271 EXPORT_SYMBOL(sock_common_recvmsg);
3272 
3273 /*
3274  *	Set socket options on an inet socket.
3275  */
sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen)3276 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3277 			   sockptr_t optval, unsigned int optlen)
3278 {
3279 	struct sock *sk = sock->sk;
3280 
3281 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3282 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3283 }
3284 EXPORT_SYMBOL(sock_common_setsockopt);
3285 
sk_common_release(struct sock *sk)3286 void sk_common_release(struct sock *sk)
3287 {
3288 	if (sk->sk_prot->destroy)
3289 		sk->sk_prot->destroy(sk);
3290 
3291 	/*
3292 	 * Observation: when sk_common_release is called, processes have
3293 	 * no access to socket. But net still has.
3294 	 * Step one, detach it from networking:
3295 	 *
3296 	 * A. Remove from hash tables.
3297 	 */
3298 
3299 	sk->sk_prot->unhash(sk);
3300 
3301 	/*
3302 	 * In this point socket cannot receive new packets, but it is possible
3303 	 * that some packets are in flight because some CPU runs receiver and
3304 	 * did hash table lookup before we unhashed socket. They will achieve
3305 	 * receive queue and will be purged by socket destructor.
3306 	 *
3307 	 * Also we still have packets pending on receive queue and probably,
3308 	 * our own packets waiting in device queues. sock_destroy will drain
3309 	 * receive queue, but transmitted packets will delay socket destruction
3310 	 * until the last reference will be released.
3311 	 */
3312 
3313 	sock_orphan(sk);
3314 
3315 	xfrm_sk_free_policy(sk);
3316 
3317 	sk_refcnt_debug_release(sk);
3318 
3319 	sock_put(sk);
3320 }
3321 EXPORT_SYMBOL(sk_common_release);
3322 
sk_get_meminfo(const struct sock *sk, u32 *mem)3323 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3324 {
3325 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3326 
3327 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3328 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3329 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3330 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3331 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3332 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3333 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3334 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3335 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3336 }
3337 
3338 #ifdef CONFIG_PROC_FS
3339 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3340 struct prot_inuse {
3341 	int val[PROTO_INUSE_NR];
3342 };
3343 
3344 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3345 
sock_prot_inuse_add(struct net *net, struct proto *prot, int val)3346 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3347 {
3348 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3349 }
3350 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3351 
sock_prot_inuse_get(struct net *net, struct proto *prot)3352 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3353 {
3354 	int cpu, idx = prot->inuse_idx;
3355 	int res = 0;
3356 
3357 	for_each_possible_cpu(cpu)
3358 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3359 
3360 	return res >= 0 ? res : 0;
3361 }
3362 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3363 
sock_inuse_add(struct net *net, int val)3364 static void sock_inuse_add(struct net *net, int val)
3365 {
3366 	this_cpu_add(*net->core.sock_inuse, val);
3367 }
3368 
sock_inuse_get(struct net *net)3369 int sock_inuse_get(struct net *net)
3370 {
3371 	int cpu, res = 0;
3372 
3373 	for_each_possible_cpu(cpu)
3374 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3375 
3376 	return res;
3377 }
3378 
3379 EXPORT_SYMBOL_GPL(sock_inuse_get);
3380 
sock_inuse_init_net(struct net *net)3381 static int __net_init sock_inuse_init_net(struct net *net)
3382 {
3383 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3384 	if (net->core.prot_inuse == NULL)
3385 		return -ENOMEM;
3386 
3387 	net->core.sock_inuse = alloc_percpu(int);
3388 	if (net->core.sock_inuse == NULL)
3389 		goto out;
3390 
3391 	return 0;
3392 
3393 out:
3394 	free_percpu(net->core.prot_inuse);
3395 	return -ENOMEM;
3396 }
3397 
sock_inuse_exit_net(struct net *net)3398 static void __net_exit sock_inuse_exit_net(struct net *net)
3399 {
3400 	free_percpu(net->core.prot_inuse);
3401 	free_percpu(net->core.sock_inuse);
3402 }
3403 
3404 static struct pernet_operations net_inuse_ops = {
3405 	.init = sock_inuse_init_net,
3406 	.exit = sock_inuse_exit_net,
3407 };
3408 
net_inuse_init(void)3409 static __init int net_inuse_init(void)
3410 {
3411 	if (register_pernet_subsys(&net_inuse_ops))
3412 		panic("Cannot initialize net inuse counters");
3413 
3414 	return 0;
3415 }
3416 
3417 core_initcall(net_inuse_init);
3418 
assign_proto_idx(struct proto *prot)3419 static int assign_proto_idx(struct proto *prot)
3420 {
3421 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3422 
3423 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3424 		pr_err("PROTO_INUSE_NR exhausted\n");
3425 		return -ENOSPC;
3426 	}
3427 
3428 	set_bit(prot->inuse_idx, proto_inuse_idx);
3429 	return 0;
3430 }
3431 
release_proto_idx(struct proto *prot)3432 static void release_proto_idx(struct proto *prot)
3433 {
3434 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3435 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3436 }
3437 #else
assign_proto_idx(struct proto *prot)3438 static inline int assign_proto_idx(struct proto *prot)
3439 {
3440 	return 0;
3441 }
3442 
release_proto_idx(struct proto *prot)3443 static inline void release_proto_idx(struct proto *prot)
3444 {
3445 }
3446 
sock_inuse_add(struct net *net, int val)3447 static void sock_inuse_add(struct net *net, int val)
3448 {
3449 }
3450 #endif
3451 
tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)3452 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3453 {
3454 	if (!twsk_prot)
3455 		return;
3456 	kfree(twsk_prot->twsk_slab_name);
3457 	twsk_prot->twsk_slab_name = NULL;
3458 	kmem_cache_destroy(twsk_prot->twsk_slab);
3459 	twsk_prot->twsk_slab = NULL;
3460 }
3461 
req_prot_cleanup(struct request_sock_ops *rsk_prot)3462 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3463 {
3464 	if (!rsk_prot)
3465 		return;
3466 	kfree(rsk_prot->slab_name);
3467 	rsk_prot->slab_name = NULL;
3468 	kmem_cache_destroy(rsk_prot->slab);
3469 	rsk_prot->slab = NULL;
3470 }
3471 
req_prot_init(const struct proto *prot)3472 static int req_prot_init(const struct proto *prot)
3473 {
3474 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3475 
3476 	if (!rsk_prot)
3477 		return 0;
3478 
3479 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3480 					prot->name);
3481 	if (!rsk_prot->slab_name)
3482 		return -ENOMEM;
3483 
3484 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3485 					   rsk_prot->obj_size, 0,
3486 					   SLAB_ACCOUNT | prot->slab_flags,
3487 					   NULL);
3488 
3489 	if (!rsk_prot->slab) {
3490 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3491 			prot->name);
3492 		return -ENOMEM;
3493 	}
3494 	return 0;
3495 }
3496 
proto_register(struct proto *prot, int alloc_slab)3497 int proto_register(struct proto *prot, int alloc_slab)
3498 {
3499 	int ret = -ENOBUFS;
3500 
3501 	if (alloc_slab) {
3502 		prot->slab = kmem_cache_create_usercopy(prot->name,
3503 					prot->obj_size, 0,
3504 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3505 					prot->slab_flags,
3506 					prot->useroffset, prot->usersize,
3507 					NULL);
3508 
3509 		if (prot->slab == NULL) {
3510 			pr_crit("%s: Can't create sock SLAB cache!\n",
3511 				prot->name);
3512 			goto out;
3513 		}
3514 
3515 		if (req_prot_init(prot))
3516 			goto out_free_request_sock_slab;
3517 
3518 		if (prot->twsk_prot != NULL) {
3519 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3520 
3521 			if (prot->twsk_prot->twsk_slab_name == NULL)
3522 				goto out_free_request_sock_slab;
3523 
3524 			prot->twsk_prot->twsk_slab =
3525 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3526 						  prot->twsk_prot->twsk_obj_size,
3527 						  0,
3528 						  SLAB_ACCOUNT |
3529 						  prot->slab_flags,
3530 						  NULL);
3531 			if (prot->twsk_prot->twsk_slab == NULL)
3532 				goto out_free_timewait_sock_slab;
3533 		}
3534 	}
3535 
3536 	mutex_lock(&proto_list_mutex);
3537 	ret = assign_proto_idx(prot);
3538 	if (ret) {
3539 		mutex_unlock(&proto_list_mutex);
3540 		goto out_free_timewait_sock_slab;
3541 	}
3542 	list_add(&prot->node, &proto_list);
3543 	mutex_unlock(&proto_list_mutex);
3544 	return ret;
3545 
3546 out_free_timewait_sock_slab:
3547 	if (alloc_slab && prot->twsk_prot)
3548 		tw_prot_cleanup(prot->twsk_prot);
3549 out_free_request_sock_slab:
3550 	if (alloc_slab) {
3551 		req_prot_cleanup(prot->rsk_prot);
3552 
3553 		kmem_cache_destroy(prot->slab);
3554 		prot->slab = NULL;
3555 	}
3556 out:
3557 	return ret;
3558 }
3559 EXPORT_SYMBOL(proto_register);
3560 
proto_unregister(struct proto *prot)3561 void proto_unregister(struct proto *prot)
3562 {
3563 	mutex_lock(&proto_list_mutex);
3564 	release_proto_idx(prot);
3565 	list_del(&prot->node);
3566 	mutex_unlock(&proto_list_mutex);
3567 
3568 	kmem_cache_destroy(prot->slab);
3569 	prot->slab = NULL;
3570 
3571 	req_prot_cleanup(prot->rsk_prot);
3572 	tw_prot_cleanup(prot->twsk_prot);
3573 }
3574 EXPORT_SYMBOL(proto_unregister);
3575 
sock_load_diag_module(int family, int protocol)3576 int sock_load_diag_module(int family, int protocol)
3577 {
3578 	if (!protocol) {
3579 		if (!sock_is_registered(family))
3580 			return -ENOENT;
3581 
3582 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3583 				      NETLINK_SOCK_DIAG, family);
3584 	}
3585 
3586 #ifdef CONFIG_INET
3587 	if (family == AF_INET &&
3588 	    protocol != IPPROTO_RAW &&
3589 	    protocol < MAX_INET_PROTOS &&
3590 	    !rcu_access_pointer(inet_protos[protocol]))
3591 		return -ENOENT;
3592 #endif
3593 
3594 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3595 			      NETLINK_SOCK_DIAG, family, protocol);
3596 }
3597 EXPORT_SYMBOL(sock_load_diag_module);
3598 
3599 #ifdef CONFIG_PROC_FS
3600 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
__acquiresnull3601 	__acquires(proto_list_mutex)
3602 {
3603 	mutex_lock(&proto_list_mutex);
3604 	return seq_list_start_head(&proto_list, *pos);
3605 }
3606 
proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)3607 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3608 {
3609 	return seq_list_next(v, &proto_list, pos);
3610 }
3611 
3612 static void proto_seq_stop(struct seq_file *seq, void *v)
__releasesnull3613 	__releases(proto_list_mutex)
3614 {
3615 	mutex_unlock(&proto_list_mutex);
3616 }
3617 
proto_method_implemented(const void *method)3618 static char proto_method_implemented(const void *method)
3619 {
3620 	return method == NULL ? 'n' : 'y';
3621 }
sock_prot_memory_allocated(struct proto *proto)3622 static long sock_prot_memory_allocated(struct proto *proto)
3623 {
3624 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3625 }
3626 
sock_prot_memory_pressure(struct proto *proto)3627 static const char *sock_prot_memory_pressure(struct proto *proto)
3628 {
3629 	return proto->memory_pressure != NULL ?
3630 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3631 }
3632 
proto_seq_printf(struct seq_file *seq, struct proto *proto)3633 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3634 {
3635 
3636 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3637 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3638 		   proto->name,
3639 		   proto->obj_size,
3640 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3641 		   sock_prot_memory_allocated(proto),
3642 		   sock_prot_memory_pressure(proto),
3643 		   proto->max_header,
3644 		   proto->slab == NULL ? "no" : "yes",
3645 		   module_name(proto->owner),
3646 		   proto_method_implemented(proto->close),
3647 		   proto_method_implemented(proto->connect),
3648 		   proto_method_implemented(proto->disconnect),
3649 		   proto_method_implemented(proto->accept),
3650 		   proto_method_implemented(proto->ioctl),
3651 		   proto_method_implemented(proto->init),
3652 		   proto_method_implemented(proto->destroy),
3653 		   proto_method_implemented(proto->shutdown),
3654 		   proto_method_implemented(proto->setsockopt),
3655 		   proto_method_implemented(proto->getsockopt),
3656 		   proto_method_implemented(proto->sendmsg),
3657 		   proto_method_implemented(proto->recvmsg),
3658 		   proto_method_implemented(proto->sendpage),
3659 		   proto_method_implemented(proto->bind),
3660 		   proto_method_implemented(proto->backlog_rcv),
3661 		   proto_method_implemented(proto->hash),
3662 		   proto_method_implemented(proto->unhash),
3663 		   proto_method_implemented(proto->get_port),
3664 		   proto_method_implemented(proto->enter_memory_pressure));
3665 }
3666 
proto_seq_show(struct seq_file *seq, void *v)3667 static int proto_seq_show(struct seq_file *seq, void *v)
3668 {
3669 	if (v == &proto_list)
3670 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3671 			   "protocol",
3672 			   "size",
3673 			   "sockets",
3674 			   "memory",
3675 			   "press",
3676 			   "maxhdr",
3677 			   "slab",
3678 			   "module",
3679 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3680 	else
3681 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3682 	return 0;
3683 }
3684 
3685 static const struct seq_operations proto_seq_ops = {
3686 	.start  = proto_seq_start,
3687 	.next   = proto_seq_next,
3688 	.stop   = proto_seq_stop,
3689 	.show   = proto_seq_show,
3690 };
3691 
proto_init_net(struct net *net)3692 static __net_init int proto_init_net(struct net *net)
3693 {
3694 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3695 			sizeof(struct seq_net_private)))
3696 		return -ENOMEM;
3697 
3698 	return 0;
3699 }
3700 
proto_exit_net(struct net *net)3701 static __net_exit void proto_exit_net(struct net *net)
3702 {
3703 	remove_proc_entry("protocols", net->proc_net);
3704 }
3705 
3706 
3707 static __net_initdata struct pernet_operations proto_net_ops = {
3708 	.init = proto_init_net,
3709 	.exit = proto_exit_net,
3710 };
3711 
proto_init(void)3712 static int __init proto_init(void)
3713 {
3714 	return register_pernet_subsys(&proto_net_ops);
3715 }
3716 
3717 subsys_initcall(proto_init);
3718 
3719 #endif /* PROC_FS */
3720 
3721 #ifdef CONFIG_NET_RX_BUSY_POLL
sk_busy_loop_end(void *p, unsigned long start_time)3722 bool sk_busy_loop_end(void *p, unsigned long start_time)
3723 {
3724 	struct sock *sk = p;
3725 
3726 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3727 	       sk_busy_loop_timeout(sk, start_time);
3728 }
3729 EXPORT_SYMBOL(sk_busy_loop_end);
3730 #endif /* CONFIG_NET_RX_BUSY_POLL */
3731 
sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)3732 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3733 {
3734 	if (!sk->sk_prot->bind_add)
3735 		return -EOPNOTSUPP;
3736 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3737 }
3738 EXPORT_SYMBOL(sock_bind_add);
3739