xref: /kernel/linux/linux-6.6/net/core/sock.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
4 *		operating system.  INET is implemented using the  BSD Socket
5 *		interface as the means of communication with the user level.
6 *
7 *		Generic socket support routines. Memory allocators, socket lock/release
8 *		handler for protocols to use and generic option handler.
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/udp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114#include <linux/static_key.h>
115#include <linux/memcontrol.h>
116#include <linux/prefetch.h>
117#include <linux/compat.h>
118#include <linux/mroute.h>
119#include <linux/mroute6.h>
120#include <linux/icmpv6.h>
121
122#include <linux/uaccess.h>
123
124#include <linux/netdevice.h>
125#include <net/protocol.h>
126#include <linux/skbuff.h>
127#include <net/net_namespace.h>
128#include <net/request_sock.h>
129#include <net/sock.h>
130#include <linux/net_tstamp.h>
131#include <net/xfrm.h>
132#include <linux/ipsec.h>
133#include <net/cls_cgroup.h>
134#include <net/netprio_cgroup.h>
135#include <linux/sock_diag.h>
136
137#include <linux/filter.h>
138#include <net/sock_reuseport.h>
139#include <net/bpf_sk_storage.h>
140
141#include <trace/events/sock.h>
142
143#include <net/tcp.h>
144#include <net/busy_poll.h>
145#include <net/phonet/phonet.h>
146
147#include <linux/ethtool.h>
148
149#include "dev.h"
150
/*
 * File-local list head and mutex.
 * NOTE(review): presumably proto_list_mutex serializes access to
 * proto_list; their users are outside this part of the file - confirm.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declarations; the definitions are not in this part of the file. */
static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);
156
157/**
158 * sk_ns_capable - General socket capability test
159 * @sk: Socket to use a capability on or through
160 * @user_ns: The user namespace of the capability to use
161 * @cap: The capability to use
162 *
163 * Test to see if the opener of the socket had when the socket was
164 * created and the current process has the capability @cap in the user
165 * namespace @user_ns.
166 */
167bool sk_ns_capable(const struct sock *sk,
168		   struct user_namespace *user_ns, int cap)
169{
170	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
171		ns_capable(user_ns, cap);
172}
173EXPORT_SYMBOL(sk_ns_capable);
174
175/**
176 * sk_capable - Socket global capability test
177 * @sk: Socket to use a capability on or through
178 * @cap: The global capability to use
179 *
180 * Test to see if the opener of the socket had when the socket was
181 * created and the current process has the capability @cap in all user
182 * namespaces.
183 */
184bool sk_capable(const struct sock *sk, int cap)
185{
186	return sk_ns_capable(sk, &init_user_ns, cap);
187}
188EXPORT_SYMBOL(sk_capable);
189
190/**
191 * sk_net_capable - Network namespace socket capability test
192 * @sk: Socket to use a capability on or through
193 * @cap: The capability to use
194 *
195 * Test to see if the opener of the socket had when the socket was created
196 * and the current process has the capability @cap over the network namespace
197 * the socket is a member of.
198 */
199bool sk_net_capable(const struct sock *sk, int cap)
200{
201	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
202}
203EXPORT_SYMBOL(sk_net_capable);
204
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * All four arrays hold AF_MAX entries.
 * NOTE(review): presumably indexed by the socket's address family; the
 * code that indexes them is outside this part of the file - confirm.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
214
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each formed by placing the literal @x directly in front of a family
 * name so that the compiler concatenates the two.  The list ends with an
 * "AF_MAX" entry.
 * NOTE(review): the entries are presumably listed in numeric AF_* order so
 * the resulting arrays can be indexed by family - confirm when adding one.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"
239
/*
 * Name tables built from _sock_locks(): one string per entry of that
 * macro, each carrying the prefix passed in.  Every table has AF_MAX+1
 * slots, matching the trailing "AF_MAX" entry of the macro.
 * The code consuming these tables is outside this part of the file.
 */

/* "sk_lock-", "slock-" and "clock-" prefixed names. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* The same three name sets with an additional "k-" prefix. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
/* "rlock-", "wlock-" and "elock-" prefixed names. */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
268
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * Each array holds AF_MAX entries.
 * NOTE(review): the r/w/e arrays presumably pair with the "rlock-",
 * "wlock-" and "elock-" name tables - confirm against their users.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
278
/* Run time adjustable parameters. */

/* Both maxima start out at the SK_WMEM_MAX / SK_RMEM_MAX constants. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* The defaults are initialised from the same constants as the maxima. */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default; its readers are outside this part of the file. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key, initially false.  sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
295
296/**
297 * sk_set_memalloc - sets %SOCK_MEMALLOC
298 * @sk: socket to set it on
299 *
300 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
301 * It's the responsibility of the admin to adjust min_free_kbytes
302 * to meet the requirements
303 */
304void sk_set_memalloc(struct sock *sk)
305{
306	sock_set_flag(sk, SOCK_MEMALLOC);
307	sk->sk_allocation |= __GFP_MEMALLOC;
308	static_branch_inc(&memalloc_socks_key);
309}
310EXPORT_SYMBOL_GPL(sk_set_memalloc);
311
312void sk_clear_memalloc(struct sock *sk)
313{
314	sock_reset_flag(sk, SOCK_MEMALLOC);
315	sk->sk_allocation &= ~__GFP_MEMALLOC;
316	static_branch_dec(&memalloc_socks_key);
317
318	/*
319	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
320	 * progress of swapping. SOCK_MEMALLOC may be cleared while
321	 * it has rmem allocations due to the last swapfile being deactivated
322	 * but there is a risk that the socket is unusable due to exceeding
323	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
324	 */
325	sk_mem_reclaim(sk);
326}
327EXPORT_SYMBOL_GPL(sk_clear_memalloc);
328
329int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
330{
331	int ret;
332	unsigned int noreclaim_flag;
333
334	/* these should have been dropped before queueing */
335	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
336
337	noreclaim_flag = memalloc_noreclaim_save();
338	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
339				 tcp_v6_do_rcv,
340				 tcp_v4_do_rcv,
341				 sk, skb);
342	memalloc_noreclaim_restore(noreclaim_flag);
343
344	return ret;
345}
346EXPORT_SYMBOL(__sk_backlog_rcv);
347
348void sk_error_report(struct sock *sk)
349{
350	sk->sk_error_report(sk);
351
352	switch (sk->sk_family) {
353	case AF_INET:
354		fallthrough;
355	case AF_INET6:
356		trace_inet_sk_error_report(sk);
357		break;
358	default:
359		break;
360	}
361}
362EXPORT_SYMBOL(sk_error_report);
363
364int sock_get_timeout(long timeo, void *optval, bool old_timeval)
365{
366	struct __kernel_sock_timeval tv;
367
368	if (timeo == MAX_SCHEDULE_TIMEOUT) {
369		tv.tv_sec = 0;
370		tv.tv_usec = 0;
371	} else {
372		tv.tv_sec = timeo / HZ;
373		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
374	}
375
376	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
377		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
378		*(struct old_timeval32 *)optval = tv32;
379		return sizeof(tv32);
380	}
381
382	if (old_timeval) {
383		struct __kernel_old_timeval old_tv;
384		old_tv.tv_sec = tv.tv_sec;
385		old_tv.tv_usec = tv.tv_usec;
386		*(struct __kernel_old_timeval *)optval = old_tv;
387		return sizeof(old_tv);
388	}
389
390	*(struct __kernel_sock_timeval *)optval = tv;
391	return sizeof(tv);
392}
393EXPORT_SYMBOL(sock_get_timeout);
394
395int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
396			   sockptr_t optval, int optlen, bool old_timeval)
397{
398	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
399		struct old_timeval32 tv32;
400
401		if (optlen < sizeof(tv32))
402			return -EINVAL;
403
404		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
405			return -EFAULT;
406		tv->tv_sec = tv32.tv_sec;
407		tv->tv_usec = tv32.tv_usec;
408	} else if (old_timeval) {
409		struct __kernel_old_timeval old_tv;
410
411		if (optlen < sizeof(old_tv))
412			return -EINVAL;
413		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
414			return -EFAULT;
415		tv->tv_sec = old_tv.tv_sec;
416		tv->tv_usec = old_tv.tv_usec;
417	} else {
418		if (optlen < sizeof(*tv))
419			return -EINVAL;
420		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
421			return -EFAULT;
422	}
423
424	return 0;
425}
426EXPORT_SYMBOL(sock_copy_user_timeval);
427
428static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
429			    bool old_timeval)
430{
431	struct __kernel_sock_timeval tv;
432	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
433	long val;
434
435	if (err)
436		return err;
437
438	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
439		return -EDOM;
440
441	if (tv.tv_sec < 0) {
442		static int warned __read_mostly;
443
444		WRITE_ONCE(*timeo_p, 0);
445		if (warned < 10 && net_ratelimit()) {
446			warned++;
447			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
448				__func__, current->comm, task_pid_nr(current));
449		}
450		return 0;
451	}
452	val = MAX_SCHEDULE_TIMEOUT;
453	if ((tv.tv_sec || tv.tv_usec) &&
454	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
455		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
456						    USEC_PER_SEC / HZ);
457	WRITE_ONCE(*timeo_p, val);
458	return 0;
459}
460
461static bool sock_needs_netstamp(const struct sock *sk)
462{
463	switch (sk->sk_family) {
464	case AF_UNSPEC:
465	case AF_UNIX:
466		return false;
467	default:
468		return true;
469	}
470}
471
472static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
473{
474	if (sk->sk_flags & flags) {
475		sk->sk_flags &= ~flags;
476		if (sock_needs_netstamp(sk) &&
477		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
478			net_disable_timestamp();
479	}
480}
481
482
483int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
484{
485	unsigned long flags;
486	struct sk_buff_head *list = &sk->sk_receive_queue;
487
488	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
489		atomic_inc(&sk->sk_drops);
490		trace_sock_rcvqueue_full(sk, skb);
491		return -ENOMEM;
492	}
493
494	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
495		atomic_inc(&sk->sk_drops);
496		return -ENOBUFS;
497	}
498
499	skb->dev = NULL;
500	skb_set_owner_r(skb, sk);
501
502	/* we escape from rcu protected region, make sure we dont leak
503	 * a norefcounted dst
504	 */
505	skb_dst_force(skb);
506
507	spin_lock_irqsave(&list->lock, flags);
508	sock_skb_set_dropcount(sk, skb);
509	__skb_queue_tail(list, skb);
510	spin_unlock_irqrestore(&list->lock, flags);
511
512	if (!sock_flag(sk, SOCK_DEAD))
513		sk->sk_data_ready(sk);
514	return 0;
515}
516EXPORT_SYMBOL(__sock_queue_rcv_skb);
517
518int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
519			      enum skb_drop_reason *reason)
520{
521	enum skb_drop_reason drop_reason;
522	int err;
523
524	err = sk_filter(sk, skb);
525	if (err) {
526		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
527		goto out;
528	}
529	err = __sock_queue_rcv_skb(sk, skb);
530	switch (err) {
531	case -ENOMEM:
532		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
533		break;
534	case -ENOBUFS:
535		drop_reason = SKB_DROP_REASON_PROTO_MEM;
536		break;
537	default:
538		drop_reason = SKB_NOT_DROPPED_YET;
539		break;
540	}
541out:
542	if (reason)
543		*reason = drop_reason;
544	return err;
545}
546EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
547
548int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
549		     const int nested, unsigned int trim_cap, bool refcounted)
550{
551	int rc = NET_RX_SUCCESS;
552
553	if (sk_filter_trim_cap(sk, skb, trim_cap))
554		goto discard_and_relse;
555
556	skb->dev = NULL;
557
558	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
559		atomic_inc(&sk->sk_drops);
560		goto discard_and_relse;
561	}
562	if (nested)
563		bh_lock_sock_nested(sk);
564	else
565		bh_lock_sock(sk);
566	if (!sock_owned_by_user(sk)) {
567		/*
568		 * trylock + unlock semantics:
569		 */
570		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
571
572		rc = sk_backlog_rcv(sk, skb);
573
574		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
575	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
576		bh_unlock_sock(sk);
577		atomic_inc(&sk->sk_drops);
578		goto discard_and_relse;
579	}
580
581	bh_unlock_sock(sk);
582out:
583	if (refcounted)
584		sock_put(sk);
585	return rc;
586discard_and_relse:
587	kfree_skb(skb);
588	goto out;
589}
590EXPORT_SYMBOL(__sk_receive_skb);
591
592INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
593							  u32));
594INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
595							   u32));
596struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
597{
598	struct dst_entry *dst = __sk_dst_get(sk);
599
600	if (dst && dst->obsolete &&
601	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
602			       dst, cookie) == NULL) {
603		sk_tx_queue_clear(sk);
604		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
605		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
606		dst_release(dst);
607		return NULL;
608	}
609
610	return dst;
611}
612EXPORT_SYMBOL(__sk_dst_check);
613
614struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
615{
616	struct dst_entry *dst = sk_dst_get(sk);
617
618	if (dst && dst->obsolete &&
619	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
620			       dst, cookie) == NULL) {
621		sk_dst_reset(sk);
622		dst_release(dst);
623		return NULL;
624	}
625
626	return dst;
627}
628EXPORT_SYMBOL(sk_dst_check);
629
630static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
631{
632	int ret = -ENOPROTOOPT;
633#ifdef CONFIG_NETDEVICES
634	struct net *net = sock_net(sk);
635
636	/* Sorry... */
637	ret = -EPERM;
638	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
639		goto out;
640
641	ret = -EINVAL;
642	if (ifindex < 0)
643		goto out;
644
645	/* Paired with all READ_ONCE() done locklessly. */
646	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
647
648	if (sk->sk_prot->rehash)
649		sk->sk_prot->rehash(sk);
650	sk_dst_reset(sk);
651
652	ret = 0;
653
654out:
655#endif
656
657	return ret;
658}
659
660int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
661{
662	int ret;
663
664	if (lock_sk)
665		lock_sock(sk);
666	ret = sock_bindtoindex_locked(sk, ifindex);
667	if (lock_sk)
668		release_sock(sk);
669
670	return ret;
671}
672EXPORT_SYMBOL(sock_bindtoindex);
673
674static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
675{
676	int ret = -ENOPROTOOPT;
677#ifdef CONFIG_NETDEVICES
678	struct net *net = sock_net(sk);
679	char devname[IFNAMSIZ];
680	int index;
681
682	ret = -EINVAL;
683	if (optlen < 0)
684		goto out;
685
686	/* Bind this socket to a particular device like "eth0",
687	 * as specified in the passed interface name. If the
688	 * name is "" or the option length is zero the socket
689	 * is not bound.
690	 */
691	if (optlen > IFNAMSIZ - 1)
692		optlen = IFNAMSIZ - 1;
693	memset(devname, 0, sizeof(devname));
694
695	ret = -EFAULT;
696	if (copy_from_sockptr(devname, optval, optlen))
697		goto out;
698
699	index = 0;
700	if (devname[0] != '\0') {
701		struct net_device *dev;
702
703		rcu_read_lock();
704		dev = dev_get_by_name_rcu(net, devname);
705		if (dev)
706			index = dev->ifindex;
707		rcu_read_unlock();
708		ret = -ENODEV;
709		if (!dev)
710			goto out;
711	}
712
713	sockopt_lock_sock(sk);
714	ret = sock_bindtoindex_locked(sk, index);
715	sockopt_release_sock(sk);
716out:
717#endif
718
719	return ret;
720}
721
722static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
723				sockptr_t optlen, int len)
724{
725	int ret = -ENOPROTOOPT;
726#ifdef CONFIG_NETDEVICES
727	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
728	struct net *net = sock_net(sk);
729	char devname[IFNAMSIZ];
730
731	if (bound_dev_if == 0) {
732		len = 0;
733		goto zero;
734	}
735
736	ret = -EINVAL;
737	if (len < IFNAMSIZ)
738		goto out;
739
740	ret = netdev_get_name(net, devname, bound_dev_if);
741	if (ret)
742		goto out;
743
744	len = strlen(devname) + 1;
745
746	ret = -EFAULT;
747	if (copy_to_sockptr(optval, devname, len))
748		goto out;
749
750zero:
751	ret = -EFAULT;
752	if (copy_to_sockptr(optlen, &len, sizeof(int)))
753		goto out;
754
755	ret = 0;
756
757out:
758#endif
759
760	return ret;
761}
762
763bool sk_mc_loop(struct sock *sk)
764{
765	if (dev_recursion_level())
766		return false;
767	if (!sk)
768		return true;
769	/* IPV6_ADDRFORM can change sk->sk_family under us. */
770	switch (READ_ONCE(sk->sk_family)) {
771	case AF_INET:
772		return inet_test_bit(MC_LOOP, sk);
773#if IS_ENABLED(CONFIG_IPV6)
774	case AF_INET6:
775		return inet6_sk(sk)->mc_loop;
776#endif
777	}
778	WARN_ON_ONCE(1);
779	return true;
780}
781EXPORT_SYMBOL(sk_mc_loop);
782
783void sock_set_reuseaddr(struct sock *sk)
784{
785	lock_sock(sk);
786	sk->sk_reuse = SK_CAN_REUSE;
787	release_sock(sk);
788}
789EXPORT_SYMBOL(sock_set_reuseaddr);
790
791void sock_set_reuseport(struct sock *sk)
792{
793	lock_sock(sk);
794	sk->sk_reuseport = true;
795	release_sock(sk);
796}
797EXPORT_SYMBOL(sock_set_reuseport);
798
799void sock_no_linger(struct sock *sk)
800{
801	lock_sock(sk);
802	WRITE_ONCE(sk->sk_lingertime, 0);
803	sock_set_flag(sk, SOCK_LINGER);
804	release_sock(sk);
805}
806EXPORT_SYMBOL(sock_no_linger);
807
808void sock_set_priority(struct sock *sk, u32 priority)
809{
810	lock_sock(sk);
811	WRITE_ONCE(sk->sk_priority, priority);
812	release_sock(sk);
813}
814EXPORT_SYMBOL(sock_set_priority);
815
816void sock_set_sndtimeo(struct sock *sk, s64 secs)
817{
818	lock_sock(sk);
819	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
820		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
821	else
822		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
823	release_sock(sk);
824}
825EXPORT_SYMBOL(sock_set_sndtimeo);
826
827static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
828{
829	if (val)  {
830		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
831		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
832		sock_set_flag(sk, SOCK_RCVTSTAMP);
833		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
834	} else {
835		sock_reset_flag(sk, SOCK_RCVTSTAMP);
836		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
837	}
838}
839
840void sock_enable_timestamps(struct sock *sk)
841{
842	lock_sock(sk);
843	__sock_set_timestamps(sk, true, false, true);
844	release_sock(sk);
845}
846EXPORT_SYMBOL(sock_enable_timestamps);
847
848void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
849{
850	switch (optname) {
851	case SO_TIMESTAMP_OLD:
852		__sock_set_timestamps(sk, valbool, false, false);
853		break;
854	case SO_TIMESTAMP_NEW:
855		__sock_set_timestamps(sk, valbool, true, false);
856		break;
857	case SO_TIMESTAMPNS_OLD:
858		__sock_set_timestamps(sk, valbool, false, true);
859		break;
860	case SO_TIMESTAMPNS_NEW:
861		__sock_set_timestamps(sk, valbool, true, true);
862		break;
863	}
864}
865
866static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
867{
868	struct net *net = sock_net(sk);
869	struct net_device *dev = NULL;
870	bool match = false;
871	int *vclock_index;
872	int i, num;
873
874	if (sk->sk_bound_dev_if)
875		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
876
877	if (!dev) {
878		pr_err("%s: sock not bind to device\n", __func__);
879		return -EOPNOTSUPP;
880	}
881
882	num = ethtool_get_phc_vclocks(dev, &vclock_index);
883	dev_put(dev);
884
885	for (i = 0; i < num; i++) {
886		if (*(vclock_index + i) == phc_index) {
887			match = true;
888			break;
889		}
890	}
891
892	if (num > 0)
893		kfree(vclock_index);
894
895	if (!match)
896		return -EINVAL;
897
898	WRITE_ONCE(sk->sk_bind_phc, phc_index);
899
900	return 0;
901}
902
903int sock_set_timestamping(struct sock *sk, int optname,
904			  struct so_timestamping timestamping)
905{
906	int val = timestamping.flags;
907	int ret;
908
909	if (val & ~SOF_TIMESTAMPING_MASK)
910		return -EINVAL;
911
912	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
913	    !(val & SOF_TIMESTAMPING_OPT_ID))
914		return -EINVAL;
915
916	if (val & SOF_TIMESTAMPING_OPT_ID &&
917	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
918		if (sk_is_tcp(sk)) {
919			if ((1 << sk->sk_state) &
920			    (TCPF_CLOSE | TCPF_LISTEN))
921				return -EINVAL;
922			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
923				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
924			else
925				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
926		} else {
927			atomic_set(&sk->sk_tskey, 0);
928		}
929	}
930
931	if (val & SOF_TIMESTAMPING_OPT_STATS &&
932	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
933		return -EINVAL;
934
935	if (val & SOF_TIMESTAMPING_BIND_PHC) {
936		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
937		if (ret)
938			return ret;
939	}
940
941	WRITE_ONCE(sk->sk_tsflags, val);
942	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
943
944	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
945		sock_enable_timestamp(sk,
946				      SOCK_TIMESTAMPING_RX_SOFTWARE);
947	else
948		sock_disable_timestamp(sk,
949				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
950	return 0;
951}
952
953void sock_set_keepalive(struct sock *sk)
954{
955	lock_sock(sk);
956	if (sk->sk_prot->keepalive)
957		sk->sk_prot->keepalive(sk, true);
958	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
959	release_sock(sk);
960}
961EXPORT_SYMBOL(sock_set_keepalive);
962
963static void __sock_set_rcvbuf(struct sock *sk, int val)
964{
965	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
966	 * as a negative value.
967	 */
968	val = min_t(int, val, INT_MAX / 2);
969	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
970
971	/* We double it on the way in to account for "struct sk_buff" etc.
972	 * overhead.   Applications assume that the SO_RCVBUF setting they make
973	 * will allow that much actual data to be received on that socket.
974	 *
975	 * Applications are unaware that "struct sk_buff" and other overheads
976	 * allocate from the receive buffer during socket buffer allocation.
977	 *
978	 * And after considering the possible alternatives, returning the value
979	 * we actually used in getsockopt is the most desirable behavior.
980	 */
981	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
982}
983
/**
 * sock_set_rcvbuf - set the receive buffer size of a socket
 * @sk: socket to update
 * @val: requested size, handled as described for __sock_set_rcvbuf()
 *
 * The update is done with the socket locked.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
991
992static void __sock_set_mark(struct sock *sk, u32 val)
993{
994	if (val != sk->sk_mark) {
995		WRITE_ONCE(sk->sk_mark, val);
996		sk_dst_reset(sk);
997	}
998}
999
1000void sock_set_mark(struct sock *sk, u32 val)
1001{
1002	lock_sock(sk);
1003	__sock_set_mark(sk, val);
1004	release_sock(sk);
1005}
1006EXPORT_SYMBOL(sock_set_mark);
1007
1008static void sock_release_reserved_memory(struct sock *sk, int bytes)
1009{
1010	/* Round down bytes to multiple of pages */
1011	bytes = round_down(bytes, PAGE_SIZE);
1012
1013	WARN_ON(bytes > sk->sk_reserved_mem);
1014	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1015	sk_mem_reclaim(sk);
1016}
1017
1018static int sock_reserve_memory(struct sock *sk, int bytes)
1019{
1020	long allocated;
1021	bool charged;
1022	int pages;
1023
1024	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1025		return -EOPNOTSUPP;
1026
1027	if (!bytes)
1028		return 0;
1029
1030	pages = sk_mem_pages(bytes);
1031
1032	/* pre-charge to memcg */
1033	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1034					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1035	if (!charged)
1036		return -ENOMEM;
1037
1038	/* pre-charge to forward_alloc */
1039	sk_memory_allocated_add(sk, pages);
1040	allocated = sk_memory_allocated(sk);
1041	/* If the system goes into memory pressure with this
1042	 * precharge, give up and return error.
1043	 */
1044	if (allocated > sk_prot_mem_limits(sk, 1)) {
1045		sk_memory_allocated_sub(sk, pages);
1046		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1047		return -ENOMEM;
1048	}
1049	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1050
1051	WRITE_ONCE(sk->sk_reserved_mem,
1052		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1053
1054	return 0;
1055}
1056
/**
 * sockopt_lock_sock - lock a socket for a socket option operation
 * @sk: socket to lock
 *
 * Does nothing when current->bpf_ctx is set: in that case setsockopt() is
 * being called from a bpf prog, and bpf has ensured the sk lock has been
 * acquired before calling it.  Otherwise the socket is locked with
 * lock_sock().
 */
void sockopt_lock_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1069
/**
 * sockopt_release_sock - counterpart of sockopt_lock_sock()
 * @sk: socket to release
 *
 * Does nothing when has_current_bpf_ctx() is true, mirroring
 * sockopt_lock_sock(); otherwise calls release_sock().
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1078
1079bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1080{
1081	return has_current_bpf_ctx() || ns_capable(ns, cap);
1082}
1083EXPORT_SYMBOL(sockopt_ns_capable);
1084
1085bool sockopt_capable(int cap)
1086{
1087	return has_current_bpf_ctx() || capable(cap);
1088}
1089EXPORT_SYMBOL(sockopt_capable);
1090
1091/*
1092 *	This is meant for all protocols to use and covers goings on
1093 *	at the socket level. Everything here is generic.
1094 */
1095
/*
 * sk_setsockopt - set a socket-level (SO_*) option on @sk
 * @sk: socket the option is applied to
 * @level: not referenced in the body of this function
 * @optname: SO_* option selector
 * @optval: option value, read through copy_from_sockptr()
 * @optlen: number of bytes available at @optval
 *
 * SO_BINDTODEVICE is handed straight to sock_setbindtodevice(). Every other
 * option needs at least sizeof(int) bytes; the first int is copied into
 * 'val' (with 'valbool' as its 0/1 form) and the option is then processed
 * between sockopt_lock_sock() and sockopt_release_sock(). Options that take
 * a larger argument copy it again from @optval inside their case.
 *
 * Returns 0 on success or a negative errno; unknown options yield
 * -ENOPROTOOPT.
 */
1096int sk_setsockopt(struct sock *sk, int level, int optname,
1097		  sockptr_t optval, unsigned int optlen)
1098{
1099	struct so_timestamping timestamping;
1100	struct socket *sock = sk->sk_socket;
1101	struct sock_txtime sk_txtime;
1102	int val;
1103	int valbool;
1104	struct linger ling;
1105	int ret = 0;
1106
1107	/*
1108	 *	SO_BINDTODEVICE is dispatched before the generic int argument is read
1109	 */
1110
1111	if (optname == SO_BINDTODEVICE)
1112		return sock_setbindtodevice(sk, optval, optlen);
1113
1114	if (optlen < sizeof(int))
1115		return -EINVAL;
1116
1117	if (copy_from_sockptr(&val, optval, sizeof(val)))
1118		return -EFAULT;
1119
	/* 0/1 form of the argument, used by the on/off style options. */
1120	valbool = val ? 1 : 0;
1121
1122	sockopt_lock_sock(sk);
1123
1124	switch (optname) {
1125	case SO_DEBUG:
		/* Only turning debugging on is gated on CAP_NET_ADMIN. */
1126		if (val && !sockopt_capable(CAP_NET_ADMIN))
1127			ret = -EACCES;
1128		else
1129			sock_valbool_flag(sk, SOCK_DBG, valbool);
1130		break;
1131	case SO_REUSEADDR:
1132		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1133		break;
1134	case SO_REUSEPORT:
1135		sk->sk_reuseport = valbool;
1136		break;
	/* These can be read via sk_getsockopt() but are rejected here. */
1137	case SO_TYPE:
1138	case SO_PROTOCOL:
1139	case SO_DOMAIN:
1140	case SO_ERROR:
1141		ret = -ENOPROTOOPT;
1142		break;
1143	case SO_DONTROUTE:
1144		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1145		sk_dst_reset(sk);
1146		break;
1147	case SO_BROADCAST:
1148		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1149		break;
1150	case SO_SNDBUF:
1151		/* Don't error on this BSD doesn't and if you think
1152		 * about it this is right. Otherwise apps have to
1153		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1154		 * are treated in BSD as hints
1155		 */
1156		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
	/* Shared tail: SO_SNDBUFFORCE jumps here, skipping the sysctl clamp. */
1157set_sndbuf:
1158		/* Ensure val * 2 fits into an int, to prevent max_t()
1159		 * from treating it as a negative value.
1160		 */
1161		val = min_t(int, val, INT_MAX / 2);
1162		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1163		WRITE_ONCE(sk->sk_sndbuf,
1164			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1165		/* Wake up sending tasks if we upped the value. */
1166		sk->sk_write_space(sk);
1167		break;
1168
1169	case SO_SNDBUFFORCE:
1170		if (!sockopt_capable(CAP_NET_ADMIN)) {
1171			ret = -EPERM;
1172			break;
1173		}
1174
1175		/* No negative values (to prevent underflow, as val will be
1176		 * multiplied by 2).
1177		 */
1178		if (val < 0)
1179			val = 0;
1180		goto set_sndbuf;
1181
1182	case SO_RCVBUF:
1183		/* Don't error on this BSD doesn't and if you think
1184		 * about it this is right. Otherwise apps have to
1185		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1186		 * are treated in BSD as hints
1187		 */
1188		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1189		break;
1190
1191	case SO_RCVBUFFORCE:
1192		if (!sockopt_capable(CAP_NET_ADMIN)) {
1193			ret = -EPERM;
1194			break;
1195		}
1196
1197		/* No negative values (to prevent underflow, as val will be
1198		 * multiplied by 2).
1199		 */
1200		__sock_set_rcvbuf(sk, max(val, 0));
1201		break;
1202
1203	case SO_KEEPALIVE:
		/* Let the protocol react first, if it provides a hook. */
1204		if (sk->sk_prot->keepalive)
1205			sk->sk_prot->keepalive(sk, valbool);
1206		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1207		break;
1208
1209	case SO_OOBINLINE:
1210		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1211		break;
1212
1213	case SO_NO_CHECK:
1214		sk->sk_no_check_tx = valbool;
1215		break;
1216
1217	case SO_PRIORITY:
		/* Values 0..6 are open to everyone; anything else needs
		 * CAP_NET_RAW or CAP_NET_ADMIN in the socket's netns user_ns.
		 */
1218		if ((val >= 0 && val <= 6) ||
1219		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1220		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1221			WRITE_ONCE(sk->sk_priority, val);
1222		else
1223			ret = -EPERM;
1224		break;
1225
1226	case SO_LINGER:
		/* Needs a full struct linger, so the argument is copied
		 * again; 'val' only holds its first int.
		 */
1227		if (optlen < sizeof(ling)) {
1228			ret = -EINVAL;	/* 1003.1g */
1229			break;
1230		}
1231		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1232			ret = -EFAULT;
1233			break;
1234		}
1235		if (!ling.l_onoff) {
1236			sock_reset_flag(sk, SOCK_LINGER);
1237		} else {
1238			unsigned long t_sec = ling.l_linger;
1239
			/* Cap the timeout so that t_sec * HZ is only
			 * computed below MAX_SCHEDULE_TIMEOUT.
			 */
1240			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1241				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1242			else
1243				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1244			sock_set_flag(sk, SOCK_LINGER);
1245		}
1246		break;
1247
1248	case SO_BSDCOMPAT:
		/* Accepted, but nothing is changed. */
1249		break;
1250
1251	case SO_PASSCRED:
1252		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1253		break;
1254
1255	case SO_PASSPIDFD:
1256		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1257		break;
1258
1259	case SO_TIMESTAMP_OLD:
1260	case SO_TIMESTAMP_NEW:
1261	case SO_TIMESTAMPNS_OLD:
1262	case SO_TIMESTAMPNS_NEW:
1263		sock_set_timestamp(sk, optname, valbool);
1264		break;
1265
1266	case SO_TIMESTAMPING_NEW:
1267	case SO_TIMESTAMPING_OLD:
		/* Either a full struct so_timestamping is supplied, or the
		 * int argument is taken as the flags with all other fields
		 * zeroed.
		 */
1268		if (optlen == sizeof(timestamping)) {
1269			if (copy_from_sockptr(&timestamping, optval,
1270					      sizeof(timestamping))) {
1271				ret = -EFAULT;
1272				break;
1273			}
1274		} else {
1275			memset(&timestamping, 0, sizeof(timestamping));
1276			timestamping.flags = val;
1277		}
1278		ret = sock_set_timestamping(sk, optname, timestamping);
1279		break;
1280
1281	case SO_RCVLOWAT:
1282		{
1283		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1284
		/* Negative requests are turned into INT_MAX. */
1285		if (val < 0)
1286			val = INT_MAX;
		/* Use the socket-ops hook when a struct socket is attached
		 * and provides one; otherwise store the value directly,
		 * with 0 stored as 1.
		 */
1287		if (sock)
1288			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1289		if (set_rcvlowat)
1290			ret = set_rcvlowat(sk, val);
1291		else
1292			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1293		break;
1294		}
1295	case SO_RCVTIMEO_OLD:
1296	case SO_RCVTIMEO_NEW:
1297		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1298				       optlen, optname == SO_RCVTIMEO_OLD);
1299		break;
1300
1301	case SO_SNDTIMEO_OLD:
1302	case SO_SNDTIMEO_NEW:
1303		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1304				       optlen, optname == SO_SNDTIMEO_OLD);
1305		break;
1306
1307	case SO_ATTACH_FILTER: {
1308		struct sock_fprog fprog;
1309
1310		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1311		if (!ret)
1312			ret = sk_attach_filter(&fprog, sk);
1313		break;
1314	}
1315	case SO_ATTACH_BPF:
		/* The argument must be exactly a u32, passed on as 'ufd'. */
1316		ret = -EINVAL;
1317		if (optlen == sizeof(u32)) {
1318			u32 ufd;
1319
1320			ret = -EFAULT;
1321			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1322				break;
1323
1324			ret = sk_attach_bpf(ufd, sk);
1325		}
1326		break;
1327
1328	case SO_ATTACH_REUSEPORT_CBPF: {
1329		struct sock_fprog fprog;
1330
1331		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1332		if (!ret)
1333			ret = sk_reuseport_attach_filter(&fprog, sk);
1334		break;
1335	}
1336	case SO_ATTACH_REUSEPORT_EBPF:
		/* Same argument handling as SO_ATTACH_BPF above. */
1337		ret = -EINVAL;
1338		if (optlen == sizeof(u32)) {
1339			u32 ufd;
1340
1341			ret = -EFAULT;
1342			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1343				break;
1344
1345			ret = sk_reuseport_attach_bpf(ufd, sk);
1346		}
1347		break;
1348
1349	case SO_DETACH_REUSEPORT_BPF:
1350		ret = reuseport_detach_prog(sk);
1351		break;
1352
1353	case SO_DETACH_FILTER:
1354		ret = sk_detach_filter(sk);
1355		break;
1356
1357	case SO_LOCK_FILTER:
		/* Once SOCK_FILTER_LOCKED is set it cannot be cleared again. */
1358		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1359			ret = -EPERM;
1360		else
1361			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1362		break;
1363
1364	case SO_PASSSEC:
1365		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1366		break;
1367	case SO_MARK:
		/* Requires CAP_NET_RAW or CAP_NET_ADMIN in the netns user_ns. */
1368		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1369		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1370			ret = -EPERM;
1371			break;
1372		}
1373
1374		__sock_set_mark(sk, val);
1375		break;
1376	case SO_RCVMARK:
1377		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1378		break;
1379
1380	case SO_RXQ_OVFL:
1381		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1382		break;
1383
1384	case SO_WIFI_STATUS:
1385		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1386		break;
1387
1388	case SO_PEEK_OFF:
1389		{
1390		int (*set_peek_off)(struct sock *sk, int val);
1391
		/* Only supported when the socket ops provide the hook. */
1392		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1393		if (set_peek_off)
1394			ret = set_peek_off(sk, val);
1395		else
1396			ret = -EOPNOTSUPP;
1397		break;
1398		}
1399
1400	case SO_NOFCS:
1401		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1402		break;
1403
1404	case SO_SELECT_ERR_QUEUE:
1405		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1406		break;
1407
1408#ifdef CONFIG_NET_RX_BUSY_POLL
1409	case SO_BUSY_POLL:
1410		if (val < 0)
1411			ret = -EINVAL;
1412		else
1413			WRITE_ONCE(sk->sk_ll_usec, val);
1414		break;
1415	case SO_PREFER_BUSY_POLL:
		/* Enabling needs CAP_NET_ADMIN; disabling does not. */
1416		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1417			ret = -EPERM;
1418		else
1419			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1420		break;
1421	case SO_BUSY_POLL_BUDGET:
		/* Raising the budget above its current value needs
		 * CAP_NET_ADMIN; the accepted range is 0..U16_MAX.
		 */
1422		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1423			ret = -EPERM;
1424		} else {
1425			if (val < 0 || val > U16_MAX)
1426				ret = -EINVAL;
1427			else
1428				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1429		}
1430		break;
1431#endif
1432
1433	case SO_MAX_PACING_RATE:
1434		{
		/* An all-ones int argument is widened to an all-ones
		 * unsigned long; any other value is taken as unsigned int.
		 */
1435		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1436
		/* Where unsigned long is wider than int and the caller
		 * supplied enough bytes, read the full unsigned long instead.
		 */
1437		if (sizeof(ulval) != sizeof(val) &&
1438		    optlen >= sizeof(ulval) &&
1439		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1440			ret = -EFAULT;
1441			break;
1442		}
1443		if (ulval != ~0UL)
1444			cmpxchg(&sk->sk_pacing_status,
1445				SK_PACING_NONE,
1446				SK_PACING_NEEDED);
1447		/* Pairs with READ_ONCE() from sk_getsockopt() */
1448		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
		/* The current pacing rate is lowered to the new maximum. */
1449		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1450		break;
1451		}
1452	case SO_INCOMING_CPU:
1453		reuseport_update_incoming_cpu(sk, val);
1454		break;
1455
1456	case SO_CNX_ADVICE:
		/* Only the value 1 has an effect; other values succeed
		 * without doing anything.
		 */
1457		if (val == 1)
1458			dst_negative_advice(sk);
1459		break;
1460
1461	case SO_ZEROCOPY:
		/* Allowed for PF_INET/PF_INET6 sockets that are TCP or
		 * SOCK_DGRAM/IPPROTO_UDP, and for PF_RDS; the value must be
		 * 0 or 1.
		 */
1462		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1463			if (!(sk_is_tcp(sk) ||
1464			      (sk->sk_type == SOCK_DGRAM &&
1465			       sk->sk_protocol == IPPROTO_UDP)))
1466				ret = -EOPNOTSUPP;
1467		} else if (sk->sk_family != PF_RDS) {
1468			ret = -EOPNOTSUPP;
1469		}
1470		if (!ret) {
1471			if (val < 0 || val > 1)
1472				ret = -EINVAL;
1473			else
1474				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1475		}
1476		break;
1477
1478	case SO_TXTIME:
		/* Requires exactly a struct sock_txtime with only known flags. */
1479		if (optlen != sizeof(struct sock_txtime)) {
1480			ret = -EINVAL;
1481			break;
1482		} else if (copy_from_sockptr(&sk_txtime, optval,
1483			   sizeof(struct sock_txtime))) {
1484			ret = -EFAULT;
1485			break;
1486		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1487			ret = -EINVAL;
1488			break;
1489		}
1490		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1491		 * scheduler has enough safe guards.
1492		 */
1493		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1494		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1495			ret = -EPERM;
1496			break;
1497		}
1498		sock_valbool_flag(sk, SOCK_TXTIME, true);
1499		sk->sk_clockid = sk_txtime.clockid;
1500		sk->sk_txtime_deadline_mode =
1501			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1502		sk->sk_txtime_report_errors =
1503			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1504		break;
1505
1506	case SO_BINDTOIFINDEX:
1507		ret = sock_bindtoindex_locked(sk, val);
1508		break;
1509
1510	case SO_BUF_LOCK:
1511		if (val & ~SOCK_BUF_LOCK_MASK) {
1512			ret = -EINVAL;
1513			break;
1514		}
		/* Replace only the SOCK_BUF_LOCK_MASK bits of sk_userlocks. */
1515		sk->sk_userlocks = val | (sk->sk_userlocks &
1516					  ~SOCK_BUF_LOCK_MASK);
1517		break;
1518
1519	case SO_RESERVE_MEM:
1520	{
1521		int delta;
1522
1523		if (val < 0) {
1524			ret = -EINVAL;
1525			break;
1526		}
1527
		/* val is the new absolute reservation: release or reserve
		 * the difference to the current sk_reserved_mem.
		 */
1528		delta = val - sk->sk_reserved_mem;
1529		if (delta < 0)
1530			sock_release_reserved_memory(sk, -delta);
1531		else
1532			ret = sock_reserve_memory(sk, delta);
1533		break;
1534	}
1535
1536	case SO_TXREHASH:
1537		if (val < -1 || val > 1) {
1538			ret = -EINVAL;
1539			break;
1540		}
		/* The "default" selector is replaced by the netns sysctl. */
1541		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1542			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1543		/* Paired with READ_ONCE() in tcp_rtx_synack()
1544		 * and sk_getsockopt().
1545		 */
1546		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1547		break;
1548
1549	default:
1550		ret = -ENOPROTOOPT;
1551		break;
1552	}
1553	sockopt_release_sock(sk);
1554	return ret;
1555}
1556
1557int sock_setsockopt(struct socket *sock, int level, int optname,
1558		    sockptr_t optval, unsigned int optlen)
1559{
1560	return sk_setsockopt(sock->sk, level, optname,
1561			     optval, optlen);
1562}
1563EXPORT_SYMBOL(sock_setsockopt);
1564
1565static const struct cred *sk_get_peer_cred(struct sock *sk)
1566{
1567	const struct cred *cred;
1568
1569	spin_lock(&sk->sk_peer_lock);
1570	cred = get_cred(sk->sk_peer_cred);
1571	spin_unlock(&sk->sk_peer_lock);
1572
1573	return cred;
1574}
1575
1576static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1577			  struct ucred *ucred)
1578{
1579	ucred->pid = pid_vnr(pid);
1580	ucred->uid = ucred->gid = -1;
1581	if (cred) {
1582		struct user_namespace *current_ns = current_user_ns();
1583
1584		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1585		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1586	}
1587}
1588
1589static int groups_to_user(sockptr_t dst, const struct group_info *src)
1590{
1591	struct user_namespace *user_ns = current_user_ns();
1592	int i;
1593
1594	for (i = 0; i < src->ngroups; i++) {
1595		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1596
1597		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1598			return -EFAULT;
1599	}
1600
1601	return 0;
1602}
1603
/*
 * sk_getsockopt - read a socket-level (SO_*) option of @sk
 * @sk: socket being queried
 * @level: not referenced in the body of this function
 * @optname: SO_* option selector
 * @optval: destination buffer, written through copy_to_sockptr()
 * @optlen: in: size of the buffer at @optval (an int);
 *	    out: number of bytes reported for the option
 *
 * The caller's length is fetched first; a negative length is rejected with
 * -EINVAL. Most cases store their result in the zero-initialised union 'v'
 * and set 'lv' to the result size (sizeof(int) unless changed). After the
 * switch, min(len, lv) bytes of 'v' are copied out and that length is
 * written back through @optlen. Cases that produce their own output either
 * jump to 'lenout' (only the length is written back) or return directly.
 *
 * Returns 0 on success or a negative errno; unknown options yield
 * -ENOPROTOOPT.
 */
1604int sk_getsockopt(struct sock *sk, int level, int optname,
1605		  sockptr_t optval, sockptr_t optlen)
1606{
1607	struct socket *sock = sk->sk_socket;
1608
	/* Scratch storage for every fixed-size result type. */
1609	union {
1610		int val;
1611		u64 val64;
1612		unsigned long ulval;
1613		struct linger ling;
1614		struct old_timeval32 tm32;
1615		struct __kernel_old_timeval tm;
1616		struct  __kernel_sock_timeval stm;
1617		struct sock_txtime txtime;
1618		struct so_timestamping timestamping;
1619	} v;
1620
1621	int lv = sizeof(int);
1622	int len;
1623
1624	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1625		return -EFAULT;
1626	if (len < 0)
1627		return -EINVAL;
1628
1629	memset(&v, 0, sizeof(v));
1630
1631	switch (optname) {
1632	case SO_DEBUG:
1633		v.val = sock_flag(sk, SOCK_DBG);
1634		break;
1635
1636	case SO_DONTROUTE:
1637		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1638		break;
1639
1640	case SO_BROADCAST:
1641		v.val = sock_flag(sk, SOCK_BROADCAST);
1642		break;
1643
1644	case SO_SNDBUF:
1645		v.val = READ_ONCE(sk->sk_sndbuf);
1646		break;
1647
1648	case SO_RCVBUF:
1649		v.val = READ_ONCE(sk->sk_rcvbuf);
1650		break;
1651
1652	case SO_REUSEADDR:
1653		v.val = sk->sk_reuse;
1654		break;
1655
1656	case SO_REUSEPORT:
1657		v.val = sk->sk_reuseport;
1658		break;
1659
1660	case SO_KEEPALIVE:
1661		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1662		break;
1663
1664	case SO_TYPE:
1665		v.val = sk->sk_type;
1666		break;
1667
1668	case SO_PROTOCOL:
1669		v.val = sk->sk_protocol;
1670		break;
1671
1672	case SO_DOMAIN:
1673		v.val = sk->sk_family;
1674		break;
1675
1676	case SO_ERROR:
		/* Report the negated sock_error() result; if that is 0,
		 * fetch sk_err_soft instead and reset it to 0.
		 */
1677		v.val = -sock_error(sk);
1678		if (v.val == 0)
1679			v.val = xchg(&sk->sk_err_soft, 0);
1680		break;
1681
1682	case SO_OOBINLINE:
1683		v.val = sock_flag(sk, SOCK_URGINLINE);
1684		break;
1685
1686	case SO_NO_CHECK:
1687		v.val = sk->sk_no_check_tx;
1688		break;
1689
1690	case SO_PRIORITY:
1691		v.val = READ_ONCE(sk->sk_priority);
1692		break;
1693
1694	case SO_LINGER:
		/* sk_lingertime is divided by HZ to report seconds, the
		 * inverse of the scaling done in sk_setsockopt().
		 */
1695		lv		= sizeof(v.ling);
1696		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1697		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1698		break;
1699
1700	case SO_BSDCOMPAT:
		/* v was zeroed above, so 0 is reported. */
1701		break;
1702
1703	case SO_TIMESTAMP_OLD:
1704		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1705				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1706				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1707		break;
1708
1709	case SO_TIMESTAMPNS_OLD:
1710		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1711		break;
1712
1713	case SO_TIMESTAMP_NEW:
1714		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1715		break;
1716
1717	case SO_TIMESTAMPNS_NEW:
1718		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1719		break;
1720
1721	case SO_TIMESTAMPING_OLD:
1722	case SO_TIMESTAMPING_NEW:
1723		lv = sizeof(v.timestamping);
1724		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1725		 * returning the flags when they were set through the same option.
1726		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1727		 */
1728		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1729			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1730			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1731		}
1732		break;
1733
1734	case SO_RCVTIMEO_OLD:
1735	case SO_RCVTIMEO_NEW:
		/* sock_get_timeout() fills v and returns the result size. */
1736		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1737				      SO_RCVTIMEO_OLD == optname);
1738		break;
1739
1740	case SO_SNDTIMEO_OLD:
1741	case SO_SNDTIMEO_NEW:
1742		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1743				      SO_SNDTIMEO_OLD == optname);
1744		break;
1745
1746	case SO_RCVLOWAT:
1747		v.val = READ_ONCE(sk->sk_rcvlowat);
1748		break;
1749
1750	case SO_SNDLOWAT:
		/* Always reported as 1. */
1751		v.val = 1;
1752		break;
1753
1754	case SO_PASSCRED:
1755		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1756		break;
1757
1758	case SO_PASSPIDFD:
1759		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1760		break;
1761
1762	case SO_PEERCRED:
1763	{
1764		struct ucred peercred;
		/* Copy out at most sizeof(peercred) bytes. */
1765		if (len > sizeof(peercred))
1766			len = sizeof(peercred);
1767
		/* Peer pid and cred are read under sk_peer_lock. */
1768		spin_lock(&sk->sk_peer_lock);
1769		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1770		spin_unlock(&sk->sk_peer_lock);
1771
1772		if (copy_to_sockptr(optval, &peercred, len))
1773			return -EFAULT;
1774		goto lenout;
1775	}
1776
1777	case SO_PEERPIDFD:
1778	{
1779		struct pid *peer_pid;
1780		struct file *pidfd_file = NULL;
1781		int pidfd;
1782
1783		if (len > sizeof(pidfd))
1784			len = sizeof(pidfd);
1785
		/* Take a reference on the peer pid under sk_peer_lock. */
1786		spin_lock(&sk->sk_peer_lock);
1787		peer_pid = get_pid(sk->sk_peer_pid);
1788		spin_unlock(&sk->sk_peer_lock);
1789
1790		if (!peer_pid)
1791			return -ENODATA;
1792
1793		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1794		put_pid(peer_pid);
1795		if (pidfd < 0)
1796			return pidfd;
1797
		/* The fd is installed only after both copies succeeded;
		 * on failure the fd number and the file are released.
		 */
1798		if (copy_to_sockptr(optval, &pidfd, len) ||
1799		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1800			put_unused_fd(pidfd);
1801			fput(pidfd_file);
1802
1803			return -EFAULT;
1804		}
1805
1806		fd_install(pidfd, pidfd_file);
1807		return 0;
1808	}
1809
1810	case SO_PEERGROUPS:
1811	{
1812		const struct cred *cred;
1813		int ret, n;
1814
1815		cred = sk_get_peer_cred(sk);
1816		if (!cred)
1817			return -ENODATA;
1818
1819		n = cred->group_info->ngroups;
		/* Buffer too small: report the required size through optlen
		 * and fail with -ERANGE.
		 */
1820		if (len < n * sizeof(gid_t)) {
1821			len = n * sizeof(gid_t);
1822			put_cred(cred);
1823			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1824		}
1825		len = n * sizeof(gid_t);
1826
1827		ret = groups_to_user(optval, cred->group_info);
1828		put_cred(cred);
1829		if (ret)
1830			return ret;
1831		goto lenout;
1832	}
1833
1834	case SO_PEERNAME:
1835	{
1836		struct sockaddr_storage address;
1837
		/* NOTE(review): the literal 2 passed to getname() presumably
		 * selects the peer address - confirm against the ->getname()
		 * contract.
		 */
1838		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1839		if (lv < 0)
1840			return -ENOTCONN;
		/* A caller length larger than the returned address length
		 * is rejected.
		 */
1841		if (lv < len)
1842			return -EINVAL;
1843		if (copy_to_sockptr(optval, &address, len))
1844			return -EFAULT;
1845		goto lenout;
1846	}
1847
1848	/* Dubious BSD thing... Probably nobody even uses it, but
1849	 * the UNIX standard wants it for whatever reason... -DaveM
1850	 */
1851	case SO_ACCEPTCONN:
1852		v.val = sk->sk_state == TCP_LISTEN;
1853		break;
1854
1855	case SO_PASSSEC:
1856		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1857		break;
1858
1859	case SO_PEERSEC:
1860		return security_socket_getpeersec_stream(sock,
1861							 optval, optlen, len);
1862
1863	case SO_MARK:
1864		v.val = READ_ONCE(sk->sk_mark);
1865		break;
1866
1867	case SO_RCVMARK:
1868		v.val = sock_flag(sk, SOCK_RCVMARK);
1869		break;
1870
1871	case SO_RXQ_OVFL:
1872		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1873		break;
1874
1875	case SO_WIFI_STATUS:
1876		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1877		break;
1878
1879	case SO_PEEK_OFF:
		/* Readable only when the socket ops provide set_peek_off. */
1880		if (!READ_ONCE(sock->ops)->set_peek_off)
1881			return -EOPNOTSUPP;
1882
1883		v.val = READ_ONCE(sk->sk_peek_off);
1884		break;
1885	case SO_NOFCS:
1886		v.val = sock_flag(sk, SOCK_NOFCS);
1887		break;
1888
1889	case SO_BINDTODEVICE:
1890		return sock_getbindtodevice(sk, optval, optlen, len);
1891
1892	case SO_GET_FILTER:
		/* sk_get_filter() writes to optval itself; a non-negative
		 * return value is reported back as the length.
		 */
1893		len = sk_get_filter(sk, optval, len);
1894		if (len < 0)
1895			return len;
1896
1897		goto lenout;
1898
1899	case SO_LOCK_FILTER:
1900		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1901		break;
1902
1903	case SO_BPF_EXTENSIONS:
1904		v.val = bpf_tell_extensions();
1905		break;
1906
1907	case SO_SELECT_ERR_QUEUE:
1908		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1909		break;
1910
1911#ifdef CONFIG_NET_RX_BUSY_POLL
1912	case SO_BUSY_POLL:
1913		v.val = READ_ONCE(sk->sk_ll_usec);
1914		break;
1915	case SO_PREFER_BUSY_POLL:
1916		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1917		break;
1918#endif
1919
1920	case SO_MAX_PACING_RATE:
1921		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
		/* Report a full unsigned long when it is wider than int and
		 * the caller's buffer is large enough; otherwise report an
		 * int capped at ~0U.
		 */
1922		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1923			lv = sizeof(v.ulval);
1924			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1925		} else {
1926			/* 32bit version */
1927			v.val = min_t(unsigned long, ~0U,
1928				      READ_ONCE(sk->sk_max_pacing_rate));
1929		}
1930		break;
1931
1932	case SO_INCOMING_CPU:
1933		v.val = READ_ONCE(sk->sk_incoming_cpu);
1934		break;
1935
1936	case SO_MEMINFO:
1937	{
1938		u32 meminfo[SK_MEMINFO_VARS];
1939
1940		sk_get_meminfo(sk, meminfo);
1941
		/* Copy out at most the whole array. */
1942		len = min_t(unsigned int, len, sizeof(meminfo));
1943		if (copy_to_sockptr(optval, &meminfo, len))
1944			return -EFAULT;
1945
1946		goto lenout;
1947	}
1948
1949#ifdef CONFIG_NET_RX_BUSY_POLL
1950	case SO_INCOMING_NAPI_ID:
1951		v.val = READ_ONCE(sk->sk_napi_id);
1952
1953		/* aggregate non-NAPI IDs down to 0 */
1954		if (v.val < MIN_NAPI_ID)
1955			v.val = 0;
1956
1957		break;
1958#endif
1959
1960	case SO_COOKIE:
		/* The buffer must hold at least a u64. */
1961		lv = sizeof(u64);
1962		if (len < lv)
1963			return -EINVAL;
1964		v.val64 = sock_gen_cookie(sk);
1965		break;
1966
1967	case SO_ZEROCOPY:
1968		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1969		break;
1970
1971	case SO_TXTIME:
		/* Rebuild the flags word from the two per-socket bits. */
1972		lv = sizeof(v.txtime);
1973		v.txtime.clockid = sk->sk_clockid;
1974		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1975				  SOF_TXTIME_DEADLINE_MODE : 0;
1976		v.txtime.flags |= sk->sk_txtime_report_errors ?
1977				  SOF_TXTIME_REPORT_ERRORS : 0;
1978		break;
1979
1980	case SO_BINDTOIFINDEX:
1981		v.val = READ_ONCE(sk->sk_bound_dev_if);
1982		break;
1983
1984	case SO_NETNS_COOKIE:
		/* Unlike SO_COOKIE, the length must be exactly sizeof(u64). */
1985		lv = sizeof(u64);
1986		if (len != lv)
1987			return -EINVAL;
1988		v.val64 = sock_net(sk)->net_cookie;
1989		break;
1990
1991	case SO_BUF_LOCK:
1992		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1993		break;
1994
1995	case SO_RESERVE_MEM:
1996		v.val = READ_ONCE(sk->sk_reserved_mem);
1997		break;
1998
1999	case SO_TXREHASH:
2000		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2001		v.val = READ_ONCE(sk->sk_txrehash);
2002		break;
2003
2004	default:
2005		/* We implement the SO_SNDLOWAT etc to not be settable
2006		 * (1003.1g 7).
2007		 */
2008		return -ENOPROTOOPT;
2009	}
2010
	/* Common tail: never copy more than the option's own size. */
2011	if (len > lv)
2012		len = lv;
2013	if (copy_to_sockptr(optval, &v, len))
2014		return -EFAULT;
	/* Report the number of bytes made available back through optlen. */
2015lenout:
2016	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2017		return -EFAULT;
2018	return 0;
2019}
2020
2021int sock_getsockopt(struct socket *sock, int level, int optname,
2022		    char __user *optval, int __user *optlen)
2023{
2024	return sk_getsockopt(sock->sk, level, optname,
2025			     USER_SOCKPTR(optval),
2026			     USER_SOCKPTR(optlen));
2027}
2028
2029/*
2030 * Initialize an sk_lock.
2031 *
2032 * (We also register the sk_lock with the lock validator.)
2033 */
2034static inline void sock_lock_init(struct sock *sk)
2035{
2036	if (sk->sk_kern_sock)
2037		sock_lock_init_class_and_name(
2038			sk,
2039			af_family_kern_slock_key_strings[sk->sk_family],
2040			af_family_kern_slock_keys + sk->sk_family,
2041			af_family_kern_key_strings[sk->sk_family],
2042			af_family_kern_keys + sk->sk_family);
2043	else
2044		sock_lock_init_class_and_name(
2045			sk,
2046			af_family_slock_key_strings[sk->sk_family],
2047			af_family_slock_keys + sk->sk_family,
2048			af_family_key_strings[sk->sk_family],
2049			af_family_keys + sk->sk_family);
2050}
2051
2052/*
2053 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2054 * even temporarly, because of RCU lookups. sk_node should also be left as is.
2055 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2056 */
2057static void sock_copy(struct sock *nsk, const struct sock *osk)
2058{
2059	const struct proto *prot = READ_ONCE(osk->sk_prot);
2060#ifdef CONFIG_SECURITY_NETWORK
2061	void *sptr = nsk->sk_security;
2062#endif
2063
2064	/* If we move sk_tx_queue_mapping out of the private section,
2065	 * we must check if sk_tx_queue_clear() is called after
2066	 * sock_copy() in sk_clone_lock().
2067	 */
2068	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2069		     offsetof(struct sock, sk_dontcopy_begin) ||
2070		     offsetof(struct sock, sk_tx_queue_mapping) >=
2071		     offsetof(struct sock, sk_dontcopy_end));
2072
2073	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2074
2075	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2076	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2077
2078#ifdef CONFIG_SECURITY_NETWORK
2079	nsk->sk_security = sptr;
2080	security_sk_clone(osk, nsk);
2081#endif
2082}
2083
2084static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2085		int family)
2086{
2087	struct sock *sk;
2088	struct kmem_cache *slab;
2089
2090	slab = prot->slab;
2091	if (slab != NULL) {
2092		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2093		if (!sk)
2094			return sk;
2095		if (want_init_on_alloc(priority))
2096			sk_prot_clear_nulls(sk, prot->obj_size);
2097	} else
2098		sk = kmalloc(prot->obj_size, priority);
2099
2100	if (sk != NULL) {
2101		if (security_sk_alloc(sk, family, priority))
2102			goto out_free;
2103
2104		if (!try_module_get(prot->owner))
2105			goto out_free_sec;
2106	}
2107
2108	return sk;
2109
2110out_free_sec:
2111	security_sk_free(sk);
2112out_free:
2113	if (slab != NULL)
2114		kmem_cache_free(slab, sk);
2115	else
2116		kfree(sk);
2117	return NULL;
2118}
2119
2120static void sk_prot_free(struct proto *prot, struct sock *sk)
2121{
2122	struct kmem_cache *slab;
2123	struct module *owner;
2124
2125	owner = prot->owner;
2126	slab = prot->slab;
2127
2128	cgroup_sk_free(&sk->sk_cgrp_data);
2129	mem_cgroup_sk_free(sk);
2130	security_sk_free(sk);
2131	if (slab != NULL)
2132		kmem_cache_free(slab, sk);
2133	else
2134		kfree(sk);
2135	module_put(owner);
2136}
2137
2138/**
2139 *	sk_alloc - All socket objects are allocated here
2140 *	@net: the applicable net namespace
2141 *	@family: protocol family
2142 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2143 *	@prot: struct proto associated with this new sock instance
2144 *	@kern: is this to be a kernel socket?
2145 */
2146struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2147		      struct proto *prot, int kern)
2148{
2149	struct sock *sk;
2150
2151	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2152	if (sk) {
2153		sk->sk_family = family;
2154		/*
2155		 * See comment in struct sock definition to understand
2156		 * why we need sk_prot_creator -acme
2157		 */
2158		sk->sk_prot = sk->sk_prot_creator = prot;
2159		sk->sk_kern_sock = kern;
2160		sock_lock_init(sk);
2161		sk->sk_net_refcnt = kern ? 0 : 1;
2162		if (likely(sk->sk_net_refcnt)) {
2163			get_net_track(net, &sk->ns_tracker, priority);
2164			sock_inuse_add(net, 1);
2165		} else {
2166			__netns_tracker_alloc(net, &sk->ns_tracker,
2167					      false, priority);
2168		}
2169
2170		sock_net_set(sk, net);
2171		refcount_set(&sk->sk_wmem_alloc, 1);
2172
2173		mem_cgroup_sk_alloc(sk);
2174		cgroup_sk_alloc(&sk->sk_cgrp_data);
2175		sock_update_classid(&sk->sk_cgrp_data);
2176		sock_update_netprioidx(&sk->sk_cgrp_data);
2177		sk_tx_queue_clear(sk);
2178	}
2179
2180	return sk;
2181}
2182EXPORT_SYMBOL(sk_alloc);
2183
2184/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2185 * grace period. This is the case for UDP sockets and TCP listeners.
2186 */
2187static void __sk_destruct(struct rcu_head *head)
2188{
2189	struct sock *sk = container_of(head, struct sock, sk_rcu);
2190	struct sk_filter *filter;
2191
2192	if (sk->sk_destruct)
2193		sk->sk_destruct(sk);
2194
2195	filter = rcu_dereference_check(sk->sk_filter,
2196				       refcount_read(&sk->sk_wmem_alloc) == 0);
2197	if (filter) {
2198		sk_filter_uncharge(sk, filter);
2199		RCU_INIT_POINTER(sk->sk_filter, NULL);
2200	}
2201
2202	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2203
2204#ifdef CONFIG_BPF_SYSCALL
2205	bpf_sk_storage_free(sk);
2206#endif
2207
2208	if (atomic_read(&sk->sk_omem_alloc))
2209		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2210			 __func__, atomic_read(&sk->sk_omem_alloc));
2211
2212	if (sk->sk_frag.page) {
2213		put_page(sk->sk_frag.page);
2214		sk->sk_frag.page = NULL;
2215	}
2216
2217	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2218	put_cred(sk->sk_peer_cred);
2219	put_pid(sk->sk_peer_pid);
2220
2221	if (likely(sk->sk_net_refcnt))
2222		put_net_track(sock_net(sk), &sk->ns_tracker);
2223	else
2224		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2225
2226	sk_prot_free(sk->sk_prot_creator, sk);
2227}
2228
2229void sk_destruct(struct sock *sk)
2230{
2231	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2232
2233	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2234		reuseport_detach_sock(sk);
2235		use_call_rcu = true;
2236	}
2237
2238	if (use_call_rcu)
2239		call_rcu(&sk->sk_rcu, __sk_destruct);
2240	else
2241		__sk_destruct(&sk->sk_rcu);
2242}
2243
2244static void __sk_free(struct sock *sk)
2245{
2246	if (likely(sk->sk_net_refcnt))
2247		sock_inuse_add(sock_net(sk), -1);
2248
2249	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2250		sock_diag_broadcast_destroy(sk);
2251	else
2252		sk_destruct(sk);
2253}
2254
2255void sk_free(struct sock *sk)
2256{
2257	/*
2258	 * We subtract one from sk_wmem_alloc and can know if
2259	 * some packets are still in some tx queue.
2260	 * If not null, sock_wfree() will call __sk_free(sk) later
2261	 */
2262	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2263		__sk_free(sk);
2264}
2265EXPORT_SYMBOL(sk_free);
2266
2267static void sk_init_common(struct sock *sk)
2268{
2269	skb_queue_head_init(&sk->sk_receive_queue);
2270	skb_queue_head_init(&sk->sk_write_queue);
2271	skb_queue_head_init(&sk->sk_error_queue);
2272
2273	rwlock_init(&sk->sk_callback_lock);
2274	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2275			af_rlock_keys + sk->sk_family,
2276			af_family_rlock_key_strings[sk->sk_family]);
2277	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2278			af_wlock_keys + sk->sk_family,
2279			af_family_wlock_key_strings[sk->sk_family]);
2280	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2281			af_elock_keys + sk->sk_family,
2282			af_family_elock_key_strings[sk->sk_family]);
2283	lockdep_set_class_and_name(&sk->sk_callback_lock,
2284			af_callback_keys + sk->sk_family,
2285			af_family_clock_key_strings[sk->sk_family]);
2286}
2287
2288/**
2289 *	sk_clone_lock - clone a socket, and lock its clone
2290 *	@sk: the socket to clone
2291 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2292 *
2293 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2294 */
2295struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2296{
2297	struct proto *prot = READ_ONCE(sk->sk_prot);
2298	struct sk_filter *filter;
2299	bool is_charged = true;
2300	struct sock *newsk;
2301
2302	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2303	if (!newsk)
2304		goto out;
2305
2306	sock_copy(newsk, sk);
2307
2308	newsk->sk_prot_creator = prot;
2309
2310	/* SANITY */
2311	if (likely(newsk->sk_net_refcnt)) {
2312		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2313		sock_inuse_add(sock_net(newsk), 1);
2314	} else {
2315		/* Kernel sockets are not elevating the struct net refcount.
2316		 * Instead, use a tracker to more easily detect if a layer
2317		 * is not properly dismantling its kernel sockets at netns
2318		 * destroy time.
2319		 */
2320		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2321				      false, priority);
2322	}
2323	sk_node_init(&newsk->sk_node);
2324	sock_lock_init(newsk);
2325	bh_lock_sock(newsk);
2326	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2327	newsk->sk_backlog.len = 0;
2328
2329	atomic_set(&newsk->sk_rmem_alloc, 0);
2330
2331	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2332	refcount_set(&newsk->sk_wmem_alloc, 1);
2333
2334	atomic_set(&newsk->sk_omem_alloc, 0);
2335	sk_init_common(newsk);
2336
2337	newsk->sk_dst_cache	= NULL;
2338	newsk->sk_dst_pending_confirm = 0;
2339	newsk->sk_wmem_queued	= 0;
2340	newsk->sk_forward_alloc = 0;
2341	newsk->sk_reserved_mem  = 0;
2342	atomic_set(&newsk->sk_drops, 0);
2343	newsk->sk_send_head	= NULL;
2344	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2345	atomic_set(&newsk->sk_zckey, 0);
2346
2347	sock_reset_flag(newsk, SOCK_DONE);
2348
2349	/* sk->sk_memcg will be populated at accept() time */
2350	newsk->sk_memcg = NULL;
2351
2352	cgroup_sk_clone(&newsk->sk_cgrp_data);
2353
2354	rcu_read_lock();
2355	filter = rcu_dereference(sk->sk_filter);
2356	if (filter != NULL)
2357		/* though it's an empty new sock, the charging may fail
2358		 * if sysctl_optmem_max was changed between creation of
2359		 * original socket and cloning
2360		 */
2361		is_charged = sk_filter_charge(newsk, filter);
2362	RCU_INIT_POINTER(newsk->sk_filter, filter);
2363	rcu_read_unlock();
2364
2365	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2366		/* We need to make sure that we don't uncharge the new
2367		 * socket if we couldn't charge it in the first place
2368		 * as otherwise we uncharge the parent's filter.
2369		 */
2370		if (!is_charged)
2371			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2372		sk_free_unlock_clone(newsk);
2373		newsk = NULL;
2374		goto out;
2375	}
2376	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2377
2378	if (bpf_sk_storage_clone(sk, newsk)) {
2379		sk_free_unlock_clone(newsk);
2380		newsk = NULL;
2381		goto out;
2382	}
2383
2384	/* Clear sk_user_data if parent had the pointer tagged
2385	 * as not suitable for copying when cloning.
2386	 */
2387	if (sk_user_data_is_nocopy(newsk))
2388		newsk->sk_user_data = NULL;
2389
2390	newsk->sk_err	   = 0;
2391	newsk->sk_err_soft = 0;
2392	newsk->sk_priority = 0;
2393	newsk->sk_incoming_cpu = raw_smp_processor_id();
2394
2395	/* Before updating sk_refcnt, we must commit prior changes to memory
2396	 * (Documentation/RCU/rculist_nulls.rst for details)
2397	 */
2398	smp_wmb();
2399	refcount_set(&newsk->sk_refcnt, 2);
2400
2401	sk_set_socket(newsk, NULL);
2402	sk_tx_queue_clear(newsk);
2403	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2404
2405	if (newsk->sk_prot->sockets_allocated)
2406		sk_sockets_allocated_inc(newsk);
2407
2408	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2409		net_enable_timestamp();
2410out:
2411	return newsk;
2412}
2413EXPORT_SYMBOL_GPL(sk_clone_lock);
2414
2415void sk_free_unlock_clone(struct sock *sk)
2416{
2417	/* It is still raw copy of parent, so invalidate
2418	 * destructor and make plain sk_free() */
2419	sk->sk_destruct = NULL;
2420	bh_unlock_sock(sk);
2421	sk_free(sk);
2422}
2423EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2424
2425static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2426{
2427	bool is_ipv6 = false;
2428	u32 max_size;
2429
2430#if IS_ENABLED(CONFIG_IPV6)
2431	is_ipv6 = (sk->sk_family == AF_INET6 &&
2432		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2433#endif
2434	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2435	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2436			READ_ONCE(dst->dev->gso_ipv4_max_size);
2437	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2438		max_size = GSO_LEGACY_MAX_SIZE;
2439
2440	return max_size - (MAX_TCP_HEADER + 1);
2441}
2442
2443void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2444{
2445	u32 max_segs = 1;
2446
2447	sk->sk_route_caps = dst->dev->features;
2448	if (sk_is_tcp(sk))
2449		sk->sk_route_caps |= NETIF_F_GSO;
2450	if (sk->sk_route_caps & NETIF_F_GSO)
2451		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2452	if (unlikely(sk->sk_gso_disabled))
2453		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2454	if (sk_can_gso(sk)) {
2455		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2456			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2457		} else {
2458			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2459			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2460			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2461			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2462		}
2463	}
2464	sk->sk_gso_max_segs = max_segs;
2465	sk_dst_set(sk, dst);
2466}
2467EXPORT_SYMBOL_GPL(sk_setup_caps);
2468
2469/*
2470 *	Simple resource managers for sockets.
2471 */
2472
2473
2474/*
2475 * Write buffer destructor automatically called from kfree_skb.
2476 */
2477void sock_wfree(struct sk_buff *skb)
2478{
2479	struct sock *sk = skb->sk;
2480	unsigned int len = skb->truesize;
2481	bool free;
2482
2483	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2484		if (sock_flag(sk, SOCK_RCU_FREE) &&
2485		    sk->sk_write_space == sock_def_write_space) {
2486			rcu_read_lock();
2487			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2488			sock_def_write_space_wfree(sk);
2489			rcu_read_unlock();
2490			if (unlikely(free))
2491				__sk_free(sk);
2492			return;
2493		}
2494
2495		/*
2496		 * Keep a reference on sk_wmem_alloc, this will be released
2497		 * after sk_write_space() call
2498		 */
2499		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2500		sk->sk_write_space(sk);
2501		len = 1;
2502	}
2503	/*
2504	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2505	 * could not do because of in-flight packets
2506	 */
2507	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2508		__sk_free(sk);
2509}
2510EXPORT_SYMBOL(sock_wfree);
2511
2512/* This variant of sock_wfree() is used by TCP,
2513 * since it sets SOCK_USE_WRITE_QUEUE.
2514 */
2515void __sock_wfree(struct sk_buff *skb)
2516{
2517	struct sock *sk = skb->sk;
2518
2519	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2520		__sk_free(sk);
2521}
2522
2523void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2524{
2525	skb_orphan(skb);
2526	skb->sk = sk;
2527#ifdef CONFIG_INET
2528	if (unlikely(!sk_fullsock(sk))) {
2529		skb->destructor = sock_edemux;
2530		sock_hold(sk);
2531		return;
2532	}
2533#endif
2534	skb->destructor = sock_wfree;
2535	skb_set_hash_from_sk(skb, sk);
2536	/*
2537	 * We used to take a refcount on sk, but following operation
2538	 * is enough to guarantee sk_free() wont free this sock until
2539	 * all in-flight packets are completed
2540	 */
2541	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2542}
2543EXPORT_SYMBOL(skb_set_owner_w);
2544
2545static bool can_skb_orphan_partial(const struct sk_buff *skb)
2546{
2547#ifdef CONFIG_TLS_DEVICE
2548	/* Drivers depend on in-order delivery for crypto offload,
2549	 * partial orphan breaks out-of-order-OK logic.
2550	 */
2551	if (skb->decrypted)
2552		return false;
2553#endif
2554	return (skb->destructor == sock_wfree ||
2555		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2556}
2557
2558/* This helper is used by netem, as it can hold packets in its
2559 * delay queue. We want to allow the owner socket to send more
2560 * packets, as if they were already TX completed by a typical driver.
2561 * But we also want to keep skb->sk set because some packet schedulers
2562 * rely on it (sch_fq for example).
2563 */
2564void skb_orphan_partial(struct sk_buff *skb)
2565{
2566	if (skb_is_tcp_pure_ack(skb))
2567		return;
2568
2569	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2570		return;
2571
2572	skb_orphan(skb);
2573}
2574EXPORT_SYMBOL(skb_orphan_partial);
2575
2576/*
2577 * Read buffer destructor automatically called from kfree_skb.
2578 */
2579void sock_rfree(struct sk_buff *skb)
2580{
2581	struct sock *sk = skb->sk;
2582	unsigned int len = skb->truesize;
2583
2584	atomic_sub(len, &sk->sk_rmem_alloc);
2585	sk_mem_uncharge(sk, len);
2586}
2587EXPORT_SYMBOL(sock_rfree);
2588
2589/*
2590 * Buffer destructor for skbs that are not used directly in read or write
2591 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2592 */
2593void sock_efree(struct sk_buff *skb)
2594{
2595	sock_put(skb->sk);
2596}
2597EXPORT_SYMBOL(sock_efree);
2598
/* Destructor for the prefetch/receive path, where a reference may not be
 * held on the socket (listen sockets, for example): the reference is only
 * dropped when sk_is_refcounted() says one was taken.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2610
2611kuid_t sock_i_uid(struct sock *sk)
2612{
2613	kuid_t uid;
2614
2615	read_lock_bh(&sk->sk_callback_lock);
2616	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2617	read_unlock_bh(&sk->sk_callback_lock);
2618	return uid;
2619}
2620EXPORT_SYMBOL(sock_i_uid);
2621
2622unsigned long __sock_i_ino(struct sock *sk)
2623{
2624	unsigned long ino;
2625
2626	read_lock(&sk->sk_callback_lock);
2627	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2628	read_unlock(&sk->sk_callback_lock);
2629	return ino;
2630}
2631EXPORT_SYMBOL(__sock_i_ino);
2632
/*
 * Same as __sock_i_ino(), with bottom halves disabled around the lookup.
 */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long inode_nr;

	local_bh_disable();
	inode_nr = __sock_i_ino(sk);
	local_bh_enable();

	return inode_nr;
}
EXPORT_SYMBOL(sock_i_ino);
2643
2644/*
2645 * Allocate a skb from the socket's send buffer.
2646 */
2647struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2648			     gfp_t priority)
2649{
2650	if (force ||
2651	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2652		struct sk_buff *skb = alloc_skb(size, priority);
2653
2654		if (skb) {
2655			skb_set_owner_w(skb, sk);
2656			return skb;
2657		}
2658	}
2659	return NULL;
2660}
2661EXPORT_SYMBOL(sock_wmalloc);
2662
2663static void sock_ofree(struct sk_buff *skb)
2664{
2665	struct sock *sk = skb->sk;
2666
2667	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2668}
2669
2670struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2671			     gfp_t priority)
2672{
2673	struct sk_buff *skb;
2674
2675	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2676	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2677	    READ_ONCE(sysctl_optmem_max))
2678		return NULL;
2679
2680	skb = alloc_skb(size, priority);
2681	if (!skb)
2682		return NULL;
2683
2684	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2685	skb->sk = sk;
2686	skb->destructor = sock_ofree;
2687	return skb;
2688}
2689
2690/*
2691 * Allocate a memory block from the socket's option memory buffer.
2692 */
2693void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2694{
2695	int optmem_max = READ_ONCE(sysctl_optmem_max);
2696
2697	if ((unsigned int)size <= optmem_max &&
2698	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2699		void *mem;
2700		/* First do the add, to avoid the race if kmalloc
2701		 * might sleep.
2702		 */
2703		atomic_add(size, &sk->sk_omem_alloc);
2704		mem = kmalloc(size, priority);
2705		if (mem)
2706			return mem;
2707		atomic_sub(size, &sk->sk_omem_alloc);
2708	}
2709	return NULL;
2710}
2711EXPORT_SYMBOL(sock_kmalloc);
2712
2713/* Free an option memory block. Note, we actually want the inline
2714 * here as this allows gcc to detect the nullify and fold away the
2715 * condition entirely.
2716 */
2717static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2718				  const bool nullify)
2719{
2720	if (WARN_ON_ONCE(!mem))
2721		return;
2722	if (nullify)
2723		kfree_sensitive(mem);
2724	else
2725		kfree(mem);
2726	atomic_sub(size, &sk->sk_omem_alloc);
2727}
2728
2729void sock_kfree_s(struct sock *sk, void *mem, int size)
2730{
2731	__sock_kfree_s(sk, mem, size, false);
2732}
2733EXPORT_SYMBOL(sock_kfree_s);
2734
2735void sock_kzfree_s(struct sock *sk, void *mem, int size)
2736{
2737	__sock_kfree_s(sk, mem, size, true);
2738}
2739EXPORT_SYMBOL(sock_kzfree_s);
2740
2741/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2742   I think, these locks should be removed for datagram sockets.
2743 */
2744static long sock_wait_for_wmem(struct sock *sk, long timeo)
2745{
2746	DEFINE_WAIT(wait);
2747
2748	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2749	for (;;) {
2750		if (!timeo)
2751			break;
2752		if (signal_pending(current))
2753			break;
2754		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2755		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2756		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2757			break;
2758		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2759			break;
2760		if (READ_ONCE(sk->sk_err))
2761			break;
2762		timeo = schedule_timeout(timeo);
2763	}
2764	finish_wait(sk_sleep(sk), &wait);
2765	return timeo;
2766}
2767
2768
2769/*
2770 *	Generic send/receive buffer handlers
2771 */
2772
2773struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2774				     unsigned long data_len, int noblock,
2775				     int *errcode, int max_page_order)
2776{
2777	struct sk_buff *skb;
2778	long timeo;
2779	int err;
2780
2781	timeo = sock_sndtimeo(sk, noblock);
2782	for (;;) {
2783		err = sock_error(sk);
2784		if (err != 0)
2785			goto failure;
2786
2787		err = -EPIPE;
2788		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2789			goto failure;
2790
2791		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2792			break;
2793
2794		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2795		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2796		err = -EAGAIN;
2797		if (!timeo)
2798			goto failure;
2799		if (signal_pending(current))
2800			goto interrupted;
2801		timeo = sock_wait_for_wmem(sk, timeo);
2802	}
2803	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2804				   errcode, sk->sk_allocation);
2805	if (skb)
2806		skb_set_owner_w(skb, sk);
2807	return skb;
2808
2809interrupted:
2810	err = sock_intr_errno(timeo);
2811failure:
2812	*errcode = err;
2813	return NULL;
2814}
2815EXPORT_SYMBOL(sock_alloc_send_pskb);
2816
2817int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2818		     struct sockcm_cookie *sockc)
2819{
2820	u32 tsflags;
2821
2822	switch (cmsg->cmsg_type) {
2823	case SO_MARK:
2824		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2825		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2826			return -EPERM;
2827		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2828			return -EINVAL;
2829		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2830		break;
2831	case SO_TIMESTAMPING_OLD:
2832	case SO_TIMESTAMPING_NEW:
2833		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2834			return -EINVAL;
2835
2836		tsflags = *(u32 *)CMSG_DATA(cmsg);
2837		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2838			return -EINVAL;
2839
2840		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2841		sockc->tsflags |= tsflags;
2842		break;
2843	case SCM_TXTIME:
2844		if (!sock_flag(sk, SOCK_TXTIME))
2845			return -EINVAL;
2846		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2847			return -EINVAL;
2848		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2849		break;
2850	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2851	case SCM_RIGHTS:
2852	case SCM_CREDENTIALS:
2853		break;
2854	default:
2855		return -EINVAL;
2856	}
2857	return 0;
2858}
2859EXPORT_SYMBOL(__sock_cmsg_send);
2860
2861int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2862		   struct sockcm_cookie *sockc)
2863{
2864	struct cmsghdr *cmsg;
2865	int ret;
2866
2867	for_each_cmsghdr(cmsg, msg) {
2868		if (!CMSG_OK(msg, cmsg))
2869			return -EINVAL;
2870		if (cmsg->cmsg_level != SOL_SOCKET)
2871			continue;
2872		ret = __sock_cmsg_send(sk, cmsg, sockc);
2873		if (ret)
2874			return ret;
2875	}
2876	return 0;
2877}
2878EXPORT_SYMBOL(sock_cmsg_send);
2879
2880static void sk_enter_memory_pressure(struct sock *sk)
2881{
2882	if (!sk->sk_prot->enter_memory_pressure)
2883		return;
2884
2885	sk->sk_prot->enter_memory_pressure(sk);
2886}
2887
2888static void sk_leave_memory_pressure(struct sock *sk)
2889{
2890	if (sk->sk_prot->leave_memory_pressure) {
2891		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2892				     tcp_leave_memory_pressure, sk);
2893	} else {
2894		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2895
2896		if (memory_pressure && READ_ONCE(*memory_pressure))
2897			WRITE_ONCE(*memory_pressure, 0);
2898	}
2899}
2900
2901DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2902
2903/**
2904 * skb_page_frag_refill - check that a page_frag contains enough room
2905 * @sz: minimum size of the fragment we want to get
2906 * @pfrag: pointer to page_frag
2907 * @gfp: priority for memory allocation
2908 *
2909 * Note: While this allocator tries to use high order pages, there is
2910 * no guarantee that allocations succeed. Therefore, @sz MUST be
2911 * less or equal than PAGE_SIZE.
2912 */
2913bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2914{
2915	if (pfrag->page) {
2916		if (page_ref_count(pfrag->page) == 1) {
2917			pfrag->offset = 0;
2918			return true;
2919		}
2920		if (pfrag->offset + sz <= pfrag->size)
2921			return true;
2922		put_page(pfrag->page);
2923	}
2924
2925	pfrag->offset = 0;
2926	if (SKB_FRAG_PAGE_ORDER &&
2927	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2928		/* Avoid direct reclaim but allow kswapd to wake */
2929		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2930					  __GFP_COMP | __GFP_NOWARN |
2931					  __GFP_NORETRY,
2932					  SKB_FRAG_PAGE_ORDER);
2933		if (likely(pfrag->page)) {
2934			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2935			return true;
2936		}
2937	}
2938	pfrag->page = alloc_page(gfp);
2939	if (likely(pfrag->page)) {
2940		pfrag->size = PAGE_SIZE;
2941		return true;
2942	}
2943	return false;
2944}
2945EXPORT_SYMBOL(skb_page_frag_refill);
2946
2947bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2948{
2949	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2950		return true;
2951
2952	sk_enter_memory_pressure(sk);
2953	sk_stream_moderate_sndbuf(sk);
2954	return false;
2955}
2956EXPORT_SYMBOL(sk_page_frag_refill);
2957
2958void __lock_sock(struct sock *sk)
2959	__releases(&sk->sk_lock.slock)
2960	__acquires(&sk->sk_lock.slock)
2961{
2962	DEFINE_WAIT(wait);
2963
2964	for (;;) {
2965		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2966					TASK_UNINTERRUPTIBLE);
2967		spin_unlock_bh(&sk->sk_lock.slock);
2968		schedule();
2969		spin_lock_bh(&sk->sk_lock.slock);
2970		if (!sock_owned_by_user(sk))
2971			break;
2972	}
2973	finish_wait(&sk->sk_lock.wq, &wait);
2974}
2975
2976void __release_sock(struct sock *sk)
2977	__releases(&sk->sk_lock.slock)
2978	__acquires(&sk->sk_lock.slock)
2979{
2980	struct sk_buff *skb, *next;
2981
2982	while ((skb = sk->sk_backlog.head) != NULL) {
2983		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2984
2985		spin_unlock_bh(&sk->sk_lock.slock);
2986
2987		do {
2988			next = skb->next;
2989			prefetch(next);
2990			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2991			skb_mark_not_on_list(skb);
2992			sk_backlog_rcv(sk, skb);
2993
2994			cond_resched();
2995
2996			skb = next;
2997		} while (skb != NULL);
2998
2999		spin_lock_bh(&sk->sk_lock.slock);
3000	}
3001
3002	/*
3003	 * Doing the zeroing here guarantee we can not loop forever
3004	 * while a wild producer attempts to flood us.
3005	 */
3006	sk->sk_backlog.len = 0;
3007}
3008
3009void __sk_flush_backlog(struct sock *sk)
3010{
3011	spin_lock_bh(&sk->sk_lock.slock);
3012	__release_sock(sk);
3013	spin_unlock_bh(&sk->sk_lock.slock);
3014}
3015EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3016
3017/**
3018 * sk_wait_data - wait for data to arrive at sk_receive_queue
3019 * @sk:    sock to wait on
3020 * @timeo: for how long
3021 * @skb:   last skb seen on sk_receive_queue
3022 *
3023 * Now socket state including sk->sk_err is changed only under lock,
3024 * hence we may omit checks after joining wait queue.
3025 * We check receive queue before schedule() only as optimization;
3026 * it is very likely that release_sock() added new data.
3027 */
3028int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3029{
3030	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3031	int rc;
3032
3033	add_wait_queue(sk_sleep(sk), &wait);
3034	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3035	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3036	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3037	remove_wait_queue(sk_sleep(sk), &wait);
3038	return rc;
3039}
3040EXPORT_SYMBOL(sk_wait_data);
3041
3042/**
3043 *	__sk_mem_raise_allocated - increase memory_allocated
3044 *	@sk: socket
3045 *	@size: memory size to allocate
3046 *	@amt: pages to allocate
3047 *	@kind: allocation type
3048 *
3049 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3050 */
3051int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3052{
3053	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
3054	struct proto *prot = sk->sk_prot;
3055	bool charged = true;
3056	long allocated;
3057
3058	sk_memory_allocated_add(sk, amt);
3059	allocated = sk_memory_allocated(sk);
3060	if (memcg_charge &&
3061	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3062						gfp_memcg_charge())))
3063		goto suppress_allocation;
3064
3065	/* Under limit. */
3066	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3067		sk_leave_memory_pressure(sk);
3068		return 1;
3069	}
3070
3071	/* Under pressure. */
3072	if (allocated > sk_prot_mem_limits(sk, 1))
3073		sk_enter_memory_pressure(sk);
3074
3075	/* Over hard limit. */
3076	if (allocated > sk_prot_mem_limits(sk, 2))
3077		goto suppress_allocation;
3078
3079	/* guarantee minimum buffer size under pressure */
3080	if (kind == SK_MEM_RECV) {
3081		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3082			return 1;
3083
3084	} else { /* SK_MEM_SEND */
3085		int wmem0 = sk_get_wmem0(sk, prot);
3086
3087		if (sk->sk_type == SOCK_STREAM) {
3088			if (sk->sk_wmem_queued < wmem0)
3089				return 1;
3090		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3091				return 1;
3092		}
3093	}
3094
3095	if (sk_has_memory_pressure(sk)) {
3096		u64 alloc;
3097
3098		if (!sk_under_memory_pressure(sk))
3099			return 1;
3100		alloc = sk_sockets_allocated_read_positive(sk);
3101		if (sk_prot_mem_limits(sk, 2) > alloc *
3102		    sk_mem_pages(sk->sk_wmem_queued +
3103				 atomic_read(&sk->sk_rmem_alloc) +
3104				 sk->sk_forward_alloc))
3105			return 1;
3106	}
3107
3108suppress_allocation:
3109
3110	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3111		sk_stream_moderate_sndbuf(sk);
3112
3113		/* Fail only if socket is _under_ its sndbuf.
3114		 * In this case we cannot block, so that we have to fail.
3115		 */
3116		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3117			/* Force charge with __GFP_NOFAIL */
3118			if (memcg_charge && !charged) {
3119				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3120					gfp_memcg_charge() | __GFP_NOFAIL);
3121			}
3122			return 1;
3123		}
3124	}
3125
3126	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3127		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3128
3129	sk_memory_allocated_sub(sk, amt);
3130
3131	if (memcg_charge && charged)
3132		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
3133
3134	return 0;
3135}
3136
3137/**
3138 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3139 *	@sk: socket
3140 *	@size: memory size to allocate
3141 *	@kind: allocation type
3142 *
3143 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3144 *	rmem allocation. This function assumes that protocols which have
3145 *	memory_pressure use sk_wmem_queued as write buffer accounting.
3146 */
3147int __sk_mem_schedule(struct sock *sk, int size, int kind)
3148{
3149	int ret, amt = sk_mem_pages(size);
3150
3151	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3152	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3153	if (!ret)
3154		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3155	return ret;
3156}
3157EXPORT_SYMBOL(__sk_mem_schedule);
3158
3159/**
3160 *	__sk_mem_reduce_allocated - reclaim memory_allocated
3161 *	@sk: socket
3162 *	@amount: number of quanta
3163 *
3164 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3165 */
3166void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3167{
3168	sk_memory_allocated_sub(sk, amount);
3169
3170	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3171		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3172
3173	if (sk_under_global_memory_pressure(sk) &&
3174	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3175		sk_leave_memory_pressure(sk);
3176}
3177
3178/**
3179 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3180 *	@sk: socket
3181 *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3182 */
3183void __sk_mem_reclaim(struct sock *sk, int amount)
3184{
3185	amount >>= PAGE_SHIFT;
3186	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3187	__sk_mem_reduce_allocated(sk, amount);
3188}
3189EXPORT_SYMBOL(__sk_mem_reclaim);
3190
3191int sk_set_peek_off(struct sock *sk, int val)
3192{
3193	WRITE_ONCE(sk->sk_peek_off, val);
3194	return 0;
3195}
3196EXPORT_SYMBOL_GPL(sk_set_peek_off);
3197
3198/*
3199 * Set of default routines for initialising struct proto_ops when
3200 * the protocol does not support a particular function. In certain
3201 * cases where it makes no sense for a protocol to have a "do nothing"
3202 * function, some default processing is provided.
3203 */
3204
3205int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3206{
3207	return -EOPNOTSUPP;
3208}
3209EXPORT_SYMBOL(sock_no_bind);
3210
3211int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3212		    int len, int flags)
3213{
3214	return -EOPNOTSUPP;
3215}
3216EXPORT_SYMBOL(sock_no_connect);
3217
3218int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3219{
3220	return -EOPNOTSUPP;
3221}
3222EXPORT_SYMBOL(sock_no_socketpair);
3223
3224int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3225		   bool kern)
3226{
3227	return -EOPNOTSUPP;
3228}
3229EXPORT_SYMBOL(sock_no_accept);
3230
3231int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3232		    int peer)
3233{
3234	return -EOPNOTSUPP;
3235}
3236EXPORT_SYMBOL(sock_no_getname);
3237
3238int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3239{
3240	return -EOPNOTSUPP;
3241}
3242EXPORT_SYMBOL(sock_no_ioctl);
3243
3244int sock_no_listen(struct socket *sock, int backlog)
3245{
3246	return -EOPNOTSUPP;
3247}
3248EXPORT_SYMBOL(sock_no_listen);
3249
3250int sock_no_shutdown(struct socket *sock, int how)
3251{
3252	return -EOPNOTSUPP;
3253}
3254EXPORT_SYMBOL(sock_no_shutdown);
3255
3256int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3257{
3258	return -EOPNOTSUPP;
3259}
3260EXPORT_SYMBOL(sock_no_sendmsg);
3261
3262int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3263{
3264	return -EOPNOTSUPP;
3265}
3266EXPORT_SYMBOL(sock_no_sendmsg_locked);
3267
3268int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3269		    int flags)
3270{
3271	return -EOPNOTSUPP;
3272}
3273EXPORT_SYMBOL(sock_no_recvmsg);
3274
3275int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3276{
3277	/* Mirror missing mmap method error code */
3278	return -ENODEV;
3279}
3280EXPORT_SYMBOL(sock_no_mmap);
3281
3282/*
3283 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3284 * various sock-based usage counts.
3285 */
3286void __receive_sock(struct file *file)
3287{
3288	struct socket *sock;
3289
3290	sock = sock_from_file(file);
3291	if (sock) {
3292		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3293		sock_update_classid(&sock->sk->sk_cgrp_data);
3294	}
3295}
3296
3297/*
3298 *	Default Socket Callbacks
3299 */
3300
3301static void sock_def_wakeup(struct sock *sk)
3302{
3303	struct socket_wq *wq;
3304
3305	rcu_read_lock();
3306	wq = rcu_dereference(sk->sk_wq);
3307	if (skwq_has_sleeper(wq))
3308		wake_up_interruptible_all(&wq->wait);
3309	rcu_read_unlock();
3310}
3311
3312static void sock_def_error_report(struct sock *sk)
3313{
3314	struct socket_wq *wq;
3315
3316	rcu_read_lock();
3317	wq = rcu_dereference(sk->sk_wq);
3318	if (skwq_has_sleeper(wq))
3319		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3320	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3321	rcu_read_unlock();
3322}
3323
3324void sock_def_readable(struct sock *sk)
3325{
3326	struct socket_wq *wq;
3327
3328	trace_sk_data_ready(sk);
3329
3330	rcu_read_lock();
3331	wq = rcu_dereference(sk->sk_wq);
3332	if (skwq_has_sleeper(wq))
3333		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3334						EPOLLRDNORM | EPOLLRDBAND);
3335	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3336	rcu_read_unlock();
3337}
3338
3339static void sock_def_write_space(struct sock *sk)
3340{
3341	struct socket_wq *wq;
3342
3343	rcu_read_lock();
3344
3345	/* Do not wake up a writer until he can make "significant"
3346	 * progress.  --DaveM
3347	 */
3348	if (sock_writeable(sk)) {
3349		wq = rcu_dereference(sk->sk_wq);
3350		if (skwq_has_sleeper(wq))
3351			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3352						EPOLLWRNORM | EPOLLWRBAND);
3353
3354		/* Should agree with poll, otherwise some programs break */
3355		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3356	}
3357
3358	rcu_read_unlock();
3359}
3360
3361/* An optimised version of sock_def_write_space(), should only be called
3362 * for SOCK_RCU_FREE sockets under RCU read section and after putting
3363 * ->sk_wmem_alloc.
3364 */
3365static void sock_def_write_space_wfree(struct sock *sk)
3366{
3367	/* Do not wake up a writer until he can make "significant"
3368	 * progress.  --DaveM
3369	 */
3370	if (sock_writeable(sk)) {
3371		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3372
3373		/* rely on refcount_sub from sock_wfree() */
3374		smp_mb__after_atomic();
3375		if (wq && waitqueue_active(&wq->wait))
3376			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3377						EPOLLWRNORM | EPOLLWRBAND);
3378
3379		/* Should agree with poll, otherwise some programs break */
3380		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3381	}
3382}
3383
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3387
3388void sk_send_sigurg(struct sock *sk)
3389{
3390	if (sk->sk_socket && sk->sk_socket->file)
3391		if (send_sigurg(&sk->sk_socket->file->f_owner))
3392			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3393}
3394EXPORT_SYMBOL(sk_send_sigurg);
3395
/*
 * (Re)arm @timer to fire at @expires.  A socket reference is taken when
 * mod_timer() returns zero, presumably meaning the timer was not already
 * pending.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3403
/*
 * Cancel @timer with del_timer().  When that returns non-zero, the socket
 * reference associated with the timer is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3410
/*
 * Same as sk_stop_timer(), but the timer is cancelled with
 * del_timer_sync().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3417
/*
 * sock_init_data_uid - initialise the generic fields of a sock
 * @sock: struct socket to attach @sk to, or NULL
 * @sk: sock to initialise
 * @uid: value stored in sk->sk_uid
 *
 * Sets buffer sizes from the rmem/wmem default sysctls, installs the
 * default callbacks (sock_def_*), resets timeouts, pacing and peer
 * credentials, and finally sets sk_refcnt to 1 after a write barrier.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	/* No callback is installed on sk_timer here (NULL function). */
	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_use_task_frag	=	true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	/* Link sk and sock in both directions when a socket is supplied;
	 * without one the sock has no wait queue (sk_wq == NULL).
	 */
	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	}
	sk->sk_uid	=	uid;

	/* Kernel and user sockets get distinct per-family lockdep classes
	 * for sk_callback_lock.
	 */
	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default event callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	spin_lock_init(&sk->sk_peer_lock);

	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	/* sk_stamp_seq is only initialised on 32-bit builds. */
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
#endif

	/* ~0UL: all bits set, i.e. the largest representable rate. */
	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);
3500
3501void sock_init_data(struct socket *sock, struct sock *sk)
3502{
3503	kuid_t uid = sock ?
3504		SOCK_INODE(sock)->i_uid :
3505		make_kuid(sock_net(sk)->user_ns, 0);
3506
3507	sock_init_data_uid(sock, sk, uid);
3508}
3509EXPORT_SYMBOL(sock_init_data);
3510
/*
 * lock_sock_nested - take ownership of the socket lock
 * @sk: sock to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * May sleep (see might_sleep()).  On return sk->sk_lock.owned is 1 and
 * the slock spinlock has been dropped again.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	/* Lock already owned: presumably __lock_sock() waits until the
	 * current owner releases it -- confirm against its definition.
	 */
	if (sock_owned_by_user_nocheck(sk))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);
3524
/*
 * release_sock - give up ownership of the socket lock
 * @sk: sock to unlock
 *
 * Under the slock spinlock: processes the backlog if one is queued, runs
 * the protocol's optional release_cb(), releases ownership and wakes any
 * task sleeping on sk->sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	/* A non-NULL tail means packets were queued on the backlog. */
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	/* Only issue the wake-up when someone is actually waiting. */
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
3543
/*
 * __lock_sock_fast - lock a sock, avoiding the slow path when uncontended
 * @sk: sock to lock
 *
 * Return: false on the fast path, in which case the function returns with
 * bottom halves disabled and sk->sk_lock.slock still held; true on the
 * slow path, in which case sk->sk_lock.owned has been set to 1 and the
 * slock has been released.  May sleep (see might_sleep()).
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sock_owned_by_user_nocheck(sk)) {
		/*
		 * Fast path return with bottom halves disabled and
		 * sock::sk_lock.slock held.
		 *
		 * The 'mutex' is not contended and holding
		 * sock::sk_lock.slock prevents all other lockers to
		 * proceed so the corresponding unlock_sock_fast() can
		 * avoid the slow path of release_sock() completely and
		 * just release slock.
		 *
		 * From a semantical POV this is equivalent to 'acquiring'
		 * the 'mutex', hence the corresponding lockdep
		 * mutex_release() has to happen in the fast path of
		 * unlock_sock_fast().
		 */
		return false;
	}

	/* Slow path: the lock is owned by someone else. */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/* Annotation only: keeps the __acquires() contract balanced for
	 * static checkers although the spinlock is dropped right below.
	 * NOTE(review): confirm __acquire() has no runtime effect.
	 */
	__acquire(&sk->sk_lock.slock);
	spin_unlock_bh(&sk->sk_lock.slock);
	return true;
}
EXPORT_SYMBOL(__lock_sock_fast);
3575
3576int sock_gettstamp(struct socket *sock, void __user *userstamp,
3577		   bool timeval, bool time32)
3578{
3579	struct sock *sk = sock->sk;
3580	struct timespec64 ts;
3581
3582	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3583	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3584	if (ts.tv_sec == -1)
3585		return -ENOENT;
3586	if (ts.tv_sec == 0) {
3587		ktime_t kt = ktime_get_real();
3588		sock_write_timestamp(sk, kt);
3589		ts = ktime_to_timespec64(kt);
3590	}
3591
3592	if (timeval)
3593		ts.tv_nsec /= 1000;
3594
3595#ifdef CONFIG_COMPAT_32BIT_TIME
3596	if (time32)
3597		return put_old_timespec32(&ts, userstamp);
3598#endif
3599#ifdef CONFIG_SPARC64
3600	/* beware of padding in sparc64 timeval */
3601	if (timeval && !in_compat_syscall()) {
3602		struct __kernel_old_timeval __user tv = {
3603			.tv_sec = ts.tv_sec,
3604			.tv_usec = ts.tv_nsec,
3605		};
3606		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3607			return -EFAULT;
3608		return 0;
3609	}
3610#endif
3611	return put_timespec64(&ts, userstamp);
3612}
3613EXPORT_SYMBOL(sock_gettstamp);
3614
3615void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3616{
3617	if (!sock_flag(sk, flag)) {
3618		unsigned long previous_flags = sk->sk_flags;
3619
3620		sock_set_flag(sk, flag);
3621		/*
3622		 * we just set one of the two flags which require net
3623		 * time stamping, but time stamping might have been on
3624		 * already because of the other one
3625		 */
3626		if (sock_needs_netstamp(sk) &&
3627		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3628			net_enable_timestamp();
3629	}
3630}
3631
3632int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3633		       int level, int type)
3634{
3635	struct sock_exterr_skb *serr;
3636	struct sk_buff *skb;
3637	int copied, err;
3638
3639	err = -EAGAIN;
3640	skb = sock_dequeue_err_skb(sk);
3641	if (skb == NULL)
3642		goto out;
3643
3644	copied = skb->len;
3645	if (copied > len) {
3646		msg->msg_flags |= MSG_TRUNC;
3647		copied = len;
3648	}
3649	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3650	if (err)
3651		goto out_free_skb;
3652
3653	sock_recv_timestamp(msg, sk, skb);
3654
3655	serr = SKB_EXT_ERR(skb);
3656	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3657
3658	msg->msg_flags |= MSG_ERRQUEUE;
3659	err = copied;
3660
3661out_free_skb:
3662	kfree_skb(skb);
3663out:
3664	return err;
3665}
3666EXPORT_SYMBOL(sock_recv_errqueue);
3667
3668/*
3669 *	Get a socket option on an socket.
3670 *
3671 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3672 *	asynchronous errors should be reported by getsockopt. We assume
3673 *	this means if you specify SO_ERROR (otherwise whats the point of it).
3674 */
3675int sock_common_getsockopt(struct socket *sock, int level, int optname,
3676			   char __user *optval, int __user *optlen)
3677{
3678	struct sock *sk = sock->sk;
3679
3680	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3681	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3682}
3683EXPORT_SYMBOL(sock_common_getsockopt);
3684
3685int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3686			int flags)
3687{
3688	struct sock *sk = sock->sk;
3689	int addr_len = 0;
3690	int err;
3691
3692	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3693	if (err >= 0)
3694		msg->msg_namelen = addr_len;
3695	return err;
3696}
3697EXPORT_SYMBOL(sock_common_recvmsg);
3698
3699/*
3700 *	Set socket options on an inet socket.
3701 */
3702int sock_common_setsockopt(struct socket *sock, int level, int optname,
3703			   sockptr_t optval, unsigned int optlen)
3704{
3705	struct sock *sk = sock->sk;
3706
3707	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3708	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3709}
3710EXPORT_SYMBOL(sock_common_setsockopt);
3711
/*
 * sk_common_release - generic release path for a sock
 * @sk: sock being released
 *
 * Runs the protocol's optional destroy() hook, unhashes the sock,
 * orphans it, frees its xfrm policy and drops one reference.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes no longer
	 * have access to the socket, but the network stack still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but some
	 * packets may still be in flight because another CPU ran the receive
	 * path and did its hash table lookup before we unhashed the socket.
	 * They will reach the receive queue and be purged by the socket
	 * destructor.
	 *
	 * We may also still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
3746
/*
 * sk_get_meminfo - snapshot a sock's memory accounting counters
 * @sk: sock to read
 * @mem: output array with room for SK_MEMINFO_VARS u32 entries
 *
 * The array is zeroed first, then each SK_MEMINFO_* slot assigned below
 * is filled from the matching sock field.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3761
3762#ifdef CONFIG_PROC_FS
/* Allocation bitmap for proto->inuse_idx slots: bits are set by
 * assign_proto_idx() and cleared by release_proto_idx().
 */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3764
3765int sock_prot_inuse_get(struct net *net, struct proto *prot)
3766{
3767	int cpu, idx = prot->inuse_idx;
3768	int res = 0;
3769
3770	for_each_possible_cpu(cpu)
3771		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3772
3773	return res >= 0 ? res : 0;
3774}
3775EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3776
3777int sock_inuse_get(struct net *net)
3778{
3779	int cpu, res = 0;
3780
3781	for_each_possible_cpu(cpu)
3782		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3783
3784	return res;
3785}
3786
3787EXPORT_SYMBOL_GPL(sock_inuse_get);
3788
3789static int __net_init sock_inuse_init_net(struct net *net)
3790{
3791	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3792	if (net->core.prot_inuse == NULL)
3793		return -ENOMEM;
3794	return 0;
3795}
3796
/* Per-namespace exit: free the per-cpu counters allocated by
 * sock_inuse_init_net().
 */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}
3801
/* Per-namespace hooks managing net->core.prot_inuse; registered by
 * net_inuse_init().
 */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3806
3807static __init int net_inuse_init(void)
3808{
3809	if (register_pernet_subsys(&net_inuse_ops))
3810		panic("Cannot initialize net inuse counters");
3811
3812	return 0;
3813}
3814
3815core_initcall(net_inuse_init);
3816
3817static int assign_proto_idx(struct proto *prot)
3818{
3819	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3820
3821	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3822		pr_err("PROTO_INUSE_NR exhausted\n");
3823		return -ENOSPC;
3824	}
3825
3826	set_bit(prot->inuse_idx, proto_inuse_idx);
3827	return 0;
3828}
3829
3830static void release_proto_idx(struct proto *prot)
3831{
3832	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3833		clear_bit(prot->inuse_idx, proto_inuse_idx);
3834}
3835#else
/* !CONFIG_PROC_FS stub: no in-use index is tracked, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3840
/* !CONFIG_PROC_FS stub: nothing was assigned, so nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3844
3845#endif
3846
3847static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3848{
3849	if (!twsk_prot)
3850		return;
3851	kfree(twsk_prot->twsk_slab_name);
3852	twsk_prot->twsk_slab_name = NULL;
3853	kmem_cache_destroy(twsk_prot->twsk_slab);
3854	twsk_prot->twsk_slab = NULL;
3855}
3856
3857static int tw_prot_init(const struct proto *prot)
3858{
3859	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3860
3861	if (!twsk_prot)
3862		return 0;
3863
3864	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3865					      prot->name);
3866	if (!twsk_prot->twsk_slab_name)
3867		return -ENOMEM;
3868
3869	twsk_prot->twsk_slab =
3870		kmem_cache_create(twsk_prot->twsk_slab_name,
3871				  twsk_prot->twsk_obj_size, 0,
3872				  SLAB_ACCOUNT | prot->slab_flags,
3873				  NULL);
3874	if (!twsk_prot->twsk_slab) {
3875		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3876			prot->name);
3877		return -ENOMEM;
3878	}
3879
3880	return 0;
3881}
3882
3883static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3884{
3885	if (!rsk_prot)
3886		return;
3887	kfree(rsk_prot->slab_name);
3888	rsk_prot->slab_name = NULL;
3889	kmem_cache_destroy(rsk_prot->slab);
3890	rsk_prot->slab = NULL;
3891}
3892
3893static int req_prot_init(const struct proto *prot)
3894{
3895	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3896
3897	if (!rsk_prot)
3898		return 0;
3899
3900	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3901					prot->name);
3902	if (!rsk_prot->slab_name)
3903		return -ENOMEM;
3904
3905	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3906					   rsk_prot->obj_size, 0,
3907					   SLAB_ACCOUNT | prot->slab_flags,
3908					   NULL);
3909
3910	if (!rsk_prot->slab) {
3911		pr_crit("%s: Can't create request sock SLAB cache!\n",
3912			prot->name);
3913		return -ENOMEM;
3914	}
3915	return 0;
3916}
3917
/*
 * proto_register - register a protocol with the socket layer
 * @prot: protocol to register
 * @alloc_slab: non-zero to create the sock, request sock and timewait
 *              sock slab caches for @prot
 *
 * Return: 0 on success; -EINVAL when memory_allocated is set without
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS when any of the slab caches
 * cannot be created; otherwise the error from assign_proto_idx().
 * On failure every cache created here is destroyed again.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	/* Error reported for all slab-creation failures below. */
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		/* The helpers may fail half-way (name allocated, cache
		 * not); the matching *_cleanup() handles that state.
		 */
		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	/* proto_list and the index assignment are both done under
	 * proto_list_mutex.
	 */
	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

	/* Unwind labels fall through; the alloc_slab guards make them
	 * no-ops when no caches were requested.
	 */
out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);
3975
/*
 * proto_unregister - undo proto_register()
 * @prot: protocol to unregister
 *
 * Releases the in-use index and unlinks @prot from proto_list under
 * proto_list_mutex, then destroys the sock, request sock and timewait
 * sock slab caches.
 */
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	/* Both helpers tolerate a NULL ops pointer. */
	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
3990
3991int sock_load_diag_module(int family, int protocol)
3992{
3993	if (!protocol) {
3994		if (!sock_is_registered(family))
3995			return -ENOENT;
3996
3997		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3998				      NETLINK_SOCK_DIAG, family);
3999	}
4000
4001#ifdef CONFIG_INET
4002	if (family == AF_INET &&
4003	    protocol != IPPROTO_RAW &&
4004	    protocol < MAX_INET_PROTOS &&
4005	    !rcu_access_pointer(inet_protos[protocol]))
4006		return -ENOENT;
4007#endif
4008
4009	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4010			      NETLINK_SOCK_DIAG, family, protocol);
4011}
4012EXPORT_SYMBOL(sock_load_diag_module);
4013
4014#ifdef CONFIG_PROC_FS
/* seq_file ->start: takes proto_list_mutex (dropped in proto_seq_stop())
 * and positions the iterator at *pos, counting the list head itself as
 * the first element.
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}
4021
/* seq_file ->next: advance to the element following @v in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
4026
/* seq_file ->stop: release the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
4032
/* Map a method pointer to the flag shown in /proc/net/protocols:
 * 'y' when the pointer is set, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
4037static long sock_prot_memory_allocated(struct proto *proto)
4038{
4039	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4040}
4041
4042static const char *sock_prot_memory_pressure(struct proto *proto)
4043{
4044	return proto->memory_pressure != NULL ?
4045	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4046}
4047
/*
 * Emit one /proc/net/protocols row for @proto: name, object size, sockets
 * in use in the seq_file's namespace, memory allocated, memory pressure,
 * max header, whether a slab cache exists, owning module, followed by one
 * y/n flag per protocol method in the order of the header printed by
 * proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
4080
4081static int proto_seq_show(struct seq_file *seq, void *v)
4082{
4083	if (v == &proto_list)
4084		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4085			   "protocol",
4086			   "size",
4087			   "sockets",
4088			   "memory",
4089			   "press",
4090			   "maxhdr",
4091			   "slab",
4092			   "module",
4093			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4094	else
4095		proto_seq_printf(seq, list_entry(v, struct proto, node));
4096	return 0;
4097}
4098
/* seq_file iterator over proto_list, used for the "protocols" proc entry
 * created in proto_init_net().
 */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
4105
4106static __net_init int proto_init_net(struct net *net)
4107{
4108	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4109			sizeof(struct seq_net_private)))
4110		return -ENOMEM;
4111
4112	return 0;
4113}
4114
/* Per-namespace exit: remove the "protocols" proc entry created by
 * proto_init_net().
 */
static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}
4119
4120
/* Per-namespace hooks managing the "protocols" proc entry; registered by
 * proto_init().
 */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
4125
/* Register proto_net_ops at subsys_initcall time; returns the result of
 * register_pernet_subsys().
 */
static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);
4132
4133#endif /* PROC_FS */
4134
4135#ifdef CONFIG_NET_RX_BUSY_POLL
4136bool sk_busy_loop_end(void *p, unsigned long start_time)
4137{
4138	struct sock *sk = p;
4139
4140	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4141		return true;
4142
4143	if (sk_is_udp(sk) &&
4144	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4145		return true;
4146
4147	return sk_busy_loop_timeout(sk, start_time);
4148}
4149EXPORT_SYMBOL(sk_busy_loop_end);
4150#endif /* CONFIG_NET_RX_BUSY_POLL */
4151
4152int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4153{
4154	if (!sk->sk_prot->bind_add)
4155		return -EOPNOTSUPP;
4156	return sk->sk_prot->bind_add(sk, addr, addr_len);
4157}
4158EXPORT_SYMBOL(sock_bind_add);
4159
4160/* Copy 'size' bytes from userspace and return `size` back to userspace */
4161int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4162		     void __user *arg, void *karg, size_t size)
4163{
4164	int ret;
4165
4166	if (copy_from_user(karg, arg, size))
4167		return -EFAULT;
4168
4169	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4170	if (ret)
4171		return ret;
4172
4173	if (copy_to_user(arg, karg, size))
4174		return -EFAULT;
4175
4176	return 0;
4177}
4178EXPORT_SYMBOL(sock_ioctl_inout);
4179
4180/* This is the most common ioctl prep function, where the result (4 bytes) is
4181 * copied back to userspace if the ioctl() returns successfully. No input is
4182 * copied from userspace as input argument.
4183 */
4184static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4185{
4186	int ret, karg = 0;
4187
4188	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4189	if (ret)
4190		return ret;
4191
4192	return put_user(karg, (int __user *)arg);
4193}
4194
4195/* A wrapper around sock ioctls, which copies the data from userspace
4196 * (depending on the protocol/ioctl), and copies back the result to userspace.
4197 * The main motivation for this function is to pass kernel memory to the
4198 * protocol ioctl callbacks, instead of userspace memory.
4199 */
4200int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4201{
4202	int rc = 1;
4203
4204	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4205		rc = ipmr_sk_ioctl(sk, cmd, arg);
4206	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4207		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4208	else if (sk_is_phonet(sk))
4209		rc = phonet_sk_ioctl(sk, cmd, arg);
4210
4211	/* If ioctl was processed, returns its value */
4212	if (rc <= 0)
4213		return rc;
4214
4215	/* Otherwise call the default handler */
4216	return sock_ioctl_out(sk, cmd, arg);
4217}
4218EXPORT_SYMBOL(sk_ioctl);
4219