// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and that the current process has it over the network
 * namespace the socket is a member of.
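 *
 * Return: %true only if both checks pass; this simply calls sk_ns_capable()
 * with the user namespace owning the socket's network namespace.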
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters.
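 *
 * The values below are only the boot-time defaults; they are exposed as the
 * corresponding net.core.* sysctls (registered in net/core/sysctl_net_core.c)
 * and can be changed at run time by the administrator.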
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}

static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
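		/* SO_{SND,RCV}TIMEO_OLD callers pass the legacy struct
		 * timeval layout; copy it below and convert it to the
		 * common __kernel_sock_timeval representation.
		 */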
if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 385 return -EFAULT; 386 tv.tv_sec = old_tv.tv_sec; 387 tv.tv_usec = old_tv.tv_usec; 388 } else { 389 if (optlen < sizeof(tv)) 390 return -EINVAL; 391 if (copy_from_sockptr(&tv, optval, sizeof(tv))) 392 return -EFAULT; 393 } 394 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 395 return -EDOM; 396 397 if (tv.tv_sec < 0) { 398 static int warned __read_mostly; 399 400 *timeo_p = 0; 401 if (warned < 10 && net_ratelimit()) { 402 warned++; 403 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 404 __func__, current->comm, task_pid_nr(current)); 405 } 406 return 0; 407 } 408 *timeo_p = MAX_SCHEDULE_TIMEOUT; 409 if (tv.tv_sec == 0 && tv.tv_usec == 0) 410 return 0; 411 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) 412 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); 413 return 0; 414} 415 416static bool sock_needs_netstamp(const struct sock *sk) 417{ 418 switch (sk->sk_family) { 419 case AF_UNSPEC: 420 case AF_UNIX: 421 return false; 422 default: 423 return true; 424 } 425} 426 427static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 428{ 429 if (sk->sk_flags & flags) { 430 sk->sk_flags &= ~flags; 431 if (sock_needs_netstamp(sk) && 432 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 433 net_disable_timestamp(); 434 } 435} 436 437 438int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 439{ 440 unsigned long flags; 441 struct sk_buff_head *list = &sk->sk_receive_queue; 442 443 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 444 atomic_inc(&sk->sk_drops); 445 trace_sock_rcvqueue_full(sk, skb); 446 return -ENOMEM; 447 } 448 449 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 450 atomic_inc(&sk->sk_drops); 451 return -ENOBUFS; 452 } 453 454 skb->dev = NULL; 455 skb_set_owner_r(skb, sk); 456 457 /* we escape from rcu protected region, make sure we dont leak 458 * a norefcounted dst 459 */ 460 skb_dst_force(skb); 461 462 spin_lock_irqsave(&list->lock, flags); 463 sock_skb_set_dropcount(sk, skb); 464 __skb_queue_tail(list, skb); 465 spin_unlock_irqrestore(&list->lock, flags); 466 467 if (!sock_flag(sk, SOCK_DEAD)) 468 sk->sk_data_ready(sk); 469 return 0; 470} 471EXPORT_SYMBOL(__sock_queue_rcv_skb); 472 473int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 474{ 475 int err; 476 477 err = sk_filter(sk, skb); 478 if (err) 479 return err; 480 481 return __sock_queue_rcv_skb(sk, skb); 482} 483EXPORT_SYMBOL(sock_queue_rcv_skb); 484 485int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 486 const int nested, unsigned int trim_cap, bool refcounted) 487{ 488 int rc = NET_RX_SUCCESS; 489 490 if (sk_filter_trim_cap(sk, skb, trim_cap)) 491 goto discard_and_relse; 492 493 skb->dev = NULL; 494 495 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 496 atomic_inc(&sk->sk_drops); 497 goto discard_and_relse; 498 } 499 if (nested) 500 bh_lock_sock_nested(sk); 501 else 502 bh_lock_sock(sk); 503 if (!sock_owned_by_user(sk)) { 504 /* 505 * trylock + unlock semantics: 506 */ 507 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 508 509 rc = sk_backlog_rcv(sk, skb); 510 511 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 512 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 513 bh_unlock_sock(sk); 514 atomic_inc(&sk->sk_drops); 515 goto discard_and_relse; 516 } 517 518 bh_unlock_sock(sk); 519out: 520 if (refcounted) 521 sock_put(sk); 522 return rc; 523discard_and_relse: 524 kfree_skb(skb); 525 goto out; 526} 527EXPORT_SYMBOL(__sk_receive_skb); 528 529struct dst_entry 
*__sk_dst_check(struct sock *sk, u32 cookie) 530{ 531 struct dst_entry *dst = __sk_dst_get(sk); 532 533 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 534 sk_tx_queue_clear(sk); 535 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 536 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 537 dst_release(dst); 538 return NULL; 539 } 540 541 return dst; 542} 543EXPORT_SYMBOL(__sk_dst_check); 544 545struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 546{ 547 struct dst_entry *dst = sk_dst_get(sk); 548 549 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 550 sk_dst_reset(sk); 551 dst_release(dst); 552 return NULL; 553 } 554 555 return dst; 556} 557EXPORT_SYMBOL(sk_dst_check); 558 559static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 560{ 561 int ret = -ENOPROTOOPT; 562#ifdef CONFIG_NETDEVICES 563 struct net *net = sock_net(sk); 564 565 /* Sorry... */ 566 ret = -EPERM; 567 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 568 goto out; 569 570 ret = -EINVAL; 571 if (ifindex < 0) 572 goto out; 573 574 sk->sk_bound_dev_if = ifindex; 575 if (sk->sk_prot->rehash) 576 sk->sk_prot->rehash(sk); 577 sk_dst_reset(sk); 578 579 ret = 0; 580 581out: 582#endif 583 584 return ret; 585} 586 587int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 588{ 589 int ret; 590 591 if (lock_sk) 592 lock_sock(sk); 593 ret = sock_bindtoindex_locked(sk, ifindex); 594 if (lock_sk) 595 release_sock(sk); 596 597 return ret; 598} 599EXPORT_SYMBOL(sock_bindtoindex); 600 601static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 602{ 603 int ret = -ENOPROTOOPT; 604#ifdef CONFIG_NETDEVICES 605 struct net *net = sock_net(sk); 606 char devname[IFNAMSIZ]; 607 int index; 608 609 ret = -EINVAL; 610 if (optlen < 0) 611 goto out; 612 613 /* Bind this socket to a particular device like "eth0", 614 * as specified in the passed interface name. If the 615 * name is "" or the option length is zero the socket 616 * is not bound. 617 */ 618 if (optlen > IFNAMSIZ - 1) 619 optlen = IFNAMSIZ - 1; 620 memset(devname, 0, sizeof(devname)); 621 622 ret = -EFAULT; 623 if (copy_from_sockptr(devname, optval, optlen)) 624 goto out; 625 626 index = 0; 627 if (devname[0] != '\0') { 628 struct net_device *dev; 629 630 rcu_read_lock(); 631 dev = dev_get_by_name_rcu(net, devname); 632 if (dev) 633 index = dev->ifindex; 634 rcu_read_unlock(); 635 ret = -ENODEV; 636 if (!dev) 637 goto out; 638 } 639 640 return sock_bindtoindex(sk, index, true); 641out: 642#endif 643 644 return ret; 645} 646 647static int sock_getbindtodevice(struct sock *sk, char __user *optval, 648 int __user *optlen, int len) 649{ 650 int ret = -ENOPROTOOPT; 651#ifdef CONFIG_NETDEVICES 652 struct net *net = sock_net(sk); 653 char devname[IFNAMSIZ]; 654 655 if (sk->sk_bound_dev_if == 0) { 656 len = 0; 657 goto zero; 658 } 659 660 ret = -EINVAL; 661 if (len < IFNAMSIZ) 662 goto out; 663 664 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 665 if (ret) 666 goto out; 667 668 len = strlen(devname) + 1; 669 670 ret = -EFAULT; 671 if (copy_to_user(optval, devname, len)) 672 goto out; 673 674zero: 675 ret = -EFAULT; 676 if (put_user(len, optlen)) 677 goto out; 678 679 ret = 0; 680 681out: 682#endif 683 684 return ret; 685} 686 687bool sk_mc_loop(struct sock *sk) 688{ 689 if (dev_recursion_level()) 690 return false; 691 if (!sk) 692 return true; 693 /* IPV6_ADDRFORM can change sk->sk_family under us. 
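 * Hence the READ_ONCE() in the switch below: a racing setsockopt() may
 * rewrite the field, and all we need here is a consistent snapshot of it.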
*/ 694 switch (READ_ONCE(sk->sk_family)) { 695 case AF_INET: 696 return inet_sk(sk)->mc_loop; 697#if IS_ENABLED(CONFIG_IPV6) 698 case AF_INET6: 699 return inet6_sk(sk)->mc_loop; 700#endif 701 } 702 WARN_ON_ONCE(1); 703 return true; 704} 705EXPORT_SYMBOL(sk_mc_loop); 706 707void sock_set_reuseaddr(struct sock *sk) 708{ 709 lock_sock(sk); 710 sk->sk_reuse = SK_CAN_REUSE; 711 release_sock(sk); 712} 713EXPORT_SYMBOL(sock_set_reuseaddr); 714 715void sock_set_reuseport(struct sock *sk) 716{ 717 lock_sock(sk); 718 sk->sk_reuseport = true; 719 release_sock(sk); 720} 721EXPORT_SYMBOL(sock_set_reuseport); 722 723void sock_no_linger(struct sock *sk) 724{ 725 lock_sock(sk); 726 sk->sk_lingertime = 0; 727 sock_set_flag(sk, SOCK_LINGER); 728 release_sock(sk); 729} 730EXPORT_SYMBOL(sock_no_linger); 731 732void sock_set_priority(struct sock *sk, u32 priority) 733{ 734 lock_sock(sk); 735 sk->sk_priority = priority; 736 release_sock(sk); 737} 738EXPORT_SYMBOL(sock_set_priority); 739 740void sock_set_sndtimeo(struct sock *sk, s64 secs) 741{ 742 lock_sock(sk); 743 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 744 sk->sk_sndtimeo = secs * HZ; 745 else 746 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 747 release_sock(sk); 748} 749EXPORT_SYMBOL(sock_set_sndtimeo); 750 751static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 752{ 753 if (val) { 754 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 755 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 756 sock_set_flag(sk, SOCK_RCVTSTAMP); 757 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 758 } else { 759 sock_reset_flag(sk, SOCK_RCVTSTAMP); 760 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 761 } 762} 763 764void sock_enable_timestamps(struct sock *sk) 765{ 766 lock_sock(sk); 767 __sock_set_timestamps(sk, true, false, true); 768 release_sock(sk); 769} 770EXPORT_SYMBOL(sock_enable_timestamps); 771 772void sock_set_keepalive(struct sock *sk) 773{ 774 lock_sock(sk); 775 if (sk->sk_prot->keepalive) 776 sk->sk_prot->keepalive(sk, true); 777 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 778 release_sock(sk); 779} 780EXPORT_SYMBOL(sock_set_keepalive); 781 782static void __sock_set_rcvbuf(struct sock *sk, int val) 783{ 784 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 785 * as a negative value. 786 */ 787 val = min_t(int, val, INT_MAX / 2); 788 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 789 790 /* We double it on the way in to account for "struct sk_buff" etc. 791 * overhead. Applications assume that the SO_RCVBUF setting they make 792 * will allow that much actual data to be received on that socket. 793 * 794 * Applications are unaware that "struct sk_buff" and other overheads 795 * allocate from the receive buffer during socket buffer allocation. 796 * 797 * And after considering the possible alternatives, returning the value 798 * we actually used in getsockopt is the most desirable behavior. 
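 *
 * For example, a setsockopt(SO_RCVBUF) request of 4096 bytes stores 8192
 * in sk_rcvbuf below, and a later getsockopt(SO_RCVBUF) reports 8192
 * (subject to the SOCK_MIN_RCVBUF floor applied here).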
799 */ 800 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 801} 802 803void sock_set_rcvbuf(struct sock *sk, int val) 804{ 805 lock_sock(sk); 806 __sock_set_rcvbuf(sk, val); 807 release_sock(sk); 808} 809EXPORT_SYMBOL(sock_set_rcvbuf); 810 811static void __sock_set_mark(struct sock *sk, u32 val) 812{ 813 if (val != sk->sk_mark) { 814 sk->sk_mark = val; 815 sk_dst_reset(sk); 816 } 817} 818 819void sock_set_mark(struct sock *sk, u32 val) 820{ 821 lock_sock(sk); 822 __sock_set_mark(sk, val); 823 release_sock(sk); 824} 825EXPORT_SYMBOL(sock_set_mark); 826 827/* 828 * This is meant for all protocols to use and covers goings on 829 * at the socket level. Everything here is generic. 830 */ 831 832int sock_setsockopt(struct socket *sock, int level, int optname, 833 sockptr_t optval, unsigned int optlen) 834{ 835 struct sock_txtime sk_txtime; 836 struct sock *sk = sock->sk; 837 int val; 838 int valbool; 839 struct linger ling; 840 int ret = 0; 841 842 /* 843 * Options without arguments 844 */ 845 846 if (optname == SO_BINDTODEVICE) 847 return sock_setbindtodevice(sk, optval, optlen); 848 849 if (optlen < sizeof(int)) 850 return -EINVAL; 851 852 if (copy_from_sockptr(&val, optval, sizeof(val))) 853 return -EFAULT; 854 855 valbool = val ? 1 : 0; 856 857 lock_sock(sk); 858 859 switch (optname) { 860 case SO_DEBUG: 861 if (val && !capable(CAP_NET_ADMIN)) 862 ret = -EACCES; 863 else 864 sock_valbool_flag(sk, SOCK_DBG, valbool); 865 break; 866 case SO_REUSEADDR: 867 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 868 break; 869 case SO_REUSEPORT: 870 sk->sk_reuseport = valbool; 871 break; 872 case SO_TYPE: 873 case SO_PROTOCOL: 874 case SO_DOMAIN: 875 case SO_ERROR: 876 ret = -ENOPROTOOPT; 877 break; 878 case SO_DONTROUTE: 879 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 880 sk_dst_reset(sk); 881 break; 882 case SO_BROADCAST: 883 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 884 break; 885 case SO_SNDBUF: 886 /* Don't error on this BSD doesn't and if you think 887 * about it this is right. Otherwise apps have to 888 * play 'guess the biggest size' games. RCVBUF/SNDBUF 889 * are treated in BSD as hints 890 */ 891 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 892set_sndbuf: 893 /* Ensure val * 2 fits into an int, to prevent max_t() 894 * from treating it as a negative value. 895 */ 896 val = min_t(int, val, INT_MAX / 2); 897 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 898 WRITE_ONCE(sk->sk_sndbuf, 899 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 900 /* Wake up sending tasks if we upped the value. */ 901 sk->sk_write_space(sk); 902 break; 903 904 case SO_SNDBUFFORCE: 905 if (!capable(CAP_NET_ADMIN)) { 906 ret = -EPERM; 907 break; 908 } 909 910 /* No negative values (to prevent underflow, as val will be 911 * multiplied by 2). 912 */ 913 if (val < 0) 914 val = 0; 915 goto set_sndbuf; 916 917 case SO_RCVBUF: 918 /* Don't error on this BSD doesn't and if you think 919 * about it this is right. Otherwise apps have to 920 * play 'guess the biggest size' games. RCVBUF/SNDBUF 921 * are treated in BSD as hints 922 */ 923 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 924 break; 925 926 case SO_RCVBUFFORCE: 927 if (!capable(CAP_NET_ADMIN)) { 928 ret = -EPERM; 929 break; 930 } 931 932 /* No negative values (to prevent underflow, as val will be 933 * multiplied by 2). 
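	 * A negative request therefore ends up as zero here, which the
	 * doubling and SOCK_MIN_RCVBUF floor in __sock_set_rcvbuf() turn
	 * into the minimum receive buffer size.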
934 */ 935 __sock_set_rcvbuf(sk, max(val, 0)); 936 break; 937 938 case SO_KEEPALIVE: 939 if (sk->sk_prot->keepalive) 940 sk->sk_prot->keepalive(sk, valbool); 941 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 942 break; 943 944 case SO_OOBINLINE: 945 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 946 break; 947 948 case SO_NO_CHECK: 949 sk->sk_no_check_tx = valbool; 950 break; 951 952 case SO_PRIORITY: 953 if ((val >= 0 && val <= 6) || 954 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 955 sk->sk_priority = val; 956 else 957 ret = -EPERM; 958 break; 959 960 case SO_LINGER: 961 if (optlen < sizeof(ling)) { 962 ret = -EINVAL; /* 1003.1g */ 963 break; 964 } 965 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 966 ret = -EFAULT; 967 break; 968 } 969 if (!ling.l_onoff) 970 sock_reset_flag(sk, SOCK_LINGER); 971 else { 972#if (BITS_PER_LONG == 32) 973 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 974 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 975 else 976#endif 977 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 978 sock_set_flag(sk, SOCK_LINGER); 979 } 980 break; 981 982 case SO_BSDCOMPAT: 983 break; 984 985 case SO_PASSCRED: 986 if (valbool) 987 set_bit(SOCK_PASSCRED, &sock->flags); 988 else 989 clear_bit(SOCK_PASSCRED, &sock->flags); 990 break; 991 992 case SO_TIMESTAMP_OLD: 993 __sock_set_timestamps(sk, valbool, false, false); 994 break; 995 case SO_TIMESTAMP_NEW: 996 __sock_set_timestamps(sk, valbool, true, false); 997 break; 998 case SO_TIMESTAMPNS_OLD: 999 __sock_set_timestamps(sk, valbool, false, true); 1000 break; 1001 case SO_TIMESTAMPNS_NEW: 1002 __sock_set_timestamps(sk, valbool, true, true); 1003 break; 1004 case SO_TIMESTAMPING_NEW: 1005 case SO_TIMESTAMPING_OLD: 1006 if (val & ~SOF_TIMESTAMPING_MASK) { 1007 ret = -EINVAL; 1008 break; 1009 } 1010 1011 if (val & SOF_TIMESTAMPING_OPT_ID && 1012 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 1013 if (sk->sk_protocol == IPPROTO_TCP && 1014 sk->sk_type == SOCK_STREAM) { 1015 if ((1 << sk->sk_state) & 1016 (TCPF_CLOSE | TCPF_LISTEN)) { 1017 ret = -EINVAL; 1018 break; 1019 } 1020 sk->sk_tskey = tcp_sk(sk)->snd_una; 1021 } else { 1022 sk->sk_tskey = 0; 1023 } 1024 } 1025 1026 if (val & SOF_TIMESTAMPING_OPT_STATS && 1027 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { 1028 ret = -EINVAL; 1029 break; 1030 } 1031 1032 sk->sk_tsflags = val; 1033 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 1034 1035 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 1036 sock_enable_timestamp(sk, 1037 SOCK_TIMESTAMPING_RX_SOFTWARE); 1038 else 1039 sock_disable_timestamp(sk, 1040 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 1041 break; 1042 1043 case SO_RCVLOWAT: 1044 if (val < 0) 1045 val = INT_MAX; 1046 if (sock->ops->set_rcvlowat) 1047 ret = sock->ops->set_rcvlowat(sk, val); 1048 else 1049 WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); 1050 break; 1051 1052 case SO_RCVTIMEO_OLD: 1053 case SO_RCVTIMEO_NEW: 1054 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1055 optlen, optname == SO_RCVTIMEO_OLD); 1056 break; 1057 1058 case SO_SNDTIMEO_OLD: 1059 case SO_SNDTIMEO_NEW: 1060 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1061 optlen, optname == SO_SNDTIMEO_OLD); 1062 break; 1063 1064 case SO_ATTACH_FILTER: { 1065 struct sock_fprog fprog; 1066 1067 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1068 if (!ret) 1069 ret = sk_attach_filter(&fprog, sk); 1070 break; 1071 } 1072 case SO_ATTACH_BPF: 1073 ret = -EINVAL; 1074 if (optlen == sizeof(u32)) { 1075 u32 ufd; 1076 1077 ret = -EFAULT; 1078 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1079 break; 1080 1081 ret = sk_attach_bpf(ufd, sk); 1082 } 1083 break; 1084 1085 case SO_ATTACH_REUSEPORT_CBPF: { 1086 struct sock_fprog fprog; 1087 1088 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1089 if (!ret) 1090 ret = sk_reuseport_attach_filter(&fprog, sk); 1091 break; 1092 } 1093 case SO_ATTACH_REUSEPORT_EBPF: 1094 ret = -EINVAL; 1095 if (optlen == sizeof(u32)) { 1096 u32 ufd; 1097 1098 ret = -EFAULT; 1099 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1100 break; 1101 1102 ret = sk_reuseport_attach_bpf(ufd, sk); 1103 } 1104 break; 1105 1106 case SO_DETACH_REUSEPORT_BPF: 1107 ret = reuseport_detach_prog(sk); 1108 break; 1109 1110 case SO_DETACH_FILTER: 1111 ret = sk_detach_filter(sk); 1112 break; 1113 1114 case SO_LOCK_FILTER: 1115 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1116 ret = -EPERM; 1117 else 1118 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1119 break; 1120 1121 case SO_PASSSEC: 1122 if (valbool) 1123 set_bit(SOCK_PASSSEC, &sock->flags); 1124 else 1125 clear_bit(SOCK_PASSSEC, &sock->flags); 1126 break; 1127 case SO_MARK: 1128 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1129 ret = -EPERM; 1130 break; 1131 } 1132 1133 __sock_set_mark(sk, val); 1134 break; 1135 1136 case SO_RXQ_OVFL: 1137 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1138 break; 1139 1140 case SO_WIFI_STATUS: 1141 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1142 break; 1143 1144 case SO_PEEK_OFF: 1145 if (sock->ops->set_peek_off) 1146 ret = sock->ops->set_peek_off(sk, val); 1147 else 1148 ret = -EOPNOTSUPP; 1149 break; 1150 1151 case SO_NOFCS: 1152 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1153 break; 1154 1155 case SO_SELECT_ERR_QUEUE: 1156 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1157 break; 1158 1159#ifdef CONFIG_NET_RX_BUSY_POLL 1160 case SO_BUSY_POLL: 1161 /* allow unprivileged users to decrease the value */ 1162 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1163 ret = -EPERM; 1164 else { 1165 if (val < 0) 1166 ret = -EINVAL; 1167 else 1168 WRITE_ONCE(sk->sk_ll_usec, val); 1169 } 1170 break; 1171#endif 1172 1173 case SO_MAX_PACING_RATE: 1174 { 1175 unsigned long ulval = (val == ~0U) ? 
~0UL : (unsigned int)val; 1176 1177 if (sizeof(ulval) != sizeof(val) && 1178 optlen >= sizeof(ulval) && 1179 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1180 ret = -EFAULT; 1181 break; 1182 } 1183 if (ulval != ~0UL) 1184 cmpxchg(&sk->sk_pacing_status, 1185 SK_PACING_NONE, 1186 SK_PACING_NEEDED); 1187 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1188 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1189 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1190 break; 1191 } 1192 case SO_INCOMING_CPU: 1193 WRITE_ONCE(sk->sk_incoming_cpu, val); 1194 break; 1195 1196 case SO_CNX_ADVICE: 1197 if (val == 1) 1198 dst_negative_advice(sk); 1199 break; 1200 1201 case SO_ZEROCOPY: 1202 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1203 if (!((sk->sk_type == SOCK_STREAM && 1204 sk->sk_protocol == IPPROTO_TCP) || 1205 (sk->sk_type == SOCK_DGRAM && 1206 sk->sk_protocol == IPPROTO_UDP))) 1207 ret = -ENOTSUPP; 1208 } else if (sk->sk_family != PF_RDS) { 1209 ret = -ENOTSUPP; 1210 } 1211 if (!ret) { 1212 if (val < 0 || val > 1) 1213 ret = -EINVAL; 1214 else 1215 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1216 } 1217 break; 1218 1219 case SO_TXTIME: 1220 if (optlen != sizeof(struct sock_txtime)) { 1221 ret = -EINVAL; 1222 break; 1223 } else if (copy_from_sockptr(&sk_txtime, optval, 1224 sizeof(struct sock_txtime))) { 1225 ret = -EFAULT; 1226 break; 1227 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1228 ret = -EINVAL; 1229 break; 1230 } 1231 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1232 * scheduler has enough safe guards. 1233 */ 1234 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1235 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1236 ret = -EPERM; 1237 break; 1238 } 1239 sock_valbool_flag(sk, SOCK_TXTIME, true); 1240 sk->sk_clockid = sk_txtime.clockid; 1241 sk->sk_txtime_deadline_mode = 1242 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1243 sk->sk_txtime_report_errors = 1244 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1245 break; 1246 1247 case SO_BINDTOIFINDEX: 1248 ret = sock_bindtoindex_locked(sk, val); 1249 break; 1250 1251 default: 1252 ret = -ENOPROTOOPT; 1253 break; 1254 } 1255 release_sock(sk); 1256 return ret; 1257} 1258EXPORT_SYMBOL(sock_setsockopt); 1259 1260static const struct cred *sk_get_peer_cred(struct sock *sk) 1261{ 1262 const struct cred *cred; 1263 1264 spin_lock(&sk->sk_peer_lock); 1265 cred = get_cred(sk->sk_peer_cred); 1266 spin_unlock(&sk->sk_peer_lock); 1267 1268 return cred; 1269} 1270 1271static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1272 struct ucred *ucred) 1273{ 1274 ucred->pid = pid_vnr(pid); 1275 ucred->uid = ucred->gid = -1; 1276 if (cred) { 1277 struct user_namespace *current_ns = current_user_ns(); 1278 1279 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1280 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1281 } 1282} 1283 1284static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1285{ 1286 struct user_namespace *user_ns = current_user_ns(); 1287 int i; 1288 1289 for (i = 0; i < src->ngroups; i++) 1290 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1291 return -EFAULT; 1292 1293 return 0; 1294} 1295 1296int sock_getsockopt(struct socket *sock, int level, int optname, 1297 char __user *optval, int __user *optlen) 1298{ 1299 struct sock *sk = sock->sk; 1300 1301 union { 1302 int val; 1303 u64 val64; 1304 unsigned long ulval; 1305 struct linger ling; 1306 struct old_timeval32 tm32; 1307 struct __kernel_old_timeval tm; 1308 struct 
__kernel_sock_timeval stm; 1309 struct sock_txtime txtime; 1310 } v; 1311 1312 int lv = sizeof(int); 1313 int len; 1314 1315 if (get_user(len, optlen)) 1316 return -EFAULT; 1317 if (len < 0) 1318 return -EINVAL; 1319 1320 memset(&v, 0, sizeof(v)); 1321 1322 switch (optname) { 1323 case SO_DEBUG: 1324 v.val = sock_flag(sk, SOCK_DBG); 1325 break; 1326 1327 case SO_DONTROUTE: 1328 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1329 break; 1330 1331 case SO_BROADCAST: 1332 v.val = sock_flag(sk, SOCK_BROADCAST); 1333 break; 1334 1335 case SO_SNDBUF: 1336 v.val = READ_ONCE(sk->sk_sndbuf); 1337 break; 1338 1339 case SO_RCVBUF: 1340 v.val = READ_ONCE(sk->sk_rcvbuf); 1341 break; 1342 1343 case SO_REUSEADDR: 1344 v.val = sk->sk_reuse; 1345 break; 1346 1347 case SO_REUSEPORT: 1348 v.val = sk->sk_reuseport; 1349 break; 1350 1351 case SO_KEEPALIVE: 1352 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1353 break; 1354 1355 case SO_TYPE: 1356 v.val = sk->sk_type; 1357 break; 1358 1359 case SO_PROTOCOL: 1360 v.val = sk->sk_protocol; 1361 break; 1362 1363 case SO_DOMAIN: 1364 v.val = sk->sk_family; 1365 break; 1366 1367 case SO_ERROR: 1368 v.val = -sock_error(sk); 1369 if (v.val == 0) 1370 v.val = xchg(&sk->sk_err_soft, 0); 1371 break; 1372 1373 case SO_OOBINLINE: 1374 v.val = sock_flag(sk, SOCK_URGINLINE); 1375 break; 1376 1377 case SO_NO_CHECK: 1378 v.val = sk->sk_no_check_tx; 1379 break; 1380 1381 case SO_PRIORITY: 1382 v.val = sk->sk_priority; 1383 break; 1384 1385 case SO_LINGER: 1386 lv = sizeof(v.ling); 1387 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1388 v.ling.l_linger = sk->sk_lingertime / HZ; 1389 break; 1390 1391 case SO_BSDCOMPAT: 1392 break; 1393 1394 case SO_TIMESTAMP_OLD: 1395 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1396 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1397 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1398 break; 1399 1400 case SO_TIMESTAMPNS_OLD: 1401 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1402 break; 1403 1404 case SO_TIMESTAMP_NEW: 1405 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1406 break; 1407 1408 case SO_TIMESTAMPNS_NEW: 1409 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1410 break; 1411 1412 case SO_TIMESTAMPING_OLD: 1413 v.val = sk->sk_tsflags; 1414 break; 1415 1416 case SO_RCVTIMEO_OLD: 1417 case SO_RCVTIMEO_NEW: 1418 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1419 break; 1420 1421 case SO_SNDTIMEO_OLD: 1422 case SO_SNDTIMEO_NEW: 1423 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1424 break; 1425 1426 case SO_RCVLOWAT: 1427 v.val = READ_ONCE(sk->sk_rcvlowat); 1428 break; 1429 1430 case SO_SNDLOWAT: 1431 v.val = 1; 1432 break; 1433 1434 case SO_PASSCRED: 1435 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1436 break; 1437 1438 case SO_PEERCRED: 1439 { 1440 struct ucred peercred; 1441 if (len > sizeof(peercred)) 1442 len = sizeof(peercred); 1443 1444 spin_lock(&sk->sk_peer_lock); 1445 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1446 spin_unlock(&sk->sk_peer_lock); 1447 1448 if (copy_to_user(optval, &peercred, len)) 1449 return -EFAULT; 1450 goto lenout; 1451 } 1452 1453 case SO_PEERGROUPS: 1454 { 1455 const struct cred *cred; 1456 int ret, n; 1457 1458 cred = sk_get_peer_cred(sk); 1459 if (!cred) 1460 return -ENODATA; 1461 1462 n = cred->group_info->ngroups; 1463 if (len < n * sizeof(gid_t)) { 1464 len = n * sizeof(gid_t); 1465 put_cred(cred); 1466 return put_user(len, optlen) ? 
-EFAULT : -ERANGE; 1467 } 1468 len = n * sizeof(gid_t); 1469 1470 ret = groups_to_user((gid_t __user *)optval, cred->group_info); 1471 put_cred(cred); 1472 if (ret) 1473 return ret; 1474 goto lenout; 1475 } 1476 1477 case SO_PEERNAME: 1478 { 1479 char address[128]; 1480 1481 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1482 if (lv < 0) 1483 return -ENOTCONN; 1484 if (lv < len) 1485 return -EINVAL; 1486 if (copy_to_user(optval, address, len)) 1487 return -EFAULT; 1488 goto lenout; 1489 } 1490 1491 /* Dubious BSD thing... Probably nobody even uses it, but 1492 * the UNIX standard wants it for whatever reason... -DaveM 1493 */ 1494 case SO_ACCEPTCONN: 1495 v.val = sk->sk_state == TCP_LISTEN; 1496 break; 1497 1498 case SO_PASSSEC: 1499 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1500 break; 1501 1502 case SO_PEERSEC: 1503 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1504 1505 case SO_MARK: 1506 v.val = sk->sk_mark; 1507 break; 1508 1509 case SO_RXQ_OVFL: 1510 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1511 break; 1512 1513 case SO_WIFI_STATUS: 1514 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1515 break; 1516 1517 case SO_PEEK_OFF: 1518 if (!sock->ops->set_peek_off) 1519 return -EOPNOTSUPP; 1520 1521 v.val = READ_ONCE(sk->sk_peek_off); 1522 break; 1523 case SO_NOFCS: 1524 v.val = sock_flag(sk, SOCK_NOFCS); 1525 break; 1526 1527 case SO_BINDTODEVICE: 1528 return sock_getbindtodevice(sk, optval, optlen, len); 1529 1530 case SO_GET_FILTER: 1531 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1532 if (len < 0) 1533 return len; 1534 1535 goto lenout; 1536 1537 case SO_LOCK_FILTER: 1538 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1539 break; 1540 1541 case SO_BPF_EXTENSIONS: 1542 v.val = bpf_tell_extensions(); 1543 break; 1544 1545 case SO_SELECT_ERR_QUEUE: 1546 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1547 break; 1548 1549#ifdef CONFIG_NET_RX_BUSY_POLL 1550 case SO_BUSY_POLL: 1551 v.val = READ_ONCE(sk->sk_ll_usec); 1552 break; 1553#endif 1554 1555 case SO_MAX_PACING_RATE: 1556 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 1557 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1558 lv = sizeof(v.ulval); 1559 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 1560 } else { 1561 /* 32bit version */ 1562 v.val = min_t(unsigned long, ~0U, 1563 READ_ONCE(sk->sk_max_pacing_rate)); 1564 } 1565 break; 1566 1567 case SO_INCOMING_CPU: 1568 v.val = READ_ONCE(sk->sk_incoming_cpu); 1569 break; 1570 1571 case SO_MEMINFO: 1572 { 1573 u32 meminfo[SK_MEMINFO_VARS]; 1574 1575 sk_get_meminfo(sk, meminfo); 1576 1577 len = min_t(unsigned int, len, sizeof(meminfo)); 1578 if (copy_to_user(optval, &meminfo, len)) 1579 return -EFAULT; 1580 1581 goto lenout; 1582 } 1583 1584#ifdef CONFIG_NET_RX_BUSY_POLL 1585 case SO_INCOMING_NAPI_ID: 1586 v.val = READ_ONCE(sk->sk_napi_id); 1587 1588 /* aggregate non-NAPI IDs down to 0 */ 1589 if (v.val < MIN_NAPI_ID) 1590 v.val = 0; 1591 1592 break; 1593#endif 1594 1595 case SO_COOKIE: 1596 lv = sizeof(u64); 1597 if (len < lv) 1598 return -EINVAL; 1599 v.val64 = sock_gen_cookie(sk); 1600 break; 1601 1602 case SO_ZEROCOPY: 1603 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1604 break; 1605 1606 case SO_TXTIME: 1607 lv = sizeof(v.txtime); 1608 v.txtime.clockid = sk->sk_clockid; 1609 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1610 SOF_TXTIME_DEADLINE_MODE : 0; 1611 v.txtime.flags |= sk->sk_txtime_report_errors ? 
1612 SOF_TXTIME_REPORT_ERRORS : 0; 1613 break; 1614 1615 case SO_BINDTOIFINDEX: 1616 v.val = sk->sk_bound_dev_if; 1617 break; 1618 1619 default: 1620 /* We implement the SO_SNDLOWAT etc to not be settable 1621 * (1003.1g 7). 1622 */ 1623 return -ENOPROTOOPT; 1624 } 1625 1626 if (len > lv) 1627 len = lv; 1628 if (copy_to_user(optval, &v, len)) 1629 return -EFAULT; 1630lenout: 1631 if (put_user(len, optlen)) 1632 return -EFAULT; 1633 return 0; 1634} 1635 1636/* 1637 * Initialize an sk_lock. 1638 * 1639 * (We also register the sk_lock with the lock validator.) 1640 */ 1641static inline void sock_lock_init(struct sock *sk) 1642{ 1643 if (sk->sk_kern_sock) 1644 sock_lock_init_class_and_name( 1645 sk, 1646 af_family_kern_slock_key_strings[sk->sk_family], 1647 af_family_kern_slock_keys + sk->sk_family, 1648 af_family_kern_key_strings[sk->sk_family], 1649 af_family_kern_keys + sk->sk_family); 1650 else 1651 sock_lock_init_class_and_name( 1652 sk, 1653 af_family_slock_key_strings[sk->sk_family], 1654 af_family_slock_keys + sk->sk_family, 1655 af_family_key_strings[sk->sk_family], 1656 af_family_keys + sk->sk_family); 1657} 1658 1659/* 1660 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1661 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1662 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1663 */ 1664static void sock_copy(struct sock *nsk, const struct sock *osk) 1665{ 1666 const struct proto *prot = READ_ONCE(osk->sk_prot); 1667#ifdef CONFIG_SECURITY_NETWORK 1668 void *sptr = nsk->sk_security; 1669#endif 1670 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1671 1672 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1673 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1674 1675#ifdef CONFIG_SECURITY_NETWORK 1676 nsk->sk_security = sptr; 1677 security_sk_clone(osk, nsk); 1678#endif 1679} 1680 1681static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1682 int family) 1683{ 1684 struct sock *sk; 1685 struct kmem_cache *slab; 1686 1687 slab = prot->slab; 1688 if (slab != NULL) { 1689 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1690 if (!sk) 1691 return sk; 1692 if (want_init_on_alloc(priority)) 1693 sk_prot_clear_nulls(sk, prot->obj_size); 1694 } else 1695 sk = kmalloc(prot->obj_size, priority); 1696 1697 if (sk != NULL) { 1698 if (security_sk_alloc(sk, family, priority)) 1699 goto out_free; 1700 1701 if (!try_module_get(prot->owner)) 1702 goto out_free_sec; 1703 sk_tx_queue_clear(sk); 1704 } 1705 1706 return sk; 1707 1708out_free_sec: 1709 security_sk_free(sk); 1710out_free: 1711 if (slab != NULL) 1712 kmem_cache_free(slab, sk); 1713 else 1714 kfree(sk); 1715 return NULL; 1716} 1717 1718static void sk_prot_free(struct proto *prot, struct sock *sk) 1719{ 1720 struct kmem_cache *slab; 1721 struct module *owner; 1722 1723 owner = prot->owner; 1724 slab = prot->slab; 1725 1726 cgroup_sk_free(&sk->sk_cgrp_data); 1727 mem_cgroup_sk_free(sk); 1728 security_sk_free(sk); 1729 if (slab != NULL) 1730 kmem_cache_free(slab, sk); 1731 else 1732 kfree(sk); 1733 module_put(owner); 1734} 1735 1736/** 1737 * sk_alloc - All socket objects are allocated here 1738 * @net: the applicable net namespace 1739 * @family: protocol family 1740 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1741 * @prot: struct proto associated with this new sock instance 1742 * @kern: is this to be a kernel socket? 
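 *
 * Return: the newly allocated socket object, or %NULL if allocation failed.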
1743 */ 1744struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1745 struct proto *prot, int kern) 1746{ 1747 struct sock *sk; 1748 1749 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1750 if (sk) { 1751 sk->sk_family = family; 1752 /* 1753 * See comment in struct sock definition to understand 1754 * why we need sk_prot_creator -acme 1755 */ 1756 sk->sk_prot = sk->sk_prot_creator = prot; 1757 sk->sk_kern_sock = kern; 1758 sock_lock_init(sk); 1759 sk->sk_net_refcnt = kern ? 0 : 1; 1760 if (likely(sk->sk_net_refcnt)) { 1761 get_net(net); 1762 sock_inuse_add(net, 1); 1763 } 1764 1765 sock_net_set(sk, net); 1766 refcount_set(&sk->sk_wmem_alloc, 1); 1767 1768 mem_cgroup_sk_alloc(sk); 1769 cgroup_sk_alloc(&sk->sk_cgrp_data); 1770 sock_update_classid(&sk->sk_cgrp_data); 1771 sock_update_netprioidx(&sk->sk_cgrp_data); 1772 sk_tx_queue_clear(sk); 1773 } 1774 1775 return sk; 1776} 1777EXPORT_SYMBOL(sk_alloc); 1778 1779/* Sockets having SOCK_RCU_FREE will call this function after one RCU 1780 * grace period. This is the case for UDP sockets and TCP listeners. 1781 */ 1782static void __sk_destruct(struct rcu_head *head) 1783{ 1784 struct sock *sk = container_of(head, struct sock, sk_rcu); 1785 struct sk_filter *filter; 1786 1787 if (sk->sk_destruct) 1788 sk->sk_destruct(sk); 1789 1790 filter = rcu_dereference_check(sk->sk_filter, 1791 refcount_read(&sk->sk_wmem_alloc) == 0); 1792 if (filter) { 1793 sk_filter_uncharge(sk, filter); 1794 RCU_INIT_POINTER(sk->sk_filter, NULL); 1795 } 1796 1797 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1798 1799#ifdef CONFIG_BPF_SYSCALL 1800 bpf_sk_storage_free(sk); 1801#endif 1802 1803 if (atomic_read(&sk->sk_omem_alloc)) 1804 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1805 __func__, atomic_read(&sk->sk_omem_alloc)); 1806 1807 if (sk->sk_frag.page) { 1808 put_page(sk->sk_frag.page); 1809 sk->sk_frag.page = NULL; 1810 } 1811 1812 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 1813 put_cred(sk->sk_peer_cred); 1814 put_pid(sk->sk_peer_pid); 1815 1816 if (likely(sk->sk_net_refcnt)) 1817 put_net(sock_net(sk)); 1818 sk_prot_free(sk->sk_prot_creator, sk); 1819} 1820 1821void sk_destruct(struct sock *sk) 1822{ 1823 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 1824 1825 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 1826 reuseport_detach_sock(sk); 1827 use_call_rcu = true; 1828 } 1829 1830 if (use_call_rcu) 1831 call_rcu(&sk->sk_rcu, __sk_destruct); 1832 else 1833 __sk_destruct(&sk->sk_rcu); 1834} 1835 1836static void __sk_free(struct sock *sk) 1837{ 1838 if (likely(sk->sk_net_refcnt)) 1839 sock_inuse_add(sock_net(sk), -1); 1840 1841 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1842 sock_diag_broadcast_destroy(sk); 1843 else 1844 sk_destruct(sk); 1845} 1846 1847void sk_free(struct sock *sk) 1848{ 1849 /* 1850 * We subtract one from sk_wmem_alloc and can know if 1851 * some packets are still in some tx queue. 
1852 * If not null, sock_wfree() will call __sk_free(sk) later 1853 */ 1854 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1855 __sk_free(sk); 1856} 1857EXPORT_SYMBOL(sk_free); 1858 1859static void sk_init_common(struct sock *sk) 1860{ 1861 skb_queue_head_init(&sk->sk_receive_queue); 1862 skb_queue_head_init(&sk->sk_write_queue); 1863 skb_queue_head_init(&sk->sk_error_queue); 1864 1865 rwlock_init(&sk->sk_callback_lock); 1866 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1867 af_rlock_keys + sk->sk_family, 1868 af_family_rlock_key_strings[sk->sk_family]); 1869 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1870 af_wlock_keys + sk->sk_family, 1871 af_family_wlock_key_strings[sk->sk_family]); 1872 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1873 af_elock_keys + sk->sk_family, 1874 af_family_elock_key_strings[sk->sk_family]); 1875 lockdep_set_class_and_name(&sk->sk_callback_lock, 1876 af_callback_keys + sk->sk_family, 1877 af_family_clock_key_strings[sk->sk_family]); 1878} 1879 1880/** 1881 * sk_clone_lock - clone a socket, and lock its clone 1882 * @sk: the socket to clone 1883 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1884 * 1885 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1886 */ 1887struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1888{ 1889 struct proto *prot = READ_ONCE(sk->sk_prot); 1890 struct sk_filter *filter; 1891 bool is_charged = true; 1892 struct sock *newsk; 1893 1894 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 1895 if (!newsk) 1896 goto out; 1897 1898 sock_copy(newsk, sk); 1899 1900 newsk->sk_prot_creator = prot; 1901 1902 /* SANITY */ 1903 if (likely(newsk->sk_net_refcnt)) { 1904 get_net(sock_net(newsk)); 1905 sock_inuse_add(sock_net(newsk), 1); 1906 } 1907 sk_node_init(&newsk->sk_node); 1908 sock_lock_init(newsk); 1909 bh_lock_sock(newsk); 1910 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1911 newsk->sk_backlog.len = 0; 1912 1913 atomic_set(&newsk->sk_rmem_alloc, 0); 1914 1915 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 1916 refcount_set(&newsk->sk_wmem_alloc, 1); 1917 1918 atomic_set(&newsk->sk_omem_alloc, 0); 1919 sk_init_common(newsk); 1920 1921 newsk->sk_dst_cache = NULL; 1922 newsk->sk_dst_pending_confirm = 0; 1923 newsk->sk_wmem_queued = 0; 1924 newsk->sk_forward_alloc = 0; 1925 atomic_set(&newsk->sk_drops, 0); 1926 newsk->sk_send_head = NULL; 1927 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1928 atomic_set(&newsk->sk_zckey, 0); 1929 1930 sock_reset_flag(newsk, SOCK_DONE); 1931 1932 /* sk->sk_memcg will be populated at accept() time */ 1933 newsk->sk_memcg = NULL; 1934 1935 cgroup_sk_clone(&newsk->sk_cgrp_data); 1936 1937 rcu_read_lock(); 1938 filter = rcu_dereference(sk->sk_filter); 1939 if (filter != NULL) 1940 /* though it's an empty new sock, the charging may fail 1941 * if sysctl_optmem_max was changed between creation of 1942 * original socket and cloning 1943 */ 1944 is_charged = sk_filter_charge(newsk, filter); 1945 RCU_INIT_POINTER(newsk->sk_filter, filter); 1946 rcu_read_unlock(); 1947 1948 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1949 /* We need to make sure that we don't uncharge the new 1950 * socket if we couldn't charge it in the first place 1951 * as otherwise we uncharge the parent's filter. 
1952 */ 1953 if (!is_charged) 1954 RCU_INIT_POINTER(newsk->sk_filter, NULL); 1955 sk_free_unlock_clone(newsk); 1956 newsk = NULL; 1957 goto out; 1958 } 1959 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 1960 1961 if (bpf_sk_storage_clone(sk, newsk)) { 1962 sk_free_unlock_clone(newsk); 1963 newsk = NULL; 1964 goto out; 1965 } 1966 1967 /* Clear sk_user_data if parent had the pointer tagged 1968 * as not suitable for copying when cloning. 1969 */ 1970 if (sk_user_data_is_nocopy(newsk)) 1971 newsk->sk_user_data = NULL; 1972 1973 newsk->sk_err = 0; 1974 newsk->sk_err_soft = 0; 1975 newsk->sk_priority = 0; 1976 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1977 1978 /* Before updating sk_refcnt, we must commit prior changes to memory 1979 * (Documentation/RCU/rculist_nulls.rst for details) 1980 */ 1981 smp_wmb(); 1982 refcount_set(&newsk->sk_refcnt, 2); 1983 1984 /* Increment the counter in the same struct proto as the master 1985 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1986 * is the same as sk->sk_prot->socks, as this field was copied 1987 * with memcpy). 1988 * 1989 * This _changes_ the previous behaviour, where 1990 * tcp_create_openreq_child always was incrementing the 1991 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1992 * to be taken into account in all callers. -acme 1993 */ 1994 sk_refcnt_debug_inc(newsk); 1995 sk_set_socket(newsk, NULL); 1996 sk_tx_queue_clear(newsk); 1997 RCU_INIT_POINTER(newsk->sk_wq, NULL); 1998 1999 if (newsk->sk_prot->sockets_allocated) 2000 sk_sockets_allocated_inc(newsk); 2001 2002 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2003 net_enable_timestamp(); 2004out: 2005 return newsk; 2006} 2007EXPORT_SYMBOL_GPL(sk_clone_lock); 2008 2009void sk_free_unlock_clone(struct sock *sk) 2010{ 2011 /* It is still raw copy of parent, so invalidate 2012 * destructor and make plain sk_free() */ 2013 sk->sk_destruct = NULL; 2014 bh_unlock_sock(sk); 2015 sk_free(sk); 2016} 2017EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2018 2019void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2020{ 2021 u32 max_segs = 1; 2022 2023 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 2024 if (sk->sk_route_caps & NETIF_F_GSO) 2025 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2026 sk->sk_route_caps &= ~sk->sk_route_nocaps; 2027 if (sk_can_gso(sk)) { 2028 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2029 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2030 } else { 2031 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2032 sk->sk_gso_max_size = dst->dev->gso_max_size; 2033 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 2034 } 2035 } 2036 sk->sk_gso_max_segs = max_segs; 2037 sk_dst_set(sk, dst); 2038} 2039EXPORT_SYMBOL_GPL(sk_setup_caps); 2040 2041/* 2042 * Simple resource managers for sockets. 2043 */ 2044 2045 2046/* 2047 * Write buffer destructor automatically called from kfree_skb. 
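 *
 * It releases skb->truesize from sk_wmem_alloc and, unless the protocol
 * manages write space itself (SOCK_USE_WRITE_QUEUE), wakes up writers via
 * sk_write_space(). Dropping the last sk_wmem_alloc reference also
 * completes a sk_free() that was deferred by in-flight packets.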
2048 */ 2049void sock_wfree(struct sk_buff *skb) 2050{ 2051 struct sock *sk = skb->sk; 2052 unsigned int len = skb->truesize; 2053 2054 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2055 /* 2056 * Keep a reference on sk_wmem_alloc, this will be released 2057 * after sk_write_space() call 2058 */ 2059 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2060 sk->sk_write_space(sk); 2061 len = 1; 2062 } 2063 /* 2064 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2065 * could not do because of in-flight packets 2066 */ 2067 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2068 __sk_free(sk); 2069} 2070EXPORT_SYMBOL(sock_wfree); 2071 2072/* This variant of sock_wfree() is used by TCP, 2073 * since it sets SOCK_USE_WRITE_QUEUE. 2074 */ 2075void __sock_wfree(struct sk_buff *skb) 2076{ 2077 struct sock *sk = skb->sk; 2078 2079 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2080 __sk_free(sk); 2081} 2082 2083void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2084{ 2085 skb_orphan(skb); 2086 skb->sk = sk; 2087#ifdef CONFIG_INET 2088 if (unlikely(!sk_fullsock(sk))) { 2089 skb->destructor = sock_edemux; 2090 sock_hold(sk); 2091 return; 2092 } 2093#endif 2094 skb->destructor = sock_wfree; 2095 skb_set_hash_from_sk(skb, sk); 2096 /* 2097 * We used to take a refcount on sk, but following operation 2098 * is enough to guarantee sk_free() wont free this sock until 2099 * all in-flight packets are completed 2100 */ 2101 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2102} 2103EXPORT_SYMBOL(skb_set_owner_w); 2104 2105static bool can_skb_orphan_partial(const struct sk_buff *skb) 2106{ 2107#ifdef CONFIG_TLS_DEVICE 2108 /* Drivers depend on in-order delivery for crypto offload, 2109 * partial orphan breaks out-of-order-OK logic. 2110 */ 2111 if (skb->decrypted) 2112 return false; 2113#endif 2114 return (skb->destructor == sock_wfree || 2115 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2116} 2117 2118/* This helper is used by netem, as it can hold packets in its 2119 * delay queue. We want to allow the owner socket to send more 2120 * packets, as if they were already TX completed by a typical driver. 2121 * But we also want to keep skb->sk set because some packet schedulers 2122 * rely on it (sch_fq for example). 2123 */ 2124void skb_orphan_partial(struct sk_buff *skb) 2125{ 2126 if (skb_is_tcp_pure_ack(skb)) 2127 return; 2128 2129 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2130 return; 2131 2132 skb_orphan(skb); 2133} 2134EXPORT_SYMBOL(skb_orphan_partial); 2135 2136/* 2137 * Read buffer destructor automatically called from kfree_skb. 2138 */ 2139void sock_rfree(struct sk_buff *skb) 2140{ 2141 struct sock *sk = skb->sk; 2142 unsigned int len = skb->truesize; 2143 2144 atomic_sub(len, &sk->sk_rmem_alloc); 2145 sk_mem_uncharge(sk, len); 2146} 2147EXPORT_SYMBOL(sock_rfree); 2148 2149/* 2150 * Buffer destructor for skbs that are not used directly in read or write 2151 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2152 */ 2153void sock_efree(struct sk_buff *skb) 2154{ 2155 sock_put(skb->sk); 2156} 2157EXPORT_SYMBOL(sock_efree); 2158 2159/* Buffer destructor for prefetch/receive path where reference count may 2160 * not be held, e.g. for listen sockets. 
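 * The put below only happens when sk_is_refcounted() reports that a
 * reference is actually held; SOCK_RCU_FREE sockets (such as TCP
 * listeners) are left alone.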
2161 */ 2162#ifdef CONFIG_INET 2163void sock_pfree(struct sk_buff *skb) 2164{ 2165 if (sk_is_refcounted(skb->sk)) 2166 sock_gen_put(skb->sk); 2167} 2168EXPORT_SYMBOL(sock_pfree); 2169#endif /* CONFIG_INET */ 2170 2171kuid_t sock_i_uid(struct sock *sk) 2172{ 2173 kuid_t uid; 2174 2175 read_lock_bh(&sk->sk_callback_lock); 2176 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2177 read_unlock_bh(&sk->sk_callback_lock); 2178 return uid; 2179} 2180EXPORT_SYMBOL(sock_i_uid); 2181 2182unsigned long __sock_i_ino(struct sock *sk) 2183{ 2184 unsigned long ino; 2185 2186 read_lock(&sk->sk_callback_lock); 2187 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2188 read_unlock(&sk->sk_callback_lock); 2189 return ino; 2190} 2191EXPORT_SYMBOL(__sock_i_ino); 2192 2193unsigned long sock_i_ino(struct sock *sk) 2194{ 2195 unsigned long ino; 2196 2197 local_bh_disable(); 2198 ino = __sock_i_ino(sk); 2199 local_bh_enable(); 2200 return ino; 2201} 2202EXPORT_SYMBOL(sock_i_ino); 2203 2204/* 2205 * Allocate a skb from the socket's send buffer. 2206 */ 2207struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2208 gfp_t priority) 2209{ 2210 if (force || 2211 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2212 struct sk_buff *skb = alloc_skb(size, priority); 2213 2214 if (skb) { 2215 skb_set_owner_w(skb, sk); 2216 return skb; 2217 } 2218 } 2219 return NULL; 2220} 2221EXPORT_SYMBOL(sock_wmalloc); 2222 2223static void sock_ofree(struct sk_buff *skb) 2224{ 2225 struct sock *sk = skb->sk; 2226 2227 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2228} 2229 2230struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2231 gfp_t priority) 2232{ 2233 struct sk_buff *skb; 2234 2235 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2236 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2237 READ_ONCE(sysctl_optmem_max)) 2238 return NULL; 2239 2240 skb = alloc_skb(size, priority); 2241 if (!skb) 2242 return NULL; 2243 2244 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2245 skb->sk = sk; 2246 skb->destructor = sock_ofree; 2247 return skb; 2248} 2249 2250/* 2251 * Allocate a memory block from the socket's option memory buffer. 2252 */ 2253void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2254{ 2255 int optmem_max = READ_ONCE(sysctl_optmem_max); 2256 2257 if ((unsigned int)size <= optmem_max && 2258 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2259 void *mem; 2260 /* First do the add, to avoid the race if kmalloc 2261 * might sleep. 2262 */ 2263 atomic_add(size, &sk->sk_omem_alloc); 2264 mem = kmalloc(size, priority); 2265 if (mem) 2266 return mem; 2267 atomic_sub(size, &sk->sk_omem_alloc); 2268 } 2269 return NULL; 2270} 2271EXPORT_SYMBOL(sock_kmalloc); 2272 2273/* Free an option memory block. Note, we actually want the inline 2274 * here as this allows gcc to detect the nullify and fold away the 2275 * condition entirely. 
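 *
 * Typical pairing with sock_kmalloc() (illustrative sketch, not part of
 * the original source):
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 *
 * The same size must be passed to both calls so sk_omem_alloc stays
 * balanced; sock_kzfree_s() is the variant to use for sensitive data.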
2276 */ 2277static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2278 const bool nullify) 2279{ 2280 if (WARN_ON_ONCE(!mem)) 2281 return; 2282 if (nullify) 2283 kfree_sensitive(mem); 2284 else 2285 kfree(mem); 2286 atomic_sub(size, &sk->sk_omem_alloc); 2287} 2288 2289void sock_kfree_s(struct sock *sk, void *mem, int size) 2290{ 2291 __sock_kfree_s(sk, mem, size, false); 2292} 2293EXPORT_SYMBOL(sock_kfree_s); 2294 2295void sock_kzfree_s(struct sock *sk, void *mem, int size) 2296{ 2297 __sock_kfree_s(sk, mem, size, true); 2298} 2299EXPORT_SYMBOL(sock_kzfree_s); 2300 2301/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2302 I think, these locks should be removed for datagram sockets. 2303 */ 2304static long sock_wait_for_wmem(struct sock *sk, long timeo) 2305{ 2306 DEFINE_WAIT(wait); 2307 2308 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2309 for (;;) { 2310 if (!timeo) 2311 break; 2312 if (signal_pending(current)) 2313 break; 2314 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2315 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2316 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2317 break; 2318 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2319 break; 2320 if (READ_ONCE(sk->sk_err)) 2321 break; 2322 timeo = schedule_timeout(timeo); 2323 } 2324 finish_wait(sk_sleep(sk), &wait); 2325 return timeo; 2326} 2327 2328 2329/* 2330 * Generic send/receive buffer handlers 2331 */ 2332 2333struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2334 unsigned long data_len, int noblock, 2335 int *errcode, int max_page_order) 2336{ 2337 struct sk_buff *skb; 2338 long timeo; 2339 int err; 2340 2341 timeo = sock_sndtimeo(sk, noblock); 2342 for (;;) { 2343 err = sock_error(sk); 2344 if (err != 0) 2345 goto failure; 2346 2347 err = -EPIPE; 2348 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2349 goto failure; 2350 2351 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2352 break; 2353 2354 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2355 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2356 err = -EAGAIN; 2357 if (!timeo) 2358 goto failure; 2359 if (signal_pending(current)) 2360 goto interrupted; 2361 timeo = sock_wait_for_wmem(sk, timeo); 2362 } 2363 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2364 errcode, sk->sk_allocation); 2365 if (skb) 2366 skb_set_owner_w(skb, sk); 2367 return skb; 2368 2369interrupted: 2370 err = sock_intr_errno(timeo); 2371failure: 2372 *errcode = err; 2373 return NULL; 2374} 2375EXPORT_SYMBOL(sock_alloc_send_pskb); 2376 2377struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2378 int noblock, int *errcode) 2379{ 2380 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2381} 2382EXPORT_SYMBOL(sock_alloc_send_skb); 2383 2384int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2385 struct sockcm_cookie *sockc) 2386{ 2387 u32 tsflags; 2388 2389 switch (cmsg->cmsg_type) { 2390 case SO_MARK: 2391 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2392 return -EPERM; 2393 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2394 return -EINVAL; 2395 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2396 break; 2397 case SO_TIMESTAMPING_OLD: 2398 case SO_TIMESTAMPING_NEW: 2399 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2400 return -EINVAL; 2401 2402 tsflags = *(u32 *)CMSG_DATA(cmsg); 2403 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2404 return -EINVAL; 2405 2406 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2407 sockc->tsflags |= 
tsflags; 2408 break; 2409 case SCM_TXTIME: 2410 if (!sock_flag(sk, SOCK_TXTIME)) 2411 return -EINVAL; 2412 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2413 return -EINVAL; 2414 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2415 break; 2416 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2417 case SCM_RIGHTS: 2418 case SCM_CREDENTIALS: 2419 break; 2420 default: 2421 return -EINVAL; 2422 } 2423 return 0; 2424} 2425EXPORT_SYMBOL(__sock_cmsg_send); 2426 2427int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2428 struct sockcm_cookie *sockc) 2429{ 2430 struct cmsghdr *cmsg; 2431 int ret; 2432 2433 for_each_cmsghdr(cmsg, msg) { 2434 if (!CMSG_OK(msg, cmsg)) 2435 return -EINVAL; 2436 if (cmsg->cmsg_level != SOL_SOCKET) 2437 continue; 2438 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2439 if (ret) 2440 return ret; 2441 } 2442 return 0; 2443} 2444EXPORT_SYMBOL(sock_cmsg_send); 2445 2446static void sk_enter_memory_pressure(struct sock *sk) 2447{ 2448 if (!sk->sk_prot->enter_memory_pressure) 2449 return; 2450 2451 sk->sk_prot->enter_memory_pressure(sk); 2452} 2453 2454static void sk_leave_memory_pressure(struct sock *sk) 2455{ 2456 if (sk->sk_prot->leave_memory_pressure) { 2457 sk->sk_prot->leave_memory_pressure(sk); 2458 } else { 2459 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2460 2461 if (memory_pressure && READ_ONCE(*memory_pressure)) 2462 WRITE_ONCE(*memory_pressure, 0); 2463 } 2464} 2465 2466DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2467 2468/** 2469 * skb_page_frag_refill - check that a page_frag contains enough room 2470 * @sz: minimum size of the fragment we want to get 2471 * @pfrag: pointer to page_frag 2472 * @gfp: priority for memory allocation 2473 * 2474 * Note: While this allocator tries to use high order pages, there is 2475 * no guarantee that allocations succeed. Therefore, @sz MUST be 2476 * less or equal than PAGE_SIZE. 
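 *
 * Usage sketch (illustrative, not part of the original source):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!skb_page_frag_refill(sz, pfrag, sk->sk_allocation))
 *		return -ENOMEM;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, sz);
 *	pfrag->offset += sz;
 *
 * Socket callers normally go through sk_page_frag_refill() below, which
 * additionally enters memory pressure and moderates the send buffer on
 * failure.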
2477 */ 2478bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2479{ 2480 if (pfrag->page) { 2481 if (page_ref_count(pfrag->page) == 1) { 2482 pfrag->offset = 0; 2483 return true; 2484 } 2485 if (pfrag->offset + sz <= pfrag->size) 2486 return true; 2487 put_page(pfrag->page); 2488 } 2489 2490 pfrag->offset = 0; 2491 if (SKB_FRAG_PAGE_ORDER && 2492 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2493 /* Avoid direct reclaim but allow kswapd to wake */ 2494 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2495 __GFP_COMP | __GFP_NOWARN | 2496 __GFP_NORETRY, 2497 SKB_FRAG_PAGE_ORDER); 2498 if (likely(pfrag->page)) { 2499 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2500 return true; 2501 } 2502 } 2503 pfrag->page = alloc_page(gfp); 2504 if (likely(pfrag->page)) { 2505 pfrag->size = PAGE_SIZE; 2506 return true; 2507 } 2508 return false; 2509} 2510EXPORT_SYMBOL(skb_page_frag_refill); 2511 2512bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2513{ 2514 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2515 return true; 2516 2517 sk_enter_memory_pressure(sk); 2518 sk_stream_moderate_sndbuf(sk); 2519 return false; 2520} 2521EXPORT_SYMBOL(sk_page_frag_refill); 2522 2523static void __lock_sock(struct sock *sk) 2524 __releases(&sk->sk_lock.slock) 2525 __acquires(&sk->sk_lock.slock) 2526{ 2527 DEFINE_WAIT(wait); 2528 2529 for (;;) { 2530 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2531 TASK_UNINTERRUPTIBLE); 2532 spin_unlock_bh(&sk->sk_lock.slock); 2533 schedule(); 2534 spin_lock_bh(&sk->sk_lock.slock); 2535 if (!sock_owned_by_user(sk)) 2536 break; 2537 } 2538 finish_wait(&sk->sk_lock.wq, &wait); 2539} 2540 2541void __release_sock(struct sock *sk) 2542 __releases(&sk->sk_lock.slock) 2543 __acquires(&sk->sk_lock.slock) 2544{ 2545 struct sk_buff *skb, *next; 2546 2547 while ((skb = sk->sk_backlog.head) != NULL) { 2548 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2549 2550 spin_unlock_bh(&sk->sk_lock.slock); 2551 2552 do { 2553 next = skb->next; 2554 prefetch(next); 2555 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2556 skb_mark_not_on_list(skb); 2557 sk_backlog_rcv(sk, skb); 2558 2559 cond_resched(); 2560 2561 skb = next; 2562 } while (skb != NULL); 2563 2564 spin_lock_bh(&sk->sk_lock.slock); 2565 } 2566 2567 /* 2568 * Doing the zeroing here guarantee we can not loop forever 2569 * while a wild producer attempts to flood us. 2570 */ 2571 sk->sk_backlog.len = 0; 2572} 2573 2574void __sk_flush_backlog(struct sock *sk) 2575{ 2576 spin_lock_bh(&sk->sk_lock.slock); 2577 __release_sock(sk); 2578 spin_unlock_bh(&sk->sk_lock.slock); 2579} 2580 2581/** 2582 * sk_wait_data - wait for data to arrive at sk_receive_queue 2583 * @sk: sock to wait on 2584 * @timeo: for how long 2585 * @skb: last skb seen on sk_receive_queue 2586 * 2587 * Now socket state including sk->sk_err is changed only under lock, 2588 * hence we may omit checks after joining wait queue. 2589 * We check receive queue before schedule() only as optimization; 2590 * it is very likely that release_sock() added new data. 
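 *
 * Usage sketch (illustrative, not part of the original source), as it
 * might appear in a blocking recvmsg() loop; signal and error handling
 * are omitted:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}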
2591 */ 2592int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2593{ 2594 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2595 int rc; 2596 2597 add_wait_queue(sk_sleep(sk), &wait); 2598 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2599 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2600 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2601 remove_wait_queue(sk_sleep(sk), &wait); 2602 return rc; 2603} 2604EXPORT_SYMBOL(sk_wait_data); 2605 2606/** 2607 * __sk_mem_raise_allocated - increase memory_allocated 2608 * @sk: socket 2609 * @size: memory size to allocate 2610 * @amt: pages to allocate 2611 * @kind: allocation type 2612 * 2613 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2614 */ 2615int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2616{ 2617 struct proto *prot = sk->sk_prot; 2618 long allocated = sk_memory_allocated_add(sk, amt); 2619 bool charged = true; 2620 2621 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2622 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) 2623 goto suppress_allocation; 2624 2625 /* Under limit. */ 2626 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2627 sk_leave_memory_pressure(sk); 2628 return 1; 2629 } 2630 2631 /* Under pressure. */ 2632 if (allocated > sk_prot_mem_limits(sk, 1)) 2633 sk_enter_memory_pressure(sk); 2634 2635 /* Over hard limit. */ 2636 if (allocated > sk_prot_mem_limits(sk, 2)) 2637 goto suppress_allocation; 2638 2639 /* guarantee minimum buffer size under pressure */ 2640 if (kind == SK_MEM_RECV) { 2641 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2642 return 1; 2643 2644 } else { /* SK_MEM_SEND */ 2645 int wmem0 = sk_get_wmem0(sk, prot); 2646 2647 if (sk->sk_type == SOCK_STREAM) { 2648 if (sk->sk_wmem_queued < wmem0) 2649 return 1; 2650 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2651 return 1; 2652 } 2653 } 2654 2655 if (sk_has_memory_pressure(sk)) { 2656 u64 alloc; 2657 2658 if (!sk_under_memory_pressure(sk)) 2659 return 1; 2660 alloc = sk_sockets_allocated_read_positive(sk); 2661 if (sk_prot_mem_limits(sk, 2) > alloc * 2662 sk_mem_pages(sk->sk_wmem_queued + 2663 atomic_read(&sk->sk_rmem_alloc) + 2664 sk->sk_forward_alloc)) 2665 return 1; 2666 } 2667 2668suppress_allocation: 2669 2670 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2671 sk_stream_moderate_sndbuf(sk); 2672 2673 /* Fail only if socket is _under_ its sndbuf. 2674 * In this case we cannot block, so that we have to fail. 2675 */ 2676 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2677 return 1; 2678 } 2679 2680 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2681 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2682 2683 sk_memory_allocated_sub(sk, amt); 2684 2685 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2686 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2687 2688 return 0; 2689} 2690EXPORT_SYMBOL(__sk_mem_raise_allocated); 2691 2692/** 2693 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2694 * @sk: socket 2695 * @size: memory size to allocate 2696 * @kind: allocation type 2697 * 2698 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2699 * rmem allocation. This function assumes that protocols which have 2700 * memory_pressure use sk_wmem_queued as write buffer accounting. 
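 *
 * Charging sketch (illustrative, not part of the original source): receive
 * paths usually reach this through the sk_rmem_schedule() wrapper before
 * queueing an skb:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);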
2701 */ 2702int __sk_mem_schedule(struct sock *sk, int size, int kind) 2703{ 2704 int ret, amt = sk_mem_pages(size); 2705 2706 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2707 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2708 if (!ret) 2709 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2710 return ret; 2711} 2712EXPORT_SYMBOL(__sk_mem_schedule); 2713 2714/** 2715 * __sk_mem_reduce_allocated - reclaim memory_allocated 2716 * @sk: socket 2717 * @amount: number of quanta 2718 * 2719 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2720 */ 2721void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2722{ 2723 sk_memory_allocated_sub(sk, amount); 2724 2725 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2726 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2727 2728 if (sk_under_global_memory_pressure(sk) && 2729 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2730 sk_leave_memory_pressure(sk); 2731} 2732EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2733 2734/** 2735 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2736 * @sk: socket 2737 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2738 */ 2739void __sk_mem_reclaim(struct sock *sk, int amount) 2740{ 2741 amount >>= SK_MEM_QUANTUM_SHIFT; 2742 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2743 __sk_mem_reduce_allocated(sk, amount); 2744} 2745EXPORT_SYMBOL(__sk_mem_reclaim); 2746 2747int sk_set_peek_off(struct sock *sk, int val) 2748{ 2749 WRITE_ONCE(sk->sk_peek_off, val); 2750 return 0; 2751} 2752EXPORT_SYMBOL_GPL(sk_set_peek_off); 2753 2754/* 2755 * Set of default routines for initialising struct proto_ops when 2756 * the protocol does not support a particular function. In certain 2757 * cases where it makes no sense for a protocol to have a "do nothing" 2758 * function, some default processing is provided. 
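 *
 * Wiring sketch (illustrative, not part of the original source): a
 * datagram-style family with no listen/accept/mmap support can point its
 * struct proto_ops at these stubs directly; "example_ops" and
 * "PF_EXAMPLE" are hypothetical names used only for illustration:
 *
 *	static const struct proto_ops example_ops = {
 *		.family	= PF_EXAMPLE,
 *		.listen	= sock_no_listen,
 *		.accept	= sock_no_accept,
 *		.mmap	= sock_no_mmap,
 *	};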
2759 */ 2760 2761int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2762{ 2763 return -EOPNOTSUPP; 2764} 2765EXPORT_SYMBOL(sock_no_bind); 2766 2767int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2768 int len, int flags) 2769{ 2770 return -EOPNOTSUPP; 2771} 2772EXPORT_SYMBOL(sock_no_connect); 2773 2774int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2775{ 2776 return -EOPNOTSUPP; 2777} 2778EXPORT_SYMBOL(sock_no_socketpair); 2779 2780int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2781 bool kern) 2782{ 2783 return -EOPNOTSUPP; 2784} 2785EXPORT_SYMBOL(sock_no_accept); 2786 2787int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2788 int peer) 2789{ 2790 return -EOPNOTSUPP; 2791} 2792EXPORT_SYMBOL(sock_no_getname); 2793 2794int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2795{ 2796 return -EOPNOTSUPP; 2797} 2798EXPORT_SYMBOL(sock_no_ioctl); 2799 2800int sock_no_listen(struct socket *sock, int backlog) 2801{ 2802 return -EOPNOTSUPP; 2803} 2804EXPORT_SYMBOL(sock_no_listen); 2805 2806int sock_no_shutdown(struct socket *sock, int how) 2807{ 2808 return -EOPNOTSUPP; 2809} 2810EXPORT_SYMBOL(sock_no_shutdown); 2811 2812int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2813{ 2814 return -EOPNOTSUPP; 2815} 2816EXPORT_SYMBOL(sock_no_sendmsg); 2817 2818int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2819{ 2820 return -EOPNOTSUPP; 2821} 2822EXPORT_SYMBOL(sock_no_sendmsg_locked); 2823 2824int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2825 int flags) 2826{ 2827 return -EOPNOTSUPP; 2828} 2829EXPORT_SYMBOL(sock_no_recvmsg); 2830 2831int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2832{ 2833 /* Mirror missing mmap method error code */ 2834 return -ENODEV; 2835} 2836EXPORT_SYMBOL(sock_no_mmap); 2837 2838/* 2839 * When a file is received (via SCM_RIGHTS, etc), we must bump the 2840 * various sock-based usage counts. 2841 */ 2842void __receive_sock(struct file *file) 2843{ 2844 struct socket *sock; 2845 int error; 2846 2847 /* 2848 * The resulting value of "error" is ignored here since we only 2849 * need to take action when the file is a socket and testing 2850 * "sock" for NULL is sufficient. 
2851 */ 2852 sock = sock_from_file(file, &error); 2853 if (sock) { 2854 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 2855 sock_update_classid(&sock->sk->sk_cgrp_data); 2856 } 2857} 2858 2859ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2860{ 2861 ssize_t res; 2862 struct msghdr msg = {.msg_flags = flags}; 2863 struct kvec iov; 2864 char *kaddr = kmap(page); 2865 iov.iov_base = kaddr + offset; 2866 iov.iov_len = size; 2867 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2868 kunmap(page); 2869 return res; 2870} 2871EXPORT_SYMBOL(sock_no_sendpage); 2872 2873ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2874 int offset, size_t size, int flags) 2875{ 2876 ssize_t res; 2877 struct msghdr msg = {.msg_flags = flags}; 2878 struct kvec iov; 2879 char *kaddr = kmap(page); 2880 2881 iov.iov_base = kaddr + offset; 2882 iov.iov_len = size; 2883 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2884 kunmap(page); 2885 return res; 2886} 2887EXPORT_SYMBOL(sock_no_sendpage_locked); 2888 2889/* 2890 * Default Socket Callbacks 2891 */ 2892 2893static void sock_def_wakeup(struct sock *sk) 2894{ 2895 struct socket_wq *wq; 2896 2897 rcu_read_lock(); 2898 wq = rcu_dereference(sk->sk_wq); 2899 if (skwq_has_sleeper(wq)) 2900 wake_up_interruptible_all(&wq->wait); 2901 rcu_read_unlock(); 2902} 2903 2904static void sock_def_error_report(struct sock *sk) 2905{ 2906 struct socket_wq *wq; 2907 2908 rcu_read_lock(); 2909 wq = rcu_dereference(sk->sk_wq); 2910 if (skwq_has_sleeper(wq)) 2911 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2912 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2913 rcu_read_unlock(); 2914} 2915 2916void sock_def_readable(struct sock *sk) 2917{ 2918 struct socket_wq *wq; 2919 2920 rcu_read_lock(); 2921 wq = rcu_dereference(sk->sk_wq); 2922 if (skwq_has_sleeper(wq)) 2923 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2924 EPOLLRDNORM | EPOLLRDBAND); 2925 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2926 rcu_read_unlock(); 2927} 2928 2929static void sock_def_write_space(struct sock *sk) 2930{ 2931 struct socket_wq *wq; 2932 2933 rcu_read_lock(); 2934 2935 /* Do not wake up a writer until he can make "significant" 2936 * progress. 
--DaveM 2937 */ 2938 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { 2939 wq = rcu_dereference(sk->sk_wq); 2940 if (skwq_has_sleeper(wq)) 2941 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2942 EPOLLWRNORM | EPOLLWRBAND); 2943 2944 /* Should agree with poll, otherwise some programs break */ 2945 if (sock_writeable(sk)) 2946 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2947 } 2948 2949 rcu_read_unlock(); 2950} 2951 2952static void sock_def_destruct(struct sock *sk) 2953{ 2954} 2955 2956void sk_send_sigurg(struct sock *sk) 2957{ 2958 if (sk->sk_socket && sk->sk_socket->file) 2959 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2960 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2961} 2962EXPORT_SYMBOL(sk_send_sigurg); 2963 2964void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2965 unsigned long expires) 2966{ 2967 if (!mod_timer(timer, expires)) 2968 sock_hold(sk); 2969} 2970EXPORT_SYMBOL(sk_reset_timer); 2971 2972void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2973{ 2974 if (del_timer(timer)) 2975 __sock_put(sk); 2976} 2977EXPORT_SYMBOL(sk_stop_timer); 2978 2979void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 2980{ 2981 if (del_timer_sync(timer)) 2982 __sock_put(sk); 2983} 2984EXPORT_SYMBOL(sk_stop_timer_sync); 2985 2986void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 2987{ 2988 sk_init_common(sk); 2989 sk->sk_send_head = NULL; 2990 2991 timer_setup(&sk->sk_timer, NULL, 0); 2992 2993 sk->sk_allocation = GFP_KERNEL; 2994 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 2995 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 2996 sk->sk_state = TCP_CLOSE; 2997 sk_set_socket(sk, sock); 2998 2999 sock_set_flag(sk, SOCK_ZAPPED); 3000 3001 if (sock) { 3002 sk->sk_type = sock->type; 3003 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3004 sock->sk = sk; 3005 } else { 3006 RCU_INIT_POINTER(sk->sk_wq, NULL); 3007 } 3008 sk->sk_uid = uid; 3009 3010 rwlock_init(&sk->sk_callback_lock); 3011 if (sk->sk_kern_sock) 3012 lockdep_set_class_and_name( 3013 &sk->sk_callback_lock, 3014 af_kern_callback_keys + sk->sk_family, 3015 af_family_kern_clock_key_strings[sk->sk_family]); 3016 else 3017 lockdep_set_class_and_name( 3018 &sk->sk_callback_lock, 3019 af_callback_keys + sk->sk_family, 3020 af_family_clock_key_strings[sk->sk_family]); 3021 3022 sk->sk_state_change = sock_def_wakeup; 3023 sk->sk_data_ready = sock_def_readable; 3024 sk->sk_write_space = sock_def_write_space; 3025 sk->sk_error_report = sock_def_error_report; 3026 sk->sk_destruct = sock_def_destruct; 3027 3028 sk->sk_frag.page = NULL; 3029 sk->sk_frag.offset = 0; 3030 sk->sk_peek_off = -1; 3031 3032 sk->sk_peer_pid = NULL; 3033 sk->sk_peer_cred = NULL; 3034 spin_lock_init(&sk->sk_peer_lock); 3035 3036 sk->sk_write_pending = 0; 3037 sk->sk_rcvlowat = 1; 3038 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3039 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3040 3041 sk->sk_stamp = SK_DEFAULT_STAMP; 3042#if BITS_PER_LONG==32 3043 seqlock_init(&sk->sk_stamp_seq); 3044#endif 3045 atomic_set(&sk->sk_zckey, 0); 3046 3047#ifdef CONFIG_NET_RX_BUSY_POLL 3048 sk->sk_napi_id = 0; 3049 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3050#endif 3051 3052 sk->sk_max_pacing_rate = ~0UL; 3053 sk->sk_pacing_rate = ~0UL; 3054 WRITE_ONCE(sk->sk_pacing_shift, 10); 3055 sk->sk_incoming_cpu = -1; 3056 3057 sk_rx_queue_clear(sk); 3058 /* 3059 * Before updating sk_refcnt, we must commit prior changes to memory 3060 * (Documentation/RCU/rculist_nulls.rst for details) 3061 */ 3062 smp_wmb(); 
3063 refcount_set(&sk->sk_refcnt, 1); 3064 atomic_set(&sk->sk_drops, 0); 3065} 3066EXPORT_SYMBOL(sock_init_data_uid); 3067 3068void sock_init_data(struct socket *sock, struct sock *sk) 3069{ 3070 kuid_t uid = sock ? 3071 SOCK_INODE(sock)->i_uid : 3072 make_kuid(sock_net(sk)->user_ns, 0); 3073 3074 sock_init_data_uid(sock, sk, uid); 3075} 3076EXPORT_SYMBOL(sock_init_data); 3077 3078void lock_sock_nested(struct sock *sk, int subclass) 3079{ 3080 might_sleep(); 3081 spin_lock_bh(&sk->sk_lock.slock); 3082 if (sk->sk_lock.owned) 3083 __lock_sock(sk); 3084 sk->sk_lock.owned = 1; 3085 spin_unlock(&sk->sk_lock.slock); 3086 /* 3087 * The sk_lock has mutex_lock() semantics here: 3088 */ 3089 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3090 local_bh_enable(); 3091} 3092EXPORT_SYMBOL(lock_sock_nested); 3093 3094void release_sock(struct sock *sk) 3095{ 3096 spin_lock_bh(&sk->sk_lock.slock); 3097 if (sk->sk_backlog.tail) 3098 __release_sock(sk); 3099 3100 /* Warning : release_cb() might need to release sk ownership, 3101 * ie call sock_release_ownership(sk) before us. 3102 */ 3103 if (sk->sk_prot->release_cb) 3104 sk->sk_prot->release_cb(sk); 3105 3106 sock_release_ownership(sk); 3107 if (waitqueue_active(&sk->sk_lock.wq)) 3108 wake_up(&sk->sk_lock.wq); 3109 spin_unlock_bh(&sk->sk_lock.slock); 3110} 3111EXPORT_SYMBOL(release_sock); 3112 3113/** 3114 * lock_sock_fast - fast version of lock_sock 3115 * @sk: socket 3116 * 3117 * This version should be used for very small section, where process wont block 3118 * return false if fast path is taken: 3119 * 3120 * sk_lock.slock locked, owned = 0, BH disabled 3121 * 3122 * return true if slow path is taken: 3123 * 3124 * sk_lock.slock unlocked, owned = 1, BH enabled 3125 */ 3126bool lock_sock_fast(struct sock *sk) 3127{ 3128 might_sleep(); 3129 spin_lock_bh(&sk->sk_lock.slock); 3130 3131 if (!sk->sk_lock.owned) 3132 /* 3133 * Note : We must disable BH 3134 */ 3135 return false; 3136 3137 __lock_sock(sk); 3138 sk->sk_lock.owned = 1; 3139 spin_unlock(&sk->sk_lock.slock); 3140 /* 3141 * The sk_lock has mutex_lock() semantics here: 3142 */ 3143 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 3144 local_bh_enable(); 3145 return true; 3146} 3147EXPORT_SYMBOL(lock_sock_fast); 3148 3149int sock_gettstamp(struct socket *sock, void __user *userstamp, 3150 bool timeval, bool time32) 3151{ 3152 struct sock *sk = sock->sk; 3153 struct timespec64 ts; 3154 3155 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3156 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3157 if (ts.tv_sec == -1) 3158 return -ENOENT; 3159 if (ts.tv_sec == 0) { 3160 ktime_t kt = ktime_get_real(); 3161 sock_write_timestamp(sk, kt); 3162 ts = ktime_to_timespec64(kt); 3163 } 3164 3165 if (timeval) 3166 ts.tv_nsec /= 1000; 3167 3168#ifdef CONFIG_COMPAT_32BIT_TIME 3169 if (time32) 3170 return put_old_timespec32(&ts, userstamp); 3171#endif 3172#ifdef CONFIG_SPARC64 3173 /* beware of padding in sparc64 timeval */ 3174 if (timeval && !in_compat_syscall()) { 3175 struct __kernel_old_timeval __user tv = { 3176 .tv_sec = ts.tv_sec, 3177 .tv_usec = ts.tv_nsec, 3178 }; 3179 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3180 return -EFAULT; 3181 return 0; 3182 } 3183#endif 3184 return put_timespec64(&ts, userstamp); 3185} 3186EXPORT_SYMBOL(sock_gettstamp); 3187 3188void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3189{ 3190 if (!sock_flag(sk, flag)) { 3191 unsigned long previous_flags = sk->sk_flags; 3192 3193 sock_set_flag(sk, flag); 3194 /* 3195 * we just set one of the two flags 
which require net
3196	 * time stamping, but time stamping might have been on
3197	 * already because of the other one
3198	 */
3199		if (sock_needs_netstamp(sk) &&
3200		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3201			net_enable_timestamp();
3202	}
3203}
3204
3205int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3206		       int level, int type)
3207{
3208	struct sock_exterr_skb *serr;
3209	struct sk_buff *skb;
3210	int copied, err;
3211
3212	err = -EAGAIN;
3213	skb = sock_dequeue_err_skb(sk);
3214	if (skb == NULL)
3215		goto out;
3216
3217	copied = skb->len;
3218	if (copied > len) {
3219		msg->msg_flags |= MSG_TRUNC;
3220		copied = len;
3221	}
3222	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3223	if (err)
3224		goto out_free_skb;
3225
3226	sock_recv_timestamp(msg, sk, skb);
3227
3228	serr = SKB_EXT_ERR(skb);
3229	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3230
3231	msg->msg_flags |= MSG_ERRQUEUE;
3232	err = copied;
3233
3234out_free_skb:
3235	kfree_skb(skb);
3236out:
3237	return err;
3238}
3239EXPORT_SYMBOL(sock_recv_errqueue);
3240
3241/*
3242 * Get a socket option on a socket.
3243 *
3244 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3245 * asynchronous errors should be reported by getsockopt. We assume
3246 * this means if you specify SO_ERROR (otherwise what's the point of it).
3247 */
3248int sock_common_getsockopt(struct socket *sock, int level, int optname,
3249			   char __user *optval, int __user *optlen)
3250{
3251	struct sock *sk = sock->sk;
3252
3253	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3254	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3255}
3256EXPORT_SYMBOL(sock_common_getsockopt);
3257
3258int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3259			int flags)
3260{
3261	struct sock *sk = sock->sk;
3262	int addr_len = 0;
3263	int err;
3264
3265	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3266				   flags & ~MSG_DONTWAIT, &addr_len);
3267	if (err >= 0)
3268		msg->msg_namelen = addr_len;
3269	return err;
3270}
3271EXPORT_SYMBOL(sock_common_recvmsg);
3272
3273/*
3274 * Set socket options on an inet socket.
3275 */
3276int sock_common_setsockopt(struct socket *sock, int level, int optname,
3277			   sockptr_t optval, unsigned int optlen)
3278{
3279	struct sock *sk = sock->sk;
3280
3281	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3282	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3283}
3284EXPORT_SYMBOL(sock_common_setsockopt);
3285
3286void sk_common_release(struct sock *sk)
3287{
3288	if (sk->sk_prot->destroy)
3289		sk->sk_prot->destroy(sk);
3290
3291	/*
3292	 * Observation: when sk_common_release is called, processes have
3293	 * no access to the socket, but the network stack still does.
3294	 * Step one, detach it from networking:
3295	 *
3296	 * A. Remove from hash tables.
3297	 */
3298
3299	sk->sk_prot->unhash(sk);
3300
3301	/*
3302	 * At this point the socket cannot receive new packets, but it is possible
3303	 * that some packets are still in flight because some CPU runs the receiver
3304	 * and did the hash table lookup before we unhashed the socket. They will
3305	 * reach the receive queue and be purged by the socket destructor.
3306	 *
3307	 * Also, we still have packets pending on the receive queue and probably
3308	 * our own packets waiting in device queues. sock_destroy will drain the
3309	 * receive queue, but transmitted packets will delay socket destruction
3310	 * until the last reference is released.
3311 */ 3312 3313 sock_orphan(sk); 3314 3315 xfrm_sk_free_policy(sk); 3316 3317 sk_refcnt_debug_release(sk); 3318 3319 sock_put(sk); 3320} 3321EXPORT_SYMBOL(sk_common_release); 3322 3323void sk_get_meminfo(const struct sock *sk, u32 *mem) 3324{ 3325 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3326 3327 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3328 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3329 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3330 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3331 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3332 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3333 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3334 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3335 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3336} 3337 3338#ifdef CONFIG_PROC_FS 3339#define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3340struct prot_inuse { 3341 int val[PROTO_INUSE_NR]; 3342}; 3343 3344static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3345 3346void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3347{ 3348 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3349} 3350EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3351 3352int sock_prot_inuse_get(struct net *net, struct proto *prot) 3353{ 3354 int cpu, idx = prot->inuse_idx; 3355 int res = 0; 3356 3357 for_each_possible_cpu(cpu) 3358 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3359 3360 return res >= 0 ? res : 0; 3361} 3362EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3363 3364static void sock_inuse_add(struct net *net, int val) 3365{ 3366 this_cpu_add(*net->core.sock_inuse, val); 3367} 3368 3369int sock_inuse_get(struct net *net) 3370{ 3371 int cpu, res = 0; 3372 3373 for_each_possible_cpu(cpu) 3374 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3375 3376 return res; 3377} 3378 3379EXPORT_SYMBOL_GPL(sock_inuse_get); 3380 3381static int __net_init sock_inuse_init_net(struct net *net) 3382{ 3383 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3384 if (net->core.prot_inuse == NULL) 3385 return -ENOMEM; 3386 3387 net->core.sock_inuse = alloc_percpu(int); 3388 if (net->core.sock_inuse == NULL) 3389 goto out; 3390 3391 return 0; 3392 3393out: 3394 free_percpu(net->core.prot_inuse); 3395 return -ENOMEM; 3396} 3397 3398static void __net_exit sock_inuse_exit_net(struct net *net) 3399{ 3400 free_percpu(net->core.prot_inuse); 3401 free_percpu(net->core.sock_inuse); 3402} 3403 3404static struct pernet_operations net_inuse_ops = { 3405 .init = sock_inuse_init_net, 3406 .exit = sock_inuse_exit_net, 3407}; 3408 3409static __init int net_inuse_init(void) 3410{ 3411 if (register_pernet_subsys(&net_inuse_ops)) 3412 panic("Cannot initialize net inuse counters"); 3413 3414 return 0; 3415} 3416 3417core_initcall(net_inuse_init); 3418 3419static int assign_proto_idx(struct proto *prot) 3420{ 3421 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3422 3423 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3424 pr_err("PROTO_INUSE_NR exhausted\n"); 3425 return -ENOSPC; 3426 } 3427 3428 set_bit(prot->inuse_idx, proto_inuse_idx); 3429 return 0; 3430} 3431 3432static void release_proto_idx(struct proto *prot) 3433{ 3434 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3435 clear_bit(prot->inuse_idx, proto_inuse_idx); 3436} 3437#else 3438static inline int assign_proto_idx(struct proto *prot) 3439{ 3440 return 0; 3441} 3442 3443static inline void release_proto_idx(struct proto *prot) 3444{ 
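	/* no-op: per-protocol inuse bookkeeping only exists with CONFIG_PROC_FS */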
3445} 3446 3447static void sock_inuse_add(struct net *net, int val) 3448{ 3449} 3450#endif 3451 3452static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3453{ 3454 if (!twsk_prot) 3455 return; 3456 kfree(twsk_prot->twsk_slab_name); 3457 twsk_prot->twsk_slab_name = NULL; 3458 kmem_cache_destroy(twsk_prot->twsk_slab); 3459 twsk_prot->twsk_slab = NULL; 3460} 3461 3462static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3463{ 3464 if (!rsk_prot) 3465 return; 3466 kfree(rsk_prot->slab_name); 3467 rsk_prot->slab_name = NULL; 3468 kmem_cache_destroy(rsk_prot->slab); 3469 rsk_prot->slab = NULL; 3470} 3471 3472static int req_prot_init(const struct proto *prot) 3473{ 3474 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3475 3476 if (!rsk_prot) 3477 return 0; 3478 3479 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3480 prot->name); 3481 if (!rsk_prot->slab_name) 3482 return -ENOMEM; 3483 3484 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3485 rsk_prot->obj_size, 0, 3486 SLAB_ACCOUNT | prot->slab_flags, 3487 NULL); 3488 3489 if (!rsk_prot->slab) { 3490 pr_crit("%s: Can't create request sock SLAB cache!\n", 3491 prot->name); 3492 return -ENOMEM; 3493 } 3494 return 0; 3495} 3496 3497int proto_register(struct proto *prot, int alloc_slab) 3498{ 3499 int ret = -ENOBUFS; 3500 3501 if (alloc_slab) { 3502 prot->slab = kmem_cache_create_usercopy(prot->name, 3503 prot->obj_size, 0, 3504 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3505 prot->slab_flags, 3506 prot->useroffset, prot->usersize, 3507 NULL); 3508 3509 if (prot->slab == NULL) { 3510 pr_crit("%s: Can't create sock SLAB cache!\n", 3511 prot->name); 3512 goto out; 3513 } 3514 3515 if (req_prot_init(prot)) 3516 goto out_free_request_sock_slab; 3517 3518 if (prot->twsk_prot != NULL) { 3519 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 3520 3521 if (prot->twsk_prot->twsk_slab_name == NULL) 3522 goto out_free_request_sock_slab; 3523 3524 prot->twsk_prot->twsk_slab = 3525 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3526 prot->twsk_prot->twsk_obj_size, 3527 0, 3528 SLAB_ACCOUNT | 3529 prot->slab_flags, 3530 NULL); 3531 if (prot->twsk_prot->twsk_slab == NULL) 3532 goto out_free_timewait_sock_slab; 3533 } 3534 } 3535 3536 mutex_lock(&proto_list_mutex); 3537 ret = assign_proto_idx(prot); 3538 if (ret) { 3539 mutex_unlock(&proto_list_mutex); 3540 goto out_free_timewait_sock_slab; 3541 } 3542 list_add(&prot->node, &proto_list); 3543 mutex_unlock(&proto_list_mutex); 3544 return ret; 3545 3546out_free_timewait_sock_slab: 3547 if (alloc_slab && prot->twsk_prot) 3548 tw_prot_cleanup(prot->twsk_prot); 3549out_free_request_sock_slab: 3550 if (alloc_slab) { 3551 req_prot_cleanup(prot->rsk_prot); 3552 3553 kmem_cache_destroy(prot->slab); 3554 prot->slab = NULL; 3555 } 3556out: 3557 return ret; 3558} 3559EXPORT_SYMBOL(proto_register); 3560 3561void proto_unregister(struct proto *prot) 3562{ 3563 mutex_lock(&proto_list_mutex); 3564 release_proto_idx(prot); 3565 list_del(&prot->node); 3566 mutex_unlock(&proto_list_mutex); 3567 3568 kmem_cache_destroy(prot->slab); 3569 prot->slab = NULL; 3570 3571 req_prot_cleanup(prot->rsk_prot); 3572 tw_prot_cleanup(prot->twsk_prot); 3573} 3574EXPORT_SYMBOL(proto_unregister); 3575 3576int sock_load_diag_module(int family, int protocol) 3577{ 3578 if (!protocol) { 3579 if (!sock_is_registered(family)) 3580 return -ENOENT; 3581 3582 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3583 NETLINK_SOCK_DIAG, family); 3584 } 3585 3586#ifdef 
CONFIG_INET 3587 if (family == AF_INET && 3588 protocol != IPPROTO_RAW && 3589 protocol < MAX_INET_PROTOS && 3590 !rcu_access_pointer(inet_protos[protocol])) 3591 return -ENOENT; 3592#endif 3593 3594 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3595 NETLINK_SOCK_DIAG, family, protocol); 3596} 3597EXPORT_SYMBOL(sock_load_diag_module); 3598 3599#ifdef CONFIG_PROC_FS 3600static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3601 __acquires(proto_list_mutex) 3602{ 3603 mutex_lock(&proto_list_mutex); 3604 return seq_list_start_head(&proto_list, *pos); 3605} 3606 3607static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3608{ 3609 return seq_list_next(v, &proto_list, pos); 3610} 3611 3612static void proto_seq_stop(struct seq_file *seq, void *v) 3613 __releases(proto_list_mutex) 3614{ 3615 mutex_unlock(&proto_list_mutex); 3616} 3617 3618static char proto_method_implemented(const void *method) 3619{ 3620 return method == NULL ? 'n' : 'y'; 3621} 3622static long sock_prot_memory_allocated(struct proto *proto) 3623{ 3624 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3625} 3626 3627static const char *sock_prot_memory_pressure(struct proto *proto) 3628{ 3629 return proto->memory_pressure != NULL ? 3630 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3631} 3632 3633static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3634{ 3635 3636 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3637 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3638 proto->name, 3639 proto->obj_size, 3640 sock_prot_inuse_get(seq_file_net(seq), proto), 3641 sock_prot_memory_allocated(proto), 3642 sock_prot_memory_pressure(proto), 3643 proto->max_header, 3644 proto->slab == NULL ? 
"no" : "yes", 3645 module_name(proto->owner), 3646 proto_method_implemented(proto->close), 3647 proto_method_implemented(proto->connect), 3648 proto_method_implemented(proto->disconnect), 3649 proto_method_implemented(proto->accept), 3650 proto_method_implemented(proto->ioctl), 3651 proto_method_implemented(proto->init), 3652 proto_method_implemented(proto->destroy), 3653 proto_method_implemented(proto->shutdown), 3654 proto_method_implemented(proto->setsockopt), 3655 proto_method_implemented(proto->getsockopt), 3656 proto_method_implemented(proto->sendmsg), 3657 proto_method_implemented(proto->recvmsg), 3658 proto_method_implemented(proto->sendpage), 3659 proto_method_implemented(proto->bind), 3660 proto_method_implemented(proto->backlog_rcv), 3661 proto_method_implemented(proto->hash), 3662 proto_method_implemented(proto->unhash), 3663 proto_method_implemented(proto->get_port), 3664 proto_method_implemented(proto->enter_memory_pressure)); 3665} 3666 3667static int proto_seq_show(struct seq_file *seq, void *v) 3668{ 3669 if (v == &proto_list) 3670 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3671 "protocol", 3672 "size", 3673 "sockets", 3674 "memory", 3675 "press", 3676 "maxhdr", 3677 "slab", 3678 "module", 3679 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3680 else 3681 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3682 return 0; 3683} 3684 3685static const struct seq_operations proto_seq_ops = { 3686 .start = proto_seq_start, 3687 .next = proto_seq_next, 3688 .stop = proto_seq_stop, 3689 .show = proto_seq_show, 3690}; 3691 3692static __net_init int proto_init_net(struct net *net) 3693{ 3694 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3695 sizeof(struct seq_net_private))) 3696 return -ENOMEM; 3697 3698 return 0; 3699} 3700 3701static __net_exit void proto_exit_net(struct net *net) 3702{ 3703 remove_proc_entry("protocols", net->proc_net); 3704} 3705 3706 3707static __net_initdata struct pernet_operations proto_net_ops = { 3708 .init = proto_init_net, 3709 .exit = proto_exit_net, 3710}; 3711 3712static int __init proto_init(void) 3713{ 3714 return register_pernet_subsys(&proto_net_ops); 3715} 3716 3717subsys_initcall(proto_init); 3718 3719#endif /* PROC_FS */ 3720 3721#ifdef CONFIG_NET_RX_BUSY_POLL 3722bool sk_busy_loop_end(void *p, unsigned long start_time) 3723{ 3724 struct sock *sk = p; 3725 3726 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3727 sk_busy_loop_timeout(sk, start_time); 3728} 3729EXPORT_SYMBOL(sk_busy_loop_end); 3730#endif /* CONFIG_NET_RX_BUSY_POLL */ 3731 3732int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3733{ 3734 if (!sk->sk_prot->bind_add) 3735 return -EOPNOTSUPP; 3736 return sk->sk_prot->bind_add(sk, addr, addr_len); 3737} 3738EXPORT_SYMBOL(sock_bind_add); 3739