/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the AF_INET socket handler.
 *
 * Version:	@(#)sock.h	1.0.4	05/13/93
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche <flla@stud.uni-sb.de>
 *
 * Fixes:
 *		Alan Cox	:	Volatiles in skbuff pointers. See
 *					skbuff comments. May be overdone,
 *					better to prove they can be removed
 *					than the reverse.
 *		Alan Cox	:	Added a zapped field for tcp to note
 *					a socket is reset and must stay shut up
 *		Alan Cox	:	New fields for options
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Eliminate low level recv/recvfrom
 *		David S. Miller	:	New socket lookup architecture.
 *		Steve Whitehouse:	Default routines for sock_ops
 *		Arnaldo C. Melo :	removed net_pinfo, tp_pinfo and made
 *					protinfo be just a void pointer, as the
 *					protocol specific parts were moved to
 *					respective headers and ipv4/v6, etc now
 *					use private slabcaches for its socks
 *		Pedro Hortas	:	New flags field for socket options
 */
#ifndef _SOCK_H
#define _SOCK_H

#include <linux/hardirq.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/timer.h>
#include <linux/cache.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>	/* struct sk_buff */
#include <linux/mm.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/static_key.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>
#include <linux/rbtree.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
#include <linux/sockptr.h>

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
#include <net/l3mdev.h>
#ifdef CONFIG_NEWIP
#include <uapi/linux/nip_addr.h>
#endif

/*
 * This structure really needs to be cleaned up.
 * Most of it is for TCP, and not used by any of
 * the other protocols.
 */

/* Define this to get the SOCK_DBG debugging facility. */
#define SOCK_DEBUGGING
#ifdef SOCK_DEBUGGING
#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \
					printk(KERN_DEBUG msg); } while (0)
#else
/* Validate arguments and do nothing */
static inline __printf(2, 3)
void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
{
}
#endif

/* This is the per-socket lock.  The spinlock provides a synchronization
 * between user contexts and software interrupt processing, whereas the
 * mini-semaphore synchronizes multiple users amongst themselves.
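 *
 * A minimal usage sketch (illustrative, not part of the original header):
 * process context takes the mutex-like layer via lock_sock() and may sleep,
 * while softirq context may only take the underlying spinlock:
 *
 *	lock_sock(sk);		// process context: owns the socket, may sleep
 *	...			// modify socket state
 *	release_sock(sk);	// also processes any backlog queued meanwhile
 *
 *	bh_lock_sock(sk);	// softirq context: spinlock only, no sleeping
 *	...
 *	bh_unlock_sock(sk);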
97 */ 98typedef struct { 99 spinlock_t slock; 100 int owned; 101 wait_queue_head_t wq; 102 /* 103 * We express the mutex-alike socket_lock semantics 104 * to the lock validator by explicitly managing 105 * the slock as a lock variant (in addition to 106 * the slock itself): 107 */ 108#ifdef CONFIG_DEBUG_LOCK_ALLOC 109 struct lockdep_map dep_map; 110#endif 111} socket_lock_t; 112 113struct sock; 114struct proto; 115struct net; 116 117typedef __u32 __bitwise __portpair; 118typedef __u64 __bitwise __addrpair; 119 120/** 121 * struct sock_common - minimal network layer representation of sockets 122 * @skc_daddr: Foreign IPv4 addr 123 * @skc_rcv_saddr: Bound local IPv4 addr 124 * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr 125 * @skc_hash: hash value used with various protocol lookup tables 126 * @skc_u16hashes: two u16 hash values used by UDP lookup tables 127 * @skc_dport: placeholder for inet_dport/tw_dport 128 * @skc_num: placeholder for inet_num/tw_num 129 * @skc_portpair: __u32 union of @skc_dport & @skc_num 130 * @skc_family: network address family 131 * @skc_state: Connection state 132 * @skc_reuse: %SO_REUSEADDR setting 133 * @skc_reuseport: %SO_REUSEPORT setting 134 * @skc_ipv6only: socket is IPV6 only 135 * @skc_net_refcnt: socket is using net ref counting 136 * @skc_bound_dev_if: bound device index if != 0 137 * @skc_bind_node: bind hash linkage for various protocol lookup tables 138 * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol 139 * @skc_prot: protocol handlers inside a network family 140 * @skc_net: reference to the network namespace of this socket 141 * @skc_v6_daddr: IPV6 destination address 142 * @skc_v6_rcv_saddr: IPV6 source address 143 * @skc_cookie: socket's cookie value 144 * @skc_node: main hash linkage for various protocol lookup tables 145 * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol 146 * @skc_tx_queue_mapping: tx queue number for this connection 147 * @skc_rx_queue_mapping: rx queue number for this connection 148 * @skc_flags: place holder for sk_flags 149 * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, 150 * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings 151 * @skc_listener: connection request listener socket (aka rsk_listener) 152 * [union with @skc_flags] 153 * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row 154 * [union with @skc_flags] 155 * @skc_incoming_cpu: record/match cpu processing incoming packets 156 * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) 157 * [union with @skc_incoming_cpu] 158 * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number 159 * [union with @skc_incoming_cpu] 160 * @skc_refcnt: reference count 161 * 162 * This is the minimal network layer representation of sockets, the header 163 * for struct sock and struct inet_timewait_sock. 
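 *
 * Illustrative sketch (not taken from this header): because @skc_daddr and
 * @skc_rcv_saddr alias @skc_addrpair, and @skc_dport / @skc_num alias
 * @skc_portpair, established-table lookups can compare both addresses and
 * both ports with single wide loads, roughly:
 *
 *	if (sk->__sk_common.skc_addrpair == acookie &&
 *	    sk->__sk_common.skc_portpair == ports)
 *		...	// candidate match; still check family, netns, ifindex
 *
 * where acookie and ports are precomputed from the incoming packet and the
 * exact packing is endianness-dependent.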
164 */ 165struct sock_common { 166 union { 167 __addrpair skc_addrpair; 168 struct { 169 __be32 skc_daddr; 170 __be32 skc_rcv_saddr; 171 }; 172 }; 173 union { 174 unsigned int skc_hash; 175 __u16 skc_u16hashes[2]; 176 }; 177 /* skc_dport && skc_num must be grouped as well */ 178 union { 179 __portpair skc_portpair; 180 struct { 181 __be16 skc_dport; 182 __u16 skc_num; 183 }; 184 }; 185 186 unsigned short skc_family; 187 volatile unsigned char skc_state; 188 unsigned char skc_reuse:4; 189 unsigned char skc_reuseport:1; 190 unsigned char skc_ipv6only:1; 191 unsigned char skc_net_refcnt:1; 192 int skc_bound_dev_if; 193 union { 194 struct hlist_node skc_bind_node; 195 struct hlist_node skc_portaddr_node; 196 }; 197 struct proto *skc_prot; 198 possible_net_t skc_net; 199 200#if IS_ENABLED(CONFIG_IPV6) 201 struct in6_addr skc_v6_daddr; 202 struct in6_addr skc_v6_rcv_saddr; 203#endif 204 205#if IS_ENABLED(CONFIG_NEWIP) 206 struct nip_addr nip_daddr; /* NIP */ 207 struct nip_addr nip_rcv_saddr; /* NIP */ 208#endif 209 210 atomic64_t skc_cookie; 211 212 /* following fields are padding to force 213 * offset(struct sock, sk_refcnt) == 128 on 64bit arches 214 * assuming IPV6 is enabled. We use this padding differently 215 * for different kind of 'sockets' 216 */ 217 union { 218 unsigned long skc_flags; 219 struct sock *skc_listener; /* request_sock */ 220 struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ 221 }; 222 /* 223 * fields between dontcopy_begin/dontcopy_end 224 * are not copied in sock_copy() 225 */ 226 /* private: */ 227 int skc_dontcopy_begin[0]; 228 /* public: */ 229 union { 230 struct hlist_node skc_node; 231 struct hlist_nulls_node skc_nulls_node; 232 }; 233 unsigned short skc_tx_queue_mapping; 234#ifdef CONFIG_XPS 235 unsigned short skc_rx_queue_mapping; 236#endif 237 union { 238 int skc_incoming_cpu; 239 u32 skc_rcv_wnd; 240 u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */ 241 }; 242 243 refcount_t skc_refcnt; 244 /* private: */ 245 int skc_dontcopy_end[0]; 246 union { 247 u32 skc_rxhash; 248 u32 skc_window_clamp; 249 u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ 250 }; 251 /* public: */ 252}; 253 254struct bpf_local_storage; 255 256/** 257 * struct sock - network layer representation of sockets 258 * @__sk_common: shared layout with inet_timewait_sock 259 * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN 260 * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings 261 * @sk_lock: synchronizer 262 * @sk_kern_sock: True if sock is using kernel lock classes 263 * @sk_rcvbuf: size of receive buffer in bytes 264 * @sk_wq: sock wait queue and async head 265 * @sk_rx_dst: receive input route used by early demux 266 * @sk_dst_cache: destination cache 267 * @sk_dst_pending_confirm: need to confirm neighbour 268 * @sk_policy: flow policy 269 * @sk_rx_skb_cache: cache copy of recently accessed RX skb 270 * @sk_receive_queue: incoming packets 271 * @sk_wmem_alloc: transmit queue bytes committed 272 * @sk_tsq_flags: TCP Small Queues flags 273 * @sk_write_queue: Packet sending queue 274 * @sk_omem_alloc: "o" is "option" or "other" 275 * @sk_wmem_queued: persistent queue size 276 * @sk_forward_alloc: space allocated forward 277 * @sk_napi_id: id of the last napi context to receive data for sk 278 * @sk_ll_usec: usecs to busypoll when there is no data 279 * @sk_allocation: allocation mode 280 * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) 281 * @sk_pacing_status: Pacing status (requested, handled by sch_fq) 282 * @sk_max_pacing_rate: Maximum 
pacing rate (%SO_MAX_PACING_RATE) 283 * @sk_sndbuf: size of send buffer in bytes 284 * @__sk_flags_offset: empty field used to determine location of bitfield 285 * @sk_padding: unused element for alignment 286 * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets 287 * @sk_no_check_rx: allow zero checksum in RX packets 288 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) 289 * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) 290 * @sk_route_forced_caps: static, forced route capabilities 291 * (set in tcp_init_sock()) 292 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) 293 * @sk_gso_max_size: Maximum GSO segment size to build 294 * @sk_gso_max_segs: Maximum number of GSO segments 295 * @sk_pacing_shift: scaling factor for TCP Small Queues 296 * @sk_lingertime: %SO_LINGER l_linger setting 297 * @sk_backlog: always used with the per-socket spinlock held 298 * @sk_callback_lock: used with the callbacks in the end of this struct 299 * @sk_error_queue: rarely used 300 * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, 301 * IPV6_ADDRFORM for instance) 302 * @sk_err: last error 303 * @sk_err_soft: errors that don't cause failure but are the cause of a 304 * persistent failure not just 'timed out' 305 * @sk_drops: raw/udp drops counter 306 * @sk_ack_backlog: current listen backlog 307 * @sk_max_ack_backlog: listen backlog set in listen() 308 * @sk_uid: user id of owner 309 * @sk_priority: %SO_PRIORITY setting 310 * @sk_type: socket type (%SOCK_STREAM, etc) 311 * @sk_protocol: which protocol this socket belongs in this network family 312 * @sk_peer_pid: &struct pid for this socket's peer 313 * @sk_peer_cred: %SO_PEERCRED setting 314 * @sk_rcvlowat: %SO_RCVLOWAT setting 315 * @sk_rcvtimeo: %SO_RCVTIMEO setting 316 * @sk_sndtimeo: %SO_SNDTIMEO setting 317 * @sk_txhash: computed flow hash for use on transmit 318 * @sk_filter: socket filtering instructions 319 * @sk_timer: sock cleanup timer 320 * @sk_stamp: time stamp of last packet received 321 * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only 322 * @sk_tsflags: SO_TIMESTAMPING socket options 323 * @sk_tskey: counter to disambiguate concurrent tstamp requests 324 * @sk_zckey: counter to order MSG_ZEROCOPY notifications 325 * @sk_socket: Identd and reporting IO signals 326 * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. 327 * @sk_frag: cached page frag 328 * @sk_peek_off: current peek_offset value 329 * @sk_send_head: front of stuff to transmit 330 * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] 331 * @sk_tx_skb_cache: cache copy of recently accessed TX skb 332 * @sk_security: used by security modules 333 * @sk_mark: generic packet mark 334 * @sk_cgrp_data: cgroup data for this cgroup 335 * @sk_memcg: this socket's memory cgroup association 336 * @sk_write_pending: a write to stream socket waits to start 337 * @sk_wait_pending: number of threads blocked on this socket 338 * @sk_state_change: callback to indicate change in the state of the sock 339 * @sk_data_ready: callback to indicate there is data to be processed 340 * @sk_write_space: callback to indicate there is bf sending space available 341 * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) 342 * @sk_backlog_rcv: callback to process the backlog 343 * @sk_validate_xmit_skb: ptr to an optional validate function 344 * @sk_destruct: called at sock freeing time, i.e. 
when all refcnt == 0 345 * @sk_reuseport_cb: reuseport group container 346 * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage 347 * @sk_rcu: used during RCU grace period 348 * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) 349 * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME 350 * @sk_txtime_report_errors: set report errors mode for SO_TXTIME 351 * @sk_txtime_unused: unused txtime flags 352 */ 353struct sock { 354 /* 355 * Now struct inet_timewait_sock also uses sock_common, so please just 356 * don't add nothing before this first member (__sk_common) --acme 357 */ 358 struct sock_common __sk_common; 359#define sk_node __sk_common.skc_node 360#define sk_nulls_node __sk_common.skc_nulls_node 361#define sk_refcnt __sk_common.skc_refcnt 362#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping 363#ifdef CONFIG_XPS 364#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping 365#endif 366 367#define sk_dontcopy_begin __sk_common.skc_dontcopy_begin 368#define sk_dontcopy_end __sk_common.skc_dontcopy_end 369#define sk_hash __sk_common.skc_hash 370#define sk_portpair __sk_common.skc_portpair 371#define sk_num __sk_common.skc_num 372#define sk_dport __sk_common.skc_dport 373#define sk_addrpair __sk_common.skc_addrpair 374#define sk_daddr __sk_common.skc_daddr 375#define sk_rcv_saddr __sk_common.skc_rcv_saddr 376#define sk_family __sk_common.skc_family 377#define sk_state __sk_common.skc_state 378#define sk_reuse __sk_common.skc_reuse 379#define sk_reuseport __sk_common.skc_reuseport 380#define sk_ipv6only __sk_common.skc_ipv6only 381#define sk_net_refcnt __sk_common.skc_net_refcnt 382#define sk_bound_dev_if __sk_common.skc_bound_dev_if 383#define sk_bind_node __sk_common.skc_bind_node 384#define sk_prot __sk_common.skc_prot 385#define sk_net __sk_common.skc_net 386#define sk_v6_daddr __sk_common.skc_v6_daddr 387#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr 388#define sk_cookie __sk_common.skc_cookie 389#define sk_incoming_cpu __sk_common.skc_incoming_cpu 390#define sk_flags __sk_common.skc_flags 391#define sk_rxhash __sk_common.skc_rxhash 392 393 socket_lock_t sk_lock; 394 atomic_t sk_drops; 395 int sk_rcvlowat; 396 struct sk_buff_head sk_error_queue; 397 struct sk_buff *sk_rx_skb_cache; 398 struct sk_buff_head sk_receive_queue; 399 /* 400 * The backlog queue is special, it is always used with 401 * the per-socket spinlock held and requires low latency 402 * access. Therefore we special case it's implementation. 403 * Note : rmem_alloc is in this structure to fill a hole 404 * on 64bit arches, not because its logically part of 405 * backlog. 
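 *
 * Schematic of how the two sides typically interact (illustrative only):
 *
 *	softirq:	bh_lock_sock(sk);
 *			if (!sock_owned_by_user(sk))
 *				...process skb directly...
 *			else
 *				sk_add_backlog(sk, skb, limit);
 *			bh_unlock_sock(sk);
 *
 *	process:	release_sock(sk);	// drains sk_backlog via
 *						// sk_backlog_rcv()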
406 */ 407 struct { 408 atomic_t rmem_alloc; 409 int len; 410 struct sk_buff *head; 411 struct sk_buff *tail; 412 } sk_backlog; 413#define sk_rmem_alloc sk_backlog.rmem_alloc 414 415 int sk_forward_alloc; 416#ifdef CONFIG_NET_RX_BUSY_POLL 417 unsigned int sk_ll_usec; 418 /* ===== mostly read cache line ===== */ 419 unsigned int sk_napi_id; 420#endif 421 int sk_rcvbuf; 422 int sk_wait_pending; 423 424 struct sk_filter __rcu *sk_filter; 425 union { 426 struct socket_wq __rcu *sk_wq; 427 /* private: */ 428 struct socket_wq *sk_wq_raw; 429 /* public: */ 430 }; 431#ifdef CONFIG_XFRM 432 struct xfrm_policy __rcu *sk_policy[2]; 433#endif 434 struct dst_entry __rcu *sk_rx_dst; 435 struct dst_entry __rcu *sk_dst_cache; 436 atomic_t sk_omem_alloc; 437 int sk_sndbuf; 438 439 /* ===== cache line for TX ===== */ 440 int sk_wmem_queued; 441 refcount_t sk_wmem_alloc; 442 unsigned long sk_tsq_flags; 443 union { 444 struct sk_buff *sk_send_head; 445 struct rb_root tcp_rtx_queue; 446 }; 447 struct sk_buff *sk_tx_skb_cache; 448 struct sk_buff_head sk_write_queue; 449 __s32 sk_peek_off; 450 int sk_write_pending; 451 __u32 sk_dst_pending_confirm; 452 u32 sk_pacing_status; /* see enum sk_pacing */ 453 long sk_sndtimeo; 454 struct timer_list sk_timer; 455 __u32 sk_priority; 456 __u32 sk_mark; 457 unsigned long sk_pacing_rate; /* bytes per second */ 458 unsigned long sk_max_pacing_rate; 459 struct page_frag sk_frag; 460 netdev_features_t sk_route_caps; 461 netdev_features_t sk_route_nocaps; 462 netdev_features_t sk_route_forced_caps; 463 int sk_gso_type; 464 unsigned int sk_gso_max_size; 465 gfp_t sk_allocation; 466 __u32 sk_txhash; 467 468 /* 469 * Because of non atomicity rules, all 470 * changes are protected by socket lock. 471 */ 472 u8 sk_padding : 1, 473 sk_kern_sock : 1, 474 sk_no_check_tx : 1, 475 sk_no_check_rx : 1, 476 sk_userlocks : 4; 477 u8 sk_pacing_shift; 478 u16 sk_type; 479 u16 sk_protocol; 480 u16 sk_gso_max_segs; 481 unsigned long sk_lingertime; 482 struct proto *sk_prot_creator; 483 rwlock_t sk_callback_lock; 484 int sk_err, 485 sk_err_soft; 486 u32 sk_ack_backlog; 487 u32 sk_max_ack_backlog; 488 kuid_t sk_uid; 489 spinlock_t sk_peer_lock; 490 struct pid *sk_peer_pid; 491 const struct cred *sk_peer_cred; 492 493 long sk_rcvtimeo; 494 ktime_t sk_stamp; 495#if BITS_PER_LONG==32 496 seqlock_t sk_stamp_seq; 497#endif 498 u16 sk_tsflags; 499 u8 sk_shutdown; 500 u32 sk_tskey; 501 atomic_t sk_zckey; 502 503 u8 sk_clockid; 504 u8 sk_txtime_deadline_mode : 1, 505 sk_txtime_report_errors : 1, 506 sk_txtime_unused : 6; 507 508 struct socket *sk_socket; 509 void *sk_user_data; 510#ifdef CONFIG_SECURITY 511 void *sk_security; 512#endif 513 struct sock_cgroup_data sk_cgrp_data; 514 struct mem_cgroup *sk_memcg; 515 void (*sk_state_change)(struct sock *sk); 516 void (*sk_data_ready)(struct sock *sk); 517 void (*sk_write_space)(struct sock *sk); 518 void (*sk_error_report)(struct sock *sk); 519 int (*sk_backlog_rcv)(struct sock *sk, 520 struct sk_buff *skb); 521#ifdef CONFIG_SOCK_VALIDATE_XMIT 522 struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, 523 struct net_device *dev, 524 struct sk_buff *skb); 525#endif 526 void (*sk_destruct)(struct sock *sk); 527 struct sock_reuseport __rcu *sk_reuseport_cb; 528#ifdef CONFIG_BPF_SYSCALL 529 struct bpf_local_storage __rcu *sk_bpf_storage; 530#endif 531 struct rcu_head sk_rcu; 532}; 533 534enum sk_pacing { 535 SK_PACING_NONE = 0, 536 SK_PACING_NEEDED = 1, 537 SK_PACING_FQ = 2, 538}; 539 540/* flag bits in sk_user_data 541 * 542 * - SK_USER_DATA_NOCOPY: Pointer 
stored in sk_user_data might 543 * not be suitable for copying when cloning the socket. For instance, 544 * it can point to a reference counted object. sk_user_data bottom 545 * bit is set if pointer must not be copied. 546 * 547 * - SK_USER_DATA_BPF: Mark whether sk_user_data field is 548 * managed/owned by a BPF reuseport array. This bit should be set 549 * when sk_user_data's sk is added to the bpf's reuseport_array. 550 * 551 * - SK_USER_DATA_PSOCK: Mark whether pointer stored in 552 * sk_user_data points to psock type. This bit should be set 553 * when sk_user_data is assigned to a psock object. 554 */ 555#define SK_USER_DATA_NOCOPY 1UL 556#define SK_USER_DATA_BPF 2UL 557#define SK_USER_DATA_PSOCK 4UL 558#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ 559 SK_USER_DATA_PSOCK) 560 561/** 562 * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied 563 * @sk: socket 564 */ 565static inline bool sk_user_data_is_nocopy(const struct sock *sk) 566{ 567 return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); 568} 569 570#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) 571 572/** 573 * __rcu_dereference_sk_user_data_with_flags - return the pointer 574 * only if argument flags all has been set in sk_user_data. Otherwise 575 * return NULL 576 * 577 * @sk: socket 578 * @flags: flag bits 579 */ 580static inline void * 581__rcu_dereference_sk_user_data_with_flags(const struct sock *sk, 582 uintptr_t flags) 583{ 584 uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); 585 586 WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); 587 588 if ((sk_user_data & flags) == flags) 589 return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); 590 return NULL; 591} 592 593#define rcu_dereference_sk_user_data(sk) \ 594 __rcu_dereference_sk_user_data_with_flags(sk, 0) 595#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ 596({ \ 597 uintptr_t __tmp1 = (uintptr_t)(ptr), \ 598 __tmp2 = (uintptr_t)(flags); \ 599 WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ 600 WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ 601 rcu_assign_pointer(__sk_user_data((sk)), \ 602 __tmp1 | __tmp2); \ 603}) 604#define rcu_assign_sk_user_data(sk, ptr) \ 605 __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) 606 607/* 608 * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK 609 * or not whether his port will be reused by someone else. SK_FORCE_REUSE 610 * on a socket means that the socket will reuse everybody else's port 611 * without looking at the other's sk_reuse value. 
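 *
 * Illustrative mapping (schematic, not lifted from this file): %SO_REUSEADDR
 * is what normally drives this field, along the lines of
 *
 *	sk->sk_reuse = valbool ? SK_CAN_REUSE : SK_NO_REUSE;
 *
 * while SK_FORCE_REUSE is meant for kernel-internal sockets that must bind
 * regardless of what other sockets request.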
612 */ 613 614#define SK_NO_REUSE 0 615#define SK_CAN_REUSE 1 616#define SK_FORCE_REUSE 2 617 618int sk_set_peek_off(struct sock *sk, int val); 619 620static inline int sk_peek_offset(struct sock *sk, int flags) 621{ 622 if (unlikely(flags & MSG_PEEK)) { 623 return READ_ONCE(sk->sk_peek_off); 624 } 625 626 return 0; 627} 628 629static inline void sk_peek_offset_bwd(struct sock *sk, int val) 630{ 631 s32 off = READ_ONCE(sk->sk_peek_off); 632 633 if (unlikely(off >= 0)) { 634 off = max_t(s32, off - val, 0); 635 WRITE_ONCE(sk->sk_peek_off, off); 636 } 637} 638 639static inline void sk_peek_offset_fwd(struct sock *sk, int val) 640{ 641 sk_peek_offset_bwd(sk, -val); 642} 643 644/* 645 * Hashed lists helper routines 646 */ 647static inline struct sock *sk_entry(const struct hlist_node *node) 648{ 649 return hlist_entry(node, struct sock, sk_node); 650} 651 652static inline struct sock *__sk_head(const struct hlist_head *head) 653{ 654 return hlist_entry(head->first, struct sock, sk_node); 655} 656 657static inline struct sock *sk_head(const struct hlist_head *head) 658{ 659 return hlist_empty(head) ? NULL : __sk_head(head); 660} 661 662static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) 663{ 664 return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); 665} 666 667static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) 668{ 669 return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); 670} 671 672static inline struct sock *sk_next(const struct sock *sk) 673{ 674 return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); 675} 676 677static inline struct sock *sk_nulls_next(const struct sock *sk) 678{ 679 return (!is_a_nulls(sk->sk_nulls_node.next)) ? 680 hlist_nulls_entry(sk->sk_nulls_node.next, 681 struct sock, sk_nulls_node) : 682 NULL; 683} 684 685static inline bool sk_unhashed(const struct sock *sk) 686{ 687 return hlist_unhashed(&sk->sk_node); 688} 689 690static inline bool sk_hashed(const struct sock *sk) 691{ 692 return !sk_unhashed(sk); 693} 694 695static inline void sk_node_init(struct hlist_node *node) 696{ 697 node->pprev = NULL; 698} 699 700static inline void sk_nulls_node_init(struct hlist_nulls_node *node) 701{ 702 node->pprev = NULL; 703} 704 705static inline void __sk_del_node(struct sock *sk) 706{ 707 __hlist_del(&sk->sk_node); 708} 709 710/* NB: equivalent to hlist_del_init_rcu */ 711static inline bool __sk_del_node_init(struct sock *sk) 712{ 713 if (sk_hashed(sk)) { 714 __sk_del_node(sk); 715 sk_node_init(&sk->sk_node); 716 return true; 717 } 718 return false; 719} 720 721/* Grab socket reference count. This operation is valid only 722 when sk is ALREADY grabbed f.e. it is found in hash table 723 or a list and the lookup is made under lock preventing hash table 724 modifications. 725 */ 726 727static __always_inline void sock_hold(struct sock *sk) 728{ 729 refcount_inc(&sk->sk_refcnt); 730} 731 732/* Ungrab socket in the context, which assumes that socket refcnt 733 cannot hit zero, f.e. it is true in context of any socketcall. 
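 *
 * A typical hold/put pairing (illustrative sketch, not from this file):
 *
 *	rcu_read_lock();
 *	sk = ...lookup in a hash table...;
 *	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
 *		sk = NULL;		// lost the race with sk_free()
 *	rcu_read_unlock();
 *	...
 *	if (sk)
 *		sock_put(sk);		// may free the socket
 *
 * __sock_put() is only safe where an additional reference is known to exist.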
734 */ 735static __always_inline void __sock_put(struct sock *sk) 736{ 737 refcount_dec(&sk->sk_refcnt); 738} 739 740static inline bool sk_del_node_init(struct sock *sk) 741{ 742 bool rc = __sk_del_node_init(sk); 743 744 if (rc) { 745 /* paranoid for a while -acme */ 746 WARN_ON(refcount_read(&sk->sk_refcnt) == 1); 747 __sock_put(sk); 748 } 749 return rc; 750} 751#define sk_del_node_init_rcu(sk) sk_del_node_init(sk) 752 753static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk) 754{ 755 if (sk_hashed(sk)) { 756 hlist_nulls_del_init_rcu(&sk->sk_nulls_node); 757 return true; 758 } 759 return false; 760} 761 762static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) 763{ 764 bool rc = __sk_nulls_del_node_init_rcu(sk); 765 766 if (rc) { 767 /* paranoid for a while -acme */ 768 WARN_ON(refcount_read(&sk->sk_refcnt) == 1); 769 __sock_put(sk); 770 } 771 return rc; 772} 773 774static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) 775{ 776 hlist_add_head(&sk->sk_node, list); 777} 778 779static inline void sk_add_node(struct sock *sk, struct hlist_head *list) 780{ 781 sock_hold(sk); 782 __sk_add_node(sk, list); 783} 784 785static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) 786{ 787 sock_hold(sk); 788 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 789 sk->sk_family == AF_INET6) 790 hlist_add_tail_rcu(&sk->sk_node, list); 791 else 792 hlist_add_head_rcu(&sk->sk_node, list); 793} 794 795static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list) 796{ 797 sock_hold(sk); 798 hlist_add_tail_rcu(&sk->sk_node, list); 799} 800 801static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 802{ 803 hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); 804} 805 806static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) 807{ 808 hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); 809} 810 811static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 812{ 813 sock_hold(sk); 814 __sk_nulls_add_node_rcu(sk, list); 815} 816 817static inline void __sk_del_bind_node(struct sock *sk) 818{ 819 __hlist_del(&sk->sk_bind_node); 820} 821 822static inline void sk_add_bind_node(struct sock *sk, 823 struct hlist_head *list) 824{ 825 hlist_add_head(&sk->sk_bind_node, list); 826} 827 828#define sk_for_each(__sk, list) \ 829 hlist_for_each_entry(__sk, list, sk_node) 830#define sk_for_each_rcu(__sk, list) \ 831 hlist_for_each_entry_rcu(__sk, list, sk_node) 832#define sk_nulls_for_each(__sk, node, list) \ 833 hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) 834#define sk_nulls_for_each_rcu(__sk, node, list) \ 835 hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) 836#define sk_for_each_from(__sk) \ 837 hlist_for_each_entry_from(__sk, sk_node) 838#define sk_nulls_for_each_from(__sk, node) \ 839 if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ 840 hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) 841#define sk_for_each_safe(__sk, tmp, list) \ 842 hlist_for_each_entry_safe(__sk, tmp, list, sk_node) 843#define sk_for_each_bound(__sk, list) \ 844 hlist_for_each_entry(__sk, list, sk_bind_node) 845 846/** 847 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset 848 * @tpos: the type * to use as a loop cursor. 849 * @pos: the &struct hlist_node to use as a loop cursor. 850 * @head: the head for your list. 851 * @offset: offset of hlist_node within the struct. 
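 *
 * Illustrative use (hypothetical caller, RCU read-side lock already held):
 *
 *	struct hlist_node *node;
 *	struct sock *sk;
 *
 *	sk_for_each_entry_offset_rcu(sk, node, head,
 *				     offsetof(struct sock, sk_node))
 *		if (sk->sk_family == AF_INET)
 *			...;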
852 * 853 */ 854#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ 855 for (pos = rcu_dereference(hlist_first_rcu(head)); \ 856 pos != NULL && \ 857 ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ 858 pos = rcu_dereference(hlist_next_rcu(pos))) 859 860static inline struct user_namespace *sk_user_ns(struct sock *sk) 861{ 862 /* Careful only use this in a context where these parameters 863 * can not change and must all be valid, such as recvmsg from 864 * userspace. 865 */ 866 return sk->sk_socket->file->f_cred->user_ns; 867} 868 869/* Sock flags */ 870enum sock_flags { 871 SOCK_DEAD, 872 SOCK_DONE, 873 SOCK_URGINLINE, 874 SOCK_KEEPOPEN, 875 SOCK_LINGER, 876 SOCK_DESTROY, 877 SOCK_BROADCAST, 878 SOCK_TIMESTAMP, 879 SOCK_ZAPPED, 880 SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ 881 SOCK_DBG, /* %SO_DEBUG setting */ 882 SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ 883 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ 884 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ 885 SOCK_MEMALLOC, /* VM depends on this socket for swapping */ 886 SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ 887 SOCK_FASYNC, /* fasync() active */ 888 SOCK_RXQ_OVFL, 889 SOCK_ZEROCOPY, /* buffers from userspace */ 890 SOCK_WIFI_STATUS, /* push wifi status to userspace */ 891 SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS. 892 * Will use last 4 bytes of packet sent from 893 * user-space instead. 894 */ 895 SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ 896 SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ 897 SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ 898 SOCK_TXTIME, 899 SOCK_XDP, /* XDP is attached */ 900 SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ 901}; 902 903#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) 904 905static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) 906{ 907 nsk->sk_flags = osk->sk_flags; 908} 909 910static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) 911{ 912 __set_bit(flag, &sk->sk_flags); 913} 914 915static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) 916{ 917 __clear_bit(flag, &sk->sk_flags); 918} 919 920static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, 921 int valbool) 922{ 923 if (valbool) 924 sock_set_flag(sk, bit); 925 else 926 sock_reset_flag(sk, bit); 927} 928 929static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) 930{ 931 return test_bit(flag, &sk->sk_flags); 932} 933 934#ifdef CONFIG_NET 935DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); 936static inline int sk_memalloc_socks(void) 937{ 938 return static_branch_unlikely(&memalloc_socks_key); 939} 940 941void __receive_sock(struct file *file); 942#else 943 944static inline int sk_memalloc_socks(void) 945{ 946 return 0; 947} 948 949static inline void __receive_sock(struct file *file) 950{ } 951#endif 952 953static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) 954{ 955 return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); 956} 957 958static inline void sk_acceptq_removed(struct sock *sk) 959{ 960 WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); 961} 962 963static inline void sk_acceptq_added(struct sock *sk) 964{ 965 WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); 966} 967 968static inline bool sk_acceptq_is_full(const struct sock *sk) 969{ 970 return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); 971} 
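/* Illustrative sketch, not part of the original header: how a caller might
 * combine the flag and accept-queue helpers above. The function name is
 * hypothetical.
 */
static inline bool example_listener_is_overloaded(const struct sock *sk)
{
	/* Accept queue full, or the listener is already being torn down. */
	return sk_acceptq_is_full(sk) || sock_flag(sk, SOCK_DEAD);
}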
972 973/* 974 * Compute minimal free write space needed to queue new packets. 975 */ 976static inline int sk_stream_min_wspace(const struct sock *sk) 977{ 978 return READ_ONCE(sk->sk_wmem_queued) >> 1; 979} 980 981static inline int sk_stream_wspace(const struct sock *sk) 982{ 983 return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); 984} 985 986static inline void sk_wmem_queued_add(struct sock *sk, int val) 987{ 988 WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); 989} 990 991void sk_stream_write_space(struct sock *sk); 992 993/* OOB backlog add */ 994static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) 995{ 996 /* dont let skb dst not refcounted, we are going to leave rcu lock */ 997 skb_dst_force(skb); 998 999 if (!sk->sk_backlog.tail) 1000 WRITE_ONCE(sk->sk_backlog.head, skb); 1001 else 1002 sk->sk_backlog.tail->next = skb; 1003 1004 WRITE_ONCE(sk->sk_backlog.tail, skb); 1005 skb->next = NULL; 1006} 1007 1008/* 1009 * Take into account size of receive queue and backlog queue 1010 * Do not take into account this skb truesize, 1011 * to allow even a single big packet to come. 1012 */ 1013static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit) 1014{ 1015 unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); 1016 1017 return qsize > limit; 1018} 1019 1020/* The per-socket spinlock must be held here. */ 1021static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, 1022 unsigned int limit) 1023{ 1024 if (sk_rcvqueues_full(sk, limit)) 1025 return -ENOBUFS; 1026 1027 /* 1028 * If the skb was allocated from pfmemalloc reserves, only 1029 * allow SOCK_MEMALLOC sockets to use it as this socket is 1030 * helping free memory 1031 */ 1032 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) 1033 return -ENOMEM; 1034 1035 __sk_add_backlog(sk, skb); 1036 sk->sk_backlog.len += skb->truesize; 1037 return 0; 1038} 1039 1040int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); 1041 1042static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 1043{ 1044 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) 1045 return __sk_backlog_rcv(sk, skb); 1046 1047 return sk->sk_backlog_rcv(sk, skb); 1048} 1049 1050static inline void sk_incoming_cpu_update(struct sock *sk) 1051{ 1052 int cpu = raw_smp_processor_id(); 1053 1054 if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu)) 1055 WRITE_ONCE(sk->sk_incoming_cpu, cpu); 1056} 1057 1058static inline void sock_rps_record_flow_hash(__u32 hash) 1059{ 1060#ifdef CONFIG_RPS 1061 struct rps_sock_flow_table *sock_flow_table; 1062 1063 rcu_read_lock(); 1064 sock_flow_table = rcu_dereference(rps_sock_flow_table); 1065 rps_record_sock_flow(sock_flow_table, hash); 1066 rcu_read_unlock(); 1067#endif 1068} 1069 1070static inline void sock_rps_record_flow(const struct sock *sk) 1071{ 1072#ifdef CONFIG_RPS 1073 if (static_branch_unlikely(&rfs_needed)) { 1074 /* Reading sk->sk_rxhash might incur an expensive cache line 1075 * miss. 1076 * 1077 * TCP_ESTABLISHED does cover almost all states where RFS 1078 * might be useful, and is cheaper [1] than testing : 1079 * IPv4: inet_sk(sk)->inet_daddr 1080 * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) 1081 * OR an additional socket flag 1082 * [1] : sk_state and sk_prot are in the same cache line. 1083 */ 1084 if (sk->sk_state == TCP_ESTABLISHED) { 1085 /* This READ_ONCE() is paired with the WRITE_ONCE() 1086 * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). 
1087 */ 1088 sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); 1089 } 1090 } 1091#endif 1092} 1093 1094static inline void sock_rps_save_rxhash(struct sock *sk, 1095 const struct sk_buff *skb) 1096{ 1097#ifdef CONFIG_RPS 1098 /* The following WRITE_ONCE() is paired with the READ_ONCE() 1099 * here, and another one in sock_rps_record_flow(). 1100 */ 1101 if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash)) 1102 WRITE_ONCE(sk->sk_rxhash, skb->hash); 1103#endif 1104} 1105 1106static inline void sock_rps_reset_rxhash(struct sock *sk) 1107{ 1108#ifdef CONFIG_RPS 1109 /* Paired with READ_ONCE() in sock_rps_record_flow() */ 1110 WRITE_ONCE(sk->sk_rxhash, 0); 1111#endif 1112} 1113 1114#define sk_wait_event(__sk, __timeo, __condition, __wait) \ 1115 ({ int __rc; \ 1116 __sk->sk_wait_pending++; \ 1117 release_sock(__sk); \ 1118 __rc = __condition; \ 1119 if (!__rc) { \ 1120 *(__timeo) = wait_woken(__wait, \ 1121 TASK_INTERRUPTIBLE, \ 1122 *(__timeo)); \ 1123 } \ 1124 sched_annotate_sleep(); \ 1125 lock_sock(__sk); \ 1126 __sk->sk_wait_pending--; \ 1127 __rc = __condition; \ 1128 __rc; \ 1129 }) 1130 1131int sk_stream_wait_connect(struct sock *sk, long *timeo_p); 1132int sk_stream_wait_memory(struct sock *sk, long *timeo_p); 1133void sk_stream_wait_close(struct sock *sk, long timeo_p); 1134int sk_stream_error(struct sock *sk, int flags, int err); 1135void sk_stream_kill_queues(struct sock *sk); 1136void sk_set_memalloc(struct sock *sk); 1137void sk_clear_memalloc(struct sock *sk); 1138 1139void __sk_flush_backlog(struct sock *sk); 1140 1141static inline bool sk_flush_backlog(struct sock *sk) 1142{ 1143 if (unlikely(READ_ONCE(sk->sk_backlog.tail))) { 1144 __sk_flush_backlog(sk); 1145 return true; 1146 } 1147 return false; 1148} 1149 1150int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb); 1151 1152struct request_sock_ops; 1153struct timewait_sock_ops; 1154struct inet_hashinfo; 1155struct raw_hashinfo; 1156struct smc_hashinfo; 1157struct module; 1158 1159/* 1160 * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes 1161 * un-modified. Special care is taken when initializing object to zero. 1162 */ 1163static inline void sk_prot_clear_nulls(struct sock *sk, int size) 1164{ 1165 if (offsetof(struct sock, sk_node.next) != 0) 1166 memset(sk, 0, offsetof(struct sock, sk_node.next)); 1167 memset(&sk->sk_node.pprev, 0, 1168 size - offsetof(struct sock, sk_node.pprev)); 1169} 1170 1171/* Networking protocol blocks we attach to sockets. 
1172 * socket layer -> transport layer interface 1173 */ 1174struct proto { 1175 void (*close)(struct sock *sk, 1176 long timeout); 1177 int (*pre_connect)(struct sock *sk, 1178 struct sockaddr *uaddr, 1179 int addr_len); 1180 int (*connect)(struct sock *sk, 1181 struct sockaddr *uaddr, 1182 int addr_len); 1183 int (*disconnect)(struct sock *sk, int flags); 1184 1185 struct sock * (*accept)(struct sock *sk, int flags, int *err, 1186 bool kern); 1187 1188 int (*ioctl)(struct sock *sk, int cmd, 1189 unsigned long arg); 1190 int (*init)(struct sock *sk); 1191 void (*destroy)(struct sock *sk); 1192 void (*shutdown)(struct sock *sk, int how); 1193 int (*setsockopt)(struct sock *sk, int level, 1194 int optname, sockptr_t optval, 1195 unsigned int optlen); 1196 int (*getsockopt)(struct sock *sk, int level, 1197 int optname, char __user *optval, 1198 int __user *option); 1199 void (*keepalive)(struct sock *sk, int valbool); 1200#ifdef CONFIG_COMPAT 1201 int (*compat_ioctl)(struct sock *sk, 1202 unsigned int cmd, unsigned long arg); 1203#endif 1204 int (*sendmsg)(struct sock *sk, struct msghdr *msg, 1205 size_t len); 1206 int (*recvmsg)(struct sock *sk, struct msghdr *msg, 1207 size_t len, int noblock, int flags, 1208 int *addr_len); 1209 int (*sendpage)(struct sock *sk, struct page *page, 1210 int offset, size_t size, int flags); 1211 int (*bind)(struct sock *sk, 1212 struct sockaddr *addr, int addr_len); 1213 int (*bind_add)(struct sock *sk, 1214 struct sockaddr *addr, int addr_len); 1215 1216 int (*backlog_rcv) (struct sock *sk, 1217 struct sk_buff *skb); 1218 bool (*bpf_bypass_getsockopt)(int level, 1219 int optname); 1220 1221 void (*release_cb)(struct sock *sk); 1222 1223 /* Keeping track of sk's, looking them up, and port selection methods. */ 1224 int (*hash)(struct sock *sk); 1225 void (*unhash)(struct sock *sk); 1226 void (*rehash)(struct sock *sk); 1227 int (*get_port)(struct sock *sk, unsigned short snum); 1228 1229 /* Keeping track of sockets in use */ 1230#ifdef CONFIG_PROC_FS 1231 unsigned int inuse_idx; 1232#endif 1233 1234 bool (*stream_memory_free)(const struct sock *sk, int wake); 1235 bool (*stream_memory_read)(const struct sock *sk); 1236 /* Memory pressure */ 1237 void (*enter_memory_pressure)(struct sock *sk); 1238 void (*leave_memory_pressure)(struct sock *sk); 1239 atomic_long_t *memory_allocated; /* Current allocated memory. */ 1240 struct percpu_counter *sockets_allocated; /* Current number of sockets. */ 1241 /* 1242 * Pressure flag: try to collapse. 1243 * Technical note: it is used by multiple contexts non atomically. 1244 * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. 1245 * All the __sk_mem_schedule() is of this nature: accounting 1246 * is strict, actions are advisory and have some latency. 
1247 */ 1248 unsigned long *memory_pressure; 1249 long *sysctl_mem; 1250 1251 int *sysctl_wmem; 1252 int *sysctl_rmem; 1253 u32 sysctl_wmem_offset; 1254 u32 sysctl_rmem_offset; 1255 1256 int max_header; 1257 bool no_autobind; 1258 1259 struct kmem_cache *slab; 1260 unsigned int obj_size; 1261 slab_flags_t slab_flags; 1262 unsigned int useroffset; /* Usercopy region offset */ 1263 unsigned int usersize; /* Usercopy region size */ 1264 1265 unsigned int __percpu *orphan_count; 1266 1267 struct request_sock_ops *rsk_prot; 1268 struct timewait_sock_ops *twsk_prot; 1269 1270 union { 1271 struct inet_hashinfo *hashinfo; 1272 struct udp_table *udp_table; 1273 struct raw_hashinfo *raw_hash; 1274 struct smc_hashinfo *smc_hash; 1275 } h; 1276 1277 struct module *owner; 1278 1279 char name[32]; 1280 1281 struct list_head node; 1282#ifdef SOCK_REFCNT_DEBUG 1283 atomic_t socks; 1284#endif 1285 int (*diag_destroy)(struct sock *sk, int err); 1286} __randomize_layout; 1287 1288int proto_register(struct proto *prot, int alloc_slab); 1289void proto_unregister(struct proto *prot); 1290int sock_load_diag_module(int family, int protocol); 1291 1292#ifdef SOCK_REFCNT_DEBUG 1293static inline void sk_refcnt_debug_inc(struct sock *sk) 1294{ 1295 atomic_inc(&sk->sk_prot->socks); 1296} 1297 1298static inline void sk_refcnt_debug_dec(struct sock *sk) 1299{ 1300 atomic_dec(&sk->sk_prot->socks); 1301 printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", 1302 sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); 1303} 1304 1305static inline void sk_refcnt_debug_release(const struct sock *sk) 1306{ 1307 if (refcount_read(&sk->sk_refcnt) != 1) 1308 printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", 1309 sk->sk_prot->name, sk, refcount_read(&sk->sk_refcnt)); 1310} 1311#else /* SOCK_REFCNT_DEBUG */ 1312#define sk_refcnt_debug_inc(sk) do { } while (0) 1313#define sk_refcnt_debug_dec(sk) do { } while (0) 1314#define sk_refcnt_debug_release(sk) do { } while (0) 1315#endif /* SOCK_REFCNT_DEBUG */ 1316 1317static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) 1318{ 1319 if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) 1320 return false; 1321 1322 return sk->sk_prot->stream_memory_free ? 
1323 sk->sk_prot->stream_memory_free(sk, wake) : true; 1324} 1325 1326static inline bool sk_stream_memory_free(const struct sock *sk) 1327{ 1328 return __sk_stream_memory_free(sk, 0); 1329} 1330 1331static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) 1332{ 1333 return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && 1334 __sk_stream_memory_free(sk, wake); 1335} 1336 1337static inline bool sk_stream_is_writeable(const struct sock *sk) 1338{ 1339 return __sk_stream_is_writeable(sk, 0); 1340} 1341 1342static inline int sk_under_cgroup_hierarchy(struct sock *sk, 1343 struct cgroup *ancestor) 1344{ 1345#ifdef CONFIG_SOCK_CGROUP_DATA 1346 return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), 1347 ancestor); 1348#else 1349 return -ENOTSUPP; 1350#endif 1351} 1352 1353static inline bool sk_has_memory_pressure(const struct sock *sk) 1354{ 1355 return sk->sk_prot->memory_pressure != NULL; 1356} 1357 1358static inline bool sk_under_global_memory_pressure(const struct sock *sk) 1359{ 1360 return sk->sk_prot->memory_pressure && 1361 !!READ_ONCE(*sk->sk_prot->memory_pressure); 1362} 1363 1364static inline bool sk_under_memory_pressure(const struct sock *sk) 1365{ 1366 if (!sk->sk_prot->memory_pressure) 1367 return false; 1368 1369 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 1370 mem_cgroup_under_socket_pressure(sk->sk_memcg)) 1371 return true; 1372 1373 return !!READ_ONCE(*sk->sk_prot->memory_pressure); 1374} 1375 1376static inline long 1377sk_memory_allocated(const struct sock *sk) 1378{ 1379 return atomic_long_read(sk->sk_prot->memory_allocated); 1380} 1381 1382static inline long 1383sk_memory_allocated_add(struct sock *sk, int amt) 1384{ 1385 return atomic_long_add_return(amt, sk->sk_prot->memory_allocated); 1386} 1387 1388static inline void 1389sk_memory_allocated_sub(struct sock *sk, int amt) 1390{ 1391 atomic_long_sub(amt, sk->sk_prot->memory_allocated); 1392} 1393 1394static inline void sk_sockets_allocated_dec(struct sock *sk) 1395{ 1396 percpu_counter_dec(sk->sk_prot->sockets_allocated); 1397} 1398 1399static inline void sk_sockets_allocated_inc(struct sock *sk) 1400{ 1401 percpu_counter_inc(sk->sk_prot->sockets_allocated); 1402} 1403 1404static inline u64 1405sk_sockets_allocated_read_positive(struct sock *sk) 1406{ 1407 return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); 1408} 1409 1410static inline int 1411proto_sockets_allocated_sum_positive(struct proto *prot) 1412{ 1413 return percpu_counter_sum_positive(prot->sockets_allocated); 1414} 1415 1416static inline long 1417proto_memory_allocated(struct proto *prot) 1418{ 1419 return atomic_long_read(prot->memory_allocated); 1420} 1421 1422static inline bool 1423proto_memory_pressure(struct proto *prot) 1424{ 1425 if (!prot->memory_pressure) 1426 return false; 1427 return !!READ_ONCE(*prot->memory_pressure); 1428} 1429 1430 1431#ifdef CONFIG_PROC_FS 1432/* Called with local bh disabled */ 1433void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); 1434int sock_prot_inuse_get(struct net *net, struct proto *proto); 1435int sock_inuse_get(struct net *net); 1436#else 1437static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, 1438 int inc) 1439{ 1440} 1441#endif 1442 1443 1444/* With per-bucket locks this operation is not-atomic, so that 1445 * this version is not worse. 
1446 */ 1447static inline int __sk_prot_rehash(struct sock *sk) 1448{ 1449 sk->sk_prot->unhash(sk); 1450 return sk->sk_prot->hash(sk); 1451} 1452 1453/* About 10 seconds */ 1454#define SOCK_DESTROY_TIME (10*HZ) 1455 1456/* Sockets 0-1023 can't be bound to unless you are superuser */ 1457#define PROT_SOCK 1024 1458 1459#define SHUTDOWN_MASK 3 1460#define RCV_SHUTDOWN 1 1461#define SEND_SHUTDOWN 2 1462 1463#define SOCK_SNDBUF_LOCK 1 1464#define SOCK_RCVBUF_LOCK 2 1465#define SOCK_BINDADDR_LOCK 4 1466#define SOCK_BINDPORT_LOCK 8 1467 1468struct socket_alloc { 1469 struct socket socket; 1470 struct inode vfs_inode; 1471}; 1472 1473static inline struct socket *SOCKET_I(struct inode *inode) 1474{ 1475 return &container_of(inode, struct socket_alloc, vfs_inode)->socket; 1476} 1477 1478static inline struct inode *SOCK_INODE(struct socket *socket) 1479{ 1480 return &container_of(socket, struct socket_alloc, socket)->vfs_inode; 1481} 1482 1483/* 1484 * Functions for memory accounting 1485 */ 1486int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind); 1487int __sk_mem_schedule(struct sock *sk, int size, int kind); 1488void __sk_mem_reduce_allocated(struct sock *sk, int amount); 1489void __sk_mem_reclaim(struct sock *sk, int amount); 1490 1491/* We used to have PAGE_SIZE here, but systems with 64KB pages 1492 * do not necessarily have 16x time more memory than 4KB ones. 1493 */ 1494#define SK_MEM_QUANTUM 4096 1495#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) 1496#define SK_MEM_SEND 0 1497#define SK_MEM_RECV 1 1498 1499/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */ 1500static inline long sk_prot_mem_limits(const struct sock *sk, int index) 1501{ 1502 long val = READ_ONCE(sk->sk_prot->sysctl_mem[index]); 1503 1504#if PAGE_SIZE > SK_MEM_QUANTUM 1505 val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT; 1506#elif PAGE_SIZE < SK_MEM_QUANTUM 1507 val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT; 1508#endif 1509 return val; 1510} 1511 1512static inline int sk_mem_pages(int amt) 1513{ 1514 return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; 1515} 1516 1517static inline bool sk_has_account(struct sock *sk) 1518{ 1519 /* return true if protocol supports memory accounting */ 1520 return !!sk->sk_prot->memory_allocated; 1521} 1522 1523static inline bool sk_wmem_schedule(struct sock *sk, int size) 1524{ 1525 int delta; 1526 1527 if (!sk_has_account(sk)) 1528 return true; 1529 delta = size - sk->sk_forward_alloc; 1530 return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND); 1531} 1532 1533static inline bool 1534sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) 1535{ 1536 int delta; 1537 1538 if (!sk_has_account(sk)) 1539 return true; 1540 delta = size - sk->sk_forward_alloc; 1541 return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || 1542 skb_pfmemalloc(skb); 1543} 1544 1545static inline void sk_mem_reclaim(struct sock *sk) 1546{ 1547 if (!sk_has_account(sk)) 1548 return; 1549 if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) 1550 __sk_mem_reclaim(sk, sk->sk_forward_alloc); 1551} 1552 1553static inline void sk_mem_reclaim_partial(struct sock *sk) 1554{ 1555 if (!sk_has_account(sk)) 1556 return; 1557 if (sk->sk_forward_alloc > SK_MEM_QUANTUM) 1558 __sk_mem_reclaim(sk, sk->sk_forward_alloc - 1); 1559} 1560 1561static inline void sk_mem_charge(struct sock *sk, int size) 1562{ 1563 if (!sk_has_account(sk)) 1564 return; 1565 sk->sk_forward_alloc -= size; 1566} 1567 1568static inline void sk_mem_uncharge(struct sock *sk, int size) 1569{ 1570 if 
(!sk_has_account(sk)) 1571 return; 1572 sk->sk_forward_alloc += size; 1573 1574 /* Avoid a possible overflow. 1575 * TCP send queues can make this happen, if sk_mem_reclaim() 1576 * is not called and more than 2 GBytes are released at once. 1577 * 1578 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is 1579 * no need to hold that much forward allocation anyway. 1580 */ 1581 if (unlikely(sk->sk_forward_alloc >= 1 << 21)) 1582 __sk_mem_reclaim(sk, 1 << 20); 1583} 1584 1585DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); 1586static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) 1587{ 1588 sk_wmem_queued_add(sk, -skb->truesize); 1589 sk_mem_uncharge(sk, skb->truesize); 1590 if (static_branch_unlikely(&tcp_tx_skb_cache_key) && 1591 !sk->sk_tx_skb_cache && !skb_cloned(skb)) { 1592 skb_ext_reset(skb); 1593 skb_zcopy_clear(skb, true); 1594 sk->sk_tx_skb_cache = skb; 1595 return; 1596 } 1597 __kfree_skb(skb); 1598} 1599 1600static inline void sock_release_ownership(struct sock *sk) 1601{ 1602 if (sk->sk_lock.owned) { 1603 sk->sk_lock.owned = 0; 1604 1605 /* The sk_lock has mutex_unlock() semantics: */ 1606 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 1607 } 1608} 1609 1610/* 1611 * Macro so as to not evaluate some arguments when 1612 * lockdep is not enabled. 1613 * 1614 * Mark both the sk_lock and the sk_lock.slock as a 1615 * per-address-family lock class. 1616 */ 1617#define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ 1618do { \ 1619 sk->sk_lock.owned = 0; \ 1620 init_waitqueue_head(&sk->sk_lock.wq); \ 1621 spin_lock_init(&(sk)->sk_lock.slock); \ 1622 debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ 1623 sizeof((sk)->sk_lock)); \ 1624 lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ 1625 (skey), (sname)); \ 1626 lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ 1627} while (0) 1628 1629#ifdef CONFIG_LOCKDEP 1630static inline bool lockdep_sock_is_held(const struct sock *sk) 1631{ 1632 return lockdep_is_held(&sk->sk_lock) || 1633 lockdep_is_held(&sk->sk_lock.slock); 1634} 1635#endif 1636 1637void lock_sock_nested(struct sock *sk, int subclass); 1638 1639static inline void lock_sock(struct sock *sk) 1640{ 1641 lock_sock_nested(sk, 0); 1642} 1643 1644void __release_sock(struct sock *sk); 1645void release_sock(struct sock *sk); 1646 1647/* BH context may only use the following locking interface. */ 1648#define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) 1649#define bh_lock_sock_nested(__sk) \ 1650 spin_lock_nested(&((__sk)->sk_lock.slock), \ 1651 SINGLE_DEPTH_NESTING) 1652#define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) 1653 1654bool lock_sock_fast(struct sock *sk); 1655/** 1656 * unlock_sock_fast - complement of lock_sock_fast 1657 * @sk: socket 1658 * @slow: slow mode 1659 * 1660 * fast unlock socket for user context. 1661 * If slow mode is on, we call regular release_sock() 1662 */ 1663static inline void unlock_sock_fast(struct sock *sk, bool slow) 1664{ 1665 if (slow) 1666 release_sock(sk); 1667 else 1668 spin_unlock_bh(&sk->sk_lock.slock); 1669} 1670 1671/* Used by processes to "lock" a socket state, so that 1672 * interrupts and bottom half handlers won't change it 1673 * from under us. It essentially blocks any incoming 1674 * packets, so that we won't get any new data or any 1675 * packets that change the state of the socket. 1676 * 1677 * While locked, BH processing will add new packets to 1678 * the backlog queue. 
This queue is processed by the 1679 * owner of the socket lock right before it is released. 1680 * 1681 * Since ~2.3.5 it is also exclusive sleep lock serializing 1682 * accesses from user process context. 1683 */ 1684 1685static inline void sock_owned_by_me(const struct sock *sk) 1686{ 1687#ifdef CONFIG_LOCKDEP 1688 WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks); 1689#endif 1690} 1691 1692static inline void sock_not_owned_by_me(const struct sock *sk) 1693{ 1694#ifdef CONFIG_LOCKDEP 1695 WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks); 1696#endif 1697} 1698 1699static inline bool sock_owned_by_user(const struct sock *sk) 1700{ 1701 sock_owned_by_me(sk); 1702 return sk->sk_lock.owned; 1703} 1704 1705static inline bool sock_owned_by_user_nocheck(const struct sock *sk) 1706{ 1707 return sk->sk_lock.owned; 1708} 1709 1710/* no reclassification while locks are held */ 1711static inline bool sock_allow_reclassification(const struct sock *csk) 1712{ 1713 struct sock *sk = (struct sock *)csk; 1714 1715 return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock); 1716} 1717 1718struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1719 struct proto *prot, int kern); 1720void sk_free(struct sock *sk); 1721void sk_destruct(struct sock *sk); 1722struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); 1723void sk_free_unlock_clone(struct sock *sk); 1724 1725struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 1726 gfp_t priority); 1727void __sock_wfree(struct sk_buff *skb); 1728void sock_wfree(struct sk_buff *skb); 1729struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 1730 gfp_t priority); 1731void skb_orphan_partial(struct sk_buff *skb); 1732void sock_rfree(struct sk_buff *skb); 1733void sock_efree(struct sk_buff *skb); 1734#ifdef CONFIG_INET 1735void sock_edemux(struct sk_buff *skb); 1736void sock_pfree(struct sk_buff *skb); 1737#else 1738#define sock_edemux sock_efree 1739#endif 1740 1741int sock_setsockopt(struct socket *sock, int level, int op, 1742 sockptr_t optval, unsigned int optlen); 1743 1744int sock_getsockopt(struct socket *sock, int level, int op, 1745 char __user *optval, int __user *optlen); 1746int sock_gettstamp(struct socket *sock, void __user *userstamp, 1747 bool timeval, bool time32); 1748struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 1749 int noblock, int *errcode); 1750struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 1751 unsigned long data_len, int noblock, 1752 int *errcode, int max_page_order); 1753void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); 1754void sock_kfree_s(struct sock *sk, void *mem, int size); 1755void sock_kzfree_s(struct sock *sk, void *mem, int size); 1756void sk_send_sigurg(struct sock *sk); 1757 1758struct sockcm_cookie { 1759 u64 transmit_time; 1760 u32 mark; 1761 u16 tsflags; 1762}; 1763 1764static inline void sockcm_init(struct sockcm_cookie *sockc, 1765 const struct sock *sk) 1766{ 1767 *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; 1768} 1769 1770int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 1771 struct sockcm_cookie *sockc); 1772int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 1773 struct sockcm_cookie *sockc); 1774 1775/* 1776 * Functions to fill in entries in struct proto_ops when a protocol 1777 * does not implement a particular function. 
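 *
 * Illustrative sketch (hypothetical protocol, not from this file): a family
 * can point the operations it does not support at these stubs, e.g.
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		...
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.sendpage	= sock_no_sendpage,
 *	};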
1778 */ 1779int sock_no_bind(struct socket *, struct sockaddr *, int); 1780int sock_no_connect(struct socket *, struct sockaddr *, int, int); 1781int sock_no_socketpair(struct socket *, struct socket *); 1782int sock_no_accept(struct socket *, struct socket *, int, bool); 1783int sock_no_getname(struct socket *, struct sockaddr *, int); 1784int sock_no_ioctl(struct socket *, unsigned int, unsigned long); 1785int sock_no_listen(struct socket *, int); 1786int sock_no_shutdown(struct socket *, int); 1787int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); 1788int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); 1789int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); 1790int sock_no_mmap(struct file *file, struct socket *sock, 1791 struct vm_area_struct *vma); 1792ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, 1793 size_t size, int flags); 1794ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 1795 int offset, size_t size, int flags); 1796 1797/* 1798 * Functions to fill in entries in struct proto_ops when a protocol 1799 * uses the inet style. 1800 */ 1801int sock_common_getsockopt(struct socket *sock, int level, int optname, 1802 char __user *optval, int __user *optlen); 1803int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 1804 int flags); 1805int sock_common_setsockopt(struct socket *sock, int level, int optname, 1806 sockptr_t optval, unsigned int optlen); 1807 1808void sk_common_release(struct sock *sk); 1809 1810/* 1811 * Default socket callbacks and setup code 1812 */ 1813 1814/* Initialise core socket variables using an explicit uid. */ 1815void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid); 1816 1817/* Initialise core socket variables. 1818 * Assumes struct socket *sock is embedded in a struct socket_alloc. 1819 */ 1820void sock_init_data(struct socket *sock, struct sock *sk); 1821 1822/* 1823 * Socket reference counting postulates. 1824 * 1825 * * Each user of socket SHOULD hold a reference count. 1826 * * Each access point to socket (an hash table bucket, reference from a list, 1827 * running timer, skb in flight MUST hold a reference count. 1828 * * When reference count hits 0, it means it will never increase back. 1829 * * When reference count hits 0, it means that no references from 1830 * outside exist to this socket and current process on current CPU 1831 * is last user and may/should destroy this socket. 1832 * * sk_free is called from any context: process, BH, IRQ. When 1833 * it is called, socket has no references from outside -> sk_free 1834 * may release descendant resources allocated by the socket, but 1835 * to the time when it is called, socket is NOT referenced by any 1836 * hash tables, lists etc. 1837 * * Packets, delivered from outside (from network or from another process) 1838 * and enqueued on receive/error queues SHOULD NOT grab reference count, 1839 * when they sit in queue. Otherwise, packets will leak to hole, when 1840 * socket is looked up by one cpu and unhasing is made by another CPU. 1841 * It is true for udp/raw, netlink (leak to receive and error queues), tcp 1842 * (leak to backlog). Packet socket does all the processing inside 1843 * BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets 1844 * use separate SMP lock, so that they are prone too. 1845 */ 1846 1847/* Ungrab socket and destroy it, if it was the last reference. 
*/ 1848static inline void sock_put(struct sock *sk) 1849{ 1850 if (refcount_dec_and_test(&sk->sk_refcnt)) 1851 sk_free(sk); 1852} 1853/* Generic version of sock_put(), dealing with all sockets 1854 * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...) 1855 */ 1856void sock_gen_put(struct sock *sk); 1857 1858int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, 1859 unsigned int trim_cap, bool refcounted); 1860static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, 1861 const int nested) 1862{ 1863 return __sk_receive_skb(sk, skb, nested, 1, true); 1864} 1865 1866static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) 1867{ 1868 /* sk_tx_queue_mapping accept only upto a 16-bit value */ 1869 if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) 1870 return; 1871 /* Paired with READ_ONCE() in sk_tx_queue_get() and 1872 * other WRITE_ONCE() because socket lock might be not held. 1873 */ 1874 WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); 1875} 1876 1877#define NO_QUEUE_MAPPING USHRT_MAX 1878 1879static inline void sk_tx_queue_clear(struct sock *sk) 1880{ 1881 /* Paired with READ_ONCE() in sk_tx_queue_get() and 1882 * other WRITE_ONCE() because socket lock might be not held. 1883 */ 1884 WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); 1885} 1886 1887static inline int sk_tx_queue_get(const struct sock *sk) 1888{ 1889 if (sk) { 1890 /* Paired with WRITE_ONCE() in sk_tx_queue_clear() 1891 * and sk_tx_queue_set(). 1892 */ 1893 int val = READ_ONCE(sk->sk_tx_queue_mapping); 1894 1895 if (val != NO_QUEUE_MAPPING) 1896 return val; 1897 } 1898 return -1; 1899} 1900 1901static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) 1902{ 1903#ifdef CONFIG_XPS 1904 if (skb_rx_queue_recorded(skb)) { 1905 u16 rx_queue = skb_get_rx_queue(skb); 1906 1907 if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING)) 1908 return; 1909 1910 sk->sk_rx_queue_mapping = rx_queue; 1911 } 1912#endif 1913} 1914 1915static inline void sk_rx_queue_clear(struct sock *sk) 1916{ 1917#ifdef CONFIG_XPS 1918 sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING; 1919#endif 1920} 1921 1922#ifdef CONFIG_XPS 1923static inline int sk_rx_queue_get(const struct sock *sk) 1924{ 1925 if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING) 1926 return sk->sk_rx_queue_mapping; 1927 1928 return -1; 1929} 1930#endif 1931 1932static inline void sk_set_socket(struct sock *sk, struct socket *sock) 1933{ 1934 sk->sk_socket = sock; 1935} 1936 1937static inline wait_queue_head_t *sk_sleep(struct sock *sk) 1938{ 1939 BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); 1940 return &rcu_dereference_raw(sk->sk_wq)->wait; 1941} 1942/* Detach socket from process context. 1943 * Announce socket dead, detach it from wait queue and inode. 1944 * Note that parent inode held reference count on this struct sock, 1945 * we do not release it in this function, because protocol 1946 * probably wants some additional cleanups or even continuing 1947 * to work with this socket (TCP). 
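 *
 * A rough sketch of the usual shape (illustrative only, not mandated by
 * this header) is a protocol close path that orphans the socket while
 * holding the socket lock and drops the final reference once the protocol
 * is done::
 *
 *	lock_sock(sk);
 *	sock_orphan(sk);
 *	... protocol specific shutdown work ...
 *	release_sock(sk);
 *	sock_put(sk);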
1948 */ 1949static inline void sock_orphan(struct sock *sk) 1950{ 1951 write_lock_bh(&sk->sk_callback_lock); 1952 sock_set_flag(sk, SOCK_DEAD); 1953 sk_set_socket(sk, NULL); 1954 sk->sk_wq = NULL; 1955 write_unlock_bh(&sk->sk_callback_lock); 1956} 1957 1958static inline void sock_graft(struct sock *sk, struct socket *parent) 1959{ 1960 WARN_ON(parent->sk); 1961 write_lock_bh(&sk->sk_callback_lock); 1962 rcu_assign_pointer(sk->sk_wq, &parent->wq); 1963 parent->sk = sk; 1964 sk_set_socket(sk, parent); 1965 sk->sk_uid = SOCK_INODE(parent)->i_uid; 1966 security_sock_graft(sk, parent); 1967 write_unlock_bh(&sk->sk_callback_lock); 1968} 1969 1970kuid_t sock_i_uid(struct sock *sk); 1971unsigned long __sock_i_ino(struct sock *sk); 1972unsigned long sock_i_ino(struct sock *sk); 1973 1974static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) 1975{ 1976 return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); 1977} 1978 1979static inline u32 net_tx_rndhash(void) 1980{ 1981 u32 v = prandom_u32(); 1982 1983 return v ?: 1; 1984} 1985 1986static inline void sk_set_txhash(struct sock *sk) 1987{ 1988 /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ 1989 WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); 1990} 1991 1992static inline bool sk_rethink_txhash(struct sock *sk) 1993{ 1994 if (sk->sk_txhash) { 1995 sk_set_txhash(sk); 1996 return true; 1997 } 1998 return false; 1999} 2000 2001static inline struct dst_entry * 2002__sk_dst_get(struct sock *sk) 2003{ 2004 return rcu_dereference_check(sk->sk_dst_cache, 2005 lockdep_sock_is_held(sk)); 2006} 2007 2008static inline struct dst_entry * 2009sk_dst_get(struct sock *sk) 2010{ 2011 struct dst_entry *dst; 2012 2013 rcu_read_lock(); 2014 dst = rcu_dereference(sk->sk_dst_cache); 2015 if (dst && !atomic_inc_not_zero(&dst->__refcnt)) 2016 dst = NULL; 2017 rcu_read_unlock(); 2018 return dst; 2019} 2020 2021static inline void __dst_negative_advice(struct sock *sk) 2022{ 2023 struct dst_entry *dst = __sk_dst_get(sk); 2024 2025 if (dst && dst->ops->negative_advice) 2026 dst->ops->negative_advice(sk, dst); 2027} 2028 2029static inline void dst_negative_advice(struct sock *sk) 2030{ 2031 sk_rethink_txhash(sk); 2032 __dst_negative_advice(sk); 2033} 2034 2035static inline void 2036__sk_dst_set(struct sock *sk, struct dst_entry *dst) 2037{ 2038 struct dst_entry *old_dst; 2039 2040 sk_tx_queue_clear(sk); 2041 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 2042 old_dst = rcu_dereference_protected(sk->sk_dst_cache, 2043 lockdep_sock_is_held(sk)); 2044 rcu_assign_pointer(sk->sk_dst_cache, dst); 2045 dst_release(old_dst); 2046} 2047 2048static inline void 2049sk_dst_set(struct sock *sk, struct dst_entry *dst) 2050{ 2051 struct dst_entry *old_dst; 2052 2053 sk_tx_queue_clear(sk); 2054 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 2055 old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst); 2056 dst_release(old_dst); 2057} 2058 2059static inline void 2060__sk_dst_reset(struct sock *sk) 2061{ 2062 __sk_dst_set(sk, NULL); 2063} 2064 2065static inline void 2066sk_dst_reset(struct sock *sk) 2067{ 2068 sk_dst_set(sk, NULL); 2069} 2070 2071struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); 2072 2073struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); 2074 2075static inline void sk_dst_confirm(struct sock *sk) 2076{ 2077 if (!READ_ONCE(sk->sk_dst_pending_confirm)) 2078 WRITE_ONCE(sk->sk_dst_pending_confirm, 1); 2079} 2080 2081static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) 2082{ 2083 if 
(skb_get_dst_pending_confirm(skb)) { 2084 struct sock *sk = skb->sk; 2085 unsigned long now = jiffies; 2086 2087 /* avoid dirtying neighbour */ 2088 if (READ_ONCE(n->confirmed) != now) 2089 WRITE_ONCE(n->confirmed, now); 2090 if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) 2091 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 2092 } 2093} 2094 2095bool sk_mc_loop(struct sock *sk); 2096 2097static inline bool sk_can_gso(const struct sock *sk) 2098{ 2099 return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); 2100} 2101 2102void sk_setup_caps(struct sock *sk, struct dst_entry *dst); 2103 2104static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags) 2105{ 2106 sk->sk_route_nocaps |= flags; 2107 sk->sk_route_caps &= ~flags; 2108} 2109 2110static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, 2111 struct iov_iter *from, char *to, 2112 int copy, int offset) 2113{ 2114 if (skb->ip_summed == CHECKSUM_NONE) { 2115 __wsum csum = 0; 2116 if (!csum_and_copy_from_iter_full(to, copy, &csum, from)) 2117 return -EFAULT; 2118 skb->csum = csum_block_add(skb->csum, csum, offset); 2119 } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { 2120 if (!copy_from_iter_full_nocache(to, copy, from)) 2121 return -EFAULT; 2122 } else if (!copy_from_iter_full(to, copy, from)) 2123 return -EFAULT; 2124 2125 return 0; 2126} 2127 2128static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, 2129 struct iov_iter *from, int copy) 2130{ 2131 int err, offset = skb->len; 2132 2133 err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), 2134 copy, offset); 2135 if (err) 2136 __skb_trim(skb, offset); 2137 2138 return err; 2139} 2140 2141static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from, 2142 struct sk_buff *skb, 2143 struct page *page, 2144 int off, int copy) 2145{ 2146 int err; 2147 2148 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, 2149 copy, skb->len); 2150 if (err) 2151 return err; 2152 2153 skb->len += copy; 2154 skb->data_len += copy; 2155 skb->truesize += copy; 2156 sk_wmem_queued_add(sk, copy); 2157 sk_mem_charge(sk, copy); 2158 return 0; 2159} 2160 2161/** 2162 * sk_wmem_alloc_get - returns write allocations 2163 * @sk: socket 2164 * 2165 * Return: sk_wmem_alloc minus initial offset of one 2166 */ 2167static inline int sk_wmem_alloc_get(const struct sock *sk) 2168{ 2169 return refcount_read(&sk->sk_wmem_alloc) - 1; 2170} 2171 2172/** 2173 * sk_rmem_alloc_get - returns read allocations 2174 * @sk: socket 2175 * 2176 * Return: sk_rmem_alloc 2177 */ 2178static inline int sk_rmem_alloc_get(const struct sock *sk) 2179{ 2180 return atomic_read(&sk->sk_rmem_alloc); 2181} 2182 2183/** 2184 * sk_has_allocations - check if allocations are outstanding 2185 * @sk: socket 2186 * 2187 * Return: true if socket has write or read allocations 2188 */ 2189static inline bool sk_has_allocations(const struct sock *sk) 2190{ 2191 return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); 2192} 2193 2194/** 2195 * skwq_has_sleeper - check if there are any waiting processes 2196 * @wq: struct socket_wq 2197 * 2198 * Return: true if socket_wq has waiting processes 2199 * 2200 * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory 2201 * barrier call. They were added due to the race found within the tcp code. 2202 * 2203 * Consider following tcp code paths:: 2204 * 2205 * CPU1 CPU2 2206 * sys_select receive packet 2207 * ... ... 2208 * __add_wait_queue update tp->rcv_nxt 2209 * ... ... 
 *   tp->rcv_nxt check              sock_def_readable
 *   ...                            {
 *   schedule                         rcu_read_lock();
 *                                    wq = rcu_dereference(sk->sk_wq);
 *                                    if (wq && waitqueue_active(&wq->wait))
 *                                       wake_up_interruptible(&wq->wait)
 *                                    ...
 *                                 }
 *
 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
 * in its cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1
 * could then end up calling schedule() and sleeping forever if there is no
 * more data on the socket.
 *
 */
static inline bool skwq_has_sleeper(struct socket_wq *wq)
{
	return wq && wq_has_sleeper(&wq->wait);
}

/**
 * sock_poll_wait - place memory barrier behind the poll_wait call.
 * @filp:           file
 * @sock:           socket to wait on
 * @p:              poll_table
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
				  poll_table *p)
{
	if (!poll_does_not_wait(p)) {
		poll_wait(filp, &sock->wq.wait, p);
		/* We need to be sure we are in sync with the
		 * socket flags modification.
		 *
		 * This memory barrier is paired with the one in
		 * wq_has_sleeper().
		 */
		smp_mb();
	}
}

static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
	/* This pairs with WRITE_ONCE() in sk_set_txhash() */
	u32 txhash = READ_ONCE(sk->sk_txhash);

	if (txhash) {
		skb->l4_hash = 1;
		skb->hash = txhash;
	}
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);

/*
 * Queue a received datagram if it will fit. Stream and sequenced
 * protocols can't normally use this as they need to fit buffers in
 * and play with them.
 *
 * Inlined as it's very short and called for pretty much every
 * packet ever received.
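 *
 * Very roughly, a datagram receive path pairs it with the receive queue
 * like this (simplified sketch; rmem limit checks, locking and filtering
 * omitted)::
 *
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 *	sk->sk_data_ready(sk);
 *
 * sock_rfree() then undoes the memory accounting when the skb is freed.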
2272 */ 2273static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) 2274{ 2275 skb_orphan(skb); 2276 skb->sk = sk; 2277 skb->destructor = sock_rfree; 2278 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 2279 sk_mem_charge(sk, skb->truesize); 2280} 2281 2282static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) 2283{ 2284 if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { 2285 skb_orphan(skb); 2286 skb->destructor = sock_efree; 2287 skb->sk = sk; 2288 return true; 2289 } 2290 return false; 2291} 2292 2293static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) 2294{ 2295 skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); 2296 if (skb) { 2297 if (sk_rmem_schedule(sk, skb, skb->truesize)) { 2298 skb_set_owner_r(skb, sk); 2299 return skb; 2300 } 2301 __kfree_skb(skb); 2302 } 2303 return NULL; 2304} 2305 2306void sk_reset_timer(struct sock *sk, struct timer_list *timer, 2307 unsigned long expires); 2308 2309void sk_stop_timer(struct sock *sk, struct timer_list *timer); 2310 2311void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); 2312 2313int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, 2314 struct sk_buff *skb, unsigned int flags, 2315 void (*destructor)(struct sock *sk, 2316 struct sk_buff *skb)); 2317int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 2318int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 2319 2320int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); 2321struct sk_buff *sock_dequeue_err_skb(struct sock *sk); 2322 2323/* 2324 * Recover an error report and clear atomically 2325 */ 2326 2327static inline int sock_error(struct sock *sk) 2328{ 2329 int err; 2330 2331 /* Avoid an atomic operation for the common case. 2332 * This is racy since another cpu/thread can change sk_err under us. 2333 */ 2334 if (likely(data_race(!sk->sk_err))) 2335 return 0; 2336 2337 err = xchg(&sk->sk_err, 0); 2338 return -err; 2339} 2340 2341static inline unsigned long sock_wspace(struct sock *sk) 2342{ 2343 int amt = 0; 2344 2345 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 2346 amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc); 2347 if (amt < 0) 2348 amt = 0; 2349 } 2350 return amt; 2351} 2352 2353/* Note: 2354 * We use sk->sk_wq_raw, from contexts knowing this 2355 * pointer is not NULL and cannot disappear/change. 2356 */ 2357static inline void sk_set_bit(int nr, struct sock *sk) 2358{ 2359 if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && 2360 !sock_flag(sk, SOCK_FASYNC)) 2361 return; 2362 2363 set_bit(nr, &sk->sk_wq_raw->flags); 2364} 2365 2366static inline void sk_clear_bit(int nr, struct sock *sk) 2367{ 2368 if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && 2369 !sock_flag(sk, SOCK_FASYNC)) 2370 return; 2371 2372 clear_bit(nr, &sk->sk_wq_raw->flags); 2373} 2374 2375static inline void sk_wake_async(const struct sock *sk, int how, int band) 2376{ 2377 if (sock_flag(sk, SOCK_FASYNC)) { 2378 rcu_read_lock(); 2379 sock_wake_async(rcu_dereference(sk->sk_wq), how, band); 2380 rcu_read_unlock(); 2381 } 2382} 2383 2384/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might 2385 * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. 2386 * Note: for send buffers, TCP works better if we can build two skbs at 2387 * minimum. 
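 *
 * For a rough feel of the numbers: TCP_SKB_MIN_TRUESIZE below budgets one
 * skb as 2048 bytes of data/headroom plus the cache-line-aligned
 * struct sk_buff, so SOCK_MIN_SNDBUF reserves room for two such skbs and
 * SOCK_MIN_RCVBUF for one.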
 */
#define TCP_SKB_MIN_TRUESIZE	(2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff)))

#define SOCK_MIN_SNDBUF		(TCP_SKB_MIN_TRUESIZE * 2)
#define SOCK_MIN_RCVBUF		 TCP_SKB_MIN_TRUESIZE

static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
	u32 val;

	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
		return;

	val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);

	WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
}

struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
				    bool force_schedule);

/**
 * sk_page_frag - return an appropriate page_frag
 * @sk: socket
 *
 * Use the per task page_frag instead of the per socket one for
 * optimization when we know that we're in process context and own
 * everything that's associated with %current.
 *
 * Both direct reclaim and page faults can nest inside other
 * socket operations and end up recursing into sk_page_frag()
 * while it's already in use: explicitly avoid task page_frag
 * usage if the caller is potentially doing any of them.
 * This assumes that page fault handlers use the GFP_NOFS flags.
 *
 * Return: a per task page_frag if context allows that,
 * otherwise a per socket one.
 */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
	if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
	    (__GFP_DIRECT_RECLAIM | __GFP_FS))
		return &current->task_frag;

	return &sk->sk_frag;
}

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);

/*
 *	Default write policy as shown to user space via poll/select/SIGIO
 */
static inline bool sock_writeable(const struct sock *sk)
{
	return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
}

static inline gfp_t gfp_any(void)
{
	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
}

static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
	return noblock ? 0 : sk->sk_rcvtimeo;
}

static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
	return noblock ? 0 : sk->sk_sndtimeo;
}

static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
	int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len);

	return v ?: 1;
}

/* Alas, with timeout socket operations are not restartable.
 * Compare this to poll().
 */
static inline int sock_intr_errno(long timeo)
{
	return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
}

struct sock_skb_cb {
	u32 dropcount;
};

/* Store sock_skb_cb at the end of skb->cb[] so protocol families
 * using skb->cb[] would keep using it directly and utilize its
 * alignment guarantee.
 */
#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
			    sizeof(struct sock_skb_cb)))

#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
			    SOCK_SKB_CB_OFFSET))

#define sock_skb_cb_check_size(size) \
	BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)

static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
	SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
2496 atomic_read(&sk->sk_drops) : 0; 2497} 2498 2499static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) 2500{ 2501 int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); 2502 2503 atomic_add(segs, &sk->sk_drops); 2504} 2505 2506static inline ktime_t sock_read_timestamp(struct sock *sk) 2507{ 2508#if BITS_PER_LONG==32 2509 unsigned int seq; 2510 ktime_t kt; 2511 2512 do { 2513 seq = read_seqbegin(&sk->sk_stamp_seq); 2514 kt = sk->sk_stamp; 2515 } while (read_seqretry(&sk->sk_stamp_seq, seq)); 2516 2517 return kt; 2518#else 2519 return READ_ONCE(sk->sk_stamp); 2520#endif 2521} 2522 2523static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) 2524{ 2525#if BITS_PER_LONG==32 2526 write_seqlock(&sk->sk_stamp_seq); 2527 sk->sk_stamp = kt; 2528 write_sequnlock(&sk->sk_stamp_seq); 2529#else 2530 WRITE_ONCE(sk->sk_stamp, kt); 2531#endif 2532} 2533 2534void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, 2535 struct sk_buff *skb); 2536void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, 2537 struct sk_buff *skb); 2538 2539static inline void 2540sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) 2541{ 2542 ktime_t kt = skb->tstamp; 2543 struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); 2544 2545 /* 2546 * generate control messages if 2547 * - receive time stamping in software requested 2548 * - software time stamp available and wanted 2549 * - hardware time stamps available and wanted 2550 */ 2551 if (sock_flag(sk, SOCK_RCVTSTAMP) || 2552 (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || 2553 (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) || 2554 (hwtstamps->hwtstamp && 2555 (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) 2556 __sock_recv_timestamp(msg, sk, skb); 2557 else 2558 sock_write_timestamp(sk, kt); 2559 2560 if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid) 2561 __sock_recv_wifi_status(msg, sk, skb); 2562} 2563 2564void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, 2565 struct sk_buff *skb); 2566 2567#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) 2568static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, 2569 struct sk_buff *skb) 2570{ 2571#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ 2572 (1UL << SOCK_RCVTSTAMP)) 2573#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ 2574 SOF_TIMESTAMPING_RAW_HARDWARE) 2575 2576 if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) 2577 __sock_recv_ts_and_drops(msg, sk, skb); 2578 else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) 2579 sock_write_timestamp(sk, skb->tstamp); 2580 else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP)) 2581 sock_write_timestamp(sk, 0); 2582} 2583 2584void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); 2585 2586/** 2587 * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped 2588 * @sk: socket sending this packet 2589 * @tsflags: timestamping flags to use 2590 * @tx_flags: completed with instructions for time stamping 2591 * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) 2592 * 2593 * Note: callers should take care of initial ``*tx_flags`` value (usually 0) 2594 */ 2595static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, 2596 __u8 *tx_flags, __u32 *tskey) 2597{ 2598 if (unlikely(tsflags)) { 2599 __sock_tx_timestamp(tsflags, tx_flags); 2600 if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && 2601 tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) 2602 *tskey = sk->sk_tskey++; 2603 } 2604 
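	/* Independent of any timestamping request above: if the socket asked
	 * for wifi TX status reporting (SOCK_WIFI_STATUS), flag the skb so
	 * the driver reports the ack status back.
	 */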
if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) 2605 *tx_flags |= SKBTX_WIFI_STATUS; 2606} 2607 2608static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, 2609 __u8 *tx_flags) 2610{ 2611 _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); 2612} 2613 2614static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) 2615{ 2616 _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, 2617 &skb_shinfo(skb)->tskey); 2618} 2619 2620DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); 2621/** 2622 * sk_eat_skb - Release a skb if it is no longer needed 2623 * @sk: socket to eat this skb from 2624 * @skb: socket buffer to eat 2625 * 2626 * This routine must be called with interrupts disabled or with the socket 2627 * locked so that the sk_buff queue operation is ok. 2628*/ 2629static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) 2630{ 2631 __skb_unlink(skb, &sk->sk_receive_queue); 2632 if (static_branch_unlikely(&tcp_rx_skb_cache_key) && 2633 !sk->sk_rx_skb_cache) { 2634 sk->sk_rx_skb_cache = skb; 2635 skb_orphan(skb); 2636 return; 2637 } 2638 __kfree_skb(skb); 2639} 2640 2641static inline 2642struct net *sock_net(const struct sock *sk) 2643{ 2644 return read_pnet(&sk->sk_net); 2645} 2646 2647static inline 2648void sock_net_set(struct sock *sk, struct net *net) 2649{ 2650 write_pnet(&sk->sk_net, net); 2651} 2652 2653static inline bool 2654skb_sk_is_prefetched(struct sk_buff *skb) 2655{ 2656#ifdef CONFIG_INET 2657 return skb->destructor == sock_pfree; 2658#else 2659 return false; 2660#endif /* CONFIG_INET */ 2661} 2662 2663/* This helper checks if a socket is a full socket, 2664 * ie _not_ a timewait or request socket. 2665 */ 2666static inline bool sk_fullsock(const struct sock *sk) 2667{ 2668 return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); 2669} 2670 2671static inline bool 2672sk_is_refcounted(struct sock *sk) 2673{ 2674 /* Only full sockets have sk->sk_flags. */ 2675 return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); 2676} 2677 2678/** 2679 * skb_steal_sock - steal a socket from an sk_buff 2680 * @skb: sk_buff to steal the socket from 2681 * @refcounted: is set to true if the socket is reference-counted 2682 */ 2683static inline struct sock * 2684skb_steal_sock(struct sk_buff *skb, bool *refcounted) 2685{ 2686 if (skb->sk) { 2687 struct sock *sk = skb->sk; 2688 2689 *refcounted = true; 2690 if (skb_sk_is_prefetched(skb)) 2691 *refcounted = sk_is_refcounted(sk); 2692 skb->destructor = NULL; 2693 skb->sk = NULL; 2694 return sk; 2695 } 2696 *refcounted = false; 2697 return NULL; 2698} 2699 2700/* Checks if this SKB belongs to an HW offloaded socket 2701 * and whether any SW fallbacks are required based on dev. 2702 * Check decrypted mark in case skb_orphan() cleared socket. 
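 *
 * This runs on the transmit path right before the skb is handed to the
 * device (the core calls it from validate_xmit_skb()), giving an offloading
 * socket (e.g. TLS device offload) the chance to substitute a software
 * fallback skb or to drop the packet by returning NULL.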
2703 */ 2704static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, 2705 struct net_device *dev) 2706{ 2707#ifdef CONFIG_SOCK_VALIDATE_XMIT 2708 struct sock *sk = skb->sk; 2709 2710 if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { 2711 skb = sk->sk_validate_xmit_skb(sk, dev, skb); 2712#ifdef CONFIG_TLS_DEVICE 2713 } else if (unlikely(skb->decrypted)) { 2714 pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); 2715 kfree_skb(skb); 2716 skb = NULL; 2717#endif 2718 } 2719#endif 2720 2721 return skb; 2722} 2723 2724/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV 2725 * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) 2726 */ 2727static inline bool sk_listener(const struct sock *sk) 2728{ 2729 return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); 2730} 2731 2732void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); 2733int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, 2734 int type); 2735 2736bool sk_ns_capable(const struct sock *sk, 2737 struct user_namespace *user_ns, int cap); 2738bool sk_capable(const struct sock *sk, int cap); 2739bool sk_net_capable(const struct sock *sk, int cap); 2740 2741void sk_get_meminfo(const struct sock *sk, u32 *meminfo); 2742 2743/* Take into consideration the size of the struct sk_buff overhead in the 2744 * determination of these values, since that is non-constant across 2745 * platforms. This makes socket queueing behavior and performance 2746 * not depend upon such differences. 2747 */ 2748#define _SK_MEM_PACKETS 256 2749#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) 2750#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) 2751#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) 2752 2753extern __u32 sysctl_wmem_max; 2754extern __u32 sysctl_rmem_max; 2755 2756extern int sysctl_tstamp_allow_data; 2757extern int sysctl_optmem_max; 2758 2759extern __u32 sysctl_wmem_default; 2760extern __u32 sysctl_rmem_default; 2761 2762#define SKB_FRAG_PAGE_ORDER get_order(32768) 2763DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2764 2765static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) 2766{ 2767 /* Does this proto have per netns sysctl_wmem ? */ 2768 if (proto->sysctl_wmem_offset) 2769 return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); 2770 2771 return READ_ONCE(*proto->sysctl_wmem); 2772} 2773 2774static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) 2775{ 2776 /* Does this proto have per netns sysctl_rmem ? */ 2777 if (proto->sysctl_rmem_offset) 2778 return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); 2779 2780 return READ_ONCE(*proto->sysctl_rmem); 2781} 2782 2783/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) 2784 * Some wifi drivers need to tweak it to get more chunks. 
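 * (a shift of 7, for instance, would budget about 1 sec >> 7, i.e. roughly
 * 8ms of data; the value is purely illustrative).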
2785 * They can use this helper from their ndo_start_xmit() 2786 */ 2787static inline void sk_pacing_shift_update(struct sock *sk, int val) 2788{ 2789 if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) 2790 return; 2791 WRITE_ONCE(sk->sk_pacing_shift, val); 2792} 2793 2794/* if a socket is bound to a device, check that the given device 2795 * index is either the same or that the socket is bound to an L3 2796 * master device and the given device index is also enslaved to 2797 * that L3 master 2798 */ 2799static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) 2800{ 2801 int mdif; 2802 2803 if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif) 2804 return true; 2805 2806 mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); 2807 if (mdif && mdif == sk->sk_bound_dev_if) 2808 return true; 2809 2810 return false; 2811} 2812 2813void sock_def_readable(struct sock *sk); 2814 2815int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); 2816void sock_enable_timestamps(struct sock *sk); 2817void sock_no_linger(struct sock *sk); 2818void sock_set_keepalive(struct sock *sk); 2819void sock_set_priority(struct sock *sk, u32 priority); 2820void sock_set_rcvbuf(struct sock *sk, int val); 2821void sock_set_mark(struct sock *sk, u32 val); 2822void sock_set_reuseaddr(struct sock *sk); 2823void sock_set_reuseport(struct sock *sk); 2824void sock_set_sndtimeo(struct sock *sk, s64 secs); 2825 2826int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); 2827 2828#endif /* _SOCK_H */ 2829