18c2ecf20Sopenharmony_ci/* 28c2ecf20Sopenharmony_ci * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * This software is available to you under a choice of one of two 58c2ecf20Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 68c2ecf20Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 78c2ecf20Sopenharmony_ci * COPYING in the main directory of this source tree, or the 88c2ecf20Sopenharmony_ci * OpenIB.org BSD license below: 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * Redistribution and use in source and binary forms, with or 118c2ecf20Sopenharmony_ci * without modification, are permitted provided that the following 128c2ecf20Sopenharmony_ci * conditions are met: 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * - Redistributions of source code must retain the above 158c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 168c2ecf20Sopenharmony_ci * disclaimer. 178c2ecf20Sopenharmony_ci * 188c2ecf20Sopenharmony_ci * - Redistributions in binary form must reproduce the above 198c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 208c2ecf20Sopenharmony_ci * disclaimer in the documentation and/or other materials 218c2ecf20Sopenharmony_ci * provided with the distribution. 228c2ecf20Sopenharmony_ci * 238c2ecf20Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 248c2ecf20Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 258c2ecf20Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 268c2ecf20Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 278c2ecf20Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 288c2ecf20Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 298c2ecf20Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 308c2ecf20Sopenharmony_ci * SOFTWARE. 318c2ecf20Sopenharmony_ci * 328c2ecf20Sopenharmony_ci */ 338c2ecf20Sopenharmony_ci#include <linux/kernel.h> 348c2ecf20Sopenharmony_ci#include <linux/slab.h> 358c2ecf20Sopenharmony_ci#include <net/sock.h> 368c2ecf20Sopenharmony_ci#include <linux/in.h> 378c2ecf20Sopenharmony_ci#include <linux/export.h> 388c2ecf20Sopenharmony_ci#include <linux/time.h> 398c2ecf20Sopenharmony_ci#include <linux/rds.h> 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_ci#include "rds.h" 428c2ecf20Sopenharmony_ci 438c2ecf20Sopenharmony_civoid rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 448c2ecf20Sopenharmony_ci struct in6_addr *saddr) 458c2ecf20Sopenharmony_ci{ 468c2ecf20Sopenharmony_ci refcount_set(&inc->i_refcount, 1); 478c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&inc->i_item); 488c2ecf20Sopenharmony_ci inc->i_conn = conn; 498c2ecf20Sopenharmony_ci inc->i_saddr = *saddr; 508c2ecf20Sopenharmony_ci inc->i_usercopy.rdma_cookie = 0; 518c2ecf20Sopenharmony_ci inc->i_usercopy.rx_tstamp = ktime_set(0, 0); 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace)); 548c2ecf20Sopenharmony_ci} 558c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_inc_init); 568c2ecf20Sopenharmony_ci 578c2ecf20Sopenharmony_civoid rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, 588c2ecf20Sopenharmony_ci struct in6_addr *saddr) 598c2ecf20Sopenharmony_ci{ 608c2ecf20Sopenharmony_ci refcount_set(&inc->i_refcount, 1); 618c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&inc->i_item); 628c2ecf20Sopenharmony_ci inc->i_conn = cp->cp_conn; 638c2ecf20Sopenharmony_ci inc->i_conn_path = cp; 648c2ecf20Sopenharmony_ci inc->i_saddr = *saddr; 658c2ecf20Sopenharmony_ci inc->i_usercopy.rdma_cookie = 0; 668c2ecf20Sopenharmony_ci inc->i_usercopy.rx_tstamp = ktime_set(0, 0); 678c2ecf20Sopenharmony_ci} 688c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_inc_path_init); 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_cistatic void rds_inc_addref(struct rds_incoming *inc) 718c2ecf20Sopenharmony_ci{ 728c2ecf20Sopenharmony_ci rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount)); 738c2ecf20Sopenharmony_ci refcount_inc(&inc->i_refcount); 748c2ecf20Sopenharmony_ci} 758c2ecf20Sopenharmony_ci 768c2ecf20Sopenharmony_civoid rds_inc_put(struct rds_incoming *inc) 778c2ecf20Sopenharmony_ci{ 788c2ecf20Sopenharmony_ci rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount)); 798c2ecf20Sopenharmony_ci if (refcount_dec_and_test(&inc->i_refcount)) { 808c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&inc->i_item)); 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_ci inc->i_conn->c_trans->inc_free(inc); 838c2ecf20Sopenharmony_ci } 848c2ecf20Sopenharmony_ci} 858c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_inc_put); 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_cistatic void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, 888c2ecf20Sopenharmony_ci struct rds_cong_map *map, 898c2ecf20Sopenharmony_ci int delta, __be16 port) 908c2ecf20Sopenharmony_ci{ 918c2ecf20Sopenharmony_ci int now_congested; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci if (delta == 0) 948c2ecf20Sopenharmony_ci return; 958c2ecf20Sopenharmony_ci 968c2ecf20Sopenharmony_ci rs->rs_rcv_bytes += delta; 978c2ecf20Sopenharmony_ci if (delta > 0) 988c2ecf20Sopenharmony_ci rds_stats_add(s_recv_bytes_added_to_socket, delta); 998c2ecf20Sopenharmony_ci else 1008c2ecf20Sopenharmony_ci rds_stats_add(s_recv_bytes_removed_from_socket, -delta); 1018c2ecf20Sopenharmony_ci 1028c2ecf20Sopenharmony_ci /* loop transport doesn't send/recv congestion updates */ 1038c2ecf20Sopenharmony_ci if (rs->rs_transport->t_type == RDS_TRANS_LOOP) 1048c2ecf20Sopenharmony_ci return; 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_ci rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d " 1098c2ecf20Sopenharmony_ci "now_cong %d delta %d\n", 1108c2ecf20Sopenharmony_ci rs, &rs->rs_bound_addr, 1118c2ecf20Sopenharmony_ci ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, 1128c2ecf20Sopenharmony_ci rds_sk_rcvbuf(rs), now_congested, delta); 1138c2ecf20Sopenharmony_ci 1148c2ecf20Sopenharmony_ci /* wasn't -> am congested */ 1158c2ecf20Sopenharmony_ci if (!rs->rs_congested && now_congested) { 1168c2ecf20Sopenharmony_ci rs->rs_congested = 1; 1178c2ecf20Sopenharmony_ci rds_cong_set_bit(map, port); 1188c2ecf20Sopenharmony_ci rds_cong_queue_updates(map); 1198c2ecf20Sopenharmony_ci } 1208c2ecf20Sopenharmony_ci /* was -> aren't congested */ 1218c2ecf20Sopenharmony_ci /* Require more free space before reporting uncongested to prevent 1228c2ecf20Sopenharmony_ci bouncing cong/uncong state too often */ 1238c2ecf20Sopenharmony_ci else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) { 1248c2ecf20Sopenharmony_ci rs->rs_congested = 0; 1258c2ecf20Sopenharmony_ci rds_cong_clear_bit(map, port); 1268c2ecf20Sopenharmony_ci rds_cong_queue_updates(map); 1278c2ecf20Sopenharmony_ci } 1288c2ecf20Sopenharmony_ci 1298c2ecf20Sopenharmony_ci /* do nothing if no change in cong state */ 1308c2ecf20Sopenharmony_ci} 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_cistatic void rds_conn_peer_gen_update(struct rds_connection *conn, 1338c2ecf20Sopenharmony_ci u32 peer_gen_num) 1348c2ecf20Sopenharmony_ci{ 1358c2ecf20Sopenharmony_ci int i; 1368c2ecf20Sopenharmony_ci struct rds_message *rm, *tmp; 1378c2ecf20Sopenharmony_ci unsigned long flags; 1388c2ecf20Sopenharmony_ci 1398c2ecf20Sopenharmony_ci WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP); 1408c2ecf20Sopenharmony_ci if (peer_gen_num != 0) { 1418c2ecf20Sopenharmony_ci if (conn->c_peer_gen_num != 0 && 1428c2ecf20Sopenharmony_ci peer_gen_num != conn->c_peer_gen_num) { 1438c2ecf20Sopenharmony_ci for (i = 0; i < RDS_MPATH_WORKERS; i++) { 1448c2ecf20Sopenharmony_ci struct rds_conn_path *cp; 1458c2ecf20Sopenharmony_ci 1468c2ecf20Sopenharmony_ci cp = &conn->c_path[i]; 1478c2ecf20Sopenharmony_ci spin_lock_irqsave(&cp->cp_lock, flags); 1488c2ecf20Sopenharmony_ci cp->cp_next_tx_seq = 1; 1498c2ecf20Sopenharmony_ci cp->cp_next_rx_seq = 0; 1508c2ecf20Sopenharmony_ci list_for_each_entry_safe(rm, tmp, 1518c2ecf20Sopenharmony_ci &cp->cp_retrans, 1528c2ecf20Sopenharmony_ci m_conn_item) { 1538c2ecf20Sopenharmony_ci set_bit(RDS_MSG_FLUSH, &rm->m_flags); 1548c2ecf20Sopenharmony_ci } 1558c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&cp->cp_lock, flags); 1568c2ecf20Sopenharmony_ci } 1578c2ecf20Sopenharmony_ci } 1588c2ecf20Sopenharmony_ci conn->c_peer_gen_num = peer_gen_num; 1598c2ecf20Sopenharmony_ci } 1608c2ecf20Sopenharmony_ci} 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_ci/* 1638c2ecf20Sopenharmony_ci * Process all extension headers that come with this message. 1648c2ecf20Sopenharmony_ci */ 1658c2ecf20Sopenharmony_cistatic void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs) 1668c2ecf20Sopenharmony_ci{ 1678c2ecf20Sopenharmony_ci struct rds_header *hdr = &inc->i_hdr; 1688c2ecf20Sopenharmony_ci unsigned int pos = 0, type, len; 1698c2ecf20Sopenharmony_ci union { 1708c2ecf20Sopenharmony_ci struct rds_ext_header_version version; 1718c2ecf20Sopenharmony_ci struct rds_ext_header_rdma rdma; 1728c2ecf20Sopenharmony_ci struct rds_ext_header_rdma_dest rdma_dest; 1738c2ecf20Sopenharmony_ci } buffer; 1748c2ecf20Sopenharmony_ci 1758c2ecf20Sopenharmony_ci while (1) { 1768c2ecf20Sopenharmony_ci len = sizeof(buffer); 1778c2ecf20Sopenharmony_ci type = rds_message_next_extension(hdr, &pos, &buffer, &len); 1788c2ecf20Sopenharmony_ci if (type == RDS_EXTHDR_NONE) 1798c2ecf20Sopenharmony_ci break; 1808c2ecf20Sopenharmony_ci /* Process extension header here */ 1818c2ecf20Sopenharmony_ci switch (type) { 1828c2ecf20Sopenharmony_ci case RDS_EXTHDR_RDMA: 1838c2ecf20Sopenharmony_ci rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0); 1848c2ecf20Sopenharmony_ci break; 1858c2ecf20Sopenharmony_ci 1868c2ecf20Sopenharmony_ci case RDS_EXTHDR_RDMA_DEST: 1878c2ecf20Sopenharmony_ci /* We ignore the size for now. We could stash it 1888c2ecf20Sopenharmony_ci * somewhere and use it for error checking. */ 1898c2ecf20Sopenharmony_ci inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie( 1908c2ecf20Sopenharmony_ci be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), 1918c2ecf20Sopenharmony_ci be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_ci break; 1948c2ecf20Sopenharmony_ci } 1958c2ecf20Sopenharmony_ci } 1968c2ecf20Sopenharmony_ci} 1978c2ecf20Sopenharmony_ci 1988c2ecf20Sopenharmony_cistatic void rds_recv_hs_exthdrs(struct rds_header *hdr, 1998c2ecf20Sopenharmony_ci struct rds_connection *conn) 2008c2ecf20Sopenharmony_ci{ 2018c2ecf20Sopenharmony_ci unsigned int pos = 0, type, len; 2028c2ecf20Sopenharmony_ci union { 2038c2ecf20Sopenharmony_ci struct rds_ext_header_version version; 2048c2ecf20Sopenharmony_ci u16 rds_npaths; 2058c2ecf20Sopenharmony_ci u32 rds_gen_num; 2068c2ecf20Sopenharmony_ci } buffer; 2078c2ecf20Sopenharmony_ci u32 new_peer_gen_num = 0; 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_ci while (1) { 2108c2ecf20Sopenharmony_ci len = sizeof(buffer); 2118c2ecf20Sopenharmony_ci type = rds_message_next_extension(hdr, &pos, &buffer, &len); 2128c2ecf20Sopenharmony_ci if (type == RDS_EXTHDR_NONE) 2138c2ecf20Sopenharmony_ci break; 2148c2ecf20Sopenharmony_ci /* Process extension header here */ 2158c2ecf20Sopenharmony_ci switch (type) { 2168c2ecf20Sopenharmony_ci case RDS_EXTHDR_NPATHS: 2178c2ecf20Sopenharmony_ci conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, 2188c2ecf20Sopenharmony_ci be16_to_cpu(buffer.rds_npaths)); 2198c2ecf20Sopenharmony_ci break; 2208c2ecf20Sopenharmony_ci case RDS_EXTHDR_GEN_NUM: 2218c2ecf20Sopenharmony_ci new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num); 2228c2ecf20Sopenharmony_ci break; 2238c2ecf20Sopenharmony_ci default: 2248c2ecf20Sopenharmony_ci pr_warn_ratelimited("ignoring unknown exthdr type " 2258c2ecf20Sopenharmony_ci "0x%x\n", type); 2268c2ecf20Sopenharmony_ci } 2278c2ecf20Sopenharmony_ci } 2288c2ecf20Sopenharmony_ci /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ 2298c2ecf20Sopenharmony_ci conn->c_npaths = max_t(int, conn->c_npaths, 1); 2308c2ecf20Sopenharmony_ci conn->c_ping_triggered = 0; 2318c2ecf20Sopenharmony_ci rds_conn_peer_gen_update(conn, new_peer_gen_num); 2328c2ecf20Sopenharmony_ci} 2338c2ecf20Sopenharmony_ci 2348c2ecf20Sopenharmony_ci/* rds_start_mprds() will synchronously start multiple paths when appropriate. 2358c2ecf20Sopenharmony_ci * The scheme is based on the following rules: 2368c2ecf20Sopenharmony_ci * 2378c2ecf20Sopenharmony_ci * 1. rds_sendmsg on first connect attempt sends the probe ping, with the 2388c2ecf20Sopenharmony_ci * sender's npaths (s_npaths) 2398c2ecf20Sopenharmony_ci * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It 2408c2ecf20Sopenharmony_ci * sends back a probe-pong with r_npaths. After that, if rcvr is the 2418c2ecf20Sopenharmony_ci * smaller ip addr, it starts rds_conn_path_connect_if_down on all 2428c2ecf20Sopenharmony_ci * mprds_paths. 2438c2ecf20Sopenharmony_ci * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down. 2448c2ecf20Sopenharmony_ci * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be 2458c2ecf20Sopenharmony_ci * called after reception of the probe-pong on all mprds_paths. 2468c2ecf20Sopenharmony_ci * Otherwise (sender of probe-ping is not the smaller ip addr): just call 2478c2ecf20Sopenharmony_ci * rds_conn_path_connect_if_down on the hashed path. (see rule 4) 2488c2ecf20Sopenharmony_ci * 4. rds_connect_worker must only trigger a connection if laddr < faddr. 2498c2ecf20Sopenharmony_ci * 5. sender may end up queuing the packet on the cp. will get sent out later. 2508c2ecf20Sopenharmony_ci * when connection is completed. 2518c2ecf20Sopenharmony_ci */ 2528c2ecf20Sopenharmony_cistatic void rds_start_mprds(struct rds_connection *conn) 2538c2ecf20Sopenharmony_ci{ 2548c2ecf20Sopenharmony_ci int i; 2558c2ecf20Sopenharmony_ci struct rds_conn_path *cp; 2568c2ecf20Sopenharmony_ci 2578c2ecf20Sopenharmony_ci if (conn->c_npaths > 1 && 2588c2ecf20Sopenharmony_ci rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) { 2598c2ecf20Sopenharmony_ci for (i = 0; i < conn->c_npaths; i++) { 2608c2ecf20Sopenharmony_ci cp = &conn->c_path[i]; 2618c2ecf20Sopenharmony_ci rds_conn_path_connect_if_down(cp); 2628c2ecf20Sopenharmony_ci } 2638c2ecf20Sopenharmony_ci } 2648c2ecf20Sopenharmony_ci} 2658c2ecf20Sopenharmony_ci 2668c2ecf20Sopenharmony_ci/* 2678c2ecf20Sopenharmony_ci * The transport must make sure that this is serialized against other 2688c2ecf20Sopenharmony_ci * rx and conn reset on this specific conn. 2698c2ecf20Sopenharmony_ci * 2708c2ecf20Sopenharmony_ci * We currently assert that only one fragmented message will be sent 2718c2ecf20Sopenharmony_ci * down a connection at a time. This lets us reassemble in the conn 2728c2ecf20Sopenharmony_ci * instead of per-flow which means that we don't have to go digging through 2738c2ecf20Sopenharmony_ci * flows to tear down partial reassembly progress on conn failure and 2748c2ecf20Sopenharmony_ci * we save flow lookup and locking for each frag arrival. It does mean 2758c2ecf20Sopenharmony_ci * that small messages will wait behind large ones. Fragmenting at all 2768c2ecf20Sopenharmony_ci * is only to reduce the memory consumption of pre-posted buffers. 2778c2ecf20Sopenharmony_ci * 2788c2ecf20Sopenharmony_ci * The caller passes in saddr and daddr instead of us getting it from the 2798c2ecf20Sopenharmony_ci * conn. This lets loopback, who only has one conn for both directions, 2808c2ecf20Sopenharmony_ci * tell us which roles the addrs in the conn are playing for this message. 2818c2ecf20Sopenharmony_ci */ 2828c2ecf20Sopenharmony_civoid rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, 2838c2ecf20Sopenharmony_ci struct in6_addr *daddr, 2848c2ecf20Sopenharmony_ci struct rds_incoming *inc, gfp_t gfp) 2858c2ecf20Sopenharmony_ci{ 2868c2ecf20Sopenharmony_ci struct rds_sock *rs = NULL; 2878c2ecf20Sopenharmony_ci struct sock *sk; 2888c2ecf20Sopenharmony_ci unsigned long flags; 2898c2ecf20Sopenharmony_ci struct rds_conn_path *cp; 2908c2ecf20Sopenharmony_ci 2918c2ecf20Sopenharmony_ci inc->i_conn = conn; 2928c2ecf20Sopenharmony_ci inc->i_rx_jiffies = jiffies; 2938c2ecf20Sopenharmony_ci if (conn->c_trans->t_mp_capable) 2948c2ecf20Sopenharmony_ci cp = inc->i_conn_path; 2958c2ecf20Sopenharmony_ci else 2968c2ecf20Sopenharmony_ci cp = &conn->c_path[0]; 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_ci rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " 2998c2ecf20Sopenharmony_ci "flags 0x%x rx_jiffies %lu\n", conn, 3008c2ecf20Sopenharmony_ci (unsigned long long)cp->cp_next_rx_seq, 3018c2ecf20Sopenharmony_ci inc, 3028c2ecf20Sopenharmony_ci (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), 3038c2ecf20Sopenharmony_ci be32_to_cpu(inc->i_hdr.h_len), 3048c2ecf20Sopenharmony_ci be16_to_cpu(inc->i_hdr.h_sport), 3058c2ecf20Sopenharmony_ci be16_to_cpu(inc->i_hdr.h_dport), 3068c2ecf20Sopenharmony_ci inc->i_hdr.h_flags, 3078c2ecf20Sopenharmony_ci inc->i_rx_jiffies); 3088c2ecf20Sopenharmony_ci 3098c2ecf20Sopenharmony_ci /* 3108c2ecf20Sopenharmony_ci * Sequence numbers should only increase. Messages get their 3118c2ecf20Sopenharmony_ci * sequence number as they're queued in a sending conn. They 3128c2ecf20Sopenharmony_ci * can be dropped, though, if the sending socket is closed before 3138c2ecf20Sopenharmony_ci * they hit the wire. So sequence numbers can skip forward 3148c2ecf20Sopenharmony_ci * under normal operation. They can also drop back in the conn 3158c2ecf20Sopenharmony_ci * failover case as previously sent messages are resent down the 3168c2ecf20Sopenharmony_ci * new instance of a conn. We drop those, otherwise we have 3178c2ecf20Sopenharmony_ci * to assume that the next valid seq does not come after a 3188c2ecf20Sopenharmony_ci * hole in the fragment stream. 3198c2ecf20Sopenharmony_ci * 3208c2ecf20Sopenharmony_ci * The headers don't give us a way to realize if fragments of 3218c2ecf20Sopenharmony_ci * a message have been dropped. We assume that frags that arrive 3228c2ecf20Sopenharmony_ci * to a flow are part of the current message on the flow that is 3238c2ecf20Sopenharmony_ci * being reassembled. This means that senders can't drop messages 3248c2ecf20Sopenharmony_ci * from the sending conn until all their frags are sent. 3258c2ecf20Sopenharmony_ci * 3268c2ecf20Sopenharmony_ci * XXX we could spend more on the wire to get more robust failure 3278c2ecf20Sopenharmony_ci * detection, arguably worth it to avoid data corruption. 3288c2ecf20Sopenharmony_ci */ 3298c2ecf20Sopenharmony_ci if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq && 3308c2ecf20Sopenharmony_ci (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { 3318c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_drop_old_seq); 3328c2ecf20Sopenharmony_ci goto out; 3338c2ecf20Sopenharmony_ci } 3348c2ecf20Sopenharmony_ci cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; 3358c2ecf20Sopenharmony_ci 3368c2ecf20Sopenharmony_ci if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { 3378c2ecf20Sopenharmony_ci if (inc->i_hdr.h_sport == 0) { 3388c2ecf20Sopenharmony_ci rdsdebug("ignore ping with 0 sport from %pI6c\n", 3398c2ecf20Sopenharmony_ci saddr); 3408c2ecf20Sopenharmony_ci goto out; 3418c2ecf20Sopenharmony_ci } 3428c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_ping); 3438c2ecf20Sopenharmony_ci rds_send_pong(cp, inc->i_hdr.h_sport); 3448c2ecf20Sopenharmony_ci /* if this is a handshake ping, start multipath if necessary */ 3458c2ecf20Sopenharmony_ci if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport), 3468c2ecf20Sopenharmony_ci be16_to_cpu(inc->i_hdr.h_dport))) { 3478c2ecf20Sopenharmony_ci rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); 3488c2ecf20Sopenharmony_ci rds_start_mprds(cp->cp_conn); 3498c2ecf20Sopenharmony_ci } 3508c2ecf20Sopenharmony_ci goto out; 3518c2ecf20Sopenharmony_ci } 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT && 3548c2ecf20Sopenharmony_ci inc->i_hdr.h_sport == 0) { 3558c2ecf20Sopenharmony_ci rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); 3568c2ecf20Sopenharmony_ci /* if this is a handshake pong, start multipath if necessary */ 3578c2ecf20Sopenharmony_ci rds_start_mprds(cp->cp_conn); 3588c2ecf20Sopenharmony_ci wake_up(&cp->cp_conn->c_hs_waitq); 3598c2ecf20Sopenharmony_ci goto out; 3608c2ecf20Sopenharmony_ci } 3618c2ecf20Sopenharmony_ci 3628c2ecf20Sopenharmony_ci rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if); 3638c2ecf20Sopenharmony_ci if (!rs) { 3648c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_drop_no_sock); 3658c2ecf20Sopenharmony_ci goto out; 3668c2ecf20Sopenharmony_ci } 3678c2ecf20Sopenharmony_ci 3688c2ecf20Sopenharmony_ci /* Process extension headers */ 3698c2ecf20Sopenharmony_ci rds_recv_incoming_exthdrs(inc, rs); 3708c2ecf20Sopenharmony_ci 3718c2ecf20Sopenharmony_ci /* We can be racing with rds_release() which marks the socket dead. */ 3728c2ecf20Sopenharmony_ci sk = rds_rs_to_sk(rs); 3738c2ecf20Sopenharmony_ci 3748c2ecf20Sopenharmony_ci /* serialize with rds_release -> sock_orphan */ 3758c2ecf20Sopenharmony_ci write_lock_irqsave(&rs->rs_recv_lock, flags); 3768c2ecf20Sopenharmony_ci if (!sock_flag(sk, SOCK_DEAD)) { 3778c2ecf20Sopenharmony_ci rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs); 3788c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_queued); 3798c2ecf20Sopenharmony_ci rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 3808c2ecf20Sopenharmony_ci be32_to_cpu(inc->i_hdr.h_len), 3818c2ecf20Sopenharmony_ci inc->i_hdr.h_dport); 3828c2ecf20Sopenharmony_ci if (sock_flag(sk, SOCK_RCVTSTAMP)) 3838c2ecf20Sopenharmony_ci inc->i_usercopy.rx_tstamp = ktime_get_real(); 3848c2ecf20Sopenharmony_ci rds_inc_addref(inc); 3858c2ecf20Sopenharmony_ci inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); 3868c2ecf20Sopenharmony_ci list_add_tail(&inc->i_item, &rs->rs_recv_queue); 3878c2ecf20Sopenharmony_ci __rds_wake_sk_sleep(sk); 3888c2ecf20Sopenharmony_ci } else { 3898c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_drop_dead_sock); 3908c2ecf20Sopenharmony_ci } 3918c2ecf20Sopenharmony_ci write_unlock_irqrestore(&rs->rs_recv_lock, flags); 3928c2ecf20Sopenharmony_ci 3938c2ecf20Sopenharmony_ciout: 3948c2ecf20Sopenharmony_ci if (rs) 3958c2ecf20Sopenharmony_ci rds_sock_put(rs); 3968c2ecf20Sopenharmony_ci} 3978c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_recv_incoming); 3988c2ecf20Sopenharmony_ci 3998c2ecf20Sopenharmony_ci/* 4008c2ecf20Sopenharmony_ci * be very careful here. This is being called as the condition in 4018c2ecf20Sopenharmony_ci * wait_event_*() needs to cope with being called many times. 4028c2ecf20Sopenharmony_ci */ 4038c2ecf20Sopenharmony_cistatic int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) 4048c2ecf20Sopenharmony_ci{ 4058c2ecf20Sopenharmony_ci unsigned long flags; 4068c2ecf20Sopenharmony_ci 4078c2ecf20Sopenharmony_ci if (!*inc) { 4088c2ecf20Sopenharmony_ci read_lock_irqsave(&rs->rs_recv_lock, flags); 4098c2ecf20Sopenharmony_ci if (!list_empty(&rs->rs_recv_queue)) { 4108c2ecf20Sopenharmony_ci *inc = list_entry(rs->rs_recv_queue.next, 4118c2ecf20Sopenharmony_ci struct rds_incoming, 4128c2ecf20Sopenharmony_ci i_item); 4138c2ecf20Sopenharmony_ci rds_inc_addref(*inc); 4148c2ecf20Sopenharmony_ci } 4158c2ecf20Sopenharmony_ci read_unlock_irqrestore(&rs->rs_recv_lock, flags); 4168c2ecf20Sopenharmony_ci } 4178c2ecf20Sopenharmony_ci 4188c2ecf20Sopenharmony_ci return *inc != NULL; 4198c2ecf20Sopenharmony_ci} 4208c2ecf20Sopenharmony_ci 4218c2ecf20Sopenharmony_cistatic int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, 4228c2ecf20Sopenharmony_ci int drop) 4238c2ecf20Sopenharmony_ci{ 4248c2ecf20Sopenharmony_ci struct sock *sk = rds_rs_to_sk(rs); 4258c2ecf20Sopenharmony_ci int ret = 0; 4268c2ecf20Sopenharmony_ci unsigned long flags; 4278c2ecf20Sopenharmony_ci 4288c2ecf20Sopenharmony_ci write_lock_irqsave(&rs->rs_recv_lock, flags); 4298c2ecf20Sopenharmony_ci if (!list_empty(&inc->i_item)) { 4308c2ecf20Sopenharmony_ci ret = 1; 4318c2ecf20Sopenharmony_ci if (drop) { 4328c2ecf20Sopenharmony_ci /* XXX make sure this i_conn is reliable */ 4338c2ecf20Sopenharmony_ci rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 4348c2ecf20Sopenharmony_ci -be32_to_cpu(inc->i_hdr.h_len), 4358c2ecf20Sopenharmony_ci inc->i_hdr.h_dport); 4368c2ecf20Sopenharmony_ci list_del_init(&inc->i_item); 4378c2ecf20Sopenharmony_ci rds_inc_put(inc); 4388c2ecf20Sopenharmony_ci } 4398c2ecf20Sopenharmony_ci } 4408c2ecf20Sopenharmony_ci write_unlock_irqrestore(&rs->rs_recv_lock, flags); 4418c2ecf20Sopenharmony_ci 4428c2ecf20Sopenharmony_ci rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); 4438c2ecf20Sopenharmony_ci return ret; 4448c2ecf20Sopenharmony_ci} 4458c2ecf20Sopenharmony_ci 4468c2ecf20Sopenharmony_ci/* 4478c2ecf20Sopenharmony_ci * Pull errors off the error queue. 4488c2ecf20Sopenharmony_ci * If msghdr is NULL, we will just purge the error queue. 4498c2ecf20Sopenharmony_ci */ 4508c2ecf20Sopenharmony_ciint rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) 4518c2ecf20Sopenharmony_ci{ 4528c2ecf20Sopenharmony_ci struct rds_notifier *notifier; 4538c2ecf20Sopenharmony_ci struct rds_rdma_notify cmsg; 4548c2ecf20Sopenharmony_ci unsigned int count = 0, max_messages = ~0U; 4558c2ecf20Sopenharmony_ci unsigned long flags; 4568c2ecf20Sopenharmony_ci LIST_HEAD(copy); 4578c2ecf20Sopenharmony_ci int err = 0; 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_ci memset(&cmsg, 0, sizeof(cmsg)); /* fill holes with zero */ 4608c2ecf20Sopenharmony_ci 4618c2ecf20Sopenharmony_ci /* put_cmsg copies to user space and thus may sleep. We can't do this 4628c2ecf20Sopenharmony_ci * with rs_lock held, so first grab as many notifications as we can stuff 4638c2ecf20Sopenharmony_ci * in the user provided cmsg buffer. We don't try to copy more, to avoid 4648c2ecf20Sopenharmony_ci * losing notifications - except when the buffer is so small that it wouldn't 4658c2ecf20Sopenharmony_ci * even hold a single notification. Then we give him as much of this single 4668c2ecf20Sopenharmony_ci * msg as we can squeeze in, and set MSG_CTRUNC. 4678c2ecf20Sopenharmony_ci */ 4688c2ecf20Sopenharmony_ci if (msghdr) { 4698c2ecf20Sopenharmony_ci max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg)); 4708c2ecf20Sopenharmony_ci if (!max_messages) 4718c2ecf20Sopenharmony_ci max_messages = 1; 4728c2ecf20Sopenharmony_ci } 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci spin_lock_irqsave(&rs->rs_lock, flags); 4758c2ecf20Sopenharmony_ci while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { 4768c2ecf20Sopenharmony_ci notifier = list_entry(rs->rs_notify_queue.next, 4778c2ecf20Sopenharmony_ci struct rds_notifier, n_list); 4788c2ecf20Sopenharmony_ci list_move(¬ifier->n_list, ©); 4798c2ecf20Sopenharmony_ci count++; 4808c2ecf20Sopenharmony_ci } 4818c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&rs->rs_lock, flags); 4828c2ecf20Sopenharmony_ci 4838c2ecf20Sopenharmony_ci if (!count) 4848c2ecf20Sopenharmony_ci return 0; 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_ci while (!list_empty(©)) { 4878c2ecf20Sopenharmony_ci notifier = list_entry(copy.next, struct rds_notifier, n_list); 4888c2ecf20Sopenharmony_ci 4898c2ecf20Sopenharmony_ci if (msghdr) { 4908c2ecf20Sopenharmony_ci cmsg.user_token = notifier->n_user_token; 4918c2ecf20Sopenharmony_ci cmsg.status = notifier->n_status; 4928c2ecf20Sopenharmony_ci 4938c2ecf20Sopenharmony_ci err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, 4948c2ecf20Sopenharmony_ci sizeof(cmsg), &cmsg); 4958c2ecf20Sopenharmony_ci if (err) 4968c2ecf20Sopenharmony_ci break; 4978c2ecf20Sopenharmony_ci } 4988c2ecf20Sopenharmony_ci 4998c2ecf20Sopenharmony_ci list_del_init(¬ifier->n_list); 5008c2ecf20Sopenharmony_ci kfree(notifier); 5018c2ecf20Sopenharmony_ci } 5028c2ecf20Sopenharmony_ci 5038c2ecf20Sopenharmony_ci /* If we bailed out because of an error in put_cmsg, 5048c2ecf20Sopenharmony_ci * we may be left with one or more notifications that we 5058c2ecf20Sopenharmony_ci * didn't process. Return them to the head of the list. */ 5068c2ecf20Sopenharmony_ci if (!list_empty(©)) { 5078c2ecf20Sopenharmony_ci spin_lock_irqsave(&rs->rs_lock, flags); 5088c2ecf20Sopenharmony_ci list_splice(©, &rs->rs_notify_queue); 5098c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&rs->rs_lock, flags); 5108c2ecf20Sopenharmony_ci } 5118c2ecf20Sopenharmony_ci 5128c2ecf20Sopenharmony_ci return err; 5138c2ecf20Sopenharmony_ci} 5148c2ecf20Sopenharmony_ci 5158c2ecf20Sopenharmony_ci/* 5168c2ecf20Sopenharmony_ci * Queue a congestion notification 5178c2ecf20Sopenharmony_ci */ 5188c2ecf20Sopenharmony_cistatic int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) 5198c2ecf20Sopenharmony_ci{ 5208c2ecf20Sopenharmony_ci uint64_t notify = rs->rs_cong_notify; 5218c2ecf20Sopenharmony_ci unsigned long flags; 5228c2ecf20Sopenharmony_ci int err; 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_ci err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, 5258c2ecf20Sopenharmony_ci sizeof(notify), ¬ify); 5268c2ecf20Sopenharmony_ci if (err) 5278c2ecf20Sopenharmony_ci return err; 5288c2ecf20Sopenharmony_ci 5298c2ecf20Sopenharmony_ci spin_lock_irqsave(&rs->rs_lock, flags); 5308c2ecf20Sopenharmony_ci rs->rs_cong_notify &= ~notify; 5318c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&rs->rs_lock, flags); 5328c2ecf20Sopenharmony_ci 5338c2ecf20Sopenharmony_ci return 0; 5348c2ecf20Sopenharmony_ci} 5358c2ecf20Sopenharmony_ci 5368c2ecf20Sopenharmony_ci/* 5378c2ecf20Sopenharmony_ci * Receive any control messages. 5388c2ecf20Sopenharmony_ci */ 5398c2ecf20Sopenharmony_cistatic int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, 5408c2ecf20Sopenharmony_ci struct rds_sock *rs) 5418c2ecf20Sopenharmony_ci{ 5428c2ecf20Sopenharmony_ci int ret = 0; 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_ci if (inc->i_usercopy.rdma_cookie) { 5458c2ecf20Sopenharmony_ci ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, 5468c2ecf20Sopenharmony_ci sizeof(inc->i_usercopy.rdma_cookie), 5478c2ecf20Sopenharmony_ci &inc->i_usercopy.rdma_cookie); 5488c2ecf20Sopenharmony_ci if (ret) 5498c2ecf20Sopenharmony_ci goto out; 5508c2ecf20Sopenharmony_ci } 5518c2ecf20Sopenharmony_ci 5528c2ecf20Sopenharmony_ci if ((inc->i_usercopy.rx_tstamp != 0) && 5538c2ecf20Sopenharmony_ci sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { 5548c2ecf20Sopenharmony_ci struct __kernel_old_timeval tv = 5558c2ecf20Sopenharmony_ci ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp); 5568c2ecf20Sopenharmony_ci 5578c2ecf20Sopenharmony_ci if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) { 5588c2ecf20Sopenharmony_ci ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, 5598c2ecf20Sopenharmony_ci sizeof(tv), &tv); 5608c2ecf20Sopenharmony_ci } else { 5618c2ecf20Sopenharmony_ci struct __kernel_sock_timeval sk_tv; 5628c2ecf20Sopenharmony_ci 5638c2ecf20Sopenharmony_ci sk_tv.tv_sec = tv.tv_sec; 5648c2ecf20Sopenharmony_ci sk_tv.tv_usec = tv.tv_usec; 5658c2ecf20Sopenharmony_ci 5668c2ecf20Sopenharmony_ci ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, 5678c2ecf20Sopenharmony_ci sizeof(sk_tv), &sk_tv); 5688c2ecf20Sopenharmony_ci } 5698c2ecf20Sopenharmony_ci 5708c2ecf20Sopenharmony_ci if (ret) 5718c2ecf20Sopenharmony_ci goto out; 5728c2ecf20Sopenharmony_ci } 5738c2ecf20Sopenharmony_ci 5748c2ecf20Sopenharmony_ci if (rs->rs_rx_traces) { 5758c2ecf20Sopenharmony_ci struct rds_cmsg_rx_trace t; 5768c2ecf20Sopenharmony_ci int i, j; 5778c2ecf20Sopenharmony_ci 5788c2ecf20Sopenharmony_ci memset(&t, 0, sizeof(t)); 5798c2ecf20Sopenharmony_ci inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); 5808c2ecf20Sopenharmony_ci t.rx_traces = rs->rs_rx_traces; 5818c2ecf20Sopenharmony_ci for (i = 0; i < rs->rs_rx_traces; i++) { 5828c2ecf20Sopenharmony_ci j = rs->rs_rx_trace[i]; 5838c2ecf20Sopenharmony_ci t.rx_trace_pos[i] = j; 5848c2ecf20Sopenharmony_ci t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] - 5858c2ecf20Sopenharmony_ci inc->i_rx_lat_trace[j]; 5868c2ecf20Sopenharmony_ci } 5878c2ecf20Sopenharmony_ci 5888c2ecf20Sopenharmony_ci ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY, 5898c2ecf20Sopenharmony_ci sizeof(t), &t); 5908c2ecf20Sopenharmony_ci if (ret) 5918c2ecf20Sopenharmony_ci goto out; 5928c2ecf20Sopenharmony_ci } 5938c2ecf20Sopenharmony_ci 5948c2ecf20Sopenharmony_ciout: 5958c2ecf20Sopenharmony_ci return ret; 5968c2ecf20Sopenharmony_ci} 5978c2ecf20Sopenharmony_ci 5988c2ecf20Sopenharmony_cistatic bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) 5998c2ecf20Sopenharmony_ci{ 6008c2ecf20Sopenharmony_ci struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue; 6018c2ecf20Sopenharmony_ci struct rds_msg_zcopy_info *info = NULL; 6028c2ecf20Sopenharmony_ci struct rds_zcopy_cookies *done; 6038c2ecf20Sopenharmony_ci unsigned long flags; 6048c2ecf20Sopenharmony_ci 6058c2ecf20Sopenharmony_ci if (!msg->msg_control) 6068c2ecf20Sopenharmony_ci return false; 6078c2ecf20Sopenharmony_ci 6088c2ecf20Sopenharmony_ci if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || 6098c2ecf20Sopenharmony_ci msg->msg_controllen < CMSG_SPACE(sizeof(*done))) 6108c2ecf20Sopenharmony_ci return false; 6118c2ecf20Sopenharmony_ci 6128c2ecf20Sopenharmony_ci spin_lock_irqsave(&q->lock, flags); 6138c2ecf20Sopenharmony_ci if (!list_empty(&q->zcookie_head)) { 6148c2ecf20Sopenharmony_ci info = list_entry(q->zcookie_head.next, 6158c2ecf20Sopenharmony_ci struct rds_msg_zcopy_info, rs_zcookie_next); 6168c2ecf20Sopenharmony_ci list_del(&info->rs_zcookie_next); 6178c2ecf20Sopenharmony_ci } 6188c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&q->lock, flags); 6198c2ecf20Sopenharmony_ci if (!info) 6208c2ecf20Sopenharmony_ci return false; 6218c2ecf20Sopenharmony_ci done = &info->zcookies; 6228c2ecf20Sopenharmony_ci if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), 6238c2ecf20Sopenharmony_ci done)) { 6248c2ecf20Sopenharmony_ci spin_lock_irqsave(&q->lock, flags); 6258c2ecf20Sopenharmony_ci list_add(&info->rs_zcookie_next, &q->zcookie_head); 6268c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&q->lock, flags); 6278c2ecf20Sopenharmony_ci return false; 6288c2ecf20Sopenharmony_ci } 6298c2ecf20Sopenharmony_ci kfree(info); 6308c2ecf20Sopenharmony_ci return true; 6318c2ecf20Sopenharmony_ci} 6328c2ecf20Sopenharmony_ci 6338c2ecf20Sopenharmony_ciint rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 6348c2ecf20Sopenharmony_ci int msg_flags) 6358c2ecf20Sopenharmony_ci{ 6368c2ecf20Sopenharmony_ci struct sock *sk = sock->sk; 6378c2ecf20Sopenharmony_ci struct rds_sock *rs = rds_sk_to_rs(sk); 6388c2ecf20Sopenharmony_ci long timeo; 6398c2ecf20Sopenharmony_ci int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; 6408c2ecf20Sopenharmony_ci DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); 6418c2ecf20Sopenharmony_ci DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); 6428c2ecf20Sopenharmony_ci struct rds_incoming *inc = NULL; 6438c2ecf20Sopenharmony_ci 6448c2ecf20Sopenharmony_ci /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ 6458c2ecf20Sopenharmony_ci timeo = sock_rcvtimeo(sk, nonblock); 6468c2ecf20Sopenharmony_ci 6478c2ecf20Sopenharmony_ci rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); 6488c2ecf20Sopenharmony_ci 6498c2ecf20Sopenharmony_ci if (msg_flags & MSG_OOB) 6508c2ecf20Sopenharmony_ci goto out; 6518c2ecf20Sopenharmony_ci if (msg_flags & MSG_ERRQUEUE) 6528c2ecf20Sopenharmony_ci return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); 6538c2ecf20Sopenharmony_ci 6548c2ecf20Sopenharmony_ci while (1) { 6558c2ecf20Sopenharmony_ci /* If there are pending notifications, do those - and nothing else */ 6568c2ecf20Sopenharmony_ci if (!list_empty(&rs->rs_notify_queue)) { 6578c2ecf20Sopenharmony_ci ret = rds_notify_queue_get(rs, msg); 6588c2ecf20Sopenharmony_ci break; 6598c2ecf20Sopenharmony_ci } 6608c2ecf20Sopenharmony_ci 6618c2ecf20Sopenharmony_ci if (rs->rs_cong_notify) { 6628c2ecf20Sopenharmony_ci ret = rds_notify_cong(rs, msg); 6638c2ecf20Sopenharmony_ci break; 6648c2ecf20Sopenharmony_ci } 6658c2ecf20Sopenharmony_ci 6668c2ecf20Sopenharmony_ci if (!rds_next_incoming(rs, &inc)) { 6678c2ecf20Sopenharmony_ci if (nonblock) { 6688c2ecf20Sopenharmony_ci bool reaped = rds_recvmsg_zcookie(rs, msg); 6698c2ecf20Sopenharmony_ci 6708c2ecf20Sopenharmony_ci ret = reaped ? 0 : -EAGAIN; 6718c2ecf20Sopenharmony_ci break; 6728c2ecf20Sopenharmony_ci } 6738c2ecf20Sopenharmony_ci 6748c2ecf20Sopenharmony_ci timeo = wait_event_interruptible_timeout(*sk_sleep(sk), 6758c2ecf20Sopenharmony_ci (!list_empty(&rs->rs_notify_queue) || 6768c2ecf20Sopenharmony_ci rs->rs_cong_notify || 6778c2ecf20Sopenharmony_ci rds_next_incoming(rs, &inc)), timeo); 6788c2ecf20Sopenharmony_ci rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, 6798c2ecf20Sopenharmony_ci timeo); 6808c2ecf20Sopenharmony_ci if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) 6818c2ecf20Sopenharmony_ci continue; 6828c2ecf20Sopenharmony_ci 6838c2ecf20Sopenharmony_ci ret = timeo; 6848c2ecf20Sopenharmony_ci if (ret == 0) 6858c2ecf20Sopenharmony_ci ret = -ETIMEDOUT; 6868c2ecf20Sopenharmony_ci break; 6878c2ecf20Sopenharmony_ci } 6888c2ecf20Sopenharmony_ci 6898c2ecf20Sopenharmony_ci rdsdebug("copying inc %p from %pI6c:%u to user\n", inc, 6908c2ecf20Sopenharmony_ci &inc->i_conn->c_faddr, 6918c2ecf20Sopenharmony_ci ntohs(inc->i_hdr.h_sport)); 6928c2ecf20Sopenharmony_ci ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); 6938c2ecf20Sopenharmony_ci if (ret < 0) 6948c2ecf20Sopenharmony_ci break; 6958c2ecf20Sopenharmony_ci 6968c2ecf20Sopenharmony_ci /* 6978c2ecf20Sopenharmony_ci * if the message we just copied isn't at the head of the 6988c2ecf20Sopenharmony_ci * recv queue then someone else raced us to return it, try 6998c2ecf20Sopenharmony_ci * to get the next message. 7008c2ecf20Sopenharmony_ci */ 7018c2ecf20Sopenharmony_ci if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { 7028c2ecf20Sopenharmony_ci rds_inc_put(inc); 7038c2ecf20Sopenharmony_ci inc = NULL; 7048c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_deliver_raced); 7058c2ecf20Sopenharmony_ci iov_iter_revert(&msg->msg_iter, ret); 7068c2ecf20Sopenharmony_ci continue; 7078c2ecf20Sopenharmony_ci } 7088c2ecf20Sopenharmony_ci 7098c2ecf20Sopenharmony_ci if (ret < be32_to_cpu(inc->i_hdr.h_len)) { 7108c2ecf20Sopenharmony_ci if (msg_flags & MSG_TRUNC) 7118c2ecf20Sopenharmony_ci ret = be32_to_cpu(inc->i_hdr.h_len); 7128c2ecf20Sopenharmony_ci msg->msg_flags |= MSG_TRUNC; 7138c2ecf20Sopenharmony_ci } 7148c2ecf20Sopenharmony_ci 7158c2ecf20Sopenharmony_ci if (rds_cmsg_recv(inc, msg, rs)) { 7168c2ecf20Sopenharmony_ci ret = -EFAULT; 7178c2ecf20Sopenharmony_ci break; 7188c2ecf20Sopenharmony_ci } 7198c2ecf20Sopenharmony_ci rds_recvmsg_zcookie(rs, msg); 7208c2ecf20Sopenharmony_ci 7218c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_delivered); 7228c2ecf20Sopenharmony_ci 7238c2ecf20Sopenharmony_ci if (msg->msg_name) { 7248c2ecf20Sopenharmony_ci if (ipv6_addr_v4mapped(&inc->i_saddr)) { 7258c2ecf20Sopenharmony_ci sin = (struct sockaddr_in *)msg->msg_name; 7268c2ecf20Sopenharmony_ci 7278c2ecf20Sopenharmony_ci sin->sin_family = AF_INET; 7288c2ecf20Sopenharmony_ci sin->sin_port = inc->i_hdr.h_sport; 7298c2ecf20Sopenharmony_ci sin->sin_addr.s_addr = 7308c2ecf20Sopenharmony_ci inc->i_saddr.s6_addr32[3]; 7318c2ecf20Sopenharmony_ci memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 7328c2ecf20Sopenharmony_ci msg->msg_namelen = sizeof(*sin); 7338c2ecf20Sopenharmony_ci } else { 7348c2ecf20Sopenharmony_ci sin6 = (struct sockaddr_in6 *)msg->msg_name; 7358c2ecf20Sopenharmony_ci 7368c2ecf20Sopenharmony_ci sin6->sin6_family = AF_INET6; 7378c2ecf20Sopenharmony_ci sin6->sin6_port = inc->i_hdr.h_sport; 7388c2ecf20Sopenharmony_ci sin6->sin6_addr = inc->i_saddr; 7398c2ecf20Sopenharmony_ci sin6->sin6_flowinfo = 0; 7408c2ecf20Sopenharmony_ci sin6->sin6_scope_id = rs->rs_bound_scope_id; 7418c2ecf20Sopenharmony_ci msg->msg_namelen = sizeof(*sin6); 7428c2ecf20Sopenharmony_ci } 7438c2ecf20Sopenharmony_ci } 7448c2ecf20Sopenharmony_ci break; 7458c2ecf20Sopenharmony_ci } 7468c2ecf20Sopenharmony_ci 7478c2ecf20Sopenharmony_ci if (inc) 7488c2ecf20Sopenharmony_ci rds_inc_put(inc); 7498c2ecf20Sopenharmony_ci 7508c2ecf20Sopenharmony_ciout: 7518c2ecf20Sopenharmony_ci return ret; 7528c2ecf20Sopenharmony_ci} 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci/* 7558c2ecf20Sopenharmony_ci * The socket is being shut down and we're asked to drop messages that were 7568c2ecf20Sopenharmony_ci * queued for recvmsg. The caller has unbound the socket so the receive path 7578c2ecf20Sopenharmony_ci * won't queue any more incoming fragments or messages on the socket. 7588c2ecf20Sopenharmony_ci */ 7598c2ecf20Sopenharmony_civoid rds_clear_recv_queue(struct rds_sock *rs) 7608c2ecf20Sopenharmony_ci{ 7618c2ecf20Sopenharmony_ci struct sock *sk = rds_rs_to_sk(rs); 7628c2ecf20Sopenharmony_ci struct rds_incoming *inc, *tmp; 7638c2ecf20Sopenharmony_ci unsigned long flags; 7648c2ecf20Sopenharmony_ci 7658c2ecf20Sopenharmony_ci write_lock_irqsave(&rs->rs_recv_lock, flags); 7668c2ecf20Sopenharmony_ci list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { 7678c2ecf20Sopenharmony_ci rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 7688c2ecf20Sopenharmony_ci -be32_to_cpu(inc->i_hdr.h_len), 7698c2ecf20Sopenharmony_ci inc->i_hdr.h_dport); 7708c2ecf20Sopenharmony_ci list_del_init(&inc->i_item); 7718c2ecf20Sopenharmony_ci rds_inc_put(inc); 7728c2ecf20Sopenharmony_ci } 7738c2ecf20Sopenharmony_ci write_unlock_irqrestore(&rs->rs_recv_lock, flags); 7748c2ecf20Sopenharmony_ci} 7758c2ecf20Sopenharmony_ci 7768c2ecf20Sopenharmony_ci/* 7778c2ecf20Sopenharmony_ci * inc->i_saddr isn't used here because it is only set in the receive 7788c2ecf20Sopenharmony_ci * path. 7798c2ecf20Sopenharmony_ci */ 7808c2ecf20Sopenharmony_civoid rds_inc_info_copy(struct rds_incoming *inc, 7818c2ecf20Sopenharmony_ci struct rds_info_iterator *iter, 7828c2ecf20Sopenharmony_ci __be32 saddr, __be32 daddr, int flip) 7838c2ecf20Sopenharmony_ci{ 7848c2ecf20Sopenharmony_ci struct rds_info_message minfo; 7858c2ecf20Sopenharmony_ci 7868c2ecf20Sopenharmony_ci minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); 7878c2ecf20Sopenharmony_ci minfo.len = be32_to_cpu(inc->i_hdr.h_len); 7888c2ecf20Sopenharmony_ci minfo.tos = inc->i_conn->c_tos; 7898c2ecf20Sopenharmony_ci 7908c2ecf20Sopenharmony_ci if (flip) { 7918c2ecf20Sopenharmony_ci minfo.laddr = daddr; 7928c2ecf20Sopenharmony_ci minfo.faddr = saddr; 7938c2ecf20Sopenharmony_ci minfo.lport = inc->i_hdr.h_dport; 7948c2ecf20Sopenharmony_ci minfo.fport = inc->i_hdr.h_sport; 7958c2ecf20Sopenharmony_ci } else { 7968c2ecf20Sopenharmony_ci minfo.laddr = saddr; 7978c2ecf20Sopenharmony_ci minfo.faddr = daddr; 7988c2ecf20Sopenharmony_ci minfo.lport = inc->i_hdr.h_sport; 7998c2ecf20Sopenharmony_ci minfo.fport = inc->i_hdr.h_dport; 8008c2ecf20Sopenharmony_ci } 8018c2ecf20Sopenharmony_ci 8028c2ecf20Sopenharmony_ci minfo.flags = 0; 8038c2ecf20Sopenharmony_ci 8048c2ecf20Sopenharmony_ci rds_info_copy(iter, &minfo, sizeof(minfo)); 8058c2ecf20Sopenharmony_ci} 8068c2ecf20Sopenharmony_ci 8078c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 8088c2ecf20Sopenharmony_civoid rds6_inc_info_copy(struct rds_incoming *inc, 8098c2ecf20Sopenharmony_ci struct rds_info_iterator *iter, 8108c2ecf20Sopenharmony_ci struct in6_addr *saddr, struct in6_addr *daddr, 8118c2ecf20Sopenharmony_ci int flip) 8128c2ecf20Sopenharmony_ci{ 8138c2ecf20Sopenharmony_ci struct rds6_info_message minfo6; 8148c2ecf20Sopenharmony_ci 8158c2ecf20Sopenharmony_ci minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence); 8168c2ecf20Sopenharmony_ci minfo6.len = be32_to_cpu(inc->i_hdr.h_len); 8178c2ecf20Sopenharmony_ci minfo6.tos = inc->i_conn->c_tos; 8188c2ecf20Sopenharmony_ci 8198c2ecf20Sopenharmony_ci if (flip) { 8208c2ecf20Sopenharmony_ci minfo6.laddr = *daddr; 8218c2ecf20Sopenharmony_ci minfo6.faddr = *saddr; 8228c2ecf20Sopenharmony_ci minfo6.lport = inc->i_hdr.h_dport; 8238c2ecf20Sopenharmony_ci minfo6.fport = inc->i_hdr.h_sport; 8248c2ecf20Sopenharmony_ci } else { 8258c2ecf20Sopenharmony_ci minfo6.laddr = *saddr; 8268c2ecf20Sopenharmony_ci minfo6.faddr = *daddr; 8278c2ecf20Sopenharmony_ci minfo6.lport = inc->i_hdr.h_sport; 8288c2ecf20Sopenharmony_ci minfo6.fport = inc->i_hdr.h_dport; 8298c2ecf20Sopenharmony_ci } 8308c2ecf20Sopenharmony_ci 8318c2ecf20Sopenharmony_ci minfo6.flags = 0; 8328c2ecf20Sopenharmony_ci 8338c2ecf20Sopenharmony_ci rds_info_copy(iter, &minfo6, sizeof(minfo6)); 8348c2ecf20Sopenharmony_ci} 8358c2ecf20Sopenharmony_ci#endif 836