18c2ecf20Sopenharmony_ci/* 28c2ecf20Sopenharmony_ci * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * This software is available to you under a choice of one of two 58c2ecf20Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 68c2ecf20Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 78c2ecf20Sopenharmony_ci * COPYING in the main directory of this source tree, or the 88c2ecf20Sopenharmony_ci * OpenIB.org BSD license below: 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * Redistribution and use in source and binary forms, with or 118c2ecf20Sopenharmony_ci * without modification, are permitted provided that the following 128c2ecf20Sopenharmony_ci * conditions are met: 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * - Redistributions of source code must retain the above 158c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 168c2ecf20Sopenharmony_ci * disclaimer. 178c2ecf20Sopenharmony_ci * 188c2ecf20Sopenharmony_ci * - Redistributions in binary form must reproduce the above 198c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 208c2ecf20Sopenharmony_ci * disclaimer in the documentation and/or other materials 218c2ecf20Sopenharmony_ci * provided with the distribution. 228c2ecf20Sopenharmony_ci * 238c2ecf20Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 248c2ecf20Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 258c2ecf20Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 268c2ecf20Sopenharmony_ci * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 278c2ecf20Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 288c2ecf20Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 298c2ecf20Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 308c2ecf20Sopenharmony_ci * SOFTWARE. 318c2ecf20Sopenharmony_ci * 328c2ecf20Sopenharmony_ci */ 338c2ecf20Sopenharmony_ci#include <linux/kernel.h> 348c2ecf20Sopenharmony_ci#include <linux/moduleparam.h> 358c2ecf20Sopenharmony_ci#include <linux/gfp.h> 368c2ecf20Sopenharmony_ci#include <net/sock.h> 378c2ecf20Sopenharmony_ci#include <linux/in.h> 388c2ecf20Sopenharmony_ci#include <linux/list.h> 398c2ecf20Sopenharmony_ci#include <linux/ratelimit.h> 408c2ecf20Sopenharmony_ci#include <linux/export.h> 418c2ecf20Sopenharmony_ci#include <linux/sizes.h> 428c2ecf20Sopenharmony_ci 438c2ecf20Sopenharmony_ci#include "rds.h" 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_ci/* When transmitting messages in rds_send_xmit, we need to emerge from 468c2ecf20Sopenharmony_ci * time to time and briefly release the CPU. Otherwise the softlock watchdog 478c2ecf20Sopenharmony_ci * will kick our shin. 488c2ecf20Sopenharmony_ci * Also, it seems fairer to not let one busy connection stall all the 498c2ecf20Sopenharmony_ci * others. 508c2ecf20Sopenharmony_ci * 518c2ecf20Sopenharmony_ci * send_batch_count is the number of times we'll loop in send_xmit. Setting 528c2ecf20Sopenharmony_ci * it to 0 will restore the old behavior (where we looped until we had 538c2ecf20Sopenharmony_ci * drained the queue). 
548c2ecf20Sopenharmony_ci */ 558c2ecf20Sopenharmony_cistatic int send_batch_count = SZ_1K; 568c2ecf20Sopenharmony_cimodule_param(send_batch_count, int, 0444); 578c2ecf20Sopenharmony_ciMODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); 588c2ecf20Sopenharmony_ci 598c2ecf20Sopenharmony_cistatic void rds_send_remove_from_sock(struct list_head *messages, int status); 608c2ecf20Sopenharmony_ci 618c2ecf20Sopenharmony_ci/* 628c2ecf20Sopenharmony_ci * Reset the send state. Callers must ensure that this doesn't race with 638c2ecf20Sopenharmony_ci * rds_send_xmit(). 648c2ecf20Sopenharmony_ci */ 658c2ecf20Sopenharmony_civoid rds_send_path_reset(struct rds_conn_path *cp) 668c2ecf20Sopenharmony_ci{ 678c2ecf20Sopenharmony_ci struct rds_message *rm, *tmp; 688c2ecf20Sopenharmony_ci unsigned long flags; 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_ci if (cp->cp_xmit_rm) { 718c2ecf20Sopenharmony_ci rm = cp->cp_xmit_rm; 728c2ecf20Sopenharmony_ci cp->cp_xmit_rm = NULL; 738c2ecf20Sopenharmony_ci /* Tell the user the RDMA op is no longer mapped by the 748c2ecf20Sopenharmony_ci * transport. 
This isn't entirely true (it's flushed out 758c2ecf20Sopenharmony_ci * independently) but as the connection is down, there's 768c2ecf20Sopenharmony_ci * no ongoing RDMA to/from that memory */ 778c2ecf20Sopenharmony_ci rds_message_unmapped(rm); 788c2ecf20Sopenharmony_ci rds_message_put(rm); 798c2ecf20Sopenharmony_ci } 808c2ecf20Sopenharmony_ci 818c2ecf20Sopenharmony_ci cp->cp_xmit_sg = 0; 828c2ecf20Sopenharmony_ci cp->cp_xmit_hdr_off = 0; 838c2ecf20Sopenharmony_ci cp->cp_xmit_data_off = 0; 848c2ecf20Sopenharmony_ci cp->cp_xmit_atomic_sent = 0; 858c2ecf20Sopenharmony_ci cp->cp_xmit_rdma_sent = 0; 868c2ecf20Sopenharmony_ci cp->cp_xmit_data_sent = 0; 878c2ecf20Sopenharmony_ci 888c2ecf20Sopenharmony_ci cp->cp_conn->c_map_queued = 0; 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; 918c2ecf20Sopenharmony_ci cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci /* Mark messages as retransmissions, and move them to the send q */ 948c2ecf20Sopenharmony_ci spin_lock_irqsave(&cp->cp_lock, flags); 958c2ecf20Sopenharmony_ci list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { 968c2ecf20Sopenharmony_ci set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); 978c2ecf20Sopenharmony_ci set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); 988c2ecf20Sopenharmony_ci } 998c2ecf20Sopenharmony_ci list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); 1008c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&cp->cp_lock, flags); 1018c2ecf20Sopenharmony_ci} 1028c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_send_path_reset); 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_cistatic int acquire_in_xmit(struct rds_conn_path *cp) 1058c2ecf20Sopenharmony_ci{ 1068c2ecf20Sopenharmony_ci return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0; 1078c2ecf20Sopenharmony_ci} 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_cistatic void release_in_xmit(struct rds_conn_path *cp) 
1108c2ecf20Sopenharmony_ci{ 1118c2ecf20Sopenharmony_ci clear_bit(RDS_IN_XMIT, &cp->cp_flags); 1128c2ecf20Sopenharmony_ci smp_mb__after_atomic(); 1138c2ecf20Sopenharmony_ci /* 1148c2ecf20Sopenharmony_ci * We don't use wait_on_bit()/wake_up_bit() because our waking is in a 1158c2ecf20Sopenharmony_ci * hot path and finding waiters is very rare. We don't want to walk 1168c2ecf20Sopenharmony_ci * the system-wide hashed waitqueue buckets in the fast path only to 1178c2ecf20Sopenharmony_ci * almost never find waiters. 1188c2ecf20Sopenharmony_ci */ 1198c2ecf20Sopenharmony_ci if (waitqueue_active(&cp->cp_waitq)) 1208c2ecf20Sopenharmony_ci wake_up_all(&cp->cp_waitq); 1218c2ecf20Sopenharmony_ci} 1228c2ecf20Sopenharmony_ci 1238c2ecf20Sopenharmony_ci/* 1248c2ecf20Sopenharmony_ci * We're making the conscious trade-off here to only send one message 1258c2ecf20Sopenharmony_ci * down the connection at a time. 1268c2ecf20Sopenharmony_ci * Pro: 1278c2ecf20Sopenharmony_ci * - tx queueing is a simple fifo list 1288c2ecf20Sopenharmony_ci * - reassembly is optional and easily done by transports per conn 1298c2ecf20Sopenharmony_ci * - no per flow rx lookup at all, straight to the socket 1308c2ecf20Sopenharmony_ci * - less per-frag memory and wire overhead 1318c2ecf20Sopenharmony_ci * Con: 1328c2ecf20Sopenharmony_ci * - queued acks can be delayed behind large messages 1338c2ecf20Sopenharmony_ci * Depends: 1348c2ecf20Sopenharmony_ci * - small message latency is higher behind queued large messages 1358c2ecf20Sopenharmony_ci * - large message latency isn't starved by intervening small sends 1368c2ecf20Sopenharmony_ci */ 1378c2ecf20Sopenharmony_ciint rds_send_xmit(struct rds_conn_path *cp) 1388c2ecf20Sopenharmony_ci{ 1398c2ecf20Sopenharmony_ci struct rds_connection *conn = cp->cp_conn; 1408c2ecf20Sopenharmony_ci struct rds_message *rm; 1418c2ecf20Sopenharmony_ci unsigned long flags; 1428c2ecf20Sopenharmony_ci unsigned int tmp; 1438c2ecf20Sopenharmony_ci struct scatterlist *sg; 
1448c2ecf20Sopenharmony_ci int ret = 0; 1458c2ecf20Sopenharmony_ci LIST_HEAD(to_be_dropped); 1468c2ecf20Sopenharmony_ci int batch_count; 1478c2ecf20Sopenharmony_ci unsigned long send_gen = 0; 1488c2ecf20Sopenharmony_ci int same_rm = 0; 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_cirestart: 1518c2ecf20Sopenharmony_ci batch_count = 0; 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci /* 1548c2ecf20Sopenharmony_ci * sendmsg calls here after having queued its message on the send 1558c2ecf20Sopenharmony_ci * queue. We only have one task feeding the connection at a time. If 1568c2ecf20Sopenharmony_ci * another thread is already feeding the queue then we back off. This 1578c2ecf20Sopenharmony_ci * avoids blocking the caller and trading per-connection data between 1588c2ecf20Sopenharmony_ci * caches per message. 1598c2ecf20Sopenharmony_ci */ 1608c2ecf20Sopenharmony_ci if (!acquire_in_xmit(cp)) { 1618c2ecf20Sopenharmony_ci rds_stats_inc(s_send_lock_contention); 1628c2ecf20Sopenharmony_ci ret = -ENOMEM; 1638c2ecf20Sopenharmony_ci goto out; 1648c2ecf20Sopenharmony_ci } 1658c2ecf20Sopenharmony_ci 1668c2ecf20Sopenharmony_ci if (rds_destroy_pending(cp->cp_conn)) { 1678c2ecf20Sopenharmony_ci release_in_xmit(cp); 1688c2ecf20Sopenharmony_ci ret = -ENETUNREACH; /* dont requeue send work */ 1698c2ecf20Sopenharmony_ci goto out; 1708c2ecf20Sopenharmony_ci } 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_ci /* 1738c2ecf20Sopenharmony_ci * we record the send generation after doing the xmit acquire. 1748c2ecf20Sopenharmony_ci * if someone else manages to jump in and do some work, we'll use 1758c2ecf20Sopenharmony_ci * this to avoid a goto restart farther down. 1768c2ecf20Sopenharmony_ci * 1778c2ecf20Sopenharmony_ci * The acquire_in_xmit() check above ensures that only one 1788c2ecf20Sopenharmony_ci * caller can increment c_send_gen at any time. 
1798c2ecf20Sopenharmony_ci */ 1808c2ecf20Sopenharmony_ci send_gen = READ_ONCE(cp->cp_send_gen) + 1; 1818c2ecf20Sopenharmony_ci WRITE_ONCE(cp->cp_send_gen, send_gen); 1828c2ecf20Sopenharmony_ci 1838c2ecf20Sopenharmony_ci /* 1848c2ecf20Sopenharmony_ci * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, 1858c2ecf20Sopenharmony_ci * we do the opposite to avoid races. 1868c2ecf20Sopenharmony_ci */ 1878c2ecf20Sopenharmony_ci if (!rds_conn_path_up(cp)) { 1888c2ecf20Sopenharmony_ci release_in_xmit(cp); 1898c2ecf20Sopenharmony_ci ret = 0; 1908c2ecf20Sopenharmony_ci goto out; 1918c2ecf20Sopenharmony_ci } 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_ci if (conn->c_trans->xmit_path_prepare) 1948c2ecf20Sopenharmony_ci conn->c_trans->xmit_path_prepare(cp); 1958c2ecf20Sopenharmony_ci 1968c2ecf20Sopenharmony_ci /* 1978c2ecf20Sopenharmony_ci * spin trying to push headers and data down the connection until 1988c2ecf20Sopenharmony_ci * the connection doesn't make forward progress. 1998c2ecf20Sopenharmony_ci */ 2008c2ecf20Sopenharmony_ci while (1) { 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci rm = cp->cp_xmit_rm; 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci if (!rm) { 2058c2ecf20Sopenharmony_ci same_rm = 0; 2068c2ecf20Sopenharmony_ci } else { 2078c2ecf20Sopenharmony_ci same_rm++; 2088c2ecf20Sopenharmony_ci if (same_rm >= 4096) { 2098c2ecf20Sopenharmony_ci rds_stats_inc(s_send_stuck_rm); 2108c2ecf20Sopenharmony_ci ret = -EAGAIN; 2118c2ecf20Sopenharmony_ci break; 2128c2ecf20Sopenharmony_ci } 2138c2ecf20Sopenharmony_ci } 2148c2ecf20Sopenharmony_ci 2158c2ecf20Sopenharmony_ci /* 2168c2ecf20Sopenharmony_ci * If between sending messages, we can send a pending congestion 2178c2ecf20Sopenharmony_ci * map update. 
2188c2ecf20Sopenharmony_ci */ 2198c2ecf20Sopenharmony_ci if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { 2208c2ecf20Sopenharmony_ci rm = rds_cong_update_alloc(conn); 2218c2ecf20Sopenharmony_ci if (IS_ERR(rm)) { 2228c2ecf20Sopenharmony_ci ret = PTR_ERR(rm); 2238c2ecf20Sopenharmony_ci break; 2248c2ecf20Sopenharmony_ci } 2258c2ecf20Sopenharmony_ci rm->data.op_active = 1; 2268c2ecf20Sopenharmony_ci rm->m_inc.i_conn_path = cp; 2278c2ecf20Sopenharmony_ci rm->m_inc.i_conn = cp->cp_conn; 2288c2ecf20Sopenharmony_ci 2298c2ecf20Sopenharmony_ci cp->cp_xmit_rm = rm; 2308c2ecf20Sopenharmony_ci } 2318c2ecf20Sopenharmony_ci 2328c2ecf20Sopenharmony_ci /* 2338c2ecf20Sopenharmony_ci * If not already working on one, grab the next message. 2348c2ecf20Sopenharmony_ci * 2358c2ecf20Sopenharmony_ci * cp_xmit_rm holds a ref while we're sending this message down 2368c2ecf20Sopenharmony_ci * the connction. We can use this ref while holding the 2378c2ecf20Sopenharmony_ci * send_sem.. rds_send_reset() is serialized with it. 2388c2ecf20Sopenharmony_ci */ 2398c2ecf20Sopenharmony_ci if (!rm) { 2408c2ecf20Sopenharmony_ci unsigned int len; 2418c2ecf20Sopenharmony_ci 2428c2ecf20Sopenharmony_ci batch_count++; 2438c2ecf20Sopenharmony_ci 2448c2ecf20Sopenharmony_ci /* we want to process as big a batch as we can, but 2458c2ecf20Sopenharmony_ci * we also want to avoid softlockups. 
If we've been 2468c2ecf20Sopenharmony_ci * through a lot of messages, lets back off and see 2478c2ecf20Sopenharmony_ci * if anyone else jumps in 2488c2ecf20Sopenharmony_ci */ 2498c2ecf20Sopenharmony_ci if (batch_count >= send_batch_count) 2508c2ecf20Sopenharmony_ci goto over_batch; 2518c2ecf20Sopenharmony_ci 2528c2ecf20Sopenharmony_ci spin_lock_irqsave(&cp->cp_lock, flags); 2538c2ecf20Sopenharmony_ci 2548c2ecf20Sopenharmony_ci if (!list_empty(&cp->cp_send_queue)) { 2558c2ecf20Sopenharmony_ci rm = list_entry(cp->cp_send_queue.next, 2568c2ecf20Sopenharmony_ci struct rds_message, 2578c2ecf20Sopenharmony_ci m_conn_item); 2588c2ecf20Sopenharmony_ci rds_message_addref(rm); 2598c2ecf20Sopenharmony_ci 2608c2ecf20Sopenharmony_ci /* 2618c2ecf20Sopenharmony_ci * Move the message from the send queue to the retransmit 2628c2ecf20Sopenharmony_ci * list right away. 2638c2ecf20Sopenharmony_ci */ 2648c2ecf20Sopenharmony_ci list_move_tail(&rm->m_conn_item, 2658c2ecf20Sopenharmony_ci &cp->cp_retrans); 2668c2ecf20Sopenharmony_ci } 2678c2ecf20Sopenharmony_ci 2688c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&cp->cp_lock, flags); 2698c2ecf20Sopenharmony_ci 2708c2ecf20Sopenharmony_ci if (!rm) 2718c2ecf20Sopenharmony_ci break; 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci /* Unfortunately, the way Infiniband deals with 2748c2ecf20Sopenharmony_ci * RDMA to a bad MR key is by moving the entire 2758c2ecf20Sopenharmony_ci * queue pair to error state. We cold possibly 2768c2ecf20Sopenharmony_ci * recover from that, but right now we drop the 2778c2ecf20Sopenharmony_ci * connection. 2788c2ecf20Sopenharmony_ci * Therefore, we never retransmit messages with RDMA ops. 
2798c2ecf20Sopenharmony_ci */ 2808c2ecf20Sopenharmony_ci if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || 2818c2ecf20Sopenharmony_ci (rm->rdma.op_active && 2828c2ecf20Sopenharmony_ci test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { 2838c2ecf20Sopenharmony_ci spin_lock_irqsave(&cp->cp_lock, flags); 2848c2ecf20Sopenharmony_ci if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) 2858c2ecf20Sopenharmony_ci list_move(&rm->m_conn_item, &to_be_dropped); 2868c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&cp->cp_lock, flags); 2878c2ecf20Sopenharmony_ci continue; 2888c2ecf20Sopenharmony_ci } 2898c2ecf20Sopenharmony_ci 2908c2ecf20Sopenharmony_ci /* Require an ACK every once in a while */ 2918c2ecf20Sopenharmony_ci len = ntohl(rm->m_inc.i_hdr.h_len); 2928c2ecf20Sopenharmony_ci if (cp->cp_unacked_packets == 0 || 2938c2ecf20Sopenharmony_ci cp->cp_unacked_bytes < len) { 2948c2ecf20Sopenharmony_ci set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); 2958c2ecf20Sopenharmony_ci 2968c2ecf20Sopenharmony_ci cp->cp_unacked_packets = 2978c2ecf20Sopenharmony_ci rds_sysctl_max_unacked_packets; 2988c2ecf20Sopenharmony_ci cp->cp_unacked_bytes = 2998c2ecf20Sopenharmony_ci rds_sysctl_max_unacked_bytes; 3008c2ecf20Sopenharmony_ci rds_stats_inc(s_send_ack_required); 3018c2ecf20Sopenharmony_ci } else { 3028c2ecf20Sopenharmony_ci cp->cp_unacked_bytes -= len; 3038c2ecf20Sopenharmony_ci cp->cp_unacked_packets--; 3048c2ecf20Sopenharmony_ci } 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_ci cp->cp_xmit_rm = rm; 3078c2ecf20Sopenharmony_ci } 3088c2ecf20Sopenharmony_ci 3098c2ecf20Sopenharmony_ci /* The transport either sends the whole rdma or none of it */ 3108c2ecf20Sopenharmony_ci if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) { 3118c2ecf20Sopenharmony_ci rm->m_final_op = &rm->rdma; 3128c2ecf20Sopenharmony_ci /* The transport owns the mapped memory for now. 
3138c2ecf20Sopenharmony_ci * You can't unmap it while it's on the send queue 3148c2ecf20Sopenharmony_ci */ 3158c2ecf20Sopenharmony_ci set_bit(RDS_MSG_MAPPED, &rm->m_flags); 3168c2ecf20Sopenharmony_ci ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); 3178c2ecf20Sopenharmony_ci if (ret) { 3188c2ecf20Sopenharmony_ci clear_bit(RDS_MSG_MAPPED, &rm->m_flags); 3198c2ecf20Sopenharmony_ci wake_up_interruptible(&rm->m_flush_wait); 3208c2ecf20Sopenharmony_ci break; 3218c2ecf20Sopenharmony_ci } 3228c2ecf20Sopenharmony_ci cp->cp_xmit_rdma_sent = 1; 3238c2ecf20Sopenharmony_ci 3248c2ecf20Sopenharmony_ci } 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_ci if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) { 3278c2ecf20Sopenharmony_ci rm->m_final_op = &rm->atomic; 3288c2ecf20Sopenharmony_ci /* The transport owns the mapped memory for now. 3298c2ecf20Sopenharmony_ci * You can't unmap it while it's on the send queue 3308c2ecf20Sopenharmony_ci */ 3318c2ecf20Sopenharmony_ci set_bit(RDS_MSG_MAPPED, &rm->m_flags); 3328c2ecf20Sopenharmony_ci ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); 3338c2ecf20Sopenharmony_ci if (ret) { 3348c2ecf20Sopenharmony_ci clear_bit(RDS_MSG_MAPPED, &rm->m_flags); 3358c2ecf20Sopenharmony_ci wake_up_interruptible(&rm->m_flush_wait); 3368c2ecf20Sopenharmony_ci break; 3378c2ecf20Sopenharmony_ci } 3388c2ecf20Sopenharmony_ci cp->cp_xmit_atomic_sent = 1; 3398c2ecf20Sopenharmony_ci 3408c2ecf20Sopenharmony_ci } 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_ci /* 3438c2ecf20Sopenharmony_ci * A number of cases require an RDS header to be sent 3448c2ecf20Sopenharmony_ci * even if there is no data. 3458c2ecf20Sopenharmony_ci * We permit 0-byte sends; rds-ping depends on this. 3468c2ecf20Sopenharmony_ci * However, if there are exclusively attached silent ops, 3478c2ecf20Sopenharmony_ci * we skip the hdr/data send, to enable silent operation. 
3488c2ecf20Sopenharmony_ci */ 3498c2ecf20Sopenharmony_ci if (rm->data.op_nents == 0) { 3508c2ecf20Sopenharmony_ci int ops_present; 3518c2ecf20Sopenharmony_ci int all_ops_are_silent = 1; 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci ops_present = (rm->atomic.op_active || rm->rdma.op_active); 3548c2ecf20Sopenharmony_ci if (rm->atomic.op_active && !rm->atomic.op_silent) 3558c2ecf20Sopenharmony_ci all_ops_are_silent = 0; 3568c2ecf20Sopenharmony_ci if (rm->rdma.op_active && !rm->rdma.op_silent) 3578c2ecf20Sopenharmony_ci all_ops_are_silent = 0; 3588c2ecf20Sopenharmony_ci 3598c2ecf20Sopenharmony_ci if (ops_present && all_ops_are_silent 3608c2ecf20Sopenharmony_ci && !rm->m_rdma_cookie) 3618c2ecf20Sopenharmony_ci rm->data.op_active = 0; 3628c2ecf20Sopenharmony_ci } 3638c2ecf20Sopenharmony_ci 3648c2ecf20Sopenharmony_ci if (rm->data.op_active && !cp->cp_xmit_data_sent) { 3658c2ecf20Sopenharmony_ci rm->m_final_op = &rm->data; 3668c2ecf20Sopenharmony_ci 3678c2ecf20Sopenharmony_ci ret = conn->c_trans->xmit(conn, rm, 3688c2ecf20Sopenharmony_ci cp->cp_xmit_hdr_off, 3698c2ecf20Sopenharmony_ci cp->cp_xmit_sg, 3708c2ecf20Sopenharmony_ci cp->cp_xmit_data_off); 3718c2ecf20Sopenharmony_ci if (ret <= 0) 3728c2ecf20Sopenharmony_ci break; 3738c2ecf20Sopenharmony_ci 3748c2ecf20Sopenharmony_ci if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) { 3758c2ecf20Sopenharmony_ci tmp = min_t(int, ret, 3768c2ecf20Sopenharmony_ci sizeof(struct rds_header) - 3778c2ecf20Sopenharmony_ci cp->cp_xmit_hdr_off); 3788c2ecf20Sopenharmony_ci cp->cp_xmit_hdr_off += tmp; 3798c2ecf20Sopenharmony_ci ret -= tmp; 3808c2ecf20Sopenharmony_ci } 3818c2ecf20Sopenharmony_ci 3828c2ecf20Sopenharmony_ci sg = &rm->data.op_sg[cp->cp_xmit_sg]; 3838c2ecf20Sopenharmony_ci while (ret) { 3848c2ecf20Sopenharmony_ci tmp = min_t(int, ret, sg->length - 3858c2ecf20Sopenharmony_ci cp->cp_xmit_data_off); 3868c2ecf20Sopenharmony_ci cp->cp_xmit_data_off += tmp; 3878c2ecf20Sopenharmony_ci ret -= tmp; 3888c2ecf20Sopenharmony_ci if 
(cp->cp_xmit_data_off == sg->length) { 3898c2ecf20Sopenharmony_ci cp->cp_xmit_data_off = 0; 3908c2ecf20Sopenharmony_ci sg++; 3918c2ecf20Sopenharmony_ci cp->cp_xmit_sg++; 3928c2ecf20Sopenharmony_ci BUG_ON(ret != 0 && cp->cp_xmit_sg == 3938c2ecf20Sopenharmony_ci rm->data.op_nents); 3948c2ecf20Sopenharmony_ci } 3958c2ecf20Sopenharmony_ci } 3968c2ecf20Sopenharmony_ci 3978c2ecf20Sopenharmony_ci if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) && 3988c2ecf20Sopenharmony_ci (cp->cp_xmit_sg == rm->data.op_nents)) 3998c2ecf20Sopenharmony_ci cp->cp_xmit_data_sent = 1; 4008c2ecf20Sopenharmony_ci } 4018c2ecf20Sopenharmony_ci 4028c2ecf20Sopenharmony_ci /* 4038c2ecf20Sopenharmony_ci * A rm will only take multiple times through this loop 4048c2ecf20Sopenharmony_ci * if there is a data op. Thus, if the data is sent (or there was 4058c2ecf20Sopenharmony_ci * none), then we're done with the rm. 4068c2ecf20Sopenharmony_ci */ 4078c2ecf20Sopenharmony_ci if (!rm->data.op_active || cp->cp_xmit_data_sent) { 4088c2ecf20Sopenharmony_ci cp->cp_xmit_rm = NULL; 4098c2ecf20Sopenharmony_ci cp->cp_xmit_sg = 0; 4108c2ecf20Sopenharmony_ci cp->cp_xmit_hdr_off = 0; 4118c2ecf20Sopenharmony_ci cp->cp_xmit_data_off = 0; 4128c2ecf20Sopenharmony_ci cp->cp_xmit_rdma_sent = 0; 4138c2ecf20Sopenharmony_ci cp->cp_xmit_atomic_sent = 0; 4148c2ecf20Sopenharmony_ci cp->cp_xmit_data_sent = 0; 4158c2ecf20Sopenharmony_ci 4168c2ecf20Sopenharmony_ci rds_message_put(rm); 4178c2ecf20Sopenharmony_ci } 4188c2ecf20Sopenharmony_ci } 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_ciover_batch: 4218c2ecf20Sopenharmony_ci if (conn->c_trans->xmit_path_complete) 4228c2ecf20Sopenharmony_ci conn->c_trans->xmit_path_complete(cp); 4238c2ecf20Sopenharmony_ci release_in_xmit(cp); 4248c2ecf20Sopenharmony_ci 4258c2ecf20Sopenharmony_ci /* Nuke any messages we decided not to retransmit. 
*/ 4268c2ecf20Sopenharmony_ci if (!list_empty(&to_be_dropped)) { 4278c2ecf20Sopenharmony_ci /* irqs on here, so we can put(), unlike above */ 4288c2ecf20Sopenharmony_ci list_for_each_entry(rm, &to_be_dropped, m_conn_item) 4298c2ecf20Sopenharmony_ci rds_message_put(rm); 4308c2ecf20Sopenharmony_ci rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); 4318c2ecf20Sopenharmony_ci } 4328c2ecf20Sopenharmony_ci 4338c2ecf20Sopenharmony_ci /* 4348c2ecf20Sopenharmony_ci * Other senders can queue a message after we last test the send queue 4358c2ecf20Sopenharmony_ci * but before we clear RDS_IN_XMIT. In that case they'd back off and 4368c2ecf20Sopenharmony_ci * not try and send their newly queued message. We need to check the 4378c2ecf20Sopenharmony_ci * send queue after having cleared RDS_IN_XMIT so that their message 4388c2ecf20Sopenharmony_ci * doesn't get stuck on the send queue. 4398c2ecf20Sopenharmony_ci * 4408c2ecf20Sopenharmony_ci * If the transport cannot continue (i.e ret != 0), then it must 4418c2ecf20Sopenharmony_ci * call us when more room is available, such as from the tx 4428c2ecf20Sopenharmony_ci * completion handler. 
4438c2ecf20Sopenharmony_ci * 4448c2ecf20Sopenharmony_ci * We have an extra generation check here so that if someone manages 4458c2ecf20Sopenharmony_ci * to jump in after our release_in_xmit, we'll see that they have done 4468c2ecf20Sopenharmony_ci * some work and we will skip our goto 4478c2ecf20Sopenharmony_ci */ 4488c2ecf20Sopenharmony_ci if (ret == 0) { 4498c2ecf20Sopenharmony_ci bool raced; 4508c2ecf20Sopenharmony_ci 4518c2ecf20Sopenharmony_ci smp_mb(); 4528c2ecf20Sopenharmony_ci raced = send_gen != READ_ONCE(cp->cp_send_gen); 4538c2ecf20Sopenharmony_ci 4548c2ecf20Sopenharmony_ci if ((test_bit(0, &conn->c_map_queued) || 4558c2ecf20Sopenharmony_ci !list_empty(&cp->cp_send_queue)) && !raced) { 4568c2ecf20Sopenharmony_ci if (batch_count < send_batch_count) 4578c2ecf20Sopenharmony_ci goto restart; 4588c2ecf20Sopenharmony_ci rcu_read_lock(); 4598c2ecf20Sopenharmony_ci if (rds_destroy_pending(cp->cp_conn)) 4608c2ecf20Sopenharmony_ci ret = -ENETUNREACH; 4618c2ecf20Sopenharmony_ci else 4628c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 1); 4638c2ecf20Sopenharmony_ci rcu_read_unlock(); 4648c2ecf20Sopenharmony_ci } else if (raced) { 4658c2ecf20Sopenharmony_ci rds_stats_inc(s_send_lock_queue_raced); 4668c2ecf20Sopenharmony_ci } 4678c2ecf20Sopenharmony_ci } 4688c2ecf20Sopenharmony_ciout: 4698c2ecf20Sopenharmony_ci return ret; 4708c2ecf20Sopenharmony_ci} 4718c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_send_xmit); 4728c2ecf20Sopenharmony_ci 4738c2ecf20Sopenharmony_cistatic void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) 4748c2ecf20Sopenharmony_ci{ 4758c2ecf20Sopenharmony_ci u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); 4768c2ecf20Sopenharmony_ci 4778c2ecf20Sopenharmony_ci assert_spin_locked(&rs->rs_lock); 4788c2ecf20Sopenharmony_ci 4798c2ecf20Sopenharmony_ci BUG_ON(rs->rs_snd_bytes < len); 4808c2ecf20Sopenharmony_ci rs->rs_snd_bytes -= len; 4818c2ecf20Sopenharmony_ci 4828c2ecf20Sopenharmony_ci if (rs->rs_snd_bytes == 0) 
4838c2ecf20Sopenharmony_ci rds_stats_inc(s_send_queue_empty); 4848c2ecf20Sopenharmony_ci} 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_cistatic inline int rds_send_is_acked(struct rds_message *rm, u64 ack, 4878c2ecf20Sopenharmony_ci is_acked_func is_acked) 4888c2ecf20Sopenharmony_ci{ 4898c2ecf20Sopenharmony_ci if (is_acked) 4908c2ecf20Sopenharmony_ci return is_acked(rm, ack); 4918c2ecf20Sopenharmony_ci return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; 4928c2ecf20Sopenharmony_ci} 4938c2ecf20Sopenharmony_ci 4948c2ecf20Sopenharmony_ci/* 4958c2ecf20Sopenharmony_ci * This is pretty similar to what happens below in the ACK 4968c2ecf20Sopenharmony_ci * handling code - except that we call here as soon as we get 4978c2ecf20Sopenharmony_ci * the IB send completion on the RDMA op and the accompanying 4988c2ecf20Sopenharmony_ci * message. 4998c2ecf20Sopenharmony_ci */ 5008c2ecf20Sopenharmony_civoid rds_rdma_send_complete(struct rds_message *rm, int status) 5018c2ecf20Sopenharmony_ci{ 5028c2ecf20Sopenharmony_ci struct rds_sock *rs = NULL; 5038c2ecf20Sopenharmony_ci struct rm_rdma_op *ro; 5048c2ecf20Sopenharmony_ci struct rds_notifier *notifier; 5058c2ecf20Sopenharmony_ci unsigned long flags; 5068c2ecf20Sopenharmony_ci 5078c2ecf20Sopenharmony_ci spin_lock_irqsave(&rm->m_rs_lock, flags); 5088c2ecf20Sopenharmony_ci 5098c2ecf20Sopenharmony_ci ro = &rm->rdma; 5108c2ecf20Sopenharmony_ci if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && 5118c2ecf20Sopenharmony_ci ro->op_active && ro->op_notify && ro->op_notifier) { 5128c2ecf20Sopenharmony_ci notifier = ro->op_notifier; 5138c2ecf20Sopenharmony_ci rs = rm->m_rs; 5148c2ecf20Sopenharmony_ci sock_hold(rds_rs_to_sk(rs)); 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_ci notifier->n_status = status; 5178c2ecf20Sopenharmony_ci spin_lock(&rs->rs_lock); 5188c2ecf20Sopenharmony_ci list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); 5198c2ecf20Sopenharmony_ci spin_unlock(&rs->rs_lock); 5208c2ecf20Sopenharmony_ci 
5218c2ecf20Sopenharmony_ci ro->op_notifier = NULL; 5228c2ecf20Sopenharmony_ci } 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&rm->m_rs_lock, flags); 5258c2ecf20Sopenharmony_ci 5268c2ecf20Sopenharmony_ci if (rs) { 5278c2ecf20Sopenharmony_ci rds_wake_sk_sleep(rs); 5288c2ecf20Sopenharmony_ci sock_put(rds_rs_to_sk(rs)); 5298c2ecf20Sopenharmony_ci } 5308c2ecf20Sopenharmony_ci} 5318c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_rdma_send_complete); 5328c2ecf20Sopenharmony_ci 5338c2ecf20Sopenharmony_ci/* 5348c2ecf20Sopenharmony_ci * Just like above, except looks at atomic op 5358c2ecf20Sopenharmony_ci */ 5368c2ecf20Sopenharmony_civoid rds_atomic_send_complete(struct rds_message *rm, int status) 5378c2ecf20Sopenharmony_ci{ 5388c2ecf20Sopenharmony_ci struct rds_sock *rs = NULL; 5398c2ecf20Sopenharmony_ci struct rm_atomic_op *ao; 5408c2ecf20Sopenharmony_ci struct rds_notifier *notifier; 5418c2ecf20Sopenharmony_ci unsigned long flags; 5428c2ecf20Sopenharmony_ci 5438c2ecf20Sopenharmony_ci spin_lock_irqsave(&rm->m_rs_lock, flags); 5448c2ecf20Sopenharmony_ci 5458c2ecf20Sopenharmony_ci ao = &rm->atomic; 5468c2ecf20Sopenharmony_ci if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) 5478c2ecf20Sopenharmony_ci && ao->op_active && ao->op_notify && ao->op_notifier) { 5488c2ecf20Sopenharmony_ci notifier = ao->op_notifier; 5498c2ecf20Sopenharmony_ci rs = rm->m_rs; 5508c2ecf20Sopenharmony_ci sock_hold(rds_rs_to_sk(rs)); 5518c2ecf20Sopenharmony_ci 5528c2ecf20Sopenharmony_ci notifier->n_status = status; 5538c2ecf20Sopenharmony_ci spin_lock(&rs->rs_lock); 5548c2ecf20Sopenharmony_ci list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); 5558c2ecf20Sopenharmony_ci spin_unlock(&rs->rs_lock); 5568c2ecf20Sopenharmony_ci 5578c2ecf20Sopenharmony_ci ao->op_notifier = NULL; 5588c2ecf20Sopenharmony_ci } 5598c2ecf20Sopenharmony_ci 5608c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&rm->m_rs_lock, flags); 5618c2ecf20Sopenharmony_ci 5628c2ecf20Sopenharmony_ci if (rs) { 
		rds_wake_sk_sleep(rs);
		sock_put(rds_rs_to_sk(rs));
	}
}
EXPORT_SYMBOL_GPL(rds_atomic_send_complete);

/*
 * This is the same as rds_rdma_send_complete except we
 * don't do any locking - we have all the ingredients (message,
 * socket, socket lock) and can just move the notifier.
 *
 * Moves any pending RDMA and atomic op notifiers from @rm onto the
 * socket's rs_notify_queue with @status recorded in them.  Caller must
 * already hold the socket lock (rs_lock) and the message's m_rs_lock.
 */
static inline void
__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
{
	struct rm_rdma_op *ro;
	struct rm_atomic_op *ao;

	ro = &rm->rdma;
	if (ro->op_active && ro->op_notify && ro->op_notifier) {
		ro->op_notifier->n_status = status;
		list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
		/* ownership of the notifier moved to the socket queue */
		ro->op_notifier = NULL;
	}

	ao = &rm->atomic;
	if (ao->op_active && ao->op_notify && ao->op_notifier) {
		ao->op_notifier->n_status = status;
		list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
		ao->op_notifier = NULL;
	}

	/* No need to wake the app - caller does this */
}

/*
 * This removes messages from the socket's list if they're on it.  The list
 * argument must be private to the caller, we must be able to modify it
 * without locks.  The messages must have a reference held for their
 * position on the list.  This function will drop that reference after
 * removing the messages from the 'messages' list regardless of if it found
 * the messages on the socket list or not.
 *
 * @status is the RDMA completion status handed to any notifiers that are
 * queued to the owning socket along the way.
 */
static void rds_send_remove_from_sock(struct list_head *messages, int status)
{
	unsigned long flags;
	struct rds_sock *rs = NULL;
	struct rds_message *rm;

	while (!list_empty(messages)) {
		int was_on_sock = 0;

		rm = list_entry(messages->next, struct rds_message,
				m_conn_item);
		list_del_init(&rm->m_conn_item);

		/*
		 * If we see this flag cleared then we're *sure* that someone
		 * else beat us to removing it from the sock.  If we race
		 * with their flag update we'll get the lock and then really
		 * see that the flag has been cleared.
		 *
		 * The message spinlock makes sure nobody clears rm->m_rs
		 * while we're messing with it. It does not prevent the
		 * message from being removed from the socket, though.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);
		if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
			goto unlock_and_drop;

		/* Batch the sock hold/put and wakeup across consecutive
		 * messages that belong to the same socket.
		 */
		if (rs != rm->m_rs) {
			if (rs) {
				rds_wake_sk_sleep(rs);
				sock_put(rds_rs_to_sk(rs));
			}
			rs = rm->m_rs;
			if (rs)
				sock_hold(rds_rs_to_sk(rs));
		}
		if (!rs)
			goto unlock_and_drop;
		/* lock ordering: m_rs_lock is taken before rs_lock */
		spin_lock(&rs->rs_lock);

		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
			struct rm_rdma_op *ro = &rm->rdma;
			struct rds_notifier *notifier;

			list_del_init(&rm->m_sock_item);
			rds_send_sndbuf_remove(rs, rm);

			/* queue an RDMA notification if one was requested,
			 * or unconditionally on error when op_recverr is set;
			 * don't overwrite a status that was already recorded.
			 */
			if (ro->op_active && ro->op_notifier &&
			       (ro->op_notify || (ro->op_recverr && status))) {
				notifier = ro->op_notifier;
				list_add_tail(&notifier->n_list,
						&rs->rs_notify_queue);
				if (!notifier->n_status)
					notifier->n_status = status;
				rm->rdma.op_notifier = NULL;
			}
			was_on_sock = 1;
		}
		spin_unlock(&rs->rs_lock);

unlock_and_drop:
		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
		/* drop the reference held for the caller's list position */
		rds_message_put(rm);
		/* ... and the socket list's reference if we removed it */
		if (was_on_sock)
			rds_message_put(rm);
	}

	if (rs) {
		rds_wake_sk_sleep(rs);
		sock_put(rds_rs_to_sk(rs));
	}
}

/*
 * Transports call here when they've determined that the receiver queued
 * messages up to, and including, the given sequence number.  Messages are
 * moved to the retrans queue when rds_send_xmit picks them off the send
 * queue. This means that in the TCP case, the message may not have been
 * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
 * checks the RDS_MSG_HAS_ACK_SEQ bit.
 */
void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
			      is_acked_func is_acked)
{
	struct rds_message *rm, *tmp;
	unsigned long flags;
	LIST_HEAD(list);

	spin_lock_irqsave(&cp->cp_lock, flags);

	/* cp_retrans is ordered by sequence, so stop at the first unacked */
	list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
		if (!rds_send_is_acked(rm, ack, is_acked))
			break;

		list_move(&rm->m_conn_item, &list);
		clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
	}

	/* order flag updates with spin locks */
	if (!list_empty(&list))
		smp_mb__after_atomic();

	spin_unlock_irqrestore(&cp->cp_lock, flags);

	/* now remove the messages from the sock list as needed */
	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
EXPORT_SYMBOL_GPL(rds_send_path_drop_acked);

/* Single-path wrapper: only valid for non-multipath transports. */
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
			 is_acked_func is_acked)
{
	WARN_ON(conn->c_trans->t_mp_capable);
	rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked);
}
EXPORT_SYMBOL_GPL(rds_send_drop_acked);

/*
 * Drop messages queued on @rs that are destined for @dest (every queued
 * message if @dest is NULL).  Each dropped message is unhooked from both
 * the socket send queue and its connection path's queue, completed with
 * RDS_RDMA_CANCELED, and its references dropped.
 */
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
{
	struct rds_message *rm, *tmp;
	struct rds_connection *conn;
	struct rds_conn_path *cp;
	unsigned long flags;
	LIST_HEAD(list);

	/* get all the messages we're dropping under the rs lock */
	spin_lock_irqsave(&rs->rs_lock, flags);

	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
		/* skip messages whose address or port doesn't match @dest */
		if (dest &&
		    (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
		     dest->sin6_port != rm->m_inc.i_hdr.h_dport))
			continue;

		list_move(&rm->m_sock_item, &list);
		rds_send_sndbuf_remove(rs, rm);
		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
	}

	/* order flag updates with the rs lock */
	smp_mb__after_atomic();

	spin_unlock_irqrestore(&rs->rs_lock, flags);

	if (list_empty(&list))
		return;

	/* Remove the messages from the conn */
	list_for_each_entry(rm, &list, m_sock_item) {

		conn = rm->m_inc.i_conn;
		if (conn->c_trans->t_mp_capable)
			cp = rm->m_inc.i_conn_path;
		else
			cp = &conn->c_path[0];

		spin_lock_irqsave(&cp->cp_lock, flags);
		/*
		 * Maybe someone else beat us to removing rm from the conn.
		 * If we race with their flag update we'll get the lock and
		 * then really see that the flag has been cleared.
		 */
		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
			spin_unlock_irqrestore(&cp->cp_lock, flags);
			continue;
		}
		list_del_init(&rm->m_conn_item);
		spin_unlock_irqrestore(&cp->cp_lock, flags);

		/*
		 * Couldn't grab m_rs_lock in top loop (lock ordering),
		 * but we can now.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);

		spin_lock(&rs->rs_lock);
		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
		spin_unlock(&rs->rs_lock);

		spin_unlock_irqrestore(&rm->m_rs_lock, flags);

		/* drop the conn queue's reference */
		rds_message_put(rm);
	}

	rds_wake_sk_sleep(rs);

	while (!list_empty(&list)) {
		rm = list_entry(list.next, struct rds_message, m_sock_item);
		list_del_init(&rm->m_sock_item);
		rds_message_wait(rm);

		/* just in case the code above skipped this message
		 * because RDS_MSG_ON_CONN wasn't set, run it again here
		 * taking m_rs_lock is the only thing that keeps us
		 * from racing with ack processing.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);

		spin_lock(&rs->rs_lock);
		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
		spin_unlock(&rs->rs_lock);

		spin_unlock_irqrestore(&rm->m_rs_lock, flags);

		/* drop the reference taken when it was put on the sock list */
		rds_message_put(rm);
	}
}

/*
 * we only want this to fire once so we use the callers 'queued'.
It's 8158c2ecf20Sopenharmony_ci * possible that another thread can race with us and remove the 8168c2ecf20Sopenharmony_ci * message from the flow with RDS_CANCEL_SENT_TO. 8178c2ecf20Sopenharmony_ci */ 8188c2ecf20Sopenharmony_cistatic int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, 8198c2ecf20Sopenharmony_ci struct rds_conn_path *cp, 8208c2ecf20Sopenharmony_ci struct rds_message *rm, __be16 sport, 8218c2ecf20Sopenharmony_ci __be16 dport, int *queued) 8228c2ecf20Sopenharmony_ci{ 8238c2ecf20Sopenharmony_ci unsigned long flags; 8248c2ecf20Sopenharmony_ci u32 len; 8258c2ecf20Sopenharmony_ci 8268c2ecf20Sopenharmony_ci if (*queued) 8278c2ecf20Sopenharmony_ci goto out; 8288c2ecf20Sopenharmony_ci 8298c2ecf20Sopenharmony_ci len = be32_to_cpu(rm->m_inc.i_hdr.h_len); 8308c2ecf20Sopenharmony_ci 8318c2ecf20Sopenharmony_ci /* this is the only place which holds both the socket's rs_lock 8328c2ecf20Sopenharmony_ci * and the connection's c_lock */ 8338c2ecf20Sopenharmony_ci spin_lock_irqsave(&rs->rs_lock, flags); 8348c2ecf20Sopenharmony_ci 8358c2ecf20Sopenharmony_ci /* 8368c2ecf20Sopenharmony_ci * If there is a little space in sndbuf, we don't queue anything, 8378c2ecf20Sopenharmony_ci * and userspace gets -EAGAIN. But poll() indicates there's send 8388c2ecf20Sopenharmony_ci * room. This can lead to bad behavior (spinning) if snd_bytes isn't 8398c2ecf20Sopenharmony_ci * freed up by incoming acks. So we check the *old* value of 8408c2ecf20Sopenharmony_ci * rs_snd_bytes here to allow the last msg to exceed the buffer, 8418c2ecf20Sopenharmony_ci * and poll() now knows no more data can be sent. 8428c2ecf20Sopenharmony_ci */ 8438c2ecf20Sopenharmony_ci if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { 8448c2ecf20Sopenharmony_ci rs->rs_snd_bytes += len; 8458c2ecf20Sopenharmony_ci 8468c2ecf20Sopenharmony_ci /* let recv side know we are close to send space exhaustion. 
8478c2ecf20Sopenharmony_ci * This is probably not the optimal way to do it, as this 8488c2ecf20Sopenharmony_ci * means we set the flag on *all* messages as soon as our 8498c2ecf20Sopenharmony_ci * throughput hits a certain threshold. 8508c2ecf20Sopenharmony_ci */ 8518c2ecf20Sopenharmony_ci if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) 8528c2ecf20Sopenharmony_ci set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); 8538c2ecf20Sopenharmony_ci 8548c2ecf20Sopenharmony_ci list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); 8558c2ecf20Sopenharmony_ci set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); 8568c2ecf20Sopenharmony_ci rds_message_addref(rm); 8578c2ecf20Sopenharmony_ci sock_hold(rds_rs_to_sk(rs)); 8588c2ecf20Sopenharmony_ci rm->m_rs = rs; 8598c2ecf20Sopenharmony_ci 8608c2ecf20Sopenharmony_ci /* The code ordering is a little weird, but we're 8618c2ecf20Sopenharmony_ci trying to minimize the time we hold c_lock */ 8628c2ecf20Sopenharmony_ci rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); 8638c2ecf20Sopenharmony_ci rm->m_inc.i_conn = conn; 8648c2ecf20Sopenharmony_ci rm->m_inc.i_conn_path = cp; 8658c2ecf20Sopenharmony_ci rds_message_addref(rm); 8668c2ecf20Sopenharmony_ci 8678c2ecf20Sopenharmony_ci spin_lock(&cp->cp_lock); 8688c2ecf20Sopenharmony_ci rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++); 8698c2ecf20Sopenharmony_ci list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); 8708c2ecf20Sopenharmony_ci set_bit(RDS_MSG_ON_CONN, &rm->m_flags); 8718c2ecf20Sopenharmony_ci spin_unlock(&cp->cp_lock); 8728c2ecf20Sopenharmony_ci 8738c2ecf20Sopenharmony_ci rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", 8748c2ecf20Sopenharmony_ci rm, len, rs, rs->rs_snd_bytes, 8758c2ecf20Sopenharmony_ci (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); 8768c2ecf20Sopenharmony_ci 8778c2ecf20Sopenharmony_ci *queued = 1; 8788c2ecf20Sopenharmony_ci } 8798c2ecf20Sopenharmony_ci 8808c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&rs->rs_lock, flags); 
8818c2ecf20Sopenharmony_ciout: 8828c2ecf20Sopenharmony_ci return *queued; 8838c2ecf20Sopenharmony_ci} 8848c2ecf20Sopenharmony_ci 8858c2ecf20Sopenharmony_ci/* 8868c2ecf20Sopenharmony_ci * rds_message is getting to be quite complicated, and we'd like to allocate 8878c2ecf20Sopenharmony_ci * it all in one go. This figures out how big it needs to be up front. 8888c2ecf20Sopenharmony_ci */ 8898c2ecf20Sopenharmony_cistatic int rds_rm_size(struct msghdr *msg, int num_sgs, 8908c2ecf20Sopenharmony_ci struct rds_iov_vector_arr *vct) 8918c2ecf20Sopenharmony_ci{ 8928c2ecf20Sopenharmony_ci struct cmsghdr *cmsg; 8938c2ecf20Sopenharmony_ci int size = 0; 8948c2ecf20Sopenharmony_ci int cmsg_groups = 0; 8958c2ecf20Sopenharmony_ci int retval; 8968c2ecf20Sopenharmony_ci bool zcopy_cookie = false; 8978c2ecf20Sopenharmony_ci struct rds_iov_vector *iov, *tmp_iov; 8988c2ecf20Sopenharmony_ci 8998c2ecf20Sopenharmony_ci if (num_sgs < 0) 9008c2ecf20Sopenharmony_ci return -EINVAL; 9018c2ecf20Sopenharmony_ci 9028c2ecf20Sopenharmony_ci for_each_cmsghdr(cmsg, msg) { 9038c2ecf20Sopenharmony_ci if (!CMSG_OK(msg, cmsg)) 9048c2ecf20Sopenharmony_ci return -EINVAL; 9058c2ecf20Sopenharmony_ci 9068c2ecf20Sopenharmony_ci if (cmsg->cmsg_level != SOL_RDS) 9078c2ecf20Sopenharmony_ci continue; 9088c2ecf20Sopenharmony_ci 9098c2ecf20Sopenharmony_ci switch (cmsg->cmsg_type) { 9108c2ecf20Sopenharmony_ci case RDS_CMSG_RDMA_ARGS: 9118c2ecf20Sopenharmony_ci if (vct->indx >= vct->len) { 9128c2ecf20Sopenharmony_ci vct->len += vct->incr; 9138c2ecf20Sopenharmony_ci tmp_iov = 9148c2ecf20Sopenharmony_ci krealloc(vct->vec, 9158c2ecf20Sopenharmony_ci vct->len * 9168c2ecf20Sopenharmony_ci sizeof(struct rds_iov_vector), 9178c2ecf20Sopenharmony_ci GFP_KERNEL); 9188c2ecf20Sopenharmony_ci if (!tmp_iov) { 9198c2ecf20Sopenharmony_ci vct->len -= vct->incr; 9208c2ecf20Sopenharmony_ci return -ENOMEM; 9218c2ecf20Sopenharmony_ci } 9228c2ecf20Sopenharmony_ci vct->vec = tmp_iov; 9238c2ecf20Sopenharmony_ci } 9248c2ecf20Sopenharmony_ci iov 
= &vct->vec[vct->indx]; 9258c2ecf20Sopenharmony_ci memset(iov, 0, sizeof(struct rds_iov_vector)); 9268c2ecf20Sopenharmony_ci vct->indx++; 9278c2ecf20Sopenharmony_ci cmsg_groups |= 1; 9288c2ecf20Sopenharmony_ci retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov); 9298c2ecf20Sopenharmony_ci if (retval < 0) 9308c2ecf20Sopenharmony_ci return retval; 9318c2ecf20Sopenharmony_ci size += retval; 9328c2ecf20Sopenharmony_ci 9338c2ecf20Sopenharmony_ci break; 9348c2ecf20Sopenharmony_ci 9358c2ecf20Sopenharmony_ci case RDS_CMSG_ZCOPY_COOKIE: 9368c2ecf20Sopenharmony_ci zcopy_cookie = true; 9378c2ecf20Sopenharmony_ci fallthrough; 9388c2ecf20Sopenharmony_ci 9398c2ecf20Sopenharmony_ci case RDS_CMSG_RDMA_DEST: 9408c2ecf20Sopenharmony_ci case RDS_CMSG_RDMA_MAP: 9418c2ecf20Sopenharmony_ci cmsg_groups |= 2; 9428c2ecf20Sopenharmony_ci /* these are valid but do no add any size */ 9438c2ecf20Sopenharmony_ci break; 9448c2ecf20Sopenharmony_ci 9458c2ecf20Sopenharmony_ci case RDS_CMSG_ATOMIC_CSWP: 9468c2ecf20Sopenharmony_ci case RDS_CMSG_ATOMIC_FADD: 9478c2ecf20Sopenharmony_ci case RDS_CMSG_MASKED_ATOMIC_CSWP: 9488c2ecf20Sopenharmony_ci case RDS_CMSG_MASKED_ATOMIC_FADD: 9498c2ecf20Sopenharmony_ci cmsg_groups |= 1; 9508c2ecf20Sopenharmony_ci size += sizeof(struct scatterlist); 9518c2ecf20Sopenharmony_ci break; 9528c2ecf20Sopenharmony_ci 9538c2ecf20Sopenharmony_ci default: 9548c2ecf20Sopenharmony_ci return -EINVAL; 9558c2ecf20Sopenharmony_ci } 9568c2ecf20Sopenharmony_ci 9578c2ecf20Sopenharmony_ci } 9588c2ecf20Sopenharmony_ci 9598c2ecf20Sopenharmony_ci if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) 9608c2ecf20Sopenharmony_ci return -EINVAL; 9618c2ecf20Sopenharmony_ci 9628c2ecf20Sopenharmony_ci size += num_sgs * sizeof(struct scatterlist); 9638c2ecf20Sopenharmony_ci 9648c2ecf20Sopenharmony_ci /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ 9658c2ecf20Sopenharmony_ci if (cmsg_groups == 3) 9668c2ecf20Sopenharmony_ci return -EINVAL; 9678c2ecf20Sopenharmony_ci 
9688c2ecf20Sopenharmony_ci return size; 9698c2ecf20Sopenharmony_ci} 9708c2ecf20Sopenharmony_ci 9718c2ecf20Sopenharmony_cistatic int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, 9728c2ecf20Sopenharmony_ci struct cmsghdr *cmsg) 9738c2ecf20Sopenharmony_ci{ 9748c2ecf20Sopenharmony_ci u32 *cookie; 9758c2ecf20Sopenharmony_ci 9768c2ecf20Sopenharmony_ci if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) || 9778c2ecf20Sopenharmony_ci !rm->data.op_mmp_znotifier) 9788c2ecf20Sopenharmony_ci return -EINVAL; 9798c2ecf20Sopenharmony_ci cookie = CMSG_DATA(cmsg); 9808c2ecf20Sopenharmony_ci rm->data.op_mmp_znotifier->z_cookie = *cookie; 9818c2ecf20Sopenharmony_ci return 0; 9828c2ecf20Sopenharmony_ci} 9838c2ecf20Sopenharmony_ci 9848c2ecf20Sopenharmony_cistatic int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, 9858c2ecf20Sopenharmony_ci struct msghdr *msg, int *allocated_mr, 9868c2ecf20Sopenharmony_ci struct rds_iov_vector_arr *vct) 9878c2ecf20Sopenharmony_ci{ 9888c2ecf20Sopenharmony_ci struct cmsghdr *cmsg; 9898c2ecf20Sopenharmony_ci int ret = 0, ind = 0; 9908c2ecf20Sopenharmony_ci 9918c2ecf20Sopenharmony_ci for_each_cmsghdr(cmsg, msg) { 9928c2ecf20Sopenharmony_ci if (!CMSG_OK(msg, cmsg)) 9938c2ecf20Sopenharmony_ci return -EINVAL; 9948c2ecf20Sopenharmony_ci 9958c2ecf20Sopenharmony_ci if (cmsg->cmsg_level != SOL_RDS) 9968c2ecf20Sopenharmony_ci continue; 9978c2ecf20Sopenharmony_ci 9988c2ecf20Sopenharmony_ci /* As a side effect, RDMA_DEST and RDMA_MAP will set 9998c2ecf20Sopenharmony_ci * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. 
10008c2ecf20Sopenharmony_ci */ 10018c2ecf20Sopenharmony_ci switch (cmsg->cmsg_type) { 10028c2ecf20Sopenharmony_ci case RDS_CMSG_RDMA_ARGS: 10038c2ecf20Sopenharmony_ci if (ind >= vct->indx) 10048c2ecf20Sopenharmony_ci return -ENOMEM; 10058c2ecf20Sopenharmony_ci ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]); 10068c2ecf20Sopenharmony_ci ind++; 10078c2ecf20Sopenharmony_ci break; 10088c2ecf20Sopenharmony_ci 10098c2ecf20Sopenharmony_ci case RDS_CMSG_RDMA_DEST: 10108c2ecf20Sopenharmony_ci ret = rds_cmsg_rdma_dest(rs, rm, cmsg); 10118c2ecf20Sopenharmony_ci break; 10128c2ecf20Sopenharmony_ci 10138c2ecf20Sopenharmony_ci case RDS_CMSG_RDMA_MAP: 10148c2ecf20Sopenharmony_ci ret = rds_cmsg_rdma_map(rs, rm, cmsg); 10158c2ecf20Sopenharmony_ci if (!ret) 10168c2ecf20Sopenharmony_ci *allocated_mr = 1; 10178c2ecf20Sopenharmony_ci else if (ret == -ENODEV) 10188c2ecf20Sopenharmony_ci /* Accommodate the get_mr() case which can fail 10198c2ecf20Sopenharmony_ci * if connection isn't established yet. 10208c2ecf20Sopenharmony_ci */ 10218c2ecf20Sopenharmony_ci ret = -EAGAIN; 10228c2ecf20Sopenharmony_ci break; 10238c2ecf20Sopenharmony_ci case RDS_CMSG_ATOMIC_CSWP: 10248c2ecf20Sopenharmony_ci case RDS_CMSG_ATOMIC_FADD: 10258c2ecf20Sopenharmony_ci case RDS_CMSG_MASKED_ATOMIC_CSWP: 10268c2ecf20Sopenharmony_ci case RDS_CMSG_MASKED_ATOMIC_FADD: 10278c2ecf20Sopenharmony_ci ret = rds_cmsg_atomic(rs, rm, cmsg); 10288c2ecf20Sopenharmony_ci break; 10298c2ecf20Sopenharmony_ci 10308c2ecf20Sopenharmony_ci case RDS_CMSG_ZCOPY_COOKIE: 10318c2ecf20Sopenharmony_ci ret = rds_cmsg_zcopy(rs, rm, cmsg); 10328c2ecf20Sopenharmony_ci break; 10338c2ecf20Sopenharmony_ci 10348c2ecf20Sopenharmony_ci default: 10358c2ecf20Sopenharmony_ci return -EINVAL; 10368c2ecf20Sopenharmony_ci } 10378c2ecf20Sopenharmony_ci 10388c2ecf20Sopenharmony_ci if (ret) 10398c2ecf20Sopenharmony_ci break; 10408c2ecf20Sopenharmony_ci } 10418c2ecf20Sopenharmony_ci 10428c2ecf20Sopenharmony_ci return ret; 10438c2ecf20Sopenharmony_ci} 

/*
 * Pick the connection path index to send on for a multipath-capable
 * connection.  While c_npaths is still 0 the path negotiation with the
 * peer hasn't completed, so we send a ping to trigger it and, for
 * blocking sends, wait on c_hs_waitq; if interrupted, or if the peer
 * turns out to support only one path, fall back to path 0.
 */
static int rds_send_mprds_hash(struct rds_sock *rs,
			       struct rds_connection *conn, int nonblock)
{
	int hash;

	if (conn->c_npaths == 0)
		hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
	else
		hash = RDS_MPATH_HASH(rs, conn->c_npaths);
	if (conn->c_npaths == 0 && hash != 0) {
		rds_send_ping(conn, 0);

		/* The underlying connection is not up yet. Need to wait
		 * until it is up to be sure that the non-zero c_path can be
		 * used. But if we are interrupted, we have to use the zero
		 * c_path in case the connection ends up being non-MP capable.
		 */
		if (conn->c_npaths == 0) {
			/* Cannot wait for the connection be made, so just use
			 * the base c_path.
			 */
			if (nonblock)
				return 0;
			if (wait_event_interruptible(conn->c_hs_waitq,
						     conn->c_npaths != 0))
				hash = 0;
		}
		if (conn->c_npaths == 1)
			hash = 0;
	}
	return hash;
}

/*
 * Sum the RDMA payload sizes requested by all RDS_CMSG_RDMA_ARGS control
 * messages in @msg into *rdma_bytes.  Returns 0 on success or -EINVAL on
 * a malformed control message.
 */
static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
{
	struct rds_rdma_args *args;
	struct cmsghdr *cmsg;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level != SOL_RDS)
			continue;

		if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
			if (cmsg->cmsg_len <
			    CMSG_LEN(sizeof(struct rds_rdma_args)))
				return -EINVAL;
			args = CMSG_DATA(cmsg);
			*rdma_bytes += args->remote_vec.bytes;
		}
	}
	return 0;
}

int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
	DECLARE_SOCKADDR(struct
sockaddr_in *, usin, msg->msg_name); 11078c2ecf20Sopenharmony_ci __be16 dport; 11088c2ecf20Sopenharmony_ci struct rds_message *rm = NULL; 11098c2ecf20Sopenharmony_ci struct rds_connection *conn; 11108c2ecf20Sopenharmony_ci int ret = 0; 11118c2ecf20Sopenharmony_ci int queued = 0, allocated_mr = 0; 11128c2ecf20Sopenharmony_ci int nonblock = msg->msg_flags & MSG_DONTWAIT; 11138c2ecf20Sopenharmony_ci long timeo = sock_sndtimeo(sk, nonblock); 11148c2ecf20Sopenharmony_ci struct rds_conn_path *cpath; 11158c2ecf20Sopenharmony_ci struct in6_addr daddr; 11168c2ecf20Sopenharmony_ci __u32 scope_id = 0; 11178c2ecf20Sopenharmony_ci size_t total_payload_len = payload_len, rdma_payload_len = 0; 11188c2ecf20Sopenharmony_ci bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && 11198c2ecf20Sopenharmony_ci sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); 11208c2ecf20Sopenharmony_ci int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE); 11218c2ecf20Sopenharmony_ci int namelen; 11228c2ecf20Sopenharmony_ci struct rds_iov_vector_arr vct; 11238c2ecf20Sopenharmony_ci int ind; 11248c2ecf20Sopenharmony_ci 11258c2ecf20Sopenharmony_ci memset(&vct, 0, sizeof(vct)); 11268c2ecf20Sopenharmony_ci 11278c2ecf20Sopenharmony_ci /* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. 
*/ 11288c2ecf20Sopenharmony_ci vct.incr = 1; 11298c2ecf20Sopenharmony_ci 11308c2ecf20Sopenharmony_ci /* Mirror Linux UDP mirror of BSD error message compatibility */ 11318c2ecf20Sopenharmony_ci /* XXX: Perhaps MSG_MORE someday */ 11328c2ecf20Sopenharmony_ci if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { 11338c2ecf20Sopenharmony_ci ret = -EOPNOTSUPP; 11348c2ecf20Sopenharmony_ci goto out; 11358c2ecf20Sopenharmony_ci } 11368c2ecf20Sopenharmony_ci 11378c2ecf20Sopenharmony_ci namelen = msg->msg_namelen; 11388c2ecf20Sopenharmony_ci if (namelen != 0) { 11398c2ecf20Sopenharmony_ci if (namelen < sizeof(*usin)) { 11408c2ecf20Sopenharmony_ci ret = -EINVAL; 11418c2ecf20Sopenharmony_ci goto out; 11428c2ecf20Sopenharmony_ci } 11438c2ecf20Sopenharmony_ci switch (usin->sin_family) { 11448c2ecf20Sopenharmony_ci case AF_INET: 11458c2ecf20Sopenharmony_ci if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || 11468c2ecf20Sopenharmony_ci usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || 11478c2ecf20Sopenharmony_ci ipv4_is_multicast(usin->sin_addr.s_addr)) { 11488c2ecf20Sopenharmony_ci ret = -EINVAL; 11498c2ecf20Sopenharmony_ci goto out; 11508c2ecf20Sopenharmony_ci } 11518c2ecf20Sopenharmony_ci ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr); 11528c2ecf20Sopenharmony_ci dport = usin->sin_port; 11538c2ecf20Sopenharmony_ci break; 11548c2ecf20Sopenharmony_ci 11558c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 11568c2ecf20Sopenharmony_ci case AF_INET6: { 11578c2ecf20Sopenharmony_ci int addr_type; 11588c2ecf20Sopenharmony_ci 11598c2ecf20Sopenharmony_ci if (namelen < sizeof(*sin6)) { 11608c2ecf20Sopenharmony_ci ret = -EINVAL; 11618c2ecf20Sopenharmony_ci goto out; 11628c2ecf20Sopenharmony_ci } 11638c2ecf20Sopenharmony_ci addr_type = ipv6_addr_type(&sin6->sin6_addr); 11648c2ecf20Sopenharmony_ci if (!(addr_type & IPV6_ADDR_UNICAST)) { 11658c2ecf20Sopenharmony_ci __be32 addr4; 11668c2ecf20Sopenharmony_ci 11678c2ecf20Sopenharmony_ci if (!(addr_type & 
IPV6_ADDR_MAPPED)) { 11688c2ecf20Sopenharmony_ci ret = -EINVAL; 11698c2ecf20Sopenharmony_ci goto out; 11708c2ecf20Sopenharmony_ci } 11718c2ecf20Sopenharmony_ci 11728c2ecf20Sopenharmony_ci /* It is a mapped address. Need to do some 11738c2ecf20Sopenharmony_ci * sanity checks. 11748c2ecf20Sopenharmony_ci */ 11758c2ecf20Sopenharmony_ci addr4 = sin6->sin6_addr.s6_addr32[3]; 11768c2ecf20Sopenharmony_ci if (addr4 == htonl(INADDR_ANY) || 11778c2ecf20Sopenharmony_ci addr4 == htonl(INADDR_BROADCAST) || 11788c2ecf20Sopenharmony_ci ipv4_is_multicast(addr4)) { 11798c2ecf20Sopenharmony_ci ret = -EINVAL; 11808c2ecf20Sopenharmony_ci goto out; 11818c2ecf20Sopenharmony_ci } 11828c2ecf20Sopenharmony_ci } 11838c2ecf20Sopenharmony_ci if (addr_type & IPV6_ADDR_LINKLOCAL) { 11848c2ecf20Sopenharmony_ci if (sin6->sin6_scope_id == 0) { 11858c2ecf20Sopenharmony_ci ret = -EINVAL; 11868c2ecf20Sopenharmony_ci goto out; 11878c2ecf20Sopenharmony_ci } 11888c2ecf20Sopenharmony_ci scope_id = sin6->sin6_scope_id; 11898c2ecf20Sopenharmony_ci } 11908c2ecf20Sopenharmony_ci 11918c2ecf20Sopenharmony_ci daddr = sin6->sin6_addr; 11928c2ecf20Sopenharmony_ci dport = sin6->sin6_port; 11938c2ecf20Sopenharmony_ci break; 11948c2ecf20Sopenharmony_ci } 11958c2ecf20Sopenharmony_ci#endif 11968c2ecf20Sopenharmony_ci 11978c2ecf20Sopenharmony_ci default: 11988c2ecf20Sopenharmony_ci ret = -EINVAL; 11998c2ecf20Sopenharmony_ci goto out; 12008c2ecf20Sopenharmony_ci } 12018c2ecf20Sopenharmony_ci } else { 12028c2ecf20Sopenharmony_ci /* We only care about consistency with ->connect() */ 12038c2ecf20Sopenharmony_ci lock_sock(sk); 12048c2ecf20Sopenharmony_ci daddr = rs->rs_conn_addr; 12058c2ecf20Sopenharmony_ci dport = rs->rs_conn_port; 12068c2ecf20Sopenharmony_ci scope_id = rs->rs_bound_scope_id; 12078c2ecf20Sopenharmony_ci release_sock(sk); 12088c2ecf20Sopenharmony_ci } 12098c2ecf20Sopenharmony_ci 12108c2ecf20Sopenharmony_ci lock_sock(sk); 12118c2ecf20Sopenharmony_ci if (ipv6_addr_any(&rs->rs_bound_addr) || 
ipv6_addr_any(&daddr)) { 12128c2ecf20Sopenharmony_ci release_sock(sk); 12138c2ecf20Sopenharmony_ci ret = -ENOTCONN; 12148c2ecf20Sopenharmony_ci goto out; 12158c2ecf20Sopenharmony_ci } else if (namelen != 0) { 12168c2ecf20Sopenharmony_ci /* Cannot send to an IPv4 address using an IPv6 source 12178c2ecf20Sopenharmony_ci * address and cannot send to an IPv6 address using an 12188c2ecf20Sopenharmony_ci * IPv4 source address. 12198c2ecf20Sopenharmony_ci */ 12208c2ecf20Sopenharmony_ci if (ipv6_addr_v4mapped(&daddr) ^ 12218c2ecf20Sopenharmony_ci ipv6_addr_v4mapped(&rs->rs_bound_addr)) { 12228c2ecf20Sopenharmony_ci release_sock(sk); 12238c2ecf20Sopenharmony_ci ret = -EOPNOTSUPP; 12248c2ecf20Sopenharmony_ci goto out; 12258c2ecf20Sopenharmony_ci } 12268c2ecf20Sopenharmony_ci /* If the socket is already bound to a link local address, 12278c2ecf20Sopenharmony_ci * it can only send to peers on the same link. But allow 12288c2ecf20Sopenharmony_ci * communicating beween link local and non-link local address. 
12298c2ecf20Sopenharmony_ci */ 12308c2ecf20Sopenharmony_ci if (scope_id != rs->rs_bound_scope_id) { 12318c2ecf20Sopenharmony_ci if (!scope_id) { 12328c2ecf20Sopenharmony_ci scope_id = rs->rs_bound_scope_id; 12338c2ecf20Sopenharmony_ci } else if (rs->rs_bound_scope_id) { 12348c2ecf20Sopenharmony_ci release_sock(sk); 12358c2ecf20Sopenharmony_ci ret = -EINVAL; 12368c2ecf20Sopenharmony_ci goto out; 12378c2ecf20Sopenharmony_ci } 12388c2ecf20Sopenharmony_ci } 12398c2ecf20Sopenharmony_ci } 12408c2ecf20Sopenharmony_ci release_sock(sk); 12418c2ecf20Sopenharmony_ci 12428c2ecf20Sopenharmony_ci ret = rds_rdma_bytes(msg, &rdma_payload_len); 12438c2ecf20Sopenharmony_ci if (ret) 12448c2ecf20Sopenharmony_ci goto out; 12458c2ecf20Sopenharmony_ci 12468c2ecf20Sopenharmony_ci total_payload_len += rdma_payload_len; 12478c2ecf20Sopenharmony_ci if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { 12488c2ecf20Sopenharmony_ci ret = -EMSGSIZE; 12498c2ecf20Sopenharmony_ci goto out; 12508c2ecf20Sopenharmony_ci } 12518c2ecf20Sopenharmony_ci 12528c2ecf20Sopenharmony_ci if (payload_len > rds_sk_sndbuf(rs)) { 12538c2ecf20Sopenharmony_ci ret = -EMSGSIZE; 12548c2ecf20Sopenharmony_ci goto out; 12558c2ecf20Sopenharmony_ci } 12568c2ecf20Sopenharmony_ci 12578c2ecf20Sopenharmony_ci if (zcopy) { 12588c2ecf20Sopenharmony_ci if (rs->rs_transport->t_type != RDS_TRANS_TCP) { 12598c2ecf20Sopenharmony_ci ret = -EOPNOTSUPP; 12608c2ecf20Sopenharmony_ci goto out; 12618c2ecf20Sopenharmony_ci } 12628c2ecf20Sopenharmony_ci num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); 12638c2ecf20Sopenharmony_ci } 12648c2ecf20Sopenharmony_ci /* size of rm including all sgs */ 12658c2ecf20Sopenharmony_ci ret = rds_rm_size(msg, num_sgs, &vct); 12668c2ecf20Sopenharmony_ci if (ret < 0) 12678c2ecf20Sopenharmony_ci goto out; 12688c2ecf20Sopenharmony_ci 12698c2ecf20Sopenharmony_ci rm = rds_message_alloc(ret, GFP_KERNEL); 12708c2ecf20Sopenharmony_ci if (!rm) { 12718c2ecf20Sopenharmony_ci ret = -ENOMEM; 
12728c2ecf20Sopenharmony_ci goto out; 12738c2ecf20Sopenharmony_ci } 12748c2ecf20Sopenharmony_ci 12758c2ecf20Sopenharmony_ci /* Attach data to the rm */ 12768c2ecf20Sopenharmony_ci if (payload_len) { 12778c2ecf20Sopenharmony_ci rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); 12788c2ecf20Sopenharmony_ci if (IS_ERR(rm->data.op_sg)) { 12798c2ecf20Sopenharmony_ci ret = PTR_ERR(rm->data.op_sg); 12808c2ecf20Sopenharmony_ci goto out; 12818c2ecf20Sopenharmony_ci } 12828c2ecf20Sopenharmony_ci ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); 12838c2ecf20Sopenharmony_ci if (ret) 12848c2ecf20Sopenharmony_ci goto out; 12858c2ecf20Sopenharmony_ci } 12868c2ecf20Sopenharmony_ci rm->data.op_active = 1; 12878c2ecf20Sopenharmony_ci 12888c2ecf20Sopenharmony_ci rm->m_daddr = daddr; 12898c2ecf20Sopenharmony_ci 12908c2ecf20Sopenharmony_ci /* rds_conn_create has a spinlock that runs with IRQ off. 12918c2ecf20Sopenharmony_ci * Caching the conn in the socket helps a lot. */ 12928c2ecf20Sopenharmony_ci if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) && 12938c2ecf20Sopenharmony_ci rs->rs_tos == rs->rs_conn->c_tos) { 12948c2ecf20Sopenharmony_ci conn = rs->rs_conn; 12958c2ecf20Sopenharmony_ci } else { 12968c2ecf20Sopenharmony_ci conn = rds_conn_create_outgoing(sock_net(sock->sk), 12978c2ecf20Sopenharmony_ci &rs->rs_bound_addr, &daddr, 12988c2ecf20Sopenharmony_ci rs->rs_transport, rs->rs_tos, 12998c2ecf20Sopenharmony_ci sock->sk->sk_allocation, 13008c2ecf20Sopenharmony_ci scope_id); 13018c2ecf20Sopenharmony_ci if (IS_ERR(conn)) { 13028c2ecf20Sopenharmony_ci ret = PTR_ERR(conn); 13038c2ecf20Sopenharmony_ci goto out; 13048c2ecf20Sopenharmony_ci } 13058c2ecf20Sopenharmony_ci rs->rs_conn = conn; 13068c2ecf20Sopenharmony_ci } 13078c2ecf20Sopenharmony_ci 13088c2ecf20Sopenharmony_ci if (conn->c_trans->t_mp_capable) 13098c2ecf20Sopenharmony_ci cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)]; 13108c2ecf20Sopenharmony_ci else 13118c2ecf20Sopenharmony_ci 
cpath = &conn->c_path[0]; 13128c2ecf20Sopenharmony_ci 13138c2ecf20Sopenharmony_ci rm->m_conn_path = cpath; 13148c2ecf20Sopenharmony_ci 13158c2ecf20Sopenharmony_ci /* Parse any control messages the user may have included. */ 13168c2ecf20Sopenharmony_ci ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct); 13178c2ecf20Sopenharmony_ci if (ret) { 13188c2ecf20Sopenharmony_ci /* Trigger connection so that its ready for the next retry */ 13198c2ecf20Sopenharmony_ci if (ret == -EAGAIN) 13208c2ecf20Sopenharmony_ci rds_conn_connect_if_down(conn); 13218c2ecf20Sopenharmony_ci goto out; 13228c2ecf20Sopenharmony_ci } 13238c2ecf20Sopenharmony_ci 13248c2ecf20Sopenharmony_ci if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { 13258c2ecf20Sopenharmony_ci printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", 13268c2ecf20Sopenharmony_ci &rm->rdma, conn->c_trans->xmit_rdma); 13278c2ecf20Sopenharmony_ci ret = -EOPNOTSUPP; 13288c2ecf20Sopenharmony_ci goto out; 13298c2ecf20Sopenharmony_ci } 13308c2ecf20Sopenharmony_ci 13318c2ecf20Sopenharmony_ci if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { 13328c2ecf20Sopenharmony_ci printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", 13338c2ecf20Sopenharmony_ci &rm->atomic, conn->c_trans->xmit_atomic); 13348c2ecf20Sopenharmony_ci ret = -EOPNOTSUPP; 13358c2ecf20Sopenharmony_ci goto out; 13368c2ecf20Sopenharmony_ci } 13378c2ecf20Sopenharmony_ci 13388c2ecf20Sopenharmony_ci if (rds_destroy_pending(conn)) { 13398c2ecf20Sopenharmony_ci ret = -EAGAIN; 13408c2ecf20Sopenharmony_ci goto out; 13418c2ecf20Sopenharmony_ci } 13428c2ecf20Sopenharmony_ci 13438c2ecf20Sopenharmony_ci if (rds_conn_path_down(cpath)) 13448c2ecf20Sopenharmony_ci rds_check_all_paths(conn); 13458c2ecf20Sopenharmony_ci 13468c2ecf20Sopenharmony_ci ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); 13478c2ecf20Sopenharmony_ci if (ret) { 13488c2ecf20Sopenharmony_ci rs->rs_seen_congestion = 1; 13498c2ecf20Sopenharmony_ci goto out; 
13508c2ecf20Sopenharmony_ci } 13518c2ecf20Sopenharmony_ci while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, 13528c2ecf20Sopenharmony_ci dport, &queued)) { 13538c2ecf20Sopenharmony_ci rds_stats_inc(s_send_queue_full); 13548c2ecf20Sopenharmony_ci 13558c2ecf20Sopenharmony_ci if (nonblock) { 13568c2ecf20Sopenharmony_ci ret = -EAGAIN; 13578c2ecf20Sopenharmony_ci goto out; 13588c2ecf20Sopenharmony_ci } 13598c2ecf20Sopenharmony_ci 13608c2ecf20Sopenharmony_ci timeo = wait_event_interruptible_timeout(*sk_sleep(sk), 13618c2ecf20Sopenharmony_ci rds_send_queue_rm(rs, conn, cpath, rm, 13628c2ecf20Sopenharmony_ci rs->rs_bound_port, 13638c2ecf20Sopenharmony_ci dport, 13648c2ecf20Sopenharmony_ci &queued), 13658c2ecf20Sopenharmony_ci timeo); 13668c2ecf20Sopenharmony_ci rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); 13678c2ecf20Sopenharmony_ci if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) 13688c2ecf20Sopenharmony_ci continue; 13698c2ecf20Sopenharmony_ci 13708c2ecf20Sopenharmony_ci ret = timeo; 13718c2ecf20Sopenharmony_ci if (ret == 0) 13728c2ecf20Sopenharmony_ci ret = -ETIMEDOUT; 13738c2ecf20Sopenharmony_ci goto out; 13748c2ecf20Sopenharmony_ci } 13758c2ecf20Sopenharmony_ci 13768c2ecf20Sopenharmony_ci /* 13778c2ecf20Sopenharmony_ci * By now we've committed to the send. We reuse rds_send_worker() 13788c2ecf20Sopenharmony_ci * to retry sends in the rds thread if the transport asks us to. 
13798c2ecf20Sopenharmony_ci */ 13808c2ecf20Sopenharmony_ci rds_stats_inc(s_send_queued); 13818c2ecf20Sopenharmony_ci 13828c2ecf20Sopenharmony_ci ret = rds_send_xmit(cpath); 13838c2ecf20Sopenharmony_ci if (ret == -ENOMEM || ret == -EAGAIN) { 13848c2ecf20Sopenharmony_ci ret = 0; 13858c2ecf20Sopenharmony_ci rcu_read_lock(); 13868c2ecf20Sopenharmony_ci if (rds_destroy_pending(cpath->cp_conn)) 13878c2ecf20Sopenharmony_ci ret = -ENETUNREACH; 13888c2ecf20Sopenharmony_ci else 13898c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); 13908c2ecf20Sopenharmony_ci rcu_read_unlock(); 13918c2ecf20Sopenharmony_ci } 13928c2ecf20Sopenharmony_ci if (ret) 13938c2ecf20Sopenharmony_ci goto out; 13948c2ecf20Sopenharmony_ci rds_message_put(rm); 13958c2ecf20Sopenharmony_ci 13968c2ecf20Sopenharmony_ci for (ind = 0; ind < vct.indx; ind++) 13978c2ecf20Sopenharmony_ci kfree(vct.vec[ind].iov); 13988c2ecf20Sopenharmony_ci kfree(vct.vec); 13998c2ecf20Sopenharmony_ci 14008c2ecf20Sopenharmony_ci return payload_len; 14018c2ecf20Sopenharmony_ci 14028c2ecf20Sopenharmony_ciout: 14038c2ecf20Sopenharmony_ci for (ind = 0; ind < vct.indx; ind++) 14048c2ecf20Sopenharmony_ci kfree(vct.vec[ind].iov); 14058c2ecf20Sopenharmony_ci kfree(vct.vec); 14068c2ecf20Sopenharmony_ci 14078c2ecf20Sopenharmony_ci /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. 14088c2ecf20Sopenharmony_ci * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN 14098c2ecf20Sopenharmony_ci * or in any other way, we need to destroy the MR again */ 14108c2ecf20Sopenharmony_ci if (allocated_mr) 14118c2ecf20Sopenharmony_ci rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); 14128c2ecf20Sopenharmony_ci 14138c2ecf20Sopenharmony_ci if (rm) 14148c2ecf20Sopenharmony_ci rds_message_put(rm); 14158c2ecf20Sopenharmony_ci return ret; 14168c2ecf20Sopenharmony_ci} 14178c2ecf20Sopenharmony_ci 14188c2ecf20Sopenharmony_ci/* 14198c2ecf20Sopenharmony_ci * send out a probe. 
Can be shared by rds_send_ping, 14208c2ecf20Sopenharmony_ci * rds_send_pong, rds_send_hb. 14218c2ecf20Sopenharmony_ci * rds_send_hb should use h_flags 14228c2ecf20Sopenharmony_ci * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED 14238c2ecf20Sopenharmony_ci * or 14248c2ecf20Sopenharmony_ci * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED 14258c2ecf20Sopenharmony_ci */ 14268c2ecf20Sopenharmony_cistatic int 14278c2ecf20Sopenharmony_cirds_send_probe(struct rds_conn_path *cp, __be16 sport, 14288c2ecf20Sopenharmony_ci __be16 dport, u8 h_flags) 14298c2ecf20Sopenharmony_ci{ 14308c2ecf20Sopenharmony_ci struct rds_message *rm; 14318c2ecf20Sopenharmony_ci unsigned long flags; 14328c2ecf20Sopenharmony_ci int ret = 0; 14338c2ecf20Sopenharmony_ci 14348c2ecf20Sopenharmony_ci rm = rds_message_alloc(0, GFP_ATOMIC); 14358c2ecf20Sopenharmony_ci if (!rm) { 14368c2ecf20Sopenharmony_ci ret = -ENOMEM; 14378c2ecf20Sopenharmony_ci goto out; 14388c2ecf20Sopenharmony_ci } 14398c2ecf20Sopenharmony_ci 14408c2ecf20Sopenharmony_ci rm->m_daddr = cp->cp_conn->c_faddr; 14418c2ecf20Sopenharmony_ci rm->data.op_active = 1; 14428c2ecf20Sopenharmony_ci 14438c2ecf20Sopenharmony_ci rds_conn_path_connect_if_down(cp); 14448c2ecf20Sopenharmony_ci 14458c2ecf20Sopenharmony_ci ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL); 14468c2ecf20Sopenharmony_ci if (ret) 14478c2ecf20Sopenharmony_ci goto out; 14488c2ecf20Sopenharmony_ci 14498c2ecf20Sopenharmony_ci spin_lock_irqsave(&cp->cp_lock, flags); 14508c2ecf20Sopenharmony_ci list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); 14518c2ecf20Sopenharmony_ci set_bit(RDS_MSG_ON_CONN, &rm->m_flags); 14528c2ecf20Sopenharmony_ci rds_message_addref(rm); 14538c2ecf20Sopenharmony_ci rm->m_inc.i_conn = cp->cp_conn; 14548c2ecf20Sopenharmony_ci rm->m_inc.i_conn_path = cp; 14558c2ecf20Sopenharmony_ci 14568c2ecf20Sopenharmony_ci rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 14578c2ecf20Sopenharmony_ci cp->cp_next_tx_seq); 14588c2ecf20Sopenharmony_ci rm->m_inc.i_hdr.h_flags 
|= h_flags; 14598c2ecf20Sopenharmony_ci cp->cp_next_tx_seq++; 14608c2ecf20Sopenharmony_ci 14618c2ecf20Sopenharmony_ci if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && 14628c2ecf20Sopenharmony_ci cp->cp_conn->c_trans->t_mp_capable) { 14638c2ecf20Sopenharmony_ci u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); 14648c2ecf20Sopenharmony_ci u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); 14658c2ecf20Sopenharmony_ci 14668c2ecf20Sopenharmony_ci rds_message_add_extension(&rm->m_inc.i_hdr, 14678c2ecf20Sopenharmony_ci RDS_EXTHDR_NPATHS, &npaths, 14688c2ecf20Sopenharmony_ci sizeof(npaths)); 14698c2ecf20Sopenharmony_ci rds_message_add_extension(&rm->m_inc.i_hdr, 14708c2ecf20Sopenharmony_ci RDS_EXTHDR_GEN_NUM, 14718c2ecf20Sopenharmony_ci &my_gen_num, 14728c2ecf20Sopenharmony_ci sizeof(u32)); 14738c2ecf20Sopenharmony_ci } 14748c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&cp->cp_lock, flags); 14758c2ecf20Sopenharmony_ci 14768c2ecf20Sopenharmony_ci rds_stats_inc(s_send_queued); 14778c2ecf20Sopenharmony_ci rds_stats_inc(s_send_pong); 14788c2ecf20Sopenharmony_ci 14798c2ecf20Sopenharmony_ci /* schedule the send work on rds_wq */ 14808c2ecf20Sopenharmony_ci rcu_read_lock(); 14818c2ecf20Sopenharmony_ci if (!rds_destroy_pending(cp->cp_conn)) 14828c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 1); 14838c2ecf20Sopenharmony_ci rcu_read_unlock(); 14848c2ecf20Sopenharmony_ci 14858c2ecf20Sopenharmony_ci rds_message_put(rm); 14868c2ecf20Sopenharmony_ci return 0; 14878c2ecf20Sopenharmony_ci 14888c2ecf20Sopenharmony_ciout: 14898c2ecf20Sopenharmony_ci if (rm) 14908c2ecf20Sopenharmony_ci rds_message_put(rm); 14918c2ecf20Sopenharmony_ci return ret; 14928c2ecf20Sopenharmony_ci} 14938c2ecf20Sopenharmony_ci 14948c2ecf20Sopenharmony_ciint 14958c2ecf20Sopenharmony_cirds_send_pong(struct rds_conn_path *cp, __be16 dport) 14968c2ecf20Sopenharmony_ci{ 14978c2ecf20Sopenharmony_ci return rds_send_probe(cp, 0, dport, 0); 14988c2ecf20Sopenharmony_ci} 
14998c2ecf20Sopenharmony_ci 15008c2ecf20Sopenharmony_civoid 15018c2ecf20Sopenharmony_cirds_send_ping(struct rds_connection *conn, int cp_index) 15028c2ecf20Sopenharmony_ci{ 15038c2ecf20Sopenharmony_ci unsigned long flags; 15048c2ecf20Sopenharmony_ci struct rds_conn_path *cp = &conn->c_path[cp_index]; 15058c2ecf20Sopenharmony_ci 15068c2ecf20Sopenharmony_ci spin_lock_irqsave(&cp->cp_lock, flags); 15078c2ecf20Sopenharmony_ci if (conn->c_ping_triggered) { 15088c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&cp->cp_lock, flags); 15098c2ecf20Sopenharmony_ci return; 15108c2ecf20Sopenharmony_ci } 15118c2ecf20Sopenharmony_ci conn->c_ping_triggered = 1; 15128c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&cp->cp_lock, flags); 15138c2ecf20Sopenharmony_ci rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0); 15148c2ecf20Sopenharmony_ci} 15158c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_send_ping); 1516