/*
 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/ratelimit.h>

#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"

/*
 * Convert an IB-specific completion status to an RDS error code and call
 * the core completion handler.
 */
static void rds_ib_send_complete(struct rds_message *rm,
				 int wc_status,
				 void (*complete)(struct rds_message *rm, int status))
{
	int notify_status;

	switch (wc_status) {
	case IB_WC_WR_FLUSH_ERR:
		return;

	case IB_WC_SUCCESS:
		notify_status = RDS_RDMA_SUCCESS;
		break;

	case IB_WC_REM_ACCESS_ERR:
		notify_status = RDS_RDMA_REMOTE_ERROR;
		break;

	default:
		notify_status = RDS_RDMA_OTHER_ERROR;
		break;
	}
	complete(rm, notify_status);
}

static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
				   struct rm_data_op *op,
				   int wc_status)
{
	if (op->op_nents)
		ib_dma_unmap_sg(ic->i_cm_id->device,
				op->op_sg, op->op_nents,
				DMA_TO_DEVICE);
}

static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
				   struct rm_rdma_op *op,
				   int wc_status)
{
	if (op->op_mapped) {
		ib_dma_unmap_sg(ic->i_cm_id->device,
				op->op_sg, op->op_nents,
				op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		op->op_mapped = 0;
	}

	/* If the user asked for a completion notification on this
	 * message, we can implement three different semantics:
	 * 1. Notify when we received the ACK on the RDS message
	 *    that was queued with the RDMA. This provides reliable
	 *    notification of RDMA status at the expense of a one-way
	 *    packet delay.
	 * 2. Notify when the IB stack gives us the completion event for
	 *    the RDMA operation.
	 * 3. Notify when the IB stack gives us the completion event for
	 *    the accompanying RDS messages.
	 * Here, we implement approach #3. To implement approach #2,
	 * we would need to take an event for the rdma WR. To implement #1,
	 * don't call rds_rdma_send_complete at all, and fall back to the notify
	 * handling in the ACK processing code.
	 *
	 * Note: There's no need to explicitly sync any RDMA buffers using
	 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
	 * operation itself unmapped the RDMA buffers, which takes care
	 * of synching.
	 */
	rds_ib_send_complete(container_of(op, struct rds_message, rdma),
			     wc_status, rds_rdma_send_complete);

	if (op->op_write)
		rds_stats_add(s_send_rdma_bytes, op->op_bytes);
	else
		rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
}
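/* A note on the container_of() calls in this file: rm_data_op, rm_rdma_op
 * and rm_atomic_op are embedded directly in struct rds_message (see rds.h),
 * so an op pointer can be converted back to its owning message. Roughly
 * (an illustrative sketch, not the full definition):
 *
 *	struct rds_message {
 *		...
 *		struct rm_atomic_op atomic;
 *		struct rm_rdma_op rdma;
 *		struct rm_data_op data;
 *		...
 *	};
 */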
static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
				     struct rm_atomic_op *op,
				     int wc_status)
{
	/* unmap atomic recvbuf */
	if (op->op_mapped) {
		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
				DMA_FROM_DEVICE);
		op->op_mapped = 0;
	}

	rds_ib_send_complete(container_of(op, struct rds_message, atomic),
			     wc_status, rds_atomic_send_complete);

	if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
		rds_ib_stats_inc(s_ib_atomic_cswp);
	else
		rds_ib_stats_inc(s_ib_atomic_fadd);
}
/*
 * Unmap the resources associated with a struct send_work.
 *
 * Returns the rm for no good reason other than it is unobtainable
 * other than by switching on wr.opcode, currently, and the caller,
 * the event handler, needs it.
 */
static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
						struct rds_ib_send_work *send,
						int wc_status)
{
	struct rds_message *rm = NULL;

	/* In the error case, wc.opcode sometimes contains garbage */
	switch (send->s_wr.opcode) {
	case IB_WR_SEND:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, data);
			rds_ib_send_unmap_data(ic, send->s_op, wc_status);
		}
		break;
	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_READ:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, rdma);
			rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
		}
		break;
	case IB_WR_ATOMIC_FETCH_AND_ADD:
	case IB_WR_ATOMIC_CMP_AND_SWP:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, atomic);
			rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
		}
		break;
	default:
		printk_ratelimited(KERN_NOTICE
				   "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
				   __func__, send->s_wr.opcode);
		break;
	}

	send->s_wr.opcode = 0xdead;

	return rm;
}

void rds_ib_send_init_ring(struct rds_ib_connection *ic)
{
	struct rds_ib_send_work *send;
	u32 i;

	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
		struct ib_sge *sge;

		send->s_op = NULL;

		send->s_wr.wr_id = i;
		send->s_wr.sg_list = send->s_sge;
		send->s_wr.ex.imm_data = 0;

		sge = &send->s_sge[0];
		sge->addr = ic->i_send_hdrs_dma[i];

		sge->length = sizeof(struct rds_header);
		sge->lkey = ic->i_pd->local_dma_lkey;

		send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
	}
}
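/* After rds_ib_send_init_ring(), every ring slot is pre-wired: wr_id holds
 * the slot's ring index (which is how the completion handler below walks
 * from the oldest outstanding slot up to wc->wr_id), and s_sge[0] points
 * permanently at that slot's DMA-mapped rds_header, so the send path only
 * has to fill in s_sge[1] with payload.
 */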
void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
{
	struct rds_ib_send_work *send;
	u32 i;

	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
		if (send->s_op && send->s_wr.opcode != 0xdead)
			rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
	}
}

/*
 * The only fast path caller always has a non-zero nr, so we don't
 * bother testing nr before performing the atomic sub.
 */
static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
{
	if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
	    waitqueue_active(&rds_ib_ring_empty_wait))
		wake_up(&rds_ib_ring_empty_wait);
	BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
}
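/* i_signaled_sends counts work requests posted with IB_SEND_SIGNALED set.
 * Only signaled WRs are guaranteed a completion entry, so this counter -
 * not the ring itself - is what tells shutdown (waiting on
 * rds_ib_ring_empty_wait) that the HCA is done with everything we posted.
 */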
/*
 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
 * operations performed in the send path. As the sender allocs and potentially
 * unallocs the next free entry in the ring it doesn't alter which is
 * the next to be freed, which is what this is concerned with.
 */
void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
{
	struct rds_message *rm = NULL;
	struct rds_connection *conn = ic->conn;
	struct rds_ib_send_work *send;
	u32 completed;
	u32 oldest;
	u32 i = 0;
	int nr_sig = 0;

	rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
		 (unsigned long long)wc->wr_id, wc->status,
		 ib_wc_status_msg(wc->status), wc->byte_len,
		 be32_to_cpu(wc->ex.imm_data));
	rds_ib_stats_inc(s_ib_tx_cq_event);

	if (wc->wr_id == RDS_IB_ACK_WR_ID) {
		if (time_after(jiffies, ic->i_ack_queued + HZ / 2))
			rds_ib_stats_inc(s_ib_tx_stalled);
		rds_ib_ack_send_complete(ic);
		return;
	}

	oldest = rds_ib_ring_oldest(&ic->i_send_ring);

	completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest);

	for (i = 0; i < completed; i++) {
		send = &ic->i_sends[oldest];
		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
			nr_sig++;

		rm = rds_ib_send_unmap_op(ic, send, wc->status);

		if (time_after(jiffies, send->s_queued + HZ / 2))
			rds_ib_stats_inc(s_ib_tx_stalled);

		if (send->s_op) {
			if (send->s_op == rm->m_final_op) {
				/* If anyone waited for this message to get
				 * flushed out, wake them up now
				 */
				rds_message_unmapped(rm);
			}
			rds_message_put(rm);
			send->s_op = NULL;
		}

		oldest = (oldest + 1) % ic->i_send_ring.w_nr;
	}

	rds_ib_ring_free(&ic->i_send_ring, completed);
	rds_ib_sub_signaled(ic, nr_sig);
	nr_sig = 0;

	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
	    test_bit(0, &conn->c_map_queued))
		queue_delayed_work(rds_wq, &conn->c_send_w, 0);

	/* We expect errors as the qp is drained during shutdown */
	if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
		rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
				  &conn->c_laddr, &conn->c_faddr,
				  conn->c_tos, wc->status,
				  ib_wc_status_msg(wc->status), wc->vendor_err);
	}
}
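/* Because only some WRs are signaled, one completion can retire a whole
 * batch of ring entries: everything from the oldest outstanding slot up to
 * and including wc->wr_id. Conceptually (a sketch of what
 * rds_ib_ring_completed() in ib_ring.c computes), the count is
 *
 *	completed = wr_id - oldest + 1;		// adjusted for ring wrap
 *
 * which is why the loop above walks 'completed' consecutive slots.
 */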
/*
 * This is the main function for allocating credits when sending
 * messages.
 *
 * Conceptually, we have two counters:
 *  - send credits: this tells us how many WRs we're allowed
 *    to submit without overrunning the receiver's queue. For
 *    each SEND WR we post, we decrement this by one.
 *
 *  - posted credits: this tells us how many WRs we recently
 *    posted to the receive queue. This value is transferred
 *    to the peer as a "credit update" in an RDS header field.
 *    Every time we transmit credits to the peer, we subtract
 *    the amount of transferred credits from this counter.
 *
 * It is essential that we avoid situations where both sides have
 * exhausted their send credits, and are unable to send new credits
 * to the peer. We achieve this by requiring that we send at least
 * one credit update to the peer before exhausting our credits.
 * When new credits arrive, we subtract one credit that is withheld
 * until we've posted new buffers and are ready to transmit these
 * credits (see rds_ib_send_add_credits below).
 *
 * The RDS send code is essentially single-threaded; rds_send_xmit
 * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
 * However, the ACK sending code is independent and can race with
 * message SENDs.
 *
 * In the send path, we need to update the counters for send credits
 * and the counter of posted buffers atomically - when we use the
 * last available credit, we cannot allow another thread to race us
 * and grab the posted credits counter. Hence, we have to use a
 * spinlock to protect the credit counter, or use atomics.
 *
 * Spinlocks shared between the send and the receive path are bad,
 * because they create unnecessary delays. An early implementation
 * using a spinlock showed a 5% degradation in throughput at some
 * loads.
 *
 * This implementation avoids spinlocks completely, putting both
 * counters into a single atomic, and updating that atomic using
 * atomic_add (in the receive path, when receiving fresh credits),
 * and using atomic_cmpxchg when updating the two counters.
 */
int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
			     u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
{
	unsigned int avail, posted, got = 0, advertise;
	long oldval, newval;

	*adv_credits = 0;
	if (!ic->i_flowctl)
		return wanted;

try_again:
	advertise = 0;
	oldval = newval = atomic_read(&ic->i_credits);
	posted = IB_GET_POST_CREDITS(oldval);
	avail = IB_GET_SEND_CREDITS(oldval);

	rdsdebug("wanted=%u credits=%u posted=%u\n",
		 wanted, avail, posted);

	/* The last credit must be used to send a credit update. */
	if (avail && !posted)
		avail--;

	if (avail < wanted) {
		struct rds_connection *conn = ic->i_cm_id->context;

		/* Oops, there aren't that many credits left! */
		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
		got = avail;
	} else {
		/* Sometimes you get what you want, lalala. */
		got = wanted;
	}
	newval -= IB_SET_SEND_CREDITS(got);

	/*
	 * If need_posted is non-zero, then the caller wants the posted
	 * credits advertised regardless of whether any send credits are
	 * available.
	 */
	if (posted && (got || need_posted)) {
		advertise = min_t(unsigned int, posted, max_posted);
		newval -= IB_SET_POST_CREDITS(advertise);
	}

	/* Finally bill everything */
	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
		goto try_again;

	*adv_credits = advertise;
	return got;
}
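/* Both counters live in the single atomic_t ic->i_credits, which is what
 * lets rds_ib_send_grab_credits() take send and posted credits together
 * with one atomic_cmpxchg(). The helpers used above pack them roughly like
 * this (a sketch of the IB_*_CREDITS macros; see ib.h for the authoritative
 * definitions):
 *
 *	send credits:   bits  0..15	IB_GET_SEND_CREDITS(v) = (v) & 0xffff
 *	posted credits: bits 16..31	IB_GET_POST_CREDITS(v) = (v) >> 16
 */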
void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	if (credits == 0)
		return;

	rdsdebug("credits=%u current=%u%s\n",
		 credits,
		 IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
		 test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");

	atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
		queue_delayed_work(rds_wq, &conn->c_send_w, 0);

	WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);

	rds_ib_stats_inc(s_ib_rx_credit_updates);
}

void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	if (posted == 0)
		return;

	atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);

	/* Decide whether to send an update to the peer now.
	 * If we would send a credit update for every single buffer we
	 * post, we would end up with an ACK storm (ACK arrives,
	 * consumes buffer, we refill the ring, send ACK to remote
	 * advertising the newly posted buffer... ad inf)
	 *
	 * Performance pretty much depends on how often we send
	 * credit updates - too frequent updates mean lots of ACKs.
	 * Too infrequent updates, and the peer will run out of
	 * credits and have to throttle.
	 * For the time being, 16 seems to be a good compromise.
	 */
	if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}

static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
					     struct rds_ib_send_work *send,
					     bool notify)
{
	/*
	 * We want to delay signaling completions just enough to get
	 * the batching benefits but not so much that we create dead time
	 * on the wire.
	 */
	if (ic->i_unsignaled_wrs-- == 0 || notify) {
		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
		send->s_wr.send_flags |= IB_SEND_SIGNALED;
		return 1;
	}
	return 0;
}
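/* In other words: at most one WR in every rds_ib_sysctl_max_unsig_wrs is
 * signaled, plus any WR whose caller demands it via notify. The completion
 * for that signaled WR then retires the unsignaled WRs queued before it,
 * which is why rds_ib_send_cqe_handler() walks a range of ring entries per
 * completion.
 */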
/*
 * This can be called multiple times for a given message. The first time
 * we see a message we map its scatterlist into the IB device so that
 * we can provide that mapped address to the IB scatter gather entries
 * in the IB work requests. We translate the scatterlist into a series
 * of work requests that fragment the message. These work requests complete
 * in order so we pass ownership of the message to the completion handler
 * once we send the final fragment.
 *
 * The RDS core uses the c_send_lock to only enter this function once
 * per connection. This makes sure that the tx ring alloc/unalloc pairs
 * don't get out of sync and confuse the ring.
 */
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
		unsigned int hdr_off, unsigned int sg, unsigned int off)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct ib_device *dev = ic->i_cm_id->device;
	struct rds_ib_send_work *send = NULL;
	struct rds_ib_send_work *first;
	struct rds_ib_send_work *prev;
	const struct ib_send_wr *failed_wr;
	struct scatterlist *scat;
	u32 pos;
	u32 i;
	u32 work_alloc;
	u32 credit_alloc = 0;
	u32 posted;
	u32 adv_credits = 0;
	int send_flags = 0;
	int bytes_sent = 0;
	int ret;
	int flow_controlled = 0;
	int nr_sig = 0;

	BUG_ON(off % RDS_FRAG_SIZE);
	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));

	/* Do not send cong updates to IB loopback */
	if (conn->c_loopback
	    && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
		scat = &rm->data.op_sg[sg];
		ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
		return sizeof(struct rds_header) + ret;
	}

	/* FIXME we may overallocate here */
	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
		i = 1;
	else
		i = DIV_ROUND_UP(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);

	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
	if (work_alloc == 0) {
		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
		rds_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	if (ic->i_flowctl) {
		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
		adv_credits += posted;
		if (credit_alloc < work_alloc) {
			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
			work_alloc = credit_alloc;
			flow_controlled = 1;
		}
		if (work_alloc == 0) {
			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
			rds_ib_stats_inc(s_ib_tx_throttle);
			ret = -ENOMEM;
			goto out;
		}
	}
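	/* rds_send_xmit() may call into here several times for one large
	 * message, with hdr_off/sg/off describing how far into the message
	 * we already are. ic->i_data_op remembers the DMA-mapped data op
	 * across those calls, so the scatterlist below is mapped only on
	 * the first call and handed to the completion path with the final
	 * fragment.
	 */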
	/* map the message the first time we see it */
	if (!ic->i_data_op) {
		if (rm->data.op_nents) {
			rm->data.op_count = ib_dma_map_sg(dev,
							  rm->data.op_sg,
							  rm->data.op_nents,
							  DMA_TO_DEVICE);
			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
			if (rm->data.op_count == 0) {
				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
				rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
				ret = -ENOMEM; /* XXX ? */
				goto out;
			}
		} else {
			rm->data.op_count = 0;
		}

		rds_message_addref(rm);
		rm->data.op_dmasg = 0;
		rm->data.op_dmaoff = 0;
		ic->i_data_op = &rm->data;

		/* Finalize the header */
		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
		if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;

		/* If it has an RDMA op, tell the peer we did it. This is
		 * used by the peer to release use-once RDMA MRs. */
		if (rm->rdma.op_active) {
			struct rds_ext_header_rdma ext_hdr;

			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
			rds_message_add_extension(&rm->m_inc.i_hdr,
						  RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
		}
		if (rm->m_rdma_cookie) {
			rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
							    rds_rdma_cookie_key(rm->m_rdma_cookie),
							    rds_rdma_cookie_offset(rm->m_rdma_cookie));
		}

		/* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
		 * we should not do this unless we have a chance of at least
		 * sticking the header into the send ring. Which is why we
		 * should call rds_ib_ring_alloc first. */
		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
		rds_message_make_checksum(&rm->m_inc.i_hdr);

		/*
		 * Update adv_credits since we reset the ACK_REQUIRED bit.
		 */
		if (ic->i_flowctl) {
			rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
			adv_credits += posted;
			BUG_ON(adv_credits > 255);
		}
	}
	/* Sometimes you want to put a fence between an RDMA
	 * READ and the following SEND.
	 * We could either do this all the time
	 * or when requested by the user. Right now, we let
	 * the application choose.
	 */
	if (rm->rdma.op_active && rm->rdma.op_fence)
		send_flags = IB_SEND_FENCE;

	/* Each frag gets a header. Msgs may be 0 bytes */
	send = &ic->i_sends[pos];
	first = send;
	prev = NULL;
	scat = &ic->i_data_op->op_sg[rm->data.op_dmasg];
	i = 0;
	do {
		unsigned int len = 0;

		/* Set up the header */
		send->s_wr.send_flags = send_flags;
		send->s_wr.opcode = IB_WR_SEND;
		send->s_wr.num_sge = 1;
		send->s_wr.next = NULL;
		send->s_queued = jiffies;
		send->s_op = NULL;

		send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];

		send->s_sge[0].length = sizeof(struct rds_header);
		send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;

		ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev,
					   ic->i_send_hdrs_dma[pos],
					   sizeof(struct rds_header),
					   DMA_TO_DEVICE);
		memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
		       sizeof(struct rds_header));

		/* Set up the data, if present */
		if (i < work_alloc
		    && scat != &rm->data.op_sg[rm->data.op_count]) {
			len = min(RDS_FRAG_SIZE,
				  sg_dma_len(scat) - rm->data.op_dmaoff);
			send->s_wr.num_sge = 2;

			send->s_sge[1].addr = sg_dma_address(scat);
			send->s_sge[1].addr += rm->data.op_dmaoff;
			send->s_sge[1].length = len;
			send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;

			bytes_sent += len;
			rm->data.op_dmaoff += len;
			if (rm->data.op_dmaoff == sg_dma_len(scat)) {
				scat++;
				rm->data.op_dmasg++;
				rm->data.op_dmaoff = 0;
			}
		}

		rds_ib_set_wr_signal_state(ic, send, false);
		/*
		 * Always signal the last one if we're stopping due to flow control.
		 */
		if (ic->i_flowctl && flow_controlled && i == (work_alloc - 1)) {
			rds_ib_set_wr_signal_state(ic, send, true);
			send->s_wr.send_flags |= IB_SEND_SOLICITED;
		}

		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
			nr_sig++;

		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);

		if (ic->i_flowctl && adv_credits) {
			struct rds_header *hdr = ic->i_send_hdrs[pos];

			/* add credit and redo the header checksum */
			hdr->h_credit = adv_credits;
			rds_message_make_checksum(hdr);
			adv_credits = 0;
			rds_ib_stats_inc(s_ib_tx_credit_updates);
		}
		ib_dma_sync_single_for_device(ic->rds_ibdev->dev,
					      ic->i_send_hdrs_dma[pos],
					      sizeof(struct rds_header),
					      DMA_TO_DEVICE);

		if (prev)
			prev->s_wr.next = &send->s_wr;
		prev = send;

		pos = (pos + 1) % ic->i_send_ring.w_nr;
		send = &ic->i_sends[pos];
		i++;

	} while (i < work_alloc
		 && scat != &rm->data.op_sg[rm->data.op_count]);

	/* Account the RDS header in the number of bytes we sent, but just once.
	 * The caller has no concept of fragmentation. */
	if (hdr_off == 0)
		bytes_sent += sizeof(struct rds_header);

	/* if we finished the message then send completion owns it */
	if (scat == &rm->data.op_sg[rm->data.op_count]) {
		prev->s_op = ic->i_data_op;
		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
		if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED))
			nr_sig += rds_ib_set_wr_signal_state(ic, prev, true);
		ic->i_data_op = NULL;
	}

	/* Put back wrs & credits we didn't use */
	if (i < work_alloc) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
		work_alloc = i;
	}
	if (ic->i_flowctl && i < credit_alloc)
		rds_ib_send_add_credits(conn, credit_alloc - i);

	if (nr_sig)
		atomic_add(nr_sig, &ic->i_signaled_sends);
	/* XXX need to worry about failed_wr and partial sends. */
	failed_wr = &first->s_wr;
	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
		 first, &first->s_wr, ret, failed_wr);
	BUG_ON(failed_wr != &first->s_wr);
	if (ret) {
		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
		       "returned %d\n", &conn->c_faddr, ret);
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_sub_signaled(ic, nr_sig);
		if (prev->s_op) {
			ic->i_data_op = prev->s_op;
			prev->s_op = NULL;
		}

		rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
		goto out;
	}

	ret = bytes_sent;
out:
	BUG_ON(adv_credits);
	return ret;
}
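/* Shape of each fragment WR built by rds_ib_xmit() above (illustrative):
 *
 *	send->s_wr.opcode = IB_WR_SEND;
 *	send->s_sge[0];		// this slot's rds_header, always present
 *	send->s_sge[1];		// up to RDS_FRAG_SIZE bytes of payload
 *
 * The fragments are chained through s_wr.next and handed to the HCA with
 * a single ib_post_send() on the first WR.
 */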
/*
 * Issue atomic operation.
 * This is a simplified version of the rdma case: we always map one SG,
 * of just 8 bytes, for the return value of the atomic operation.
 */
int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_send_work *send = NULL;
	const struct ib_send_wr *failed_wr;
	u32 pos;
	u32 work_alloc;
	int ret;
	int nr_sig = 0;

	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
	if (work_alloc != 1) {
		rds_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	/* address of send request in ring */
	send = &ic->i_sends[pos];
	send->s_queued = jiffies;

	if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
		send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
		send->s_atomic_wr.compare_add = op->op_m_cswp.compare;
		send->s_atomic_wr.swap = op->op_m_cswp.swap;
		send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask;
		send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask;
	} else { /* FADD */
		send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
		send->s_atomic_wr.compare_add = op->op_m_fadd.add;
		send->s_atomic_wr.swap = 0;
		send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask;
		send->s_atomic_wr.swap_mask = 0;
	}
	send->s_wr.send_flags = 0;
	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
	send->s_atomic_wr.wr.num_sge = 1;
	send->s_atomic_wr.wr.next = NULL;
	send->s_atomic_wr.remote_addr = op->op_remote_addr;
	send->s_atomic_wr.rkey = op->op_rkey;
	send->s_op = op;
	rds_message_addref(container_of(send->s_op, struct rds_message, atomic));

	/* map 8 byte retval buffer to the device */
	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
	if (ret != 1) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
		ret = -ENOMEM; /* XXX ? */
		goto out;
	}

	/* Convert our struct scatterlist to struct ib_sge */
	send->s_sge[0].addr = sg_dma_address(op->op_sg);
	send->s_sge[0].length = sg_dma_len(op->op_sg);
	send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;

	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
		 send->s_sge[0].addr, send->s_sge[0].length);

	if (nr_sig)
		atomic_add(nr_sig, &ic->i_signaled_sends);

	failed_wr = &send->s_atomic_wr.wr;
	ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr);
	rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
		 send, &send->s_atomic_wr, ret, failed_wr);
	BUG_ON(failed_wr != &send->s_atomic_wr.wr);
	if (ret) {
		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
		       "returned %d\n", &conn->c_faddr, ret);
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_sub_signaled(ic, nr_sig);
		goto out;
	}

	if (unlikely(failed_wr != &send->s_atomic_wr.wr)) {
		printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
		BUG_ON(failed_wr != &send->s_atomic_wr.wr);
	}

out:
	return ret;
}
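/* Note that both atomic types are posted as their IB_WR_MASKED_* variants:
 * a plain cmp-and-swap or fetch-and-add is expressed through the masks
 * copied from the op above. Roughly, for FADD the nocarry_mask marks bit
 * positions where a carry is not allowed to propagate, so an all-zero mask
 * behaves like an ordinary 64-bit fetch-and-add.
 */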
int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_send_work *send = NULL;
	struct rds_ib_send_work *first;
	struct rds_ib_send_work *prev;
	const struct ib_send_wr *failed_wr;
	struct scatterlist *scat;
	unsigned long len;
	u64 remote_addr = op->op_remote_addr;
	u32 max_sge = ic->rds_ibdev->max_sge;
	u32 pos;
	u32 work_alloc;
	u32 i;
	u32 j;
	int sent;
	int ret;
	int num_sge;
	int nr_sig = 0;
	u64 odp_addr = op->op_odp_addr;
	u32 odp_lkey = 0;

	/* map the op the first time we see it */
	if (!op->op_odp_mr) {
		if (!op->op_mapped) {
			op->op_count =
				ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
					      op->op_nents,
					      (op->op_write) ? DMA_TO_DEVICE :
							       DMA_FROM_DEVICE);
			rdsdebug("ic %p mapping op %p: %d\n", ic, op,
				 op->op_count);
			if (op->op_count == 0) {
				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
				ret = -ENOMEM; /* XXX ? */
				goto out;
			}
			op->op_mapped = 1;
		}
	} else {
		op->op_count = op->op_nents;
		odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
	}
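	/* For an ODP (on-demand paging) MR there is nothing to DMA-map here:
	 * the user memory is addressed through the MR's lkey and its virtual
	 * address (op_odp_addr), with the HCA faulting pages in as needed,
	 * which is why the else-branch above only looks up the lkey.
	 */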
	/*
	 * Instead of knowing how to return a partial rdma read/write
	 * we insist that there be enough work requests to send the
	 * entire message.
	 */
	i = DIV_ROUND_UP(op->op_count, max_sge);

	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
	if (work_alloc != i) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	send = &ic->i_sends[pos];
	first = send;
	prev = NULL;
	scat = &op->op_sg[0];
	sent = 0;
	num_sge = op->op_count;

	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
		send->s_wr.send_flags = 0;
		send->s_queued = jiffies;
		send->s_op = NULL;

		if (!op->op_notify)
			nr_sig += rds_ib_set_wr_signal_state(ic, send,
							     op->op_notify);

		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
		send->s_rdma_wr.remote_addr = remote_addr;
		send->s_rdma_wr.rkey = op->op_rkey;

		if (num_sge > max_sge) {
			send->s_rdma_wr.wr.num_sge = max_sge;
			num_sge -= max_sge;
		} else {
			send->s_rdma_wr.wr.num_sge = num_sge;
		}

		send->s_rdma_wr.wr.next = NULL;

		if (prev)
			prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr;

		for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
		     scat != &op->op_sg[op->op_count]; j++) {
			len = sg_dma_len(scat);
			if (!op->op_odp_mr) {
				send->s_sge[j].addr = sg_dma_address(scat);
				send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
			} else {
				send->s_sge[j].addr = odp_addr;
				send->s_sge[j].lkey = odp_lkey;
			}
			send->s_sge[j].length = len;

			sent += len;
			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);

			remote_addr += len;
			odp_addr += len;
			scat++;
		}

		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
			 &send->s_rdma_wr.wr,
			 send->s_rdma_wr.wr.num_sge,
			 send->s_rdma_wr.wr.next);

		prev = send;
		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
			send = ic->i_sends;
	}
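	/* Only the WR that finishes the op carries s_op (set just below):
	 * the completion of that last WR is what unmaps the op and notifies
	 * the sender, and the message reference taken with it keeps the rm
	 * alive until then.
	 */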
	/* give a reference to the last op */
	if (scat == &op->op_sg[op->op_count]) {
		prev->s_op = op;
		rds_message_addref(container_of(op, struct rds_message, rdma));
	}

	if (i < work_alloc) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
		work_alloc = i;
	}

	if (nr_sig)
		atomic_add(nr_sig, &ic->i_signaled_sends);

	failed_wr = &first->s_rdma_wr.wr;
	ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
		 first, &first->s_rdma_wr.wr, ret, failed_wr);
	BUG_ON(failed_wr != &first->s_rdma_wr.wr);
	if (ret) {
		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
		       "returned %d\n", &conn->c_faddr, ret);
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_sub_signaled(ic, nr_sig);
		goto out;
	}

	if (unlikely(failed_wr != &first->s_rdma_wr.wr)) {
		printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
		BUG_ON(failed_wr != &first->s_rdma_wr.wr);
	}

out:
	return ret;
}

void rds_ib_xmit_path_complete(struct rds_conn_path *cp)
{
	struct rds_connection *conn = cp->cp_conn;
	struct rds_ib_connection *ic = conn->c_transport_data;

	/* We may have a pending ACK or window update we were unable
	 * to send previously (due to flow control). Try again. */
	rds_ib_attempt_ack(ic);
}