/*
 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 *  - Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/ratelimit.h>

#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"

/*
 * Convert IB-specific error message to RDS error message and call core
 * completion handler.
 */
static void rds_ib_send_complete(struct rds_message *rm,
				 int wc_status,
				 void (*complete)(struct rds_message *rm, int status))
{
	int notify_status;

	switch (wc_status) {
	case IB_WC_WR_FLUSH_ERR:
		return;

	case IB_WC_SUCCESS:
		notify_status = RDS_RDMA_SUCCESS;
		break;

	case IB_WC_REM_ACCESS_ERR:
		notify_status = RDS_RDMA_REMOTE_ERROR;
		break;

	default:
		notify_status = RDS_RDMA_OTHER_ERROR;
		break;
	}
	complete(rm, notify_status);
}

static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
				   struct rm_data_op *op,
				   int wc_status)
{
	if (op->op_nents)
		ib_dma_unmap_sg(ic->i_cm_id->device,
				op->op_sg, op->op_nents,
				DMA_TO_DEVICE);
}

static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
				   struct rm_rdma_op *op,
				   int wc_status)
{
	if (op->op_mapped) {
		ib_dma_unmap_sg(ic->i_cm_id->device,
				op->op_sg, op->op_nents,
				op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		op->op_mapped = 0;
	}

	/* If the user asked for a completion notification on this
	 * message, we can implement three different semantics:
	 *  1.	Notify when we received the ACK on the RDS message
	 *	that was queued with the RDMA. This provides reliable
	 *	notification of RDMA status at the expense of a one-way
	 *	packet delay.
	 *  2.	Notify when the IB stack gives us the completion event for
	 *	the RDMA operation.
	 *  3.	Notify when the IB stack gives us the completion event for
	 *	the accompanying RDS messages.
	 * Here, we implement approach #3. To implement approach #2,
	 * we would need to take an event for the rdma WR. To implement #1,
	 * don't call rds_rdma_send_complete at all, and fall back to the notify
	 * handling in the ACK processing code.
	 *
	 * Note: There's no need to explicitly sync any RDMA buffers using
	 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
	 * operation itself unmapped the RDMA buffers, which takes care
	 * of synching.
	 */
	rds_ib_send_complete(container_of(op, struct rds_message, rdma),
			     wc_status, rds_rdma_send_complete);

	if (op->op_write)
		rds_stats_add(s_send_rdma_bytes, op->op_bytes);
	else
		rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
}

static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
				     struct rm_atomic_op *op,
				     int wc_status)
{
	/* unmap atomic recvbuf */
	if (op->op_mapped) {
		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
				DMA_FROM_DEVICE);
		op->op_mapped = 0;
	}

	rds_ib_send_complete(container_of(op, struct rds_message, atomic),
			     wc_status, rds_atomic_send_complete);

	if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
		rds_ib_stats_inc(s_ib_atomic_cswp);
	else
		rds_ib_stats_inc(s_ib_atomic_fadd);
}
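
/*
 * The three op types unmapped above are embedded members of struct
 * rds_message (rm->data, rm->rdma and rm->atomic), which is why the
 * helpers can recover the owning message without a back-pointer.  An
 * illustrative restatement of the container_of() calls used above:
 *
 *	struct rds_message *rm =
 *		container_of(op, struct rds_message, atomic);
 */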

/*
 * Unmap the resources associated with a struct send_work.
 *
 * Returns the rm because the caller, the event handler, needs it; there
 * is currently no other way to obtain it than by switching on wr.opcode.
 */
static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
						struct rds_ib_send_work *send,
						int wc_status)
{
	struct rds_message *rm = NULL;

	/* In the error case, wc.opcode sometimes contains garbage */
	switch (send->s_wr.opcode) {
	case IB_WR_SEND:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, data);
			rds_ib_send_unmap_data(ic, send->s_op, wc_status);
		}
		break;
	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_READ:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, rdma);
			rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
		}
		break;
	case IB_WR_ATOMIC_FETCH_AND_ADD:
	case IB_WR_ATOMIC_CMP_AND_SWP:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, atomic);
			rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
		}
		break;
	default:
		printk_ratelimited(KERN_NOTICE
			       "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
			       __func__, send->s_wr.opcode);
		break;
	}

	send->s_wr.opcode = 0xdead;

	return rm;
}

void rds_ib_send_init_ring(struct rds_ib_connection *ic)
{
	struct rds_ib_send_work *send;
	u32 i;

	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
		struct ib_sge *sge;

		send->s_op = NULL;

		send->s_wr.wr_id = i;
		send->s_wr.sg_list = send->s_sge;
		send->s_wr.ex.imm_data = 0;

		sge = &send->s_sge[0];
		sge->addr = ic->i_send_hdrs_dma[i];

		sge->length = sizeof(struct rds_header);
		sge->lkey = ic->i_pd->local_dma_lkey;

		send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
	}
}
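
/*
 * A note on the SGE layout established above (a restatement, nothing new
 * is configured here): every send WR uses at most two SGEs - s_sge[0]
 * always describes the per-slot rds_header, while s_sge[1], when used,
 * carries one fragment of message payload.  rds_ib_xmit() below only has
 * to fill in the payload entry and bump num_sge, roughly:
 *
 *	send->s_wr.num_sge = 2;
 *	send->s_sge[1].addr = sg_dma_address(scat) + rm->data.op_dmaoff;
 *	send->s_sge[1].length = len;
 */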

void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
{
	struct rds_ib_send_work *send;
	u32 i;

	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
		if (send->s_op && send->s_wr.opcode != 0xdead)
			rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
	}
}

/*
 * The only fast path caller always has a non-zero nr, so we don't
 * bother testing nr before performing the atomic sub.
 */
static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
{
	if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
	    waitqueue_active(&rds_ib_ring_empty_wait))
		wake_up(&rds_ib_ring_empty_wait);
	BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
}

/*
 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
 * operations performed in the send path.  As the sender allocs and potentially
 * unallocs the next free entry in the ring it doesn't alter which is
 * the next to be freed, which is what this is concerned with.
 */
void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
{
	struct rds_message *rm = NULL;
	struct rds_connection *conn = ic->conn;
	struct rds_ib_send_work *send;
	u32 completed;
	u32 oldest;
	u32 i = 0;
	int nr_sig = 0;


	rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
		 (unsigned long long)wc->wr_id, wc->status,
		 ib_wc_status_msg(wc->status), wc->byte_len,
		 be32_to_cpu(wc->ex.imm_data));
	rds_ib_stats_inc(s_ib_tx_cq_event);

	if (wc->wr_id == RDS_IB_ACK_WR_ID) {
		if (time_after(jiffies, ic->i_ack_queued + HZ / 2))
			rds_ib_stats_inc(s_ib_tx_stalled);
		rds_ib_ack_send_complete(ic);
		return;
	}

	oldest = rds_ib_ring_oldest(&ic->i_send_ring);

	completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest);

	for (i = 0; i < completed; i++) {
		send = &ic->i_sends[oldest];
		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
			nr_sig++;

		rm = rds_ib_send_unmap_op(ic, send, wc->status);

		if (time_after(jiffies, send->s_queued + HZ / 2))
			rds_ib_stats_inc(s_ib_tx_stalled);

		if (send->s_op) {
			if (send->s_op == rm->m_final_op) {
				/* If anyone waited for this message to get
				 * flushed out, wake them up now
				 */
				rds_message_unmapped(rm);
			}
			rds_message_put(rm);
			send->s_op = NULL;
		}

		oldest = (oldest + 1) % ic->i_send_ring.w_nr;
	}

	rds_ib_ring_free(&ic->i_send_ring, completed);
	rds_ib_sub_signaled(ic, nr_sig);

	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
	    test_bit(0, &conn->c_map_queued))
		queue_delayed_work(rds_wq, &conn->c_send_w, 0);

	/* We expect errors as the qp is drained during shutdown */
	if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
		rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
				  &conn->c_laddr, &conn->c_faddr,
				  conn->c_tos, wc->status,
				  ib_wc_status_msg(wc->status), wc->vendor_err);
	}
}

/*
 * This is the main function for allocating credits when sending
 * messages.
 *
 * Conceptually, we have two counters:
 *  - send credits: this tells us how many WRs we're allowed
 *    to submit without overrunning the receiver's queue. For
 *    each SEND WR we post, we decrement this by one.
 *
 *  - posted credits: this tells us how many WRs we recently
 *    posted to the receive queue. This value is transferred
 *    to the peer as a "credit update" in an RDS header field.
 *    Every time we transmit credits to the peer, we subtract
 *    the amount of transferred credits from this counter.
 *
 * It is essential that we avoid situations where both sides have
 * exhausted their send credits, and are unable to send new credits
 * to the peer. We achieve this by requiring that we send at least
 * one credit update to the peer before exhausting our credits.
 * When new credits arrive, we subtract one credit that is withheld
 * until we've posted new buffers and are ready to transmit these
 * credits (see rds_ib_send_add_credits below).
 *
 * The RDS send code is essentially single-threaded; rds_send_xmit
 * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
 * However, the ACK sending code is independent and can race with
 * message SENDs.
 *
 * In the send path, we need to update the counters for send credits
 * and the counter of posted buffers atomically - when we use the
 * last available credit, we cannot allow another thread to race us
 * and grab the posted credits counter. Hence, we have to use a
 * spinlock to protect the credit counter, or use atomics.
 *
 * Spinlocks shared between the send and the receive path are bad,
 * because they create unnecessary delays. An early implementation
 * using a spinlock showed a 5% degradation in throughput at some
 * loads.
 *
 * This implementation avoids spinlocks completely, putting both
 * counters into a single atomic, and updating that atomic using
 * atomic_add (in the receive path, when receiving fresh credits),
 * and using atomic_cmpxchg when updating the two counters.
 */
int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
			     u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
{
	unsigned int avail, posted, got = 0, advertise;
	long oldval, newval;

	*adv_credits = 0;
	if (!ic->i_flowctl)
		return wanted;

try_again:
	advertise = 0;
	oldval = newval = atomic_read(&ic->i_credits);
	posted = IB_GET_POST_CREDITS(oldval);
	avail = IB_GET_SEND_CREDITS(oldval);

	rdsdebug("wanted=%u credits=%u posted=%u\n",
			wanted, avail, posted);

	/* The last credit must be used to send a credit update. */
	if (avail && !posted)
		avail--;

	if (avail < wanted) {
		struct rds_connection *conn = ic->i_cm_id->context;

		/* Oops, there aren't that many credits left! */
		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
		got = avail;
	} else {
		/* Sometimes you get what you want, lalala. */
		got = wanted;
	}
	newval -= IB_SET_SEND_CREDITS(got);

	/*
	 * If need_posted is non-zero, the caller wants the posted credits
	 * advertised regardless of whether any send credits are available.
	 */
	if (posted && (got || need_posted)) {
		advertise = min_t(unsigned int, posted, max_posted);
		newval -= IB_SET_POST_CREDITS(advertise);
	}

	/* Finally bill everything */
	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
		goto try_again;

	*adv_credits = advertise;
	return got;
}
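
/*
 * An illustration of the scheme described above (assuming the usual
 * encoding in ib.h, with the send credits in the low half of
 * ic->i_credits and the posted credits in the high half): taking "got"
 * send credits while advertising "advertise" posted credits boils down
 * to the lock-free loop rds_ib_send_grab_credits() just performed:
 *
 *	do {
 *		oldval = newval = atomic_read(&ic->i_credits);
 *		newval -= IB_SET_SEND_CREDITS(got);
 *		newval -= IB_SET_POST_CREDITS(advertise);
 *	} while (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval);
 */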

void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	if (credits == 0)
		return;

	rdsdebug("credits=%u current=%u%s\n",
			credits,
			IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
			test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");

	atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
		queue_delayed_work(rds_wq, &conn->c_send_w, 0);

	WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);

	rds_ib_stats_inc(s_ib_rx_credit_updates);
}

void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	if (posted == 0)
		return;

	atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);

	/* Decide whether to send an update to the peer now.
	 * If we would send a credit update for every single buffer we
	 * post, we would end up with an ACK storm (ACK arrives,
	 * consumes buffer, we refill the ring, send ACK to remote
	 * advertising the newly posted buffer... ad inf)
	 *
	 * Performance pretty much depends on how often we send
	 * credit updates - too frequent updates mean lots of ACKs.
	 * Too infrequent updates, and the peer will run out of
	 * credits and has to throttle.
	 * For the time being, 16 seems to be a good compromise.
	 */
	if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}

static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
					     struct rds_ib_send_work *send,
					     bool notify)
{
	/*
	 * We want to delay signaling completions just enough to get
	 * the batching benefits but not so much that we create dead time
	 * on the wire.
	 */
	if (ic->i_unsignaled_wrs-- == 0 || notify) {
		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
		send->s_wr.send_flags |= IB_SEND_SIGNALED;
		return 1;
	}
	return 0;
}
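
/*
 * Put differently (assuming the sysctl retains its meaning of a cap on
 * unsignaled WRs): with rds_ib_sysctl_max_unsig_wrs == N and no explicit
 * notify requests, only about one in every N + 1 work requests is posted
 * with IB_SEND_SIGNALED, so send completions are reaped in batches
 * rather than once per WR.
 */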

/*
 * This can be called multiple times for a given message. The first time
 * we see a message we map its scatterlist into the IB device so that
 * we can provide that mapped address to the IB scatter gather entries
 * in the IB work requests. We translate the scatterlist into a series
 * of work requests that fragment the message. These work requests complete
 * in order so we pass ownership of the message to the completion handler
 * once we send the final fragment.
 *
 * The RDS core uses the c_send_lock to only enter this function once
 * per connection. This makes sure that the tx ring alloc/unalloc pairs
 * don't get out of sync and confuse the ring.
 */
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
		unsigned int hdr_off, unsigned int sg, unsigned int off)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct ib_device *dev = ic->i_cm_id->device;
	struct rds_ib_send_work *send = NULL;
	struct rds_ib_send_work *first;
	struct rds_ib_send_work *prev;
	const struct ib_send_wr *failed_wr;
	struct scatterlist *scat;
	u32 pos;
	u32 i;
	u32 work_alloc;
	u32 credit_alloc = 0;
	u32 posted;
	u32 adv_credits = 0;
	int send_flags = 0;
	int bytes_sent = 0;
	int ret;
	int flow_controlled = 0;
	int nr_sig = 0;

	BUG_ON(off % RDS_FRAG_SIZE);
	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));

	/* Do not send cong updates to IB loopback */
	if (conn->c_loopback
	    && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
		scat = &rm->data.op_sg[sg];
		ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
		return sizeof(struct rds_header) + ret;
	}

	/* FIXME we may overallocate here */
	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
		i = 1;
	else
		i = DIV_ROUND_UP(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
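
	/*
	 * Worked example (assuming the usual 4 KiB RDS_FRAG_SIZE): a
	 * message with h_len == 10000 needs DIV_ROUND_UP(10000, 4096) == 3
	 * work requests, each carrying its own copy of the header in
	 * s_sge[0] and up to one fragment of payload in s_sge[1].
	 */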

	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
	if (work_alloc == 0) {
		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
		rds_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	if (ic->i_flowctl) {
		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
		adv_credits += posted;
		if (credit_alloc < work_alloc) {
			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
			work_alloc = credit_alloc;
			flow_controlled = 1;
		}
		if (work_alloc == 0) {
			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
			rds_ib_stats_inc(s_ib_tx_throttle);
			ret = -ENOMEM;
			goto out;
		}
	}

	/* map the message the first time we see it */
	if (!ic->i_data_op) {
		if (rm->data.op_nents) {
			rm->data.op_count = ib_dma_map_sg(dev,
							  rm->data.op_sg,
							  rm->data.op_nents,
							  DMA_TO_DEVICE);
			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
			if (rm->data.op_count == 0) {
				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
				rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
				ret = -ENOMEM; /* XXX ? */
				goto out;
			}
		} else {
			rm->data.op_count = 0;
		}

		rds_message_addref(rm);
		rm->data.op_dmasg = 0;
		rm->data.op_dmaoff = 0;
		ic->i_data_op = &rm->data;

		/* Finalize the header */
		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
		if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;

		/* If it has an RDMA op, tell the peer we did it. This is
		 * used by the peer to release use-once RDMA MRs. */
		if (rm->rdma.op_active) {
			struct rds_ext_header_rdma ext_hdr;

			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
			rds_message_add_extension(&rm->m_inc.i_hdr,
					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
		}
		if (rm->m_rdma_cookie) {
			rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
					rds_rdma_cookie_key(rm->m_rdma_cookie),
					rds_rdma_cookie_offset(rm->m_rdma_cookie));
		}

		/* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
		 * we should not do this unless we have a chance of at least
		 * sticking the header into the send ring. Which is why we
		 * should call rds_ib_ring_alloc first. */
		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
		rds_message_make_checksum(&rm->m_inc.i_hdr);

		/*
		 * Update adv_credits since we reset the ACK_REQUIRED bit.
		 */
		if (ic->i_flowctl) {
			rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
			adv_credits += posted;
			BUG_ON(adv_credits > 255);
		}
	}

	/* Sometimes you want to put a fence between an RDMA
	 * READ and the following SEND.
	 * We could either do this all the time
	 * or when requested by the user. Right now, we let
	 * the application choose.
	 */
	if (rm->rdma.op_active && rm->rdma.op_fence)
		send_flags = IB_SEND_FENCE;

	/* Each frag gets a header. Msgs may be 0 bytes */
	send = &ic->i_sends[pos];
	first = send;
	prev = NULL;
	scat = &ic->i_data_op->op_sg[rm->data.op_dmasg];
	i = 0;
	do {
		unsigned int len = 0;

		/* Set up the header */
		send->s_wr.send_flags = send_flags;
		send->s_wr.opcode = IB_WR_SEND;
		send->s_wr.num_sge = 1;
		send->s_wr.next = NULL;
		send->s_queued = jiffies;
		send->s_op = NULL;

		send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];

		send->s_sge[0].length = sizeof(struct rds_header);
		send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;

		ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev,
					   ic->i_send_hdrs_dma[pos],
					   sizeof(struct rds_header),
					   DMA_TO_DEVICE);
		memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
		       sizeof(struct rds_header));


		/* Set up the data, if present */
		if (i < work_alloc
		    && scat != &rm->data.op_sg[rm->data.op_count]) {
			len = min(RDS_FRAG_SIZE,
				  sg_dma_len(scat) - rm->data.op_dmaoff);
			send->s_wr.num_sge = 2;

			send->s_sge[1].addr = sg_dma_address(scat);
			send->s_sge[1].addr += rm->data.op_dmaoff;
			send->s_sge[1].length = len;
			send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;

			bytes_sent += len;
			rm->data.op_dmaoff += len;
			if (rm->data.op_dmaoff == sg_dma_len(scat)) {
				scat++;
				rm->data.op_dmasg++;
				rm->data.op_dmaoff = 0;
			}
		}

		rds_ib_set_wr_signal_state(ic, send, false);

		/*
		 * Always signal the last one if we're stopping due to flow control.
		 */
		if (ic->i_flowctl && flow_controlled && i == (work_alloc - 1)) {
			rds_ib_set_wr_signal_state(ic, send, true);
			send->s_wr.send_flags |= IB_SEND_SOLICITED;
		}

		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
			nr_sig++;

		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);

		if (ic->i_flowctl && adv_credits) {
			struct rds_header *hdr = ic->i_send_hdrs[pos];

			/* add credit and redo the header checksum */
			hdr->h_credit = adv_credits;
			rds_message_make_checksum(hdr);
			adv_credits = 0;
			rds_ib_stats_inc(s_ib_tx_credit_updates);
		}
		ib_dma_sync_single_for_device(ic->rds_ibdev->dev,
					      ic->i_send_hdrs_dma[pos],
					      sizeof(struct rds_header),
					      DMA_TO_DEVICE);

		if (prev)
			prev->s_wr.next = &send->s_wr;
		prev = send;

		pos = (pos + 1) % ic->i_send_ring.w_nr;
		send = &ic->i_sends[pos];
		i++;

	} while (i < work_alloc
		 && scat != &rm->data.op_sg[rm->data.op_count]);

	/* Account the RDS header in the number of bytes we sent, but just once.
	 * The caller has no concept of fragmentation. */
	if (hdr_off == 0)
		bytes_sent += sizeof(struct rds_header);

	/* if we finished the message then send completion owns it */
	if (scat == &rm->data.op_sg[rm->data.op_count]) {
		prev->s_op = ic->i_data_op;
		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
		if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED))
			nr_sig += rds_ib_set_wr_signal_state(ic, prev, true);
		ic->i_data_op = NULL;
	}

	/* Put back wrs & credits we didn't use */
	if (i < work_alloc) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
		work_alloc = i;
	}
	if (ic->i_flowctl && i < credit_alloc)
		rds_ib_send_add_credits(conn, credit_alloc - i);

	if (nr_sig)
		atomic_add(nr_sig, &ic->i_signaled_sends);

	/* XXX need to worry about failed_wr and partial sends. */
	failed_wr = &first->s_wr;
	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
		 first, &first->s_wr, ret, failed_wr);
	BUG_ON(failed_wr != &first->s_wr);
	if (ret) {
		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
		       "returned %d\n", &conn->c_faddr, ret);
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_sub_signaled(ic, nr_sig);
		if (prev->s_op) {
			ic->i_data_op = prev->s_op;
			prev->s_op = NULL;
		}

		rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
		goto out;
	}

	ret = bytes_sent;
out:
	BUG_ON(adv_credits);
	return ret;
}

/*
 * Issue atomic operation.
 * A simplified version of the rdma case: we always map one SG, and
 * only 8 bytes, for the return value from the atomic operation.
 */
int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_send_work *send = NULL;
	const struct ib_send_wr *failed_wr;
	u32 pos;
	u32 work_alloc;
	int ret;
	int nr_sig = 0;

	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
	if (work_alloc != 1) {
		rds_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	/* address of send request in ring */
	send = &ic->i_sends[pos];
	send->s_queued = jiffies;

	if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
		send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
		send->s_atomic_wr.compare_add = op->op_m_cswp.compare;
		send->s_atomic_wr.swap = op->op_m_cswp.swap;
		send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask;
		send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask;
	} else { /* FADD */
		send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
		send->s_atomic_wr.compare_add = op->op_m_fadd.add;
		send->s_atomic_wr.swap = 0;
		send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask;
		send->s_atomic_wr.swap_mask = 0;
	}
	send->s_wr.send_flags = 0;
	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
	send->s_atomic_wr.wr.num_sge = 1;
	send->s_atomic_wr.wr.next = NULL;
	send->s_atomic_wr.remote_addr = op->op_remote_addr;
	send->s_atomic_wr.rkey = op->op_rkey;
	send->s_op = op;
	rds_message_addref(container_of(send->s_op, struct rds_message, atomic));

	/* map 8 byte retval buffer to the device */
	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
	if (ret != 1) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
		ret = -ENOMEM; /* XXX ? */
		goto out;
	}

	/* Convert our struct scatterlist to struct ib_sge */
	send->s_sge[0].addr = sg_dma_address(op->op_sg);
	send->s_sge[0].length = sg_dma_len(op->op_sg);
	send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;

	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
		 send->s_sge[0].addr, send->s_sge[0].length);

	if (nr_sig)
		atomic_add(nr_sig, &ic->i_signaled_sends);

	failed_wr = &send->s_atomic_wr.wr;
	ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr);
	rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
		 send, &send->s_atomic_wr, ret, failed_wr);
	BUG_ON(failed_wr != &send->s_atomic_wr.wr);
	if (ret) {
		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
		       "returned %d\n", &conn->c_faddr, ret);
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_sub_signaled(ic, nr_sig);
		goto out;
	}

	if (unlikely(failed_wr != &send->s_atomic_wr.wr)) {
		printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
		BUG_ON(failed_wr != &send->s_atomic_wr.wr);
	}

out:
	return ret;
}

int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_send_work *send = NULL;
	struct rds_ib_send_work *first;
	struct rds_ib_send_work *prev;
	const struct ib_send_wr *failed_wr;
	struct scatterlist *scat;
	unsigned long len;
	u64 remote_addr = op->op_remote_addr;
	u32 max_sge = ic->rds_ibdev->max_sge;
	u32 pos;
	u32 work_alloc;
	u32 i;
	u32 j;
	int sent;
	int ret;
	int num_sge;
	int nr_sig = 0;
	u64 odp_addr = op->op_odp_addr;
	u32 odp_lkey = 0;

	/* map the op the first time we see it */
	if (!op->op_odp_mr) {
		if (!op->op_mapped) {
			op->op_count =
				ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
					      op->op_nents,
					      (op->op_write) ? DMA_TO_DEVICE :
							       DMA_FROM_DEVICE);
			rdsdebug("ic %p mapping op %p: %d\n", ic, op,
				 op->op_count);
			if (op->op_count == 0) {
				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
				ret = -ENOMEM; /* XXX ? */
				goto out;
			}
			op->op_mapped = 1;
		}
	} else {
		op->op_count = op->op_nents;
		odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
	}

	/*
	 * Instead of knowing how to return a partial rdma read/write we insist that there
	 * be enough work requests to send the entire message.
	 */
	i = DIV_ROUND_UP(op->op_count, max_sge);
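
	/*
	 * Worked example (hypothetical numbers): with op_count == 70 mapped
	 * SGEs and max_sge == 30, this allocates DIV_ROUND_UP(70, 30) == 3
	 * work requests, which the loop below fills with 30, 30 and 10 SGEs
	 * respectively.
	 */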

	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
	if (work_alloc != i) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	send = &ic->i_sends[pos];
	first = send;
	prev = NULL;
	scat = &op->op_sg[0];
	sent = 0;
	num_sge = op->op_count;

	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
		send->s_wr.send_flags = 0;
		send->s_queued = jiffies;
		send->s_op = NULL;

		if (!op->op_notify)
			nr_sig += rds_ib_set_wr_signal_state(ic, send,
							     op->op_notify);

		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
		send->s_rdma_wr.remote_addr = remote_addr;
		send->s_rdma_wr.rkey = op->op_rkey;

		if (num_sge > max_sge) {
			send->s_rdma_wr.wr.num_sge = max_sge;
			num_sge -= max_sge;
		} else {
			send->s_rdma_wr.wr.num_sge = num_sge;
		}

		send->s_rdma_wr.wr.next = NULL;

		if (prev)
			prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr;

		for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
		     scat != &op->op_sg[op->op_count]; j++) {
			len = sg_dma_len(scat);
			if (!op->op_odp_mr) {
				send->s_sge[j].addr = sg_dma_address(scat);
				send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
			} else {
				send->s_sge[j].addr = odp_addr;
				send->s_sge[j].lkey = odp_lkey;
			}
			send->s_sge[j].length = len;

			sent += len;
			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);

			remote_addr += len;
			odp_addr += len;
			scat++;
		}

		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
			 &send->s_rdma_wr.wr,
			 send->s_rdma_wr.wr.num_sge,
			 send->s_rdma_wr.wr.next);

		prev = send;
		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
			send = ic->i_sends;
	}

	/* give a reference to the last op */
	if (scat == &op->op_sg[op->op_count]) {
		prev->s_op = op;
		rds_message_addref(container_of(op, struct rds_message, rdma));
	}

	if (i < work_alloc) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
		work_alloc = i;
	}

	if (nr_sig)
		atomic_add(nr_sig, &ic->i_signaled_sends);

	failed_wr = &first->s_rdma_wr.wr;
	ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
		 first, &first->s_rdma_wr.wr, ret, failed_wr);
	BUG_ON(failed_wr != &first->s_rdma_wr.wr);
	if (ret) {
		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
		       "returned %d\n", &conn->c_faddr, ret);
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_sub_signaled(ic, nr_sig);
		goto out;
	}

	if (unlikely(failed_wr != &first->s_rdma_wr.wr)) {
		printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
		BUG_ON(failed_wr != &first->s_rdma_wr.wr);
	}


out:
	return ret;
}

void rds_ib_xmit_path_complete(struct rds_conn_path *cp)
{
	struct rds_connection *conn = cp->cp_conn;
	struct rds_ib_connection *ic = conn->c_transport_data;

	/* We may have a pending ACK or window update we were unable
	 * to send previously (due to flow control). Try again. */
	rds_ib_attempt_ack(ic);
}