162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * This software is available to you under a choice of one of two 562306a36Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 662306a36Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 762306a36Sopenharmony_ci * COPYING in the main directory of this source tree, or the 862306a36Sopenharmony_ci * OpenIB.org BSD license below: 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * Redistribution and use in source and binary forms, with or 1162306a36Sopenharmony_ci * without modification, are permitted provided that the following 1262306a36Sopenharmony_ci * conditions are met: 1362306a36Sopenharmony_ci * 1462306a36Sopenharmony_ci * - Redistributions of source code must retain the above 1562306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 1662306a36Sopenharmony_ci * disclaimer. 1762306a36Sopenharmony_ci * 1862306a36Sopenharmony_ci * - Redistributions in binary form must reproduce the above 1962306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 2062306a36Sopenharmony_ci * disclaimer in the documentation and/or other materials 2162306a36Sopenharmony_ci * provided with the distribution. 2262306a36Sopenharmony_ci * 2362306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2462306a36Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2562306a36Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2662306a36Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 2762306a36Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 2862306a36Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 2962306a36Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3062306a36Sopenharmony_ci * SOFTWARE. 3162306a36Sopenharmony_ci * 3262306a36Sopenharmony_ci */ 3362306a36Sopenharmony_ci#include <linux/kernel.h> 3462306a36Sopenharmony_ci#include <linux/slab.h> 3562306a36Sopenharmony_ci#include <net/tcp.h> 3662306a36Sopenharmony_ci#include <trace/events/sock.h> 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ci#include "rds.h" 3962306a36Sopenharmony_ci#include "tcp.h" 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_cistatic struct kmem_cache *rds_tcp_incoming_slab; 4262306a36Sopenharmony_ci 4362306a36Sopenharmony_cistatic void rds_tcp_inc_purge(struct rds_incoming *inc) 4462306a36Sopenharmony_ci{ 4562306a36Sopenharmony_ci struct rds_tcp_incoming *tinc; 4662306a36Sopenharmony_ci tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 4762306a36Sopenharmony_ci rdsdebug("purging tinc %p inc %p\n", tinc, inc); 4862306a36Sopenharmony_ci skb_queue_purge(&tinc->ti_skb_list); 4962306a36Sopenharmony_ci} 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_civoid rds_tcp_inc_free(struct rds_incoming *inc) 5262306a36Sopenharmony_ci{ 5362306a36Sopenharmony_ci struct rds_tcp_incoming *tinc; 5462306a36Sopenharmony_ci tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 5562306a36Sopenharmony_ci rds_tcp_inc_purge(inc); 5662306a36Sopenharmony_ci rdsdebug("freeing tinc %p inc %p\n", tinc, inc); 5762306a36Sopenharmony_ci kmem_cache_free(rds_tcp_incoming_slab, tinc); 5862306a36Sopenharmony_ci} 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_ci/* 6162306a36Sopenharmony_ci * this is pretty lame, but, whatever. 6262306a36Sopenharmony_ci */ 6362306a36Sopenharmony_ciint rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) 6462306a36Sopenharmony_ci{ 6562306a36Sopenharmony_ci struct rds_tcp_incoming *tinc; 6662306a36Sopenharmony_ci struct sk_buff *skb; 6762306a36Sopenharmony_ci int ret = 0; 6862306a36Sopenharmony_ci 6962306a36Sopenharmony_ci if (!iov_iter_count(to)) 7062306a36Sopenharmony_ci goto out; 7162306a36Sopenharmony_ci 7262306a36Sopenharmony_ci tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_ci skb_queue_walk(&tinc->ti_skb_list, skb) { 7562306a36Sopenharmony_ci unsigned long to_copy, skb_off; 7662306a36Sopenharmony_ci for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) { 7762306a36Sopenharmony_ci to_copy = iov_iter_count(to); 7862306a36Sopenharmony_ci to_copy = min(to_copy, skb->len - skb_off); 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_ci if (skb_copy_datagram_iter(skb, skb_off, to, to_copy)) 8162306a36Sopenharmony_ci return -EFAULT; 8262306a36Sopenharmony_ci 8362306a36Sopenharmony_ci rds_stats_add(s_copy_to_user, to_copy); 8462306a36Sopenharmony_ci ret += to_copy; 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_ci if (!iov_iter_count(to)) 8762306a36Sopenharmony_ci goto out; 8862306a36Sopenharmony_ci } 8962306a36Sopenharmony_ci } 9062306a36Sopenharmony_ciout: 9162306a36Sopenharmony_ci return ret; 9262306a36Sopenharmony_ci} 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_ci/* 9562306a36Sopenharmony_ci * We have a series of skbs that have fragmented pieces of the congestion 9662306a36Sopenharmony_ci * bitmap. They must add up to the exact size of the congestion bitmap. We 9762306a36Sopenharmony_ci * use the skb helpers to copy those into the pages that make up the in-memory 9862306a36Sopenharmony_ci * congestion bitmap for the remote address of this connection. We then tell 9962306a36Sopenharmony_ci * the congestion core that the bitmap has been changed so that it can wake up 10062306a36Sopenharmony_ci * sleepers. 10162306a36Sopenharmony_ci * 10262306a36Sopenharmony_ci * This is racing with sending paths which are using test_bit to see if the 10362306a36Sopenharmony_ci * bitmap indicates that their recipient is congested. 10462306a36Sopenharmony_ci */ 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_cistatic void rds_tcp_cong_recv(struct rds_connection *conn, 10762306a36Sopenharmony_ci struct rds_tcp_incoming *tinc) 10862306a36Sopenharmony_ci{ 10962306a36Sopenharmony_ci struct sk_buff *skb; 11062306a36Sopenharmony_ci unsigned int to_copy, skb_off; 11162306a36Sopenharmony_ci unsigned int map_off; 11262306a36Sopenharmony_ci unsigned int map_page; 11362306a36Sopenharmony_ci struct rds_cong_map *map; 11462306a36Sopenharmony_ci int ret; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci /* catch completely corrupt packets */ 11762306a36Sopenharmony_ci if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) 11862306a36Sopenharmony_ci return; 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_ci map_page = 0; 12162306a36Sopenharmony_ci map_off = 0; 12262306a36Sopenharmony_ci map = conn->c_fcong; 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_ci skb_queue_walk(&tinc->ti_skb_list, skb) { 12562306a36Sopenharmony_ci skb_off = 0; 12662306a36Sopenharmony_ci while (skb_off < skb->len) { 12762306a36Sopenharmony_ci to_copy = min_t(unsigned int, PAGE_SIZE - map_off, 12862306a36Sopenharmony_ci skb->len - skb_off); 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci BUG_ON(map_page >= RDS_CONG_MAP_PAGES); 13162306a36Sopenharmony_ci 13262306a36Sopenharmony_ci /* only returns 0 or -error */ 13362306a36Sopenharmony_ci ret = skb_copy_bits(skb, skb_off, 13462306a36Sopenharmony_ci (void *)map->m_page_addrs[map_page] + map_off, 13562306a36Sopenharmony_ci to_copy); 13662306a36Sopenharmony_ci BUG_ON(ret != 0); 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci skb_off += to_copy; 13962306a36Sopenharmony_ci map_off += to_copy; 14062306a36Sopenharmony_ci if (map_off == PAGE_SIZE) { 14162306a36Sopenharmony_ci map_off = 0; 14262306a36Sopenharmony_ci map_page++; 14362306a36Sopenharmony_ci } 14462306a36Sopenharmony_ci } 14562306a36Sopenharmony_ci } 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci rds_cong_map_updated(map, ~(u64) 0); 14862306a36Sopenharmony_ci} 14962306a36Sopenharmony_ci 15062306a36Sopenharmony_cistruct rds_tcp_desc_arg { 15162306a36Sopenharmony_ci struct rds_conn_path *conn_path; 15262306a36Sopenharmony_ci gfp_t gfp; 15362306a36Sopenharmony_ci}; 15462306a36Sopenharmony_ci 15562306a36Sopenharmony_cistatic int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, 15662306a36Sopenharmony_ci unsigned int offset, size_t len) 15762306a36Sopenharmony_ci{ 15862306a36Sopenharmony_ci struct rds_tcp_desc_arg *arg = desc->arg.data; 15962306a36Sopenharmony_ci struct rds_conn_path *cp = arg->conn_path; 16062306a36Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 16162306a36Sopenharmony_ci struct rds_tcp_incoming *tinc = tc->t_tinc; 16262306a36Sopenharmony_ci struct sk_buff *clone; 16362306a36Sopenharmony_ci size_t left = len, to_copy; 16462306a36Sopenharmony_ci 16562306a36Sopenharmony_ci rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, 16662306a36Sopenharmony_ci len); 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_ci /* 16962306a36Sopenharmony_ci * tcp_read_sock() interprets partial progress as an indication to stop 17062306a36Sopenharmony_ci * processing. 17162306a36Sopenharmony_ci */ 17262306a36Sopenharmony_ci while (left) { 17362306a36Sopenharmony_ci if (!tinc) { 17462306a36Sopenharmony_ci tinc = kmem_cache_alloc(rds_tcp_incoming_slab, 17562306a36Sopenharmony_ci arg->gfp); 17662306a36Sopenharmony_ci if (!tinc) { 17762306a36Sopenharmony_ci desc->error = -ENOMEM; 17862306a36Sopenharmony_ci goto out; 17962306a36Sopenharmony_ci } 18062306a36Sopenharmony_ci tc->t_tinc = tinc; 18162306a36Sopenharmony_ci rdsdebug("allocated tinc %p\n", tinc); 18262306a36Sopenharmony_ci rds_inc_path_init(&tinc->ti_inc, cp, 18362306a36Sopenharmony_ci &cp->cp_conn->c_faddr); 18462306a36Sopenharmony_ci tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = 18562306a36Sopenharmony_ci local_clock(); 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ci /* 18862306a36Sopenharmony_ci * XXX * we might be able to use the __ variants when 18962306a36Sopenharmony_ci * we've already serialized at a higher level. 19062306a36Sopenharmony_ci */ 19162306a36Sopenharmony_ci skb_queue_head_init(&tinc->ti_skb_list); 19262306a36Sopenharmony_ci } 19362306a36Sopenharmony_ci 19462306a36Sopenharmony_ci if (left && tc->t_tinc_hdr_rem) { 19562306a36Sopenharmony_ci to_copy = min(tc->t_tinc_hdr_rem, left); 19662306a36Sopenharmony_ci rdsdebug("copying %zu header from skb %p\n", to_copy, 19762306a36Sopenharmony_ci skb); 19862306a36Sopenharmony_ci skb_copy_bits(skb, offset, 19962306a36Sopenharmony_ci (char *)&tinc->ti_inc.i_hdr + 20062306a36Sopenharmony_ci sizeof(struct rds_header) - 20162306a36Sopenharmony_ci tc->t_tinc_hdr_rem, 20262306a36Sopenharmony_ci to_copy); 20362306a36Sopenharmony_ci tc->t_tinc_hdr_rem -= to_copy; 20462306a36Sopenharmony_ci left -= to_copy; 20562306a36Sopenharmony_ci offset += to_copy; 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci if (tc->t_tinc_hdr_rem == 0) { 20862306a36Sopenharmony_ci /* could be 0 for a 0 len message */ 20962306a36Sopenharmony_ci tc->t_tinc_data_rem = 21062306a36Sopenharmony_ci be32_to_cpu(tinc->ti_inc.i_hdr.h_len); 21162306a36Sopenharmony_ci tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] = 21262306a36Sopenharmony_ci local_clock(); 21362306a36Sopenharmony_ci } 21462306a36Sopenharmony_ci } 21562306a36Sopenharmony_ci 21662306a36Sopenharmony_ci if (left && tc->t_tinc_data_rem) { 21762306a36Sopenharmony_ci to_copy = min(tc->t_tinc_data_rem, left); 21862306a36Sopenharmony_ci 21962306a36Sopenharmony_ci clone = pskb_extract(skb, offset, to_copy, arg->gfp); 22062306a36Sopenharmony_ci if (!clone) { 22162306a36Sopenharmony_ci desc->error = -ENOMEM; 22262306a36Sopenharmony_ci goto out; 22362306a36Sopenharmony_ci } 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci skb_queue_tail(&tinc->ti_skb_list, clone); 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_ci rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " 22862306a36Sopenharmony_ci "clone %p data %p len %d\n", 22962306a36Sopenharmony_ci skb, skb->data, skb->len, offset, to_copy, 23062306a36Sopenharmony_ci clone, clone->data, clone->len); 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ci tc->t_tinc_data_rem -= to_copy; 23362306a36Sopenharmony_ci left -= to_copy; 23462306a36Sopenharmony_ci offset += to_copy; 23562306a36Sopenharmony_ci } 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_ci if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { 23862306a36Sopenharmony_ci struct rds_connection *conn = cp->cp_conn; 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_ci if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 24162306a36Sopenharmony_ci rds_tcp_cong_recv(conn, tinc); 24262306a36Sopenharmony_ci else 24362306a36Sopenharmony_ci rds_recv_incoming(conn, &conn->c_faddr, 24462306a36Sopenharmony_ci &conn->c_laddr, 24562306a36Sopenharmony_ci &tinc->ti_inc, 24662306a36Sopenharmony_ci arg->gfp); 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_ci tc->t_tinc_hdr_rem = sizeof(struct rds_header); 24962306a36Sopenharmony_ci tc->t_tinc_data_rem = 0; 25062306a36Sopenharmony_ci tc->t_tinc = NULL; 25162306a36Sopenharmony_ci rds_inc_put(&tinc->ti_inc); 25262306a36Sopenharmony_ci tinc = NULL; 25362306a36Sopenharmony_ci } 25462306a36Sopenharmony_ci } 25562306a36Sopenharmony_ciout: 25662306a36Sopenharmony_ci rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", 25762306a36Sopenharmony_ci len, left, skb->len, 25862306a36Sopenharmony_ci skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); 25962306a36Sopenharmony_ci return len - left; 26062306a36Sopenharmony_ci} 26162306a36Sopenharmony_ci 26262306a36Sopenharmony_ci/* the caller has to hold the sock lock */ 26362306a36Sopenharmony_cistatic int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp) 26462306a36Sopenharmony_ci{ 26562306a36Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 26662306a36Sopenharmony_ci struct socket *sock = tc->t_sock; 26762306a36Sopenharmony_ci read_descriptor_t desc; 26862306a36Sopenharmony_ci struct rds_tcp_desc_arg arg; 26962306a36Sopenharmony_ci 27062306a36Sopenharmony_ci /* It's like glib in the kernel! */ 27162306a36Sopenharmony_ci arg.conn_path = cp; 27262306a36Sopenharmony_ci arg.gfp = gfp; 27362306a36Sopenharmony_ci desc.arg.data = &arg; 27462306a36Sopenharmony_ci desc.error = 0; 27562306a36Sopenharmony_ci desc.count = 1; /* give more than one skb per call */ 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_ci tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); 27862306a36Sopenharmony_ci rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, 27962306a36Sopenharmony_ci desc.error); 28062306a36Sopenharmony_ci 28162306a36Sopenharmony_ci return desc.error; 28262306a36Sopenharmony_ci} 28362306a36Sopenharmony_ci 28462306a36Sopenharmony_ci/* 28562306a36Sopenharmony_ci * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from 28662306a36Sopenharmony_ci * data_ready. 28762306a36Sopenharmony_ci * 28862306a36Sopenharmony_ci * if we fail to allocate we're in trouble.. blindly wait some time before 28962306a36Sopenharmony_ci * trying again to see if the VM can free up something for us. 29062306a36Sopenharmony_ci */ 29162306a36Sopenharmony_ciint rds_tcp_recv_path(struct rds_conn_path *cp) 29262306a36Sopenharmony_ci{ 29362306a36Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 29462306a36Sopenharmony_ci struct socket *sock = tc->t_sock; 29562306a36Sopenharmony_ci int ret = 0; 29662306a36Sopenharmony_ci 29762306a36Sopenharmony_ci rdsdebug("recv worker path [%d] tc %p sock %p\n", 29862306a36Sopenharmony_ci cp->cp_index, tc, sock); 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_ci lock_sock(sock->sk); 30162306a36Sopenharmony_ci ret = rds_tcp_read_sock(cp, GFP_KERNEL); 30262306a36Sopenharmony_ci release_sock(sock->sk); 30362306a36Sopenharmony_ci 30462306a36Sopenharmony_ci return ret; 30562306a36Sopenharmony_ci} 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_civoid rds_tcp_data_ready(struct sock *sk) 30862306a36Sopenharmony_ci{ 30962306a36Sopenharmony_ci void (*ready)(struct sock *sk); 31062306a36Sopenharmony_ci struct rds_conn_path *cp; 31162306a36Sopenharmony_ci struct rds_tcp_connection *tc; 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_ci trace_sk_data_ready(sk); 31462306a36Sopenharmony_ci rdsdebug("data ready sk %p\n", sk); 31562306a36Sopenharmony_ci 31662306a36Sopenharmony_ci read_lock_bh(&sk->sk_callback_lock); 31762306a36Sopenharmony_ci cp = sk->sk_user_data; 31862306a36Sopenharmony_ci if (!cp) { /* check for teardown race */ 31962306a36Sopenharmony_ci ready = sk->sk_data_ready; 32062306a36Sopenharmony_ci goto out; 32162306a36Sopenharmony_ci } 32262306a36Sopenharmony_ci 32362306a36Sopenharmony_ci tc = cp->cp_transport_data; 32462306a36Sopenharmony_ci ready = tc->t_orig_data_ready; 32562306a36Sopenharmony_ci rds_tcp_stats_inc(s_tcp_data_ready_calls); 32662306a36Sopenharmony_ci 32762306a36Sopenharmony_ci if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) { 32862306a36Sopenharmony_ci rcu_read_lock(); 32962306a36Sopenharmony_ci if (!rds_destroy_pending(cp->cp_conn)) 33062306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); 33162306a36Sopenharmony_ci rcu_read_unlock(); 33262306a36Sopenharmony_ci } 33362306a36Sopenharmony_ciout: 33462306a36Sopenharmony_ci read_unlock_bh(&sk->sk_callback_lock); 33562306a36Sopenharmony_ci ready(sk); 33662306a36Sopenharmony_ci} 33762306a36Sopenharmony_ci 33862306a36Sopenharmony_ciint rds_tcp_recv_init(void) 33962306a36Sopenharmony_ci{ 34062306a36Sopenharmony_ci rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", 34162306a36Sopenharmony_ci sizeof(struct rds_tcp_incoming), 34262306a36Sopenharmony_ci 0, 0, NULL); 34362306a36Sopenharmony_ci if (!rds_tcp_incoming_slab) 34462306a36Sopenharmony_ci return -ENOMEM; 34562306a36Sopenharmony_ci return 0; 34662306a36Sopenharmony_ci} 34762306a36Sopenharmony_ci 34862306a36Sopenharmony_civoid rds_tcp_recv_exit(void) 34962306a36Sopenharmony_ci{ 35062306a36Sopenharmony_ci kmem_cache_destroy(rds_tcp_incoming_slab); 35162306a36Sopenharmony_ci} 352