162306a36Sopenharmony_ci/*
262306a36Sopenharmony_ci * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * This software is available to you under a choice of one of two
562306a36Sopenharmony_ci * licenses.  You may choose to be licensed under the terms of the GNU
662306a36Sopenharmony_ci * General Public License (GPL) Version 2, available from the file
762306a36Sopenharmony_ci * COPYING in the main directory of this source tree, or the
862306a36Sopenharmony_ci * OpenIB.org BSD license below:
962306a36Sopenharmony_ci *
1062306a36Sopenharmony_ci *     Redistribution and use in source and binary forms, with or
1162306a36Sopenharmony_ci *     without modification, are permitted provided that the following
1262306a36Sopenharmony_ci *     conditions are met:
1362306a36Sopenharmony_ci *
1462306a36Sopenharmony_ci *      - Redistributions of source code must retain the above
1562306a36Sopenharmony_ci *        copyright notice, this list of conditions and the following
1662306a36Sopenharmony_ci *        disclaimer.
1762306a36Sopenharmony_ci *
1862306a36Sopenharmony_ci *      - Redistributions in binary form must reproduce the above
1962306a36Sopenharmony_ci *        copyright notice, this list of conditions and the following
2062306a36Sopenharmony_ci *        disclaimer in the documentation and/or other materials
2162306a36Sopenharmony_ci *        provided with the distribution.
2262306a36Sopenharmony_ci *
2362306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2462306a36Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
2562306a36Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2662306a36Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
2762306a36Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
2862306a36Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2962306a36Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3062306a36Sopenharmony_ci * SOFTWARE.
3162306a36Sopenharmony_ci *
3262306a36Sopenharmony_ci */
3362306a36Sopenharmony_ci#include <linux/kernel.h>
3462306a36Sopenharmony_ci#include <linux/slab.h>
3562306a36Sopenharmony_ci#include <net/sock.h>
3662306a36Sopenharmony_ci#include <linux/in.h>
3762306a36Sopenharmony_ci#include <linux/export.h>
3862306a36Sopenharmony_ci#include <linux/sched/clock.h>
3962306a36Sopenharmony_ci#include <linux/time.h>
4062306a36Sopenharmony_ci#include <linux/rds.h>
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_ci#include "rds.h"
4362306a36Sopenharmony_ci
4462306a36Sopenharmony_civoid rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
4562306a36Sopenharmony_ci		 struct in6_addr *saddr)
4662306a36Sopenharmony_ci{
4762306a36Sopenharmony_ci	refcount_set(&inc->i_refcount, 1);
4862306a36Sopenharmony_ci	INIT_LIST_HEAD(&inc->i_item);
4962306a36Sopenharmony_ci	inc->i_conn = conn;
5062306a36Sopenharmony_ci	inc->i_saddr = *saddr;
5162306a36Sopenharmony_ci	inc->i_usercopy.rdma_cookie = 0;
5262306a36Sopenharmony_ci	inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_ci	memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
5562306a36Sopenharmony_ci}
5662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_inc_init);
5762306a36Sopenharmony_ci
5862306a36Sopenharmony_civoid rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
5962306a36Sopenharmony_ci		       struct in6_addr  *saddr)
6062306a36Sopenharmony_ci{
6162306a36Sopenharmony_ci	refcount_set(&inc->i_refcount, 1);
6262306a36Sopenharmony_ci	INIT_LIST_HEAD(&inc->i_item);
6362306a36Sopenharmony_ci	inc->i_conn = cp->cp_conn;
6462306a36Sopenharmony_ci	inc->i_conn_path = cp;
6562306a36Sopenharmony_ci	inc->i_saddr = *saddr;
6662306a36Sopenharmony_ci	inc->i_usercopy.rdma_cookie = 0;
6762306a36Sopenharmony_ci	inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
6862306a36Sopenharmony_ci}
6962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_inc_path_init);
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_cistatic void rds_inc_addref(struct rds_incoming *inc)
7262306a36Sopenharmony_ci{
7362306a36Sopenharmony_ci	rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
7462306a36Sopenharmony_ci	refcount_inc(&inc->i_refcount);
7562306a36Sopenharmony_ci}
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_civoid rds_inc_put(struct rds_incoming *inc)
7862306a36Sopenharmony_ci{
7962306a36Sopenharmony_ci	rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
8062306a36Sopenharmony_ci	if (refcount_dec_and_test(&inc->i_refcount)) {
8162306a36Sopenharmony_ci		BUG_ON(!list_empty(&inc->i_item));
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci		inc->i_conn->c_trans->inc_free(inc);
8462306a36Sopenharmony_ci	}
8562306a36Sopenharmony_ci}
8662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_inc_put);
8762306a36Sopenharmony_ci
8862306a36Sopenharmony_cistatic void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
8962306a36Sopenharmony_ci				  struct rds_cong_map *map,
9062306a36Sopenharmony_ci				  int delta, __be16 port)
9162306a36Sopenharmony_ci{
9262306a36Sopenharmony_ci	int now_congested;
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci	if (delta == 0)
9562306a36Sopenharmony_ci		return;
9662306a36Sopenharmony_ci
9762306a36Sopenharmony_ci	rs->rs_rcv_bytes += delta;
9862306a36Sopenharmony_ci	if (delta > 0)
9962306a36Sopenharmony_ci		rds_stats_add(s_recv_bytes_added_to_socket, delta);
10062306a36Sopenharmony_ci	else
10162306a36Sopenharmony_ci		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
10262306a36Sopenharmony_ci
10362306a36Sopenharmony_ci	/* loop transport doesn't send/recv congestion updates */
10462306a36Sopenharmony_ci	if (rs->rs_transport->t_type == RDS_TRANS_LOOP)
10562306a36Sopenharmony_ci		return;
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci	rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
11062306a36Sopenharmony_ci	  "now_cong %d delta %d\n",
11162306a36Sopenharmony_ci	  rs, &rs->rs_bound_addr,
11262306a36Sopenharmony_ci	  ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
11362306a36Sopenharmony_ci	  rds_sk_rcvbuf(rs), now_congested, delta);
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci	/* wasn't -> am congested */
11662306a36Sopenharmony_ci	if (!rs->rs_congested && now_congested) {
11762306a36Sopenharmony_ci		rs->rs_congested = 1;
11862306a36Sopenharmony_ci		rds_cong_set_bit(map, port);
11962306a36Sopenharmony_ci		rds_cong_queue_updates(map);
12062306a36Sopenharmony_ci	}
12162306a36Sopenharmony_ci	/* was -> aren't congested */
12262306a36Sopenharmony_ci	/* Require more free space before reporting uncongested to prevent
12362306a36Sopenharmony_ci	   bouncing cong/uncong state too often */
12462306a36Sopenharmony_ci	else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
12562306a36Sopenharmony_ci		rs->rs_congested = 0;
12662306a36Sopenharmony_ci		rds_cong_clear_bit(map, port);
12762306a36Sopenharmony_ci		rds_cong_queue_updates(map);
12862306a36Sopenharmony_ci	}
12962306a36Sopenharmony_ci
13062306a36Sopenharmony_ci	/* do nothing if no change in cong state */
13162306a36Sopenharmony_ci}
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_cistatic void rds_conn_peer_gen_update(struct rds_connection *conn,
13462306a36Sopenharmony_ci				     u32 peer_gen_num)
13562306a36Sopenharmony_ci{
13662306a36Sopenharmony_ci	int i;
13762306a36Sopenharmony_ci	struct rds_message *rm, *tmp;
13862306a36Sopenharmony_ci	unsigned long flags;
13962306a36Sopenharmony_ci
14062306a36Sopenharmony_ci	WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
14162306a36Sopenharmony_ci	if (peer_gen_num != 0) {
14262306a36Sopenharmony_ci		if (conn->c_peer_gen_num != 0 &&
14362306a36Sopenharmony_ci		    peer_gen_num != conn->c_peer_gen_num) {
14462306a36Sopenharmony_ci			for (i = 0; i < RDS_MPATH_WORKERS; i++) {
14562306a36Sopenharmony_ci				struct rds_conn_path *cp;
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci				cp = &conn->c_path[i];
14862306a36Sopenharmony_ci				spin_lock_irqsave(&cp->cp_lock, flags);
14962306a36Sopenharmony_ci				cp->cp_next_tx_seq = 1;
15062306a36Sopenharmony_ci				cp->cp_next_rx_seq = 0;
15162306a36Sopenharmony_ci				list_for_each_entry_safe(rm, tmp,
15262306a36Sopenharmony_ci							 &cp->cp_retrans,
15362306a36Sopenharmony_ci							 m_conn_item) {
15462306a36Sopenharmony_ci					set_bit(RDS_MSG_FLUSH, &rm->m_flags);
15562306a36Sopenharmony_ci				}
15662306a36Sopenharmony_ci				spin_unlock_irqrestore(&cp->cp_lock, flags);
15762306a36Sopenharmony_ci			}
15862306a36Sopenharmony_ci		}
15962306a36Sopenharmony_ci		conn->c_peer_gen_num = peer_gen_num;
16062306a36Sopenharmony_ci	}
16162306a36Sopenharmony_ci}
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci/*
16462306a36Sopenharmony_ci * Process all extension headers that come with this message.
16562306a36Sopenharmony_ci */
16662306a36Sopenharmony_cistatic void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
16762306a36Sopenharmony_ci{
16862306a36Sopenharmony_ci	struct rds_header *hdr = &inc->i_hdr;
16962306a36Sopenharmony_ci	unsigned int pos = 0, type, len;
17062306a36Sopenharmony_ci	union {
17162306a36Sopenharmony_ci		struct rds_ext_header_version version;
17262306a36Sopenharmony_ci		struct rds_ext_header_rdma rdma;
17362306a36Sopenharmony_ci		struct rds_ext_header_rdma_dest rdma_dest;
17462306a36Sopenharmony_ci	} buffer;
17562306a36Sopenharmony_ci
17662306a36Sopenharmony_ci	while (1) {
17762306a36Sopenharmony_ci		len = sizeof(buffer);
17862306a36Sopenharmony_ci		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
17962306a36Sopenharmony_ci		if (type == RDS_EXTHDR_NONE)
18062306a36Sopenharmony_ci			break;
18162306a36Sopenharmony_ci		/* Process extension header here */
18262306a36Sopenharmony_ci		switch (type) {
18362306a36Sopenharmony_ci		case RDS_EXTHDR_RDMA:
18462306a36Sopenharmony_ci			rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
18562306a36Sopenharmony_ci			break;
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ci		case RDS_EXTHDR_RDMA_DEST:
18862306a36Sopenharmony_ci			/* We ignore the size for now. We could stash it
18962306a36Sopenharmony_ci			 * somewhere and use it for error checking. */
19062306a36Sopenharmony_ci			inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie(
19162306a36Sopenharmony_ci					be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
19262306a36Sopenharmony_ci					be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_ci			break;
19562306a36Sopenharmony_ci		}
19662306a36Sopenharmony_ci	}
19762306a36Sopenharmony_ci}
19862306a36Sopenharmony_ci
19962306a36Sopenharmony_cistatic void rds_recv_hs_exthdrs(struct rds_header *hdr,
20062306a36Sopenharmony_ci				struct rds_connection *conn)
20162306a36Sopenharmony_ci{
20262306a36Sopenharmony_ci	unsigned int pos = 0, type, len;
20362306a36Sopenharmony_ci	union {
20462306a36Sopenharmony_ci		struct rds_ext_header_version version;
20562306a36Sopenharmony_ci		u16 rds_npaths;
20662306a36Sopenharmony_ci		u32 rds_gen_num;
20762306a36Sopenharmony_ci	} buffer;
20862306a36Sopenharmony_ci	u32 new_peer_gen_num = 0;
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci	while (1) {
21162306a36Sopenharmony_ci		len = sizeof(buffer);
21262306a36Sopenharmony_ci		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
21362306a36Sopenharmony_ci		if (type == RDS_EXTHDR_NONE)
21462306a36Sopenharmony_ci			break;
21562306a36Sopenharmony_ci		/* Process extension header here */
21662306a36Sopenharmony_ci		switch (type) {
21762306a36Sopenharmony_ci		case RDS_EXTHDR_NPATHS:
21862306a36Sopenharmony_ci			conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
21962306a36Sopenharmony_ci					       be16_to_cpu(buffer.rds_npaths));
22062306a36Sopenharmony_ci			break;
22162306a36Sopenharmony_ci		case RDS_EXTHDR_GEN_NUM:
22262306a36Sopenharmony_ci			new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
22362306a36Sopenharmony_ci			break;
22462306a36Sopenharmony_ci		default:
22562306a36Sopenharmony_ci			pr_warn_ratelimited("ignoring unknown exthdr type "
22662306a36Sopenharmony_ci					     "0x%x\n", type);
22762306a36Sopenharmony_ci		}
22862306a36Sopenharmony_ci	}
22962306a36Sopenharmony_ci	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
23062306a36Sopenharmony_ci	conn->c_npaths = max_t(int, conn->c_npaths, 1);
23162306a36Sopenharmony_ci	conn->c_ping_triggered = 0;
23262306a36Sopenharmony_ci	rds_conn_peer_gen_update(conn, new_peer_gen_num);
23362306a36Sopenharmony_ci}
23462306a36Sopenharmony_ci
23562306a36Sopenharmony_ci/* rds_start_mprds() will synchronously start multiple paths when appropriate.
23662306a36Sopenharmony_ci * The scheme is based on the following rules:
23762306a36Sopenharmony_ci *
23862306a36Sopenharmony_ci * 1. rds_sendmsg on first connect attempt sends the probe ping, with the
23962306a36Sopenharmony_ci *    sender's npaths (s_npaths)
24062306a36Sopenharmony_ci * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It
24162306a36Sopenharmony_ci *    sends back a probe-pong with r_npaths. After that, if rcvr is the
24262306a36Sopenharmony_ci *    smaller ip addr, it starts rds_conn_path_connect_if_down on all
24362306a36Sopenharmony_ci *    mprds_paths.
24462306a36Sopenharmony_ci * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down.
24562306a36Sopenharmony_ci *    If it is the smaller ipaddr, rds_conn_path_connect_if_down can be
24662306a36Sopenharmony_ci *    called after reception of the probe-pong on all mprds_paths.
24762306a36Sopenharmony_ci *    Otherwise (sender of probe-ping is not the smaller ip addr): just call
24862306a36Sopenharmony_ci *    rds_conn_path_connect_if_down on the hashed path. (see rule 4)
24962306a36Sopenharmony_ci * 4. rds_connect_worker must only trigger a connection if laddr < faddr.
25062306a36Sopenharmony_ci * 5. sender may end up queuing the packet on the cp. will get sent out later.
25162306a36Sopenharmony_ci *    when connection is completed.
25262306a36Sopenharmony_ci */
25362306a36Sopenharmony_cistatic void rds_start_mprds(struct rds_connection *conn)
25462306a36Sopenharmony_ci{
25562306a36Sopenharmony_ci	int i;
25662306a36Sopenharmony_ci	struct rds_conn_path *cp;
25762306a36Sopenharmony_ci
25862306a36Sopenharmony_ci	if (conn->c_npaths > 1 &&
25962306a36Sopenharmony_ci	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
26062306a36Sopenharmony_ci		for (i = 0; i < conn->c_npaths; i++) {
26162306a36Sopenharmony_ci			cp = &conn->c_path[i];
26262306a36Sopenharmony_ci			rds_conn_path_connect_if_down(cp);
26362306a36Sopenharmony_ci		}
26462306a36Sopenharmony_ci	}
26562306a36Sopenharmony_ci}
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_ci/*
26862306a36Sopenharmony_ci * The transport must make sure that this is serialized against other
26962306a36Sopenharmony_ci * rx and conn reset on this specific conn.
27062306a36Sopenharmony_ci *
27162306a36Sopenharmony_ci * We currently assert that only one fragmented message will be sent
27262306a36Sopenharmony_ci * down a connection at a time.  This lets us reassemble in the conn
27362306a36Sopenharmony_ci * instead of per-flow which means that we don't have to go digging through
27462306a36Sopenharmony_ci * flows to tear down partial reassembly progress on conn failure and
27562306a36Sopenharmony_ci * we save flow lookup and locking for each frag arrival.  It does mean
27662306a36Sopenharmony_ci * that small messages will wait behind large ones.  Fragmenting at all
27762306a36Sopenharmony_ci * is only to reduce the memory consumption of pre-posted buffers.
27862306a36Sopenharmony_ci *
27962306a36Sopenharmony_ci * The caller passes in saddr and daddr instead of us getting it from the
28062306a36Sopenharmony_ci * conn.  This lets loopback, who only has one conn for both directions,
28162306a36Sopenharmony_ci * tell us which roles the addrs in the conn are playing for this message.
28262306a36Sopenharmony_ci */
28362306a36Sopenharmony_civoid rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
28462306a36Sopenharmony_ci		       struct in6_addr *daddr,
28562306a36Sopenharmony_ci		       struct rds_incoming *inc, gfp_t gfp)
28662306a36Sopenharmony_ci{
28762306a36Sopenharmony_ci	struct rds_sock *rs = NULL;
28862306a36Sopenharmony_ci	struct sock *sk;
28962306a36Sopenharmony_ci	unsigned long flags;
29062306a36Sopenharmony_ci	struct rds_conn_path *cp;
29162306a36Sopenharmony_ci
29262306a36Sopenharmony_ci	inc->i_conn = conn;
29362306a36Sopenharmony_ci	inc->i_rx_jiffies = jiffies;
29462306a36Sopenharmony_ci	if (conn->c_trans->t_mp_capable)
29562306a36Sopenharmony_ci		cp = inc->i_conn_path;
29662306a36Sopenharmony_ci	else
29762306a36Sopenharmony_ci		cp = &conn->c_path[0];
29862306a36Sopenharmony_ci
29962306a36Sopenharmony_ci	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
30062306a36Sopenharmony_ci		 "flags 0x%x rx_jiffies %lu\n", conn,
30162306a36Sopenharmony_ci		 (unsigned long long)cp->cp_next_rx_seq,
30262306a36Sopenharmony_ci		 inc,
30362306a36Sopenharmony_ci		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
30462306a36Sopenharmony_ci		 be32_to_cpu(inc->i_hdr.h_len),
30562306a36Sopenharmony_ci		 be16_to_cpu(inc->i_hdr.h_sport),
30662306a36Sopenharmony_ci		 be16_to_cpu(inc->i_hdr.h_dport),
30762306a36Sopenharmony_ci		 inc->i_hdr.h_flags,
30862306a36Sopenharmony_ci		 inc->i_rx_jiffies);
30962306a36Sopenharmony_ci
31062306a36Sopenharmony_ci	/*
31162306a36Sopenharmony_ci	 * Sequence numbers should only increase.  Messages get their
31262306a36Sopenharmony_ci	 * sequence number as they're queued in a sending conn.  They
31362306a36Sopenharmony_ci	 * can be dropped, though, if the sending socket is closed before
31462306a36Sopenharmony_ci	 * they hit the wire.  So sequence numbers can skip forward
31562306a36Sopenharmony_ci	 * under normal operation.  They can also drop back in the conn
31662306a36Sopenharmony_ci	 * failover case as previously sent messages are resent down the
31762306a36Sopenharmony_ci	 * new instance of a conn.  We drop those, otherwise we have
31862306a36Sopenharmony_ci	 * to assume that the next valid seq does not come after a
31962306a36Sopenharmony_ci	 * hole in the fragment stream.
32062306a36Sopenharmony_ci	 *
32162306a36Sopenharmony_ci	 * The headers don't give us a way to realize if fragments of
32262306a36Sopenharmony_ci	 * a message have been dropped.  We assume that frags that arrive
32362306a36Sopenharmony_ci	 * to a flow are part of the current message on the flow that is
32462306a36Sopenharmony_ci	 * being reassembled.  This means that senders can't drop messages
32562306a36Sopenharmony_ci	 * from the sending conn until all their frags are sent.
32662306a36Sopenharmony_ci	 *
32762306a36Sopenharmony_ci	 * XXX we could spend more on the wire to get more robust failure
32862306a36Sopenharmony_ci	 * detection, arguably worth it to avoid data corruption.
32962306a36Sopenharmony_ci	 */
33062306a36Sopenharmony_ci	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
33162306a36Sopenharmony_ci	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
33262306a36Sopenharmony_ci		rds_stats_inc(s_recv_drop_old_seq);
33362306a36Sopenharmony_ci		goto out;
33462306a36Sopenharmony_ci	}
33562306a36Sopenharmony_ci	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
33862306a36Sopenharmony_ci		if (inc->i_hdr.h_sport == 0) {
33962306a36Sopenharmony_ci			rdsdebug("ignore ping with 0 sport from %pI6c\n",
34062306a36Sopenharmony_ci				 saddr);
34162306a36Sopenharmony_ci			goto out;
34262306a36Sopenharmony_ci		}
34362306a36Sopenharmony_ci		rds_stats_inc(s_recv_ping);
34462306a36Sopenharmony_ci		rds_send_pong(cp, inc->i_hdr.h_sport);
34562306a36Sopenharmony_ci		/* if this is a handshake ping, start multipath if necessary */
34662306a36Sopenharmony_ci		if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport),
34762306a36Sopenharmony_ci				 be16_to_cpu(inc->i_hdr.h_dport))) {
34862306a36Sopenharmony_ci			rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
34962306a36Sopenharmony_ci			rds_start_mprds(cp->cp_conn);
35062306a36Sopenharmony_ci		}
35162306a36Sopenharmony_ci		goto out;
35262306a36Sopenharmony_ci	}
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci	if (be16_to_cpu(inc->i_hdr.h_dport) ==  RDS_FLAG_PROBE_PORT &&
35562306a36Sopenharmony_ci	    inc->i_hdr.h_sport == 0) {
35662306a36Sopenharmony_ci		rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
35762306a36Sopenharmony_ci		/* if this is a handshake pong, start multipath if necessary */
35862306a36Sopenharmony_ci		rds_start_mprds(cp->cp_conn);
35962306a36Sopenharmony_ci		wake_up(&cp->cp_conn->c_hs_waitq);
36062306a36Sopenharmony_ci		goto out;
36162306a36Sopenharmony_ci	}
36262306a36Sopenharmony_ci
36362306a36Sopenharmony_ci	rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
36462306a36Sopenharmony_ci	if (!rs) {
36562306a36Sopenharmony_ci		rds_stats_inc(s_recv_drop_no_sock);
36662306a36Sopenharmony_ci		goto out;
36762306a36Sopenharmony_ci	}
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci	/* Process extension headers */
37062306a36Sopenharmony_ci	rds_recv_incoming_exthdrs(inc, rs);
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	/* We can be racing with rds_release() which marks the socket dead. */
37362306a36Sopenharmony_ci	sk = rds_rs_to_sk(rs);
37462306a36Sopenharmony_ci
37562306a36Sopenharmony_ci	/* serialize with rds_release -> sock_orphan */
37662306a36Sopenharmony_ci	write_lock_irqsave(&rs->rs_recv_lock, flags);
37762306a36Sopenharmony_ci	if (!sock_flag(sk, SOCK_DEAD)) {
37862306a36Sopenharmony_ci		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
37962306a36Sopenharmony_ci		rds_stats_inc(s_recv_queued);
38062306a36Sopenharmony_ci		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
38162306a36Sopenharmony_ci				      be32_to_cpu(inc->i_hdr.h_len),
38262306a36Sopenharmony_ci				      inc->i_hdr.h_dport);
38362306a36Sopenharmony_ci		if (sock_flag(sk, SOCK_RCVTSTAMP))
38462306a36Sopenharmony_ci			inc->i_usercopy.rx_tstamp = ktime_get_real();
38562306a36Sopenharmony_ci		rds_inc_addref(inc);
38662306a36Sopenharmony_ci		inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
38762306a36Sopenharmony_ci		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
38862306a36Sopenharmony_ci		__rds_wake_sk_sleep(sk);
38962306a36Sopenharmony_ci	} else {
39062306a36Sopenharmony_ci		rds_stats_inc(s_recv_drop_dead_sock);
39162306a36Sopenharmony_ci	}
39262306a36Sopenharmony_ci	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
39362306a36Sopenharmony_ci
39462306a36Sopenharmony_ciout:
39562306a36Sopenharmony_ci	if (rs)
39662306a36Sopenharmony_ci		rds_sock_put(rs);
39762306a36Sopenharmony_ci}
39862306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_recv_incoming);
39962306a36Sopenharmony_ci
40062306a36Sopenharmony_ci/*
40162306a36Sopenharmony_ci * be very careful here.  This is being called as the condition in
40262306a36Sopenharmony_ci * wait_event_*() needs to cope with being called many times.
40362306a36Sopenharmony_ci */
40462306a36Sopenharmony_cistatic int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
40562306a36Sopenharmony_ci{
40662306a36Sopenharmony_ci	unsigned long flags;
40762306a36Sopenharmony_ci
40862306a36Sopenharmony_ci	if (!*inc) {
40962306a36Sopenharmony_ci		read_lock_irqsave(&rs->rs_recv_lock, flags);
41062306a36Sopenharmony_ci		if (!list_empty(&rs->rs_recv_queue)) {
41162306a36Sopenharmony_ci			*inc = list_entry(rs->rs_recv_queue.next,
41262306a36Sopenharmony_ci					  struct rds_incoming,
41362306a36Sopenharmony_ci					  i_item);
41462306a36Sopenharmony_ci			rds_inc_addref(*inc);
41562306a36Sopenharmony_ci		}
41662306a36Sopenharmony_ci		read_unlock_irqrestore(&rs->rs_recv_lock, flags);
41762306a36Sopenharmony_ci	}
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci	return *inc != NULL;
42062306a36Sopenharmony_ci}
42162306a36Sopenharmony_ci
42262306a36Sopenharmony_cistatic int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
42362306a36Sopenharmony_ci			    int drop)
42462306a36Sopenharmony_ci{
42562306a36Sopenharmony_ci	struct sock *sk = rds_rs_to_sk(rs);
42662306a36Sopenharmony_ci	int ret = 0;
42762306a36Sopenharmony_ci	unsigned long flags;
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	write_lock_irqsave(&rs->rs_recv_lock, flags);
43062306a36Sopenharmony_ci	if (!list_empty(&inc->i_item)) {
43162306a36Sopenharmony_ci		ret = 1;
43262306a36Sopenharmony_ci		if (drop) {
43362306a36Sopenharmony_ci			/* XXX make sure this i_conn is reliable */
43462306a36Sopenharmony_ci			rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
43562306a36Sopenharmony_ci					      -be32_to_cpu(inc->i_hdr.h_len),
43662306a36Sopenharmony_ci					      inc->i_hdr.h_dport);
43762306a36Sopenharmony_ci			list_del_init(&inc->i_item);
43862306a36Sopenharmony_ci			rds_inc_put(inc);
43962306a36Sopenharmony_ci		}
44062306a36Sopenharmony_ci	}
44162306a36Sopenharmony_ci	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
44462306a36Sopenharmony_ci	return ret;
44562306a36Sopenharmony_ci}
44662306a36Sopenharmony_ci
44762306a36Sopenharmony_ci/*
44862306a36Sopenharmony_ci * Pull errors off the error queue.
44962306a36Sopenharmony_ci * If msghdr is NULL, we will just purge the error queue.
45062306a36Sopenharmony_ci */
45162306a36Sopenharmony_ciint rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
45262306a36Sopenharmony_ci{
45362306a36Sopenharmony_ci	struct rds_notifier *notifier;
45462306a36Sopenharmony_ci	struct rds_rdma_notify cmsg;
45562306a36Sopenharmony_ci	unsigned int count = 0, max_messages = ~0U;
45662306a36Sopenharmony_ci	unsigned long flags;
45762306a36Sopenharmony_ci	LIST_HEAD(copy);
45862306a36Sopenharmony_ci	int err = 0;
45962306a36Sopenharmony_ci
46062306a36Sopenharmony_ci	memset(&cmsg, 0, sizeof(cmsg));	/* fill holes with zero */
46162306a36Sopenharmony_ci
46262306a36Sopenharmony_ci	/* put_cmsg copies to user space and thus may sleep. We can't do this
46362306a36Sopenharmony_ci	 * with rs_lock held, so first grab as many notifications as we can stuff
46462306a36Sopenharmony_ci	 * in the user provided cmsg buffer. We don't try to copy more, to avoid
46562306a36Sopenharmony_ci	 * losing notifications - except when the buffer is so small that it wouldn't
46662306a36Sopenharmony_ci	 * even hold a single notification. Then we give him as much of this single
46762306a36Sopenharmony_ci	 * msg as we can squeeze in, and set MSG_CTRUNC.
46862306a36Sopenharmony_ci	 */
46962306a36Sopenharmony_ci	if (msghdr) {
47062306a36Sopenharmony_ci		max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
47162306a36Sopenharmony_ci		if (!max_messages)
47262306a36Sopenharmony_ci			max_messages = 1;
47362306a36Sopenharmony_ci	}
47462306a36Sopenharmony_ci
47562306a36Sopenharmony_ci	spin_lock_irqsave(&rs->rs_lock, flags);
47662306a36Sopenharmony_ci	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
47762306a36Sopenharmony_ci		notifier = list_entry(rs->rs_notify_queue.next,
47862306a36Sopenharmony_ci				struct rds_notifier, n_list);
47962306a36Sopenharmony_ci		list_move(&notifier->n_list, &copy);
48062306a36Sopenharmony_ci		count++;
48162306a36Sopenharmony_ci	}
48262306a36Sopenharmony_ci	spin_unlock_irqrestore(&rs->rs_lock, flags);
48362306a36Sopenharmony_ci
48462306a36Sopenharmony_ci	if (!count)
48562306a36Sopenharmony_ci		return 0;
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_ci	while (!list_empty(&copy)) {
48862306a36Sopenharmony_ci		notifier = list_entry(copy.next, struct rds_notifier, n_list);
48962306a36Sopenharmony_ci
49062306a36Sopenharmony_ci		if (msghdr) {
49162306a36Sopenharmony_ci			cmsg.user_token = notifier->n_user_token;
49262306a36Sopenharmony_ci			cmsg.status = notifier->n_status;
49362306a36Sopenharmony_ci
49462306a36Sopenharmony_ci			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
49562306a36Sopenharmony_ci				       sizeof(cmsg), &cmsg);
49662306a36Sopenharmony_ci			if (err)
49762306a36Sopenharmony_ci				break;
49862306a36Sopenharmony_ci		}
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci		list_del_init(&notifier->n_list);
50162306a36Sopenharmony_ci		kfree(notifier);
50262306a36Sopenharmony_ci	}
50362306a36Sopenharmony_ci
50462306a36Sopenharmony_ci	/* If we bailed out because of an error in put_cmsg,
50562306a36Sopenharmony_ci	 * we may be left with one or more notifications that we
50662306a36Sopenharmony_ci	 * didn't process. Return them to the head of the list. */
50762306a36Sopenharmony_ci	if (!list_empty(&copy)) {
50862306a36Sopenharmony_ci		spin_lock_irqsave(&rs->rs_lock, flags);
50962306a36Sopenharmony_ci		list_splice(&copy, &rs->rs_notify_queue);
51062306a36Sopenharmony_ci		spin_unlock_irqrestore(&rs->rs_lock, flags);
51162306a36Sopenharmony_ci	}
51262306a36Sopenharmony_ci
51362306a36Sopenharmony_ci	return err;
51462306a36Sopenharmony_ci}
51562306a36Sopenharmony_ci
51662306a36Sopenharmony_ci/*
51762306a36Sopenharmony_ci * Queue a congestion notification
51862306a36Sopenharmony_ci */
51962306a36Sopenharmony_cistatic int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
52062306a36Sopenharmony_ci{
52162306a36Sopenharmony_ci	uint64_t notify = rs->rs_cong_notify;
52262306a36Sopenharmony_ci	unsigned long flags;
52362306a36Sopenharmony_ci	int err;
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ci	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
52662306a36Sopenharmony_ci			sizeof(notify), &notify);
52762306a36Sopenharmony_ci	if (err)
52862306a36Sopenharmony_ci		return err;
52962306a36Sopenharmony_ci
53062306a36Sopenharmony_ci	spin_lock_irqsave(&rs->rs_lock, flags);
53162306a36Sopenharmony_ci	rs->rs_cong_notify &= ~notify;
53262306a36Sopenharmony_ci	spin_unlock_irqrestore(&rs->rs_lock, flags);
53362306a36Sopenharmony_ci
53462306a36Sopenharmony_ci	return 0;
53562306a36Sopenharmony_ci}
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_ci/*
53862306a36Sopenharmony_ci * Receive any control messages.
53962306a36Sopenharmony_ci */
54062306a36Sopenharmony_cistatic int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
54162306a36Sopenharmony_ci			 struct rds_sock *rs)
54262306a36Sopenharmony_ci{
54362306a36Sopenharmony_ci	int ret = 0;
54462306a36Sopenharmony_ci
54562306a36Sopenharmony_ci	if (inc->i_usercopy.rdma_cookie) {
54662306a36Sopenharmony_ci		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
54762306a36Sopenharmony_ci				sizeof(inc->i_usercopy.rdma_cookie),
54862306a36Sopenharmony_ci				&inc->i_usercopy.rdma_cookie);
54962306a36Sopenharmony_ci		if (ret)
55062306a36Sopenharmony_ci			goto out;
55162306a36Sopenharmony_ci	}
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci	if ((inc->i_usercopy.rx_tstamp != 0) &&
55462306a36Sopenharmony_ci	    sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
55562306a36Sopenharmony_ci		struct __kernel_old_timeval tv =
55662306a36Sopenharmony_ci			ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp);
55762306a36Sopenharmony_ci
55862306a36Sopenharmony_ci		if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) {
55962306a36Sopenharmony_ci			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
56062306a36Sopenharmony_ci				       sizeof(tv), &tv);
56162306a36Sopenharmony_ci		} else {
56262306a36Sopenharmony_ci			struct __kernel_sock_timeval sk_tv;
56362306a36Sopenharmony_ci
56462306a36Sopenharmony_ci			sk_tv.tv_sec = tv.tv_sec;
56562306a36Sopenharmony_ci			sk_tv.tv_usec = tv.tv_usec;
56662306a36Sopenharmony_ci
56762306a36Sopenharmony_ci			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
56862306a36Sopenharmony_ci				       sizeof(sk_tv), &sk_tv);
56962306a36Sopenharmony_ci		}
57062306a36Sopenharmony_ci
57162306a36Sopenharmony_ci		if (ret)
57262306a36Sopenharmony_ci			goto out;
57362306a36Sopenharmony_ci	}
57462306a36Sopenharmony_ci
57562306a36Sopenharmony_ci	if (rs->rs_rx_traces) {
57662306a36Sopenharmony_ci		struct rds_cmsg_rx_trace t;
57762306a36Sopenharmony_ci		int i, j;
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci		memset(&t, 0, sizeof(t));
58062306a36Sopenharmony_ci		inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
58162306a36Sopenharmony_ci		t.rx_traces =  rs->rs_rx_traces;
58262306a36Sopenharmony_ci		for (i = 0; i < rs->rs_rx_traces; i++) {
58362306a36Sopenharmony_ci			j = rs->rs_rx_trace[i];
58462306a36Sopenharmony_ci			t.rx_trace_pos[i] = j;
58562306a36Sopenharmony_ci			t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
58662306a36Sopenharmony_ci					  inc->i_rx_lat_trace[j];
58762306a36Sopenharmony_ci		}
58862306a36Sopenharmony_ci
58962306a36Sopenharmony_ci		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
59062306a36Sopenharmony_ci			       sizeof(t), &t);
59162306a36Sopenharmony_ci		if (ret)
59262306a36Sopenharmony_ci			goto out;
59362306a36Sopenharmony_ci	}
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_ciout:
59662306a36Sopenharmony_ci	return ret;
59762306a36Sopenharmony_ci}
59862306a36Sopenharmony_ci
59962306a36Sopenharmony_cistatic bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
60062306a36Sopenharmony_ci{
60162306a36Sopenharmony_ci	struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue;
60262306a36Sopenharmony_ci	struct rds_msg_zcopy_info *info = NULL;
60362306a36Sopenharmony_ci	struct rds_zcopy_cookies *done;
60462306a36Sopenharmony_ci	unsigned long flags;
60562306a36Sopenharmony_ci
60662306a36Sopenharmony_ci	if (!msg->msg_control)
60762306a36Sopenharmony_ci		return false;
60862306a36Sopenharmony_ci
60962306a36Sopenharmony_ci	if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) ||
61062306a36Sopenharmony_ci	    msg->msg_controllen < CMSG_SPACE(sizeof(*done)))
61162306a36Sopenharmony_ci		return false;
61262306a36Sopenharmony_ci
61362306a36Sopenharmony_ci	spin_lock_irqsave(&q->lock, flags);
61462306a36Sopenharmony_ci	if (!list_empty(&q->zcookie_head)) {
61562306a36Sopenharmony_ci		info = list_entry(q->zcookie_head.next,
61662306a36Sopenharmony_ci				  struct rds_msg_zcopy_info, rs_zcookie_next);
61762306a36Sopenharmony_ci		list_del(&info->rs_zcookie_next);
61862306a36Sopenharmony_ci	}
61962306a36Sopenharmony_ci	spin_unlock_irqrestore(&q->lock, flags);
62062306a36Sopenharmony_ci	if (!info)
62162306a36Sopenharmony_ci		return false;
62262306a36Sopenharmony_ci	done = &info->zcookies;
62362306a36Sopenharmony_ci	if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done),
62462306a36Sopenharmony_ci		     done)) {
62562306a36Sopenharmony_ci		spin_lock_irqsave(&q->lock, flags);
62662306a36Sopenharmony_ci		list_add(&info->rs_zcookie_next, &q->zcookie_head);
62762306a36Sopenharmony_ci		spin_unlock_irqrestore(&q->lock, flags);
62862306a36Sopenharmony_ci		return false;
62962306a36Sopenharmony_ci	}
63062306a36Sopenharmony_ci	kfree(info);
63162306a36Sopenharmony_ci	return true;
63262306a36Sopenharmony_ci}
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ciint rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
63562306a36Sopenharmony_ci		int msg_flags)
63662306a36Sopenharmony_ci{
63762306a36Sopenharmony_ci	struct sock *sk = sock->sk;
63862306a36Sopenharmony_ci	struct rds_sock *rs = rds_sk_to_rs(sk);
63962306a36Sopenharmony_ci	long timeo;
64062306a36Sopenharmony_ci	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
64162306a36Sopenharmony_ci	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
64262306a36Sopenharmony_ci	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
64362306a36Sopenharmony_ci	struct rds_incoming *inc = NULL;
64462306a36Sopenharmony_ci
64562306a36Sopenharmony_ci	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
64662306a36Sopenharmony_ci	timeo = sock_rcvtimeo(sk, nonblock);
64762306a36Sopenharmony_ci
64862306a36Sopenharmony_ci	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
64962306a36Sopenharmony_ci
65062306a36Sopenharmony_ci	if (msg_flags & MSG_OOB)
65162306a36Sopenharmony_ci		goto out;
65262306a36Sopenharmony_ci	if (msg_flags & MSG_ERRQUEUE)
65362306a36Sopenharmony_ci		return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);
65462306a36Sopenharmony_ci
65562306a36Sopenharmony_ci	while (1) {
65662306a36Sopenharmony_ci		/* If there are pending notifications, do those - and nothing else */
65762306a36Sopenharmony_ci		if (!list_empty(&rs->rs_notify_queue)) {
65862306a36Sopenharmony_ci			ret = rds_notify_queue_get(rs, msg);
65962306a36Sopenharmony_ci			break;
66062306a36Sopenharmony_ci		}
66162306a36Sopenharmony_ci
66262306a36Sopenharmony_ci		if (rs->rs_cong_notify) {
66362306a36Sopenharmony_ci			ret = rds_notify_cong(rs, msg);
66462306a36Sopenharmony_ci			break;
66562306a36Sopenharmony_ci		}
66662306a36Sopenharmony_ci
66762306a36Sopenharmony_ci		if (!rds_next_incoming(rs, &inc)) {
66862306a36Sopenharmony_ci			if (nonblock) {
66962306a36Sopenharmony_ci				bool reaped = rds_recvmsg_zcookie(rs, msg);
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci				ret = reaped ?  0 : -EAGAIN;
67262306a36Sopenharmony_ci				break;
67362306a36Sopenharmony_ci			}
67462306a36Sopenharmony_ci
67562306a36Sopenharmony_ci			timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
67662306a36Sopenharmony_ci					(!list_empty(&rs->rs_notify_queue) ||
67762306a36Sopenharmony_ci					 rs->rs_cong_notify ||
67862306a36Sopenharmony_ci					 rds_next_incoming(rs, &inc)), timeo);
67962306a36Sopenharmony_ci			rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
68062306a36Sopenharmony_ci				 timeo);
68162306a36Sopenharmony_ci			if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
68262306a36Sopenharmony_ci				continue;
68362306a36Sopenharmony_ci
68462306a36Sopenharmony_ci			ret = timeo;
68562306a36Sopenharmony_ci			if (ret == 0)
68662306a36Sopenharmony_ci				ret = -ETIMEDOUT;
68762306a36Sopenharmony_ci			break;
68862306a36Sopenharmony_ci		}
68962306a36Sopenharmony_ci
69062306a36Sopenharmony_ci		rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
69162306a36Sopenharmony_ci			 &inc->i_conn->c_faddr,
69262306a36Sopenharmony_ci			 ntohs(inc->i_hdr.h_sport));
69362306a36Sopenharmony_ci		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
69462306a36Sopenharmony_ci		if (ret < 0)
69562306a36Sopenharmony_ci			break;
69662306a36Sopenharmony_ci
69762306a36Sopenharmony_ci		/*
69862306a36Sopenharmony_ci		 * if the message we just copied isn't at the head of the
69962306a36Sopenharmony_ci		 * recv queue then someone else raced us to return it, try
70062306a36Sopenharmony_ci		 * to get the next message.
70162306a36Sopenharmony_ci		 */
70262306a36Sopenharmony_ci		if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
70362306a36Sopenharmony_ci			rds_inc_put(inc);
70462306a36Sopenharmony_ci			inc = NULL;
70562306a36Sopenharmony_ci			rds_stats_inc(s_recv_deliver_raced);
70662306a36Sopenharmony_ci			iov_iter_revert(&msg->msg_iter, ret);
70762306a36Sopenharmony_ci			continue;
70862306a36Sopenharmony_ci		}
70962306a36Sopenharmony_ci
71062306a36Sopenharmony_ci		if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
71162306a36Sopenharmony_ci			if (msg_flags & MSG_TRUNC)
71262306a36Sopenharmony_ci				ret = be32_to_cpu(inc->i_hdr.h_len);
71362306a36Sopenharmony_ci			msg->msg_flags |= MSG_TRUNC;
71462306a36Sopenharmony_ci		}
71562306a36Sopenharmony_ci
71662306a36Sopenharmony_ci		if (rds_cmsg_recv(inc, msg, rs)) {
71762306a36Sopenharmony_ci			ret = -EFAULT;
71862306a36Sopenharmony_ci			break;
71962306a36Sopenharmony_ci		}
72062306a36Sopenharmony_ci		rds_recvmsg_zcookie(rs, msg);
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci		rds_stats_inc(s_recv_delivered);
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci		if (msg->msg_name) {
72562306a36Sopenharmony_ci			if (ipv6_addr_v4mapped(&inc->i_saddr)) {
72662306a36Sopenharmony_ci				sin->sin_family = AF_INET;
72762306a36Sopenharmony_ci				sin->sin_port = inc->i_hdr.h_sport;
72862306a36Sopenharmony_ci				sin->sin_addr.s_addr =
72962306a36Sopenharmony_ci				    inc->i_saddr.s6_addr32[3];
73062306a36Sopenharmony_ci				memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
73162306a36Sopenharmony_ci				msg->msg_namelen = sizeof(*sin);
73262306a36Sopenharmony_ci			} else {
73362306a36Sopenharmony_ci				sin6->sin6_family = AF_INET6;
73462306a36Sopenharmony_ci				sin6->sin6_port = inc->i_hdr.h_sport;
73562306a36Sopenharmony_ci				sin6->sin6_addr = inc->i_saddr;
73662306a36Sopenharmony_ci				sin6->sin6_flowinfo = 0;
73762306a36Sopenharmony_ci				sin6->sin6_scope_id = rs->rs_bound_scope_id;
73862306a36Sopenharmony_ci				msg->msg_namelen = sizeof(*sin6);
73962306a36Sopenharmony_ci			}
74062306a36Sopenharmony_ci		}
74162306a36Sopenharmony_ci		break;
74262306a36Sopenharmony_ci	}
74362306a36Sopenharmony_ci
74462306a36Sopenharmony_ci	if (inc)
74562306a36Sopenharmony_ci		rds_inc_put(inc);
74662306a36Sopenharmony_ci
74762306a36Sopenharmony_ciout:
74862306a36Sopenharmony_ci	return ret;
74962306a36Sopenharmony_ci}
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci/*
75262306a36Sopenharmony_ci * The socket is being shut down and we're asked to drop messages that were
75362306a36Sopenharmony_ci * queued for recvmsg.  The caller has unbound the socket so the receive path
75462306a36Sopenharmony_ci * won't queue any more incoming fragments or messages on the socket.
75562306a36Sopenharmony_ci */
75662306a36Sopenharmony_civoid rds_clear_recv_queue(struct rds_sock *rs)
75762306a36Sopenharmony_ci{
75862306a36Sopenharmony_ci	struct sock *sk = rds_rs_to_sk(rs);
75962306a36Sopenharmony_ci	struct rds_incoming *inc, *tmp;
76062306a36Sopenharmony_ci	unsigned long flags;
76162306a36Sopenharmony_ci
76262306a36Sopenharmony_ci	write_lock_irqsave(&rs->rs_recv_lock, flags);
76362306a36Sopenharmony_ci	list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
76462306a36Sopenharmony_ci		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
76562306a36Sopenharmony_ci				      -be32_to_cpu(inc->i_hdr.h_len),
76662306a36Sopenharmony_ci				      inc->i_hdr.h_dport);
76762306a36Sopenharmony_ci		list_del_init(&inc->i_item);
76862306a36Sopenharmony_ci		rds_inc_put(inc);
76962306a36Sopenharmony_ci	}
77062306a36Sopenharmony_ci	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
77162306a36Sopenharmony_ci}
77262306a36Sopenharmony_ci
77362306a36Sopenharmony_ci/*
77462306a36Sopenharmony_ci * inc->i_saddr isn't used here because it is only set in the receive
77562306a36Sopenharmony_ci * path.
77662306a36Sopenharmony_ci */
77762306a36Sopenharmony_civoid rds_inc_info_copy(struct rds_incoming *inc,
77862306a36Sopenharmony_ci		       struct rds_info_iterator *iter,
77962306a36Sopenharmony_ci		       __be32 saddr, __be32 daddr, int flip)
78062306a36Sopenharmony_ci{
78162306a36Sopenharmony_ci	struct rds_info_message minfo;
78262306a36Sopenharmony_ci
78362306a36Sopenharmony_ci	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
78462306a36Sopenharmony_ci	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
78562306a36Sopenharmony_ci	minfo.tos = inc->i_conn->c_tos;
78662306a36Sopenharmony_ci
78762306a36Sopenharmony_ci	if (flip) {
78862306a36Sopenharmony_ci		minfo.laddr = daddr;
78962306a36Sopenharmony_ci		minfo.faddr = saddr;
79062306a36Sopenharmony_ci		minfo.lport = inc->i_hdr.h_dport;
79162306a36Sopenharmony_ci		minfo.fport = inc->i_hdr.h_sport;
79262306a36Sopenharmony_ci	} else {
79362306a36Sopenharmony_ci		minfo.laddr = saddr;
79462306a36Sopenharmony_ci		minfo.faddr = daddr;
79562306a36Sopenharmony_ci		minfo.lport = inc->i_hdr.h_sport;
79662306a36Sopenharmony_ci		minfo.fport = inc->i_hdr.h_dport;
79762306a36Sopenharmony_ci	}
79862306a36Sopenharmony_ci
79962306a36Sopenharmony_ci	minfo.flags = 0;
80062306a36Sopenharmony_ci
80162306a36Sopenharmony_ci	rds_info_copy(iter, &minfo, sizeof(minfo));
80262306a36Sopenharmony_ci}
80362306a36Sopenharmony_ci
80462306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6)
80562306a36Sopenharmony_civoid rds6_inc_info_copy(struct rds_incoming *inc,
80662306a36Sopenharmony_ci			struct rds_info_iterator *iter,
80762306a36Sopenharmony_ci			struct in6_addr *saddr, struct in6_addr *daddr,
80862306a36Sopenharmony_ci			int flip)
80962306a36Sopenharmony_ci{
81062306a36Sopenharmony_ci	struct rds6_info_message minfo6;
81162306a36Sopenharmony_ci
81262306a36Sopenharmony_ci	minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
81362306a36Sopenharmony_ci	minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
81462306a36Sopenharmony_ci	minfo6.tos = inc->i_conn->c_tos;
81562306a36Sopenharmony_ci
81662306a36Sopenharmony_ci	if (flip) {
81762306a36Sopenharmony_ci		minfo6.laddr = *daddr;
81862306a36Sopenharmony_ci		minfo6.faddr = *saddr;
81962306a36Sopenharmony_ci		minfo6.lport = inc->i_hdr.h_dport;
82062306a36Sopenharmony_ci		minfo6.fport = inc->i_hdr.h_sport;
82162306a36Sopenharmony_ci	} else {
82262306a36Sopenharmony_ci		minfo6.laddr = *saddr;
82362306a36Sopenharmony_ci		minfo6.faddr = *daddr;
82462306a36Sopenharmony_ci		minfo6.lport = inc->i_hdr.h_sport;
82562306a36Sopenharmony_ci		minfo6.fport = inc->i_hdr.h_dport;
82662306a36Sopenharmony_ci	}
82762306a36Sopenharmony_ci
82862306a36Sopenharmony_ci	minfo6.flags = 0;
82962306a36Sopenharmony_ci
83062306a36Sopenharmony_ci	rds_info_copy(iter, &minfo6, sizeof(minfo6));
83162306a36Sopenharmony_ci}
83262306a36Sopenharmony_ci#endif
833