1/*
2 * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
3 * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
4 * Copyright (c) 2004 Intel Corporation.  All rights reserved.
5 * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
6 * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
7 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
8 * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
9 *
10 * This software is available to you under a choice of one of two
11 * licenses.  You may choose to be licensed under the terms of the GNU
12 * General Public License (GPL) Version 2, available from the file
13 * COPYING in the main directory of this source tree, or the
14 * OpenIB.org BSD license below:
15 *
16 *     Redistribution and use in source and binary forms, with or
17 *     without modification, are permitted provided that the following
18 *     conditions are met:
19 *
20 *      - Redistributions of source code must retain the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer.
23 *
24 *      - Redistributions in binary form must reproduce the above
25 *        copyright notice, this list of conditions and the following
26 *        disclaimer in the documentation and/or other materials
27 *        provided with the distribution.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
33 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
34 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
35 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36 * SOFTWARE.
37 */
38
39#include <linux/errno.h>
40#include <linux/err.h>
41#include <linux/export.h>
42#include <linux/string.h>
43#include <linux/slab.h>
44#include <linux/in.h>
45#include <linux/in6.h>
46#include <net/addrconf.h>
47#include <linux/security.h>
48
49#include <rdma/ib_verbs.h>
50#include <rdma/ib_cache.h>
51#include <rdma/ib_addr.h>
52#include <rdma/rw.h>
53#include <rdma/lag.h>
54
55#include "core_priv.h"
56#include <trace/events/rdma_core.h>
57
58static int ib_resolve_eth_dmac(struct ib_device *device,
59			       struct rdma_ah_attr *ah_attr);
60
61static const char * const ib_events[] = {
62	[IB_EVENT_CQ_ERR]		= "CQ error",
63	[IB_EVENT_QP_FATAL]		= "QP fatal error",
64	[IB_EVENT_QP_REQ_ERR]		= "QP request error",
65	[IB_EVENT_QP_ACCESS_ERR]	= "QP access error",
66	[IB_EVENT_COMM_EST]		= "communication established",
67	[IB_EVENT_SQ_DRAINED]		= "send queue drained",
68	[IB_EVENT_PATH_MIG]		= "path migration successful",
69	[IB_EVENT_PATH_MIG_ERR]		= "path migration error",
70	[IB_EVENT_DEVICE_FATAL]		= "device fatal error",
71	[IB_EVENT_PORT_ACTIVE]		= "port active",
72	[IB_EVENT_PORT_ERR]		= "port error",
73	[IB_EVENT_LID_CHANGE]		= "LID change",
74	[IB_EVENT_PKEY_CHANGE]		= "P_key change",
75	[IB_EVENT_SM_CHANGE]		= "SM change",
76	[IB_EVENT_SRQ_ERR]		= "SRQ error",
77	[IB_EVENT_SRQ_LIMIT_REACHED]	= "SRQ limit reached",
78	[IB_EVENT_QP_LAST_WQE_REACHED]	= "last WQE reached",
79	[IB_EVENT_CLIENT_REREGISTER]	= "client reregister",
80	[IB_EVENT_GID_CHANGE]		= "GID changed",
81};
82
83const char *__attribute_const__ ib_event_msg(enum ib_event_type event)
84{
85	size_t index = event;
86
87	return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ?
88			ib_events[index] : "unrecognized event";
89}
90EXPORT_SYMBOL(ib_event_msg);
91
92static const char * const wc_statuses[] = {
93	[IB_WC_SUCCESS]			= "success",
94	[IB_WC_LOC_LEN_ERR]		= "local length error",
95	[IB_WC_LOC_QP_OP_ERR]		= "local QP operation error",
96	[IB_WC_LOC_EEC_OP_ERR]		= "local EE context operation error",
97	[IB_WC_LOC_PROT_ERR]		= "local protection error",
98	[IB_WC_WR_FLUSH_ERR]		= "WR flushed",
99	[IB_WC_MW_BIND_ERR]		= "memory management operation error",
100	[IB_WC_BAD_RESP_ERR]		= "bad response error",
101	[IB_WC_LOC_ACCESS_ERR]		= "local access error",
102	[IB_WC_REM_INV_REQ_ERR]		= "invalid request error",
103	[IB_WC_REM_ACCESS_ERR]		= "remote access error",
104	[IB_WC_REM_OP_ERR]		= "remote operation error",
105	[IB_WC_RETRY_EXC_ERR]		= "transport retry counter exceeded",
106	[IB_WC_RNR_RETRY_EXC_ERR]	= "RNR retry counter exceeded",
107	[IB_WC_LOC_RDD_VIOL_ERR]	= "local RDD violation error",
108	[IB_WC_REM_INV_RD_REQ_ERR]	= "remote invalid RD request",
109	[IB_WC_REM_ABORT_ERR]		= "operation aborted",
110	[IB_WC_INV_EECN_ERR]		= "invalid EE context number",
111	[IB_WC_INV_EEC_STATE_ERR]	= "invalid EE context state",
112	[IB_WC_FATAL_ERR]		= "fatal error",
113	[IB_WC_RESP_TIMEOUT_ERR]	= "response timeout error",
114	[IB_WC_GENERAL_ERR]		= "general error",
115};
116
117const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status)
118{
119	size_t index = status;
120
121	return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ?
122			wc_statuses[index] : "unrecognized status";
123}
124EXPORT_SYMBOL(ib_wc_status_msg);
125
126__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
127{
128	switch (rate) {
129	case IB_RATE_2_5_GBPS: return   1;
130	case IB_RATE_5_GBPS:   return   2;
131	case IB_RATE_10_GBPS:  return   4;
132	case IB_RATE_20_GBPS:  return   8;
133	case IB_RATE_30_GBPS:  return  12;
134	case IB_RATE_40_GBPS:  return  16;
135	case IB_RATE_60_GBPS:  return  24;
136	case IB_RATE_80_GBPS:  return  32;
137	case IB_RATE_120_GBPS: return  48;
138	case IB_RATE_14_GBPS:  return   6;
139	case IB_RATE_56_GBPS:  return  22;
140	case IB_RATE_112_GBPS: return  45;
141	case IB_RATE_168_GBPS: return  67;
142	case IB_RATE_25_GBPS:  return  10;
143	case IB_RATE_100_GBPS: return  40;
144	case IB_RATE_200_GBPS: return  80;
145	case IB_RATE_300_GBPS: return 120;
146	case IB_RATE_28_GBPS:  return  11;
147	case IB_RATE_50_GBPS:  return  20;
148	case IB_RATE_400_GBPS: return 160;
149	case IB_RATE_600_GBPS: return 240;
150	default:	       return  -1;
151	}
152}
153EXPORT_SYMBOL(ib_rate_to_mult);
154
155__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
156{
157	switch (mult) {
158	case 1:   return IB_RATE_2_5_GBPS;
159	case 2:   return IB_RATE_5_GBPS;
160	case 4:   return IB_RATE_10_GBPS;
161	case 8:   return IB_RATE_20_GBPS;
162	case 12:  return IB_RATE_30_GBPS;
163	case 16:  return IB_RATE_40_GBPS;
164	case 24:  return IB_RATE_60_GBPS;
165	case 32:  return IB_RATE_80_GBPS;
166	case 48:  return IB_RATE_120_GBPS;
167	case 6:   return IB_RATE_14_GBPS;
168	case 22:  return IB_RATE_56_GBPS;
169	case 45:  return IB_RATE_112_GBPS;
170	case 67:  return IB_RATE_168_GBPS;
171	case 10:  return IB_RATE_25_GBPS;
172	case 40:  return IB_RATE_100_GBPS;
173	case 80:  return IB_RATE_200_GBPS;
174	case 120: return IB_RATE_300_GBPS;
175	case 11:  return IB_RATE_28_GBPS;
176	case 20:  return IB_RATE_50_GBPS;
177	case 160: return IB_RATE_400_GBPS;
178	case 240: return IB_RATE_600_GBPS;
179	default:  return IB_RATE_PORT_CURRENT;
180	}
181}
182EXPORT_SYMBOL(mult_to_ib_rate);
183
184__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
185{
186	switch (rate) {
187	case IB_RATE_2_5_GBPS: return 2500;
188	case IB_RATE_5_GBPS:   return 5000;
189	case IB_RATE_10_GBPS:  return 10000;
190	case IB_RATE_20_GBPS:  return 20000;
191	case IB_RATE_30_GBPS:  return 30000;
192	case IB_RATE_40_GBPS:  return 40000;
193	case IB_RATE_60_GBPS:  return 60000;
194	case IB_RATE_80_GBPS:  return 80000;
195	case IB_RATE_120_GBPS: return 120000;
196	case IB_RATE_14_GBPS:  return 14062;
197	case IB_RATE_56_GBPS:  return 56250;
198	case IB_RATE_112_GBPS: return 112500;
199	case IB_RATE_168_GBPS: return 168750;
200	case IB_RATE_25_GBPS:  return 25781;
201	case IB_RATE_100_GBPS: return 103125;
202	case IB_RATE_200_GBPS: return 206250;
203	case IB_RATE_300_GBPS: return 309375;
204	case IB_RATE_28_GBPS:  return 28125;
205	case IB_RATE_50_GBPS:  return 53125;
206	case IB_RATE_400_GBPS: return 425000;
207	case IB_RATE_600_GBPS: return 637500;
208	default:	       return -1;
209	}
210}
211EXPORT_SYMBOL(ib_rate_to_mbps);
212
213__attribute_const__ enum rdma_transport_type
214rdma_node_get_transport(unsigned int node_type)
215{
217	if (node_type == RDMA_NODE_USNIC)
218		return RDMA_TRANSPORT_USNIC;
219	if (node_type == RDMA_NODE_USNIC_UDP)
220		return RDMA_TRANSPORT_USNIC_UDP;
221	if (node_type == RDMA_NODE_RNIC)
222		return RDMA_TRANSPORT_IWARP;
223	if (node_type == RDMA_NODE_UNSPECIFIED)
224		return RDMA_TRANSPORT_UNSPECIFIED;
225
226	return RDMA_TRANSPORT_IB;
227}
228EXPORT_SYMBOL(rdma_node_get_transport);
229
230enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
231{
	enum rdma_transport_type lt;

	if (device->ops.get_link_layer)
234		return device->ops.get_link_layer(device, port_num);
235
236	lt = rdma_node_get_transport(device->node_type);
237	if (lt == RDMA_TRANSPORT_IB)
238		return IB_LINK_LAYER_INFINIBAND;
239
240	return IB_LINK_LAYER_ETHERNET;
241}
242EXPORT_SYMBOL(rdma_port_get_link_layer);
243
244/* Protection domains */
245
246/**
247 * ib_alloc_pd - Allocates an unused protection domain.
248 * @device: The device on which to allocate the protection domain.
249 * @flags: protection domain flags
250 * @caller: caller's build-time module name
251 *
252 * A protection domain object provides an association between QPs, shared
253 * receive queues, address handles, memory regions, and memory windows.
254 *
255 * Every PD has a local_dma_lkey which can be used as the lkey value for local
256 * memory operations.
257 */
258struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
259		const char *caller)
260{
261	struct ib_pd *pd;
262	int mr_access_flags = 0;
263	int ret;
264
265	pd = rdma_zalloc_drv_obj(device, ib_pd);
266	if (!pd)
267		return ERR_PTR(-ENOMEM);
268
269	pd->device = device;
270	pd->uobject = NULL;
271	pd->__internal_mr = NULL;
272	atomic_set(&pd->usecnt, 0);
273	pd->flags = flags;
274
275	rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD);
276	rdma_restrack_set_name(&pd->res, caller);
277
278	ret = device->ops.alloc_pd(pd, NULL);
279	if (ret) {
280		rdma_restrack_put(&pd->res);
281		kfree(pd);
282		return ERR_PTR(ret);
283	}
284	rdma_restrack_add(&pd->res);
285
286	if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
287		pd->local_dma_lkey = device->local_dma_lkey;
288	else
289		mr_access_flags |= IB_ACCESS_LOCAL_WRITE;
290
291	if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
292		pr_warn("%s: enabling unsafe global rkey\n", caller);
293		mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
294	}
295
296	if (mr_access_flags) {
297		struct ib_mr *mr;
298
299		mr = pd->device->ops.get_dma_mr(pd, mr_access_flags);
300		if (IS_ERR(mr)) {
301			ib_dealloc_pd(pd);
302			return ERR_CAST(mr);
303		}
304
305		mr->device	= pd->device;
306		mr->pd		= pd;
307		mr->type        = IB_MR_TYPE_DMA;
308		mr->uobject	= NULL;
309		mr->need_inval	= false;
310
311		pd->__internal_mr = mr;
312
313		if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY))
314			pd->local_dma_lkey = pd->__internal_mr->lkey;
315
316		if (flags & IB_PD_UNSAFE_GLOBAL_RKEY)
317			pd->unsafe_global_rkey = pd->__internal_mr->rkey;
318	}
319
320	return pd;
321}
322EXPORT_SYMBOL(__ib_alloc_pd);
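
/*
 * Usage sketch (illustrative only, not part of the build): kernel ULPs
 * normally allocate a PD through the ib_alloc_pd() wrapper, which supplies
 * KBUILD_MODNAME as the caller. "dev" is a hypothetical ib_device obtained
 * from an ib_client callback.
 *
 *	struct ib_pd *pd = ib_alloc_pd(dev, 0);
 *
 *	if (IS_ERR(pd))
 *		return PTR_ERR(pd);
 *
 * pd->local_dma_lkey can then be used as the lkey for local SGEs, and the PD
 * is released with ib_dealloc_pd(pd) once all objects created on it are gone.
 */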
323
324/**
325 * ib_dealloc_pd_user - Deallocates a protection domain.
326 * @pd: The protection domain to deallocate.
327 * @udata: Valid user data or NULL for kernel object
328 *
329 * It is an error to call this function while any resources in the pd still
330 * exist.  The caller is responsible to synchronously destroy them and
331 * guarantee no new allocations will happen.
332 */
333int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
334{
335	int ret;
336
337	if (pd->__internal_mr) {
338		ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL);
339		WARN_ON(ret);
340		pd->__internal_mr = NULL;
341	}
342
	/*
	 * uverbs manipulates usecnt with proper locking, while the kabi
	 * requires the caller to guarantee we can't race here.
	 */
345	WARN_ON(atomic_read(&pd->usecnt));
346
347	ret = pd->device->ops.dealloc_pd(pd, udata);
348	if (ret)
349		return ret;
350
351	rdma_restrack_del(&pd->res);
352	kfree(pd);
353	return ret;
354}
355EXPORT_SYMBOL(ib_dealloc_pd_user);
356
357/* Address handles */
358
359/**
360 * rdma_copy_ah_attr - Copy rdma ah attribute from source to destination.
 * @dest:       Pointer to destination ah_attr. The contents of the destination
 *              are assumed to be invalid and are overwritten.
363 * @src:        Pointer to source ah_attr.
364 */
365void rdma_copy_ah_attr(struct rdma_ah_attr *dest,
366		       const struct rdma_ah_attr *src)
367{
368	*dest = *src;
369	if (dest->grh.sgid_attr)
370		rdma_hold_gid_attr(dest->grh.sgid_attr);
371}
372EXPORT_SYMBOL(rdma_copy_ah_attr);
373
374/**
 * rdma_replace_ah_attr - Replace valid ah_attr with a new one.
376 * @old:        Pointer to existing ah_attr which needs to be replaced.
377 *              old is assumed to be valid or zero'd
378 * @new:        Pointer to the new ah_attr.
379 *
 * rdma_replace_ah_attr() first releases any reference held by the old ah_attr
 * if the old ah_attr is valid; after that it copies the new attribute and
 * takes a reference on the copied SGID attribute.
383 */
384void rdma_replace_ah_attr(struct rdma_ah_attr *old,
385			  const struct rdma_ah_attr *new)
386{
387	rdma_destroy_ah_attr(old);
388	*old = *new;
389	if (old->grh.sgid_attr)
390		rdma_hold_gid_attr(old->grh.sgid_attr);
391}
392EXPORT_SYMBOL(rdma_replace_ah_attr);
393
394/**
395 * rdma_move_ah_attr - Move ah_attr pointed by source to destination.
396 * @dest:       Pointer to destination ah_attr to copy to.
397 *              dest is assumed to be valid or zero'd
398 * @src:        Pointer to the new ah_attr.
399 *
400 * rdma_move_ah_attr() first releases any reference in the destination ah_attr
401 * if it is valid. This also transfers ownership of internal references from
402 * src to dest, making src invalid in the process. No new reference of the src
403 * ah_attr is taken.
404 */
405void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src)
406{
407	rdma_destroy_ah_attr(dest);
408	*dest = *src;
409	src->grh.sgid_attr = NULL;
410}
411EXPORT_SYMBOL(rdma_move_ah_attr);
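
/*
 * Ownership sketch (illustrative only): the copy/replace/move helpers above
 * differ only in how they treat the SGID attribute reference. "resolved"
 * below is a hypothetical, already initialized rdma_ah_attr.
 *
 *	struct rdma_ah_attr tmp;
 *
 *	rdma_copy_ah_attr(&tmp, &resolved);
 *	rdma_destroy_ah_attr(&tmp);
 *
 * After the copy, both attributes hold a reference and each must be released
 * with rdma_destroy_ah_attr(). rdma_move_ah_attr(&tmp, &resolved) would
 * instead transfer the reference, leaving "resolved" without one.
 */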
412
413/*
414 * Validate that the rdma_ah_attr is valid for the device before passing it
415 * off to the driver.
416 */
417static int rdma_check_ah_attr(struct ib_device *device,
418			      struct rdma_ah_attr *ah_attr)
419{
420	if (!rdma_is_port_valid(device, ah_attr->port_num))
421		return -EINVAL;
422
423	if ((rdma_is_grh_required(device, ah_attr->port_num) ||
424	     ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) &&
425	    !(ah_attr->ah_flags & IB_AH_GRH))
426		return -EINVAL;
427
428	if (ah_attr->grh.sgid_attr) {
429		/*
430		 * Make sure the passed sgid_attr is consistent with the
431		 * parameters
432		 */
433		if (ah_attr->grh.sgid_attr->index != ah_attr->grh.sgid_index ||
434		    ah_attr->grh.sgid_attr->port_num != ah_attr->port_num)
435			return -EINVAL;
436	}
437	return 0;
438}
439
440/*
441 * If the ah requires a GRH then ensure that sgid_attr pointer is filled in.
442 * On success the caller is responsible to call rdma_unfill_sgid_attr().
443 */
444static int rdma_fill_sgid_attr(struct ib_device *device,
445			       struct rdma_ah_attr *ah_attr,
446			       const struct ib_gid_attr **old_sgid_attr)
447{
448	const struct ib_gid_attr *sgid_attr;
449	struct ib_global_route *grh;
450	int ret;
451
452	*old_sgid_attr = ah_attr->grh.sgid_attr;
453
454	ret = rdma_check_ah_attr(device, ah_attr);
455	if (ret)
456		return ret;
457
458	if (!(ah_attr->ah_flags & IB_AH_GRH))
459		return 0;
460
461	grh = rdma_ah_retrieve_grh(ah_attr);
462	if (grh->sgid_attr)
463		return 0;
464
465	sgid_attr =
466		rdma_get_gid_attr(device, ah_attr->port_num, grh->sgid_index);
467	if (IS_ERR(sgid_attr))
468		return PTR_ERR(sgid_attr);
469
	/* Move ownership of the kref into the ah_attr */
471	grh->sgid_attr = sgid_attr;
472	return 0;
473}
474
475static void rdma_unfill_sgid_attr(struct rdma_ah_attr *ah_attr,
476				  const struct ib_gid_attr *old_sgid_attr)
477{
478	/*
479	 * Fill didn't change anything, the caller retains ownership of
480	 * whatever it passed
481	 */
482	if (ah_attr->grh.sgid_attr == old_sgid_attr)
483		return;
484
485	/*
486	 * Otherwise, we need to undo what rdma_fill_sgid_attr so the caller
487	 * doesn't see any change in the rdma_ah_attr. If we get here
488	 * old_sgid_attr is NULL.
489	 */
490	rdma_destroy_ah_attr(ah_attr);
491}
492
493static const struct ib_gid_attr *
494rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr,
495		      const struct ib_gid_attr *old_attr)
496{
497	if (old_attr)
498		rdma_put_gid_attr(old_attr);
499	if (ah_attr->ah_flags & IB_AH_GRH) {
500		rdma_hold_gid_attr(ah_attr->grh.sgid_attr);
501		return ah_attr->grh.sgid_attr;
502	}
503	return NULL;
504}
505
506static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
507				     struct rdma_ah_attr *ah_attr,
508				     u32 flags,
509				     struct ib_udata *udata,
510				     struct net_device *xmit_slave)
511{
512	struct rdma_ah_init_attr init_attr = {};
513	struct ib_device *device = pd->device;
514	struct ib_ah *ah;
515	int ret;
516
517	might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE);
518
519	if (!device->ops.create_ah)
520		return ERR_PTR(-EOPNOTSUPP);
521
522	ah = rdma_zalloc_drv_obj_gfp(
523		device, ib_ah,
524		(flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC);
525	if (!ah)
526		return ERR_PTR(-ENOMEM);
527
528	ah->device = device;
529	ah->pd = pd;
530	ah->type = ah_attr->type;
531	ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
532	init_attr.ah_attr = ah_attr;
533	init_attr.flags = flags;
534	init_attr.xmit_slave = xmit_slave;
535
536	ret = device->ops.create_ah(ah, &init_attr, udata);
537	if (ret) {
538		if (ah->sgid_attr)
539			rdma_put_gid_attr(ah->sgid_attr);
540		kfree(ah);
541		return ERR_PTR(ret);
542	}
543
544	atomic_inc(&pd->usecnt);
545	return ah;
546}
547
548/**
549 * rdma_create_ah - Creates an address handle for the
550 * given address vector.
551 * @pd: The protection domain associated with the address handle.
552 * @ah_attr: The attributes of the address vector.
553 * @flags: Create address handle flags (see enum rdma_create_ah_flags).
554 *
 * It returns a newly allocated ib_ah on success, or an ERR_PTR on error.
556 * The address handle is used to reference a local or global destination
557 * in all UD QP post sends.
558 */
559struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
560			     u32 flags)
561{
562	const struct ib_gid_attr *old_sgid_attr;
563	struct net_device *slave;
564	struct ib_ah *ah;
565	int ret;
566
567	ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr);
568	if (ret)
569		return ERR_PTR(ret);
570	slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr,
571					   (flags & RDMA_CREATE_AH_SLEEPABLE) ?
572					   GFP_KERNEL : GFP_ATOMIC);
573	if (IS_ERR(slave)) {
574		rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
575		return (void *)slave;
576	}
577	ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
578	rdma_lag_put_ah_roce_slave(slave);
579	rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
580	return ah;
581}
582EXPORT_SYMBOL(rdma_create_ah);
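
/*
 * Usage sketch (illustrative only): building a minimal IB address handle for
 * a UD destination. "pd", "dlid", "sl" and "port" are hypothetical values the
 * caller already knows, e.g. from a path record.
 *
 *	struct rdma_ah_attr ah_attr = {};
 *	struct ib_ah *ah;
 *
 *	ah_attr.type = RDMA_AH_ATTR_TYPE_IB;
 *	rdma_ah_set_dlid(&ah_attr, dlid);
 *	rdma_ah_set_sl(&ah_attr, sl);
 *	rdma_ah_set_port_num(&ah_attr, port);
 *
 *	ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE);
 *	if (IS_ERR(ah))
 *		return PTR_ERR(ah);
 *
 * The handle is later released with rdma_destroy_ah(ah,
 * RDMA_DESTROY_AH_SLEEPABLE).
 */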
583
584/**
585 * rdma_create_user_ah - Creates an address handle for the
586 * given address vector.
 * It resolves the destination MAC address for ah attributes of RoCE type.
 * @pd: The protection domain associated with the address handle.
 * @ah_attr: The attributes of the address vector.
 * @udata: pointer to user's input/output buffer information needed by the
 *         provider driver.
 *
 * It returns a newly allocated ib_ah on success, or an ERR_PTR on error.
594 * The address handle is used to reference a local or global destination
595 * in all UD QP post sends.
596 */
597struct ib_ah *rdma_create_user_ah(struct ib_pd *pd,
598				  struct rdma_ah_attr *ah_attr,
599				  struct ib_udata *udata)
600{
601	const struct ib_gid_attr *old_sgid_attr;
602	struct ib_ah *ah;
603	int err;
604
605	err = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr);
606	if (err)
607		return ERR_PTR(err);
608
609	if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
610		err = ib_resolve_eth_dmac(pd->device, ah_attr);
611		if (err) {
612			ah = ERR_PTR(err);
613			goto out;
614		}
615	}
616
617	ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE,
618			     udata, NULL);
619
620out:
621	rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
622	return ah;
623}
624EXPORT_SYMBOL(rdma_create_user_ah);
625
626int ib_get_rdma_header_version(const union rdma_network_hdr *hdr)
627{
628	const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
629	struct iphdr ip4h_checked;
630	const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;
631
632	/* If it's IPv6, the version must be 6, otherwise, the first
633	 * 20 bytes (before the IPv4 header) are garbled.
634	 */
635	if (ip6h->version != 6)
636		return (ip4h->version == 4) ? 4 : 0;
	/* The version nibble reads 6, but this may still be an IPv4 packet
	 * whose garbled first 20 bytes happen to look like IPv6.
	 */
638
639	/* RoCE v2 requires no options, thus header length
640	 * must be 5 words
641	 */
642	if (ip4h->ihl != 5)
643		return 6;
644
645	/* Verify checksum.
646	 * We can't write on scattered buffers so we need to copy to
647	 * temp buffer.
648	 */
649	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
650	ip4h_checked.check = 0;
651	ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
652	/* if IPv4 header checksum is OK, believe it */
653	if (ip4h->check == ip4h_checked.check)
654		return 4;
655	return 6;
656}
657EXPORT_SYMBOL(ib_get_rdma_header_version);
658
659static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
660						     u8 port_num,
661						     const struct ib_grh *grh)
662{
663	int grh_version;
664
665	if (rdma_protocol_ib(device, port_num))
666		return RDMA_NETWORK_IB;
667
668	grh_version = ib_get_rdma_header_version((union rdma_network_hdr *)grh);
669
670	if (grh_version == 4)
671		return RDMA_NETWORK_IPV4;
672
673	if (grh->next_hdr == IPPROTO_UDP)
674		return RDMA_NETWORK_IPV6;
675
676	return RDMA_NETWORK_ROCE_V1;
677}
678
679struct find_gid_index_context {
680	u16 vlan_id;
681	enum ib_gid_type gid_type;
682};
683
684static bool find_gid_index(const union ib_gid *gid,
685			   const struct ib_gid_attr *gid_attr,
686			   void *context)
687{
688	struct find_gid_index_context *ctx = context;
689	u16 vlan_id = 0xffff;
690	int ret;
691
692	if (ctx->gid_type != gid_attr->gid_type)
693		return false;
694
695	ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL);
696	if (ret)
697		return false;
698
699	return ctx->vlan_id == vlan_id;
700}
701
702static const struct ib_gid_attr *
703get_sgid_attr_from_eth(struct ib_device *device, u8 port_num,
704		       u16 vlan_id, const union ib_gid *sgid,
705		       enum ib_gid_type gid_type)
706{
707	struct find_gid_index_context context = {.vlan_id = vlan_id,
708						 .gid_type = gid_type};
709
710	return rdma_find_gid_by_filter(device, sgid, port_num, find_gid_index,
711				       &context);
712}
713
714int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
715			      enum rdma_network_type net_type,
716			      union ib_gid *sgid, union ib_gid *dgid)
717{
718	struct sockaddr_in  src_in;
719	struct sockaddr_in  dst_in;
720	__be32 src_saddr, dst_saddr;
721
722	if (!sgid || !dgid)
723		return -EINVAL;
724
725	if (net_type == RDMA_NETWORK_IPV4) {
726		memcpy(&src_in.sin_addr.s_addr,
727		       &hdr->roce4grh.saddr, 4);
728		memcpy(&dst_in.sin_addr.s_addr,
729		       &hdr->roce4grh.daddr, 4);
730		src_saddr = src_in.sin_addr.s_addr;
731		dst_saddr = dst_in.sin_addr.s_addr;
732		ipv6_addr_set_v4mapped(src_saddr,
733				       (struct in6_addr *)sgid);
734		ipv6_addr_set_v4mapped(dst_saddr,
735				       (struct in6_addr *)dgid);
736		return 0;
737	} else if (net_type == RDMA_NETWORK_IPV6 ||
		   net_type == RDMA_NETWORK_IB ||
		   net_type == RDMA_NETWORK_ROCE_V1) {
739		*dgid = hdr->ibgrh.dgid;
740		*sgid = hdr->ibgrh.sgid;
741		return 0;
742	} else {
743		return -EINVAL;
744	}
745}
746EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);
747
/* Resolve the destination MAC address and hop limit for a unicast destination
 * GID entry, considering the source GID entry as well.
 * The ah_attr must have a valid port_num and sgid_index.
751 */
752static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
753				       struct rdma_ah_attr *ah_attr)
754{
755	struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr);
756	const struct ib_gid_attr *sgid_attr = grh->sgid_attr;
757	int hop_limit = 0xff;
758	int ret = 0;
759
760	/* If destination is link local and source GID is RoCEv1,
761	 * IP stack is not used.
762	 */
763	if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) &&
764	    sgid_attr->gid_type == IB_GID_TYPE_ROCE) {
765		rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
766				ah_attr->roce.dmac);
767		return ret;
768	}
769
770	ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid,
771					   ah_attr->roce.dmac,
772					   sgid_attr, &hop_limit);
773
774	grh->hop_limit = hop_limit;
775	return ret;
776}
777
778/*
779 * This function initializes address handle attributes from the incoming packet.
 * The incoming packet's dgid is the GID of the receiving node (where this
 * code executes), and its sgid is the GID of the sender.
 *
 * When resolving the destination MAC address, the received dgid is used as
 * the sgid and the received sgid is used as the dgid, because the received
 * sgid holds the GID of the destination we respond to.
786 *
787 * On success the caller is responsible to call rdma_destroy_ah_attr on the
788 * attr.
789 */
790int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num,
791			    const struct ib_wc *wc, const struct ib_grh *grh,
792			    struct rdma_ah_attr *ah_attr)
793{
794	u32 flow_class;
795	int ret;
796	enum rdma_network_type net_type = RDMA_NETWORK_IB;
797	enum ib_gid_type gid_type = IB_GID_TYPE_IB;
798	const struct ib_gid_attr *sgid_attr;
799	int hoplimit = 0xff;
800	union ib_gid dgid;
801	union ib_gid sgid;
802
803	might_sleep();
804
805	memset(ah_attr, 0, sizeof *ah_attr);
806	ah_attr->type = rdma_ah_find_type(device, port_num);
807	if (rdma_cap_eth_ah(device, port_num)) {
808		if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
809			net_type = wc->network_hdr_type;
810		else
811			net_type = ib_get_net_type_by_grh(device, port_num, grh);
812		gid_type = ib_network_to_gid_type(net_type);
813	}
814	ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
815					&sgid, &dgid);
816	if (ret)
817		return ret;
818
819	rdma_ah_set_sl(ah_attr, wc->sl);
820	rdma_ah_set_port_num(ah_attr, port_num);
821
822	if (rdma_protocol_roce(device, port_num)) {
823		u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
824				wc->vlan_id : 0xffff;
825
826		if (!(wc->wc_flags & IB_WC_GRH))
827			return -EPROTOTYPE;
828
829		sgid_attr = get_sgid_attr_from_eth(device, port_num,
830						   vlan_id, &dgid,
831						   gid_type);
832		if (IS_ERR(sgid_attr))
833			return PTR_ERR(sgid_attr);
834
835		flow_class = be32_to_cpu(grh->version_tclass_flow);
836		rdma_move_grh_sgid_attr(ah_attr,
837					&sgid,
838					flow_class & 0xFFFFF,
839					hoplimit,
840					(flow_class >> 20) & 0xFF,
841					sgid_attr);
842
843		ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
844		if (ret)
845			rdma_destroy_ah_attr(ah_attr);
846
847		return ret;
848	} else {
849		rdma_ah_set_dlid(ah_attr, wc->slid);
850		rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits);
851
852		if ((wc->wc_flags & IB_WC_GRH) == 0)
853			return 0;
854
855		if (dgid.global.interface_id !=
856					cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) {
857			sgid_attr = rdma_find_gid_by_port(
858				device, &dgid, IB_GID_TYPE_IB, port_num, NULL);
859		} else
860			sgid_attr = rdma_get_gid_attr(device, port_num, 0);
861
862		if (IS_ERR(sgid_attr))
863			return PTR_ERR(sgid_attr);
864		flow_class = be32_to_cpu(grh->version_tclass_flow);
865		rdma_move_grh_sgid_attr(ah_attr,
866					&sgid,
867					flow_class & 0xFFFFF,
868					hoplimit,
869					(flow_class >> 20) & 0xFF,
870					sgid_attr);
871
872		return 0;
873	}
874}
875EXPORT_SYMBOL(ib_init_ah_attr_from_wc);
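
/*
 * Usage sketch (illustrative only): a UD responder can turn a received work
 * completion into an address handle for its reply. "pd", "wc", "grh" and
 * "port" are hypothetical inputs from the receive path.
 *
 *	struct rdma_ah_attr ah_attr;
 *	struct ib_ah *ah;
 *	int ret;
 *
 *	ret = ib_init_ah_attr_from_wc(pd->device, port, wc, grh, &ah_attr);
 *	if (ret)
 *		return ret;
 *	ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE);
 *	rdma_destroy_ah_attr(&ah_attr);
 *
 * rdma_destroy_ah_attr() drops the SGID reference the caller owns after a
 * successful ib_init_ah_attr_from_wc(); ib_create_ah_from_wc() below wraps
 * exactly this sequence.
 */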
876
877/**
878 * rdma_move_grh_sgid_attr - Sets the sgid attribute of GRH, taking ownership
879 * of the reference
880 *
881 * @attr:	Pointer to AH attribute structure
882 * @dgid:	Destination GID
883 * @flow_label:	Flow label
884 * @hop_limit:	Hop limit
885 * @traffic_class: traffic class
886 * @sgid_attr:	Pointer to SGID attribute
887 *
888 * This takes ownership of the sgid_attr reference. The caller must ensure
889 * rdma_destroy_ah_attr() is called before destroying the rdma_ah_attr after
890 * calling this function.
891 */
892void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid,
893			     u32 flow_label, u8 hop_limit, u8 traffic_class,
894			     const struct ib_gid_attr *sgid_attr)
895{
896	rdma_ah_set_grh(attr, dgid, flow_label, sgid_attr->index, hop_limit,
897			traffic_class);
898	attr->grh.sgid_attr = sgid_attr;
899}
900EXPORT_SYMBOL(rdma_move_grh_sgid_attr);
901
902/**
903 * rdma_destroy_ah_attr - Release reference to SGID attribute of
904 * ah attribute.
905 * @ah_attr: Pointer to ah attribute
906 *
907 * Release reference to the SGID attribute of the ah attribute if it is
908 * non NULL. It is safe to call this multiple times, and safe to call it on
909 * a zero initialized ah_attr.
910 */
911void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr)
912{
913	if (ah_attr->grh.sgid_attr) {
914		rdma_put_gid_attr(ah_attr->grh.sgid_attr);
915		ah_attr->grh.sgid_attr = NULL;
916	}
917}
918EXPORT_SYMBOL(rdma_destroy_ah_attr);
919
920struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
921				   const struct ib_grh *grh, u8 port_num)
922{
923	struct rdma_ah_attr ah_attr;
924	struct ib_ah *ah;
925	int ret;
926
927	ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr);
928	if (ret)
929		return ERR_PTR(ret);
930
931	ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE);
932
933	rdma_destroy_ah_attr(&ah_attr);
934	return ah;
935}
936EXPORT_SYMBOL(ib_create_ah_from_wc);
937
938int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
939{
940	const struct ib_gid_attr *old_sgid_attr;
941	int ret;
942
943	if (ah->type != ah_attr->type)
944		return -EINVAL;
945
946	ret = rdma_fill_sgid_attr(ah->device, ah_attr, &old_sgid_attr);
947	if (ret)
948		return ret;
949
950	ret = ah->device->ops.modify_ah ?
951		ah->device->ops.modify_ah(ah, ah_attr) :
952		-EOPNOTSUPP;
953
954	ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr);
955	rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
956	return ret;
957}
958EXPORT_SYMBOL(rdma_modify_ah);
959
960int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
961{
962	ah_attr->grh.sgid_attr = NULL;
963
964	return ah->device->ops.query_ah ?
965		ah->device->ops.query_ah(ah, ah_attr) :
966		-EOPNOTSUPP;
967}
968EXPORT_SYMBOL(rdma_query_ah);
969
970int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata)
971{
972	const struct ib_gid_attr *sgid_attr = ah->sgid_attr;
973	struct ib_pd *pd;
974	int ret;
975
976	might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE);
977
978	pd = ah->pd;
979
980	ret = ah->device->ops.destroy_ah(ah, flags);
981	if (ret)
982		return ret;
983
984	atomic_dec(&pd->usecnt);
985	if (sgid_attr)
986		rdma_put_gid_attr(sgid_attr);
987
988	kfree(ah);
989	return ret;
990}
991EXPORT_SYMBOL(rdma_destroy_ah_user);
992
993/* Shared receive queues */
994
995/**
996 * ib_create_srq_user - Creates a SRQ associated with the specified protection
997 *   domain.
998 * @pd: The protection domain associated with the SRQ.
999 * @srq_init_attr: A list of initial attributes required to create the
1000 *   SRQ.  If SRQ creation succeeds, then the attributes are updated to
1001 *   the actual capabilities of the created SRQ.
1002 * @uobject: uobject pointer if this is not a kernel SRQ
1003 * @udata: udata pointer if this is not a kernel SRQ
1004 *
 * srq_init_attr->attr.max_wr and max_sge are read to determine the
1006 * requested size of the SRQ, and set to the actual values allocated
1007 * on return.  If ib_create_srq() succeeds, then max_wr and max_sge
1008 * will always be at least as large as the requested values.
1009 */
1010struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
1011				  struct ib_srq_init_attr *srq_init_attr,
1012				  struct ib_usrq_object *uobject,
1013				  struct ib_udata *udata)
1014{
1015	struct ib_srq *srq;
1016	int ret;
1017
1018	srq = rdma_zalloc_drv_obj(pd->device, ib_srq);
1019	if (!srq)
1020		return ERR_PTR(-ENOMEM);
1021
1022	srq->device = pd->device;
1023	srq->pd = pd;
1024	srq->event_handler = srq_init_attr->event_handler;
1025	srq->srq_context = srq_init_attr->srq_context;
1026	srq->srq_type = srq_init_attr->srq_type;
1027	srq->uobject = uobject;
1028
1029	if (ib_srq_has_cq(srq->srq_type)) {
1030		srq->ext.cq = srq_init_attr->ext.cq;
1031		atomic_inc(&srq->ext.cq->usecnt);
1032	}
1033	if (srq->srq_type == IB_SRQT_XRC) {
1034		srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
1035		atomic_inc(&srq->ext.xrc.xrcd->usecnt);
1036	}
1037	atomic_inc(&pd->usecnt);
1038
1039	ret = pd->device->ops.create_srq(srq, srq_init_attr, udata);
1040	if (ret) {
1041		atomic_dec(&srq->pd->usecnt);
1042		if (srq->srq_type == IB_SRQT_XRC)
1043			atomic_dec(&srq->ext.xrc.xrcd->usecnt);
1044		if (ib_srq_has_cq(srq->srq_type))
1045			atomic_dec(&srq->ext.cq->usecnt);
1046		kfree(srq);
1047		return ERR_PTR(ret);
1048	}
1049
1050	return srq;
1051}
1052EXPORT_SYMBOL(ib_create_srq_user);
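
/*
 * Usage sketch (illustrative only): kernel users create a basic SRQ through
 * the ib_create_srq() wrapper (NULL uobject/udata). "pd" is hypothetical and
 * the requested sizes are arbitrary examples.
 *
 *	struct ib_srq_init_attr srq_attr = {
 *		.attr		= { .max_wr = 256, .max_sge = 1 },
 *		.srq_type	= IB_SRQT_BASIC,
 *	};
 *	struct ib_srq *srq;
 *
 *	srq = ib_create_srq(pd, &srq_attr);
 *	if (IS_ERR(srq))
 *		return PTR_ERR(srq);
 *
 * On return srq_attr.attr holds the actual capacities; the SRQ is torn down
 * with ib_destroy_srq(srq) once no QP references it.
 */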
1053
1054int ib_modify_srq(struct ib_srq *srq,
1055		  struct ib_srq_attr *srq_attr,
1056		  enum ib_srq_attr_mask srq_attr_mask)
1057{
1058	return srq->device->ops.modify_srq ?
1059		srq->device->ops.modify_srq(srq, srq_attr, srq_attr_mask,
1060					    NULL) : -EOPNOTSUPP;
1061}
1062EXPORT_SYMBOL(ib_modify_srq);
1063
1064int ib_query_srq(struct ib_srq *srq,
1065		 struct ib_srq_attr *srq_attr)
1066{
1067	return srq->device->ops.query_srq ?
1068		srq->device->ops.query_srq(srq, srq_attr) : -EOPNOTSUPP;
1069}
1070EXPORT_SYMBOL(ib_query_srq);
1071
1072int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)
1073{
1074	int ret;
1075
1076	if (atomic_read(&srq->usecnt))
1077		return -EBUSY;
1078
1079	ret = srq->device->ops.destroy_srq(srq, udata);
1080	if (ret)
1081		return ret;
1082
1083	atomic_dec(&srq->pd->usecnt);
1084	if (srq->srq_type == IB_SRQT_XRC)
1085		atomic_dec(&srq->ext.xrc.xrcd->usecnt);
1086	if (ib_srq_has_cq(srq->srq_type))
1087		atomic_dec(&srq->ext.cq->usecnt);
1088	kfree(srq);
1089
1090	return ret;
1091}
1092EXPORT_SYMBOL(ib_destroy_srq_user);
1093
1094/* Queue pairs */
1095
1096static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
1097{
1098	struct ib_qp *qp = context;
1099	unsigned long flags;
1100
1101	spin_lock_irqsave(&qp->device->qp_open_list_lock, flags);
1102	list_for_each_entry(event->element.qp, &qp->open_list, open_list)
1103		if (event->element.qp->event_handler)
1104			event->element.qp->event_handler(event, event->element.qp->qp_context);
1105	spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags);
1106}
1107
1108static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
1109				  void (*event_handler)(struct ib_event *, void *),
1110				  void *qp_context)
1111{
1112	struct ib_qp *qp;
1113	unsigned long flags;
1114	int err;
1115
1116	qp = kzalloc(sizeof *qp, GFP_KERNEL);
1117	if (!qp)
1118		return ERR_PTR(-ENOMEM);
1119
1120	qp->real_qp = real_qp;
1121	err = ib_open_shared_qp_security(qp, real_qp->device);
1122	if (err) {
1123		kfree(qp);
1124		return ERR_PTR(err);
1125	}
1126
1128	atomic_inc(&real_qp->usecnt);
1129	qp->device = real_qp->device;
1130	qp->event_handler = event_handler;
1131	qp->qp_context = qp_context;
1132	qp->qp_num = real_qp->qp_num;
1133	qp->qp_type = real_qp->qp_type;
1134
1135	spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags);
1136	list_add(&qp->open_list, &real_qp->open_list);
1137	spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags);
1138
1139	return qp;
1140}
1141
1142struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
1143			 struct ib_qp_open_attr *qp_open_attr)
1144{
1145	struct ib_qp *qp, *real_qp;
1146
1147	if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
1148		return ERR_PTR(-EINVAL);
1149
1150	down_read(&xrcd->tgt_qps_rwsem);
1151	real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num);
1152	if (!real_qp) {
1153		up_read(&xrcd->tgt_qps_rwsem);
1154		return ERR_PTR(-EINVAL);
1155	}
1156	qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
1157			  qp_open_attr->qp_context);
1158	up_read(&xrcd->tgt_qps_rwsem);
1159	return qp;
1160}
1161EXPORT_SYMBOL(ib_open_qp);
1162
1163static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
1164					struct ib_qp_init_attr *qp_init_attr)
1165{
1166	struct ib_qp *real_qp = qp;
1167	int err;
1168
1169	qp->event_handler = __ib_shared_qp_event_handler;
1170	qp->qp_context = qp;
1171	qp->pd = NULL;
1172	qp->send_cq = qp->recv_cq = NULL;
1173	qp->srq = NULL;
1174	qp->xrcd = qp_init_attr->xrcd;
1175	atomic_inc(&qp_init_attr->xrcd->usecnt);
1176	INIT_LIST_HEAD(&qp->open_list);
1177
1178	qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
1179			  qp_init_attr->qp_context);
1180	if (IS_ERR(qp))
1181		return qp;
1182
1183	err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num,
1184			      real_qp, GFP_KERNEL));
1185	if (err) {
1186		ib_close_qp(qp);
1187		return ERR_PTR(err);
1188	}
1189	return qp;
1190}
1191
1192/**
1193 * ib_create_qp - Creates a kernel QP associated with the specified protection
1194 *   domain.
1195 * @pd: The protection domain associated with the QP.
1196 * @qp_init_attr: A list of initial attributes required to create the
1197 *   QP.  If QP creation succeeds, then the attributes are updated to
1198 *   the actual capabilities of the created QP.
1199 *
1200 * NOTE: for user qp use ib_create_qp_user with valid udata!
1201 */
1202struct ib_qp *ib_create_qp(struct ib_pd *pd,
1203			   struct ib_qp_init_attr *qp_init_attr)
1204{
1205	struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
1206	struct ib_qp *qp;
1207	int ret;
1208
1209	if (qp_init_attr->rwq_ind_tbl &&
1210	    (qp_init_attr->recv_cq ||
1211	    qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
1212	    qp_init_attr->cap.max_recv_sge))
1213		return ERR_PTR(-EINVAL);
1214
1215	if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) &&
1216	    !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER))
1217		return ERR_PTR(-EINVAL);
1218
1219	/*
	 * If the caller is using the RDMA API, calculate the resources
1221	 * needed for the RDMA READ/WRITE operations.
1222	 *
1223	 * Note that these callers need to pass in a port number.
1224	 */
1225	if (qp_init_attr->cap.max_rdma_ctxs)
1226		rdma_rw_init_qp(device, qp_init_attr);
1227
1228	qp = _ib_create_qp(device, pd, qp_init_attr, NULL, NULL);
1229	if (IS_ERR(qp))
1230		return qp;
1231
1232	ret = ib_create_qp_security(qp, device);
1233	if (ret)
1234		goto err;
1235
1236	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
1237		struct ib_qp *xrc_qp =
1238			create_xrc_qp_user(qp, qp_init_attr);
1239
1240		if (IS_ERR(xrc_qp)) {
1241			ret = PTR_ERR(xrc_qp);
1242			goto err;
1243		}
1244		return xrc_qp;
1245	}
1246
1247	qp->event_handler = qp_init_attr->event_handler;
1248	qp->qp_context = qp_init_attr->qp_context;
1249	if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
1250		qp->recv_cq = NULL;
1251		qp->srq = NULL;
1252	} else {
1253		qp->recv_cq = qp_init_attr->recv_cq;
1254		if (qp_init_attr->recv_cq)
1255			atomic_inc(&qp_init_attr->recv_cq->usecnt);
1256		qp->srq = qp_init_attr->srq;
1257		if (qp->srq)
1258			atomic_inc(&qp_init_attr->srq->usecnt);
1259	}
1260
1261	qp->send_cq = qp_init_attr->send_cq;
1262	qp->xrcd    = NULL;
1263
1264	atomic_inc(&pd->usecnt);
1265	if (qp_init_attr->send_cq)
1266		atomic_inc(&qp_init_attr->send_cq->usecnt);
1267	if (qp_init_attr->rwq_ind_tbl)
1268		atomic_inc(&qp->rwq_ind_tbl->usecnt);
1269
1270	if (qp_init_attr->cap.max_rdma_ctxs) {
1271		ret = rdma_rw_init_mrs(qp, qp_init_attr);
1272		if (ret)
1273			goto err;
1274	}
1275
1276	/*
1277	 * Note: all hw drivers guarantee that max_send_sge is lower than
1278	 * the device RDMA WRITE SGE limit but not all hw drivers ensure that
1279	 * max_send_sge <= max_sge_rd.
1280	 */
1281	qp->max_write_sge = qp_init_attr->cap.max_send_sge;
1282	qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
1283				 device->attrs.max_sge_rd);
1284	if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
1285		qp->integrity_en = true;
1286
1287	return qp;
1288
1289err:
1290	ib_destroy_qp(qp);
1291	return ERR_PTR(ret);
1292
1293}
1294EXPORT_SYMBOL(ib_create_qp);
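
/*
 * Usage sketch (illustrative only): a minimal kernel RC QP. "pd", "send_cq"
 * and "recv_cq" are hypothetical objects (CQs typically come from
 * ib_alloc_cq()); the capacities are arbitrary examples.
 *
 *	struct ib_qp_init_attr init_attr = {
 *		.qp_type	= IB_QPT_RC,
 *		.send_cq	= send_cq,
 *		.recv_cq	= recv_cq,
 *		.sq_sig_type	= IB_SIGNAL_REQ_WR,
 *		.cap		= {
 *			.max_send_wr	= 64,
 *			.max_recv_wr	= 64,
 *			.max_send_sge	= 1,
 *			.max_recv_sge	= 1,
 *		},
 *	};
 *	struct ib_qp *qp;
 *
 *	qp = ib_create_qp(pd, &init_attr);
 *	if (IS_ERR(qp))
 *		return PTR_ERR(qp);
 *
 * The QP is released with ib_destroy_qp(qp) once it is drained and unused.
 */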
1295
1296static const struct {
1297	int			valid;
1298	enum ib_qp_attr_mask	req_param[IB_QPT_MAX];
1299	enum ib_qp_attr_mask	opt_param[IB_QPT_MAX];
1300} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
1301	[IB_QPS_RESET] = {
1302		[IB_QPS_RESET] = { .valid = 1 },
1303		[IB_QPS_INIT]  = {
1304			.valid = 1,
1305			.req_param = {
1306				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
1307						IB_QP_PORT			|
1308						IB_QP_QKEY),
1309				[IB_QPT_RAW_PACKET] = IB_QP_PORT,
1310				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
1311						IB_QP_PORT			|
1312						IB_QP_ACCESS_FLAGS),
1313				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
1314						IB_QP_PORT			|
1315						IB_QP_ACCESS_FLAGS),
1316				[IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX		|
1317						IB_QP_PORT			|
1318						IB_QP_ACCESS_FLAGS),
1319				[IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX		|
1320						IB_QP_PORT			|
1321						IB_QP_ACCESS_FLAGS),
1322				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
1323						IB_QP_QKEY),
1324				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
1325						IB_QP_QKEY),
1326			}
1327		},
1328	},
1329	[IB_QPS_INIT]  = {
1330		[IB_QPS_RESET] = { .valid = 1 },
1331		[IB_QPS_ERR] =   { .valid = 1 },
1332		[IB_QPS_INIT]  = {
1333			.valid = 1,
1334			.opt_param = {
1335				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
1336						IB_QP_PORT			|
1337						IB_QP_QKEY),
1338				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
1339						IB_QP_PORT			|
1340						IB_QP_ACCESS_FLAGS),
1341				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
1342						IB_QP_PORT			|
1343						IB_QP_ACCESS_FLAGS),
1344				[IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX		|
1345						IB_QP_PORT			|
1346						IB_QP_ACCESS_FLAGS),
1347				[IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX		|
1348						IB_QP_PORT			|
1349						IB_QP_ACCESS_FLAGS),
1350				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
1351						IB_QP_QKEY),
1352				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
1353						IB_QP_QKEY),
1354			}
1355		},
1356		[IB_QPS_RTR]   = {
1357			.valid = 1,
1358			.req_param = {
1359				[IB_QPT_UC]  = (IB_QP_AV			|
1360						IB_QP_PATH_MTU			|
1361						IB_QP_DEST_QPN			|
1362						IB_QP_RQ_PSN),
1363				[IB_QPT_RC]  = (IB_QP_AV			|
1364						IB_QP_PATH_MTU			|
1365						IB_QP_DEST_QPN			|
1366						IB_QP_RQ_PSN			|
1367						IB_QP_MAX_DEST_RD_ATOMIC	|
1368						IB_QP_MIN_RNR_TIMER),
1369				[IB_QPT_XRC_INI] = (IB_QP_AV			|
1370						IB_QP_PATH_MTU			|
1371						IB_QP_DEST_QPN			|
1372						IB_QP_RQ_PSN),
1373				[IB_QPT_XRC_TGT] = (IB_QP_AV			|
1374						IB_QP_PATH_MTU			|
1375						IB_QP_DEST_QPN			|
1376						IB_QP_RQ_PSN			|
1377						IB_QP_MAX_DEST_RD_ATOMIC	|
1378						IB_QP_MIN_RNR_TIMER),
1379			},
1380			.opt_param = {
1381				 [IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
1382						 IB_QP_QKEY),
1383				 [IB_QPT_UC]  = (IB_QP_ALT_PATH			|
1384						 IB_QP_ACCESS_FLAGS		|
1385						 IB_QP_PKEY_INDEX),
1386				 [IB_QPT_RC]  = (IB_QP_ALT_PATH			|
1387						 IB_QP_ACCESS_FLAGS		|
1388						 IB_QP_PKEY_INDEX),
1389				 [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH		|
1390						 IB_QP_ACCESS_FLAGS		|
1391						 IB_QP_PKEY_INDEX),
1392				 [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH		|
1393						 IB_QP_ACCESS_FLAGS		|
1394						 IB_QP_PKEY_INDEX),
1395				 [IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
1396						 IB_QP_QKEY),
1397				 [IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
1398						 IB_QP_QKEY),
1399			 },
1400		},
1401	},
1402	[IB_QPS_RTR]   = {
1403		[IB_QPS_RESET] = { .valid = 1 },
1404		[IB_QPS_ERR] =   { .valid = 1 },
1405		[IB_QPS_RTS]   = {
1406			.valid = 1,
1407			.req_param = {
1408				[IB_QPT_UD]  = IB_QP_SQ_PSN,
1409				[IB_QPT_UC]  = IB_QP_SQ_PSN,
1410				[IB_QPT_RC]  = (IB_QP_TIMEOUT			|
1411						IB_QP_RETRY_CNT			|
1412						IB_QP_RNR_RETRY			|
1413						IB_QP_SQ_PSN			|
1414						IB_QP_MAX_QP_RD_ATOMIC),
1415				[IB_QPT_XRC_INI] = (IB_QP_TIMEOUT		|
1416						IB_QP_RETRY_CNT			|
1417						IB_QP_RNR_RETRY			|
1418						IB_QP_SQ_PSN			|
1419						IB_QP_MAX_QP_RD_ATOMIC),
1420				[IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT		|
1421						IB_QP_SQ_PSN),
1422				[IB_QPT_SMI] = IB_QP_SQ_PSN,
1423				[IB_QPT_GSI] = IB_QP_SQ_PSN,
1424			},
1425			.opt_param = {
1426				 [IB_QPT_UD]  = (IB_QP_CUR_STATE		|
1427						 IB_QP_QKEY),
1428				 [IB_QPT_UC]  = (IB_QP_CUR_STATE		|
1429						 IB_QP_ALT_PATH			|
1430						 IB_QP_ACCESS_FLAGS		|
1431						 IB_QP_PATH_MIG_STATE),
1432				 [IB_QPT_RC]  = (IB_QP_CUR_STATE		|
1433						 IB_QP_ALT_PATH			|
1434						 IB_QP_ACCESS_FLAGS		|
1435						 IB_QP_MIN_RNR_TIMER		|
1436						 IB_QP_PATH_MIG_STATE),
1437				 [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE		|
1438						 IB_QP_ALT_PATH			|
1439						 IB_QP_ACCESS_FLAGS		|
1440						 IB_QP_PATH_MIG_STATE),
1441				 [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE		|
1442						 IB_QP_ALT_PATH			|
1443						 IB_QP_ACCESS_FLAGS		|
1444						 IB_QP_MIN_RNR_TIMER		|
1445						 IB_QP_PATH_MIG_STATE),
1446				 [IB_QPT_SMI] = (IB_QP_CUR_STATE		|
1447						 IB_QP_QKEY),
1448				 [IB_QPT_GSI] = (IB_QP_CUR_STATE		|
1449						 IB_QP_QKEY),
1450				 [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT,
1451			 }
1452		}
1453	},
1454	[IB_QPS_RTS]   = {
1455		[IB_QPS_RESET] = { .valid = 1 },
1456		[IB_QPS_ERR] =   { .valid = 1 },
1457		[IB_QPS_RTS]   = {
1458			.valid = 1,
1459			.opt_param = {
1460				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
1461						IB_QP_QKEY),
1462				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
1463						IB_QP_ACCESS_FLAGS		|
1464						IB_QP_ALT_PATH			|
1465						IB_QP_PATH_MIG_STATE),
1466				[IB_QPT_RC]  = (IB_QP_CUR_STATE			|
1467						IB_QP_ACCESS_FLAGS		|
1468						IB_QP_ALT_PATH			|
1469						IB_QP_PATH_MIG_STATE		|
1470						IB_QP_MIN_RNR_TIMER),
1471				[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE		|
1472						IB_QP_ACCESS_FLAGS		|
1473						IB_QP_ALT_PATH			|
1474						IB_QP_PATH_MIG_STATE),
1475				[IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE		|
1476						IB_QP_ACCESS_FLAGS		|
1477						IB_QP_ALT_PATH			|
1478						IB_QP_PATH_MIG_STATE		|
1479						IB_QP_MIN_RNR_TIMER),
1480				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
1481						IB_QP_QKEY),
1482				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
1483						IB_QP_QKEY),
1484				[IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT,
1485			}
1486		},
1487		[IB_QPS_SQD]   = {
1488			.valid = 1,
1489			.opt_param = {
1490				[IB_QPT_UD]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
1491				[IB_QPT_UC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
1492				[IB_QPT_RC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
1493				[IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
1494				[IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
1495				[IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
1496				[IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
1497			}
1498		},
1499	},
1500	[IB_QPS_SQD]   = {
1501		[IB_QPS_RESET] = { .valid = 1 },
1502		[IB_QPS_ERR] =   { .valid = 1 },
1503		[IB_QPS_RTS]   = {
1504			.valid = 1,
1505			.opt_param = {
1506				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
1507						IB_QP_QKEY),
1508				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
1509						IB_QP_ALT_PATH			|
1510						IB_QP_ACCESS_FLAGS		|
1511						IB_QP_PATH_MIG_STATE),
1512				[IB_QPT_RC]  = (IB_QP_CUR_STATE			|
1513						IB_QP_ALT_PATH			|
1514						IB_QP_ACCESS_FLAGS		|
1515						IB_QP_MIN_RNR_TIMER		|
1516						IB_QP_PATH_MIG_STATE),
1517				[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE		|
1518						IB_QP_ALT_PATH			|
1519						IB_QP_ACCESS_FLAGS		|
1520						IB_QP_PATH_MIG_STATE),
1521				[IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE		|
1522						IB_QP_ALT_PATH			|
1523						IB_QP_ACCESS_FLAGS		|
1524						IB_QP_MIN_RNR_TIMER		|
1525						IB_QP_PATH_MIG_STATE),
1526				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
1527						IB_QP_QKEY),
1528				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
1529						IB_QP_QKEY),
1530			}
1531		},
1532		[IB_QPS_SQD]   = {
1533			.valid = 1,
1534			.opt_param = {
1535				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
1536						IB_QP_QKEY),
1537				[IB_QPT_UC]  = (IB_QP_AV			|
1538						IB_QP_ALT_PATH			|
1539						IB_QP_ACCESS_FLAGS		|
1540						IB_QP_PKEY_INDEX		|
1541						IB_QP_PATH_MIG_STATE),
1542				[IB_QPT_RC]  = (IB_QP_PORT			|
1543						IB_QP_AV			|
1544						IB_QP_TIMEOUT			|
1545						IB_QP_RETRY_CNT			|
1546						IB_QP_RNR_RETRY			|
1547						IB_QP_MAX_QP_RD_ATOMIC		|
1548						IB_QP_MAX_DEST_RD_ATOMIC	|
1549						IB_QP_ALT_PATH			|
1550						IB_QP_ACCESS_FLAGS		|
1551						IB_QP_PKEY_INDEX		|
1552						IB_QP_MIN_RNR_TIMER		|
1553						IB_QP_PATH_MIG_STATE),
1554				[IB_QPT_XRC_INI] = (IB_QP_PORT			|
1555						IB_QP_AV			|
1556						IB_QP_TIMEOUT			|
1557						IB_QP_RETRY_CNT			|
1558						IB_QP_RNR_RETRY			|
1559						IB_QP_MAX_QP_RD_ATOMIC		|
1560						IB_QP_ALT_PATH			|
1561						IB_QP_ACCESS_FLAGS		|
1562						IB_QP_PKEY_INDEX		|
1563						IB_QP_PATH_MIG_STATE),
1564				[IB_QPT_XRC_TGT] = (IB_QP_PORT			|
1565						IB_QP_AV			|
1566						IB_QP_TIMEOUT			|
1567						IB_QP_MAX_DEST_RD_ATOMIC	|
1568						IB_QP_ALT_PATH			|
1569						IB_QP_ACCESS_FLAGS		|
1570						IB_QP_PKEY_INDEX		|
1571						IB_QP_MIN_RNR_TIMER		|
1572						IB_QP_PATH_MIG_STATE),
1573				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
1574						IB_QP_QKEY),
1575				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
1576						IB_QP_QKEY),
1577			}
1578		}
1579	},
1580	[IB_QPS_SQE]   = {
1581		[IB_QPS_RESET] = { .valid = 1 },
1582		[IB_QPS_ERR] =   { .valid = 1 },
1583		[IB_QPS_RTS]   = {
1584			.valid = 1,
1585			.opt_param = {
1586				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
1587						IB_QP_QKEY),
1588				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
1589						IB_QP_ACCESS_FLAGS),
1590				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
1591						IB_QP_QKEY),
1592				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
1593						IB_QP_QKEY),
1594			}
1595		}
1596	},
1597	[IB_QPS_ERR] = {
1598		[IB_QPS_RESET] = { .valid = 1 },
1599		[IB_QPS_ERR] =   { .valid = 1 }
1600	}
1601};
1602
1603bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
1604			enum ib_qp_type type, enum ib_qp_attr_mask mask)
1605{
1606	enum ib_qp_attr_mask req_param, opt_param;
1607
1608	if (mask & IB_QP_CUR_STATE  &&
1609	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
1610	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
1611		return false;
1612
1613	if (!qp_state_table[cur_state][next_state].valid)
1614		return false;
1615
1616	req_param = qp_state_table[cur_state][next_state].req_param[type];
1617	opt_param = qp_state_table[cur_state][next_state].opt_param[type];
1618
1619	if ((mask & req_param) != req_param)
1620		return false;
1621
1622	if (mask & ~(req_param | opt_param | IB_QP_STATE))
1623		return false;
1624
1625	return true;
1626}
1627EXPORT_SYMBOL(ib_modify_qp_is_ok);
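
/*
 * For example, per the table above, driving an RC QP from RESET to INIT must
 * carry at least IB_QP_PKEY_INDEX, IB_QP_PORT and IB_QP_ACCESS_FLAGS, so a
 * driver-side validity check might look like (illustrative only):
 *
 *	if (!ib_modify_qp_is_ok(IB_QPS_RESET, IB_QPS_INIT, IB_QPT_RC,
 *				IB_QP_STATE | IB_QP_PKEY_INDEX |
 *				IB_QP_PORT | IB_QP_ACCESS_FLAGS))
 *		return -EINVAL;
 */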
1628
1629/**
1630 * ib_resolve_eth_dmac - Resolve destination mac address
1631 * @device:		Device to consider
1632 * @ah_attr:		address handle attribute which describes the
1633 *			source and destination parameters
 * ib_resolve_eth_dmac() resolves the destination MAC address and L3 hop limit.
 * It returns 0 on success or an appropriate error code, and initializes the
 * necessary ah_attr fields when the call is successful.
1637 */
1638static int ib_resolve_eth_dmac(struct ib_device *device,
1639			       struct rdma_ah_attr *ah_attr)
1640{
1641	int ret = 0;
1642
1643	if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) {
1644		if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) {
1645			__be32 addr = 0;
1646
1647			memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4);
1648			ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac);
1649		} else {
1650			ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw,
1651					(char *)ah_attr->roce.dmac);
1652		}
1653	} else {
1654		ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
1655	}
1656	return ret;
1657}
1658
1659static bool is_qp_type_connected(const struct ib_qp *qp)
1660{
1661	return (qp->qp_type == IB_QPT_UC ||
1662		qp->qp_type == IB_QPT_RC ||
1663		qp->qp_type == IB_QPT_XRC_INI ||
1664		qp->qp_type == IB_QPT_XRC_TGT);
1665}
1666
/*
 * IB core internal function to perform QP attribute modification.
 */
1670static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
1671			 int attr_mask, struct ib_udata *udata)
1672{
1673	u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
1674	const struct ib_gid_attr *old_sgid_attr_av;
1675	const struct ib_gid_attr *old_sgid_attr_alt_av;
1676	int ret;
1677
1678	attr->xmit_slave = NULL;
1679	if (attr_mask & IB_QP_AV) {
1680		ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr,
1681					  &old_sgid_attr_av);
1682		if (ret)
1683			return ret;
1684
1685		if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE &&
1686		    is_qp_type_connected(qp)) {
1687			struct net_device *slave;
1688
1689			/*
1690			 * If the user provided the qp_attr then we have to
			 * resolve it. Kernel users have to provide already
1692			 * resolved rdma_ah_attr's.
1693			 */
1694			if (udata) {
1695				ret = ib_resolve_eth_dmac(qp->device,
1696							  &attr->ah_attr);
1697				if (ret)
1698					goto out_av;
1699			}
1700			slave = rdma_lag_get_ah_roce_slave(qp->device,
1701							   &attr->ah_attr,
1702							   GFP_KERNEL);
1703			if (IS_ERR(slave)) {
1704				ret = PTR_ERR(slave);
1705				goto out_av;
1706			}
1707			attr->xmit_slave = slave;
1708		}
1709	}
1710	if (attr_mask & IB_QP_ALT_PATH) {
1711		/*
1712		 * FIXME: This does not track the migration state, so if the
1713		 * user loads a new alternate path after the HW has migrated
1714		 * from primary->alternate we will keep the wrong
1715		 * references. This is OK for IB because the reference
1716		 * counting does not serve any functional purpose.
1717		 */
1718		ret = rdma_fill_sgid_attr(qp->device, &attr->alt_ah_attr,
1719					  &old_sgid_attr_alt_av);
1720		if (ret)
1721			goto out_av;
1722
1723		/*
1724		 * Today the core code can only handle alternate paths and APM
1725		 * for IB. Ban them in roce mode.
1726		 */
1727		if (!(rdma_protocol_ib(qp->device,
1728				       attr->alt_ah_attr.port_num) &&
1729		      rdma_protocol_ib(qp->device, port))) {
1730			ret = -EINVAL;
1731			goto out;
1732		}
1733	}
1734
1735	if (rdma_ib_or_roce(qp->device, port)) {
1736		if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) {
1737			dev_warn(&qp->device->dev,
1738				 "%s rq_psn overflow, masking to 24 bits\n",
1739				 __func__);
1740			attr->rq_psn &= 0xffffff;
1741		}
1742
1743		if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) {
1744			dev_warn(&qp->device->dev,
				 "%s sq_psn overflow, masking to 24 bits\n",
1746				 __func__);
1747			attr->sq_psn &= 0xffffff;
1748		}
1749	}
1750
1751	/*
1752	 * Bind this qp to a counter automatically based on the rdma counter
	 * rules. This is only done on the RESET to INIT transition when a
	 * port is specified.
1754	 */
1755	if (!qp->counter && (attr_mask & IB_QP_PORT) &&
1756	    ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT))
1757		rdma_counter_bind_qp_auto(qp, attr->port_num);
1758
1759	ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
1760	if (ret)
1761		goto out;
1762
1763	if (attr_mask & IB_QP_PORT)
1764		qp->port = attr->port_num;
1765	if (attr_mask & IB_QP_AV)
1766		qp->av_sgid_attr =
1767			rdma_update_sgid_attr(&attr->ah_attr, qp->av_sgid_attr);
1768	if (attr_mask & IB_QP_ALT_PATH)
1769		qp->alt_path_sgid_attr = rdma_update_sgid_attr(
1770			&attr->alt_ah_attr, qp->alt_path_sgid_attr);
1771
1772out:
1773	if (attr_mask & IB_QP_ALT_PATH)
1774		rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av);
1775out_av:
1776	if (attr_mask & IB_QP_AV) {
1777		rdma_lag_put_ah_roce_slave(attr->xmit_slave);
1778		rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av);
1779	}
1780	return ret;
1781}
1782
1783/**
1784 * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
1785 * @ib_qp: The QP to modify.
1786 * @attr: On input, specifies the QP attributes to modify.  On output,
1787 *   the current values of selected QP attributes are returned.
1788 * @attr_mask: A bit-mask used to specify which attributes of the QP
1789 *   are being modified.
 * @udata: pointer to user's input/output buffer information
1792 * It returns 0 on success and returns appropriate error code on error.
1793 */
1794int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr,
1795			    int attr_mask, struct ib_udata *udata)
1796{
1797	return _ib_modify_qp(ib_qp->real_qp, attr, attr_mask, udata);
1798}
1799EXPORT_SYMBOL(ib_modify_qp_with_udata);
1800
1801int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u16 *speed, u8 *width)
1802{
1803	int rc;
1804	u32 netdev_speed;
1805	struct net_device *netdev;
1806	struct ethtool_link_ksettings lksettings;
1807
1808	if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
1809		return -EINVAL;
1810
1811	netdev = ib_device_get_netdev(dev, port_num);
1812	if (!netdev)
1813		return -ENODEV;
1814
1815	rtnl_lock();
1816	rc = __ethtool_get_link_ksettings(netdev, &lksettings);
1817	rtnl_unlock();
1818
1819	dev_put(netdev);
1820
1821	if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) {
1822		netdev_speed = lksettings.base.speed;
1823	} else {
1824		netdev_speed = SPEED_1000;
1825		pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name,
1826			netdev_speed);
1827	}
1828
1829	if (netdev_speed <= SPEED_1000) {
1830		*width = IB_WIDTH_1X;
1831		*speed = IB_SPEED_SDR;
1832	} else if (netdev_speed <= SPEED_10000) {
1833		*width = IB_WIDTH_1X;
1834		*speed = IB_SPEED_FDR10;
1835	} else if (netdev_speed <= SPEED_20000) {
1836		*width = IB_WIDTH_4X;
1837		*speed = IB_SPEED_DDR;
1838	} else if (netdev_speed <= SPEED_25000) {
1839		*width = IB_WIDTH_1X;
1840		*speed = IB_SPEED_EDR;
1841	} else if (netdev_speed <= SPEED_40000) {
1842		*width = IB_WIDTH_4X;
1843		*speed = IB_SPEED_FDR10;
1844	} else {
1845		*width = IB_WIDTH_4X;
1846		*speed = IB_SPEED_EDR;
1847	}
1848
1849	return 0;
1850}
1851EXPORT_SYMBOL(ib_get_eth_speed);
1852
1853int ib_modify_qp(struct ib_qp *qp,
1854		 struct ib_qp_attr *qp_attr,
1855		 int qp_attr_mask)
1856{
1857	return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
1858}
1859EXPORT_SYMBOL(ib_modify_qp);
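
/*
 * Example (illustrative sketch only, not used by this file): a kernel ULP
 * typically moves a freshly created QP out of RESET with a call such as the
 * one below.  The exact attribute mask depends on the QP type and the target
 * state (see ib_modify_qp_is_ok()); all variable names here are placeholders.
 *
 *	struct ib_qp_attr attr = {
 *		.qp_state	 = IB_QPS_INIT,
 *		.pkey_index	 = 0,
 *		.port_num	 = 1,
 *		.qp_access_flags = IB_ACCESS_REMOTE_WRITE,
 *	};
 *
 *	ret = ib_modify_qp(qp, &attr,
 *			   IB_QP_STATE | IB_QP_PKEY_INDEX |
 *			   IB_QP_PORT | IB_QP_ACCESS_FLAGS);
 */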
1860
1861int ib_query_qp(struct ib_qp *qp,
1862		struct ib_qp_attr *qp_attr,
1863		int qp_attr_mask,
1864		struct ib_qp_init_attr *qp_init_attr)
1865{
1866	qp_attr->ah_attr.grh.sgid_attr = NULL;
1867	qp_attr->alt_ah_attr.grh.sgid_attr = NULL;
1868
1869	return qp->device->ops.query_qp ?
1870		qp->device->ops.query_qp(qp->real_qp, qp_attr, qp_attr_mask,
1871					 qp_init_attr) : -EOPNOTSUPP;
1872}
1873EXPORT_SYMBOL(ib_query_qp);
1874
1875int ib_close_qp(struct ib_qp *qp)
1876{
1877	struct ib_qp *real_qp;
1878	unsigned long flags;
1879
1880	real_qp = qp->real_qp;
1881	if (real_qp == qp)
1882		return -EINVAL;
1883
1884	spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags);
1885	list_del(&qp->open_list);
1886	spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags);
1887
1888	atomic_dec(&real_qp->usecnt);
1889	if (qp->qp_sec)
1890		ib_close_shared_qp_security(qp->qp_sec);
1891	kfree(qp);
1892
1893	return 0;
1894}
1895EXPORT_SYMBOL(ib_close_qp);
1896
1897static int __ib_destroy_shared_qp(struct ib_qp *qp)
1898{
1899	struct ib_xrcd *xrcd;
1900	struct ib_qp *real_qp;
1901	int ret;
1902
1903	real_qp = qp->real_qp;
1904	xrcd = real_qp->xrcd;
1905	down_write(&xrcd->tgt_qps_rwsem);
1906	ib_close_qp(qp);
1907	if (atomic_read(&real_qp->usecnt) == 0)
1908		xa_erase(&xrcd->tgt_qps, real_qp->qp_num);
1909	else
1910		real_qp = NULL;
1911	up_write(&xrcd->tgt_qps_rwsem);
1912
1913	if (real_qp) {
1914		ret = ib_destroy_qp(real_qp);
1915		if (!ret)
1916			atomic_dec(&xrcd->usecnt);
1917	}
1918
1919	return 0;
1920}
1921
1922int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
1923{
1924	const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;
1925	const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr;
1926	struct ib_pd *pd;
1927	struct ib_cq *scq, *rcq;
1928	struct ib_srq *srq;
1929	struct ib_rwq_ind_table *ind_tbl;
1930	struct ib_qp_security *sec;
1931	int ret;
1932
1933	WARN_ON_ONCE(qp->mrs_used > 0);
1934
1935	if (atomic_read(&qp->usecnt))
1936		return -EBUSY;
1937
1938	if (qp->real_qp != qp)
1939		return __ib_destroy_shared_qp(qp);
1940
1941	pd   = qp->pd;
1942	scq  = qp->send_cq;
1943	rcq  = qp->recv_cq;
1944	srq  = qp->srq;
1945	ind_tbl = qp->rwq_ind_tbl;
1946	sec  = qp->qp_sec;
1947	if (sec)
1948		ib_destroy_qp_security_begin(sec);
1949
1950	if (!qp->uobject)
1951		rdma_rw_cleanup_mrs(qp);
1952
1953	rdma_counter_unbind_qp(qp, true);
1954	rdma_restrack_del(&qp->res);
1955	ret = qp->device->ops.destroy_qp(qp, udata);
1956	if (!ret) {
1957		if (alt_path_sgid_attr)
1958			rdma_put_gid_attr(alt_path_sgid_attr);
1959		if (av_sgid_attr)
1960			rdma_put_gid_attr(av_sgid_attr);
1961		if (pd)
1962			atomic_dec(&pd->usecnt);
1963		if (scq)
1964			atomic_dec(&scq->usecnt);
1965		if (rcq)
1966			atomic_dec(&rcq->usecnt);
1967		if (srq)
1968			atomic_dec(&srq->usecnt);
1969		if (ind_tbl)
1970			atomic_dec(&ind_tbl->usecnt);
1971		if (sec)
1972			ib_destroy_qp_security_end(sec);
1973	} else {
1974		if (sec)
1975			ib_destroy_qp_security_abort(sec);
1976	}
1977
1978	return ret;
1979}
1980EXPORT_SYMBOL(ib_destroy_qp_user);
1981
1982/* Completion queues */
1983
1984struct ib_cq *__ib_create_cq(struct ib_device *device,
1985			     ib_comp_handler comp_handler,
1986			     void (*event_handler)(struct ib_event *, void *),
1987			     void *cq_context,
1988			     const struct ib_cq_init_attr *cq_attr,
1989			     const char *caller)
1990{
1991	struct ib_cq *cq;
1992	int ret;
1993
1994	cq = rdma_zalloc_drv_obj(device, ib_cq);
1995	if (!cq)
1996		return ERR_PTR(-ENOMEM);
1997
1998	cq->device = device;
1999	cq->uobject = NULL;
2000	cq->comp_handler = comp_handler;
2001	cq->event_handler = event_handler;
2002	cq->cq_context = cq_context;
2003	atomic_set(&cq->usecnt, 0);
2004
2005	rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
2006	rdma_restrack_set_name(&cq->res, caller);
2007
2008	ret = device->ops.create_cq(cq, cq_attr, NULL);
2009	if (ret) {
2010		rdma_restrack_put(&cq->res);
2011		kfree(cq);
2012		return ERR_PTR(ret);
2013	}
2014
2015	rdma_restrack_add(&cq->res);
2016	return cq;
2017}
2018EXPORT_SYMBOL(__ib_create_cq);
2019
2020int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)
2021{
2022	if (cq->shared)
2023		return -EOPNOTSUPP;
2024
2025	return cq->device->ops.modify_cq ?
2026		cq->device->ops.modify_cq(cq, cq_count,
2027					  cq_period) : -EOPNOTSUPP;
2028}
2029EXPORT_SYMBOL(rdma_set_cq_moderation);
2030
2031int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
2032{
2033	int ret;
2034
2035	if (WARN_ON_ONCE(cq->shared))
2036		return -EOPNOTSUPP;
2037
2038	if (atomic_read(&cq->usecnt))
2039		return -EBUSY;
2040
2041	ret = cq->device->ops.destroy_cq(cq, udata);
2042	if (ret)
2043		return ret;
2044
2045	rdma_restrack_del(&cq->res);
2046	kfree(cq);
2047	return ret;
2048}
2049EXPORT_SYMBOL(ib_destroy_cq_user);
2050
2051int ib_resize_cq(struct ib_cq *cq, int cqe)
2052{
2053	if (cq->shared)
2054		return -EOPNOTSUPP;
2055
2056	return cq->device->ops.resize_cq ?
2057		cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP;
2058}
2059EXPORT_SYMBOL(ib_resize_cq);
2060
2061/* Memory regions */
2062
2063struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
2064			     u64 virt_addr, int access_flags)
2065{
2066	struct ib_mr *mr;
2067
2068	if (access_flags & IB_ACCESS_ON_DEMAND) {
2069		if (!(pd->device->attrs.device_cap_flags &
2070		      IB_DEVICE_ON_DEMAND_PAGING)) {
2071			pr_debug("ODP support not available\n");
2072			return ERR_PTR(-EINVAL);
2073		}
2074	}
2075
2076	mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
2077					 access_flags, NULL);
2078
2079	if (IS_ERR(mr))
2080		return mr;
2081
2082	mr->device = pd->device;
2083	mr->type = IB_MR_TYPE_USER;
2084	mr->pd = pd;
2085	mr->dm = NULL;
2086	atomic_inc(&pd->usecnt);
2087	mr->iova =  virt_addr;
2088	mr->length = length;
2089
2090	rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
2091	rdma_restrack_parent_name(&mr->res, &pd->res);
2092	rdma_restrack_add(&mr->res);
2093
2094	return mr;
2095}
2096EXPORT_SYMBOL(ib_reg_user_mr);
2097
2098int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
2099		 u32 flags, struct ib_sge *sg_list, u32 num_sge)
2100{
2101	if (!pd->device->ops.advise_mr)
2102		return -EOPNOTSUPP;
2103
2104	if (!num_sge)
2105		return 0;
2106
2107	return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
2108					 NULL);
2109}
2110EXPORT_SYMBOL(ib_advise_mr);
2111
2112int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
2113{
2114	struct ib_pd *pd = mr->pd;
2115	struct ib_dm *dm = mr->dm;
2116	struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
2117	int ret;
2118
2119	trace_mr_dereg(mr);
2120	rdma_restrack_del(&mr->res);
2121	ret = mr->device->ops.dereg_mr(mr, udata);
2122	if (!ret) {
2123		atomic_dec(&pd->usecnt);
2124		if (dm)
2125			atomic_dec(&dm->usecnt);
2126		kfree(sig_attrs);
2127	}
2128
2129	return ret;
2130}
2131EXPORT_SYMBOL(ib_dereg_mr_user);
2132
2133/**
2134 * ib_alloc_mr() - Allocates a memory region
2135 * @pd:            protection domain associated with the region
2136 * @mr_type:       memory region type
2137 * @max_num_sg:    maximum sg entries available for registration.
2138 *
2139 * Notes:
 * Memory registration page/sg lists must not exceed max_num_sg.
2141 * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
2142 * max_num_sg * used_page_size.
2143 *
2144 */
2145struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2146			  u32 max_num_sg)
2147{
2148	struct ib_mr *mr;
2149
2150	if (!pd->device->ops.alloc_mr) {
2151		mr = ERR_PTR(-EOPNOTSUPP);
2152		goto out;
2153	}
2154
2155	if (mr_type == IB_MR_TYPE_INTEGRITY) {
2156		WARN_ON_ONCE(1);
2157		mr = ERR_PTR(-EINVAL);
2158		goto out;
2159	}
2160
2161	mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg);
2162	if (IS_ERR(mr))
2163		goto out;
2164
2165	mr->device = pd->device;
2166	mr->pd = pd;
2167	mr->dm = NULL;
2168	mr->uobject = NULL;
2169	atomic_inc(&pd->usecnt);
2170	mr->need_inval = false;
2171	mr->type = mr_type;
2172	mr->sig_attrs = NULL;
2173
2174	rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
2175	rdma_restrack_parent_name(&mr->res, &pd->res);
2176	rdma_restrack_add(&mr->res);
2177out:
2178	trace_mr_alloc(pd, mr_type, max_num_sg, mr);
2179	return mr;
2180}
2181EXPORT_SYMBOL(ib_alloc_mr);
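
/*
 * Example (illustrative sketch only): a ULP that performs fast registration
 * usually allocates one IB_MR_TYPE_MEM_REG MR per in-flight I/O, sized for
 * the largest page list it expects to register.  "pd" and "max_pages" are
 * placeholders.
 *
 *	struct ib_mr *mr;
 *
 *	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_pages);
 *	if (IS_ERR(mr))
 *		return PTR_ERR(mr);
 */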
2182
2183/**
2184 * ib_alloc_mr_integrity() - Allocates an integrity memory region
2185 * @pd:                      protection domain associated with the region
2186 * @max_num_data_sg:         maximum data sg entries available for registration
2187 * @max_num_meta_sg:         maximum metadata sg entries available for
2188 *                           registration
2189 *
2190 * Notes:
 * Data page/sg lists must not exceed max_num_data_sg, and the integrity
 * page/sg lists must not exceed max_num_meta_sg.
2193 *
2194 */
2195struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
2196				    u32 max_num_data_sg,
2197				    u32 max_num_meta_sg)
2198{
2199	struct ib_mr *mr;
2200	struct ib_sig_attrs *sig_attrs;
2201
2202	if (!pd->device->ops.alloc_mr_integrity ||
2203	    !pd->device->ops.map_mr_sg_pi) {
2204		mr = ERR_PTR(-EOPNOTSUPP);
2205		goto out;
2206	}
2207
2208	if (!max_num_meta_sg) {
2209		mr = ERR_PTR(-EINVAL);
2210		goto out;
2211	}
2212
2213	sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL);
2214	if (!sig_attrs) {
2215		mr = ERR_PTR(-ENOMEM);
2216		goto out;
2217	}
2218
2219	mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg,
2220						max_num_meta_sg);
2221	if (IS_ERR(mr)) {
2222		kfree(sig_attrs);
2223		goto out;
2224	}
2225
2226	mr->device = pd->device;
2227	mr->pd = pd;
2228	mr->dm = NULL;
2229	mr->uobject = NULL;
2230	atomic_inc(&pd->usecnt);
2231	mr->need_inval = false;
2232	mr->type = IB_MR_TYPE_INTEGRITY;
2233	mr->sig_attrs = sig_attrs;
2234
2235	rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
2236	rdma_restrack_parent_name(&mr->res, &pd->res);
2237	rdma_restrack_add(&mr->res);
2238out:
2239	trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr);
2240	return mr;
2241}
2242EXPORT_SYMBOL(ib_alloc_mr_integrity);
2243
2244/* Multicast groups */
2245
2246static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid)
2247{
2248	struct ib_qp_init_attr init_attr = {};
2249	struct ib_qp_attr attr = {};
2250	int num_eth_ports = 0;
2251	int port;
2252
2253	/* If QP state >= init, it is assigned to a port and we can check this
2254	 * port only.
2255	 */
2256	if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) {
2257		if (attr.qp_state >= IB_QPS_INIT) {
2258			if (rdma_port_get_link_layer(qp->device, attr.port_num) !=
2259			    IB_LINK_LAYER_INFINIBAND)
2260				return true;
2261			goto lid_check;
2262		}
2263	}
2264
2265	/* Can't get a quick answer, iterate over all ports */
2266	for (port = 0; port < qp->device->phys_port_cnt; port++)
2267		if (rdma_port_get_link_layer(qp->device, port) !=
2268		    IB_LINK_LAYER_INFINIBAND)
2269			num_eth_ports++;
2270
	/* If we have at least one Ethernet port, the RoCE annex declares that
2272	 * multicast LID should be ignored. We can't tell at this step if the
2273	 * QP belongs to an IB or Ethernet port.
2274	 */
2275	if (num_eth_ports)
2276		return true;
2277
2278	/* If all the ports are IB, we can check according to IB spec. */
2279lid_check:
2280	return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
2281		 lid == be16_to_cpu(IB_LID_PERMISSIVE));
2282}
2283
2284int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
2285{
2286	int ret;
2287
2288	if (!qp->device->ops.attach_mcast)
2289		return -EOPNOTSUPP;
2290
2291	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
2292	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
2293		return -EINVAL;
2294
2295	ret = qp->device->ops.attach_mcast(qp, gid, lid);
2296	if (!ret)
2297		atomic_inc(&qp->usecnt);
2298	return ret;
2299}
2300EXPORT_SYMBOL(ib_attach_mcast);
2301
2302int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
2303{
2304	int ret;
2305
2306	if (!qp->device->ops.detach_mcast)
2307		return -EOPNOTSUPP;
2308
2309	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
2310	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
2311		return -EINVAL;
2312
2313	ret = qp->device->ops.detach_mcast(qp, gid, lid);
2314	if (!ret)
2315		atomic_dec(&qp->usecnt);
2316	return ret;
2317}
2318EXPORT_SYMBOL(ib_detach_mcast);
2319
2320/**
2321 * ib_alloc_xrcd_user - Allocates an XRC domain.
2322 * @device: The device on which to allocate the XRC domain.
2323 * @inode: inode to connect XRCD
2324 * @udata: Valid user data or NULL for kernel object
2325 */
2326struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device,
2327				   struct inode *inode, struct ib_udata *udata)
2328{
2329	struct ib_xrcd *xrcd;
2330	int ret;
2331
2332	if (!device->ops.alloc_xrcd)
2333		return ERR_PTR(-EOPNOTSUPP);
2334
2335	xrcd = rdma_zalloc_drv_obj(device, ib_xrcd);
2336	if (!xrcd)
2337		return ERR_PTR(-ENOMEM);
2338
2339	xrcd->device = device;
2340	xrcd->inode = inode;
2341	atomic_set(&xrcd->usecnt, 0);
2342	init_rwsem(&xrcd->tgt_qps_rwsem);
2343	xa_init(&xrcd->tgt_qps);
2344
2345	ret = device->ops.alloc_xrcd(xrcd, udata);
2346	if (ret)
2347		goto err;
2348	return xrcd;
2349err:
2350	kfree(xrcd);
2351	return ERR_PTR(ret);
2352}
2353EXPORT_SYMBOL(ib_alloc_xrcd_user);
2354
2355/**
2356 * ib_dealloc_xrcd_user - Deallocates an XRC domain.
2357 * @xrcd: The XRC domain to deallocate.
2358 * @udata: Valid user data or NULL for kernel object
2359 */
2360int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata)
2361{
2362	int ret;
2363
2364	if (atomic_read(&xrcd->usecnt))
2365		return -EBUSY;
2366
2367	WARN_ON(!xa_empty(&xrcd->tgt_qps));
2368	ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata);
2369	if (ret)
2370		return ret;
2371	kfree(xrcd);
2372	return ret;
2373}
2374EXPORT_SYMBOL(ib_dealloc_xrcd_user);
2375
2376/**
2377 * ib_create_wq - Creates a WQ associated with the specified protection
2378 * domain.
2379 * @pd: The protection domain associated with the WQ.
2380 * @wq_attr: A list of initial attributes required to create the
2381 * WQ. If WQ creation succeeds, then the attributes are updated to
2382 * the actual capabilities of the created WQ.
2383 *
2384 * wq_attr->max_wr and wq_attr->max_sge determine
 * the requested size of the WQ and are set to the actual values allocated
2386 * on return.
2387 * If ib_create_wq() succeeds, then max_wr and max_sge will always be
2388 * at least as large as the requested values.
2389 */
2390struct ib_wq *ib_create_wq(struct ib_pd *pd,
2391			   struct ib_wq_init_attr *wq_attr)
2392{
2393	struct ib_wq *wq;
2394
2395	if (!pd->device->ops.create_wq)
2396		return ERR_PTR(-EOPNOTSUPP);
2397
2398	wq = pd->device->ops.create_wq(pd, wq_attr, NULL);
2399	if (!IS_ERR(wq)) {
2400		wq->event_handler = wq_attr->event_handler;
2401		wq->wq_context = wq_attr->wq_context;
2402		wq->wq_type = wq_attr->wq_type;
2403		wq->cq = wq_attr->cq;
2404		wq->device = pd->device;
2405		wq->pd = pd;
2406		wq->uobject = NULL;
2407		atomic_inc(&pd->usecnt);
2408		atomic_inc(&wq_attr->cq->usecnt);
2409		atomic_set(&wq->usecnt, 0);
2410	}
2411	return wq;
2412}
2413EXPORT_SYMBOL(ib_create_wq);
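
/*
 * Example (illustrative sketch only): creating a receive WQ that feeds an
 * existing CQ.  The sizes and the "pd"/"cq" variables are placeholders.
 *
 *	struct ib_wq_init_attr wq_attr = {
 *		.wq_type = IB_WQT_RQ,
 *		.max_wr	 = 128,
 *		.max_sge = 2,
 *		.cq	 = cq,
 *	};
 *	struct ib_wq *wq;
 *
 *	wq = ib_create_wq(pd, &wq_attr);
 *	if (IS_ERR(wq))
 *		return PTR_ERR(wq);
 */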
2414
2415/**
2416 * ib_destroy_wq_user - Destroys the specified user WQ.
2417 * @wq: The WQ to destroy.
2418 * @udata: Valid user data
2419 */
2420int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata)
2421{
2422	struct ib_cq *cq = wq->cq;
2423	struct ib_pd *pd = wq->pd;
2424	int ret;
2425
2426	if (atomic_read(&wq->usecnt))
2427		return -EBUSY;
2428
2429	ret = wq->device->ops.destroy_wq(wq, udata);
2430	if (ret)
2431		return ret;
2432
2433	atomic_dec(&pd->usecnt);
2434	atomic_dec(&cq->usecnt);
2435	return ret;
2436}
2437EXPORT_SYMBOL(ib_destroy_wq_user);
2438
2439/**
2440 * ib_modify_wq - Modifies the specified WQ.
2441 * @wq: The WQ to modify.
 * @wq_attr: On input, specifies the WQ attributes to modify.  On output,
 *   the current values of selected WQ attributes are returned.
 * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ
 *   are being modified.
2446 */
2447int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
2448		 u32 wq_attr_mask)
2449{
2450	int err;
2451
2452	if (!wq->device->ops.modify_wq)
2453		return -EOPNOTSUPP;
2454
2455	err = wq->device->ops.modify_wq(wq, wq_attr, wq_attr_mask, NULL);
2456	return err;
2457}
2458EXPORT_SYMBOL(ib_modify_wq);
2459
2460int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
2461		       struct ib_mr_status *mr_status)
2462{
2463	if (!mr->device->ops.check_mr_status)
2464		return -EOPNOTSUPP;
2465
2466	return mr->device->ops.check_mr_status(mr, check_mask, mr_status);
2467}
2468EXPORT_SYMBOL(ib_check_mr_status);
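
/*
 * Example (illustrative sketch only): after an integrity (signature)
 * operation completes, a ULP can ask whether the HCA detected a T10-DIF
 * error.  The error handling shown is a placeholder.
 *
 *	struct ib_mr_status mr_status;
 *
 *	ret = ib_check_mr_status(mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
 *	if (!ret && (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS))
 *		pr_err("sig error %d at offset %llu\n",
 *		       mr_status.sig_err.err_type,
 *		       mr_status.sig_err.sig_err_offset);
 */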
2469
2470int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
2471			 int state)
2472{
2473	if (!device->ops.set_vf_link_state)
2474		return -EOPNOTSUPP;
2475
2476	return device->ops.set_vf_link_state(device, vf, port, state);
2477}
2478EXPORT_SYMBOL(ib_set_vf_link_state);
2479
2480int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
2481		     struct ifla_vf_info *info)
2482{
2483	if (!device->ops.get_vf_config)
2484		return -EOPNOTSUPP;
2485
2486	return device->ops.get_vf_config(device, vf, port, info);
2487}
2488EXPORT_SYMBOL(ib_get_vf_config);
2489
2490int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
2491		    struct ifla_vf_stats *stats)
2492{
2493	if (!device->ops.get_vf_stats)
2494		return -EOPNOTSUPP;
2495
2496	return device->ops.get_vf_stats(device, vf, port, stats);
2497}
2498EXPORT_SYMBOL(ib_get_vf_stats);
2499
2500int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
2501		   int type)
2502{
2503	if (!device->ops.set_vf_guid)
2504		return -EOPNOTSUPP;
2505
2506	return device->ops.set_vf_guid(device, vf, port, guid, type);
2507}
2508EXPORT_SYMBOL(ib_set_vf_guid);
2509
2510int ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
2511		   struct ifla_vf_guid *node_guid,
2512		   struct ifla_vf_guid *port_guid)
2513{
2514	if (!device->ops.get_vf_guid)
2515		return -EOPNOTSUPP;
2516
2517	return device->ops.get_vf_guid(device, vf, port, node_guid, port_guid);
2518}
2519EXPORT_SYMBOL(ib_get_vf_guid);
2520/**
2521 * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection
2522 *     information) and set an appropriate memory region for registration.
2523 * @mr:             memory region
2524 * @data_sg:        dma mapped scatterlist for data
2525 * @data_sg_nents:  number of entries in data_sg
2526 * @data_sg_offset: offset in bytes into data_sg
2527 * @meta_sg:        dma mapped scatterlist for metadata
2528 * @meta_sg_nents:  number of entries in meta_sg
2529 * @meta_sg_offset: offset in bytes into meta_sg
2530 * @page_size:      page vector desired page size
2531 *
2532 * Constraints:
2533 * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY.
2534 *
2535 * Return: 0 on success.
2536 *
 * After this completes successfully, the memory region
2538 * is ready for registration.
2539 */
2540int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
2541		    int data_sg_nents, unsigned int *data_sg_offset,
2542		    struct scatterlist *meta_sg, int meta_sg_nents,
2543		    unsigned int *meta_sg_offset, unsigned int page_size)
2544{
2545	if (unlikely(!mr->device->ops.map_mr_sg_pi ||
2546		     WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY)))
2547		return -EOPNOTSUPP;
2548
2549	mr->page_size = page_size;
2550
2551	return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents,
2552					    data_sg_offset, meta_sg,
2553					    meta_sg_nents, meta_sg_offset);
2554}
2555EXPORT_SYMBOL(ib_map_mr_sg_pi);
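
/*
 * Example (illustrative sketch only): a typical integrity flow fills in the
 * MR's sig_attrs, maps the data and metadata scatterlists, and then posts an
 * IB_WR_REG_MR_INTEGRITY work request (see rdma_rw_ctx_signature_init() in
 * rw.c for the in-tree user).  The scatterlist variables are placeholders.
 *
 *	struct ib_sig_attrs *sig = mr->sig_attrs;
 *
 *	sig->mem.sig_type	      = IB_SIG_TYPE_NONE;
 *	sig->wire.sig_type	      = IB_SIG_TYPE_T10_DIF;
 *	sig->wire.sig.dif.bg_type     = IB_T10DIF_CRC;
 *	sig->wire.sig.dif.pi_interval = 512;
 *
 *	ret = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
 *			      meta_sg, meta_nents, NULL, SZ_4K);
 *	if (ret < 0)
 *		return ret;
 */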
2556
2557/**
2558 * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
 *     and set it as the memory region.
2560 * @mr:            memory region
2561 * @sg:            dma mapped scatterlist
2562 * @sg_nents:      number of entries in sg
2563 * @sg_offset:     offset in bytes into sg
2564 * @page_size:     page vector desired page size
2565 *
2566 * Constraints:
2567 *
2568 * - The first sg element is allowed to have an offset.
2569 * - Each sg element must either be aligned to page_size or virtually
2570 *   contiguous to the previous element. In case an sg element has a
2571 *   non-contiguous offset, the mapping prefix will not include it.
2572 * - The last sg element is allowed to have length less than page_size.
 * - If the total byte length of the sg list exceeds the MR's max_num_sg *
 *   page_size, then only max_num_sg entries will be mapped.
2575 * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these
2576 *   constraints holds and the page_size argument is ignored.
2577 *
2578 * Returns the number of sg elements that were mapped to the memory region.
2579 *
 * After this completes successfully, the memory region
2581 * is ready for registration.
2582 */
2583int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
2584		 unsigned int *sg_offset, unsigned int page_size)
2585{
2586	if (unlikely(!mr->device->ops.map_mr_sg))
2587		return -EOPNOTSUPP;
2588
2589	mr->page_size = page_size;
2590
2591	return mr->device->ops.map_mr_sg(mr, sg, sg_nents, sg_offset);
2592}
2593EXPORT_SYMBOL(ib_map_mr_sg);
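
/*
 * Example (illustrative sketch only): the usual fast-registration pattern is
 * to DMA-map the scatterlist, build the page vector with ib_map_mr_sg(), and
 * then post an IB_WR_REG_MR work request that makes the mapping visible to
 * the HCA.  "reg_cqe" and the access flags below are placeholders.
 *
 *	struct ib_reg_wr reg_wr = {};
 *	int n;
 *
 *	n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
 *	if (n < sg_nents)
 *		return n < 0 ? n : -EINVAL;
 *
 *	reg_wr.wr.opcode = IB_WR_REG_MR;
 *	reg_wr.wr.wr_cqe = &reg_cqe;
 *	reg_wr.mr	 = mr;
 *	reg_wr.key	 = mr->rkey;
 *	reg_wr.access	 = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;
 *	ret = ib_post_send(qp, &reg_wr.wr, NULL);
 */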
2594
2595/**
2596 * ib_sg_to_pages() - Convert the largest prefix of a sg list
2597 *     to a page vector
2598 * @mr:            memory region
2599 * @sgl:           dma mapped scatterlist
2600 * @sg_nents:      number of entries in sg
2601 * @sg_offset_p:   ==== =======================================================
2602 *                 IN   start offset in bytes into sg
2603 *                 OUT  offset in bytes for element n of the sg of the first
2604 *                      byte that has not been processed where n is the return
2605 *                      value of this function.
2606 *                 ==== =======================================================
2607 * @set_page:      driver page assignment function pointer
2608 *
2609 * Core service helper for drivers to convert the largest
 * prefix of the given sg list to a page vector. The converted
 * prefix is the one that meets the requirements of
 * ib_map_mr_sg().
2613 *
2614 * Returns the number of sg elements that were assigned to
2615 * a page vector.
2616 */
2617int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
2618		unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64))
2619{
2620	struct scatterlist *sg;
2621	u64 last_end_dma_addr = 0;
2622	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2623	unsigned int last_page_off = 0;
2624	u64 page_mask = ~((u64)mr->page_size - 1);
2625	int i, ret;
2626
2627	if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0])))
2628		return -EINVAL;
2629
2630	mr->iova = sg_dma_address(&sgl[0]) + sg_offset;
2631	mr->length = 0;
2632
2633	for_each_sg(sgl, sg, sg_nents, i) {
2634		u64 dma_addr = sg_dma_address(sg) + sg_offset;
2635		u64 prev_addr = dma_addr;
2636		unsigned int dma_len = sg_dma_len(sg) - sg_offset;
2637		u64 end_dma_addr = dma_addr + dma_len;
2638		u64 page_addr = dma_addr & page_mask;
2639
2640		/*
2641		 * For the second and later elements, check whether either the
2642		 * end of element i-1 or the start of element i is not aligned
2643		 * on a page boundary.
2644		 */
2645		if (i && (last_page_off != 0 || page_addr != dma_addr)) {
2646			/* Stop mapping if there is a gap. */
2647			if (last_end_dma_addr != dma_addr)
2648				break;
2649
2650			/*
2651			 * Coalesce this element with the last. If it is small
2652			 * enough just update mr->length. Otherwise start
2653			 * mapping from the next page.
2654			 */
2655			goto next_page;
2656		}
2657
2658		do {
2659			ret = set_page(mr, page_addr);
2660			if (unlikely(ret < 0)) {
2661				sg_offset = prev_addr - sg_dma_address(sg);
2662				mr->length += prev_addr - dma_addr;
2663				if (sg_offset_p)
2664					*sg_offset_p = sg_offset;
2665				return i || sg_offset ? i : ret;
2666			}
2667			prev_addr = page_addr;
2668next_page:
2669			page_addr += mr->page_size;
2670		} while (page_addr < end_dma_addr);
2671
2672		mr->length += dma_len;
2673		last_end_dma_addr = end_dma_addr;
2674		last_page_off = end_dma_addr & ~page_mask;
2675
2676		sg_offset = 0;
2677	}
2678
2679	if (sg_offset_p)
2680		*sg_offset_p = 0;
2681	return i;
2682}
2683EXPORT_SYMBOL(ib_sg_to_pages);
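
/*
 * Example (illustrative sketch only): a driver's map_mr_sg handler usually
 * wraps ib_sg_to_pages() with a set_page callback that appends each page
 * address to a driver-owned array.  "struct hypothetical_mr" and its fields
 * are placeholders for the driver's private MR representation.
 *
 *	static int hypothetical_set_page(struct ib_mr *ibmr, u64 addr)
 *	{
 *		struct hypothetical_mr *mr = to_hypothetical_mr(ibmr);
 *
 *		if (unlikely(mr->npages == mr->max_pages))
 *			return -ENOMEM;
 *
 *		mr->pages[mr->npages++] = addr;
 *		return 0;
 *	}
 *
 *	n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
 *			   hypothetical_set_page);
 */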
2684
2685struct ib_drain_cqe {
2686	struct ib_cqe cqe;
2687	struct completion done;
2688};
2689
2690static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
2691{
2692	struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
2693						cqe);
2694
2695	complete(&cqe->done);
2696}
2697
2698/*
2699 * Post a WR and block until its completion is reaped for the SQ.
2700 */
2701static void __ib_drain_sq(struct ib_qp *qp)
2702{
2703	struct ib_cq *cq = qp->send_cq;
2704	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
2705	struct ib_drain_cqe sdrain;
2706	struct ib_rdma_wr swr = {
2707		.wr = {
2708			.next = NULL,
2709			{ .wr_cqe	= &sdrain.cqe, },
2710			.opcode	= IB_WR_RDMA_WRITE,
2711		},
2712	};
2713	int ret;
2714
2715	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
2716	if (ret) {
2717		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
2718		return;
2719	}
2720
2721	sdrain.cqe.done = ib_drain_qp_done;
2722	init_completion(&sdrain.done);
2723
2724	ret = ib_post_send(qp, &swr.wr, NULL);
2725	if (ret) {
2726		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
2727		return;
2728	}
2729
2730	if (cq->poll_ctx == IB_POLL_DIRECT)
2731		while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0)
2732			ib_process_cq_direct(cq, -1);
2733	else
2734		wait_for_completion(&sdrain.done);
2735}
2736
2737/*
2738 * Post a WR and block until its completion is reaped for the RQ.
2739 */
2740static void __ib_drain_rq(struct ib_qp *qp)
2741{
2742	struct ib_cq *cq = qp->recv_cq;
2743	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
2744	struct ib_drain_cqe rdrain;
2745	struct ib_recv_wr rwr = {};
2746	int ret;
2747
2748	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
2749	if (ret) {
2750		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
2751		return;
2752	}
2753
2754	rwr.wr_cqe = &rdrain.cqe;
2755	rdrain.cqe.done = ib_drain_qp_done;
2756	init_completion(&rdrain.done);
2757
2758	ret = ib_post_recv(qp, &rwr, NULL);
2759	if (ret) {
2760		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
2761		return;
2762	}
2763
2764	if (cq->poll_ctx == IB_POLL_DIRECT)
2765		while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0)
2766			ib_process_cq_direct(cq, -1);
2767	else
2768		wait_for_completion(&rdrain.done);
2769}
2770
2771/**
2772 * ib_drain_sq() - Block until all SQ CQEs have been consumed by the
2773 *		   application.
2774 * @qp:            queue pair to drain
2775 *
2776 * If the device has a provider-specific drain function, then
2777 * call that.  Otherwise call the generic drain function
2778 * __ib_drain_sq().
2779 *
2780 * The caller must:
2781 *
2782 * ensure there is room in the CQ and SQ for the drain work request and
2783 * completion.
2784 *
2785 * allocate the CQ using ib_alloc_cq().
2786 *
2787 * ensure that there are no other contexts that are posting WRs concurrently.
2788 * Otherwise the drain is not guaranteed.
2789 */
2790void ib_drain_sq(struct ib_qp *qp)
2791{
2792	if (qp->device->ops.drain_sq)
2793		qp->device->ops.drain_sq(qp);
2794	else
2795		__ib_drain_sq(qp);
2796	trace_cq_drain_complete(qp->send_cq);
2797}
2798EXPORT_SYMBOL(ib_drain_sq);
2799
2800/**
2801 * ib_drain_rq() - Block until all RQ CQEs have been consumed by the
2802 *		   application.
2803 * @qp:            queue pair to drain
2804 *
2805 * If the device has a provider-specific drain function, then
2806 * call that.  Otherwise call the generic drain function
2807 * __ib_drain_rq().
2808 *
2809 * The caller must:
2810 *
2811 * ensure there is room in the CQ and RQ for the drain work request and
2812 * completion.
2813 *
2814 * allocate the CQ using ib_alloc_cq().
2815 *
2816 * ensure that there are no other contexts that are posting WRs concurrently.
2817 * Otherwise the drain is not guaranteed.
2818 */
2819void ib_drain_rq(struct ib_qp *qp)
2820{
2821	if (qp->device->ops.drain_rq)
2822		qp->device->ops.drain_rq(qp);
2823	else
2824		__ib_drain_rq(qp);
2825	trace_cq_drain_complete(qp->recv_cq);
2826}
2827EXPORT_SYMBOL(ib_drain_rq);
2828
2829/**
2830 * ib_drain_qp() - Block until all CQEs have been consumed by the
2831 *		   application on both the RQ and SQ.
2832 * @qp:            queue pair to drain
2833 *
2834 * The caller must:
2835 *
2836 * ensure there is room in the CQ(s), SQ, and RQ for drain work requests
2837 * and completions.
2838 *
2839 * allocate the CQs using ib_alloc_cq().
2840 *
2841 * ensure that there are no other contexts that are posting WRs concurrently.
2842 * Otherwise the drain is not guaranteed.
2843 */
2844void ib_drain_qp(struct ib_qp *qp)
2845{
2846	ib_drain_sq(qp);
2847	if (!qp->srq)
2848		ib_drain_rq(qp);
2849}
2850EXPORT_SYMBOL(ib_drain_qp);
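
/*
 * Example (illustrative sketch only): the common teardown order is to drain
 * both queues, so every posted WR has been completed and reaped, and only
 * then destroy the QP and free anything the WRs referenced:
 *
 *	ib_drain_qp(qp);
 *	ib_destroy_qp(qp);
 */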
2851
2852struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
2853				     enum rdma_netdev_t type, const char *name,
2854				     unsigned char name_assign_type,
2855				     void (*setup)(struct net_device *))
2856{
2857	struct rdma_netdev_alloc_params params;
2858	struct net_device *netdev;
2859	int rc;
2860
2861	if (!device->ops.rdma_netdev_get_params)
2862		return ERR_PTR(-EOPNOTSUPP);
2863
2864	rc = device->ops.rdma_netdev_get_params(device, port_num, type,
2865						&params);
2866	if (rc)
2867		return ERR_PTR(rc);
2868
2869	netdev = alloc_netdev_mqs(params.sizeof_priv, name, name_assign_type,
2870				  setup, params.txqs, params.rxqs);
2871	if (!netdev)
2872		return ERR_PTR(-ENOMEM);
2873
2874	return netdev;
2875}
2876EXPORT_SYMBOL(rdma_alloc_netdev);
2877
2878int rdma_init_netdev(struct ib_device *device, u8 port_num,
2879		     enum rdma_netdev_t type, const char *name,
2880		     unsigned char name_assign_type,
2881		     void (*setup)(struct net_device *),
2882		     struct net_device *netdev)
2883{
2884	struct rdma_netdev_alloc_params params;
2885	int rc;
2886
2887	if (!device->ops.rdma_netdev_get_params)
2888		return -EOPNOTSUPP;
2889
2890	rc = device->ops.rdma_netdev_get_params(device, port_num, type,
2891						&params);
2892	if (rc)
2893		return rc;
2894
2895	return params.initialize_rdma_netdev(device, port_num,
2896					     netdev, params.param);
2897}
2898EXPORT_SYMBOL(rdma_init_netdev);
2899
2900void __rdma_block_iter_start(struct ib_block_iter *biter,
2901			     struct scatterlist *sglist, unsigned int nents,
2902			     unsigned long pgsz)
2903{
2904	memset(biter, 0, sizeof(struct ib_block_iter));
2905	biter->__sg = sglist;
2906	biter->__sg_nents = nents;
2907
2908	/* Driver provides best block size to use */
2909	biter->__pg_bit = __fls(pgsz);
2910}
2911EXPORT_SYMBOL(__rdma_block_iter_start);
2912
2913bool __rdma_block_iter_next(struct ib_block_iter *biter)
2914{
2915	unsigned int block_offset;
2916	unsigned int sg_delta;
2917
2918	if (!biter->__sg_nents || !biter->__sg)
2919		return false;
2920
2921	biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
2922	block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
2923	sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;
2924
2925	if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
2926		biter->__sg_advance += sg_delta;
2927	} else {
2928		biter->__sg_advance = 0;
2929		biter->__sg = sg_next(biter->__sg);
2930		biter->__sg_nents--;
2931	}
2932
2933	return true;
2934}
2935EXPORT_SYMBOL(__rdma_block_iter_next);
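
/*
 * Example (illustrative sketch only): drivers normally consume the block
 * iterator through the rdma_for_each_block() helper in <rdma/ib_verbs.h>,
 * walking a DMA-mapped scatterlist in aligned blocks of the chosen size.
 * "hypothetical_write_mtt()" and the scatterlist variables are placeholders.
 *
 *	struct ib_block_iter biter;
 *
 *	rdma_for_each_block(sglist, &biter, nents, block_size)
 *		hypothetical_write_mtt(mtt++,
 *				rdma_block_iter_dma_address(&biter));
 */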
2936