/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
328c2ecf20Sopenharmony_ci
338c2ecf20Sopenharmony_ci#include <rdma/ib_umem.h>
348c2ecf20Sopenharmony_ci#include <rdma/ib_umem_odp.h>
358c2ecf20Sopenharmony_ci#include <linux/kernel.h>
368c2ecf20Sopenharmony_ci
378c2ecf20Sopenharmony_ci#include "mlx5_ib.h"
388c2ecf20Sopenharmony_ci#include "cmd.h"
398c2ecf20Sopenharmony_ci#include "qp.h"
408c2ecf20Sopenharmony_ci
418c2ecf20Sopenharmony_ci#include <linux/mlx5/eq.h>
428c2ecf20Sopenharmony_ci
438c2ecf20Sopenharmony_ci/* Contains the details of a pagefault. */
448c2ecf20Sopenharmony_cistruct mlx5_pagefault {
458c2ecf20Sopenharmony_ci	u32			bytes_committed;
468c2ecf20Sopenharmony_ci	u32			token;
478c2ecf20Sopenharmony_ci	u8			event_subtype;
488c2ecf20Sopenharmony_ci	u8			type;
498c2ecf20Sopenharmony_ci	union {
508c2ecf20Sopenharmony_ci		/* Initiator or send message responder pagefault details. */
518c2ecf20Sopenharmony_ci		struct {
528c2ecf20Sopenharmony_ci			/* Received packet size, only valid for responders. */
538c2ecf20Sopenharmony_ci			u32	packet_size;
548c2ecf20Sopenharmony_ci			/*
558c2ecf20Sopenharmony_ci			 * Number of resource holding WQE, depends on type.
568c2ecf20Sopenharmony_ci			 */
578c2ecf20Sopenharmony_ci			u32	wq_num;
588c2ecf20Sopenharmony_ci			/*
598c2ecf20Sopenharmony_ci			 * WQE index. Refers to either the send queue or
608c2ecf20Sopenharmony_ci			 * receive queue, according to event_subtype.
618c2ecf20Sopenharmony_ci			 */
628c2ecf20Sopenharmony_ci			u16	wqe_index;
638c2ecf20Sopenharmony_ci		} wqe;
648c2ecf20Sopenharmony_ci		/* RDMA responder pagefault details */
658c2ecf20Sopenharmony_ci		struct {
668c2ecf20Sopenharmony_ci			u32	r_key;
678c2ecf20Sopenharmony_ci			/*
688c2ecf20Sopenharmony_ci			 * Received packet size, minimal size page fault
698c2ecf20Sopenharmony_ci			 * resolution required for forward progress.
708c2ecf20Sopenharmony_ci			 */
718c2ecf20Sopenharmony_ci			u32	packet_size;
728c2ecf20Sopenharmony_ci			u32	rdma_op_len;
738c2ecf20Sopenharmony_ci			u64	rdma_va;
748c2ecf20Sopenharmony_ci		} rdma;
758c2ecf20Sopenharmony_ci	};
768c2ecf20Sopenharmony_ci
778c2ecf20Sopenharmony_ci	struct mlx5_ib_pf_eq	*eq;
788c2ecf20Sopenharmony_ci	struct work_struct	work;
798c2ecf20Sopenharmony_ci};
808c2ecf20Sopenharmony_ci
818c2ecf20Sopenharmony_ci#define MAX_PREFETCH_LEN (4*1024*1024U)
828c2ecf20Sopenharmony_ci
838c2ecf20Sopenharmony_ci/* Timeout in ms to wait for an active mmu notifier to complete when handling
848c2ecf20Sopenharmony_ci * a pagefault. */
858c2ecf20Sopenharmony_ci#define MMU_NOTIFIER_TIMEOUT 1000
868c2ecf20Sopenharmony_ci
878c2ecf20Sopenharmony_ci#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
888c2ecf20Sopenharmony_ci#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
898c2ecf20Sopenharmony_ci#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
908c2ecf20Sopenharmony_ci#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
918c2ecf20Sopenharmony_ci#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
948c2ecf20Sopenharmony_ci
958c2ecf20Sopenharmony_cistatic u64 mlx5_imr_ksm_entries;
968c2ecf20Sopenharmony_ci
978c2ecf20Sopenharmony_cistatic void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
988c2ecf20Sopenharmony_ci			struct mlx5_ib_mr *imr, int flags)
998c2ecf20Sopenharmony_ci{
1008c2ecf20Sopenharmony_ci	struct mlx5_klm *end = pklm + nentries;
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_ci	if (flags & MLX5_IB_UPD_XLT_ZAP) {
1038c2ecf20Sopenharmony_ci		for (; pklm != end; pklm++, idx++) {
1048c2ecf20Sopenharmony_ci			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
1058c2ecf20Sopenharmony_ci			pklm->key = cpu_to_be32(imr->dev->null_mkey);
1068c2ecf20Sopenharmony_ci			pklm->va = 0;
1078c2ecf20Sopenharmony_ci		}
1088c2ecf20Sopenharmony_ci		return;
1098c2ecf20Sopenharmony_ci	}
1108c2ecf20Sopenharmony_ci
1118c2ecf20Sopenharmony_ci	/*
1128c2ecf20Sopenharmony_ci	 * The locking here is pretty subtle. Ideally the implicit_children
1138c2ecf20Sopenharmony_ci	 * xarray would be protected by the umem_mutex, however that is not
1148c2ecf20Sopenharmony_ci	 * possible. Instead this uses a weaker update-then-lock pattern:
1158c2ecf20Sopenharmony_ci	 *
1168c2ecf20Sopenharmony_ci	 *  srcu_read_lock()
1178c2ecf20Sopenharmony_ci	 *    xa_store()
1188c2ecf20Sopenharmony_ci	 *    mutex_lock(umem_mutex)
1198c2ecf20Sopenharmony_ci	 *     mlx5_ib_update_xlt()
1208c2ecf20Sopenharmony_ci	 *    mutex_unlock(umem_mutex)
1218c2ecf20Sopenharmony_ci	 *    destroy lkey
1228c2ecf20Sopenharmony_ci	 *
1238c2ecf20Sopenharmony_ci	 * ie any change the xarray must be followed by the locked update_xlt
1248c2ecf20Sopenharmony_ci	 * before destroying.
1258c2ecf20Sopenharmony_ci	 *
1268c2ecf20Sopenharmony_ci	 * The umem_mutex provides the acquire/release semantic needed to make
1278c2ecf20Sopenharmony_ci	 * the xa_store() visible to a racing thread. While SRCU is not
1288c2ecf20Sopenharmony_ci	 * technically required, using it gives consistent use of the SRCU
1298c2ecf20Sopenharmony_ci	 * locking around the xarray.
1308c2ecf20Sopenharmony_ci	 */
1318c2ecf20Sopenharmony_ci	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
1328c2ecf20Sopenharmony_ci	lockdep_assert_held(&imr->dev->odp_srcu);
1338c2ecf20Sopenharmony_ci
1348c2ecf20Sopenharmony_ci	for (; pklm != end; pklm++, idx++) {
1358c2ecf20Sopenharmony_ci		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
1368c2ecf20Sopenharmony_ci
1378c2ecf20Sopenharmony_ci		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
1388c2ecf20Sopenharmony_ci		if (mtt) {
1398c2ecf20Sopenharmony_ci			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
1408c2ecf20Sopenharmony_ci			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
1418c2ecf20Sopenharmony_ci		} else {
1428c2ecf20Sopenharmony_ci			pklm->key = cpu_to_be32(imr->dev->null_mkey);
1438c2ecf20Sopenharmony_ci			pklm->va = 0;
1448c2ecf20Sopenharmony_ci		}
1458c2ecf20Sopenharmony_ci	}
1468c2ecf20Sopenharmony_ci}
1478c2ecf20Sopenharmony_ci
1488c2ecf20Sopenharmony_cistatic u64 umem_dma_to_mtt(dma_addr_t umem_dma)
1498c2ecf20Sopenharmony_ci{
1508c2ecf20Sopenharmony_ci	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
1518c2ecf20Sopenharmony_ci
1528c2ecf20Sopenharmony_ci	if (umem_dma & ODP_READ_ALLOWED_BIT)
1538c2ecf20Sopenharmony_ci		mtt_entry |= MLX5_IB_MTT_READ;
1548c2ecf20Sopenharmony_ci	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
1558c2ecf20Sopenharmony_ci		mtt_entry |= MLX5_IB_MTT_WRITE;
1568c2ecf20Sopenharmony_ci
1578c2ecf20Sopenharmony_ci	return mtt_entry;
1588c2ecf20Sopenharmony_ci}
1598c2ecf20Sopenharmony_ci
1608c2ecf20Sopenharmony_cistatic void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
1618c2ecf20Sopenharmony_ci			 struct mlx5_ib_mr *mr, int flags)
1628c2ecf20Sopenharmony_ci{
1638c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1648c2ecf20Sopenharmony_ci	dma_addr_t pa;
1658c2ecf20Sopenharmony_ci	size_t i;
1668c2ecf20Sopenharmony_ci
1678c2ecf20Sopenharmony_ci	if (flags & MLX5_IB_UPD_XLT_ZAP)
1688c2ecf20Sopenharmony_ci		return;
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_ci	for (i = 0; i < nentries; i++) {
1718c2ecf20Sopenharmony_ci		pa = odp->dma_list[idx + i];
1728c2ecf20Sopenharmony_ci		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
1738c2ecf20Sopenharmony_ci	}
1748c2ecf20Sopenharmony_ci}
1758c2ecf20Sopenharmony_ci
1768c2ecf20Sopenharmony_civoid mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
1778c2ecf20Sopenharmony_ci			   struct mlx5_ib_mr *mr, int flags)
1788c2ecf20Sopenharmony_ci{
1798c2ecf20Sopenharmony_ci	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
1808c2ecf20Sopenharmony_ci		populate_klm(xlt, idx, nentries, mr, flags);
1818c2ecf20Sopenharmony_ci	} else {
1828c2ecf20Sopenharmony_ci		populate_mtt(xlt, idx, nentries, mr, flags);
1838c2ecf20Sopenharmony_ci	}
1848c2ecf20Sopenharmony_ci}
1858c2ecf20Sopenharmony_ci
1868c2ecf20Sopenharmony_cistatic void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
1878c2ecf20Sopenharmony_ci{
1888c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1898c2ecf20Sopenharmony_ci
1908c2ecf20Sopenharmony_ci	/* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
1918c2ecf20Sopenharmony_ci	mutex_lock(&odp->umem_mutex);
1928c2ecf20Sopenharmony_ci	if (odp->npages) {
1938c2ecf20Sopenharmony_ci		mlx5_mr_cache_invalidate(mr);
1948c2ecf20Sopenharmony_ci		ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
1958c2ecf20Sopenharmony_ci					    ib_umem_end(odp));
1968c2ecf20Sopenharmony_ci		WARN_ON(odp->npages);
1978c2ecf20Sopenharmony_ci	}
1988c2ecf20Sopenharmony_ci	odp->private = NULL;
1998c2ecf20Sopenharmony_ci	mutex_unlock(&odp->umem_mutex);
2008c2ecf20Sopenharmony_ci
2018c2ecf20Sopenharmony_ci	if (!mr->cache_ent) {
2028c2ecf20Sopenharmony_ci		mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
2038c2ecf20Sopenharmony_ci		WARN_ON(mr->descs);
2048c2ecf20Sopenharmony_ci	}
2058c2ecf20Sopenharmony_ci}
2068c2ecf20Sopenharmony_ci
2078c2ecf20Sopenharmony_ci/*
2088c2ecf20Sopenharmony_ci * This must be called after the mr has been removed from implicit_children
2098c2ecf20Sopenharmony_ci * and the SRCU synchronized.  NOTE: The MR does not necessarily have to be
2108c2ecf20Sopenharmony_ci * empty here, parallel page faults could have raced with the free process and
2118c2ecf20Sopenharmony_ci * added pages to it.
2128c2ecf20Sopenharmony_ci */
2138c2ecf20Sopenharmony_cistatic void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
2148c2ecf20Sopenharmony_ci{
2158c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *imr = mr->parent;
2168c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
2178c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
2188c2ecf20Sopenharmony_ci	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
2198c2ecf20Sopenharmony_ci	int srcu_key;
2208c2ecf20Sopenharmony_ci
2218c2ecf20Sopenharmony_ci	/* implicit_child_mr's are not allowed to have deferred work */
2228c2ecf20Sopenharmony_ci	WARN_ON(atomic_read(&mr->num_deferred_work));
2238c2ecf20Sopenharmony_ci
2248c2ecf20Sopenharmony_ci	if (need_imr_xlt) {
2258c2ecf20Sopenharmony_ci		srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
2268c2ecf20Sopenharmony_ci		mutex_lock(&odp_imr->umem_mutex);
2278c2ecf20Sopenharmony_ci		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
2288c2ecf20Sopenharmony_ci				   MLX5_IB_UPD_XLT_INDIRECT |
2298c2ecf20Sopenharmony_ci				   MLX5_IB_UPD_XLT_ATOMIC);
2308c2ecf20Sopenharmony_ci		mutex_unlock(&odp_imr->umem_mutex);
2318c2ecf20Sopenharmony_ci		srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
2328c2ecf20Sopenharmony_ci	}
2338c2ecf20Sopenharmony_ci
2348c2ecf20Sopenharmony_ci	dma_fence_odp_mr(mr);
2358c2ecf20Sopenharmony_ci
2368c2ecf20Sopenharmony_ci	mr->parent = NULL;
2378c2ecf20Sopenharmony_ci	mlx5_mr_cache_free(mr->dev, mr);
2388c2ecf20Sopenharmony_ci	ib_umem_odp_release(odp);
2398c2ecf20Sopenharmony_ci	if (atomic_dec_and_test(&imr->num_deferred_work))
2408c2ecf20Sopenharmony_ci		wake_up(&imr->q_deferred_work);
2418c2ecf20Sopenharmony_ci}
2428c2ecf20Sopenharmony_ci
2438c2ecf20Sopenharmony_cistatic void free_implicit_child_mr_work(struct work_struct *work)
2448c2ecf20Sopenharmony_ci{
2458c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *mr =
2468c2ecf20Sopenharmony_ci		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
2478c2ecf20Sopenharmony_ci
2488c2ecf20Sopenharmony_ci	free_implicit_child_mr(mr, true);
2498c2ecf20Sopenharmony_ci}
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_cistatic void free_implicit_child_mr_rcu(struct rcu_head *head)
2528c2ecf20Sopenharmony_ci{
2538c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *mr =
2548c2ecf20Sopenharmony_ci		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
2558c2ecf20Sopenharmony_ci
2568c2ecf20Sopenharmony_ci	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
2578c2ecf20Sopenharmony_ci	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
2588c2ecf20Sopenharmony_ci	queue_work(system_unbound_wq, &mr->odp_destroy.work);
2598c2ecf20Sopenharmony_ci}
2608c2ecf20Sopenharmony_ci
2618c2ecf20Sopenharmony_cistatic void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
2628c2ecf20Sopenharmony_ci{
2638c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
2648c2ecf20Sopenharmony_ci	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
2658c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *imr = mr->parent;
2668c2ecf20Sopenharmony_ci
2678c2ecf20Sopenharmony_ci	xa_lock(&imr->implicit_children);
2688c2ecf20Sopenharmony_ci	/*
2698c2ecf20Sopenharmony_ci	 * This can race with mlx5_ib_free_implicit_mr(), the first one to
2708c2ecf20Sopenharmony_ci	 * reach the xa lock wins the race and destroys the MR.
2718c2ecf20Sopenharmony_ci	 */
2728c2ecf20Sopenharmony_ci	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
2738c2ecf20Sopenharmony_ci	    mr)
2748c2ecf20Sopenharmony_ci		goto out_unlock;
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci	atomic_inc(&imr->num_deferred_work);
2778c2ecf20Sopenharmony_ci	call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
2788c2ecf20Sopenharmony_ci		  free_implicit_child_mr_rcu);
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_ciout_unlock:
2818c2ecf20Sopenharmony_ci	xa_unlock(&imr->implicit_children);
2828c2ecf20Sopenharmony_ci}
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_cistatic bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
2858c2ecf20Sopenharmony_ci				     const struct mmu_notifier_range *range,
2868c2ecf20Sopenharmony_ci				     unsigned long cur_seq)
2878c2ecf20Sopenharmony_ci{
2888c2ecf20Sopenharmony_ci	struct ib_umem_odp *umem_odp =
2898c2ecf20Sopenharmony_ci		container_of(mni, struct ib_umem_odp, notifier);
2908c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *mr;
2918c2ecf20Sopenharmony_ci	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
2928c2ecf20Sopenharmony_ci				    sizeof(struct mlx5_mtt)) - 1;
2938c2ecf20Sopenharmony_ci	u64 idx = 0, blk_start_idx = 0;
2948c2ecf20Sopenharmony_ci	u64 invalidations = 0;
2958c2ecf20Sopenharmony_ci	unsigned long start;
2968c2ecf20Sopenharmony_ci	unsigned long end;
2978c2ecf20Sopenharmony_ci	int in_block = 0;
2988c2ecf20Sopenharmony_ci	u64 addr;
2998c2ecf20Sopenharmony_ci
3008c2ecf20Sopenharmony_ci	if (!mmu_notifier_range_blockable(range))
3018c2ecf20Sopenharmony_ci		return false;
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci	mutex_lock(&umem_odp->umem_mutex);
3048c2ecf20Sopenharmony_ci	mmu_interval_set_seq(mni, cur_seq);
3058c2ecf20Sopenharmony_ci	/*
3068c2ecf20Sopenharmony_ci	 * If npages is zero then umem_odp->private may not be setup yet. This
3078c2ecf20Sopenharmony_ci	 * does not complete until after the first page is mapped for DMA.
3088c2ecf20Sopenharmony_ci	 */
3098c2ecf20Sopenharmony_ci	if (!umem_odp->npages)
3108c2ecf20Sopenharmony_ci		goto out;
3118c2ecf20Sopenharmony_ci	mr = umem_odp->private;
3128c2ecf20Sopenharmony_ci
3138c2ecf20Sopenharmony_ci	start = max_t(u64, ib_umem_start(umem_odp), range->start);
3148c2ecf20Sopenharmony_ci	end = min_t(u64, ib_umem_end(umem_odp), range->end);
3158c2ecf20Sopenharmony_ci
3168c2ecf20Sopenharmony_ci	/*
3178c2ecf20Sopenharmony_ci	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
3188c2ecf20Sopenharmony_ci	 * while we are doing the invalidation, no page fault will attempt to
3198c2ecf20Sopenharmony_ci	 * overwrite the same MTTs.  Concurent invalidations might race us,
3208c2ecf20Sopenharmony_ci	 * but they will write 0s as well, so no difference in the end result.
3218c2ecf20Sopenharmony_ci	 */
3228c2ecf20Sopenharmony_ci	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
3238c2ecf20Sopenharmony_ci		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
3248c2ecf20Sopenharmony_ci		/*
3258c2ecf20Sopenharmony_ci		 * Strive to write the MTTs in chunks, but avoid overwriting
3268c2ecf20Sopenharmony_ci		 * non-existing MTTs. The huristic here can be improved to
3278c2ecf20Sopenharmony_ci		 * estimate the cost of another UMR vs. the cost of bigger
3288c2ecf20Sopenharmony_ci		 * UMR.
3298c2ecf20Sopenharmony_ci		 */
3308c2ecf20Sopenharmony_ci		if (umem_odp->dma_list[idx] &
3318c2ecf20Sopenharmony_ci		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
3328c2ecf20Sopenharmony_ci			if (!in_block) {
3338c2ecf20Sopenharmony_ci				blk_start_idx = idx;
3348c2ecf20Sopenharmony_ci				in_block = 1;
3358c2ecf20Sopenharmony_ci			}
3368c2ecf20Sopenharmony_ci
3378c2ecf20Sopenharmony_ci			/* Count page invalidations */
3388c2ecf20Sopenharmony_ci			invalidations += idx - blk_start_idx + 1;
3398c2ecf20Sopenharmony_ci		} else {
3408c2ecf20Sopenharmony_ci			u64 umr_offset = idx & umr_block_mask;
3418c2ecf20Sopenharmony_ci
3428c2ecf20Sopenharmony_ci			if (in_block && umr_offset == 0) {
3438c2ecf20Sopenharmony_ci				mlx5_ib_update_xlt(mr, blk_start_idx,
3448c2ecf20Sopenharmony_ci						   idx - blk_start_idx, 0,
3458c2ecf20Sopenharmony_ci						   MLX5_IB_UPD_XLT_ZAP |
3468c2ecf20Sopenharmony_ci						   MLX5_IB_UPD_XLT_ATOMIC);
3478c2ecf20Sopenharmony_ci				in_block = 0;
3488c2ecf20Sopenharmony_ci			}
3498c2ecf20Sopenharmony_ci		}
3508c2ecf20Sopenharmony_ci	}
3518c2ecf20Sopenharmony_ci	if (in_block)
3528c2ecf20Sopenharmony_ci		mlx5_ib_update_xlt(mr, blk_start_idx,
3538c2ecf20Sopenharmony_ci				   idx - blk_start_idx + 1, 0,
3548c2ecf20Sopenharmony_ci				   MLX5_IB_UPD_XLT_ZAP |
3558c2ecf20Sopenharmony_ci				   MLX5_IB_UPD_XLT_ATOMIC);
3568c2ecf20Sopenharmony_ci
3578c2ecf20Sopenharmony_ci	mlx5_update_odp_stats(mr, invalidations, invalidations);
3588c2ecf20Sopenharmony_ci
3598c2ecf20Sopenharmony_ci	/*
3608c2ecf20Sopenharmony_ci	 * We are now sure that the device will not access the
3618c2ecf20Sopenharmony_ci	 * memory. We can safely unmap it, and mark it as dirty if
3628c2ecf20Sopenharmony_ci	 * needed.
3638c2ecf20Sopenharmony_ci	 */
3648c2ecf20Sopenharmony_ci
3658c2ecf20Sopenharmony_ci	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
3668c2ecf20Sopenharmony_ci
3678c2ecf20Sopenharmony_ci	if (unlikely(!umem_odp->npages && mr->parent))
3688c2ecf20Sopenharmony_ci		destroy_unused_implicit_child_mr(mr);
3698c2ecf20Sopenharmony_ciout:
3708c2ecf20Sopenharmony_ci	mutex_unlock(&umem_odp->umem_mutex);
3718c2ecf20Sopenharmony_ci	return true;
3728c2ecf20Sopenharmony_ci}
3738c2ecf20Sopenharmony_ci
3748c2ecf20Sopenharmony_ciconst struct mmu_interval_notifier_ops mlx5_mn_ops = {
3758c2ecf20Sopenharmony_ci	.invalidate = mlx5_ib_invalidate_range,
3768c2ecf20Sopenharmony_ci};
3778c2ecf20Sopenharmony_ci
3788c2ecf20Sopenharmony_civoid mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
3798c2ecf20Sopenharmony_ci{
3808c2ecf20Sopenharmony_ci	struct ib_odp_caps *caps = &dev->odp_caps;
3818c2ecf20Sopenharmony_ci
3828c2ecf20Sopenharmony_ci	memset(caps, 0, sizeof(*caps));
3838c2ecf20Sopenharmony_ci
3848c2ecf20Sopenharmony_ci	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
3858c2ecf20Sopenharmony_ci	    !mlx5_ib_can_load_pas_with_umr(dev, 0))
3868c2ecf20Sopenharmony_ci		return;
3878c2ecf20Sopenharmony_ci
3888c2ecf20Sopenharmony_ci	caps->general_caps = IB_ODP_SUPPORT;
3898c2ecf20Sopenharmony_ci
3908c2ecf20Sopenharmony_ci	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
3918c2ecf20Sopenharmony_ci		dev->odp_max_size = U64_MAX;
3928c2ecf20Sopenharmony_ci	else
3938c2ecf20Sopenharmony_ci		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);
3948c2ecf20Sopenharmony_ci
3958c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
3968c2ecf20Sopenharmony_ci		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
3978c2ecf20Sopenharmony_ci
3988c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
3998c2ecf20Sopenharmony_ci		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
4008c2ecf20Sopenharmony_ci
4018c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
4028c2ecf20Sopenharmony_ci		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
4038c2ecf20Sopenharmony_ci
4048c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
4058c2ecf20Sopenharmony_ci		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
4068c2ecf20Sopenharmony_ci
4078c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
4088c2ecf20Sopenharmony_ci		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
4118c2ecf20Sopenharmony_ci		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
4128c2ecf20Sopenharmony_ci
4138c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
4148c2ecf20Sopenharmony_ci		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
4158c2ecf20Sopenharmony_ci
4168c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
4178c2ecf20Sopenharmony_ci		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
4188c2ecf20Sopenharmony_ci
4198c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
4208c2ecf20Sopenharmony_ci		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
4218c2ecf20Sopenharmony_ci
4228c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
4238c2ecf20Sopenharmony_ci		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
4248c2ecf20Sopenharmony_ci
4258c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
4268c2ecf20Sopenharmony_ci		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
4278c2ecf20Sopenharmony_ci
4288c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
4298c2ecf20Sopenharmony_ci		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
4308c2ecf20Sopenharmony_ci
4318c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
4328c2ecf20Sopenharmony_ci		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
4338c2ecf20Sopenharmony_ci
4348c2ecf20Sopenharmony_ci	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
4358c2ecf20Sopenharmony_ci		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
4368c2ecf20Sopenharmony_ci
4378c2ecf20Sopenharmony_ci	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
4388c2ecf20Sopenharmony_ci	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
4398c2ecf20Sopenharmony_ci	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
4408c2ecf20Sopenharmony_ci	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
4418c2ecf20Sopenharmony_ci		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
4428c2ecf20Sopenharmony_ci}
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_cistatic void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
4458c2ecf20Sopenharmony_ci				      struct mlx5_pagefault *pfault,
4468c2ecf20Sopenharmony_ci				      int error)
4478c2ecf20Sopenharmony_ci{
4488c2ecf20Sopenharmony_ci	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
4498c2ecf20Sopenharmony_ci		     pfault->wqe.wq_num : pfault->token;
4508c2ecf20Sopenharmony_ci	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
4518c2ecf20Sopenharmony_ci	int err;
4528c2ecf20Sopenharmony_ci
4538c2ecf20Sopenharmony_ci	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
4548c2ecf20Sopenharmony_ci	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
4558c2ecf20Sopenharmony_ci	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
4568c2ecf20Sopenharmony_ci	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
4578c2ecf20Sopenharmony_ci	MLX5_SET(page_fault_resume_in, in, error, !!error);
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
4608c2ecf20Sopenharmony_ci	if (err)
4618c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
4628c2ecf20Sopenharmony_ci			    wq_num, err);
4638c2ecf20Sopenharmony_ci}
4648c2ecf20Sopenharmony_ci
4658c2ecf20Sopenharmony_cistatic struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
4668c2ecf20Sopenharmony_ci						unsigned long idx)
4678c2ecf20Sopenharmony_ci{
4688c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp;
4698c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *mr;
4708c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *ret;
4718c2ecf20Sopenharmony_ci	int err;
4728c2ecf20Sopenharmony_ci
4738c2ecf20Sopenharmony_ci	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
4748c2ecf20Sopenharmony_ci				      idx * MLX5_IMR_MTT_SIZE,
4758c2ecf20Sopenharmony_ci				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
4768c2ecf20Sopenharmony_ci	if (IS_ERR(odp))
4778c2ecf20Sopenharmony_ci		return ERR_CAST(odp);
4788c2ecf20Sopenharmony_ci
4798c2ecf20Sopenharmony_ci	ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY,
4808c2ecf20Sopenharmony_ci				       imr->access_flags);
4818c2ecf20Sopenharmony_ci	if (IS_ERR(mr))
4828c2ecf20Sopenharmony_ci		goto out_umem;
4838c2ecf20Sopenharmony_ci
4848c2ecf20Sopenharmony_ci	mr->ibmr.pd = imr->ibmr.pd;
4858c2ecf20Sopenharmony_ci	mr->umem = &odp->umem;
4868c2ecf20Sopenharmony_ci	mr->ibmr.lkey = mr->mmkey.key;
4878c2ecf20Sopenharmony_ci	mr->ibmr.rkey = mr->mmkey.key;
4888c2ecf20Sopenharmony_ci	mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
4898c2ecf20Sopenharmony_ci	mr->parent = imr;
4908c2ecf20Sopenharmony_ci	odp->private = mr;
4918c2ecf20Sopenharmony_ci
4928c2ecf20Sopenharmony_ci	err = mlx5_ib_update_xlt(mr, 0,
4938c2ecf20Sopenharmony_ci				 MLX5_IMR_MTT_ENTRIES,
4948c2ecf20Sopenharmony_ci				 PAGE_SHIFT,
4958c2ecf20Sopenharmony_ci				 MLX5_IB_UPD_XLT_ZAP |
4968c2ecf20Sopenharmony_ci				 MLX5_IB_UPD_XLT_ENABLE);
4978c2ecf20Sopenharmony_ci	if (err) {
4988c2ecf20Sopenharmony_ci		ret = ERR_PTR(err);
4998c2ecf20Sopenharmony_ci		goto out_mr;
5008c2ecf20Sopenharmony_ci	}
5018c2ecf20Sopenharmony_ci
5028c2ecf20Sopenharmony_ci	/*
5038c2ecf20Sopenharmony_ci	 * Once the store to either xarray completes any error unwind has to
5048c2ecf20Sopenharmony_ci	 * use synchronize_srcu(). Avoid this with xa_reserve()
5058c2ecf20Sopenharmony_ci	 */
5068c2ecf20Sopenharmony_ci	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
5078c2ecf20Sopenharmony_ci			 GFP_KERNEL);
5088c2ecf20Sopenharmony_ci	if (unlikely(ret)) {
5098c2ecf20Sopenharmony_ci		if (xa_is_err(ret)) {
5108c2ecf20Sopenharmony_ci			ret = ERR_PTR(xa_err(ret));
5118c2ecf20Sopenharmony_ci			goto out_mr;
5128c2ecf20Sopenharmony_ci		}
5138c2ecf20Sopenharmony_ci		/*
5148c2ecf20Sopenharmony_ci		 * Another thread beat us to creating the child mr, use
5158c2ecf20Sopenharmony_ci		 * theirs.
5168c2ecf20Sopenharmony_ci		 */
5178c2ecf20Sopenharmony_ci		goto out_mr;
5188c2ecf20Sopenharmony_ci	}
5198c2ecf20Sopenharmony_ci
5208c2ecf20Sopenharmony_ci	mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
5218c2ecf20Sopenharmony_ci	return mr;
5228c2ecf20Sopenharmony_ci
5238c2ecf20Sopenharmony_ciout_mr:
5248c2ecf20Sopenharmony_ci	mlx5_mr_cache_free(imr->dev, mr);
5258c2ecf20Sopenharmony_ciout_umem:
5268c2ecf20Sopenharmony_ci	ib_umem_odp_release(odp);
5278c2ecf20Sopenharmony_ci	return ret;
5288c2ecf20Sopenharmony_ci}
5298c2ecf20Sopenharmony_ci
5308c2ecf20Sopenharmony_cistruct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
5318c2ecf20Sopenharmony_ci					     struct ib_udata *udata,
5328c2ecf20Sopenharmony_ci					     int access_flags)
5338c2ecf20Sopenharmony_ci{
5348c2ecf20Sopenharmony_ci	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
5358c2ecf20Sopenharmony_ci	struct ib_umem_odp *umem_odp;
5368c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *imr;
5378c2ecf20Sopenharmony_ci	int err;
5388c2ecf20Sopenharmony_ci
5398c2ecf20Sopenharmony_ci	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
5408c2ecf20Sopenharmony_ci	if (IS_ERR(umem_odp))
5418c2ecf20Sopenharmony_ci		return ERR_CAST(umem_odp);
5428c2ecf20Sopenharmony_ci
5438c2ecf20Sopenharmony_ci	imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
5448c2ecf20Sopenharmony_ci	if (IS_ERR(imr)) {
5458c2ecf20Sopenharmony_ci		err = PTR_ERR(imr);
5468c2ecf20Sopenharmony_ci		goto out_umem;
5478c2ecf20Sopenharmony_ci	}
5488c2ecf20Sopenharmony_ci
5498c2ecf20Sopenharmony_ci	imr->ibmr.pd = &pd->ibpd;
5508c2ecf20Sopenharmony_ci	imr->mmkey.iova = 0;
5518c2ecf20Sopenharmony_ci	imr->umem = &umem_odp->umem;
5528c2ecf20Sopenharmony_ci	imr->ibmr.lkey = imr->mmkey.key;
5538c2ecf20Sopenharmony_ci	imr->ibmr.rkey = imr->mmkey.key;
5548c2ecf20Sopenharmony_ci	imr->umem = &umem_odp->umem;
5558c2ecf20Sopenharmony_ci	imr->is_odp_implicit = true;
5568c2ecf20Sopenharmony_ci	atomic_set(&imr->num_deferred_work, 0);
5578c2ecf20Sopenharmony_ci	init_waitqueue_head(&imr->q_deferred_work);
5588c2ecf20Sopenharmony_ci	xa_init(&imr->implicit_children);
5598c2ecf20Sopenharmony_ci
5608c2ecf20Sopenharmony_ci	err = mlx5_ib_update_xlt(imr, 0,
5618c2ecf20Sopenharmony_ci				 mlx5_imr_ksm_entries,
5628c2ecf20Sopenharmony_ci				 MLX5_KSM_PAGE_SHIFT,
5638c2ecf20Sopenharmony_ci				 MLX5_IB_UPD_XLT_INDIRECT |
5648c2ecf20Sopenharmony_ci				 MLX5_IB_UPD_XLT_ZAP |
5658c2ecf20Sopenharmony_ci				 MLX5_IB_UPD_XLT_ENABLE);
5668c2ecf20Sopenharmony_ci	if (err)
5678c2ecf20Sopenharmony_ci		goto out_mr;
5688c2ecf20Sopenharmony_ci
5698c2ecf20Sopenharmony_ci	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
5708c2ecf20Sopenharmony_ci			      &imr->mmkey, GFP_KERNEL));
5718c2ecf20Sopenharmony_ci	if (err)
5728c2ecf20Sopenharmony_ci		goto out_mr;
5738c2ecf20Sopenharmony_ci
5748c2ecf20Sopenharmony_ci	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
5758c2ecf20Sopenharmony_ci	return imr;
5768c2ecf20Sopenharmony_ciout_mr:
5778c2ecf20Sopenharmony_ci	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
5788c2ecf20Sopenharmony_ci	mlx5_mr_cache_free(dev, imr);
5798c2ecf20Sopenharmony_ciout_umem:
5808c2ecf20Sopenharmony_ci	ib_umem_odp_release(umem_odp);
5818c2ecf20Sopenharmony_ci	return ERR_PTR(err);
5828c2ecf20Sopenharmony_ci}
5838c2ecf20Sopenharmony_ci
5848c2ecf20Sopenharmony_civoid mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
5858c2ecf20Sopenharmony_ci{
5868c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
5878c2ecf20Sopenharmony_ci	struct mlx5_ib_dev *dev = imr->dev;
5888c2ecf20Sopenharmony_ci	struct list_head destroy_list;
5898c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *mtt;
5908c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *tmp;
5918c2ecf20Sopenharmony_ci	unsigned long idx;
5928c2ecf20Sopenharmony_ci
5938c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&destroy_list);
5948c2ecf20Sopenharmony_ci
5958c2ecf20Sopenharmony_ci	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
5968c2ecf20Sopenharmony_ci	/*
5978c2ecf20Sopenharmony_ci	 * This stops the SRCU protected page fault path from touching either
5988c2ecf20Sopenharmony_ci	 * the imr or any children. The page fault path can only reach the
5998c2ecf20Sopenharmony_ci	 * children xarray via the imr.
6008c2ecf20Sopenharmony_ci	 */
6018c2ecf20Sopenharmony_ci	synchronize_srcu(&dev->odp_srcu);
6028c2ecf20Sopenharmony_ci
6038c2ecf20Sopenharmony_ci	/*
6048c2ecf20Sopenharmony_ci	 * All work on the prefetch list must be completed, xa_erase() prevented
6058c2ecf20Sopenharmony_ci	 * new work from being created.
6068c2ecf20Sopenharmony_ci	 */
6078c2ecf20Sopenharmony_ci	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
6088c2ecf20Sopenharmony_ci
6098c2ecf20Sopenharmony_ci	/*
6108c2ecf20Sopenharmony_ci	 * At this point it is forbidden for any other thread to enter
6118c2ecf20Sopenharmony_ci	 * pagefault_mr() on this imr. It is already forbidden to call
6128c2ecf20Sopenharmony_ci	 * pagefault_mr() on an implicit child. Due to this additions to
6138c2ecf20Sopenharmony_ci	 * implicit_children are prevented.
6148c2ecf20Sopenharmony_ci	 */
6158c2ecf20Sopenharmony_ci
6168c2ecf20Sopenharmony_ci	/*
6178c2ecf20Sopenharmony_ci	 * Block destroy_unused_implicit_child_mr() from incrementing
6188c2ecf20Sopenharmony_ci	 * num_deferred_work.
6198c2ecf20Sopenharmony_ci	 */
6208c2ecf20Sopenharmony_ci	xa_lock(&imr->implicit_children);
6218c2ecf20Sopenharmony_ci	xa_for_each (&imr->implicit_children, idx, mtt) {
6228c2ecf20Sopenharmony_ci		__xa_erase(&imr->implicit_children, idx);
6238c2ecf20Sopenharmony_ci		list_add(&mtt->odp_destroy.elm, &destroy_list);
6248c2ecf20Sopenharmony_ci	}
6258c2ecf20Sopenharmony_ci	xa_unlock(&imr->implicit_children);
6268c2ecf20Sopenharmony_ci
6278c2ecf20Sopenharmony_ci	/*
6288c2ecf20Sopenharmony_ci	 * Wait for any concurrent destroy_unused_implicit_child_mr() to
6298c2ecf20Sopenharmony_ci	 * complete.
6308c2ecf20Sopenharmony_ci	 */
6318c2ecf20Sopenharmony_ci	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
6328c2ecf20Sopenharmony_ci
6338c2ecf20Sopenharmony_ci	/*
6348c2ecf20Sopenharmony_ci	 * Fence the imr before we destroy the children. This allows us to
6358c2ecf20Sopenharmony_ci	 * skip updating the XLT of the imr during destroy of the child mkey
6368c2ecf20Sopenharmony_ci	 * the imr points to.
6378c2ecf20Sopenharmony_ci	 */
6388c2ecf20Sopenharmony_ci	mlx5_mr_cache_invalidate(imr);
6398c2ecf20Sopenharmony_ci
6408c2ecf20Sopenharmony_ci	list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
6418c2ecf20Sopenharmony_ci		free_implicit_child_mr(mtt, false);
6428c2ecf20Sopenharmony_ci
6438c2ecf20Sopenharmony_ci	mlx5_mr_cache_free(dev, imr);
6448c2ecf20Sopenharmony_ci	ib_umem_odp_release(odp_imr);
6458c2ecf20Sopenharmony_ci}
6468c2ecf20Sopenharmony_ci
6478c2ecf20Sopenharmony_ci/**
6488c2ecf20Sopenharmony_ci * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
6498c2ecf20Sopenharmony_ci * @mr: to fence
6508c2ecf20Sopenharmony_ci *
6518c2ecf20Sopenharmony_ci * On return no parallel threads will be touching this MR and no DMA will be
6528c2ecf20Sopenharmony_ci * active.
6538c2ecf20Sopenharmony_ci */
6548c2ecf20Sopenharmony_civoid mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
6558c2ecf20Sopenharmony_ci{
6568c2ecf20Sopenharmony_ci	/* Prevent new page faults and prefetch requests from succeeding */
6578c2ecf20Sopenharmony_ci	xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
6588c2ecf20Sopenharmony_ci
6598c2ecf20Sopenharmony_ci	/* Wait for all running page-fault handlers to finish. */
6608c2ecf20Sopenharmony_ci	synchronize_srcu(&mr->dev->odp_srcu);
6618c2ecf20Sopenharmony_ci
6628c2ecf20Sopenharmony_ci	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
6638c2ecf20Sopenharmony_ci
6648c2ecf20Sopenharmony_ci	dma_fence_odp_mr(mr);
6658c2ecf20Sopenharmony_ci}
6668c2ecf20Sopenharmony_ci
6678c2ecf20Sopenharmony_ci#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
6688c2ecf20Sopenharmony_ci#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
6698c2ecf20Sopenharmony_ci#define MLX5_PF_FLAGS_ENABLE BIT(3)
6708c2ecf20Sopenharmony_cistatic int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
6718c2ecf20Sopenharmony_ci			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
6728c2ecf20Sopenharmony_ci			     u32 flags)
6738c2ecf20Sopenharmony_ci{
6748c2ecf20Sopenharmony_ci	int page_shift, ret, np;
6758c2ecf20Sopenharmony_ci	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
6768c2ecf20Sopenharmony_ci	u64 access_mask;
6778c2ecf20Sopenharmony_ci	u64 start_idx;
6788c2ecf20Sopenharmony_ci	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
6798c2ecf20Sopenharmony_ci	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
6808c2ecf20Sopenharmony_ci
6818c2ecf20Sopenharmony_ci	if (flags & MLX5_PF_FLAGS_ENABLE)
6828c2ecf20Sopenharmony_ci		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
6838c2ecf20Sopenharmony_ci
6848c2ecf20Sopenharmony_ci	page_shift = odp->page_shift;
6858c2ecf20Sopenharmony_ci	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
6868c2ecf20Sopenharmony_ci	access_mask = ODP_READ_ALLOWED_BIT;
6878c2ecf20Sopenharmony_ci
6888c2ecf20Sopenharmony_ci	if (odp->umem.writable && !downgrade)
6898c2ecf20Sopenharmony_ci		access_mask |= ODP_WRITE_ALLOWED_BIT;
6908c2ecf20Sopenharmony_ci
6918c2ecf20Sopenharmony_ci	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
6928c2ecf20Sopenharmony_ci	if (np < 0)
6938c2ecf20Sopenharmony_ci		return np;
6948c2ecf20Sopenharmony_ci
6958c2ecf20Sopenharmony_ci	/*
6968c2ecf20Sopenharmony_ci	 * No need to check whether the MTTs really belong to this MR, since
6978c2ecf20Sopenharmony_ci	 * ib_umem_odp_map_dma_and_lock already checks this.
6988c2ecf20Sopenharmony_ci	 */
6998c2ecf20Sopenharmony_ci	ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
7008c2ecf20Sopenharmony_ci	mutex_unlock(&odp->umem_mutex);
7018c2ecf20Sopenharmony_ci
7028c2ecf20Sopenharmony_ci	if (ret < 0) {
7038c2ecf20Sopenharmony_ci		if (ret != -EAGAIN)
7048c2ecf20Sopenharmony_ci			mlx5_ib_err(mr->dev,
7058c2ecf20Sopenharmony_ci				    "Failed to update mkey page tables\n");
7068c2ecf20Sopenharmony_ci		goto out;
7078c2ecf20Sopenharmony_ci	}
7088c2ecf20Sopenharmony_ci
7098c2ecf20Sopenharmony_ci	if (bytes_mapped) {
7108c2ecf20Sopenharmony_ci		u32 new_mappings = (np << page_shift) -
7118c2ecf20Sopenharmony_ci			(user_va - round_down(user_va, 1 << page_shift));
7128c2ecf20Sopenharmony_ci
7138c2ecf20Sopenharmony_ci		*bytes_mapped += min_t(u32, new_mappings, bcnt);
7148c2ecf20Sopenharmony_ci	}
7158c2ecf20Sopenharmony_ci
7168c2ecf20Sopenharmony_ci	return np << (page_shift - PAGE_SHIFT);
7178c2ecf20Sopenharmony_ci
7188c2ecf20Sopenharmony_ciout:
7198c2ecf20Sopenharmony_ci	return ret;
7208c2ecf20Sopenharmony_ci}
7218c2ecf20Sopenharmony_ci
7228c2ecf20Sopenharmony_cistatic int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
7238c2ecf20Sopenharmony_ci				 struct ib_umem_odp *odp_imr, u64 user_va,
7248c2ecf20Sopenharmony_ci				 size_t bcnt, u32 *bytes_mapped, u32 flags)
7258c2ecf20Sopenharmony_ci{
7268c2ecf20Sopenharmony_ci	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
7278c2ecf20Sopenharmony_ci	unsigned long upd_start_idx = end_idx + 1;
7288c2ecf20Sopenharmony_ci	unsigned long upd_len = 0;
7298c2ecf20Sopenharmony_ci	unsigned long npages = 0;
7308c2ecf20Sopenharmony_ci	int err;
7318c2ecf20Sopenharmony_ci	int ret;
7328c2ecf20Sopenharmony_ci
7338c2ecf20Sopenharmony_ci	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
7348c2ecf20Sopenharmony_ci		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
7358c2ecf20Sopenharmony_ci		return -EFAULT;
7368c2ecf20Sopenharmony_ci
7378c2ecf20Sopenharmony_ci	/* Fault each child mr that intersects with our interval. */
7388c2ecf20Sopenharmony_ci	while (bcnt) {
7398c2ecf20Sopenharmony_ci		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
7408c2ecf20Sopenharmony_ci		struct ib_umem_odp *umem_odp;
7418c2ecf20Sopenharmony_ci		struct mlx5_ib_mr *mtt;
7428c2ecf20Sopenharmony_ci		u64 len;
7438c2ecf20Sopenharmony_ci
7448c2ecf20Sopenharmony_ci		mtt = xa_load(&imr->implicit_children, idx);
7458c2ecf20Sopenharmony_ci		if (unlikely(!mtt)) {
7468c2ecf20Sopenharmony_ci			mtt = implicit_get_child_mr(imr, idx);
7478c2ecf20Sopenharmony_ci			if (IS_ERR(mtt)) {
7488c2ecf20Sopenharmony_ci				ret = PTR_ERR(mtt);
7498c2ecf20Sopenharmony_ci				goto out;
7508c2ecf20Sopenharmony_ci			}
7518c2ecf20Sopenharmony_ci			upd_start_idx = min(upd_start_idx, idx);
7528c2ecf20Sopenharmony_ci			upd_len = idx - upd_start_idx + 1;
7538c2ecf20Sopenharmony_ci		}
7548c2ecf20Sopenharmony_ci
7558c2ecf20Sopenharmony_ci		umem_odp = to_ib_umem_odp(mtt->umem);
7568c2ecf20Sopenharmony_ci		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
7578c2ecf20Sopenharmony_ci		      user_va;
7588c2ecf20Sopenharmony_ci
7598c2ecf20Sopenharmony_ci		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
7608c2ecf20Sopenharmony_ci					bytes_mapped, flags);
7618c2ecf20Sopenharmony_ci		if (ret < 0)
7628c2ecf20Sopenharmony_ci			goto out;
7638c2ecf20Sopenharmony_ci		user_va += len;
7648c2ecf20Sopenharmony_ci		bcnt -= len;
7658c2ecf20Sopenharmony_ci		npages += ret;
7668c2ecf20Sopenharmony_ci	}
7678c2ecf20Sopenharmony_ci
7688c2ecf20Sopenharmony_ci	ret = npages;
7698c2ecf20Sopenharmony_ci
7708c2ecf20Sopenharmony_ci	/*
7718c2ecf20Sopenharmony_ci	 * Any time the implicit_children are changed we must perform an
7728c2ecf20Sopenharmony_ci	 * update of the xlt before exiting to ensure the HW and the
7738c2ecf20Sopenharmony_ci	 * implicit_children remains synchronized.
7748c2ecf20Sopenharmony_ci	 */
7758c2ecf20Sopenharmony_ciout:
7768c2ecf20Sopenharmony_ci	if (likely(!upd_len))
7778c2ecf20Sopenharmony_ci		return ret;
7788c2ecf20Sopenharmony_ci
7798c2ecf20Sopenharmony_ci	/*
7808c2ecf20Sopenharmony_ci	 * Notice this is not strictly ordered right, the KSM is updated after
7818c2ecf20Sopenharmony_ci	 * the implicit_children is updated, so a parallel page fault could
7828c2ecf20Sopenharmony_ci	 * see a MR that is not yet visible in the KSM.  This is similar to a
7838c2ecf20Sopenharmony_ci	 * parallel page fault seeing a MR that is being concurrently removed
7848c2ecf20Sopenharmony_ci	 * from the KSM. Both of these improbable situations are resolved
7858c2ecf20Sopenharmony_ci	 * safely by resuming the HW and then taking another page fault. The
7868c2ecf20Sopenharmony_ci	 * next pagefault handler will see the new information.
7878c2ecf20Sopenharmony_ci	 */
7888c2ecf20Sopenharmony_ci	mutex_lock(&odp_imr->umem_mutex);
7898c2ecf20Sopenharmony_ci	err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
7908c2ecf20Sopenharmony_ci				 MLX5_IB_UPD_XLT_INDIRECT |
7918c2ecf20Sopenharmony_ci					 MLX5_IB_UPD_XLT_ATOMIC);
7928c2ecf20Sopenharmony_ci	mutex_unlock(&odp_imr->umem_mutex);
7938c2ecf20Sopenharmony_ci	if (err) {
7948c2ecf20Sopenharmony_ci		mlx5_ib_err(imr->dev, "Failed to update PAS\n");
7958c2ecf20Sopenharmony_ci		return err;
7968c2ecf20Sopenharmony_ci	}
7978c2ecf20Sopenharmony_ci	return ret;
7988c2ecf20Sopenharmony_ci}
7998c2ecf20Sopenharmony_ci
8008c2ecf20Sopenharmony_ci/*
8018c2ecf20Sopenharmony_ci * Returns:
8028c2ecf20Sopenharmony_ci *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
8038c2ecf20Sopenharmony_ci *           not accessible, or the MR is no longer valid.
8048c2ecf20Sopenharmony_ci *  -EAGAIN/-ENOMEM: The operation should be retried
8058c2ecf20Sopenharmony_ci *
8068c2ecf20Sopenharmony_ci *  -EINVAL/others: General internal malfunction
8078c2ecf20Sopenharmony_ci *  >0: Number of pages mapped
8088c2ecf20Sopenharmony_ci */
8098c2ecf20Sopenharmony_cistatic int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
8108c2ecf20Sopenharmony_ci			u32 *bytes_mapped, u32 flags)
8118c2ecf20Sopenharmony_ci{
8128c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
8138c2ecf20Sopenharmony_ci
8148c2ecf20Sopenharmony_ci	lockdep_assert_held(&mr->dev->odp_srcu);
8158c2ecf20Sopenharmony_ci	if (unlikely(io_virt < mr->mmkey.iova))
8168c2ecf20Sopenharmony_ci		return -EFAULT;
8178c2ecf20Sopenharmony_ci
8188c2ecf20Sopenharmony_ci	if (!odp->is_implicit_odp) {
8198c2ecf20Sopenharmony_ci		u64 user_va;
8208c2ecf20Sopenharmony_ci
8218c2ecf20Sopenharmony_ci		if (check_add_overflow(io_virt - mr->mmkey.iova,
8228c2ecf20Sopenharmony_ci				       (u64)odp->umem.address, &user_va))
8238c2ecf20Sopenharmony_ci			return -EFAULT;
8248c2ecf20Sopenharmony_ci		if (unlikely(user_va >= ib_umem_end(odp) ||
8258c2ecf20Sopenharmony_ci			     ib_umem_end(odp) - user_va < bcnt))
8268c2ecf20Sopenharmony_ci			return -EFAULT;
8278c2ecf20Sopenharmony_ci		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
8288c2ecf20Sopenharmony_ci					 flags);
8298c2ecf20Sopenharmony_ci	}
8308c2ecf20Sopenharmony_ci	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
8318c2ecf20Sopenharmony_ci				     flags);
8328c2ecf20Sopenharmony_ci}
8338c2ecf20Sopenharmony_ci
8348c2ecf20Sopenharmony_ciint mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
8358c2ecf20Sopenharmony_ci{
8368c2ecf20Sopenharmony_ci	u32 flags = MLX5_PF_FLAGS_SNAPSHOT;
8378c2ecf20Sopenharmony_ci	int ret;
8388c2ecf20Sopenharmony_ci
8398c2ecf20Sopenharmony_ci	if (enable)
8408c2ecf20Sopenharmony_ci		flags |= MLX5_PF_FLAGS_ENABLE;
8418c2ecf20Sopenharmony_ci
8428c2ecf20Sopenharmony_ci	ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem),
8438c2ecf20Sopenharmony_ci				mr->umem->address, mr->umem->length, NULL,
8448c2ecf20Sopenharmony_ci				flags);
8458c2ecf20Sopenharmony_ci	return ret >= 0 ? 0 : ret;
8468c2ecf20Sopenharmony_ci}
8478c2ecf20Sopenharmony_ci
8488c2ecf20Sopenharmony_cistruct pf_frame {
8498c2ecf20Sopenharmony_ci	struct pf_frame *next;
8508c2ecf20Sopenharmony_ci	u32 key;
8518c2ecf20Sopenharmony_ci	u64 io_virt;
8528c2ecf20Sopenharmony_ci	size_t bcnt;
8538c2ecf20Sopenharmony_ci	int depth;
8548c2ecf20Sopenharmony_ci};
8558c2ecf20Sopenharmony_ci
8568c2ecf20Sopenharmony_cistatic bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
8578c2ecf20Sopenharmony_ci{
8588c2ecf20Sopenharmony_ci	if (!mmkey)
8598c2ecf20Sopenharmony_ci		return false;
8608c2ecf20Sopenharmony_ci	if (mmkey->type == MLX5_MKEY_MW)
8618c2ecf20Sopenharmony_ci		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
8628c2ecf20Sopenharmony_ci	return mmkey->key == key;
8638c2ecf20Sopenharmony_ci}
8648c2ecf20Sopenharmony_ci
8658c2ecf20Sopenharmony_cistatic int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
8668c2ecf20Sopenharmony_ci{
8678c2ecf20Sopenharmony_ci	struct mlx5_ib_mw *mw;
8688c2ecf20Sopenharmony_ci	struct mlx5_ib_devx_mr *devx_mr;
8698c2ecf20Sopenharmony_ci
8708c2ecf20Sopenharmony_ci	if (mmkey->type == MLX5_MKEY_MW) {
8718c2ecf20Sopenharmony_ci		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
8728c2ecf20Sopenharmony_ci		return mw->ndescs;
8738c2ecf20Sopenharmony_ci	}
8748c2ecf20Sopenharmony_ci
8758c2ecf20Sopenharmony_ci	devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
8768c2ecf20Sopenharmony_ci			       mmkey);
8778c2ecf20Sopenharmony_ci	return devx_mr->ndescs;
8788c2ecf20Sopenharmony_ci}
8798c2ecf20Sopenharmony_ci
8808c2ecf20Sopenharmony_ci/*
8818c2ecf20Sopenharmony_ci * Handle a single data segment in a page-fault WQE or RDMA region.
8828c2ecf20Sopenharmony_ci *
8838c2ecf20Sopenharmony_ci * Returns number of OS pages retrieved on success. The caller may continue to
8848c2ecf20Sopenharmony_ci * the next data segment.
8858c2ecf20Sopenharmony_ci * Can return the following error codes:
8868c2ecf20Sopenharmony_ci * -EAGAIN to designate a temporary error. The caller will abort handling the
8878c2ecf20Sopenharmony_ci *  page fault and resolve it.
8888c2ecf20Sopenharmony_ci * -EFAULT when there's an error mapping the requested pages. The caller will
8898c2ecf20Sopenharmony_ci *  abort the page fault handling.
8908c2ecf20Sopenharmony_ci */
8918c2ecf20Sopenharmony_cistatic int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
8928c2ecf20Sopenharmony_ci					 struct ib_pd *pd, u32 key,
8938c2ecf20Sopenharmony_ci					 u64 io_virt, size_t bcnt,
8948c2ecf20Sopenharmony_ci					 u32 *bytes_committed,
8958c2ecf20Sopenharmony_ci					 u32 *bytes_mapped)
8968c2ecf20Sopenharmony_ci{
8978c2ecf20Sopenharmony_ci	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
8988c2ecf20Sopenharmony_ci	struct pf_frame *head = NULL, *frame;
8998c2ecf20Sopenharmony_ci	struct mlx5_core_mkey *mmkey;
9008c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *mr;
9018c2ecf20Sopenharmony_ci	struct mlx5_klm *pklm;
9028c2ecf20Sopenharmony_ci	u32 *out = NULL;
9038c2ecf20Sopenharmony_ci	size_t offset;
9048c2ecf20Sopenharmony_ci	int ndescs;
9058c2ecf20Sopenharmony_ci
9068c2ecf20Sopenharmony_ci	srcu_key = srcu_read_lock(&dev->odp_srcu);
9078c2ecf20Sopenharmony_ci
9088c2ecf20Sopenharmony_ci	io_virt += *bytes_committed;
9098c2ecf20Sopenharmony_ci	bcnt -= *bytes_committed;
9108c2ecf20Sopenharmony_ci
9118c2ecf20Sopenharmony_cinext_mr:
9128c2ecf20Sopenharmony_ci	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
9138c2ecf20Sopenharmony_ci	if (!mmkey) {
9148c2ecf20Sopenharmony_ci		mlx5_ib_dbg(
9158c2ecf20Sopenharmony_ci			dev,
9168c2ecf20Sopenharmony_ci			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
9178c2ecf20Sopenharmony_ci			key);
9188c2ecf20Sopenharmony_ci		if (bytes_mapped)
9198c2ecf20Sopenharmony_ci			*bytes_mapped += bcnt;
9208c2ecf20Sopenharmony_ci		/*
9218c2ecf20Sopenharmony_ci		 * The user could specify a SGL with multiple lkeys and only
9228c2ecf20Sopenharmony_ci		 * some of them are ODP. Treat the non-ODP ones as fully
9238c2ecf20Sopenharmony_ci		 * faulted.
9248c2ecf20Sopenharmony_ci		 */
9258c2ecf20Sopenharmony_ci		ret = 0;
9268c2ecf20Sopenharmony_ci		goto srcu_unlock;
9278c2ecf20Sopenharmony_ci	}
9288c2ecf20Sopenharmony_ci	if (!mkey_is_eq(mmkey, key)) {
9298c2ecf20Sopenharmony_ci		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
9308c2ecf20Sopenharmony_ci		ret = -EFAULT;
9318c2ecf20Sopenharmony_ci		goto srcu_unlock;
9328c2ecf20Sopenharmony_ci	}
9338c2ecf20Sopenharmony_ci
9348c2ecf20Sopenharmony_ci	switch (mmkey->type) {
9358c2ecf20Sopenharmony_ci	case MLX5_MKEY_MR:
9368c2ecf20Sopenharmony_ci		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
9378c2ecf20Sopenharmony_ci
9388c2ecf20Sopenharmony_ci		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
9398c2ecf20Sopenharmony_ci		if (ret < 0)
9408c2ecf20Sopenharmony_ci			goto srcu_unlock;
9418c2ecf20Sopenharmony_ci
9428c2ecf20Sopenharmony_ci		mlx5_update_odp_stats(mr, faults, ret);
9438c2ecf20Sopenharmony_ci
9448c2ecf20Sopenharmony_ci		npages += ret;
9458c2ecf20Sopenharmony_ci		ret = 0;
9468c2ecf20Sopenharmony_ci		break;
9478c2ecf20Sopenharmony_ci
9488c2ecf20Sopenharmony_ci	case MLX5_MKEY_MW:
9498c2ecf20Sopenharmony_ci	case MLX5_MKEY_INDIRECT_DEVX:
9508c2ecf20Sopenharmony_ci		ndescs = get_indirect_num_descs(mmkey);
9518c2ecf20Sopenharmony_ci
9528c2ecf20Sopenharmony_ci		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
9538c2ecf20Sopenharmony_ci			mlx5_ib_dbg(dev, "indirection level exceeded\n");
9548c2ecf20Sopenharmony_ci			ret = -EFAULT;
9558c2ecf20Sopenharmony_ci			goto srcu_unlock;
9568c2ecf20Sopenharmony_ci		}
9578c2ecf20Sopenharmony_ci
9588c2ecf20Sopenharmony_ci		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
9598c2ecf20Sopenharmony_ci			sizeof(*pklm) * (ndescs - 2);
9608c2ecf20Sopenharmony_ci
9618c2ecf20Sopenharmony_ci		if (outlen > cur_outlen) {
9628c2ecf20Sopenharmony_ci			kfree(out);
9638c2ecf20Sopenharmony_ci			out = kzalloc(outlen, GFP_KERNEL);
9648c2ecf20Sopenharmony_ci			if (!out) {
9658c2ecf20Sopenharmony_ci				ret = -ENOMEM;
9668c2ecf20Sopenharmony_ci				goto srcu_unlock;
9678c2ecf20Sopenharmony_ci			}
9688c2ecf20Sopenharmony_ci			cur_outlen = outlen;
9698c2ecf20Sopenharmony_ci		}
9708c2ecf20Sopenharmony_ci
9718c2ecf20Sopenharmony_ci		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
9728c2ecf20Sopenharmony_ci						       bsf0_klm0_pas_mtt0_1);
9738c2ecf20Sopenharmony_ci
9748c2ecf20Sopenharmony_ci		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
9758c2ecf20Sopenharmony_ci		if (ret)
9768c2ecf20Sopenharmony_ci			goto srcu_unlock;
9778c2ecf20Sopenharmony_ci
9788c2ecf20Sopenharmony_ci		offset = io_virt - MLX5_GET64(query_mkey_out, out,
9798c2ecf20Sopenharmony_ci					      memory_key_mkey_entry.start_addr);
9808c2ecf20Sopenharmony_ci
9818c2ecf20Sopenharmony_ci		for (i = 0; bcnt && i < ndescs; i++, pklm++) {
9828c2ecf20Sopenharmony_ci			if (offset >= be32_to_cpu(pklm->bcount)) {
9838c2ecf20Sopenharmony_ci				offset -= be32_to_cpu(pklm->bcount);
9848c2ecf20Sopenharmony_ci				continue;
9858c2ecf20Sopenharmony_ci			}
9868c2ecf20Sopenharmony_ci
9878c2ecf20Sopenharmony_ci			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
9888c2ecf20Sopenharmony_ci			if (!frame) {
9898c2ecf20Sopenharmony_ci				ret = -ENOMEM;
9908c2ecf20Sopenharmony_ci				goto srcu_unlock;
9918c2ecf20Sopenharmony_ci			}
9928c2ecf20Sopenharmony_ci
9938c2ecf20Sopenharmony_ci			frame->key = be32_to_cpu(pklm->key);
9948c2ecf20Sopenharmony_ci			frame->io_virt = be64_to_cpu(pklm->va) + offset;
9958c2ecf20Sopenharmony_ci			frame->bcnt = min_t(size_t, bcnt,
9968c2ecf20Sopenharmony_ci					    be32_to_cpu(pklm->bcount) - offset);
9978c2ecf20Sopenharmony_ci			frame->depth = depth + 1;
9988c2ecf20Sopenharmony_ci			frame->next = head;
9998c2ecf20Sopenharmony_ci			head = frame;
10008c2ecf20Sopenharmony_ci
10018c2ecf20Sopenharmony_ci			bcnt -= frame->bcnt;
10028c2ecf20Sopenharmony_ci			offset = 0;
10038c2ecf20Sopenharmony_ci		}
10048c2ecf20Sopenharmony_ci		break;
10058c2ecf20Sopenharmony_ci
10068c2ecf20Sopenharmony_ci	default:
10078c2ecf20Sopenharmony_ci		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
10088c2ecf20Sopenharmony_ci		ret = -EFAULT;
10098c2ecf20Sopenharmony_ci		goto srcu_unlock;
10108c2ecf20Sopenharmony_ci	}
10118c2ecf20Sopenharmony_ci
10128c2ecf20Sopenharmony_ci	if (head) {
10138c2ecf20Sopenharmony_ci		frame = head;
10148c2ecf20Sopenharmony_ci		head = frame->next;
10158c2ecf20Sopenharmony_ci
10168c2ecf20Sopenharmony_ci		key = frame->key;
10178c2ecf20Sopenharmony_ci		io_virt = frame->io_virt;
10188c2ecf20Sopenharmony_ci		bcnt = frame->bcnt;
10198c2ecf20Sopenharmony_ci		depth = frame->depth;
10208c2ecf20Sopenharmony_ci		kfree(frame);
10218c2ecf20Sopenharmony_ci
10228c2ecf20Sopenharmony_ci		goto next_mr;
10238c2ecf20Sopenharmony_ci	}
10248c2ecf20Sopenharmony_ci
10258c2ecf20Sopenharmony_cisrcu_unlock:
10268c2ecf20Sopenharmony_ci	while (head) {
10278c2ecf20Sopenharmony_ci		frame = head;
10288c2ecf20Sopenharmony_ci		head = frame->next;
10298c2ecf20Sopenharmony_ci		kfree(frame);
10308c2ecf20Sopenharmony_ci	}
10318c2ecf20Sopenharmony_ci	kfree(out);
10328c2ecf20Sopenharmony_ci
10338c2ecf20Sopenharmony_ci	srcu_read_unlock(&dev->odp_srcu, srcu_key);
10348c2ecf20Sopenharmony_ci	*bytes_committed = 0;
10358c2ecf20Sopenharmony_ci	return ret ? ret : npages;
10368c2ecf20Sopenharmony_ci}
10378c2ecf20Sopenharmony_ci
10388c2ecf20Sopenharmony_ci/**
10398c2ecf20Sopenharmony_ci * Parse a series of data segments for page fault handling.
10408c2ecf20Sopenharmony_ci *
10418c2ecf20Sopenharmony_ci * @pfault contains page fault information.
10428c2ecf20Sopenharmony_ci * @wqe points at the first data segment in the WQE.
10438c2ecf20Sopenharmony_ci * @wqe_end points after the end of the WQE.
10448c2ecf20Sopenharmony_ci * @bytes_mapped receives the number of bytes that the function was able to
10458c2ecf20Sopenharmony_ci *               map. This allows the caller to decide intelligently whether
10468c2ecf20Sopenharmony_ci *               enough memory was mapped to resolve the page fault
10478c2ecf20Sopenharmony_ci *               successfully (e.g. enough for the next MTU, or the entire
10488c2ecf20Sopenharmony_ci *               WQE).
10498c2ecf20Sopenharmony_ci * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
10508c2ecf20Sopenharmony_ci *                  the committed bytes).
10518c2ecf20Sopenharmony_ci *
10528c2ecf20Sopenharmony_ci * Returns the number of pages loaded if positive, zero for an empty WQE, or a
10538c2ecf20Sopenharmony_ci * negative error code.
10548c2ecf20Sopenharmony_ci */
10558c2ecf20Sopenharmony_cistatic int pagefault_data_segments(struct mlx5_ib_dev *dev,
10568c2ecf20Sopenharmony_ci				   struct mlx5_pagefault *pfault,
10578c2ecf20Sopenharmony_ci				   void *wqe,
10588c2ecf20Sopenharmony_ci				   void *wqe_end, u32 *bytes_mapped,
10598c2ecf20Sopenharmony_ci				   u32 *total_wqe_bytes, bool receive_queue)
10608c2ecf20Sopenharmony_ci{
10618c2ecf20Sopenharmony_ci	int ret = 0, npages = 0;
10628c2ecf20Sopenharmony_ci	u64 io_virt;
10638c2ecf20Sopenharmony_ci	u32 key;
10648c2ecf20Sopenharmony_ci	u32 byte_count;
10658c2ecf20Sopenharmony_ci	size_t bcnt;
10668c2ecf20Sopenharmony_ci	int inline_segment;
10678c2ecf20Sopenharmony_ci
10688c2ecf20Sopenharmony_ci	if (bytes_mapped)
10698c2ecf20Sopenharmony_ci		*bytes_mapped = 0;
10708c2ecf20Sopenharmony_ci	if (total_wqe_bytes)
10718c2ecf20Sopenharmony_ci		*total_wqe_bytes = 0;
10728c2ecf20Sopenharmony_ci
10738c2ecf20Sopenharmony_ci	while (wqe < wqe_end) {
10748c2ecf20Sopenharmony_ci		struct mlx5_wqe_data_seg *dseg = wqe;
10758c2ecf20Sopenharmony_ci
10768c2ecf20Sopenharmony_ci		io_virt = be64_to_cpu(dseg->addr);
10778c2ecf20Sopenharmony_ci		key = be32_to_cpu(dseg->lkey);
10788c2ecf20Sopenharmony_ci		byte_count = be32_to_cpu(dseg->byte_count);
10798c2ecf20Sopenharmony_ci		inline_segment = !!(byte_count &  MLX5_INLINE_SEG);
10808c2ecf20Sopenharmony_ci		bcnt	       = byte_count & ~MLX5_INLINE_SEG;
10818c2ecf20Sopenharmony_ci
10828c2ecf20Sopenharmony_ci		if (inline_segment) {
10838c2ecf20Sopenharmony_ci			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
10848c2ecf20Sopenharmony_ci			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
10858c2ecf20Sopenharmony_ci				     16);
10868c2ecf20Sopenharmony_ci		} else {
10878c2ecf20Sopenharmony_ci			wqe += sizeof(*dseg);
10888c2ecf20Sopenharmony_ci		}
10898c2ecf20Sopenharmony_ci
10908c2ecf20Sopenharmony_ci		/* receive WQE end of sg list. */
10918c2ecf20Sopenharmony_ci		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
10928c2ecf20Sopenharmony_ci		    io_virt == 0)
10938c2ecf20Sopenharmony_ci			break;
10948c2ecf20Sopenharmony_ci
10958c2ecf20Sopenharmony_ci		if (!inline_segment && total_wqe_bytes) {
10968c2ecf20Sopenharmony_ci			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
10978c2ecf20Sopenharmony_ci					pfault->bytes_committed);
10988c2ecf20Sopenharmony_ci		}
10998c2ecf20Sopenharmony_ci
11008c2ecf20Sopenharmony_ci		/* A zero length data segment designates a length of 2GB. */
11018c2ecf20Sopenharmony_ci		if (bcnt == 0)
11028c2ecf20Sopenharmony_ci			bcnt = 1U << 31;
11038c2ecf20Sopenharmony_ci
11048c2ecf20Sopenharmony_ci		if (inline_segment || bcnt <= pfault->bytes_committed) {
11058c2ecf20Sopenharmony_ci			pfault->bytes_committed -=
11068c2ecf20Sopenharmony_ci				min_t(size_t, bcnt,
11078c2ecf20Sopenharmony_ci				      pfault->bytes_committed);
11088c2ecf20Sopenharmony_ci			continue;
11098c2ecf20Sopenharmony_ci		}
11108c2ecf20Sopenharmony_ci
11118c2ecf20Sopenharmony_ci		ret = pagefault_single_data_segment(dev, NULL, key,
11128c2ecf20Sopenharmony_ci						    io_virt, bcnt,
11138c2ecf20Sopenharmony_ci						    &pfault->bytes_committed,
11148c2ecf20Sopenharmony_ci						    bytes_mapped);
11158c2ecf20Sopenharmony_ci		if (ret < 0)
11168c2ecf20Sopenharmony_ci			break;
11178c2ecf20Sopenharmony_ci		npages += ret;
11188c2ecf20Sopenharmony_ci	}
11198c2ecf20Sopenharmony_ci
11208c2ecf20Sopenharmony_ci	return ret < 0 ? ret : npages;
11218c2ecf20Sopenharmony_ci}
11228c2ecf20Sopenharmony_ci
11238c2ecf20Sopenharmony_ci/*
11248c2ecf20Sopenharmony_ci * Parse initiator WQE. Advances the wqe pointer to point at the
11258c2ecf20Sopenharmony_ci * scatter-gather list, and set wqe_end to the end of the WQE.
11268c2ecf20Sopenharmony_ci */
11278c2ecf20Sopenharmony_cistatic int mlx5_ib_mr_initiator_pfault_handler(
11288c2ecf20Sopenharmony_ci	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
11298c2ecf20Sopenharmony_ci	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
11308c2ecf20Sopenharmony_ci{
11318c2ecf20Sopenharmony_ci	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
11328c2ecf20Sopenharmony_ci	u16 wqe_index = pfault->wqe.wqe_index;
11338c2ecf20Sopenharmony_ci	struct mlx5_base_av *av;
11348c2ecf20Sopenharmony_ci	unsigned ds, opcode;
11358c2ecf20Sopenharmony_ci	u32 qpn = qp->trans_qp.base.mqp.qpn;
11368c2ecf20Sopenharmony_ci
11378c2ecf20Sopenharmony_ci	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
11388c2ecf20Sopenharmony_ci	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
11398c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
11408c2ecf20Sopenharmony_ci			    ds, wqe_length);
11418c2ecf20Sopenharmony_ci		return -EFAULT;
11428c2ecf20Sopenharmony_ci	}
11438c2ecf20Sopenharmony_ci
11448c2ecf20Sopenharmony_ci	if (ds == 0) {
11458c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
11468c2ecf20Sopenharmony_ci			    wqe_index, qpn);
11478c2ecf20Sopenharmony_ci		return -EFAULT;
11488c2ecf20Sopenharmony_ci	}
11498c2ecf20Sopenharmony_ci
11508c2ecf20Sopenharmony_ci	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
11518c2ecf20Sopenharmony_ci	*wqe += sizeof(*ctrl);
11528c2ecf20Sopenharmony_ci
11538c2ecf20Sopenharmony_ci	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
11548c2ecf20Sopenharmony_ci		 MLX5_WQE_CTRL_OPCODE_MASK;
11558c2ecf20Sopenharmony_ci
11568c2ecf20Sopenharmony_ci	if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
11578c2ecf20Sopenharmony_ci		*wqe += sizeof(struct mlx5_wqe_xrc_seg);
11588c2ecf20Sopenharmony_ci
11598c2ecf20Sopenharmony_ci	if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
11608c2ecf20Sopenharmony_ci		av = *wqe;
11618c2ecf20Sopenharmony_ci		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
11628c2ecf20Sopenharmony_ci			*wqe += sizeof(struct mlx5_av);
11638c2ecf20Sopenharmony_ci		else
11648c2ecf20Sopenharmony_ci			*wqe += sizeof(struct mlx5_base_av);
11658c2ecf20Sopenharmony_ci	}
11668c2ecf20Sopenharmony_ci
11678c2ecf20Sopenharmony_ci	switch (opcode) {
11688c2ecf20Sopenharmony_ci	case MLX5_OPCODE_RDMA_WRITE:
11698c2ecf20Sopenharmony_ci	case MLX5_OPCODE_RDMA_WRITE_IMM:
11708c2ecf20Sopenharmony_ci	case MLX5_OPCODE_RDMA_READ:
11718c2ecf20Sopenharmony_ci		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
11728c2ecf20Sopenharmony_ci		break;
11738c2ecf20Sopenharmony_ci	case MLX5_OPCODE_ATOMIC_CS:
11748c2ecf20Sopenharmony_ci	case MLX5_OPCODE_ATOMIC_FA:
11758c2ecf20Sopenharmony_ci		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
11768c2ecf20Sopenharmony_ci		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
11778c2ecf20Sopenharmony_ci		break;
11788c2ecf20Sopenharmony_ci	}
11798c2ecf20Sopenharmony_ci
11808c2ecf20Sopenharmony_ci	return 0;
11818c2ecf20Sopenharmony_ci}
11828c2ecf20Sopenharmony_ci
11838c2ecf20Sopenharmony_ci/*
11848c2ecf20Sopenharmony_ci * Parse responder WQE and set wqe_end to the end of the WQE.
11858c2ecf20Sopenharmony_ci */
11868c2ecf20Sopenharmony_cistatic int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
11878c2ecf20Sopenharmony_ci						   struct mlx5_ib_srq *srq,
11888c2ecf20Sopenharmony_ci						   void **wqe, void **wqe_end,
11898c2ecf20Sopenharmony_ci						   int wqe_length)
11908c2ecf20Sopenharmony_ci{
11918c2ecf20Sopenharmony_ci	int wqe_size = 1 << srq->msrq.wqe_shift;
11928c2ecf20Sopenharmony_ci
11938c2ecf20Sopenharmony_ci	if (wqe_size > wqe_length) {
11948c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
11958c2ecf20Sopenharmony_ci		return -EFAULT;
11968c2ecf20Sopenharmony_ci	}
11978c2ecf20Sopenharmony_ci
11988c2ecf20Sopenharmony_ci	*wqe_end = *wqe + wqe_size;
11998c2ecf20Sopenharmony_ci	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);
12008c2ecf20Sopenharmony_ci
12018c2ecf20Sopenharmony_ci	return 0;
12028c2ecf20Sopenharmony_ci}
12038c2ecf20Sopenharmony_ci
12048c2ecf20Sopenharmony_cistatic int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
12058c2ecf20Sopenharmony_ci						  struct mlx5_ib_qp *qp,
12068c2ecf20Sopenharmony_ci						  void *wqe, void **wqe_end,
12078c2ecf20Sopenharmony_ci						  int wqe_length)
12088c2ecf20Sopenharmony_ci{
12098c2ecf20Sopenharmony_ci	struct mlx5_ib_wq *wq = &qp->rq;
12108c2ecf20Sopenharmony_ci	int wqe_size = 1 << wq->wqe_shift;
12118c2ecf20Sopenharmony_ci
12128c2ecf20Sopenharmony_ci	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
12138c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
12148c2ecf20Sopenharmony_ci		return -EFAULT;
12158c2ecf20Sopenharmony_ci	}
12168c2ecf20Sopenharmony_ci
12178c2ecf20Sopenharmony_ci	if (wqe_size > wqe_length) {
12188c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
12198c2ecf20Sopenharmony_ci		return -EFAULT;
12208c2ecf20Sopenharmony_ci	}
12218c2ecf20Sopenharmony_ci
12228c2ecf20Sopenharmony_ci	*wqe_end = wqe + wqe_size;
12238c2ecf20Sopenharmony_ci
12248c2ecf20Sopenharmony_ci	return 0;
12258c2ecf20Sopenharmony_ci}
12268c2ecf20Sopenharmony_ci
12278c2ecf20Sopenharmony_cistatic inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
12288c2ecf20Sopenharmony_ci						       u32 wq_num, int pf_type)
12298c2ecf20Sopenharmony_ci{
12308c2ecf20Sopenharmony_ci	struct mlx5_core_rsc_common *common = NULL;
12318c2ecf20Sopenharmony_ci	struct mlx5_core_srq *srq;
12328c2ecf20Sopenharmony_ci
12338c2ecf20Sopenharmony_ci	switch (pf_type) {
12348c2ecf20Sopenharmony_ci	case MLX5_WQE_PF_TYPE_RMP:
12358c2ecf20Sopenharmony_ci		srq = mlx5_cmd_get_srq(dev, wq_num);
12368c2ecf20Sopenharmony_ci		if (srq)
12378c2ecf20Sopenharmony_ci			common = &srq->common;
12388c2ecf20Sopenharmony_ci		break;
12398c2ecf20Sopenharmony_ci	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
12408c2ecf20Sopenharmony_ci	case MLX5_WQE_PF_TYPE_RESP:
12418c2ecf20Sopenharmony_ci	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
12428c2ecf20Sopenharmony_ci		common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
12438c2ecf20Sopenharmony_ci		break;
12448c2ecf20Sopenharmony_ci	default:
12458c2ecf20Sopenharmony_ci		break;
12468c2ecf20Sopenharmony_ci	}
12478c2ecf20Sopenharmony_ci
12488c2ecf20Sopenharmony_ci	return common;
12498c2ecf20Sopenharmony_ci}
12508c2ecf20Sopenharmony_ci
/*
 * Convert a common resource header into its mlx5_ib_qp.
 *
 * NOTE(review): the plain cast presumably relies on the common header being
 * the first member of struct mlx5_core_qp - confirm against its definition.
 */
static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	return to_mibqp((struct mlx5_core_qp *)res);
}
12578c2ecf20Sopenharmony_ci
12588c2ecf20Sopenharmony_cistatic inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
12598c2ecf20Sopenharmony_ci{
12608c2ecf20Sopenharmony_ci	struct mlx5_core_srq *msrq =
12618c2ecf20Sopenharmony_ci		container_of(res, struct mlx5_core_srq, common);
12628c2ecf20Sopenharmony_ci
12638c2ecf20Sopenharmony_ci	return to_mibsrq(msrq);
12648c2ecf20Sopenharmony_ci}
12658c2ecf20Sopenharmony_ci
12668c2ecf20Sopenharmony_cistatic void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
12678c2ecf20Sopenharmony_ci					  struct mlx5_pagefault *pfault)
12688c2ecf20Sopenharmony_ci{
12698c2ecf20Sopenharmony_ci	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
12708c2ecf20Sopenharmony_ci	u16 wqe_index = pfault->wqe.wqe_index;
12718c2ecf20Sopenharmony_ci	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
12728c2ecf20Sopenharmony_ci	u32 bytes_mapped, total_wqe_bytes;
12738c2ecf20Sopenharmony_ci	struct mlx5_core_rsc_common *res;
12748c2ecf20Sopenharmony_ci	int resume_with_error = 1;
12758c2ecf20Sopenharmony_ci	struct mlx5_ib_qp *qp;
12768c2ecf20Sopenharmony_ci	size_t bytes_copied;
12778c2ecf20Sopenharmony_ci	int ret = 0;
12788c2ecf20Sopenharmony_ci
12798c2ecf20Sopenharmony_ci	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
12808c2ecf20Sopenharmony_ci	if (!res) {
12818c2ecf20Sopenharmony_ci		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
12828c2ecf20Sopenharmony_ci		return;
12838c2ecf20Sopenharmony_ci	}
12848c2ecf20Sopenharmony_ci
12858c2ecf20Sopenharmony_ci	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
12868c2ecf20Sopenharmony_ci	    res->res != MLX5_RES_XSRQ) {
12878c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
12888c2ecf20Sopenharmony_ci			    pfault->type);
12898c2ecf20Sopenharmony_ci		goto resolve_page_fault;
12908c2ecf20Sopenharmony_ci	}
12918c2ecf20Sopenharmony_ci
12928c2ecf20Sopenharmony_ci	wqe_start = (void *)__get_free_page(GFP_KERNEL);
12938c2ecf20Sopenharmony_ci	if (!wqe_start) {
12948c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
12958c2ecf20Sopenharmony_ci		goto resolve_page_fault;
12968c2ecf20Sopenharmony_ci	}
12978c2ecf20Sopenharmony_ci
12988c2ecf20Sopenharmony_ci	wqe = wqe_start;
12998c2ecf20Sopenharmony_ci	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
13008c2ecf20Sopenharmony_ci	if (qp && sq) {
13018c2ecf20Sopenharmony_ci		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
13028c2ecf20Sopenharmony_ci					  &bytes_copied);
13038c2ecf20Sopenharmony_ci		if (ret)
13048c2ecf20Sopenharmony_ci			goto read_user;
13058c2ecf20Sopenharmony_ci		ret = mlx5_ib_mr_initiator_pfault_handler(
13068c2ecf20Sopenharmony_ci			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
13078c2ecf20Sopenharmony_ci	} else if (qp && !sq) {
13088c2ecf20Sopenharmony_ci		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
13098c2ecf20Sopenharmony_ci					  &bytes_copied);
13108c2ecf20Sopenharmony_ci		if (ret)
13118c2ecf20Sopenharmony_ci			goto read_user;
13128c2ecf20Sopenharmony_ci		ret = mlx5_ib_mr_responder_pfault_handler_rq(
13138c2ecf20Sopenharmony_ci			dev, qp, wqe, &wqe_end, bytes_copied);
13148c2ecf20Sopenharmony_ci	} else if (!qp) {
13158c2ecf20Sopenharmony_ci		struct mlx5_ib_srq *srq = res_to_srq(res);
13168c2ecf20Sopenharmony_ci
13178c2ecf20Sopenharmony_ci		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
13188c2ecf20Sopenharmony_ci					   &bytes_copied);
13198c2ecf20Sopenharmony_ci		if (ret)
13208c2ecf20Sopenharmony_ci			goto read_user;
13218c2ecf20Sopenharmony_ci		ret = mlx5_ib_mr_responder_pfault_handler_srq(
13228c2ecf20Sopenharmony_ci			dev, srq, &wqe, &wqe_end, bytes_copied);
13238c2ecf20Sopenharmony_ci	}
13248c2ecf20Sopenharmony_ci
13258c2ecf20Sopenharmony_ci	if (ret < 0 || wqe >= wqe_end)
13268c2ecf20Sopenharmony_ci		goto resolve_page_fault;
13278c2ecf20Sopenharmony_ci
13288c2ecf20Sopenharmony_ci	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
13298c2ecf20Sopenharmony_ci				      &total_wqe_bytes, !sq);
13308c2ecf20Sopenharmony_ci	if (ret == -EAGAIN)
13318c2ecf20Sopenharmony_ci		goto out;
13328c2ecf20Sopenharmony_ci
13338c2ecf20Sopenharmony_ci	if (ret < 0 || total_wqe_bytes > bytes_mapped)
13348c2ecf20Sopenharmony_ci		goto resolve_page_fault;
13358c2ecf20Sopenharmony_ci
13368c2ecf20Sopenharmony_ciout:
13378c2ecf20Sopenharmony_ci	ret = 0;
13388c2ecf20Sopenharmony_ci	resume_with_error = 0;
13398c2ecf20Sopenharmony_ci
13408c2ecf20Sopenharmony_ciread_user:
13418c2ecf20Sopenharmony_ci	if (ret)
13428c2ecf20Sopenharmony_ci		mlx5_ib_err(
13438c2ecf20Sopenharmony_ci			dev,
13448c2ecf20Sopenharmony_ci			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
13458c2ecf20Sopenharmony_ci			ret, wqe_index, pfault->token);
13468c2ecf20Sopenharmony_ci
13478c2ecf20Sopenharmony_ciresolve_page_fault:
13488c2ecf20Sopenharmony_ci	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
13498c2ecf20Sopenharmony_ci	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
13508c2ecf20Sopenharmony_ci		    pfault->wqe.wq_num, resume_with_error,
13518c2ecf20Sopenharmony_ci		    pfault->type);
13528c2ecf20Sopenharmony_ci	mlx5_core_res_put(res);
13538c2ecf20Sopenharmony_ci	free_page((unsigned long)wqe_start);
13548c2ecf20Sopenharmony_ci}
13558c2ecf20Sopenharmony_ci
13568c2ecf20Sopenharmony_cistatic int pages_in_range(u64 address, u32 length)
13578c2ecf20Sopenharmony_ci{
13588c2ecf20Sopenharmony_ci	return (ALIGN(address + length, PAGE_SIZE) -
13598c2ecf20Sopenharmony_ci		(address & PAGE_MASK)) >> PAGE_SHIFT;
13608c2ecf20Sopenharmony_ci}
13618c2ecf20Sopenharmony_ci
13628c2ecf20Sopenharmony_cistatic void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
13638c2ecf20Sopenharmony_ci					   struct mlx5_pagefault *pfault)
13648c2ecf20Sopenharmony_ci{
13658c2ecf20Sopenharmony_ci	u64 address;
13668c2ecf20Sopenharmony_ci	u32 length;
13678c2ecf20Sopenharmony_ci	u32 prefetch_len = pfault->bytes_committed;
13688c2ecf20Sopenharmony_ci	int prefetch_activated = 0;
13698c2ecf20Sopenharmony_ci	u32 rkey = pfault->rdma.r_key;
13708c2ecf20Sopenharmony_ci	int ret;
13718c2ecf20Sopenharmony_ci
13728c2ecf20Sopenharmony_ci	/* The RDMA responder handler handles the page fault in two parts.
13738c2ecf20Sopenharmony_ci	 * First it brings the necessary pages for the current packet
13748c2ecf20Sopenharmony_ci	 * (and uses the pfault context), and then (after resuming the QP)
13758c2ecf20Sopenharmony_ci	 * prefetches more pages. The second operation cannot use the pfault
13768c2ecf20Sopenharmony_ci	 * context and therefore uses the dummy_pfault context allocated on
13778c2ecf20Sopenharmony_ci	 * the stack */
13788c2ecf20Sopenharmony_ci	pfault->rdma.rdma_va += pfault->bytes_committed;
13798c2ecf20Sopenharmony_ci	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
13808c2ecf20Sopenharmony_ci					 pfault->rdma.rdma_op_len);
13818c2ecf20Sopenharmony_ci	pfault->bytes_committed = 0;
13828c2ecf20Sopenharmony_ci
13838c2ecf20Sopenharmony_ci	address = pfault->rdma.rdma_va;
13848c2ecf20Sopenharmony_ci	length  = pfault->rdma.rdma_op_len;
13858c2ecf20Sopenharmony_ci
13868c2ecf20Sopenharmony_ci	/* For some operations, the hardware cannot tell the exact message
13878c2ecf20Sopenharmony_ci	 * length, and in those cases it reports zero. Use prefetch
13888c2ecf20Sopenharmony_ci	 * logic. */
13898c2ecf20Sopenharmony_ci	if (length == 0) {
13908c2ecf20Sopenharmony_ci		prefetch_activated = 1;
13918c2ecf20Sopenharmony_ci		length = pfault->rdma.packet_size;
13928c2ecf20Sopenharmony_ci		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
13938c2ecf20Sopenharmony_ci	}
13948c2ecf20Sopenharmony_ci
13958c2ecf20Sopenharmony_ci	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
13968c2ecf20Sopenharmony_ci					    &pfault->bytes_committed, NULL);
13978c2ecf20Sopenharmony_ci	if (ret == -EAGAIN) {
13988c2ecf20Sopenharmony_ci		/* We're racing with an invalidation, don't prefetch */
13998c2ecf20Sopenharmony_ci		prefetch_activated = 0;
14008c2ecf20Sopenharmony_ci	} else if (ret < 0 || pages_in_range(address, length) > ret) {
14018c2ecf20Sopenharmony_ci		mlx5_ib_page_fault_resume(dev, pfault, 1);
14028c2ecf20Sopenharmony_ci		if (ret != -ENOENT)
14038c2ecf20Sopenharmony_ci			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
14048c2ecf20Sopenharmony_ci				    ret, pfault->token, pfault->type);
14058c2ecf20Sopenharmony_ci		return;
14068c2ecf20Sopenharmony_ci	}
14078c2ecf20Sopenharmony_ci
14088c2ecf20Sopenharmony_ci	mlx5_ib_page_fault_resume(dev, pfault, 0);
14098c2ecf20Sopenharmony_ci	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
14108c2ecf20Sopenharmony_ci		    pfault->token, pfault->type,
14118c2ecf20Sopenharmony_ci		    prefetch_activated);
14128c2ecf20Sopenharmony_ci
14138c2ecf20Sopenharmony_ci	/* At this point, there might be a new pagefault already arriving in
14148c2ecf20Sopenharmony_ci	 * the eq, switch to the dummy pagefault for the rest of the
14158c2ecf20Sopenharmony_ci	 * processing. We're still OK with the objects being alive as the
14168c2ecf20Sopenharmony_ci	 * work-queue is being fenced. */
14178c2ecf20Sopenharmony_ci
14188c2ecf20Sopenharmony_ci	if (prefetch_activated) {
14198c2ecf20Sopenharmony_ci		u32 bytes_committed = 0;
14208c2ecf20Sopenharmony_ci
14218c2ecf20Sopenharmony_ci		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
14228c2ecf20Sopenharmony_ci						    prefetch_len,
14238c2ecf20Sopenharmony_ci						    &bytes_committed, NULL);
14248c2ecf20Sopenharmony_ci		if (ret < 0 && ret != -EAGAIN) {
14258c2ecf20Sopenharmony_ci			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
14268c2ecf20Sopenharmony_ci				    ret, pfault->token, address, prefetch_len);
14278c2ecf20Sopenharmony_ci		}
14288c2ecf20Sopenharmony_ci	}
14298c2ecf20Sopenharmony_ci}
14308c2ecf20Sopenharmony_ci
14318c2ecf20Sopenharmony_cistatic void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
14328c2ecf20Sopenharmony_ci{
14338c2ecf20Sopenharmony_ci	u8 event_subtype = pfault->event_subtype;
14348c2ecf20Sopenharmony_ci
14358c2ecf20Sopenharmony_ci	switch (event_subtype) {
14368c2ecf20Sopenharmony_ci	case MLX5_PFAULT_SUBTYPE_WQE:
14378c2ecf20Sopenharmony_ci		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
14388c2ecf20Sopenharmony_ci		break;
14398c2ecf20Sopenharmony_ci	case MLX5_PFAULT_SUBTYPE_RDMA:
14408c2ecf20Sopenharmony_ci		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
14418c2ecf20Sopenharmony_ci		break;
14428c2ecf20Sopenharmony_ci	default:
14438c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
14448c2ecf20Sopenharmony_ci			    event_subtype);
14458c2ecf20Sopenharmony_ci		mlx5_ib_page_fault_resume(dev, pfault, 1);
14468c2ecf20Sopenharmony_ci	}
14478c2ecf20Sopenharmony_ci}
14488c2ecf20Sopenharmony_ci
14498c2ecf20Sopenharmony_cistatic void mlx5_ib_eqe_pf_action(struct work_struct *work)
14508c2ecf20Sopenharmony_ci{
14518c2ecf20Sopenharmony_ci	struct mlx5_pagefault *pfault = container_of(work,
14528c2ecf20Sopenharmony_ci						     struct mlx5_pagefault,
14538c2ecf20Sopenharmony_ci						     work);
14548c2ecf20Sopenharmony_ci	struct mlx5_ib_pf_eq *eq = pfault->eq;
14558c2ecf20Sopenharmony_ci
14568c2ecf20Sopenharmony_ci	mlx5_ib_pfault(eq->dev, pfault);
14578c2ecf20Sopenharmony_ci	mempool_free(pfault, eq->pool);
14588c2ecf20Sopenharmony_ci}
14598c2ecf20Sopenharmony_ci
14608c2ecf20Sopenharmony_cistatic void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
14618c2ecf20Sopenharmony_ci{
14628c2ecf20Sopenharmony_ci	struct mlx5_eqe_page_fault *pf_eqe;
14638c2ecf20Sopenharmony_ci	struct mlx5_pagefault *pfault;
14648c2ecf20Sopenharmony_ci	struct mlx5_eqe *eqe;
14658c2ecf20Sopenharmony_ci	int cc = 0;
14668c2ecf20Sopenharmony_ci
14678c2ecf20Sopenharmony_ci	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
14688c2ecf20Sopenharmony_ci		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
14698c2ecf20Sopenharmony_ci		if (!pfault) {
14708c2ecf20Sopenharmony_ci			schedule_work(&eq->work);
14718c2ecf20Sopenharmony_ci			break;
14728c2ecf20Sopenharmony_ci		}
14738c2ecf20Sopenharmony_ci
14748c2ecf20Sopenharmony_ci		pf_eqe = &eqe->data.page_fault;
14758c2ecf20Sopenharmony_ci		pfault->event_subtype = eqe->sub_type;
14768c2ecf20Sopenharmony_ci		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
14778c2ecf20Sopenharmony_ci
14788c2ecf20Sopenharmony_ci		mlx5_ib_dbg(eq->dev,
14798c2ecf20Sopenharmony_ci			    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
14808c2ecf20Sopenharmony_ci			    eqe->sub_type, pfault->bytes_committed);
14818c2ecf20Sopenharmony_ci
14828c2ecf20Sopenharmony_ci		switch (eqe->sub_type) {
14838c2ecf20Sopenharmony_ci		case MLX5_PFAULT_SUBTYPE_RDMA:
14848c2ecf20Sopenharmony_ci			/* RDMA based event */
14858c2ecf20Sopenharmony_ci			pfault->type =
14868c2ecf20Sopenharmony_ci				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
14878c2ecf20Sopenharmony_ci			pfault->token =
14888c2ecf20Sopenharmony_ci				be32_to_cpu(pf_eqe->rdma.pftype_token) &
14898c2ecf20Sopenharmony_ci				MLX5_24BIT_MASK;
14908c2ecf20Sopenharmony_ci			pfault->rdma.r_key =
14918c2ecf20Sopenharmony_ci				be32_to_cpu(pf_eqe->rdma.r_key);
14928c2ecf20Sopenharmony_ci			pfault->rdma.packet_size =
14938c2ecf20Sopenharmony_ci				be16_to_cpu(pf_eqe->rdma.packet_length);
14948c2ecf20Sopenharmony_ci			pfault->rdma.rdma_op_len =
14958c2ecf20Sopenharmony_ci				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
14968c2ecf20Sopenharmony_ci			pfault->rdma.rdma_va =
14978c2ecf20Sopenharmony_ci				be64_to_cpu(pf_eqe->rdma.rdma_va);
14988c2ecf20Sopenharmony_ci			mlx5_ib_dbg(eq->dev,
14998c2ecf20Sopenharmony_ci				    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
15008c2ecf20Sopenharmony_ci				    pfault->type, pfault->token,
15018c2ecf20Sopenharmony_ci				    pfault->rdma.r_key);
15028c2ecf20Sopenharmony_ci			mlx5_ib_dbg(eq->dev,
15038c2ecf20Sopenharmony_ci				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
15048c2ecf20Sopenharmony_ci				    pfault->rdma.rdma_op_len,
15058c2ecf20Sopenharmony_ci				    pfault->rdma.rdma_va);
15068c2ecf20Sopenharmony_ci			break;
15078c2ecf20Sopenharmony_ci
15088c2ecf20Sopenharmony_ci		case MLX5_PFAULT_SUBTYPE_WQE:
15098c2ecf20Sopenharmony_ci			/* WQE based event */
15108c2ecf20Sopenharmony_ci			pfault->type =
15118c2ecf20Sopenharmony_ci				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
15128c2ecf20Sopenharmony_ci			pfault->token =
15138c2ecf20Sopenharmony_ci				be32_to_cpu(pf_eqe->wqe.token);
15148c2ecf20Sopenharmony_ci			pfault->wqe.wq_num =
15158c2ecf20Sopenharmony_ci				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
15168c2ecf20Sopenharmony_ci				MLX5_24BIT_MASK;
15178c2ecf20Sopenharmony_ci			pfault->wqe.wqe_index =
15188c2ecf20Sopenharmony_ci				be16_to_cpu(pf_eqe->wqe.wqe_index);
15198c2ecf20Sopenharmony_ci			pfault->wqe.packet_size =
15208c2ecf20Sopenharmony_ci				be16_to_cpu(pf_eqe->wqe.packet_length);
15218c2ecf20Sopenharmony_ci			mlx5_ib_dbg(eq->dev,
15228c2ecf20Sopenharmony_ci				    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
15238c2ecf20Sopenharmony_ci				    pfault->type, pfault->token,
15248c2ecf20Sopenharmony_ci				    pfault->wqe.wq_num,
15258c2ecf20Sopenharmony_ci				    pfault->wqe.wqe_index);
15268c2ecf20Sopenharmony_ci			break;
15278c2ecf20Sopenharmony_ci
15288c2ecf20Sopenharmony_ci		default:
15298c2ecf20Sopenharmony_ci			mlx5_ib_warn(eq->dev,
15308c2ecf20Sopenharmony_ci				     "Unsupported page fault event sub-type: 0x%02hhx\n",
15318c2ecf20Sopenharmony_ci				     eqe->sub_type);
15328c2ecf20Sopenharmony_ci			/* Unsupported page faults should still be
15338c2ecf20Sopenharmony_ci			 * resolved by the page fault handler
15348c2ecf20Sopenharmony_ci			 */
15358c2ecf20Sopenharmony_ci		}
15368c2ecf20Sopenharmony_ci
15378c2ecf20Sopenharmony_ci		pfault->eq = eq;
15388c2ecf20Sopenharmony_ci		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
15398c2ecf20Sopenharmony_ci		queue_work(eq->wq, &pfault->work);
15408c2ecf20Sopenharmony_ci
15418c2ecf20Sopenharmony_ci		cc = mlx5_eq_update_cc(eq->core, ++cc);
15428c2ecf20Sopenharmony_ci	}
15438c2ecf20Sopenharmony_ci
15448c2ecf20Sopenharmony_ci	mlx5_eq_update_ci(eq->core, cc, 1);
15458c2ecf20Sopenharmony_ci}
15468c2ecf20Sopenharmony_ci
15478c2ecf20Sopenharmony_cistatic int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
15488c2ecf20Sopenharmony_ci			     void *data)
15498c2ecf20Sopenharmony_ci{
15508c2ecf20Sopenharmony_ci	struct mlx5_ib_pf_eq *eq =
15518c2ecf20Sopenharmony_ci		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
15528c2ecf20Sopenharmony_ci	unsigned long flags;
15538c2ecf20Sopenharmony_ci
15548c2ecf20Sopenharmony_ci	if (spin_trylock_irqsave(&eq->lock, flags)) {
15558c2ecf20Sopenharmony_ci		mlx5_ib_eq_pf_process(eq);
15568c2ecf20Sopenharmony_ci		spin_unlock_irqrestore(&eq->lock, flags);
15578c2ecf20Sopenharmony_ci	} else {
15588c2ecf20Sopenharmony_ci		schedule_work(&eq->work);
15598c2ecf20Sopenharmony_ci	}
15608c2ecf20Sopenharmony_ci
15618c2ecf20Sopenharmony_ci	return IRQ_HANDLED;
15628c2ecf20Sopenharmony_ci}
15638c2ecf20Sopenharmony_ci
15648c2ecf20Sopenharmony_ci/* mempool_refill() was proposed but unfortunately wasn't accepted
15658c2ecf20Sopenharmony_ci * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
15668c2ecf20Sopenharmony_ci * Cheap workaround.
15678c2ecf20Sopenharmony_ci */
15688c2ecf20Sopenharmony_cistatic void mempool_refill(mempool_t *pool)
15698c2ecf20Sopenharmony_ci{
15708c2ecf20Sopenharmony_ci	while (pool->curr_nr < pool->min_nr)
15718c2ecf20Sopenharmony_ci		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
15728c2ecf20Sopenharmony_ci}
15738c2ecf20Sopenharmony_ci
15748c2ecf20Sopenharmony_cistatic void mlx5_ib_eq_pf_action(struct work_struct *work)
15758c2ecf20Sopenharmony_ci{
15768c2ecf20Sopenharmony_ci	struct mlx5_ib_pf_eq *eq =
15778c2ecf20Sopenharmony_ci		container_of(work, struct mlx5_ib_pf_eq, work);
15788c2ecf20Sopenharmony_ci
15798c2ecf20Sopenharmony_ci	mempool_refill(eq->pool);
15808c2ecf20Sopenharmony_ci
15818c2ecf20Sopenharmony_ci	spin_lock_irq(&eq->lock);
15828c2ecf20Sopenharmony_ci	mlx5_ib_eq_pf_process(eq);
15838c2ecf20Sopenharmony_ci	spin_unlock_irq(&eq->lock);
15848c2ecf20Sopenharmony_ci}
15858c2ecf20Sopenharmony_ci
15868c2ecf20Sopenharmony_cienum {
15878c2ecf20Sopenharmony_ci	MLX5_IB_NUM_PF_EQE	= 0x1000,
15888c2ecf20Sopenharmony_ci	MLX5_IB_NUM_PF_DRAIN	= 64,
15898c2ecf20Sopenharmony_ci};
15908c2ecf20Sopenharmony_ci
15918c2ecf20Sopenharmony_cistatic int
15928c2ecf20Sopenharmony_cimlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
15938c2ecf20Sopenharmony_ci{
15948c2ecf20Sopenharmony_ci	struct mlx5_eq_param param = {};
15958c2ecf20Sopenharmony_ci	int err;
15968c2ecf20Sopenharmony_ci
15978c2ecf20Sopenharmony_ci	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
15988c2ecf20Sopenharmony_ci	spin_lock_init(&eq->lock);
15998c2ecf20Sopenharmony_ci	eq->dev = dev;
16008c2ecf20Sopenharmony_ci
16018c2ecf20Sopenharmony_ci	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
16028c2ecf20Sopenharmony_ci					       sizeof(struct mlx5_pagefault));
16038c2ecf20Sopenharmony_ci	if (!eq->pool)
16048c2ecf20Sopenharmony_ci		return -ENOMEM;
16058c2ecf20Sopenharmony_ci
16068c2ecf20Sopenharmony_ci	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
16078c2ecf20Sopenharmony_ci				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
16088c2ecf20Sopenharmony_ci				 MLX5_NUM_CMD_EQE);
16098c2ecf20Sopenharmony_ci	if (!eq->wq) {
16108c2ecf20Sopenharmony_ci		err = -ENOMEM;
16118c2ecf20Sopenharmony_ci		goto err_mempool;
16128c2ecf20Sopenharmony_ci	}
16138c2ecf20Sopenharmony_ci
16148c2ecf20Sopenharmony_ci	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
16158c2ecf20Sopenharmony_ci	param = (struct mlx5_eq_param) {
16168c2ecf20Sopenharmony_ci		.irq_index = 0,
16178c2ecf20Sopenharmony_ci		.nent = MLX5_IB_NUM_PF_EQE,
16188c2ecf20Sopenharmony_ci	};
16198c2ecf20Sopenharmony_ci	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
16208c2ecf20Sopenharmony_ci	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
16218c2ecf20Sopenharmony_ci	if (IS_ERR(eq->core)) {
16228c2ecf20Sopenharmony_ci		err = PTR_ERR(eq->core);
16238c2ecf20Sopenharmony_ci		goto err_wq;
16248c2ecf20Sopenharmony_ci	}
16258c2ecf20Sopenharmony_ci	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
16268c2ecf20Sopenharmony_ci	if (err) {
16278c2ecf20Sopenharmony_ci		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
16288c2ecf20Sopenharmony_ci		goto err_eq;
16298c2ecf20Sopenharmony_ci	}
16308c2ecf20Sopenharmony_ci
16318c2ecf20Sopenharmony_ci	return 0;
16328c2ecf20Sopenharmony_cierr_eq:
16338c2ecf20Sopenharmony_ci	mlx5_eq_destroy_generic(dev->mdev, eq->core);
16348c2ecf20Sopenharmony_cierr_wq:
16358c2ecf20Sopenharmony_ci	destroy_workqueue(eq->wq);
16368c2ecf20Sopenharmony_cierr_mempool:
16378c2ecf20Sopenharmony_ci	mempool_destroy(eq->pool);
16388c2ecf20Sopenharmony_ci	return err;
16398c2ecf20Sopenharmony_ci}
16408c2ecf20Sopenharmony_ci
16418c2ecf20Sopenharmony_cistatic int
16428c2ecf20Sopenharmony_cimlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
16438c2ecf20Sopenharmony_ci{
16448c2ecf20Sopenharmony_ci	int err;
16458c2ecf20Sopenharmony_ci
16468c2ecf20Sopenharmony_ci	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
16478c2ecf20Sopenharmony_ci	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
16488c2ecf20Sopenharmony_ci	cancel_work_sync(&eq->work);
16498c2ecf20Sopenharmony_ci	destroy_workqueue(eq->wq);
16508c2ecf20Sopenharmony_ci	mempool_destroy(eq->pool);
16518c2ecf20Sopenharmony_ci
16528c2ecf20Sopenharmony_ci	return err;
16538c2ecf20Sopenharmony_ci}
16548c2ecf20Sopenharmony_ci
16558c2ecf20Sopenharmony_civoid mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
16568c2ecf20Sopenharmony_ci{
16578c2ecf20Sopenharmony_ci	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
16588c2ecf20Sopenharmony_ci		return;
16598c2ecf20Sopenharmony_ci
16608c2ecf20Sopenharmony_ci	switch (ent->order - 2) {
16618c2ecf20Sopenharmony_ci	case MLX5_IMR_MTT_CACHE_ENTRY:
16628c2ecf20Sopenharmony_ci		ent->page = PAGE_SHIFT;
16638c2ecf20Sopenharmony_ci		ent->xlt = MLX5_IMR_MTT_ENTRIES *
16648c2ecf20Sopenharmony_ci			   sizeof(struct mlx5_mtt) /
16658c2ecf20Sopenharmony_ci			   MLX5_IB_UMR_OCTOWORD;
16668c2ecf20Sopenharmony_ci		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
16678c2ecf20Sopenharmony_ci		ent->limit = 0;
16688c2ecf20Sopenharmony_ci		break;
16698c2ecf20Sopenharmony_ci
16708c2ecf20Sopenharmony_ci	case MLX5_IMR_KSM_CACHE_ENTRY:
16718c2ecf20Sopenharmony_ci		ent->page = MLX5_KSM_PAGE_SHIFT;
16728c2ecf20Sopenharmony_ci		ent->xlt = mlx5_imr_ksm_entries *
16738c2ecf20Sopenharmony_ci			   sizeof(struct mlx5_klm) /
16748c2ecf20Sopenharmony_ci			   MLX5_IB_UMR_OCTOWORD;
16758c2ecf20Sopenharmony_ci		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
16768c2ecf20Sopenharmony_ci		ent->limit = 0;
16778c2ecf20Sopenharmony_ci		break;
16788c2ecf20Sopenharmony_ci	}
16798c2ecf20Sopenharmony_ci}
16808c2ecf20Sopenharmony_ci
16818c2ecf20Sopenharmony_cistatic const struct ib_device_ops mlx5_ib_dev_odp_ops = {
16828c2ecf20Sopenharmony_ci	.advise_mr = mlx5_ib_advise_mr,
16838c2ecf20Sopenharmony_ci};
16848c2ecf20Sopenharmony_ci
16858c2ecf20Sopenharmony_ciint mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
16868c2ecf20Sopenharmony_ci{
16878c2ecf20Sopenharmony_ci	int ret = 0;
16888c2ecf20Sopenharmony_ci
16898c2ecf20Sopenharmony_ci	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
16908c2ecf20Sopenharmony_ci		return ret;
16918c2ecf20Sopenharmony_ci
16928c2ecf20Sopenharmony_ci	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
16938c2ecf20Sopenharmony_ci
16948c2ecf20Sopenharmony_ci	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
16958c2ecf20Sopenharmony_ci		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
16968c2ecf20Sopenharmony_ci		if (ret) {
16978c2ecf20Sopenharmony_ci			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
16988c2ecf20Sopenharmony_ci			return ret;
16998c2ecf20Sopenharmony_ci		}
17008c2ecf20Sopenharmony_ci	}
17018c2ecf20Sopenharmony_ci
17028c2ecf20Sopenharmony_ci	ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
17038c2ecf20Sopenharmony_ci
17048c2ecf20Sopenharmony_ci	return ret;
17058c2ecf20Sopenharmony_ci}
17068c2ecf20Sopenharmony_ci
17078c2ecf20Sopenharmony_civoid mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
17088c2ecf20Sopenharmony_ci{
17098c2ecf20Sopenharmony_ci	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
17108c2ecf20Sopenharmony_ci		return;
17118c2ecf20Sopenharmony_ci
17128c2ecf20Sopenharmony_ci	mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
17138c2ecf20Sopenharmony_ci}
17148c2ecf20Sopenharmony_ci
17158c2ecf20Sopenharmony_ciint mlx5_ib_odp_init(void)
17168c2ecf20Sopenharmony_ci{
17178c2ecf20Sopenharmony_ci	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
17188c2ecf20Sopenharmony_ci				       MLX5_IMR_MTT_BITS);
17198c2ecf20Sopenharmony_ci
17208c2ecf20Sopenharmony_ci	return 0;
17218c2ecf20Sopenharmony_ci}
17228c2ecf20Sopenharmony_ci
17238c2ecf20Sopenharmony_cistruct prefetch_mr_work {
17248c2ecf20Sopenharmony_ci	struct work_struct work;
17258c2ecf20Sopenharmony_ci	u32 pf_flags;
17268c2ecf20Sopenharmony_ci	u32 num_sge;
17278c2ecf20Sopenharmony_ci	struct {
17288c2ecf20Sopenharmony_ci		u64 io_virt;
17298c2ecf20Sopenharmony_ci		struct mlx5_ib_mr *mr;
17308c2ecf20Sopenharmony_ci		size_t length;
17318c2ecf20Sopenharmony_ci	} frags[];
17328c2ecf20Sopenharmony_ci};
17338c2ecf20Sopenharmony_ci
17348c2ecf20Sopenharmony_cistatic void destroy_prefetch_work(struct prefetch_mr_work *work)
17358c2ecf20Sopenharmony_ci{
17368c2ecf20Sopenharmony_ci	u32 i;
17378c2ecf20Sopenharmony_ci
17388c2ecf20Sopenharmony_ci	for (i = 0; i < work->num_sge; ++i)
17398c2ecf20Sopenharmony_ci		if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
17408c2ecf20Sopenharmony_ci			wake_up(&work->frags[i].mr->q_deferred_work);
17418c2ecf20Sopenharmony_ci	kvfree(work);
17428c2ecf20Sopenharmony_ci}
17438c2ecf20Sopenharmony_ci
17448c2ecf20Sopenharmony_cistatic struct mlx5_ib_mr *
17458c2ecf20Sopenharmony_ciget_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
17468c2ecf20Sopenharmony_ci		    u32 lkey)
17478c2ecf20Sopenharmony_ci{
17488c2ecf20Sopenharmony_ci	struct mlx5_ib_dev *dev = to_mdev(pd->device);
17498c2ecf20Sopenharmony_ci	struct mlx5_core_mkey *mmkey;
17508c2ecf20Sopenharmony_ci	struct ib_umem_odp *odp;
17518c2ecf20Sopenharmony_ci	struct mlx5_ib_mr *mr;
17528c2ecf20Sopenharmony_ci
17538c2ecf20Sopenharmony_ci	lockdep_assert_held(&dev->odp_srcu);
17548c2ecf20Sopenharmony_ci
17558c2ecf20Sopenharmony_ci	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
17568c2ecf20Sopenharmony_ci	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
17578c2ecf20Sopenharmony_ci		return NULL;
17588c2ecf20Sopenharmony_ci
17598c2ecf20Sopenharmony_ci	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
17608c2ecf20Sopenharmony_ci
17618c2ecf20Sopenharmony_ci	if (mr->ibmr.pd != pd)
17628c2ecf20Sopenharmony_ci		return NULL;
17638c2ecf20Sopenharmony_ci
17648c2ecf20Sopenharmony_ci	odp = to_ib_umem_odp(mr->umem);
17658c2ecf20Sopenharmony_ci
17668c2ecf20Sopenharmony_ci	/* prefetch with write-access must be supported by the MR */
17678c2ecf20Sopenharmony_ci	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
17688c2ecf20Sopenharmony_ci	    !odp->umem.writable)
17698c2ecf20Sopenharmony_ci		return NULL;
17708c2ecf20Sopenharmony_ci
17718c2ecf20Sopenharmony_ci	return mr;
17728c2ecf20Sopenharmony_ci}
17738c2ecf20Sopenharmony_ci
17748c2ecf20Sopenharmony_cistatic void mlx5_ib_prefetch_mr_work(struct work_struct *w)
17758c2ecf20Sopenharmony_ci{
17768c2ecf20Sopenharmony_ci	struct prefetch_mr_work *work =
17778c2ecf20Sopenharmony_ci		container_of(w, struct prefetch_mr_work, work);
17788c2ecf20Sopenharmony_ci	struct mlx5_ib_dev *dev;
17798c2ecf20Sopenharmony_ci	u32 bytes_mapped = 0;
17808c2ecf20Sopenharmony_ci	int srcu_key;
17818c2ecf20Sopenharmony_ci	int ret;
17828c2ecf20Sopenharmony_ci	u32 i;
17838c2ecf20Sopenharmony_ci
17848c2ecf20Sopenharmony_ci	/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
17858c2ecf20Sopenharmony_ci	WARN_ON(!work->num_sge);
17868c2ecf20Sopenharmony_ci	dev = work->frags[0].mr->dev;
17878c2ecf20Sopenharmony_ci	/* SRCU should be held when calling to mlx5_odp_populate_xlt() */
17888c2ecf20Sopenharmony_ci	srcu_key = srcu_read_lock(&dev->odp_srcu);
17898c2ecf20Sopenharmony_ci	for (i = 0; i < work->num_sge; ++i) {
17908c2ecf20Sopenharmony_ci		ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
17918c2ecf20Sopenharmony_ci				   work->frags[i].length, &bytes_mapped,
17928c2ecf20Sopenharmony_ci				   work->pf_flags);
17938c2ecf20Sopenharmony_ci		if (ret <= 0)
17948c2ecf20Sopenharmony_ci			continue;
17958c2ecf20Sopenharmony_ci		mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
17968c2ecf20Sopenharmony_ci	}
17978c2ecf20Sopenharmony_ci	srcu_read_unlock(&dev->odp_srcu, srcu_key);
17988c2ecf20Sopenharmony_ci
17998c2ecf20Sopenharmony_ci	destroy_prefetch_work(work);
18008c2ecf20Sopenharmony_ci}
18018c2ecf20Sopenharmony_ci
18028c2ecf20Sopenharmony_cistatic bool init_prefetch_work(struct ib_pd *pd,
18038c2ecf20Sopenharmony_ci			       enum ib_uverbs_advise_mr_advice advice,
18048c2ecf20Sopenharmony_ci			       u32 pf_flags, struct prefetch_mr_work *work,
18058c2ecf20Sopenharmony_ci			       struct ib_sge *sg_list, u32 num_sge)
18068c2ecf20Sopenharmony_ci{
18078c2ecf20Sopenharmony_ci	u32 i;
18088c2ecf20Sopenharmony_ci
18098c2ecf20Sopenharmony_ci	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
18108c2ecf20Sopenharmony_ci	work->pf_flags = pf_flags;
18118c2ecf20Sopenharmony_ci
18128c2ecf20Sopenharmony_ci	for (i = 0; i < num_sge; ++i) {
18138c2ecf20Sopenharmony_ci		work->frags[i].io_virt = sg_list[i].addr;
18148c2ecf20Sopenharmony_ci		work->frags[i].length = sg_list[i].length;
18158c2ecf20Sopenharmony_ci		work->frags[i].mr =
18168c2ecf20Sopenharmony_ci			get_prefetchable_mr(pd, advice, sg_list[i].lkey);
18178c2ecf20Sopenharmony_ci		if (!work->frags[i].mr) {
18188c2ecf20Sopenharmony_ci			work->num_sge = i;
18198c2ecf20Sopenharmony_ci			return false;
18208c2ecf20Sopenharmony_ci		}
18218c2ecf20Sopenharmony_ci
18228c2ecf20Sopenharmony_ci		/* Keep the MR pointer will valid outside the SRCU */
18238c2ecf20Sopenharmony_ci		atomic_inc(&work->frags[i].mr->num_deferred_work);
18248c2ecf20Sopenharmony_ci	}
18258c2ecf20Sopenharmony_ci	work->num_sge = num_sge;
18268c2ecf20Sopenharmony_ci	return true;
18278c2ecf20Sopenharmony_ci}
18288c2ecf20Sopenharmony_ci
18298c2ecf20Sopenharmony_cistatic int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
18308c2ecf20Sopenharmony_ci				    enum ib_uverbs_advise_mr_advice advice,
18318c2ecf20Sopenharmony_ci				    u32 pf_flags, struct ib_sge *sg_list,
18328c2ecf20Sopenharmony_ci				    u32 num_sge)
18338c2ecf20Sopenharmony_ci{
18348c2ecf20Sopenharmony_ci	struct mlx5_ib_dev *dev = to_mdev(pd->device);
18358c2ecf20Sopenharmony_ci	u32 bytes_mapped = 0;
18368c2ecf20Sopenharmony_ci	int srcu_key;
18378c2ecf20Sopenharmony_ci	int ret = 0;
18388c2ecf20Sopenharmony_ci	u32 i;
18398c2ecf20Sopenharmony_ci
18408c2ecf20Sopenharmony_ci	srcu_key = srcu_read_lock(&dev->odp_srcu);
18418c2ecf20Sopenharmony_ci	for (i = 0; i < num_sge; ++i) {
18428c2ecf20Sopenharmony_ci		struct mlx5_ib_mr *mr;
18438c2ecf20Sopenharmony_ci
18448c2ecf20Sopenharmony_ci		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
18458c2ecf20Sopenharmony_ci		if (!mr) {
18468c2ecf20Sopenharmony_ci			ret = -ENOENT;
18478c2ecf20Sopenharmony_ci			goto out;
18488c2ecf20Sopenharmony_ci		}
18498c2ecf20Sopenharmony_ci		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
18508c2ecf20Sopenharmony_ci				   &bytes_mapped, pf_flags);
18518c2ecf20Sopenharmony_ci		if (ret < 0)
18528c2ecf20Sopenharmony_ci			goto out;
18538c2ecf20Sopenharmony_ci		mlx5_update_odp_stats(mr, prefetch, ret);
18548c2ecf20Sopenharmony_ci	}
18558c2ecf20Sopenharmony_ci	ret = 0;
18568c2ecf20Sopenharmony_ci
18578c2ecf20Sopenharmony_ciout:
18588c2ecf20Sopenharmony_ci	srcu_read_unlock(&dev->odp_srcu, srcu_key);
18598c2ecf20Sopenharmony_ci	return ret;
18608c2ecf20Sopenharmony_ci}
18618c2ecf20Sopenharmony_ci
18628c2ecf20Sopenharmony_ciint mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
18638c2ecf20Sopenharmony_ci			       enum ib_uverbs_advise_mr_advice advice,
18648c2ecf20Sopenharmony_ci			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
18658c2ecf20Sopenharmony_ci{
18668c2ecf20Sopenharmony_ci	struct mlx5_ib_dev *dev = to_mdev(pd->device);
18678c2ecf20Sopenharmony_ci	u32 pf_flags = 0;
18688c2ecf20Sopenharmony_ci	struct prefetch_mr_work *work;
18698c2ecf20Sopenharmony_ci	int srcu_key;
18708c2ecf20Sopenharmony_ci
18718c2ecf20Sopenharmony_ci	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
18728c2ecf20Sopenharmony_ci		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
18738c2ecf20Sopenharmony_ci
18748c2ecf20Sopenharmony_ci	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
18758c2ecf20Sopenharmony_ci		pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;
18768c2ecf20Sopenharmony_ci
18778c2ecf20Sopenharmony_ci	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
18788c2ecf20Sopenharmony_ci		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
18798c2ecf20Sopenharmony_ci						num_sge);
18808c2ecf20Sopenharmony_ci
18818c2ecf20Sopenharmony_ci	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
18828c2ecf20Sopenharmony_ci	if (!work)
18838c2ecf20Sopenharmony_ci		return -ENOMEM;
18848c2ecf20Sopenharmony_ci
18858c2ecf20Sopenharmony_ci	srcu_key = srcu_read_lock(&dev->odp_srcu);
18868c2ecf20Sopenharmony_ci	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
18878c2ecf20Sopenharmony_ci		srcu_read_unlock(&dev->odp_srcu, srcu_key);
18888c2ecf20Sopenharmony_ci		destroy_prefetch_work(work);
18898c2ecf20Sopenharmony_ci		return -EINVAL;
18908c2ecf20Sopenharmony_ci	}
18918c2ecf20Sopenharmony_ci	queue_work(system_unbound_wq, &work->work);
18928c2ecf20Sopenharmony_ci	srcu_read_unlock(&dev->odp_srcu, srcu_key);
18938c2ecf20Sopenharmony_ci	return 0;
18948c2ecf20Sopenharmony_ci}
1895