// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/gfp.h>
#include <rdma/ib_verbs.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/resource.h>

#include "siw.h"
#include "siw_mem.h"

/*
 * STag lookup is based on its index part only (24 bits).
 * The code avoids the special STag of zero and randomizes
 * STag index values between 1 and SIW_STAG_MAX_INDEX.
 */
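/*
 * STag layout, as composed below: the 24-bit xarray index occupies
 * the upper bits (id << 8), leaving the low byte for the consumer
 * key part. Lookups (siw_mem_id2obj()) use the index part only.
 */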
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
{
	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
	u32 id, next;

	get_random_bytes(&next, 4);
	next &= 0x00ffffff;

	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
	    GFP_KERNEL) < 0)
		return -ENOMEM;

	/* Set the STag index part */
	m->stag = id << 8;

	siw_dbg_mem(m, "new MEM object\n");

	return 0;
}

/*
 * siw_mem_id2obj()
 *
 * Resolve memory from the STag index given by @stag_index.
 * May be called from:
 * o process context before sending out of an SGL, or
 * o softirq context when resolving target memory.
 */
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
{
	struct siw_mem *mem;

	rcu_read_lock();
	mem = xa_load(&sdev->mem_xa, stag_index);
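	/*
	 * The object may be concurrently going away: take a
	 * reference only if its refcount did not already drop
	 * to zero.
	 */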
	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
		rcu_read_unlock();
		return mem;
	}
	rcu_read_unlock();

	return NULL;
}

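/* Unpin one chunk's page list, dirtying the pages if they were written. */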
static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
			   bool dirty)
{
	unpin_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
}

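/*
 * Release all pages pinned for @umem, adjust the pinned pages
 * accounting of the owning mm, and free all chunk bookkeeping.
 */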
void siw_umem_release(struct siw_umem *umem, bool dirty)
{
	struct mm_struct *mm_s = umem->owning_mm;
	int i, num_pages = umem->num_pages;

	for (i = 0; num_pages; i++) {
		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);

		siw_free_plist(&umem->page_chunk[i], to_free,
			       umem->writable && dirty);
		kfree(umem->page_chunk[i].plist);
		num_pages -= to_free;
	}
	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);

	mmdrop(mm_s);
	kfree(umem->page_chunk);
	kfree(umem);
}

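/*
 * Create a new siw_mem object for the MR and insert it into the
 * device's STag xarray. STag index allocation follows the same
 * randomized cyclic scheme as siw_mem_add(). The STag stays
 * invalid (stag_valid == 0) until explicitly enabled by the caller.
 */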
int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
		   u64 start, u64 len, int rights)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
	u32 id, next;

	if (!mem)
		return -ENOMEM;

	mem->mem_obj = mem_obj;
	mem->stag_valid = 0;
	mem->sdev = sdev;
	mem->va = start;
	mem->len = len;
	mem->pd = pd;
	mem->perms = rights & IWARP_ACCESS_MASK;
	kref_init(&mem->ref);

	get_random_bytes(&next, 4);
	next &= 0x00ffffff;

	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
	    GFP_KERNEL) < 0) {
		kfree(mem);
		return -ENOMEM;
	}

	mr->mem = mem;
	/* Set the STag index part */
	mem->stag = id << 8;
	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;

	return 0;
}

void siw_mr_drop_mem(struct siw_mr *mr)
{
	struct siw_mem *mem = mr->mem, *found;

	mem->stag_valid = 0;

	/* make STag invalid visible asap */
	smp_mb();

	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
	WARN_ON(found != mem);
	siw_mem_put(mem);
}

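/*
 * kref release callback: called when the last reference to a
 * siw_mem object is dropped via siw_mem_put().
 */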
void siw_free_mem(struct kref *ref)
{
	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);

	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");

	if (!mem->is_mw && mem->mem_obj) {
		if (!mem->is_pbl)
			siw_umem_release(mem->umem, true);
		else
			kfree(mem->pbl);
	}
	kfree(mem);
}

/*
 * siw_check_mem()
 *
 * Check protection domain, STag state, access permissions and
 * address range for memory object.
 *
 * @pd:		Protection Domain memory should belong to
 * @mem:	memory to be checked
 * @addr:	starting addr of mem
 * @perms:	requested access permissions
 * @len:	len of memory interval to be checked
 *
 */
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
		  enum ib_access_flags perms, int len)
{
	if (!mem->stag_valid) {
		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
		return -E_STAG_INVALID;
	}
	if (mem->pd != pd) {
		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
		return -E_PD_MISMATCH;
	}
	/*
	 * check access permissions
	 */
	if ((mem->perms & perms) < perms) {
		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
			   mem->perms, perms);
		return -E_ACCESS_PERM;
	}
	/*
	 * Check if access falls into valid memory interval.
	 */
	if (addr < mem->va || addr + len > mem->va + mem->len) {
		siw_dbg_pd(pd, "MEM interval len %d\n", len);
		siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
			   (void *)(uintptr_t)addr,
			   (void *)(uintptr_t)(addr + len));
		siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
			   (void *)(uintptr_t)mem->va,
			   (void *)(uintptr_t)(mem->va + mem->len),
			   mem->stag);

		return -E_BASE_BOUNDS;
	}
	return E_ACCESS_OK;
}

/*
 * siw_check_sge()
 *
 * Check SGE for access rights in given interval
 *
 * @pd:		Protection Domain memory should belong to
 * @sge:	SGE to be checked
 * @mem:	location of memory reference within array
 * @perms:	requested access permissions
 * @off:	starting offset in SGE
 * @len:	len of memory interval to be checked
 *
 * NOTE: Function references the SGE's memory object (*mem)
 * if not yet done. The new reference is kept if the check went
 * ok and released if it failed. If *mem is already valid, no new
 * lookup is done and mem is not released if the check fails.
 */
int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
		  enum ib_access_flags perms, u32 off, int len)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *new = NULL;
	int rv = E_ACCESS_OK;

	if (len + off > sge->length) {
		rv = -E_BASE_BOUNDS;
		goto fail;
	}
	if (*mem == NULL) {
		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
		if (unlikely(!new)) {
			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
			rv = -E_STAG_INVALID;
			goto fail;
		}
		*mem = new;
	}
	/* Check if user re-registered with different STag key */
	if (unlikely((*mem)->stag != sge->lkey)) {
		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
		rv = -E_STAG_INVALID;
		goto fail;
	}
	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
	if (unlikely(rv))
		goto fail;

	return 0;

fail:
	if (new) {
		*mem = NULL;
		siw_mem_put(new);
	}
	return rv;
}

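/*
 * Drop the memory references a WQE may hold, depending on its
 * opcode. Inline sends carry their data within the WQE itself
 * and hold no references.
 */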
void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
{
	switch (op) {
	case SIW_OP_SEND:
	case SIW_OP_WRITE:
	case SIW_OP_SEND_WITH_IMM:
	case SIW_OP_SEND_REMOTE_INV:
	case SIW_OP_READ:
	case SIW_OP_READ_LOCAL_INV:
		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
		break;

	case SIW_OP_RECEIVE:
		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
		break;

	case SIW_OP_READ_RESPONSE:
		siw_unref_mem_sgl(wqe->mem, 1);
		break;

	default:
		/*
		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
		 * do not hold memory references
		 */
		break;
	}
}

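/*
 * Invalidate an STag. Fails if the STag is unknown or is
 * registered to a different protection domain.
 */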
int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
	int rv = 0;

	if (unlikely(!mem)) {
		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
		return -EINVAL;
	}
	if (unlikely(mem->pd != pd)) {
		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
		rv = -EACCES;
		goto out;
	}
	/*
	 * Per RDMA verbs definition, an STag may already be in invalid
	 * state if invalidation is requested. So no state check here.
	 */
	mem->stag_valid = 0;

	siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
out:
	siw_mem_put(mem);
	return rv;
}

/*
 * Get the physical address backed by a PBL element. The address is
 * referenced by a linear byte offset into the list of variably sized
 * PB elements. Optionally returns the remaining length within the
 * current element, and the current PBL index for later resumption at
 * the same element.
 */
dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
{
	int i = idx ? *idx : 0;

	while (i < pbl->num_buf) {
		struct siw_pble *pble = &pbl->pbe[i];

		if (pble->pbl_off + pble->size > off) {
			u64 pble_off = off - pble->pbl_off;

			if (len)
				*len = pble->size - pble_off;
			if (idx)
				*idx = i;

			return pble->addr + pble_off;
		}
		i++;
	}
	if (len)
		*len = 0;
	return 0;
}

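/*
 * Allocate a physical buffer list with room for @num_buf entries.
 * Only pbl->max_buf is initialized here; the caller fills the
 * entries and sets pbl->num_buf accordingly.
 */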
struct siw_pbl *siw_pbl_alloc(u32 num_buf)
{
	struct siw_pbl *pbl;

	if (num_buf == 0)
		return ERR_PTR(-EINVAL);

	pbl = kzalloc(struct_size(pbl, pbe, num_buf), GFP_KERNEL);
	if (!pbl)
		return ERR_PTR(-ENOMEM);

	pbl->max_buf = num_buf;

	return pbl;
}

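/*
 * Pin all user pages covering the range [start, start + len), in
 * chunks of up to PAGES_PER_CHUNK pages. Pinning is charged against
 * the owning mm's RLIMIT_MEMLOCK budget. Returns the new siw_umem
 * on success, or an ERR_PTR on failure.
 */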
struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
{
	struct siw_umem *umem;
	struct mm_struct *mm_s;
	u64 first_page_va;
	unsigned long mlock_limit;
	unsigned int foll_flags = FOLL_WRITE;
	int num_pages, num_chunks, i, rv = 0;

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	if (!len)
		return ERR_PTR(-EINVAL);

	first_page_va = start & PAGE_MASK;
	num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
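	/*
	 * Round the chunk count up. Note this allocates one spare
	 * (unused) chunk descriptor when num_pages is an exact
	 * multiple of PAGES_PER_CHUNK.
	 */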
	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	mm_s = current->mm;
	umem->owning_mm = mm_s;
	umem->writable = writable;

	mmgrab(mm_s);

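	/*
	 * Pages are always pinned with write intent (FOLL_WRITE)
	 * to break COW early; for a read-only registration,
	 * FOLL_FORCE permits the write-intent pin.
	 */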
	if (!writable)
		foll_flags |= FOLL_FORCE;

	mmap_read_lock(mm_s);

	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	umem->fp_addr = first_page_va;

	umem->page_chunk =
		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
	if (!umem->page_chunk) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	for (i = 0; num_pages; i++) {
		int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);

		umem->page_chunk[i].plist =
			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
		if (!umem->page_chunk[i].plist) {
			rv = -ENOMEM;
			goto out_sem_up;
		}
		got = 0;
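		/*
		 * pin_user_pages() may pin fewer pages than requested;
		 * retry until the whole chunk is pinned or an error
		 * is returned.
		 */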
		while (nents) {
			struct page **plist = &umem->page_chunk[i].plist[got];

			rv = pin_user_pages(first_page_va, nents,
					    foll_flags | FOLL_LONGTERM,
					    plist, NULL);
			if (rv < 0)
				goto out_sem_up;

			umem->num_pages += rv;
			atomic64_add(rv, &mm_s->pinned_vm);
			first_page_va += rv * PAGE_SIZE;
			nents -= rv;
			got += rv;
		}
		num_pages -= got;
	}
out_sem_up:
	mmap_read_unlock(mm_s);

	if (rv > 0)
		return umem;

	siw_umem_release(umem, false);

	return ERR_PTR(rv);
}