162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * Copyright (c) 2005 Topspin Communications. All rights reserved. 362306a36Sopenharmony_ci * Copyright (c) 2005 Cisco Systems. All rights reserved. 462306a36Sopenharmony_ci * Copyright (c) 2005 Mellanox Technologies. All rights reserved. 562306a36Sopenharmony_ci * Copyright (c) 2020 Intel Corporation. All rights reserved. 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * This software is available to you under a choice of one of two 862306a36Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 962306a36Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 1062306a36Sopenharmony_ci * COPYING in the main directory of this source tree, or the 1162306a36Sopenharmony_ci * OpenIB.org BSD license below: 1262306a36Sopenharmony_ci * 1362306a36Sopenharmony_ci * Redistribution and use in source and binary forms, with or 1462306a36Sopenharmony_ci * without modification, are permitted provided that the following 1562306a36Sopenharmony_ci * conditions are met: 1662306a36Sopenharmony_ci * 1762306a36Sopenharmony_ci * - Redistributions of source code must retain the above 1862306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 1962306a36Sopenharmony_ci * disclaimer. 2062306a36Sopenharmony_ci * 2162306a36Sopenharmony_ci * - Redistributions in binary form must reproduce the above 2262306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 2362306a36Sopenharmony_ci * disclaimer in the documentation and/or other materials 2462306a36Sopenharmony_ci * provided with the distribution. 2562306a36Sopenharmony_ci * 2662306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2762306a36Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2862306a36Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2962306a36Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 3062306a36Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 3162306a36Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 3262306a36Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3362306a36Sopenharmony_ci * SOFTWARE. 3462306a36Sopenharmony_ci */ 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_ci#include <linux/mm.h> 3762306a36Sopenharmony_ci#include <linux/dma-mapping.h> 3862306a36Sopenharmony_ci#include <linux/sched/signal.h> 3962306a36Sopenharmony_ci#include <linux/sched/mm.h> 4062306a36Sopenharmony_ci#include <linux/export.h> 4162306a36Sopenharmony_ci#include <linux/slab.h> 4262306a36Sopenharmony_ci#include <linux/pagemap.h> 4362306a36Sopenharmony_ci#include <linux/count_zeros.h> 4462306a36Sopenharmony_ci#include <rdma/ib_umem_odp.h> 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_ci#include "uverbs.h" 4762306a36Sopenharmony_ci 4862306a36Sopenharmony_cistatic void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) 4962306a36Sopenharmony_ci{ 5062306a36Sopenharmony_ci bool make_dirty = umem->writable && dirty; 5162306a36Sopenharmony_ci struct scatterlist *sg; 5262306a36Sopenharmony_ci unsigned int i; 5362306a36Sopenharmony_ci 5462306a36Sopenharmony_ci if (dirty) 5562306a36Sopenharmony_ci ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt, 5662306a36Sopenharmony_ci DMA_BIDIRECTIONAL, 0); 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_ci for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) 5962306a36Sopenharmony_ci unpin_user_page_range_dirty_lock(sg_page(sg), 6062306a36Sopenharmony_ci DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); 6162306a36Sopenharmony_ci 6262306a36Sopenharmony_ci sg_free_append_table(&umem->sgt_append); 6362306a36Sopenharmony_ci} 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci/** 6662306a36Sopenharmony_ci * ib_umem_find_best_pgsz - Find best HW page size to use for this MR 6762306a36Sopenharmony_ci * 6862306a36Sopenharmony_ci * @umem: umem struct 6962306a36Sopenharmony_ci * @pgsz_bitmap: bitmap of HW supported page sizes 7062306a36Sopenharmony_ci * @virt: IOVA 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * This helper is intended for HW that support multiple page 7362306a36Sopenharmony_ci * sizes but can do only a single page size in an MR. 7462306a36Sopenharmony_ci * 7562306a36Sopenharmony_ci * Returns 0 if the umem requires page sizes not supported by 7662306a36Sopenharmony_ci * the driver to be mapped. Drivers always supporting PAGE_SIZE 7762306a36Sopenharmony_ci * or smaller will never see a 0 result. 7862306a36Sopenharmony_ci */ 7962306a36Sopenharmony_ciunsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, 8062306a36Sopenharmony_ci unsigned long pgsz_bitmap, 8162306a36Sopenharmony_ci unsigned long virt) 8262306a36Sopenharmony_ci{ 8362306a36Sopenharmony_ci struct scatterlist *sg; 8462306a36Sopenharmony_ci unsigned long va, pgoff; 8562306a36Sopenharmony_ci dma_addr_t mask; 8662306a36Sopenharmony_ci int i; 8762306a36Sopenharmony_ci 8862306a36Sopenharmony_ci umem->iova = va = virt; 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci if (umem->is_odp) { 9162306a36Sopenharmony_ci unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift); 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci /* ODP must always be self consistent. */ 9462306a36Sopenharmony_ci if (!(pgsz_bitmap & page_size)) 9562306a36Sopenharmony_ci return 0; 9662306a36Sopenharmony_ci return page_size; 9762306a36Sopenharmony_ci } 9862306a36Sopenharmony_ci 9962306a36Sopenharmony_ci /* The best result is the smallest page size that results in the minimum 10062306a36Sopenharmony_ci * number of required pages. Compute the largest page size that could 10162306a36Sopenharmony_ci * work based on VA address bits that don't change. 10262306a36Sopenharmony_ci */ 10362306a36Sopenharmony_ci mask = pgsz_bitmap & 10462306a36Sopenharmony_ci GENMASK(BITS_PER_LONG - 1, 10562306a36Sopenharmony_ci bits_per((umem->length - 1 + virt) ^ virt)); 10662306a36Sopenharmony_ci /* offset into first SGL */ 10762306a36Sopenharmony_ci pgoff = umem->address & ~PAGE_MASK; 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { 11062306a36Sopenharmony_ci /* Walk SGL and reduce max page size if VA/PA bits differ 11162306a36Sopenharmony_ci * for any address. 11262306a36Sopenharmony_ci */ 11362306a36Sopenharmony_ci mask |= (sg_dma_address(sg) + pgoff) ^ va; 11462306a36Sopenharmony_ci va += sg_dma_len(sg) - pgoff; 11562306a36Sopenharmony_ci /* Except for the last entry, the ending iova alignment sets 11662306a36Sopenharmony_ci * the maximum possible page size as the low bits of the iova 11762306a36Sopenharmony_ci * must be zero when starting the next chunk. 11862306a36Sopenharmony_ci */ 11962306a36Sopenharmony_ci if (i != (umem->sgt_append.sgt.nents - 1)) 12062306a36Sopenharmony_ci mask |= va; 12162306a36Sopenharmony_ci pgoff = 0; 12262306a36Sopenharmony_ci } 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_ci /* The mask accumulates 1's in each position where the VA and physical 12562306a36Sopenharmony_ci * address differ, thus the length of trailing 0 is the largest page 12662306a36Sopenharmony_ci * size that can pass the VA through to the physical. 12762306a36Sopenharmony_ci */ 12862306a36Sopenharmony_ci if (mask) 12962306a36Sopenharmony_ci pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0); 13062306a36Sopenharmony_ci return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0; 13162306a36Sopenharmony_ci} 13262306a36Sopenharmony_ciEXPORT_SYMBOL(ib_umem_find_best_pgsz); 13362306a36Sopenharmony_ci 13462306a36Sopenharmony_ci/** 13562306a36Sopenharmony_ci * ib_umem_get - Pin and DMA map userspace memory. 13662306a36Sopenharmony_ci * 13762306a36Sopenharmony_ci * @device: IB device to connect UMEM 13862306a36Sopenharmony_ci * @addr: userspace virtual address to start at 13962306a36Sopenharmony_ci * @size: length of region to pin 14062306a36Sopenharmony_ci * @access: IB_ACCESS_xxx flags for memory being pinned 14162306a36Sopenharmony_ci */ 14262306a36Sopenharmony_cistruct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, 14362306a36Sopenharmony_ci size_t size, int access) 14462306a36Sopenharmony_ci{ 14562306a36Sopenharmony_ci struct ib_umem *umem; 14662306a36Sopenharmony_ci struct page **page_list; 14762306a36Sopenharmony_ci unsigned long lock_limit; 14862306a36Sopenharmony_ci unsigned long new_pinned; 14962306a36Sopenharmony_ci unsigned long cur_base; 15062306a36Sopenharmony_ci unsigned long dma_attr = 0; 15162306a36Sopenharmony_ci struct mm_struct *mm; 15262306a36Sopenharmony_ci unsigned long npages; 15362306a36Sopenharmony_ci int pinned, ret; 15462306a36Sopenharmony_ci unsigned int gup_flags = FOLL_LONGTERM; 15562306a36Sopenharmony_ci 15662306a36Sopenharmony_ci /* 15762306a36Sopenharmony_ci * If the combination of the addr and size requested for this memory 15862306a36Sopenharmony_ci * region causes an integer overflow, return error. 15962306a36Sopenharmony_ci */ 16062306a36Sopenharmony_ci if (((addr + size) < addr) || 16162306a36Sopenharmony_ci PAGE_ALIGN(addr + size) < (addr + size)) 16262306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci if (!can_do_mlock()) 16562306a36Sopenharmony_ci return ERR_PTR(-EPERM); 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci if (access & IB_ACCESS_ON_DEMAND) 16862306a36Sopenharmony_ci return ERR_PTR(-EOPNOTSUPP); 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci umem = kzalloc(sizeof(*umem), GFP_KERNEL); 17162306a36Sopenharmony_ci if (!umem) 17262306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 17362306a36Sopenharmony_ci umem->ibdev = device; 17462306a36Sopenharmony_ci umem->length = size; 17562306a36Sopenharmony_ci umem->address = addr; 17662306a36Sopenharmony_ci /* 17762306a36Sopenharmony_ci * Drivers should call ib_umem_find_best_pgsz() to set the iova 17862306a36Sopenharmony_ci * correctly. 17962306a36Sopenharmony_ci */ 18062306a36Sopenharmony_ci umem->iova = addr; 18162306a36Sopenharmony_ci umem->writable = ib_access_writable(access); 18262306a36Sopenharmony_ci umem->owning_mm = mm = current->mm; 18362306a36Sopenharmony_ci mmgrab(mm); 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_ci page_list = (struct page **) __get_free_page(GFP_KERNEL); 18662306a36Sopenharmony_ci if (!page_list) { 18762306a36Sopenharmony_ci ret = -ENOMEM; 18862306a36Sopenharmony_ci goto umem_kfree; 18962306a36Sopenharmony_ci } 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_ci npages = ib_umem_num_pages(umem); 19262306a36Sopenharmony_ci if (npages == 0 || npages > UINT_MAX) { 19362306a36Sopenharmony_ci ret = -EINVAL; 19462306a36Sopenharmony_ci goto out; 19562306a36Sopenharmony_ci } 19662306a36Sopenharmony_ci 19762306a36Sopenharmony_ci lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci new_pinned = atomic64_add_return(npages, &mm->pinned_vm); 20062306a36Sopenharmony_ci if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { 20162306a36Sopenharmony_ci atomic64_sub(npages, &mm->pinned_vm); 20262306a36Sopenharmony_ci ret = -ENOMEM; 20362306a36Sopenharmony_ci goto out; 20462306a36Sopenharmony_ci } 20562306a36Sopenharmony_ci 20662306a36Sopenharmony_ci cur_base = addr & PAGE_MASK; 20762306a36Sopenharmony_ci 20862306a36Sopenharmony_ci if (umem->writable) 20962306a36Sopenharmony_ci gup_flags |= FOLL_WRITE; 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci while (npages) { 21262306a36Sopenharmony_ci cond_resched(); 21362306a36Sopenharmony_ci pinned = pin_user_pages_fast(cur_base, 21462306a36Sopenharmony_ci min_t(unsigned long, npages, 21562306a36Sopenharmony_ci PAGE_SIZE / 21662306a36Sopenharmony_ci sizeof(struct page *)), 21762306a36Sopenharmony_ci gup_flags, page_list); 21862306a36Sopenharmony_ci if (pinned < 0) { 21962306a36Sopenharmony_ci ret = pinned; 22062306a36Sopenharmony_ci goto umem_release; 22162306a36Sopenharmony_ci } 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci cur_base += pinned * PAGE_SIZE; 22462306a36Sopenharmony_ci npages -= pinned; 22562306a36Sopenharmony_ci ret = sg_alloc_append_table_from_pages( 22662306a36Sopenharmony_ci &umem->sgt_append, page_list, pinned, 0, 22762306a36Sopenharmony_ci pinned << PAGE_SHIFT, ib_dma_max_seg_size(device), 22862306a36Sopenharmony_ci npages, GFP_KERNEL); 22962306a36Sopenharmony_ci if (ret) { 23062306a36Sopenharmony_ci unpin_user_pages_dirty_lock(page_list, pinned, 0); 23162306a36Sopenharmony_ci goto umem_release; 23262306a36Sopenharmony_ci } 23362306a36Sopenharmony_ci } 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ci if (access & IB_ACCESS_RELAXED_ORDERING) 23662306a36Sopenharmony_ci dma_attr |= DMA_ATTR_WEAK_ORDERING; 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci ret = ib_dma_map_sgtable_attrs(device, &umem->sgt_append.sgt, 23962306a36Sopenharmony_ci DMA_BIDIRECTIONAL, dma_attr); 24062306a36Sopenharmony_ci if (ret) 24162306a36Sopenharmony_ci goto umem_release; 24262306a36Sopenharmony_ci goto out; 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ciumem_release: 24562306a36Sopenharmony_ci __ib_umem_release(device, umem, 0); 24662306a36Sopenharmony_ci atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); 24762306a36Sopenharmony_ciout: 24862306a36Sopenharmony_ci free_page((unsigned long) page_list); 24962306a36Sopenharmony_ciumem_kfree: 25062306a36Sopenharmony_ci if (ret) { 25162306a36Sopenharmony_ci mmdrop(umem->owning_mm); 25262306a36Sopenharmony_ci kfree(umem); 25362306a36Sopenharmony_ci } 25462306a36Sopenharmony_ci return ret ? ERR_PTR(ret) : umem; 25562306a36Sopenharmony_ci} 25662306a36Sopenharmony_ciEXPORT_SYMBOL(ib_umem_get); 25762306a36Sopenharmony_ci 25862306a36Sopenharmony_ci/** 25962306a36Sopenharmony_ci * ib_umem_release - release memory pinned with ib_umem_get 26062306a36Sopenharmony_ci * @umem: umem struct to release 26162306a36Sopenharmony_ci */ 26262306a36Sopenharmony_civoid ib_umem_release(struct ib_umem *umem) 26362306a36Sopenharmony_ci{ 26462306a36Sopenharmony_ci if (!umem) 26562306a36Sopenharmony_ci return; 26662306a36Sopenharmony_ci if (umem->is_dmabuf) 26762306a36Sopenharmony_ci return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem)); 26862306a36Sopenharmony_ci if (umem->is_odp) 26962306a36Sopenharmony_ci return ib_umem_odp_release(to_ib_umem_odp(umem)); 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci __ib_umem_release(umem->ibdev, umem, 1); 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); 27462306a36Sopenharmony_ci mmdrop(umem->owning_mm); 27562306a36Sopenharmony_ci kfree(umem); 27662306a36Sopenharmony_ci} 27762306a36Sopenharmony_ciEXPORT_SYMBOL(ib_umem_release); 27862306a36Sopenharmony_ci 27962306a36Sopenharmony_ci/* 28062306a36Sopenharmony_ci * Copy from the given ib_umem's pages to the given buffer. 28162306a36Sopenharmony_ci * 28262306a36Sopenharmony_ci * umem - the umem to copy from 28362306a36Sopenharmony_ci * offset - offset to start copying from 28462306a36Sopenharmony_ci * dst - destination buffer 28562306a36Sopenharmony_ci * length - buffer length 28662306a36Sopenharmony_ci * 28762306a36Sopenharmony_ci * Returns 0 on success, or an error code. 28862306a36Sopenharmony_ci */ 28962306a36Sopenharmony_ciint ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, 29062306a36Sopenharmony_ci size_t length) 29162306a36Sopenharmony_ci{ 29262306a36Sopenharmony_ci size_t end = offset + length; 29362306a36Sopenharmony_ci int ret; 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci if (offset > umem->length || length > umem->length - offset) { 29662306a36Sopenharmony_ci pr_err("%s not in range. offset: %zd umem length: %zd end: %zd\n", 29762306a36Sopenharmony_ci __func__, offset, umem->length, end); 29862306a36Sopenharmony_ci return -EINVAL; 29962306a36Sopenharmony_ci } 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ci ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl, 30262306a36Sopenharmony_ci umem->sgt_append.sgt.orig_nents, dst, length, 30362306a36Sopenharmony_ci offset + ib_umem_offset(umem)); 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci if (ret < 0) 30662306a36Sopenharmony_ci return ret; 30762306a36Sopenharmony_ci else if (ret != length) 30862306a36Sopenharmony_ci return -EINVAL; 30962306a36Sopenharmony_ci else 31062306a36Sopenharmony_ci return 0; 31162306a36Sopenharmony_ci} 31262306a36Sopenharmony_ciEXPORT_SYMBOL(ib_umem_copy_from); 313