1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. 4 * 5 * Copyright (C) 2018 IBM Corp. All rights reserved. 6 * Author: Alexey Kardashevskiy <aik@ozlabs.ru> 7 * 8 * Register an on-GPU RAM region for cacheable access. 9 * 10 * Derived from original vfio_pci_igd.c: 11 * Copyright (C) 2016 Red Hat, Inc. All rights reserved. 12 * Author: Alex Williamson <alex.williamson@redhat.com> 13 */ 14 15#include <linux/io.h> 16#include <linux/pci.h> 17#include <linux/uaccess.h> 18#include <linux/vfio.h> 19#include <linux/sched/mm.h> 20#include <linux/mmu_context.h> 21#include <asm/kvm_ppc.h> 22#include "vfio_pci_private.h" 23 24#define CREATE_TRACE_POINTS 25#include "trace.h" 26 27EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); 28EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); 29EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); 30 31struct vfio_pci_nvgpu_data { 32 unsigned long gpu_hpa; /* GPU RAM physical address */ 33 unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ 34 unsigned long useraddr; /* GPU RAM userspace address */ 35 unsigned long size; /* Size of the GPU RAM window (usually 128GB) */ 36 struct mm_struct *mm; 37 struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */ 38 struct pci_dev *gpdev; 39 struct notifier_block group_notifier; 40}; 41 42static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev, 43 char __user *buf, size_t count, loff_t *ppos, bool iswrite) 44{ 45 unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; 46 struct vfio_pci_nvgpu_data *data = vdev->region[i].data; 47 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 48 loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK; 49 size_t sizealigned; 50 void __iomem *ptr; 51 52 if (pos >= vdev->region[i].size) 53 return -EINVAL; 54 55 count = min(count, (size_t)(vdev->region[i].size - pos)); 56 57 /* 58 * We map only a bit of GPU RAM for a short time instead of mapping it 59 * for the guest lifetime as: 60 * 61 * 1) we do not know GPU RAM size, only aperture which is 4-8 times 62 * bigger than actual RAM size (16/32GB RAM vs. 128GB aperture); 63 * 2) mapping GPU RAM allows CPU to prefetch and if this happens 64 * before NVLink bridge is reset (which fences GPU RAM), 65 * hardware management interrupts (HMI) might happen, this 66 * will freeze NVLink bridge. 67 * 68 * This is not fast path anyway. 69 */ 70 sizealigned = ALIGN(posoff + count, PAGE_SIZE); 71 ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned); 72 if (!ptr) 73 return -EFAULT; 74 75 if (iswrite) { 76 if (copy_from_user(ptr + posoff, buf, count)) 77 count = -EFAULT; 78 else 79 *ppos += count; 80 } else { 81 if (copy_to_user(buf, ptr + posoff, count)) 82 count = -EFAULT; 83 else 84 *ppos += count; 85 } 86 87 iounmap(ptr); 88 89 return count; 90} 91 92static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev, 93 struct vfio_pci_region *region) 94{ 95 struct vfio_pci_nvgpu_data *data = region->data; 96 long ret; 97 98 /* If there were any mappings at all... */ 99 if (data->mm) { 100 if (data->mem) { 101 ret = mm_iommu_put(data->mm, data->mem); 102 WARN_ON(ret); 103 } 104 105 mmdrop(data->mm); 106 } 107 108 vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, 109 &data->group_notifier); 110 111 pnv_npu2_unmap_lpar_dev(data->gpdev); 112 113 kfree(data); 114} 115 116static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf) 117{ 118 vm_fault_t ret; 119 struct vm_area_struct *vma = vmf->vma; 120 struct vfio_pci_region *region = vma->vm_private_data; 121 struct vfio_pci_nvgpu_data *data = region->data; 122 unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT; 123 unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT; 124 unsigned long vm_pgoff = vma->vm_pgoff & 125 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 126 unsigned long pfn = nv2pg + vm_pgoff + vmf_off; 127 128 ret = vmf_insert_pfn(vma, vmf->address, pfn); 129 trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT, 130 vmf->address, ret); 131 132 return ret; 133} 134 135static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = { 136 .fault = vfio_pci_nvgpu_mmap_fault, 137}; 138 139static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev, 140 struct vfio_pci_region *region, struct vm_area_struct *vma) 141{ 142 int ret; 143 struct vfio_pci_nvgpu_data *data = region->data; 144 145 if (data->useraddr) 146 return -EPERM; 147 148 if (vma->vm_end - vma->vm_start > data->size) 149 return -EINVAL; 150 151 vma->vm_private_data = region; 152 vma->vm_flags |= VM_PFNMAP; 153 vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops; 154 155 /* 156 * Calling mm_iommu_newdev() here once as the region is not 157 * registered yet and therefore right initialization will happen now. 158 * Other places will use mm_iommu_find() which returns 159 * registered @mem and does not go gup(). 160 */ 161 data->useraddr = vma->vm_start; 162 data->mm = current->mm; 163 164 mmgrab(data->mm); 165 ret = (int) mm_iommu_newdev(data->mm, data->useraddr, 166 vma_pages(vma), data->gpu_hpa, &data->mem); 167 168 trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr, 169 vma->vm_end - vma->vm_start, ret); 170 171 return ret; 172} 173 174static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev, 175 struct vfio_pci_region *region, struct vfio_info_cap *caps) 176{ 177 struct vfio_pci_nvgpu_data *data = region->data; 178 struct vfio_region_info_cap_nvlink2_ssatgt cap = { 179 .header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, 180 .header.version = 1, 181 .tgt = data->gpu_tgt 182 }; 183 184 return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); 185} 186 187static const struct vfio_pci_regops vfio_pci_nvgpu_regops = { 188 .rw = vfio_pci_nvgpu_rw, 189 .release = vfio_pci_nvgpu_release, 190 .mmap = vfio_pci_nvgpu_mmap, 191 .add_capability = vfio_pci_nvgpu_add_capability, 192}; 193 194static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb, 195 unsigned long action, void *opaque) 196{ 197 struct kvm *kvm = opaque; 198 struct vfio_pci_nvgpu_data *data = container_of(nb, 199 struct vfio_pci_nvgpu_data, 200 group_notifier); 201 202 if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm && 203 pnv_npu2_map_lpar_dev(data->gpdev, 204 kvm->arch.lpid, MSR_DR | MSR_PR)) 205 return NOTIFY_BAD; 206 207 return NOTIFY_OK; 208} 209 210int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) 211{ 212 int ret; 213 u64 reg[2]; 214 u64 tgt = 0; 215 struct device_node *npu_node, *mem_node; 216 struct pci_dev *npu_dev; 217 struct vfio_pci_nvgpu_data *data; 218 uint32_t mem_phandle = 0; 219 unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM; 220 221 /* 222 * PCI config space does not tell us about NVLink presense but 223 * platform does, use this. 224 */ 225 npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0); 226 if (!npu_dev) 227 return -ENODEV; 228 229 npu_node = pci_device_to_OF_node(npu_dev); 230 if (!npu_node) 231 return -EINVAL; 232 233 if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) 234 return -ENODEV; 235 236 mem_node = of_find_node_by_phandle(mem_phandle); 237 if (!mem_node) 238 return -EINVAL; 239 240 if (of_property_read_variable_u64_array(mem_node, "reg", reg, 241 ARRAY_SIZE(reg), ARRAY_SIZE(reg)) != 242 ARRAY_SIZE(reg)) 243 return -EINVAL; 244 245 if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { 246 dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); 247 return -EFAULT; 248 } 249 250 data = kzalloc(sizeof(*data), GFP_KERNEL); 251 if (!data) 252 return -ENOMEM; 253 254 data->gpu_hpa = reg[0]; 255 data->gpu_tgt = tgt; 256 data->size = reg[1]; 257 258 dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa, 259 data->gpu_hpa + data->size - 1); 260 261 data->gpdev = vdev->pdev; 262 data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier; 263 264 ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, 265 &events, &data->group_notifier); 266 if (ret) 267 goto free_exit; 268 269 /* 270 * We have just set KVM, we do not need the listener anymore. 271 * Also, keeping it registered means that if more than one GPU is 272 * assigned, we will get several similar notifiers notifying about 273 * the same device again which does not help with anything. 274 */ 275 vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, 276 &data->group_notifier); 277 278 ret = vfio_pci_register_dev_region(vdev, 279 PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, 280 VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, 281 &vfio_pci_nvgpu_regops, 282 data->size, 283 VFIO_REGION_INFO_FLAG_READ | 284 VFIO_REGION_INFO_FLAG_WRITE | 285 VFIO_REGION_INFO_FLAG_MMAP, 286 data); 287 if (ret) 288 goto free_exit; 289 290 return 0; 291free_exit: 292 kfree(data); 293 294 return ret; 295} 296 297/* 298 * IBM NPU2 bridge 299 */ 300struct vfio_pci_npu2_data { 301 void *base; /* ATSD register virtual address, for emulated access */ 302 unsigned long mmio_atsd; /* ATSD physical address */ 303 unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ 304 unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */ 305}; 306 307static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev, 308 char __user *buf, size_t count, loff_t *ppos, bool iswrite) 309{ 310 unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; 311 struct vfio_pci_npu2_data *data = vdev->region[i].data; 312 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 313 314 if (pos >= vdev->region[i].size) 315 return -EINVAL; 316 317 count = min(count, (size_t)(vdev->region[i].size - pos)); 318 319 if (iswrite) { 320 if (copy_from_user(data->base + pos, buf, count)) 321 return -EFAULT; 322 } else { 323 if (copy_to_user(buf, data->base + pos, count)) 324 return -EFAULT; 325 } 326 *ppos += count; 327 328 return count; 329} 330 331static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev, 332 struct vfio_pci_region *region, struct vm_area_struct *vma) 333{ 334 int ret; 335 struct vfio_pci_npu2_data *data = region->data; 336 unsigned long req_len = vma->vm_end - vma->vm_start; 337 338 if (req_len != PAGE_SIZE) 339 return -EINVAL; 340 341 vma->vm_flags |= VM_PFNMAP; 342 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 343 344 ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT, 345 req_len, vma->vm_page_prot); 346 trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start, 347 vma->vm_end - vma->vm_start, ret); 348 349 return ret; 350} 351 352static void vfio_pci_npu2_release(struct vfio_pci_device *vdev, 353 struct vfio_pci_region *region) 354{ 355 struct vfio_pci_npu2_data *data = region->data; 356 357 memunmap(data->base); 358 kfree(data); 359} 360 361static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev, 362 struct vfio_pci_region *region, struct vfio_info_cap *caps) 363{ 364 struct vfio_pci_npu2_data *data = region->data; 365 struct vfio_region_info_cap_nvlink2_ssatgt captgt = { 366 .header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, 367 .header.version = 1, 368 .tgt = data->gpu_tgt 369 }; 370 struct vfio_region_info_cap_nvlink2_lnkspd capspd = { 371 .header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, 372 .header.version = 1, 373 .link_speed = data->link_speed 374 }; 375 int ret; 376 377 ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt)); 378 if (ret) 379 return ret; 380 381 return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd)); 382} 383 384static const struct vfio_pci_regops vfio_pci_npu2_regops = { 385 .rw = vfio_pci_npu2_rw, 386 .mmap = vfio_pci_npu2_mmap, 387 .release = vfio_pci_npu2_release, 388 .add_capability = vfio_pci_npu2_add_capability, 389}; 390 391int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) 392{ 393 int ret; 394 struct vfio_pci_npu2_data *data; 395 struct device_node *nvlink_dn; 396 u32 nvlink_index = 0, mem_phandle = 0; 397 struct pci_dev *npdev = vdev->pdev; 398 struct device_node *npu_node = pci_device_to_OF_node(npdev); 399 struct pci_controller *hose = pci_bus_to_host(npdev->bus); 400 u64 mmio_atsd = 0; 401 u64 tgt = 0; 402 u32 link_speed = 0xff; 403 404 /* 405 * PCI config space does not tell us about NVLink presense but 406 * platform does, use this. 407 */ 408 if (!pnv_pci_get_gpu_dev(vdev->pdev)) 409 return -ENODEV; 410 411 if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) 412 return -ENODEV; 413 414 /* 415 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links 416 * so we can allocate one register per link, using nvlink index as 417 * a key. 418 * There is always at least one ATSD register so as long as at least 419 * NVLink bridge #0 is passed to the guest, ATSD will be available. 420 */ 421 nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); 422 if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", 423 &nvlink_index))) 424 return -ENODEV; 425 426 if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index, 427 &mmio_atsd)) { 428 if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0, 429 &mmio_atsd)) { 430 dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); 431 mmio_atsd = 0; 432 } else { 433 dev_warn(&vdev->pdev->dev, 434 "Using fallback ibm,mmio-atsd[0] for ATSD.\n"); 435 } 436 } 437 438 if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { 439 dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); 440 return -EFAULT; 441 } 442 443 if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) { 444 dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n"); 445 return -EFAULT; 446 } 447 448 data = kzalloc(sizeof(*data), GFP_KERNEL); 449 if (!data) 450 return -ENOMEM; 451 452 data->mmio_atsd = mmio_atsd; 453 data->gpu_tgt = tgt; 454 data->link_speed = link_speed; 455 if (data->mmio_atsd) { 456 data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT); 457 if (!data->base) { 458 ret = -ENOMEM; 459 goto free_exit; 460 } 461 } 462 463 /* 464 * We want to expose the capability even if this specific NVLink 465 * did not get its own ATSD register because capabilities 466 * belong to VFIO regions and normally there will be ATSD register 467 * assigned to the NVLink bridge. 468 */ 469 ret = vfio_pci_register_dev_region(vdev, 470 PCI_VENDOR_ID_IBM | 471 VFIO_REGION_TYPE_PCI_VENDOR_TYPE, 472 VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, 473 &vfio_pci_npu2_regops, 474 data->mmio_atsd ? PAGE_SIZE : 0, 475 VFIO_REGION_INFO_FLAG_READ | 476 VFIO_REGION_INFO_FLAG_WRITE | 477 VFIO_REGION_INFO_FLAG_MMAP, 478 data); 479 if (ret) 480 goto free_exit; 481 482 return 0; 483 484free_exit: 485 if (data->base) 486 memunmap(data->base); 487 kfree(data); 488 489 return ret; 490} 491