/*
 * Copyright 2018 Red Hat Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include "nouveau_svm.h"
#include "nouveau_drv.h"
#include "nouveau_chan.h"
#include "nouveau_dmem.h"

#include <nvif/event.h>
#include <nvif/object.h>
#include <nvif/vmm.h>

#include <nvif/class.h>
#include <nvif/clb069.h>
#include <nvif/ifc00d.h>

#include <linux/sched/mm.h>
#include <linux/sort.h>
#include <linux/hmm.h>
#include <linux/memremap.h>
#include <linux/rmap.h>

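/* Per-device SVM state: the list of channel instances bound to SVM-enabled
 * VMMs (inst, protected by mutex) and the replayable fault buffer shared by
 * all of them, including the cache of parsed fault entries that is filled
 * by the fault worker.
 */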
struct nouveau_svm {
	struct nouveau_drm *drm;
	struct mutex mutex;
	struct list_head inst;

	struct nouveau_svm_fault_buffer {
		int id;
		struct nvif_object object;
		u32 entries;
		u32 getaddr;
		u32 putaddr;
		u32 get;
		u32 put;
		struct nvif_event notify;
		struct work_struct work;

		struct nouveau_svm_fault {
			u64 inst;
			u64 addr;
			u64 time;
			u32 engine;
			u8  gpc;
			u8  hub;
			u8  access;
			u8  client;
			u8  fault;
			struct nouveau_svmm *svmm;
		} **fault;
		int fault_nr;
	} buffer[1];
};

#define FAULT_ACCESS_READ 0
#define FAULT_ACCESS_WRITE 1
#define FAULT_ACCESS_ATOMIC 2
#define FAULT_ACCESS_PREFETCH 3

#define SVM_DBG(s,f,a...) NV_DEBUG((s)->drm, "svm: "f"\n", ##a)
#define SVM_ERR(s,f,a...) NV_WARN((s)->drm, "svm: "f"\n", ##a)

struct nouveau_pfnmap_args {
	struct nvif_ioctl_v0 i;
	struct nvif_ioctl_mthd_v0 m;
	struct nvif_vmm_pfnmap_v0 p;
};

struct nouveau_ivmm {
	struct nouveau_svmm *svmm;
	u64 inst;
	struct list_head head;
};

static struct nouveau_ivmm *
nouveau_ivmm_find(struct nouveau_svm *svm, u64 inst)
{
	struct nouveau_ivmm *ivmm;
	list_for_each_entry(ivmm, &svm->inst, head) {
		if (ivmm->inst == inst)
			return ivmm;
	}
	return NULL;
}

#define SVMM_DBG(s,f,a...)                                                     \
	NV_DEBUG((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a)
#define SVMM_ERR(s,f,a...)                                                     \
	NV_WARN((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a)

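/* Handler for the SVM_BIND ioctl (struct drm_nouveau_svm_bind): validate
 * the command/priority/target packed in args->header and, for the MIGRATE
 * command, migrate each VMA intersecting the requested VA range to GPU
 * VRAM on a best-effort basis.
 */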
int
nouveau_svmm_bind(struct drm_device *dev, void *data,
		  struct drm_file *file_priv)
{
	struct nouveau_cli *cli = nouveau_cli(file_priv);
	struct drm_nouveau_svm_bind *args = data;
	unsigned target, cmd, priority;
	unsigned long addr, end;
	struct mm_struct *mm;

	args->va_start &= PAGE_MASK;
	args->va_end = ALIGN(args->va_end, PAGE_SIZE);

	/* Sanity check arguments */
	if (args->reserved0 || args->reserved1)
		return -EINVAL;
	if (args->header & (~NOUVEAU_SVM_BIND_VALID_MASK))
		return -EINVAL;
	if (args->va_start >= args->va_end)
		return -EINVAL;

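	/*
	 * The header packs command, priority and target as shifted bit
	 * fields.  A plausible user-space encoding (a sketch, not taken
	 * from the uapi header) would be:
	 *
	 *   header = (NOUVEAU_SVM_BIND_COMMAND__MIGRATE <<
	 *             NOUVEAU_SVM_BIND_COMMAND_SHIFT) |
	 *            (priority << NOUVEAU_SVM_BIND_PRIORITY_SHIFT) |
	 *            (NOUVEAU_SVM_BIND_TARGET__GPU_VRAM <<
	 *             NOUVEAU_SVM_BIND_TARGET_SHIFT);
	 */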
	cmd = args->header >> NOUVEAU_SVM_BIND_COMMAND_SHIFT;
	cmd &= NOUVEAU_SVM_BIND_COMMAND_MASK;
	switch (cmd) {
	case NOUVEAU_SVM_BIND_COMMAND__MIGRATE:
		break;
	default:
		return -EINVAL;
	}

	priority = args->header >> NOUVEAU_SVM_BIND_PRIORITY_SHIFT;
	priority &= NOUVEAU_SVM_BIND_PRIORITY_MASK;

	/* FIXME: support CPU targets, i.e. all target values < GPU_VRAM */
	target = args->header >> NOUVEAU_SVM_BIND_TARGET_SHIFT;
	target &= NOUVEAU_SVM_BIND_TARGET_MASK;
	switch (target) {
	case NOUVEAU_SVM_BIND_TARGET__GPU_VRAM:
		break;
	default:
		return -EINVAL;
	}

	/*
	 * FIXME: For now refuse a non-zero stride; the migrate kernel
	 * function needs to learn how to handle strides to avoid creating
	 * a mess within each device driver.
	 */
	if (args->stride)
		return -EINVAL;

	/*
	 * We have been asked to do something sane.  For now we only support
	 * migrate commands, but we will add things like memory policy (what
	 * to do on page fault) and maybe some other commands later.
	 */

	mm = get_task_mm(current);
	if (!mm) {
		return -EINVAL;
	}
	mmap_read_lock(mm);

	if (!cli->svm.svmm) {
		mmap_read_unlock(mm);
		mmput(mm);
		return -EINVAL;
	}

	for (addr = args->va_start, end = args->va_end; addr < end;) {
		struct vm_area_struct *vma;
		unsigned long next;

		vma = find_vma_intersection(mm, addr, end);
		if (!vma)
			break;

		addr = max(addr, vma->vm_start);
		next = min(vma->vm_end, end);
		/* This is a best effort so we ignore errors */
		nouveau_dmem_migrate_vma(cli->drm, cli->svm.svmm, vma, addr,
					 next);
		addr = next;
	}

	/*
	 * FIXME: Return the number of pages we have migrated.  Again, we
	 * need to update the migrate API to return that information so
	 * that we can report it to user space.
	 */
	args->result = 0;

	mmap_read_unlock(mm);
	mmput(mm);

	return 0;
}

/* Unlink channel instance from SVMM. */
void
nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst)
{
	struct nouveau_ivmm *ivmm;
	if (svmm) {
		mutex_lock(&svmm->vmm->cli->drm->svm->mutex);
		ivmm = nouveau_ivmm_find(svmm->vmm->cli->drm->svm, inst);
		if (ivmm) {
			list_del(&ivmm->head);
			kfree(ivmm);
		}
		mutex_unlock(&svmm->vmm->cli->drm->svm->mutex);
	}
}

/* Link channel instance to SVMM. */
int
nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst)
{
	struct nouveau_ivmm *ivmm;
	if (svmm) {
		if (!(ivmm = kmalloc(sizeof(*ivmm), GFP_KERNEL)))
			return -ENOMEM;
		ivmm->svmm = svmm;
		ivmm->inst = inst;

		mutex_lock(&svmm->vmm->cli->drm->svm->mutex);
		list_add(&ivmm->head, &svmm->vmm->cli->drm->svm->inst);
		mutex_unlock(&svmm->vmm->cli->drm->svm->mutex);
	}
	return 0;
}

/* Invalidate SVMM address-range on GPU. */
void
nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit)
{
	if (limit > start) {
		nvif_object_mthd(&svmm->vmm->vmm.object, NVIF_VMM_V0_PFNCLR,
				 &(struct nvif_vmm_pfnclr_v0) {
					.addr = start,
					.size = limit - start,
				 }, sizeof(struct nvif_vmm_pfnclr_v0));
	}
}

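/* MMU notifier callback: mirror CPU-side invalidations into the GPU page
 * tables by clearing the affected PFN mappings, skipping the client's
 * unmanaged window and migration events we triggered ourselves.
 */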
static int
nouveau_svmm_invalidate_range_start(struct mmu_notifier *mn,
				    const struct mmu_notifier_range *update)
{
	struct nouveau_svmm *svmm =
		container_of(mn, struct nouveau_svmm, notifier);
	unsigned long start = update->start;
	unsigned long limit = update->end;

	if (!mmu_notifier_range_blockable(update))
		return -EAGAIN;

	SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);

	mutex_lock(&svmm->mutex);
	if (unlikely(!svmm->vmm))
		goto out;

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (update->event == MMU_NOTIFY_MIGRATE &&
	    update->owner == svmm->vmm->cli->drm->dev)
		goto out;

	if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) {
		if (start < svmm->unmanaged.start) {
			nouveau_svmm_invalidate(svmm, start,
						svmm->unmanaged.limit);
		}
		start = svmm->unmanaged.limit;
	}

	nouveau_svmm_invalidate(svmm, start, limit);

out:
	mutex_unlock(&svmm->mutex);
	return 0;
}

static void nouveau_svmm_free_notifier(struct mmu_notifier *mn)
{
	kfree(container_of(mn, struct nouveau_svmm, notifier));
}

static const struct mmu_notifier_ops nouveau_mn_ops = {
	.invalidate_range_start = nouveau_svmm_invalidate_range_start,
	.free_notifier = nouveau_svmm_free_notifier,
};

void
nouveau_svmm_fini(struct nouveau_svmm **psvmm)
{
	struct nouveau_svmm *svmm = *psvmm;
	if (svmm) {
		mutex_lock(&svmm->mutex);
		svmm->vmm = NULL;
		mutex_unlock(&svmm->mutex);
		mmu_notifier_put(&svmm->notifier);
		*psvmm = NULL;
	}
}

int
nouveau_svmm_init(struct drm_device *dev, void *data,
		  struct drm_file *file_priv)
{
	struct nouveau_cli *cli = nouveau_cli(file_priv);
	struct nouveau_svmm *svmm;
	struct drm_nouveau_svm_init *args = data;
	int ret;

	/* We need to fail if svm is disabled */
	if (!cli->drm->svm)
		return -ENOSYS;

	/* Allocate tracking for SVM-enabled VMM. */
	if (!(svmm = kzalloc(sizeof(*svmm), GFP_KERNEL)))
		return -ENOMEM;
	svmm->vmm = &cli->svm;
	svmm->unmanaged.start = args->unmanaged_addr;
	svmm->unmanaged.limit = args->unmanaged_addr + args->unmanaged_size;
	mutex_init(&svmm->mutex);

	/* Check that SVM isn't already enabled for the client. */
	mutex_lock(&cli->mutex);
	if (cli->svm.cli) {
		ret = -EBUSY;
		goto out_free;
	}

	/* Allocate a new GPU VMM that can support SVM (managed by the
	 * client, with replayable faults enabled).
	 *
	 * All future channel/memory allocations will make use of this
	 * VMM instead of the standard one.
	 */
	ret = nvif_vmm_ctor(&cli->mmu, "svmVmm",
			    cli->vmm.vmm.object.oclass, MANAGED,
			    args->unmanaged_addr, args->unmanaged_size,
			    &(struct gp100_vmm_v0) {
				.fault_replay = true,
			    }, sizeof(struct gp100_vmm_v0), &cli->svm.vmm);
	if (ret)
		goto out_free;

	mmap_write_lock(current->mm);
	svmm->notifier.ops = &nouveau_mn_ops;
	ret = __mmu_notifier_register(&svmm->notifier, current->mm);
	if (ret)
		goto out_mm_unlock;
	/* Note, ownership of svmm transfers to mmu_notifier */

	cli->svm.svmm = svmm;
	cli->svm.cli = cli;
	mmap_write_unlock(current->mm);
	mutex_unlock(&cli->mutex);
	return 0;

out_mm_unlock:
	mmap_write_unlock(current->mm);
out_free:
	mutex_unlock(&cli->mutex);
	kfree(svmm);
	return ret;
}

/* Issue fault replay for GPU to retry accesses that faulted previously. */
static void
nouveau_svm_fault_replay(struct nouveau_svm *svm)
{
	SVM_DBG(svm, "replay");
	WARN_ON(nvif_object_mthd(&svm->drm->client.vmm.vmm.object,
				 GP100_VMM_VN_FAULT_REPLAY,
				 &(struct gp100_vmm_fault_replay_vn) {},
				 sizeof(struct gp100_vmm_fault_replay_vn)));
}

/* Cancel a replayable fault that could not be handled.
 *
 * Cancelling the fault will trigger recovery to reset the engine
 * and kill the offending channel (ie. GPU SIGSEGV).
 */
static void
nouveau_svm_fault_cancel(struct nouveau_svm *svm,
			 u64 inst, u8 hub, u8 gpc, u8 client)
{
	SVM_DBG(svm, "cancel %016llx %d %02x %02x", inst, hub, gpc, client);
	WARN_ON(nvif_object_mthd(&svm->drm->client.vmm.vmm.object,
				 GP100_VMM_VN_FAULT_CANCEL,
				 &(struct gp100_vmm_fault_cancel_v0) {
					.hub = hub,
					.gpc = gpc,
					.client = client,
					.inst = inst,
				 }, sizeof(struct gp100_vmm_fault_cancel_v0)));
}

static void
nouveau_svm_fault_cancel_fault(struct nouveau_svm *svm,
			       struct nouveau_svm_fault *fault)
{
	nouveau_svm_fault_cancel(svm, fault->inst,
				      fault->hub,
				      fault->gpc,
				      fault->client);
}

static int
nouveau_svm_fault_priority(u8 fault)
{
	switch (fault) {
	case FAULT_ACCESS_PREFETCH:
		return 0;
	case FAULT_ACCESS_READ:
		return 1;
	case FAULT_ACCESS_WRITE:
		return 2;
	case FAULT_ACCESS_ATOMIC:
		return 3;
	default:
		WARN_ON_ONCE(1);
		return -1;
	}
}

static int
nouveau_svm_fault_cmp(const void *a, const void *b)
{
	const struct nouveau_svm_fault *fa = *(struct nouveau_svm_fault **)a;
	const struct nouveau_svm_fault *fb = *(struct nouveau_svm_fault **)b;
	int ret;
	if ((ret = (s64)fa->inst - fb->inst))
		return ret;
	if ((ret = (s64)fa->addr - fb->addr))
		return ret;
	return nouveau_svm_fault_priority(fa->access) -
		nouveau_svm_fault_priority(fb->access);
}

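/* Parse one 32-byte hardware fault buffer entry at 'offset' into the
 * software fault cache: instance pointer (0x00/0x04), faulting address
 * (0x08/0x0c), timestamp (0x10/0x14), engine (0x18) and the info word
 * (0x1c) holding the valid bit plus the gpc/hub/client/access/fault fields.
 */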
static void
nouveau_svm_fault_cache(struct nouveau_svm *svm,
			struct nouveau_svm_fault_buffer *buffer, u32 offset)
{
	struct nvif_object *memory = &buffer->object;
	const u32 instlo = nvif_rd32(memory, offset + 0x00);
	const u32 insthi = nvif_rd32(memory, offset + 0x04);
	const u32 addrlo = nvif_rd32(memory, offset + 0x08);
	const u32 addrhi = nvif_rd32(memory, offset + 0x0c);
	const u32 timelo = nvif_rd32(memory, offset + 0x10);
	const u32 timehi = nvif_rd32(memory, offset + 0x14);
	const u32 engine = nvif_rd32(memory, offset + 0x18);
	const u32   info = nvif_rd32(memory, offset + 0x1c);
	const u64   inst = (u64)insthi << 32 | instlo;
	const u8     gpc = (info & 0x1f000000) >> 24;
	const u8     hub = (info & 0x00100000) >> 20;
	const u8  client = (info & 0x00007f00) >> 8;
	struct nouveau_svm_fault *fault;

	/* XXX: we are probably supposed to spin waiting for the valid bit. */
	if (WARN_ON(!(info & 0x80000000)))
		return;

	nvif_mask(memory, offset + 0x1c, 0x80000000, 0x00000000);

	if (!buffer->fault[buffer->fault_nr]) {
		fault = kmalloc(sizeof(*fault), GFP_KERNEL);
		if (WARN_ON(!fault)) {
			nouveau_svm_fault_cancel(svm, inst, hub, gpc, client);
			return;
		}
		buffer->fault[buffer->fault_nr] = fault;
	}

	fault = buffer->fault[buffer->fault_nr++];
	fault->inst   = inst;
	fault->addr   = (u64)addrhi << 32 | addrlo;
	fault->time   = (u64)timehi << 32 | timelo;
	fault->engine = engine;
	fault->gpc    = gpc;
	fault->hub    = hub;
	fault->access = (info & 0x000f0000) >> 16;
	fault->client = client;
	fault->fault  = (info & 0x0000001f);

	SVM_DBG(svm, "fault %016llx %016llx %02x",
		fault->inst, fault->addr, fault->access);
}

struct svm_notifier {
	struct mmu_interval_notifier notifier;
	struct nouveau_svmm *svmm;
};

static bool nouveau_svm_range_invalidate(struct mmu_interval_notifier *mni,
					 const struct mmu_notifier_range *range,
					 unsigned long cur_seq)
{
	struct svm_notifier *sn =
		container_of(mni, struct svm_notifier, notifier);

	if (range->event == MMU_NOTIFY_EXCLUSIVE &&
	    range->owner == sn->svmm->vmm->cli->drm->dev)
		return true;

	/*
	 * serializes the update to mni->invalidate_seq done by caller and
	 * prevents invalidation of the PTE from progressing while HW is being
	 * programmed. This is very hacky and only works because the normal
	 * notifier that does invalidation is always called after the range
	 * notifier.
	 */
	if (mmu_notifier_range_blockable(range))
		mutex_lock(&sn->svmm->mutex);
	else if (!mutex_trylock(&sn->svmm->mutex))
		return false;
	mmu_interval_set_seq(mni, cur_seq);
	mutex_unlock(&sn->svmm->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = {
	.invalidate = nouveau_svm_range_invalidate,
};

static void nouveau_hmm_convert_pfn(struct nouveau_drm *drm,
				    struct hmm_range *range,
				    struct nouveau_pfnmap_args *args)
{
	struct page *page;

	/*
	 * The address prepared here is passed through nvif_object_ioctl()
	 * to an eventual DMA map in something like gp100_vmm_pgt_pfn()
	 *
	 * This is all just encoding the internal hmm representation into a
	 * different nouveau internal representation.
	 */
	if (!(range->hmm_pfns[0] & HMM_PFN_VALID)) {
		args->p.phys[0] = 0;
		return;
	}

	page = hmm_pfn_to_page(range->hmm_pfns[0]);
	/*
	 * Only map compound pages to the GPU if the CPU is also mapping the
	 * page as a compound page. Otherwise, the PTE protections might not be
	 * consistent (e.g., CPU only maps part of a compound page).
	 * Note that the underlying page might still be larger than the
	 * CPU mapping (e.g., a PUD sized compound page partially mapped with
	 * a PMD sized page table entry).
	 */
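	/*
	 * E.g., assuming PAGE_SHIFT == 12, a map order of 9 below yields
	 * args->p.page == 21, i.e. a 2MiB GPU PTE with args->p.addr
	 * rounded down to a 2MiB boundary.
	 */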
	if (hmm_pfn_to_map_order(range->hmm_pfns[0])) {
		unsigned long addr = args->p.addr;

		args->p.page = hmm_pfn_to_map_order(range->hmm_pfns[0]) +
				PAGE_SHIFT;
		args->p.size = 1UL << args->p.page;
		args->p.addr &= ~(args->p.size - 1);
		page -= (addr - args->p.addr) >> PAGE_SHIFT;
	}
	if (is_device_private_page(page))
		args->p.phys[0] = nouveau_dmem_page_addr(page) |
				NVIF_VMM_PFNMAP_V0_V |
				NVIF_VMM_PFNMAP_V0_VRAM;
	else
		args->p.phys[0] = page_to_phys(page) |
				NVIF_VMM_PFNMAP_V0_V |
				NVIF_VMM_PFNMAP_V0_HOST;
	if (range->hmm_pfns[0] & HMM_PFN_WRITE)
		args->p.phys[0] |= NVIF_VMM_PFNMAP_V0_W;
}

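/* Resolve an ATOMIC fault: make the CPU page exclusive to the device with
 * make_device_exclusive_range() and map it on the GPU with the atomic (A)
 * flag set, retrying until the mapping is installed without racing an
 * interval notifier invalidation.
 */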
static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm,
			       struct nouveau_drm *drm,
			       struct nouveau_pfnmap_args *args, u32 size,
			       struct svm_notifier *notifier)
{
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	struct mm_struct *mm = svmm->notifier.mm;
	struct page *page;
	unsigned long start = args->p.addr;
	unsigned long notifier_seq;
	int ret = 0;

	ret = mmu_interval_notifier_insert(&notifier->notifier, mm,
					args->p.addr, args->p.size,
					&nouveau_svm_mni_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		notifier_seq = mmu_interval_read_begin(&notifier->notifier);
		mmap_read_lock(mm);
		ret = make_device_exclusive_range(mm, start, start + PAGE_SIZE,
					    &page, drm->dev);
		mmap_read_unlock(mm);
		if (ret <= 0 || !page) {
			ret = -EINVAL;
			goto out;
		}

		mutex_lock(&svmm->mutex);
		if (!mmu_interval_read_retry(&notifier->notifier,
					     notifier_seq))
			break;
		mutex_unlock(&svmm->mutex);
	}

	/* Map the page on the GPU. */
	args->p.page = 12;
	args->p.size = PAGE_SIZE;
	args->p.addr = start;
	args->p.phys[0] = page_to_phys(page) |
		NVIF_VMM_PFNMAP_V0_V |
		NVIF_VMM_PFNMAP_V0_W |
		NVIF_VMM_PFNMAP_V0_A |
		NVIF_VMM_PFNMAP_V0_HOST;

	ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args, size, NULL);
	mutex_unlock(&svmm->mutex);

	unlock_page(page);
	put_page(page);

out:
	mmu_interval_notifier_remove(&notifier->notifier);
	return ret;
}

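/* Resolve a non-atomic fault: fault the page with hmm_range_fault() and
 * program the resulting PFN into the GPU page tables, using the interval
 * notifier sequence count to retry if the range is invalidated before the
 * update is committed under svmm->mutex.
 */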
static int nouveau_range_fault(struct nouveau_svmm *svmm,
			       struct nouveau_drm *drm,
			       struct nouveau_pfnmap_args *args, u32 size,
			       unsigned long hmm_flags,
			       struct svm_notifier *notifier)
{
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	/* Have HMM fault pages within the fault window to the GPU. */
	unsigned long hmm_pfns[1];
	struct hmm_range range = {
		.notifier = &notifier->notifier,
		.default_flags = hmm_flags,
		.hmm_pfns = hmm_pfns,
		.dev_private_owner = drm->dev,
	};
	struct mm_struct *mm = svmm->notifier.mm;
	int ret;

	ret = mmu_interval_notifier_insert(&notifier->notifier, mm,
					args->p.addr, args->p.size,
					&nouveau_svm_mni_ops);
	if (ret)
		return ret;

	range.start = notifier->notifier.interval_tree.start;
	range.end = notifier->notifier.interval_tree.last + 1;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range.notifier_seq = mmu_interval_read_begin(range.notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(&range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&svmm->mutex);
		if (mmu_interval_read_retry(range.notifier,
					    range.notifier_seq)) {
			mutex_unlock(&svmm->mutex);
			continue;
		}
		break;
	}

	nouveau_hmm_convert_pfn(drm, &range, args);

	ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args, size, NULL);
	mutex_unlock(&svmm->mutex);

out:
	mmu_interval_notifier_remove(&notifier->notifier);

	return ret;
}

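/* Fault buffer worker: drain pending hardware fault entries into the
 * cache, sort them, translate instance pointers to SVMMs, service each
 * fault window via HMM, then either replay the faults that were handled
 * or cancel the ones that were not.
 */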
static void
nouveau_svm_fault(struct work_struct *work)
{
	struct nouveau_svm_fault_buffer *buffer = container_of(work, typeof(*buffer), work);
	struct nouveau_svm *svm = container_of(buffer, typeof(*svm), buffer[buffer->id]);
	struct nvif_object *device = &svm->drm->client.device.object;
	struct nouveau_svmm *svmm;
	struct {
		struct nouveau_pfnmap_args i;
		u64 phys[1];
	} args;
	unsigned long hmm_flags;
	u64 inst, start, limit;
	int fi, fn;
	int replay = 0, atomic = 0, ret;

	/* Parse available fault buffer entries into a cache, and update
	 * the GET pointer so HW can reuse the entries.
	 */
	SVM_DBG(svm, "fault handler");
	if (buffer->get == buffer->put) {
		buffer->put = nvif_rd32(device, buffer->putaddr);
		buffer->get = nvif_rd32(device, buffer->getaddr);
		if (buffer->get == buffer->put)
			return;
	}
	buffer->fault_nr = 0;

	SVM_DBG(svm, "get %08x put %08x", buffer->get, buffer->put);
	while (buffer->get != buffer->put) {
		nouveau_svm_fault_cache(svm, buffer, buffer->get * 0x20);
		if (++buffer->get == buffer->entries)
			buffer->get = 0;
	}
	nvif_wr32(device, buffer->getaddr, buffer->get);
	SVM_DBG(svm, "%d fault(s) pending", buffer->fault_nr);

	/* Sort parsed faults by instance pointer to prevent unnecessary
	 * instance to SVMM translations, followed by address and access
	 * type to reduce the amount of work when handling the faults.
	 */
	sort(buffer->fault, buffer->fault_nr, sizeof(*buffer->fault),
	     nouveau_svm_fault_cmp, NULL);

	/* Lookup SVMM structure for each unique instance pointer. */
	mutex_lock(&svm->mutex);
	for (fi = 0, svmm = NULL; fi < buffer->fault_nr; fi++) {
		if (!svmm || buffer->fault[fi]->inst != inst) {
			struct nouveau_ivmm *ivmm =
				nouveau_ivmm_find(svm, buffer->fault[fi]->inst);
			svmm = ivmm ? ivmm->svmm : NULL;
			inst = buffer->fault[fi]->inst;
			SVM_DBG(svm, "inst %016llx -> svm-%p", inst, svmm);
		}
		buffer->fault[fi]->svmm = svmm;
	}
	mutex_unlock(&svm->mutex);

	/* Process list of faults. */
	args.i.i.version = 0;
	args.i.i.type = NVIF_IOCTL_V0_MTHD;
	args.i.m.version = 0;
	args.i.m.method = NVIF_VMM_V0_PFNMAP;
	args.i.p.version = 0;

	for (fi = 0; fn = fi + 1, fi < buffer->fault_nr; fi = fn) {
		struct svm_notifier notifier;
		struct mm_struct *mm;

		/* Cancel any faults from non-SVM channels. */
		if (!(svmm = buffer->fault[fi]->svmm)) {
			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
			continue;
		}
		SVMM_DBG(svmm, "addr %016llx", buffer->fault[fi]->addr);

		/* We try to group handling of faults within a small
		 * window into a single update.
		 */
		start = buffer->fault[fi]->addr;
		limit = start + PAGE_SIZE;
		if (start < svmm->unmanaged.limit)
			limit = min_t(u64, limit, svmm->unmanaged.start);

		/*
		 * Prepare the GPU-side update of all pages within the
		 * fault window, determining required pages and access
		 * permissions based on pending faults.
		 */
		args.i.p.addr = start;
		args.i.p.page = PAGE_SHIFT;
		args.i.p.size = PAGE_SIZE;
		/*
		 * Determine required permissions based on GPU fault
		 * access flags.
		 */
		switch (buffer->fault[fi]->access) {
		case 0: /* READ. */
			hmm_flags = HMM_PFN_REQ_FAULT;
			break;
		case 2: /* ATOMIC. */
			atomic = true;
			break;
		case 3: /* PREFETCH. */
			hmm_flags = 0;
			break;
		default:
			hmm_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
			break;
		}

		mm = svmm->notifier.mm;
		if (!mmget_not_zero(mm)) {
			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
			continue;
		}

		notifier.svmm = svmm;
		if (atomic)
			ret = nouveau_atomic_range_fault(svmm, svm->drm,
							 &args.i, sizeof(args),
							 &notifier);
		else
			ret = nouveau_range_fault(svmm, svm->drm, &args.i,
						  sizeof(args), hmm_flags,
						  &notifier);
		mmput(mm);

		limit = args.i.p.addr + args.i.p.size;
		for (fn = fi; ++fn < buffer->fault_nr; ) {
			/* It's okay to skip over duplicate addresses from the
			 * same SVMM as faults are ordered by access type such
			 * that only the first one needs to be handled.
			 *
			 * ie. WRITE faults appear first, thus any handling of
			 * pending READ faults will already be satisfied.
			 * But if a large page is mapped, make sure subsequent
			 * fault addresses have sufficient access permission.
			 */
			if (buffer->fault[fn]->svmm != svmm ||
			    buffer->fault[fn]->addr >= limit ||
			    (buffer->fault[fi]->access == FAULT_ACCESS_READ &&
			     !(args.phys[0] & NVIF_VMM_PFNMAP_V0_V)) ||
			    (buffer->fault[fi]->access != FAULT_ACCESS_READ &&
			     buffer->fault[fi]->access != FAULT_ACCESS_PREFETCH &&
			     !(args.phys[0] & NVIF_VMM_PFNMAP_V0_W)) ||
			    (buffer->fault[fi]->access != FAULT_ACCESS_READ &&
			     buffer->fault[fi]->access != FAULT_ACCESS_WRITE &&
			     buffer->fault[fi]->access != FAULT_ACCESS_PREFETCH &&
			     !(args.phys[0] & NVIF_VMM_PFNMAP_V0_A)))
				break;
		}

		/* If handling failed completely, cancel all faults. */
		if (ret) {
			while (fi < fn) {
				struct nouveau_svm_fault *fault =
					buffer->fault[fi++];

				nouveau_svm_fault_cancel_fault(svm, fault);
			}
		} else
			replay++;
	}

	/* Issue fault replay to the GPU. */
	if (replay)
		nouveau_svm_fault_replay(svm);
}

static int
nouveau_svm_event(struct nvif_event *event, void *argv, u32 argc)
{
	struct nouveau_svm_fault_buffer *buffer = container_of(event, typeof(*buffer), notify);

	schedule_work(&buffer->work);
	return NVIF_EVENT_KEEP;
}

static struct nouveau_pfnmap_args *
nouveau_pfns_to_args(void *pfns)
{
	return container_of(pfns, struct nouveau_pfnmap_args, p.phys);
}

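/* Helpers used by the device-memory (dmem) migration code to map pages on
 * the GPU after migration.  A rough usage sketch (not a verbatim caller):
 *
 *	u64 *pfns = nouveau_pfns_alloc(npages);
 *	// ... fill pfns[i] with NVIF_VMM_PFNMAP_V0_*-encoded addresses ...
 *	nouveau_pfns_map(svmm, mm, addr, pfns, npages);
 *	nouveau_pfns_free(pfns);
 */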
u64 *
nouveau_pfns_alloc(unsigned long npages)
{
	struct nouveau_pfnmap_args *args;

	args = kzalloc(struct_size(args, p.phys, npages), GFP_KERNEL);
	if (!args)
		return NULL;

	args->i.type = NVIF_IOCTL_V0_MTHD;
	args->m.method = NVIF_VMM_V0_PFNMAP;
	args->p.page = PAGE_SHIFT;

	return args->p.phys;
}

void
nouveau_pfns_free(u64 *pfns)
{
	struct nouveau_pfnmap_args *args = nouveau_pfns_to_args(pfns);

	kfree(args);
}

void
nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm,
		 unsigned long addr, u64 *pfns, unsigned long npages)
{
	struct nouveau_pfnmap_args *args = nouveau_pfns_to_args(pfns);
	int ret;

	args->p.addr = addr;
	args->p.size = npages << PAGE_SHIFT;

	mutex_lock(&svmm->mutex);

	ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args,
				struct_size(args, p.phys, npages), NULL);

	mutex_unlock(&svmm->mutex);
}

static void
nouveau_svm_fault_buffer_fini(struct nouveau_svm *svm, int id)
{
	struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id];

	nvif_event_block(&buffer->notify);
	flush_work(&buffer->work);
}

static int
nouveau_svm_fault_buffer_init(struct nouveau_svm *svm, int id)
{
	struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id];
	struct nvif_object *device = &svm->drm->client.device.object;

	buffer->get = nvif_rd32(device, buffer->getaddr);
	buffer->put = nvif_rd32(device, buffer->putaddr);
	SVM_DBG(svm, "get %08x put %08x (init)", buffer->get, buffer->put);

	return nvif_event_allow(&buffer->notify);
}

static void
nouveau_svm_fault_buffer_dtor(struct nouveau_svm *svm, int id)
{
	struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id];
	int i;

	if (!nvif_object_constructed(&buffer->object))
		return;

	nouveau_svm_fault_buffer_fini(svm, id);

	if (buffer->fault) {
		for (i = 0; i < buffer->entries && buffer->fault[i]; i++)
			kfree(buffer->fault[i]);
		kvfree(buffer->fault);
	}

	nvif_event_dtor(&buffer->notify);
	nvif_object_dtor(&buffer->object);
}

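/* Construct fault buffer 'id': allocate the replayable fault buffer object
 * from the requested class, map it, hook up the event handler that kicks
 * the fault worker, allocate the software fault cache and enable the
 * notification.
 */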
static int
nouveau_svm_fault_buffer_ctor(struct nouveau_svm *svm, s32 oclass, int id)
{
	struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id];
	struct nouveau_drm *drm = svm->drm;
	struct nvif_object *device = &drm->client.device.object;
	struct nvif_clb069_v0 args = {};
	int ret;

	buffer->id = id;

	ret = nvif_object_ctor(device, "svmFaultBuffer", 0, oclass, &args,
			       sizeof(args), &buffer->object);
	if (ret < 0) {
		SVM_ERR(svm, "Fault buffer allocation failed: %d", ret);
		return ret;
	}

	nvif_object_map(&buffer->object, NULL, 0);
	buffer->entries = args.entries;
	buffer->getaddr = args.get;
	buffer->putaddr = args.put;
	INIT_WORK(&buffer->work, nouveau_svm_fault);

	ret = nvif_event_ctor(&buffer->object, "svmFault", id, nouveau_svm_event, true, NULL, 0,
			      &buffer->notify);
	if (ret)
		return ret;

	buffer->fault = kvcalloc(buffer->entries, sizeof(*buffer->fault), GFP_KERNEL);
	if (!buffer->fault)
		return -ENOMEM;

	return nouveau_svm_fault_buffer_init(svm, id);
}

void
nouveau_svm_resume(struct nouveau_drm *drm)
{
	struct nouveau_svm *svm = drm->svm;
	if (svm)
		nouveau_svm_fault_buffer_init(svm, 0);
}

void
nouveau_svm_suspend(struct nouveau_drm *drm)
{
	struct nouveau_svm *svm = drm->svm;
	if (svm)
		nouveau_svm_fault_buffer_fini(svm, 0);
}

void
nouveau_svm_fini(struct nouveau_drm *drm)
{
	struct nouveau_svm *svm = drm->svm;
	if (svm) {
		nouveau_svm_fault_buffer_dtor(svm, 0);
		kfree(drm->svm);
		drm->svm = NULL;
	}
}

void
nouveau_svm_init(struct nouveau_drm *drm)
{
	static const struct nvif_mclass buffers[] = {
		{   VOLTA_FAULT_BUFFER_A, 0 },
		{ MAXWELL_FAULT_BUFFER_A, 0 },
		{}
	};
	struct nouveau_svm *svm;
	int ret;

	/* Disable on Volta and newer until channel recovery is fixed,
	 * otherwise clients will have a trivial way to trash the GPU
	 * for everyone.
	 */
	if (drm->client.device.info.family > NV_DEVICE_INFO_V0_PASCAL)
		return;

	if (!(drm->svm = svm = kzalloc(sizeof(*drm->svm), GFP_KERNEL)))
		return;

	drm->svm->drm = drm;
	mutex_init(&drm->svm->mutex);
	INIT_LIST_HEAD(&drm->svm->inst);

	ret = nvif_mclass(&drm->client.device.object, buffers);
	if (ret < 0) {
		SVM_DBG(svm, "No supported fault buffer class");
		nouveau_svm_fini(drm);
		return;
	}

	ret = nouveau_svm_fault_buffer_ctor(svm, buffers[ret].oclass, 0);
	if (ret) {
		nouveau_svm_fini(drm);
		return;
	}

	SVM_DBG(svm, "Initialised");
}