1// SPDX-License-Identifier: GPL-2.0 OR MIT
2/*
3 * Copyright 2020-2021 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24#include <linux/types.h>
25#include <linux/sched/task.h>
26#include <linux/dynamic_debug.h>
27#include <drm/ttm/ttm_tt.h>
28#include <drm/drm_exec.h>
29
30#include "amdgpu_sync.h"
31#include "amdgpu_object.h"
32#include "amdgpu_vm.h"
33#include "amdgpu_hmm.h"
34#include "amdgpu.h"
35#include "amdgpu_xgmi.h"
36#include "kfd_priv.h"
37#include "kfd_svm.h"
38#include "kfd_migrate.h"
39#include "kfd_smi_events.h"
40
41#ifdef dev_fmt
42#undef dev_fmt
43#endif
44#define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__
45
46#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
47
48/* Long enough to ensure no retry fault comes after svm range is restored and
49 * page table is updated.
50 */
51#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING	(2UL * NSEC_PER_MSEC)
52#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG)
53#define dynamic_svm_range_dump(svms) \
54	_dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms)
55#else
56#define dynamic_svm_range_dump(svms) \
57	do { if (0) svm_range_debug_dump(svms); } while (0)
58#endif
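/*
 * The range dump above is compiled out unless dynamic debug is available. With
 * CONFIG_DYNAMIC_DEBUG it can be enabled at runtime, for example with
 * something like the following (sketch; the exact match string depends on the
 * kernel build and dynamic debug descriptor naming):
 *
 *   echo -n 'format "svm_range_dump" +p' > /sys/kernel/debug/dynamic_debug/control
 */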
59
/* Giant svm ranges are split into smaller ranges based on this value. It is
 * the minimum over all dGPUs/APUs of 1/32 of the VRAM size, clamped to between
 * 2MB and 1GB and rounded down to a power of two.
 */
64static uint64_t max_svm_range_pages;
65
66struct criu_svm_metadata {
67	struct list_head list;
68	struct kfd_criu_svm_range_priv_data data;
69};
70
71static void svm_range_evict_svm_bo_worker(struct work_struct *work);
72static bool
73svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
74				    const struct mmu_notifier_range *range,
75				    unsigned long cur_seq);
76static int
77svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
78		   uint64_t *bo_s, uint64_t *bo_l);
79static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
80	.invalidate = svm_range_cpu_invalidate_pagetables,
81};
82
83/**
84 * svm_range_unlink - unlink svm_range from lists and interval tree
85 * @prange: svm range structure to be removed
86 *
87 * Remove the svm_range from the svms and svm_bo lists and the svms
88 * interval tree.
89 *
90 * Context: The caller must hold svms->lock
91 */
92static void svm_range_unlink(struct svm_range *prange)
93{
94	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
95		 prange, prange->start, prange->last);
96
97	if (prange->svm_bo) {
98		spin_lock(&prange->svm_bo->list_lock);
99		list_del(&prange->svm_bo_list);
100		spin_unlock(&prange->svm_bo->list_lock);
101	}
102
103	list_del(&prange->list);
104	if (prange->it_node.start != 0 && prange->it_node.last != 0)
105		interval_tree_remove(&prange->it_node, &prange->svms->objects);
106}
107
108static void
109svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
110{
111	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
112		 prange, prange->start, prange->last);
113
114	mmu_interval_notifier_insert_locked(&prange->notifier, mm,
115				     prange->start << PAGE_SHIFT,
116				     prange->npages << PAGE_SHIFT,
117				     &svm_range_mn_ops);
118}
119
120/**
121 * svm_range_add_to_svms - add svm range to svms
122 * @prange: svm range structure to be added
123 *
124 * Add the svm range to svms interval tree and link list
125 *
126 * Context: The caller must hold svms->lock
127 */
128static void svm_range_add_to_svms(struct svm_range *prange)
129{
130	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
131		 prange, prange->start, prange->last);
132
133	list_move_tail(&prange->list, &prange->svms->list);
134	prange->it_node.start = prange->start;
135	prange->it_node.last = prange->last;
136	interval_tree_insert(&prange->it_node, &prange->svms->objects);
137}
138
139static void svm_range_remove_notifier(struct svm_range *prange)
140{
141	pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
142		 prange->svms, prange,
143		 prange->notifier.interval_tree.start >> PAGE_SHIFT,
144		 prange->notifier.interval_tree.last >> PAGE_SHIFT);
145
146	if (prange->notifier.interval_tree.start != 0 &&
147	    prange->notifier.interval_tree.last != 0)
148		mmu_interval_notifier_remove(&prange->notifier);
149}
150
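/* A prange->dma_addr[] entry is a real CPU DMA mapping only for system memory
 * pages. Entries for VRAM pages carry the SVM_RANGE_VRAM_DOMAIN flag and hold
 * a GPU physical offset instead, so they must not be passed to
 * dma_unmap_page(). This helper filters out VRAM entries as well as zero or
 * errored mappings.
 */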
151static bool
152svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
153{
154	return dma_addr && !dma_mapping_error(dev, dma_addr) &&
155	       !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
156}
157
158static int
159svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
160		      unsigned long offset, unsigned long npages,
161		      unsigned long *hmm_pfns, uint32_t gpuidx)
162{
163	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
164	dma_addr_t *addr = prange->dma_addr[gpuidx];
165	struct device *dev = adev->dev;
166	struct page *page;
167	int i, r;
168
169	if (!addr) {
170		addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL);
171		if (!addr)
172			return -ENOMEM;
173		prange->dma_addr[gpuidx] = addr;
174	}
175
176	addr += offset;
177	for (i = 0; i < npages; i++) {
178		if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
179			dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);
180
181		page = hmm_pfn_to_page(hmm_pfns[i]);
182		if (is_zone_device_page(page)) {
183			struct amdgpu_device *bo_adev = prange->svm_bo->node->adev;
184
185			addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
186				   bo_adev->vm_manager.vram_base_offset -
187				   bo_adev->kfd.pgmap.range.start;
188			addr[i] |= SVM_RANGE_VRAM_DOMAIN;
189			pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]);
190			continue;
191		}
192		addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
193		r = dma_mapping_error(dev, addr[i]);
194		if (r) {
195			dev_err(dev, "failed %d dma_map_page\n", r);
196			return r;
197		}
198		pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
199				     addr[i] >> PAGE_SHIFT, page_to_pfn(page));
200	}
201	return 0;
202}
203
204static int
205svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
206		  unsigned long offset, unsigned long npages,
207		  unsigned long *hmm_pfns)
208{
209	struct kfd_process *p;
210	uint32_t gpuidx;
	int r = 0;
212
213	p = container_of(prange->svms, struct kfd_process, svms);
214
215	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
216		struct kfd_process_device *pdd;
217
218		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
219		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
220		if (!pdd) {
221			pr_debug("failed to find device idx %d\n", gpuidx);
222			return -EINVAL;
223		}
224
225		r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
226					  hmm_pfns, gpuidx);
227		if (r)
228			break;
229	}
230
231	return r;
232}
233
234void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
235			 unsigned long offset, unsigned long npages)
236{
237	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
238	int i;
239
240	if (!dma_addr)
241		return;
242
243	for (i = offset; i < offset + npages; i++) {
244		if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
245			continue;
246		pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
247		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
248		dma_addr[i] = 0;
249	}
250}
251
252void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma)
253{
254	struct kfd_process_device *pdd;
255	dma_addr_t *dma_addr;
256	struct device *dev;
257	struct kfd_process *p;
258	uint32_t gpuidx;
259
260	p = container_of(prange->svms, struct kfd_process, svms);
261
262	for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
263		dma_addr = prange->dma_addr[gpuidx];
264		if (!dma_addr)
265			continue;
266
267		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
268		if (!pdd) {
269			pr_debug("failed to find device idx %d\n", gpuidx);
270			continue;
271		}
272		dev = &pdd->dev->adev->pdev->dev;
273		if (unmap_dma)
274			svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
275		kvfree(dma_addr);
276		prange->dma_addr[gpuidx] = NULL;
277	}
278}
279
280static void svm_range_free(struct svm_range *prange, bool do_unmap)
281{
282	uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;
283	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
284
285	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
286		 prange->start, prange->last);
287
288	svm_range_vram_node_free(prange);
289	svm_range_free_dma_mappings(prange, do_unmap);
290
291	if (do_unmap && !p->xnack_enabled) {
292		pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
293		amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
294					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
295	}
296	mutex_destroy(&prange->lock);
297	mutex_destroy(&prange->migrate_mutex);
298	kfree(prange);
299}
300
301static void
302svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
303				 uint8_t *granularity, uint32_t *flags)
304{
305	*location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
306	*prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
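	/* Default granularity of 9 means 2^9 = 512 pages, i.e. 2MB with 4KB
	 * pages, per fault/migration block.
	 */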
307	*granularity = 9;
308	*flags =
309		KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
310}
311
312static struct
313svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
314			 uint64_t last, bool update_mem_usage)
315{
316	uint64_t size = last - start + 1;
317	struct svm_range *prange;
318	struct kfd_process *p;
319
320	prange = kzalloc(sizeof(*prange), GFP_KERNEL);
321	if (!prange)
322		return NULL;
323
324	p = container_of(svms, struct kfd_process, svms);
325	if (!p->xnack_enabled && update_mem_usage &&
326	    amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
327				    KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) {
328		pr_info("SVM mapping failed, exceeds resident system memory limit\n");
329		kfree(prange);
330		return NULL;
331	}
332	prange->npages = size;
333	prange->svms = svms;
334	prange->start = start;
335	prange->last = last;
336	INIT_LIST_HEAD(&prange->list);
337	INIT_LIST_HEAD(&prange->update_list);
338	INIT_LIST_HEAD(&prange->svm_bo_list);
339	INIT_LIST_HEAD(&prange->deferred_list);
340	INIT_LIST_HEAD(&prange->child_list);
341	atomic_set(&prange->invalid, 0);
342	prange->validate_timestamp = 0;
343	mutex_init(&prange->migrate_mutex);
344	mutex_init(&prange->lock);
345
346	if (p->xnack_enabled)
347		bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
348			    MAX_GPU_INSTANCE);
349
350	svm_range_set_default_attributes(&prange->preferred_loc,
351					 &prange->prefetch_loc,
352					 &prange->granularity, &prange->flags);
353
354	pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
355
356	return prange;
357}
358
359static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
360{
361	if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
362		return false;
363
364	return true;
365}
366
367static void svm_range_bo_release(struct kref *kref)
368{
369	struct svm_range_bo *svm_bo;
370
371	svm_bo = container_of(kref, struct svm_range_bo, kref);
372	pr_debug("svm_bo 0x%p\n", svm_bo);
373
374	spin_lock(&svm_bo->list_lock);
375	while (!list_empty(&svm_bo->range_list)) {
376		struct svm_range *prange =
377				list_first_entry(&svm_bo->range_list,
378						struct svm_range, svm_bo_list);
379		/* list_del_init tells a concurrent svm_range_vram_node_new when
380		 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
381		 */
382		list_del_init(&prange->svm_bo_list);
383		spin_unlock(&svm_bo->list_lock);
384
385		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
386			 prange->start, prange->last);
387		mutex_lock(&prange->lock);
388		prange->svm_bo = NULL;
389		mutex_unlock(&prange->lock);
390
391		spin_lock(&svm_bo->list_lock);
392	}
393	spin_unlock(&svm_bo->list_lock);
394	if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base))
395		/* We're not in the eviction worker. Signal the fence. */
396		dma_fence_signal(&svm_bo->eviction_fence->base);
397	dma_fence_put(&svm_bo->eviction_fence->base);
398	amdgpu_bo_unref(&svm_bo->bo);
399	kfree(svm_bo);
400}
401
402static void svm_range_bo_wq_release(struct work_struct *work)
403{
404	struct svm_range_bo *svm_bo;
405
406	svm_bo = container_of(work, struct svm_range_bo, release_work);
407	svm_range_bo_release(&svm_bo->kref);
408}
409
410static void svm_range_bo_release_async(struct kref *kref)
411{
412	struct svm_range_bo *svm_bo;
413
414	svm_bo = container_of(kref, struct svm_range_bo, kref);
415	pr_debug("svm_bo 0x%p\n", svm_bo);
416	INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
417	schedule_work(&svm_bo->release_work);
418}
419
420void svm_range_bo_unref_async(struct svm_range_bo *svm_bo)
421{
422	kref_put(&svm_bo->kref, svm_range_bo_release_async);
423}
424
425static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
426{
427	if (svm_bo)
428		kref_put(&svm_bo->kref, svm_range_bo_release);
429}
430
431static bool
432svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange)
433{
434	mutex_lock(&prange->lock);
435	if (!prange->svm_bo) {
436		mutex_unlock(&prange->lock);
437		return false;
438	}
439	if (prange->ttm_res) {
440		/* We still have a reference, all is well */
441		mutex_unlock(&prange->lock);
442		return true;
443	}
444	if (svm_bo_ref_unless_zero(prange->svm_bo)) {
445		/*
446		 * Migrate from GPU to GPU, remove range from source svm_bo->node
447		 * range list, and return false to allocate svm_bo from destination
448		 * node.
449		 */
450		if (prange->svm_bo->node != node) {
451			mutex_unlock(&prange->lock);
452
453			spin_lock(&prange->svm_bo->list_lock);
454			list_del_init(&prange->svm_bo_list);
455			spin_unlock(&prange->svm_bo->list_lock);
456
457			svm_range_bo_unref(prange->svm_bo);
458			return false;
459		}
460		if (READ_ONCE(prange->svm_bo->evicting)) {
461			struct dma_fence *f;
462			struct svm_range_bo *svm_bo;
463			/* The BO is getting evicted,
464			 * we need to get a new one
465			 */
466			mutex_unlock(&prange->lock);
467			svm_bo = prange->svm_bo;
468			f = dma_fence_get(&svm_bo->eviction_fence->base);
469			svm_range_bo_unref(prange->svm_bo);
470			/* wait for the fence to avoid long spin-loop
471			 * at list_empty_careful
472			 */
473			dma_fence_wait(f, false);
474			dma_fence_put(f);
475		} else {
476			/* The BO was still around and we got
477			 * a new reference to it
478			 */
479			mutex_unlock(&prange->lock);
480			pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
481				 prange->svms, prange->start, prange->last);
482
483			prange->ttm_res = prange->svm_bo->bo->tbo.resource;
484			return true;
485		}
486
487	} else {
488		mutex_unlock(&prange->lock);
489	}
490
491	/* We need a new svm_bo. Spin-loop to wait for concurrent
492	 * svm_range_bo_release to finish removing this range from
493	 * its range list and set prange->svm_bo to null. After this,
494	 * it is safe to reuse the svm_bo pointer and svm_bo_list head.
495	 */
496	while (!list_empty_careful(&prange->svm_bo_list) || prange->svm_bo)
497		cond_resched();
498
499	return false;
500}
501
502static struct svm_range_bo *svm_range_bo_new(void)
503{
504	struct svm_range_bo *svm_bo;
505
506	svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
507	if (!svm_bo)
508		return NULL;
509
510	kref_init(&svm_bo->kref);
511	INIT_LIST_HEAD(&svm_bo->range_list);
512	spin_lock_init(&svm_bo->list_lock);
513
514	return svm_bo;
515}
516
517int
518svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
519			bool clear)
520{
521	struct amdgpu_bo_param bp;
522	struct svm_range_bo *svm_bo;
523	struct amdgpu_bo_user *ubo;
524	struct amdgpu_bo *bo;
525	struct kfd_process *p;
526	struct mm_struct *mm;
527	int r;
528
529	p = container_of(prange->svms, struct kfd_process, svms);
530	pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
531		 prange->start, prange->last);
532
533	if (svm_range_validate_svm_bo(node, prange))
534		return 0;
535
536	svm_bo = svm_range_bo_new();
537	if (!svm_bo) {
538		pr_debug("failed to alloc svm bo\n");
539		return -ENOMEM;
540	}
541	mm = get_task_mm(p->lead_thread);
542	if (!mm) {
543		pr_debug("failed to get mm\n");
544		kfree(svm_bo);
545		return -ESRCH;
546	}
547	svm_bo->node = node;
548	svm_bo->eviction_fence =
549		amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
550					   mm,
551					   svm_bo);
552	mmput(mm);
553	INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
554	svm_bo->evicting = 0;
555	memset(&bp, 0, sizeof(bp));
556	bp.size = prange->npages * PAGE_SIZE;
557	bp.byte_align = PAGE_SIZE;
558	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
559	bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
560	bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
561	bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
562	bp.type = ttm_bo_type_device;
563	bp.resv = NULL;
564	if (node->xcp)
565		bp.xcp_id_plus1 = node->xcp->id + 1;
566
567	r = amdgpu_bo_create_user(node->adev, &bp, &ubo);
568	if (r) {
569		pr_debug("failed %d to create bo\n", r);
570		goto create_bo_failed;
571	}
572	bo = &ubo->bo;
573
574	pr_debug("alloc bo at offset 0x%lx size 0x%lx on partition %d\n",
575		 bo->tbo.resource->start << PAGE_SHIFT, bp.size,
576		 bp.xcp_id_plus1 - 1);
577
578	r = amdgpu_bo_reserve(bo, true);
579	if (r) {
580		pr_debug("failed %d to reserve bo\n", r);
581		goto reserve_bo_failed;
582	}
583
584	if (clear) {
585		r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
586		if (r) {
587			pr_debug("failed %d to sync bo\n", r);
588			amdgpu_bo_unreserve(bo);
589			goto reserve_bo_failed;
590		}
591	}
592
593	r = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
594	if (r) {
595		pr_debug("failed %d to reserve bo\n", r);
596		amdgpu_bo_unreserve(bo);
597		goto reserve_bo_failed;
598	}
599	amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);
600
601	amdgpu_bo_unreserve(bo);
602
603	svm_bo->bo = bo;
604	prange->svm_bo = svm_bo;
605	prange->ttm_res = bo->tbo.resource;
606	prange->offset = 0;
607
608	spin_lock(&svm_bo->list_lock);
609	list_add(&prange->svm_bo_list, &svm_bo->range_list);
610	spin_unlock(&svm_bo->list_lock);
611
612	return 0;
613
614reserve_bo_failed:
615	amdgpu_bo_unref(&bo);
616create_bo_failed:
617	dma_fence_put(&svm_bo->eviction_fence->base);
618	kfree(svm_bo);
619	prange->ttm_res = NULL;
620
621	return r;
622}
623
624void svm_range_vram_node_free(struct svm_range *prange)
625{
626	/* serialize prange->svm_bo unref */
627	mutex_lock(&prange->lock);
628	/* prange->svm_bo has not been unref */
629	if (prange->ttm_res) {
630		prange->ttm_res = NULL;
631		mutex_unlock(&prange->lock);
632		svm_range_bo_unref(prange->svm_bo);
633	} else
634		mutex_unlock(&prange->lock);
635}
636
637struct kfd_node *
638svm_range_get_node_by_id(struct svm_range *prange, uint32_t gpu_id)
639{
640	struct kfd_process *p;
641	struct kfd_process_device *pdd;
642
643	p = container_of(prange->svms, struct kfd_process, svms);
644	pdd = kfd_process_device_data_by_id(p, gpu_id);
645	if (!pdd) {
646		pr_debug("failed to get kfd process device by id 0x%x\n", gpu_id);
647		return NULL;
648	}
649
650	return pdd->dev;
651}
652
653struct kfd_process_device *
654svm_range_get_pdd_by_node(struct svm_range *prange, struct kfd_node *node)
655{
656	struct kfd_process *p;
657
658	p = container_of(prange->svms, struct kfd_process, svms);
659
660	return kfd_get_process_device_data(node, p);
661}
662
663static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
664{
665	struct ttm_operation_ctx ctx = { false, false };
666
667	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
668
669	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
670}
671
672static int
673svm_range_check_attr(struct kfd_process *p,
674		     uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
675{
676	uint32_t i;
677
678	for (i = 0; i < nattr; i++) {
679		uint32_t val = attrs[i].value;
680		int gpuidx = MAX_GPU_INSTANCE;
681
682		switch (attrs[i].type) {
683		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
684			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
685			    val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
686				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
687			break;
688		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
689			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
690				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
691			break;
692		case KFD_IOCTL_SVM_ATTR_ACCESS:
693		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
694		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
695			gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
696			break;
697		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
698			break;
699		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
700			break;
701		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
702			break;
703		default:
704			pr_debug("unknown attr type 0x%x\n", attrs[i].type);
705			return -EINVAL;
706		}
707
708		if (gpuidx < 0) {
709			pr_debug("no GPU 0x%x found\n", val);
710			return -EINVAL;
711		} else if (gpuidx < MAX_GPU_INSTANCE &&
712			   !test_bit(gpuidx, p->svms.bitmap_supported)) {
713			pr_debug("GPU 0x%x not supported\n", val);
714			return -EINVAL;
715		}
716	}
717
718	return 0;
719}
720
721static void
722svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
723		      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
724		      bool *update_mapping)
725{
726	uint32_t i;
727	int gpuidx;
728
729	for (i = 0; i < nattr; i++) {
730		switch (attrs[i].type) {
731		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
732			prange->preferred_loc = attrs[i].value;
733			break;
734		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
735			prange->prefetch_loc = attrs[i].value;
736			break;
737		case KFD_IOCTL_SVM_ATTR_ACCESS:
738		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
739		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
740			if (!p->xnack_enabled)
741				*update_mapping = true;
742
743			gpuidx = kfd_process_gpuidx_from_gpuid(p,
744							       attrs[i].value);
745			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
746				bitmap_clear(prange->bitmap_access, gpuidx, 1);
747				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
748			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
749				bitmap_set(prange->bitmap_access, gpuidx, 1);
750				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
751			} else {
752				bitmap_clear(prange->bitmap_access, gpuidx, 1);
753				bitmap_set(prange->bitmap_aip, gpuidx, 1);
754			}
755			break;
756		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
757			*update_mapping = true;
758			prange->flags |= attrs[i].value;
759			break;
760		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
761			*update_mapping = true;
762			prange->flags &= ~attrs[i].value;
763			break;
764		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
765			prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F);
766			break;
767		default:
768			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
769		}
770	}
771}
772
773static bool
774svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
775			uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
776{
777	uint32_t i;
778	int gpuidx;
779
780	for (i = 0; i < nattr; i++) {
781		switch (attrs[i].type) {
782		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
783			if (prange->preferred_loc != attrs[i].value)
784				return false;
785			break;
786		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
787			/* Prefetch should always trigger a migration even
788			 * if the value of the attribute didn't change.
789			 */
790			return false;
791		case KFD_IOCTL_SVM_ATTR_ACCESS:
792		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
793		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
794			gpuidx = kfd_process_gpuidx_from_gpuid(p,
795							       attrs[i].value);
796			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
797				if (test_bit(gpuidx, prange->bitmap_access) ||
798				    test_bit(gpuidx, prange->bitmap_aip))
799					return false;
800			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
801				if (!test_bit(gpuidx, prange->bitmap_access))
802					return false;
803			} else {
804				if (!test_bit(gpuidx, prange->bitmap_aip))
805					return false;
806			}
807			break;
808		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
809			if ((prange->flags & attrs[i].value) != attrs[i].value)
810				return false;
811			break;
812		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
813			if ((prange->flags & attrs[i].value) != 0)
814				return false;
815			break;
816		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
817			if (prange->granularity != attrs[i].value)
818				return false;
819			break;
820		default:
821			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
822		}
823	}
824
825	return true;
826}
827
828/**
829 * svm_range_debug_dump - print all range information from svms
830 * @svms: svm range list header
831 *
 * Debug output of each svm range's start, number of pages, end and actual
 * location, from both the svms linked list and the interval tree.
834 *
835 * Context: The caller must hold svms->lock
836 */
837static void svm_range_debug_dump(struct svm_range_list *svms)
838{
839	struct interval_tree_node *node;
840	struct svm_range *prange;
841
842	pr_debug("dump svms 0x%p list\n", svms);
843	pr_debug("range\tstart\tpage\tend\t\tlocation\n");
844
845	list_for_each_entry(prange, &svms->list, list) {
846		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
847			 prange, prange->start, prange->npages,
848			 prange->start + prange->npages - 1,
849			 prange->actual_loc);
850	}
851
852	pr_debug("dump svms 0x%p interval tree\n", svms);
853	pr_debug("range\tstart\tpage\tend\t\tlocation\n");
854	node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
855	while (node) {
856		prange = container_of(node, struct svm_range, it_node);
857		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
858			 prange, prange->start, prange->npages,
859			 prange->start + prange->npages - 1,
860			 prange->actual_loc);
861		node = interval_tree_iter_next(node, 0, ~0ULL);
862	}
863}
864
865static void *
866svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements,
867		     uint64_t offset)
868{
869	unsigned char *dst;
870
871	dst = kvmalloc_array(num_elements, size, GFP_KERNEL);
872	if (!dst)
873		return NULL;
874	memcpy(dst, (unsigned char *)psrc + offset, num_elements * size);
875
876	return (void *)dst;
877}
878
879static int
880svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src)
881{
882	int i;
883
884	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
885		if (!src->dma_addr[i])
886			continue;
887		dst->dma_addr[i] = svm_range_copy_array(src->dma_addr[i],
888					sizeof(*src->dma_addr[i]), src->npages, 0);
889		if (!dst->dma_addr[i])
890			return -ENOMEM;
891	}
892
893	return 0;
894}
895
896static int
897svm_range_split_array(void *ppnew, void *ppold, size_t size,
898		      uint64_t old_start, uint64_t old_n,
899		      uint64_t new_start, uint64_t new_n)
900{
901	unsigned char *new, *old, *pold;
902	uint64_t d;
903
904	if (!ppold)
905		return 0;
906	pold = *(unsigned char **)ppold;
907	if (!pold)
908		return 0;
909
910	d = (new_start - old_start) * size;
911	new = svm_range_copy_array(pold, size, new_n, d);
912	if (!new)
913		return -ENOMEM;
914	d = (new_start == old_start) ? new_n * size : 0;
915	old = svm_range_copy_array(pold, size, old_n, d);
916	if (!old) {
917		kvfree(new);
918		return -ENOMEM;
919	}
920	kvfree(pold);
921	*(void **)ppold = old;
922	*(void **)ppnew = new;
923
924	return 0;
925}
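/* Example for svm_range_split_array(): if the old array covered pages [0..9]
 * and a head of 4 pages is split off (new_start == old_start, new_n = 4,
 * old_n = 6), "new" receives elements 0..3 and "old" is reallocated with
 * elements 4..9. For a tail split, "new" copies from offset
 * (new_start - old_start) and "old" keeps elements 0..old_n-1.
 */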
926
927static int
928svm_range_split_pages(struct svm_range *new, struct svm_range *old,
929		      uint64_t start, uint64_t last)
930{
931	uint64_t npages = last - start + 1;
932	int i, r;
933
934	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
935		r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
936					  sizeof(*old->dma_addr[i]), old->start,
937					  npages, new->start, new->npages);
938		if (r)
939			return r;
940	}
941
942	return 0;
943}
944
945static int
946svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
947		      uint64_t start, uint64_t last)
948{
949	uint64_t npages = last - start + 1;
950
951	pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
952		 new->svms, new, new->start, start, last);
953
954	if (new->start == old->start) {
955		new->offset = old->offset;
956		old->offset += new->npages;
957	} else {
958		new->offset = old->offset + npages;
959	}
960
961	new->svm_bo = svm_range_bo_ref(old->svm_bo);
962	new->ttm_res = old->ttm_res;
963
964	spin_lock(&new->svm_bo->list_lock);
965	list_add(&new->svm_bo_list, &new->svm_bo->range_list);
966	spin_unlock(&new->svm_bo->list_lock);
967
968	return 0;
969}
970
971/**
972 * svm_range_split_adjust - split range and adjust
973 *
974 * @new: new range
975 * @old: the old range
976 * @start: the old range adjust to start address in pages
977 * @last: the old range adjust to last address in pages
978 *
 * Copy the system memory dma_addr or VRAM ttm_res from the old range to the
 * new range, covering new->start for new->npages pages. The remaining old
 * range spans start to last.
982 *
983 * Return:
984 * 0 - OK, -ENOMEM - out of memory
985 */
986static int
987svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
988		      uint64_t start, uint64_t last)
989{
990	int r;
991
992	pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
993		 new->svms, new->start, old->start, old->last, start, last);
994
995	if (new->start < old->start ||
996	    new->last > old->last) {
997		WARN_ONCE(1, "invalid new range start or last\n");
998		return -EINVAL;
999	}
1000
1001	r = svm_range_split_pages(new, old, start, last);
1002	if (r)
1003		return r;
1004
1005	if (old->actual_loc && old->ttm_res) {
1006		r = svm_range_split_nodes(new, old, start, last);
1007		if (r)
1008			return r;
1009	}
1010
1011	old->npages = last - start + 1;
1012	old->start = start;
1013	old->last = last;
1014	new->flags = old->flags;
1015	new->preferred_loc = old->preferred_loc;
1016	new->prefetch_loc = old->prefetch_loc;
1017	new->actual_loc = old->actual_loc;
1018	new->granularity = old->granularity;
1019	new->mapped_to_gpu = old->mapped_to_gpu;
1020	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
1021	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
1022
1023	return 0;
1024}
1025
1026/**
 * svm_range_split - split a range into two ranges
1028 *
1029 * @prange: the svm range to split
1030 * @start: the remaining range start address in pages
1031 * @last: the remaining range last address in pages
1032 * @new: the result new range generated
1033 *
1034 * Two cases only:
1035 * case 1: if start == prange->start
1036 *         prange ==> prange[start, last]
1037 *         new range [last + 1, prange->last]
1038 *
1039 * case 2: if last == prange->last
1040 *         prange ==> prange[start, last]
1041 *         new range [prange->start, start - 1]
1042 *
1043 * Return:
1044 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
1045 */
1046static int
1047svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
1048		struct svm_range **new)
1049{
1050	uint64_t old_start = prange->start;
1051	uint64_t old_last = prange->last;
1052	struct svm_range_list *svms;
1053	int r = 0;
1054
1055	pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
1056		 old_start, old_last, start, last);
1057
1058	if (old_start != start && old_last != last)
1059		return -EINVAL;
1060	if (start < old_start || last > old_last)
1061		return -EINVAL;
1062
1063	svms = prange->svms;
1064	if (old_start == start)
1065		*new = svm_range_new(svms, last + 1, old_last, false);
1066	else
1067		*new = svm_range_new(svms, old_start, start - 1, false);
1068	if (!*new)
1069		return -ENOMEM;
1070
1071	r = svm_range_split_adjust(*new, prange, start, last);
1072	if (r) {
1073		pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
1074			 r, old_start, old_last, start, last);
1075		svm_range_free(*new, false);
1076		*new = NULL;
1077	}
1078
1079	return r;
1080}
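/* Example for svm_range_split(): with prange covering pages [0x1000..0x1fff],
 * svm_range_split(prange, 0x1000, 0x17ff, &new) keeps [0x1000..0x17ff] in
 * prange and returns a new range covering [0x1800..0x1fff] (case 1 above).
 */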
1081
1082static int
1083svm_range_split_tail(struct svm_range *prange,
1084		     uint64_t new_last, struct list_head *insert_list)
1085{
1086	struct svm_range *tail;
1087	int r = svm_range_split(prange, prange->start, new_last, &tail);
1088
1089	if (!r)
1090		list_add(&tail->list, insert_list);
1091	return r;
1092}
1093
1094static int
1095svm_range_split_head(struct svm_range *prange,
1096		     uint64_t new_start, struct list_head *insert_list)
1097{
1098	struct svm_range *head;
1099	int r = svm_range_split(prange, new_start, prange->last, &head);
1100
1101	if (!r)
1102		list_add(&head->list, insert_list);
1103	return r;
1104}
1105
1106static void
1107svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
1108		    struct svm_range *pchild, enum svm_work_list_ops op)
1109{
1110	pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
1111		 pchild, pchild->start, pchild->last, prange, op);
1112
1113	pchild->work_item.mm = mm;
1114	pchild->work_item.op = op;
1115	list_add_tail(&pchild->child_list, &prange->child_list);
1116}
1117
1118/**
 * svm_range_split_by_granularity - trim range to a granularity-aligned block
1120 *
1121 * @p: the process with svms list
1122 * @mm: mm structure
1123 * @addr: the vm fault address in pages, to split the prange
1124 * @parent: parent range if prange is from child list
1125 * @prange: prange to split
1126 *
1127 * Trims @prange to be a single aligned block of prange->granularity if
1128 * possible. The head and tail are added to the child_list in @parent.
1129 *
1130 * Context: caller must hold mmap_read_lock and prange->lock
1131 *
1132 * Return:
1133 * 0 - OK, otherwise error code
1134 */
1135int
1136svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
1137			       unsigned long addr, struct svm_range *parent,
1138			       struct svm_range *prange)
1139{
1140	struct svm_range *head, *tail;
1141	unsigned long start, last, size;
1142	int r;
1143
	/* Align the split range start and size to the granularity size, so that
	 * a single PTE will be used for the whole range. This reduces the number
	 * of PTEs updated and the L1 TLB space used for translation.
	 */
1148	size = 1UL << prange->granularity;
1149	start = ALIGN_DOWN(addr, size);
1150	last = ALIGN(addr + 1, size) - 1;
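	/* Example: with granularity 9 (512 pages / 2MB) and addr 0x12345, the
	 * trimmed block is [0x12200, 0x123ff].
	 */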
1151
1152	pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
1153		 prange->svms, prange->start, prange->last, start, last, size);
1154
1155	if (start > prange->start) {
1156		r = svm_range_split(prange, start, prange->last, &head);
1157		if (r)
1158			return r;
1159		svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
1160	}
1161
1162	if (last < prange->last) {
1163		r = svm_range_split(prange, prange->start, last, &tail);
1164		if (r)
1165			return r;
1166		svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
1167	}
1168
1169	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
1170	if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
1171		prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
1172		pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
1173			 prange, prange->start, prange->last,
1174			 SVM_OP_ADD_RANGE_AND_MAP);
1175	}
1176	return 0;
1177}

static bool
1179svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b)
1180{
1181	return (node_a->adev == node_b->adev ||
1182		amdgpu_xgmi_same_hive(node_a->adev, node_b->adev));
1183}
1184
1185static uint64_t
1186svm_range_get_pte_flags(struct kfd_node *node,
1187			struct svm_range *prange, int domain)
1188{
1189	struct kfd_node *bo_node;
1190	uint32_t flags = prange->flags;
1191	uint32_t mapping_flags = 0;
1192	uint64_t pte_flags;
1193	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
1194	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
1195	bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/
1196	unsigned int mtype_local;
1197
1198	if (domain == SVM_RANGE_VRAM_DOMAIN)
1199		bo_node = prange->svm_bo->node;
1200
1201	switch (node->adev->ip_versions[GC_HWIP][0]) {
1202	case IP_VERSION(9, 4, 1):
1203		if (domain == SVM_RANGE_VRAM_DOMAIN) {
1204			if (bo_node == node) {
1205				mapping_flags |= coherent ?
1206					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1207			} else {
1208				mapping_flags |= coherent ?
1209					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1210				if (svm_nodes_in_same_hive(node, bo_node))
1211					snoop = true;
1212			}
1213		} else {
1214			mapping_flags |= coherent ?
1215				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1216		}
1217		break;
1218	case IP_VERSION(9, 4, 2):
1219		if (domain == SVM_RANGE_VRAM_DOMAIN) {
1220			if (bo_node == node) {
1221				mapping_flags |= coherent ?
1222					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1223				if (node->adev->gmc.xgmi.connected_to_cpu)
1224					snoop = true;
1225			} else {
1226				mapping_flags |= coherent ?
1227					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1228				if (svm_nodes_in_same_hive(node, bo_node))
1229					snoop = true;
1230			}
1231		} else {
1232			mapping_flags |= coherent ?
1233				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1234		}
1235		break;
1236	case IP_VERSION(9, 4, 3):
1237		mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
1238			     (amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW);
1239		snoop = true;
1240		if (uncached) {
1241			mapping_flags |= AMDGPU_VM_MTYPE_UC;
1242		} else if (domain == SVM_RANGE_VRAM_DOMAIN) {
1243			/* local HBM region close to partition */
1244			if (bo_node->adev == node->adev &&
1245			    (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id))
1246				mapping_flags |= mtype_local;
1247			/* local HBM region far from partition or remote XGMI GPU */
1248			else if (svm_nodes_in_same_hive(bo_node, node))
1249				mapping_flags |= AMDGPU_VM_MTYPE_NC;
1250			/* PCIe P2P */
1251			else
1252				mapping_flags |= AMDGPU_VM_MTYPE_UC;
1253		/* system memory accessed by the APU */
1254		} else if (node->adev->flags & AMD_IS_APU) {
1255			/* On NUMA systems, locality is determined per-page
1256			 * in amdgpu_gmc_override_vm_pte_flags
1257			 */
1258			if (num_possible_nodes() <= 1)
1259				mapping_flags |= mtype_local;
1260			else
1261				mapping_flags |= AMDGPU_VM_MTYPE_NC;
1262		/* system memory accessed by the dGPU */
1263		} else {
1264			mapping_flags |= AMDGPU_VM_MTYPE_UC;
1265		}
1266		break;
1267	default:
1268		mapping_flags |= coherent ?
1269			AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1270	}
1271
1272	mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
1273
1274	if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
1275		mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
1276	if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
1277		mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
1278
1279	pte_flags = AMDGPU_PTE_VALID;
1280	pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
1281	pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
1282
1283	pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags);
1284	return pte_flags;
1285}
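/* Illustrative (not exhaustive) result of svm_range_get_pte_flags(): for a
 * system memory mapping on most ASICs the returned flags combine
 * AMDGPU_PTE_VALID | AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED with the
 * readable/writeable/executable and MTYPE bits derived from the mapping_flags
 * above via amdgpu_gem_va_map_flags().
 */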
1286
1287static int
1288svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
1289			 uint64_t start, uint64_t last,
1290			 struct dma_fence **fence)
1291{
1292	uint64_t init_pte_value = 0;
1293
1294	pr_debug("[0x%llx 0x%llx]\n", start, last);
1295
1296	return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start,
1297				      last, init_pte_value, 0, 0, NULL, NULL,
1298				      fence);
1299}
1300
1301static int
1302svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
1303			  unsigned long last, uint32_t trigger)
1304{
1305	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1306	struct kfd_process_device *pdd;
1307	struct dma_fence *fence = NULL;
1308	struct kfd_process *p;
1309	uint32_t gpuidx;
1310	int r = 0;
1311
1312	if (!prange->mapped_to_gpu) {
1313		pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
1314			 prange, prange->start, prange->last);
1315		return 0;
1316	}
1317
1318	if (prange->start == start && prange->last == last) {
1319		pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
1320		prange->mapped_to_gpu = false;
1321	}
1322
1323	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
1324		  MAX_GPU_INSTANCE);
1325	p = container_of(prange->svms, struct kfd_process, svms);
1326
1327	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1328		pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
1329		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1330		if (!pdd) {
1331			pr_debug("failed to find device idx %d\n", gpuidx);
1332			return -EINVAL;
1333		}
1334
1335		kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
1336					     start, last, trigger);
1337
1338		r = svm_range_unmap_from_gpu(pdd->dev->adev,
1339					     drm_priv_to_vm(pdd->drm_priv),
1340					     start, last, &fence);
1341		if (r)
1342			break;
1343
1344		if (fence) {
1345			r = dma_fence_wait(fence, false);
1346			dma_fence_put(fence);
1347			fence = NULL;
1348			if (r)
1349				break;
1350		}
1351		kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT);
1352	}
1353
1354	return r;
1355}
1356
1357static int
1358svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
1359		     unsigned long offset, unsigned long npages, bool readonly,
1360		     dma_addr_t *dma_addr, struct amdgpu_device *bo_adev,
1361		     struct dma_fence **fence, bool flush_tlb)
1362{
1363	struct amdgpu_device *adev = pdd->dev->adev;
1364	struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
1365	uint64_t pte_flags;
1366	unsigned long last_start;
1367	int last_domain;
1368	int r = 0;
1369	int64_t i, j;
1370
1371	last_start = prange->start + offset;
1372
1373	pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
1374		 last_start, last_start + npages - 1, readonly);
1375
1376	for (i = offset; i < offset + npages; i++) {
1377		last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
1378		dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;
1379
1380		/* Collect all pages in the same address range and memory domain
1381		 * that can be mapped with a single call to update mapping.
1382		 */
1383		if (i < offset + npages - 1 &&
1384		    last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
1385			continue;
1386
1387		pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
1388			 last_start, prange->start + i, last_domain ? "GPU" : "CPU");
1389
1390		pte_flags = svm_range_get_pte_flags(pdd->dev, prange, last_domain);
1391		if (readonly)
1392			pte_flags &= ~AMDGPU_PTE_WRITEABLE;
1393
1394		pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
1395			 prange->svms, last_start, prange->start + i,
1396			 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
1397			 pte_flags);
1398
		/* In dGPU mode the same vm_manager allocates VRAM for different
		 * memory partitions based on fpfn/lpfn, so the same
		 * vm_manager.vram_base_offset is used regardless of the memory
		 * partition.
		 */
1403		r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL,
1404					   last_start, prange->start + i,
1405					   pte_flags,
1406					   (last_start - prange->start) << PAGE_SHIFT,
1407					   bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
1408					   NULL, dma_addr, &vm->last_update);
1409
1410		for (j = last_start - prange->start; j <= i; j++)
1411			dma_addr[j] |= last_domain;
1412
1413		if (r) {
1414			pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
1415			goto out;
1416		}
1417		last_start = prange->start + i + 1;
1418	}
1419
1420	r = amdgpu_vm_update_pdes(adev, vm, false);
1421	if (r) {
1422		pr_debug("failed %d to update directories 0x%lx\n", r,
1423			 prange->start);
1424		goto out;
1425	}
1426
1427	if (fence)
1428		*fence = dma_fence_get(vm->last_update);
1429
1430out:
1431	return r;
1432}
1433
1434static int
1435svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
1436		      unsigned long npages, bool readonly,
1437		      unsigned long *bitmap, bool wait, bool flush_tlb)
1438{
1439	struct kfd_process_device *pdd;
1440	struct amdgpu_device *bo_adev = NULL;
1441	struct kfd_process *p;
1442	struct dma_fence *fence = NULL;
1443	uint32_t gpuidx;
1444	int r = 0;
1445
1446	if (prange->svm_bo && prange->ttm_res)
1447		bo_adev = prange->svm_bo->node->adev;
1448
1449	p = container_of(prange->svms, struct kfd_process, svms);
1450	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1451		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
1452		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1453		if (!pdd) {
1454			pr_debug("failed to find device idx %d\n", gpuidx);
1455			return -EINVAL;
1456		}
1457
1458		pdd = kfd_bind_process_to_device(pdd->dev, p);
1459		if (IS_ERR(pdd))
1460			return -EINVAL;
1461
1462		if (bo_adev && pdd->dev->adev != bo_adev &&
1463		    !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
1464			pr_debug("cannot map to device idx %d\n", gpuidx);
1465			continue;
1466		}
1467
1468		r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly,
1469					 prange->dma_addr[gpuidx],
1470					 bo_adev, wait ? &fence : NULL,
1471					 flush_tlb);
1472		if (r)
1473			break;
1474
1475		if (fence) {
1476			r = dma_fence_wait(fence, false);
1477			dma_fence_put(fence);
1478			fence = NULL;
1479			if (r) {
1480				pr_debug("failed %d to dma fence wait\n", r);
1481				break;
1482			}
1483		}
1484
1485		kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
1486	}
1487
1488	return r;
1489}
1490
1491struct svm_validate_context {
1492	struct kfd_process *process;
1493	struct svm_range *prange;
1494	bool intr;
1495	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1496	struct drm_exec exec;
1497};
1498
1499static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr)
1500{
1501	struct kfd_process_device *pdd;
1502	struct amdgpu_vm *vm;
1503	uint32_t gpuidx;
1504	int r;
1505
	drm_exec_init(&ctx->exec, intr ? DRM_EXEC_INTERRUPTIBLE_WAIT : 0);
1507	drm_exec_until_all_locked(&ctx->exec) {
1508		for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1509			pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1510			if (!pdd) {
1511				pr_debug("failed to find device idx %d\n", gpuidx);
1512				r = -EINVAL;
1513				goto unreserve_out;
1514			}
1515			vm = drm_priv_to_vm(pdd->drm_priv);
1516
1517			r = amdgpu_vm_lock_pd(vm, &ctx->exec, 2);
1518			drm_exec_retry_on_contention(&ctx->exec);
1519			if (unlikely(r)) {
1520				pr_debug("failed %d to reserve bo\n", r);
1521				goto unreserve_out;
1522			}
1523		}
1524	}
1525
1526	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1527		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1528		if (!pdd) {
1529			pr_debug("failed to find device idx %d\n", gpuidx);
1530			r = -EINVAL;
1531			goto unreserve_out;
1532		}
1533
1534		r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
1535					      drm_priv_to_vm(pdd->drm_priv),
1536					      svm_range_bo_validate, NULL);
1537		if (r) {
1538			pr_debug("failed %d validate pt bos\n", r);
1539			goto unreserve_out;
1540		}
1541	}
1542
1543	return 0;
1544
1545unreserve_out:
1546	drm_exec_fini(&ctx->exec);
1547	return r;
1548}
1549
1550static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
1551{
1552	drm_exec_fini(&ctx->exec);
1553}
1554
1555static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
1556{
1557	struct kfd_process_device *pdd;
1558
1559	pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1560	if (!pdd)
1561		return NULL;
1562
1563	return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
1564}
1565
1566/*
1567 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
1568 *
1569 * To prevent concurrent destruction or change of range attributes, the
1570 * svm_read_lock must be held. The caller must not hold the svm_write_lock
1571 * because that would block concurrent evictions and lead to deadlocks. To
1572 * serialize concurrent migrations or validations of the same range, the
1573 * prange->migrate_mutex must be held.
1574 *
 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
 * eviction fence).
1577 *
1578 * The following sequence ensures race-free validation and GPU mapping:
1579 *
1580 * 1. Reserve page table (and SVM BO if range is in VRAM)
1581 * 2. hmm_range_fault to get page addresses (if system memory)
1582 * 3. DMA-map pages (if system memory)
1583 * 4-a. Take notifier lock
1584 * 4-b. Check that pages still valid (mmu_interval_read_retry)
1585 * 4-c. Check that the range was not split or otherwise invalidated
1586 * 4-d. Update GPU page table
 * 4-e. Release notifier lock
1588 * 5. Release page table (and SVM BO) reservation
1589 */
1590static int svm_range_validate_and_map(struct mm_struct *mm,
1591				      struct svm_range *prange, int32_t gpuidx,
1592				      bool intr, bool wait, bool flush_tlb)
1593{
1594	struct svm_validate_context *ctx;
1595	unsigned long start, end, addr;
1596	struct kfd_process *p;
1597	void *owner;
1598	int32_t idx;
1599	int r = 0;
1600
1601	ctx = kzalloc(sizeof(struct svm_validate_context), GFP_KERNEL);
1602	if (!ctx)
1603		return -ENOMEM;
1604	ctx->process = container_of(prange->svms, struct kfd_process, svms);
1605	ctx->prange = prange;
1606	ctx->intr = intr;
1607
1608	if (gpuidx < MAX_GPU_INSTANCE) {
1609		bitmap_zero(ctx->bitmap, MAX_GPU_INSTANCE);
1610		bitmap_set(ctx->bitmap, gpuidx, 1);
1611	} else if (ctx->process->xnack_enabled) {
1612		bitmap_copy(ctx->bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
1613
		/* If the range is prefetched to a GPU, or a GPU retry fault
		 * migrates the range to a GPU that has the ACCESS attribute for
		 * the range, create the mapping on that GPU.
		 */
1618		if (prange->actual_loc) {
1619			gpuidx = kfd_process_gpuidx_from_gpuid(ctx->process,
1620							prange->actual_loc);
1621			if (gpuidx < 0) {
				WARN_ONCE(1, "failed to get device by id 0x%x\n",
					 prange->actual_loc);
1624				r = -EINVAL;
1625				goto free_ctx;
1626			}
1627			if (test_bit(gpuidx, prange->bitmap_access))
1628				bitmap_set(ctx->bitmap, gpuidx, 1);
1629		}
1630
1631		/*
		 * If prange is already mapped or has the always-mapped flag,
		 * update the mapping on GPUs with the ACCESS attribute.
1634		 */
1635		if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
1636			if (prange->mapped_to_gpu ||
1637			    prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)
1638				bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE);
1639		}
1640	} else {
1641		bitmap_or(ctx->bitmap, prange->bitmap_access,
1642			  prange->bitmap_aip, MAX_GPU_INSTANCE);
1643	}
1644
1645	if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
1646		r = 0;
1647		goto free_ctx;
1648	}
1649
1650	if (prange->actual_loc && !prange->ttm_res) {
1651		/* This should never happen. actual_loc gets set by
1652		 * svm_migrate_ram_to_vram after allocating a BO.
1653		 */
1654		WARN_ONCE(1, "VRAM BO missing during validation\n");
1655		r = -EINVAL;
1656		goto free_ctx;
1657	}
1658
	r = svm_range_reserve_bos(ctx, intr);
	if (r)
		goto free_ctx;
1660
1661	p = container_of(prange->svms, struct kfd_process, svms);
1662	owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap,
1663						MAX_GPU_INSTANCE));
1664	for_each_set_bit(idx, ctx->bitmap, MAX_GPU_INSTANCE) {
1665		if (kfd_svm_page_owner(p, idx) != owner) {
1666			owner = NULL;
1667			break;
1668		}
1669	}
1670
1671	start = prange->start << PAGE_SHIFT;
1672	end = (prange->last + 1) << PAGE_SHIFT;
1673	for (addr = start; !r && addr < end; ) {
1674		struct hmm_range *hmm_range;
1675		struct vm_area_struct *vma;
1676		unsigned long next = 0;
1677		unsigned long offset;
1678		unsigned long npages;
1679		bool readonly;
1680
1681		vma = vma_lookup(mm, addr);
1682		if (vma) {
1683			readonly = !(vma->vm_flags & VM_WRITE);
1684
1685			next = min(vma->vm_end, end);
1686			npages = (next - addr) >> PAGE_SHIFT;
1687			WRITE_ONCE(p->svms.faulting_task, current);
1688			r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
1689						       readonly, owner, NULL,
1690						       &hmm_range);
1691			WRITE_ONCE(p->svms.faulting_task, NULL);
1692			if (r) {
1693				pr_debug("failed %d to get svm range pages\n", r);
1694				if (r == -EBUSY)
1695					r = -EAGAIN;
1696			}
1697		} else {
1698			r = -EFAULT;
1699		}
1700
1701		if (!r) {
1702			offset = (addr - start) >> PAGE_SHIFT;
1703			r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
1704					      hmm_range->hmm_pfns);
1705			if (r)
1706				pr_debug("failed %d to dma map range\n", r);
1707		}
1708
1709		svm_range_lock(prange);
1710		if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
1711			pr_debug("hmm update the range, need validate again\n");
1712			r = -EAGAIN;
1713		}
1714
1715		if (!r && !list_empty(&prange->child_list)) {
1716			pr_debug("range split by unmap in parallel, validate again\n");
1717			r = -EAGAIN;
1718		}
1719
1720		if (!r)
1721			r = svm_range_map_to_gpus(prange, offset, npages, readonly,
1722						  ctx->bitmap, wait, flush_tlb);
1723
1724		if (!r && next == end)
1725			prange->mapped_to_gpu = true;
1726
1727		svm_range_unlock(prange);
1728
1729		addr = next;
1730	}
1731
1732	svm_range_unreserve_bos(ctx);
1733	if (!r)
1734		prange->validate_timestamp = ktime_get_boottime();
1735
1736free_ctx:
1737	kfree(ctx);
1738
1739	return r;
1740}
1741
1742/**
1743 * svm_range_list_lock_and_flush_work - flush pending deferred work
1744 *
1745 * @svms: the svm range list
1746 * @mm: the mm structure
1747 *
1748 * Context: Returns with mmap write lock held, pending deferred work flushed
1749 *
1750 */
1751void
1752svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
1753				   struct mm_struct *mm)
1754{
1755retry_flush_work:
1756	flush_work(&svms->deferred_list_work);
1757	mmap_write_lock(mm);
1758
1759	if (list_empty(&svms->deferred_range_list))
1760		return;
1761	mmap_write_unlock(mm);
1762	pr_debug("retry flush\n");
1763	goto retry_flush_work;
1764}
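/* Typical calling pattern for svm_range_list_lock_and_flush_work() (sketch,
 * mirroring svm_range_restore_work() below):
 *
 *	svm_range_list_lock_and_flush_work(svms, mm);
 *	mutex_lock(&svms->lock);
 *	... update or validate ranges ...
 *	mutex_unlock(&svms->lock);
 *	mmap_write_unlock(mm);
 */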
1765
1766static void svm_range_restore_work(struct work_struct *work)
1767{
1768	struct delayed_work *dwork = to_delayed_work(work);
1769	struct amdkfd_process_info *process_info;
1770	struct svm_range_list *svms;
1771	struct svm_range *prange;
1772	struct kfd_process *p;
1773	struct mm_struct *mm;
1774	int evicted_ranges;
1775	int invalid;
1776	int r;
1777
1778	svms = container_of(dwork, struct svm_range_list, restore_work);
1779	evicted_ranges = atomic_read(&svms->evicted_ranges);
1780	if (!evicted_ranges)
1781		return;
1782
1783	pr_debug("restore svm ranges\n");
1784
1785	p = container_of(svms, struct kfd_process, svms);
1786	process_info = p->kgd_process_info;
1787
	/* Keep an mm reference while validating and mapping ranges */
1789	mm = get_task_mm(p->lead_thread);
1790	if (!mm) {
1791		pr_debug("svms 0x%p process mm gone\n", svms);
1792		return;
1793	}
1794
1795	mutex_lock(&process_info->lock);
1796	svm_range_list_lock_and_flush_work(svms, mm);
1797	mutex_lock(&svms->lock);
1798
1799	evicted_ranges = atomic_read(&svms->evicted_ranges);
1800
1801	list_for_each_entry(prange, &svms->list, list) {
1802		invalid = atomic_read(&prange->invalid);
1803		if (!invalid)
1804			continue;
1805
1806		pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
1807			 prange->svms, prange, prange->start, prange->last,
1808			 invalid);
1809
		/*
		 * If the range is migrating, wait for the migration to finish.
		 */
1813		mutex_lock(&prange->migrate_mutex);
1814
1815		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
1816					       false, true, false);
1817		if (r)
1818			pr_debug("failed %d to map 0x%lx to gpus\n", r,
1819				 prange->start);
1820
1821		mutex_unlock(&prange->migrate_mutex);
1822		if (r)
1823			goto out_reschedule;
1824
1825		if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
1826			goto out_reschedule;
1827	}
1828
1829	if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
1830	    evicted_ranges)
1831		goto out_reschedule;
1832
1833	evicted_ranges = 0;
1834
1835	r = kgd2kfd_resume_mm(mm);
1836	if (r) {
1837		/* No recovery from this failure. Probably the CP is
1838		 * hanging. No point trying again.
1839		 */
1840		pr_debug("failed %d to resume KFD\n", r);
1841	}
1842
1843	pr_debug("restore svm ranges successfully\n");
1844
1845out_reschedule:
1846	mutex_unlock(&svms->lock);
1847	mmap_write_unlock(mm);
1848	mutex_unlock(&process_info->lock);
1849
1850	/* If validation failed, reschedule another attempt */
1851	if (evicted_ranges) {
1852		pr_debug("reschedule to restore svm range\n");
1853		schedule_delayed_work(&svms->restore_work,
1854			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1855
1856		kfd_smi_event_queue_restore_rescheduled(mm);
1857	}
1858	mmput(mm);
1859}
1860
1861/**
1862 * svm_range_evict - evict svm range
1863 * @prange: svm range structure
1864 * @mm: current process mm_struct
 * @start: first page of the address range being invalidated, in pages
 * @last: last page of the address range being invalidated, in pages
 * @event: mmu notifier event when range is evicted or migrated
 *
 * Stop all queues of the process to ensure GPU doesn't access the memory, then
 * return to let the CPU evict the buffer and proceed with the CPU page table
 * update.
 *
 * No lock is needed to sync the CPU page table invalidation with GPU
 * execution. If an invalidation happens while the restore work is running, the
 * restore work restarts to ensure the latest CPU page mapping is reflected on
 * the GPU before the queues are started.
1876 */
1877static int
1878svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
1879		unsigned long start, unsigned long last,
1880		enum mmu_notifier_event event)
1881{
1882	struct svm_range_list *svms = prange->svms;
1883	struct svm_range *pchild;
1884	struct kfd_process *p;
1885	int r = 0;
1886
1887	p = container_of(svms, struct kfd_process, svms);
1888
1889	pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
1890		 svms, prange->start, prange->last, start, last);
1891
1892	if (!p->xnack_enabled ||
1893	    (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
1894		int evicted_ranges;
1895		bool mapped = prange->mapped_to_gpu;
1896
1897		list_for_each_entry(pchild, &prange->child_list, child_list) {
1898			if (!pchild->mapped_to_gpu)
1899				continue;
1900			mapped = true;
1901			mutex_lock_nested(&pchild->lock, 1);
1902			if (pchild->start <= last && pchild->last >= start) {
1903				pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
1904					 pchild->start, pchild->last);
1905				atomic_inc(&pchild->invalid);
1906			}
1907			mutex_unlock(&pchild->lock);
1908		}
1909
1910		if (!mapped)
1911			return r;
1912
1913		if (prange->start <= last && prange->last >= start)
1914			atomic_inc(&prange->invalid);
1915
1916		evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
1917		if (evicted_ranges != 1)
1918			return r;
1919
1920		pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
1921			 prange->svms, prange->start, prange->last);
1922
1923		/* First eviction, stop the queues */
1924		r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
1925		if (r)
1926			pr_debug("failed to quiesce KFD\n");
1927
1928		pr_debug("schedule to restore svm %p ranges\n", svms);
1929		schedule_delayed_work(&svms->restore_work,
1930			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1931	} else {
1932		unsigned long s, l;
1933		uint32_t trigger;
1934
1935		if (event == MMU_NOTIFY_MIGRATE)
1936			trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE;
1937		else
1938			trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY;
1939
1940		pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
1941			 prange->svms, start, last);
1942		list_for_each_entry(pchild, &prange->child_list, child_list) {
1943			mutex_lock_nested(&pchild->lock, 1);
1944			s = max(start, pchild->start);
1945			l = min(last, pchild->last);
1946			if (l >= s)
1947				svm_range_unmap_from_gpus(pchild, s, l, trigger);
1948			mutex_unlock(&pchild->lock);
1949		}
1950		s = max(start, prange->start);
1951		l = min(last, prange->last);
1952		if (l >= s)
1953			svm_range_unmap_from_gpus(prange, s, l, trigger);
1954	}
1955
1956	return r;
1957}
1958
1959static struct svm_range *svm_range_clone(struct svm_range *old)
1960{
1961	struct svm_range *new;
1962
1963	new = svm_range_new(old->svms, old->start, old->last, false);
1964	if (!new)
1965		return NULL;
1966	if (svm_range_copy_dma_addrs(new, old)) {
1967		svm_range_free(new, false);
1968		return NULL;
1969	}
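	/* A clone of a VRAM-backed range shares the original's BO: take a
	 * reference and link the clone into the BO's range list.
	 */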
1970	if (old->svm_bo) {
1971		new->ttm_res = old->ttm_res;
1972		new->offset = old->offset;
1973		new->svm_bo = svm_range_bo_ref(old->svm_bo);
1974		spin_lock(&new->svm_bo->list_lock);
1975		list_add(&new->svm_bo_list, &new->svm_bo->range_list);
1976		spin_unlock(&new->svm_bo->list_lock);
1977	}
1978	new->flags = old->flags;
1979	new->preferred_loc = old->preferred_loc;
1980	new->prefetch_loc = old->prefetch_loc;
1981	new->actual_loc = old->actual_loc;
1982	new->granularity = old->granularity;
1983	new->mapped_to_gpu = old->mapped_to_gpu;
1984	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
1985	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
1986
1987	return new;
1988}
1989
1990void svm_range_set_max_pages(struct amdgpu_device *adev)
1991{
1992	uint64_t max_pages;
1993	uint64_t pages, _pages;
1994	uint64_t min_pages = 0;
1995	int i, id;
1996
1997	for (i = 0; i < adev->kfd.dev->num_nodes; i++) {
1998		if (adev->kfd.dev->nodes[i]->xcp)
1999			id = adev->kfd.dev->nodes[i]->xcp->id;
2000		else
2001			id = -1;
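		/* 1/32 of the partition's memory size in pages: >> 17 combines
		 * the bytes-to-pages shift (assuming 4KB pages) with the 1/32
		 * factor. Clamp to [2MB, 1GB] worth of pages, then round down
		 * to a power of two.
		 */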
2002		pages = KFD_XCP_MEMORY_SIZE(adev, id) >> 17;
2003		pages = clamp(pages, 1ULL << 9, 1ULL << 18);
2004		pages = rounddown_pow_of_two(pages);
2005		min_pages = min_not_zero(min_pages, pages);
2006	}
2007
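	/* Lock-free update of the global limit: keep the smallest non-zero
	 * value seen so far, retrying if another thread updated
	 * max_svm_range_pages concurrently.
	 */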
2008	do {
2009		max_pages = READ_ONCE(max_svm_range_pages);
2010		_pages = min_not_zero(max_pages, min_pages);
2011	} while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages);
2012}
2013
2014static int
2015svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last,
2016		    uint64_t max_pages, struct list_head *insert_list,
2017		    struct list_head *update_list)
2018{
2019	struct svm_range *prange;
2020	uint64_t l;
2021
2022	pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n",
2023		 max_pages, start, last);
2024
2025	while (last >= start) {
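		/* Clip each new range at the next max_pages aligned boundary,
		 * e.g. with max_pages == 512 a request starting at page 100
		 * first creates [100, 511], then continues from page 512.
		 */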
2026		l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1);
2027
2028		prange = svm_range_new(svms, start, l, true);
2029		if (!prange)
2030			return -ENOMEM;
2031		list_add(&prange->list, insert_list);
2032		list_add(&prange->update_list, update_list);
2033
2034		start = l + 1;
2035	}
2036	return 0;
2037}
2038
2039/**
2040 * svm_range_add - add svm range and handle overlap
 * @p: the process to add the range to
 * @start: range start address, in pages
 * @size: range size, in pages
 * @nattr: number of attributes
 * @attrs: array of attributes
 * @update_list: output, the ranges that need to be validated and have their
 *               GPU mappings updated
 * @insert_list: output, the ranges that need to be inserted into svms
 * @remove_list: output, the ranges that were replaced and need to be removed
 *               from svms
2049 *
 * Check if the virtual address range overlaps any existing ranges, split
 * partly overlapping ranges and add new ranges in the gaps. All changes
 * should be applied to the range_list and interval tree transactionally. If
 * any range split or allocation fails, the entire update fails. Therefore any
 * existing overlapping svm_ranges are cloned and the original svm_ranges are
 * left unchanged.
2056 *
2057 * If the transaction succeeds, the caller can update and insert clones and
2058 * new ranges, then free the originals.
2059 *
2060 * Otherwise the caller can free the clones and new ranges, while the old
2061 * svm_ranges remain unchanged.
2062 *
2063 * Context: Process context, caller must hold svms->lock
2064 *
2065 * Return:
2066 * 0 - OK, otherwise error code
2067 */
2068static int
2069svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
2070	      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
2071	      struct list_head *update_list, struct list_head *insert_list,
2072	      struct list_head *remove_list)
2073{
2074	unsigned long last = start + size - 1UL;
2075	struct svm_range_list *svms = &p->svms;
2076	struct interval_tree_node *node;
2077	struct svm_range *prange;
2078	struct svm_range *tmp;
2079	struct list_head new_list;
2080	int r = 0;
2081
2082	pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);
2083
2084	INIT_LIST_HEAD(update_list);
2085	INIT_LIST_HEAD(insert_list);
2086	INIT_LIST_HEAD(remove_list);
2087	INIT_LIST_HEAD(&new_list);
2088
2089	node = interval_tree_iter_first(&svms->objects, start, last);
2090	while (node) {
2091		struct interval_tree_node *next;
2092		unsigned long next_start;
2093
2094		pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
2095			 node->last);
2096
2097		prange = container_of(node, struct svm_range, it_node);
2098		next = interval_tree_iter_next(node, start, last);
2099		next_start = min(node->last, last) + 1;
2100
2101		if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
2102		    prange->mapped_to_gpu) {
2103			/* nothing to do */
2104		} else if (node->start < start || node->last > last) {
2105			/* node intersects the update range and its attributes
2106			 * will change. Clone and split it, apply updates only
2107			 * to the overlapping part
2108			 */
2109			struct svm_range *old = prange;
2110
2111			prange = svm_range_clone(old);
2112			if (!prange) {
2113				r = -ENOMEM;
2114				goto out;
2115			}
2116
2117			list_add(&old->update_list, remove_list);
2118			list_add(&prange->list, insert_list);
2119			list_add(&prange->update_list, update_list);
2120
2121			if (node->start < start) {
2122				pr_debug("change old range start\n");
2123				r = svm_range_split_head(prange, start,
2124							 insert_list);
2125				if (r)
2126					goto out;
2127			}
2128			if (node->last > last) {
2129				pr_debug("change old range last\n");
2130				r = svm_range_split_tail(prange, last,
2131							 insert_list);
2132				if (r)
2133					goto out;
2134			}
2135		} else {
2136			/* The node is contained within start..last,
2137			 * just update it
2138			 */
2139			list_add(&prange->update_list, update_list);
2140		}
2141
2142		/* insert a new node if needed */
2143		if (node->start > start) {
2144			r = svm_range_split_new(svms, start, node->start - 1,
2145						READ_ONCE(max_svm_range_pages),
2146						&new_list, update_list);
2147			if (r)
2148				goto out;
2149		}
2150
2151		node = next;
2152		start = next_start;
2153	}
2154
2155	/* add a final range at the end if needed */
2156	if (start <= last)
2157		r = svm_range_split_new(svms, start, last,
2158					READ_ONCE(max_svm_range_pages),
2159					&new_list, update_list);
2160
2161out:
2162	if (r) {
2163		list_for_each_entry_safe(prange, tmp, insert_list, list)
2164			svm_range_free(prange, false);
2165		list_for_each_entry_safe(prange, tmp, &new_list, list)
2166			svm_range_free(prange, true);
2167	} else {
2168		list_splice(&new_list, insert_list);
2169	}
2170
2171	return r;
2172}
2173
2174static void
2175svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
2176					    struct svm_range *prange)
2177{
2178	unsigned long start;
2179	unsigned long last;
2180
2181	start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
2182	last = prange->notifier.interval_tree.last >> PAGE_SHIFT;
2183
2184	if (prange->start == start && prange->last == last)
2185		return;
2186
2187	pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
2188		  prange->svms, prange, start, last, prange->start,
2189		  prange->last);
2190
2191	if (start != 0 && last != 0) {
2192		interval_tree_remove(&prange->it_node, &prange->svms->objects);
2193		svm_range_remove_notifier(prange);
2194	}
2195	prange->it_node.start = prange->start;
2196	prange->it_node.last = prange->last;
2197
2198	interval_tree_insert(&prange->it_node, &prange->svms->objects);
2199	svm_range_add_notifier_locked(mm, prange);
2200}
2201
2202static void
2203svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange,
2204			 struct mm_struct *mm)
2205{
2206	switch (prange->work_item.op) {
2207	case SVM_OP_NULL:
2208		pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2209			 svms, prange, prange->start, prange->last);
2210		break;
2211	case SVM_OP_UNMAP_RANGE:
2212		pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2213			 svms, prange, prange->start, prange->last);
2214		svm_range_unlink(prange);
2215		svm_range_remove_notifier(prange);
2216		svm_range_free(prange, true);
2217		break;
2218	case SVM_OP_UPDATE_RANGE_NOTIFIER:
2219		pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2220			 svms, prange, prange->start, prange->last);
2221		svm_range_update_notifier_and_interval_tree(mm, prange);
2222		break;
2223	case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
2224		pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2225			 svms, prange, prange->start, prange->last);
2226		svm_range_update_notifier_and_interval_tree(mm, prange);
2227		/* TODO: implement deferred validation and mapping */
2228		break;
2229	case SVM_OP_ADD_RANGE:
2230		pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
2231			 prange->start, prange->last);
2232		svm_range_add_to_svms(prange);
2233		svm_range_add_notifier_locked(mm, prange);
2234		break;
2235	case SVM_OP_ADD_RANGE_AND_MAP:
2236		pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
2237			 prange, prange->start, prange->last);
2238		svm_range_add_to_svms(prange);
2239		svm_range_add_notifier_locked(mm, prange);
2240		/* TODO: implement deferred validation and mapping */
2241		break;
2242	default:
2243		WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
2244			 prange->work_item.op);
2245	}
2246}
2247
2248static void svm_range_drain_retry_fault(struct svm_range_list *svms)
2249{
2250	struct kfd_process_device *pdd;
2251	struct kfd_process *p;
2252	int drain;
2253	uint32_t i;
2254
2255	p = container_of(svms, struct kfd_process, svms);
2256
2257restart:
2258	drain = atomic_read(&svms->drain_pagefaults);
2259	if (!drain)
2260		return;
2261
2262	for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
2263		pdd = p->pdds[i];
2264		if (!pdd)
2265			continue;
2266
2267		pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
2268
2269		amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
2270				pdd->dev->adev->irq.retry_cam_enabled ?
2271				&pdd->dev->adev->irq.ih :
2272				&pdd->dev->adev->irq.ih1);
2273
2274		if (pdd->dev->adev->irq.retry_cam_enabled)
2275			amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
2276				&pdd->dev->adev->irq.ih_soft);
2277
		pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
2280	}
2281	if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
2282		goto restart;
2283}
2284
2285static void svm_range_deferred_list_work(struct work_struct *work)
2286{
2287	struct svm_range_list *svms;
2288	struct svm_range *prange;
2289	struct mm_struct *mm;
2290
2291	svms = container_of(work, struct svm_range_list, deferred_list_work);
2292	pr_debug("enter svms 0x%p\n", svms);
2293
2294	spin_lock(&svms->deferred_list_lock);
2295	while (!list_empty(&svms->deferred_range_list)) {
2296		prange = list_first_entry(&svms->deferred_range_list,
2297					  struct svm_range, deferred_list);
2298		spin_unlock(&svms->deferred_list_lock);
2299
2300		pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
2301			 prange->start, prange->last, prange->work_item.op);
2302
2303		mm = prange->work_item.mm;
2304retry:
2305		mmap_write_lock(mm);
2306
2307		/* Checking for the need to drain retry faults must be inside
2308		 * mmap write lock to serialize with munmap notifiers.
2309		 */
2310		if (unlikely(atomic_read(&svms->drain_pagefaults))) {
2311			mmap_write_unlock(mm);
2312			svm_range_drain_retry_fault(svms);
2313			goto retry;
2314		}
2315
		/* Removal from deferred_list must happen inside the mmap write
		 * lock, for two race cases:
		 * 1. unmap_from_cpu may change work_item.op and add the range
		 *    to deferred_list again, causing a use-after-free bug.
		 * 2. svm_range_list_lock_and_flush_work may take the mmap write
		 *    lock and continue because deferred_list is empty, while
		 *    the deferred_list work is actually still waiting for the
		 *    mmap lock.
		 */
2324		spin_lock(&svms->deferred_list_lock);
2325		list_del_init(&prange->deferred_list);
2326		spin_unlock(&svms->deferred_list_lock);
2327
2328		mutex_lock(&svms->lock);
2329		mutex_lock(&prange->migrate_mutex);
2330		while (!list_empty(&prange->child_list)) {
2331			struct svm_range *pchild;
2332
2333			pchild = list_first_entry(&prange->child_list,
2334						struct svm_range, child_list);
2335			pr_debug("child prange 0x%p op %d\n", pchild,
2336				 pchild->work_item.op);
2337			list_del_init(&pchild->child_list);
2338			svm_range_handle_list_op(svms, pchild, mm);
2339		}
2340		mutex_unlock(&prange->migrate_mutex);
2341
2342		svm_range_handle_list_op(svms, prange, mm);
2343		mutex_unlock(&svms->lock);
2344		mmap_write_unlock(mm);
2345
		/* Pairs with mmget in svm_range_add_list_work. If this drops
		 * the last mm refcount, the mm is released asynchronously to
		 * avoid circular locking.
		 */
2349		mmput_async(mm);
2350
2351		spin_lock(&svms->deferred_list_lock);
2352	}
2353	spin_unlock(&svms->deferred_list_lock);
2354	pr_debug("exit svms 0x%p\n", svms);
2355}
2356
2357void
2358svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
2359			struct mm_struct *mm, enum svm_work_list_ops op)
2360{
2361	spin_lock(&svms->deferred_list_lock);
2362	/* if prange is on the deferred list */
2363	if (!list_empty(&prange->deferred_list)) {
2364		pr_debug("update exist prange 0x%p work op %d\n", prange, op);
		WARN_ONCE(prange->work_item.mm != mm, "mismatched mm\n");
2366		if (op != SVM_OP_NULL &&
2367		    prange->work_item.op != SVM_OP_UNMAP_RANGE)
2368			prange->work_item.op = op;
2369	} else {
2370		prange->work_item.op = op;
2371
2372		/* Pairs with mmput in deferred_list_work */
2373		mmget(mm);
2374		prange->work_item.mm = mm;
2375		list_add_tail(&prange->deferred_list,
2376			      &prange->svms->deferred_range_list);
2377		pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
2378			 prange, prange->start, prange->last, op);
2379	}
2380	spin_unlock(&svms->deferred_list_lock);
2381}
2382
2383void schedule_deferred_list_work(struct svm_range_list *svms)
2384{
2385	spin_lock(&svms->deferred_list_lock);
2386	if (!list_empty(&svms->deferred_range_list))
2387		schedule_work(&svms->deferred_list_work);
2388	spin_unlock(&svms->deferred_list_lock);
2389}
2390
2391static void
2392svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
2393		      struct svm_range *prange, unsigned long start,
2394		      unsigned long last)
2395{
2396	struct svm_range *head;
2397	struct svm_range *tail;
2398
2399	if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2400		pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
2401			 prange->start, prange->last);
2402		return;
2403	}
2404	if (start > prange->last || last < prange->start)
2405		return;
2406
2407	head = tail = prange;
2408	if (start > prange->start)
2409		svm_range_split(prange, prange->start, start - 1, &tail);
2410	if (last < tail->last)
2411		svm_range_split(tail, last + 1, tail->last, &head);
2412
2413	if (head != prange && tail != prange) {
2414		svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2415		svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
2416	} else if (tail != prange) {
2417		svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
2418	} else if (head != prange) {
2419		svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2420	} else if (parent != prange) {
2421		prange->work_item.op = SVM_OP_UNMAP_RANGE;
2422	}
2423}
2424
2425static void
2426svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
2427			 unsigned long start, unsigned long last)
2428{
2429	uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU;
2430	struct svm_range_list *svms;
2431	struct svm_range *pchild;
2432	struct kfd_process *p;
2433	unsigned long s, l;
2434	bool unmap_parent;
2435
2436	p = kfd_lookup_process_by_mm(mm);
2437	if (!p)
2438		return;
2439	svms = &p->svms;
2440
2441	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
2442		 prange, prange->start, prange->last, start, last);
2443
2444	/* Make sure pending page faults are drained in the deferred worker
2445	 * before the range is freed to avoid straggler interrupts on
2446	 * unmapped memory causing "phantom faults".
2447	 */
2448	atomic_inc(&svms->drain_pagefaults);
2449
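	/* The parent prange is removed only if the unmapped interval covers it
	 * completely; otherwise only its notifier and interval tree entry are
	 * updated after the split below.
	 */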
2450	unmap_parent = start <= prange->start && last >= prange->last;
2451
2452	list_for_each_entry(pchild, &prange->child_list, child_list) {
2453		mutex_lock_nested(&pchild->lock, 1);
2454		s = max(start, pchild->start);
2455		l = min(last, pchild->last);
2456		if (l >= s)
2457			svm_range_unmap_from_gpus(pchild, s, l, trigger);
2458		svm_range_unmap_split(mm, prange, pchild, start, last);
2459		mutex_unlock(&pchild->lock);
2460	}
2461	s = max(start, prange->start);
2462	l = min(last, prange->last);
2463	if (l >= s)
2464		svm_range_unmap_from_gpus(prange, s, l, trigger);
2465	svm_range_unmap_split(mm, prange, prange, start, last);
2466
2467	if (unmap_parent)
2468		svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
2469	else
2470		svm_range_add_list_work(svms, prange, mm,
2471					SVM_OP_UPDATE_RANGE_NOTIFIER);
2472	schedule_deferred_list_work(svms);
2473
2474	kfd_unref_process(p);
2475}
2476
2477/**
2478 * svm_range_cpu_invalidate_pagetables - interval notifier callback
2479 * @mni: mmu_interval_notifier struct
2480 * @range: mmu_notifier_range struct
2481 * @cur_seq: value to pass to mmu_interval_set_seq()
2482 *
 * If the event is MMU_NOTIFY_UNMAP, this is a CPU unmap of the range;
 * otherwise it comes from migration or a CPU page invalidation callback.
 *
 * For an unmap event, unmap the range from the GPUs, remove the prange from
 * svms in a delayed work thread, and split the prange if only part of it is
 * unmapped.
 *
 * For an invalidation event, if GPU retry fault is not enabled, evict the
 * queues, then schedule svm_range_restore_work to update the GPU mapping and
 * resume the queues. If GPU retry fault is enabled, unmap the svm range from
 * the GPU; the retry fault will update the GPU mapping to recover.
2493 *
2494 * Context: mmap lock, notifier_invalidate_start lock are held
2495 *          for invalidate event, prange lock is held if this is from migration
2496 */
2497static bool
2498svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
2499				    const struct mmu_notifier_range *range,
2500				    unsigned long cur_seq)
2501{
2502	struct svm_range *prange;
2503	unsigned long start;
2504	unsigned long last;
2505
2506	if (range->event == MMU_NOTIFY_RELEASE)
2507		return true;
2508	if (!mmget_not_zero(mni->mm))
2509		return true;
2510
2511	start = mni->interval_tree.start;
2512	last = mni->interval_tree.last;
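	/* Clamp the notifier interval to the invalidated range and convert the
	 * byte addresses to page numbers.
	 */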
2513	start = max(start, range->start) >> PAGE_SHIFT;
2514	last = min(last, range->end - 1) >> PAGE_SHIFT;
2515	pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
2516		 start, last, range->start >> PAGE_SHIFT,
2517		 (range->end - 1) >> PAGE_SHIFT,
2518		 mni->interval_tree.start >> PAGE_SHIFT,
2519		 mni->interval_tree.last >> PAGE_SHIFT, range->event);
2520
2521	prange = container_of(mni, struct svm_range, notifier);
2522
2523	svm_range_lock(prange);
2524	mmu_interval_set_seq(mni, cur_seq);
2525
2526	switch (range->event) {
2527	case MMU_NOTIFY_UNMAP:
2528		svm_range_unmap_from_cpu(mni->mm, prange, start, last);
2529		break;
2530	default:
2531		svm_range_evict(prange, mni->mm, start, last, range->event);
2532		break;
2533	}
2534
2535	svm_range_unlock(prange);
2536	mmput(mni->mm);
2537
2538	return true;
2539}
2540
2541/**
2542 * svm_range_from_addr - find svm range from fault address
2543 * @svms: svm range list header
2544 * @addr: address to search range interval tree, in pages
2545 * @parent: parent range if range is on child list
2546 *
2547 * Context: The caller must hold svms->lock
2548 *
2549 * Return: the svm_range found or NULL
2550 */
2551struct svm_range *
2552svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
2553		    struct svm_range **parent)
2554{
2555	struct interval_tree_node *node;
2556	struct svm_range *prange;
2557	struct svm_range *pchild;
2558
2559	node = interval_tree_iter_first(&svms->objects, addr, addr);
2560	if (!node)
2561		return NULL;
2562
2563	prange = container_of(node, struct svm_range, it_node);
2564	pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n",
2565		 addr, prange->start, prange->last, node->start, node->last);
2566
2567	if (addr >= prange->start && addr <= prange->last) {
2568		if (parent)
2569			*parent = prange;
2570		return prange;
2571	}
2572	list_for_each_entry(pchild, &prange->child_list, child_list)
2573		if (addr >= pchild->start && addr <= pchild->last) {
2574			pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n",
2575				 addr, pchild->start, pchild->last);
2576			if (parent)
2577				*parent = prange;
2578			return pchild;
2579		}
2580
2581	return NULL;
2582}
2583
2584/* svm_range_best_restore_location - decide the best fault restore location
2585 * @prange: svm range structure
 * @node: the KFD node on which the vm fault happened
 * @gpuidx: output, the gpu index of the faulting GPU
 *
 * This is only called when xnack is on, to decide the best location to restore
 * the range mapping after a GPU vm fault. The caller uses the best location to
 * migrate the range if the actual location is not the best location, then
 * updates the GPU page table mapping to the best location.
 *
 * If the preferred location is accessible by the faulting GPU, use the
 * preferred location.
 * If the faulting gpu idx is set in the range's ACCESSIBLE bitmap, best_loc is
 * the faulting gpu.
 * If the faulting gpu idx is set in the range's ACCESSIBLE_IN_PLACE bitmap:
 *    if the range's actual location is the cpu, best_loc is the cpu;
 *    if the faulting gpu is in the same XGMI hive as the range's actual
 *    location gpu, best_loc is the range's actual location.
 * Otherwise the faulting GPU has no access and best_loc is -1.
 *
 * Return:
 * -1 if the faulting GPU has no access
 * 0 for CPU, or a GPU id
2604 */
2605static int32_t
2606svm_range_best_restore_location(struct svm_range *prange,
2607				struct kfd_node *node,
2608				int32_t *gpuidx)
2609{
2610	struct kfd_node *bo_node, *preferred_node;
2611	struct kfd_process *p;
2612	uint32_t gpuid;
2613	int r;
2614
2615	p = container_of(prange->svms, struct kfd_process, svms);
2616
2617	r = kfd_process_gpuid_from_node(p, node, &gpuid, gpuidx);
2618	if (r < 0) {
2619		pr_debug("failed to get gpuid from kgd\n");
2620		return -1;
2621	}
2622
2623	if (node->adev->gmc.is_app_apu)
2624		return 0;
2625
2626	if (prange->preferred_loc == gpuid ||
2627	    prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) {
2628		return prange->preferred_loc;
2629	} else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
2630		preferred_node = svm_range_get_node_by_id(prange, prange->preferred_loc);
2631		if (preferred_node && svm_nodes_in_same_hive(node, preferred_node))
2632			return prange->preferred_loc;
2633		/* fall through */
2634	}
2635
2636	if (test_bit(*gpuidx, prange->bitmap_access))
2637		return gpuid;
2638
2639	if (test_bit(*gpuidx, prange->bitmap_aip)) {
2640		if (!prange->actual_loc)
2641			return 0;
2642
2643		bo_node = svm_range_get_node_by_id(prange, prange->actual_loc);
2644		if (bo_node && svm_nodes_in_same_hive(node, bo_node))
2645			return prange->actual_loc;
2646		else
2647			return 0;
2648	}
2649
2650	return -1;
2651}
2652
2653static int
2654svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
2655			       unsigned long *start, unsigned long *last,
2656			       bool *is_heap_stack)
2657{
2658	struct vm_area_struct *vma;
2659	struct interval_tree_node *node;
2660	struct rb_node *rb_node;
2661	unsigned long start_limit, end_limit;
2662
2663	vma = vma_lookup(p->mm, addr << PAGE_SHIFT);
2664	if (!vma) {
2665		pr_debug("VMA does not exist in address [0x%llx]\n", addr);
2666		return -EFAULT;
2667	}
2668
2669	*is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma);
2670
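	/* Limit the new range to a 512-page (2MB, assuming 4KB pages) aligned
	 * window around the fault address, clipped to the VMA boundaries.
	 */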
2671	start_limit = max(vma->vm_start >> PAGE_SHIFT,
2672		      (unsigned long)ALIGN_DOWN(addr, 2UL << 8));
2673	end_limit = min(vma->vm_end >> PAGE_SHIFT,
2674		    (unsigned long)ALIGN(addr + 1, 2UL << 8));
2675	/* First range that starts after the fault address */
2676	node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX);
2677	if (node) {
2678		end_limit = min(end_limit, node->start);
2679		/* Last range that ends before the fault address */
2680		rb_node = rb_prev(&node->rb);
2681	} else {
2682		/* Last range must end before addr because
2683		 * there was no range after addr
2684		 */
2685		rb_node = rb_last(&p->svms.objects.rb_root);
2686	}
2687	if (rb_node) {
2688		node = container_of(rb_node, struct interval_tree_node, rb);
2689		if (node->last >= addr) {
2690			WARN(1, "Overlap with prev node and page fault addr\n");
2691			return -EFAULT;
2692		}
2693		start_limit = max(start_limit, node->last + 1);
2694	}
2695
2696	*start = start_limit;
2697	*last = end_limit - 1;
2698
2699	pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n",
2700		 vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT,
2701		 *start, *last, *is_heap_stack);
2702
2703	return 0;
2704}
2705
2706static int
2707svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
2708			   uint64_t *bo_s, uint64_t *bo_l)
2709{
2710	struct amdgpu_bo_va_mapping *mapping;
2711	struct interval_tree_node *node;
2712	struct amdgpu_bo *bo = NULL;
2713	unsigned long userptr;
2714	uint32_t i;
2715	int r;
2716
2717	for (i = 0; i < p->n_pdds; i++) {
2718		struct amdgpu_vm *vm;
2719
2720		if (!p->pdds[i]->drm_priv)
2721			continue;
2722
2723		vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
2724		r = amdgpu_bo_reserve(vm->root.bo, false);
2725		if (r)
2726			return r;
2727
2728		/* Check userptr by searching entire vm->va interval tree */
2729		node = interval_tree_iter_first(&vm->va, 0, ~0ULL);
2730		while (node) {
2731			mapping = container_of((struct rb_node *)node,
2732					       struct amdgpu_bo_va_mapping, rb);
2733			bo = mapping->bo_va->base.bo;
2734
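			/* Skip BOs whose userptr mapping does not overlap the
			 * requested page range.
			 */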
2735			if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
2736							 start << PAGE_SHIFT,
2737							 last << PAGE_SHIFT,
2738							 &userptr)) {
2739				node = interval_tree_iter_next(node, 0, ~0ULL);
2740				continue;
2741			}
2742
2743			pr_debug("[0x%llx 0x%llx] already userptr mapped\n",
2744				 start, last);
2745			if (bo_s && bo_l) {
2746				*bo_s = userptr >> PAGE_SHIFT;
2747				*bo_l = *bo_s + bo->tbo.ttm->num_pages - 1;
2748			}
2749			amdgpu_bo_unreserve(vm->root.bo);
2750			return -EADDRINUSE;
2751		}
2752		amdgpu_bo_unreserve(vm->root.bo);
2753	}
2754	return 0;
2755}
2756
2757static struct
2758svm_range *svm_range_create_unregistered_range(struct kfd_node *node,
2759						struct kfd_process *p,
2760						struct mm_struct *mm,
2761						int64_t addr)
2762{
2763	struct svm_range *prange = NULL;
2764	unsigned long start, last;
2765	uint32_t gpuid, gpuidx;
2766	bool is_heap_stack;
2767	uint64_t bo_s = 0;
2768	uint64_t bo_l = 0;
2769	int r;
2770
2771	if (svm_range_get_range_boundaries(p, addr, &start, &last,
2772					   &is_heap_stack))
2773		return NULL;
2774
2775	r = svm_range_check_vm(p, start, last, &bo_s, &bo_l);
2776	if (r != -EADDRINUSE)
2777		r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l);
2778
2779	if (r == -EADDRINUSE) {
2780		if (addr >= bo_s && addr <= bo_l)
2781			return NULL;
2782
		/* Create a one-page svm range at the fault address if the
		 * 2MB aligned range overlaps an existing mapping
		 */
2784		start = addr;
2785		last = addr;
2786	}
2787
2788	prange = svm_range_new(&p->svms, start, last, true);
2789	if (!prange) {
2790		pr_debug("Failed to create prange in address [0x%llx]\n", addr);
2791		return NULL;
2792	}
2793	if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) {
2794		pr_debug("failed to get gpuid from kgd\n");
2795		svm_range_free(prange, true);
2796		return NULL;
2797	}
2798
2799	if (is_heap_stack)
2800		prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM;
2801
2802	svm_range_add_to_svms(prange);
2803	svm_range_add_notifier_locked(mm, prange);
2804
2805	return prange;
2806}
2807
2808/* svm_range_skip_recover - decide if prange can be recovered
2809 * @prange: svm range structure
2810 *
 * The GPU vm retry fault handler skips recovering the range in these cases:
 * 1. prange is on the deferred list to be removed after unmap; it is a stale
 *    fault, and the deferred list work will drain stale faults before freeing
 *    the prange.
 * 2. prange is on the deferred list to add the interval notifier after a
 *    split, or
 * 3. prange is a child range split from a parent prange; recover it later,
 *    after the interval notifier is added.
 *
 * Return: true to skip recovery, false to recover
2819 */
2820static bool svm_range_skip_recover(struct svm_range *prange)
2821{
2822	struct svm_range_list *svms = prange->svms;
2823
2824	spin_lock(&svms->deferred_list_lock);
2825	if (list_empty(&prange->deferred_list) &&
2826	    list_empty(&prange->child_list)) {
2827		spin_unlock(&svms->deferred_list_lock);
2828		return false;
2829	}
2830	spin_unlock(&svms->deferred_list_lock);
2831
2832	if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2833		pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
2834			 svms, prange, prange->start, prange->last);
2835		return true;
2836	}
2837	if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
2838	    prange->work_item.op == SVM_OP_ADD_RANGE) {
2839		pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
2840			 svms, prange, prange->start, prange->last);
2841		return true;
2842	}
2843	return false;
2844}
2845
2846static void
2847svm_range_count_fault(struct kfd_node *node, struct kfd_process *p,
2848		      int32_t gpuidx)
2849{
2850	struct kfd_process_device *pdd;
2851
2852	/* fault is on different page of same range
2853	 * or fault is skipped to recover later
2854	 * or fault is on invalid virtual address
2855	 */
2856	if (gpuidx == MAX_GPU_INSTANCE) {
2857		uint32_t gpuid;
2858		int r;
2859
2860		r = kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx);
2861		if (r < 0)
2862			return;
2863	}
2864
	/* The fault is recovered, or the fault cannot be recovered because the
	 * GPU has no access to the range.
	 */
2868	pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2869	if (pdd)
2870		WRITE_ONCE(pdd->faults, pdd->faults + 1);
2871}
2872
2873static bool
2874svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
2875{
2876	unsigned long requested = VM_READ;
2877
2878	if (write_fault)
2879		requested |= VM_WRITE;
2880
2881	pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
2882		vma->vm_flags);
2883	return (vma->vm_flags & requested) == requested;
2884}
2885
2886int
2887svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2888			uint32_t vmid, uint32_t node_id,
2889			uint64_t addr, bool write_fault)
2890{
2891	struct mm_struct *mm = NULL;
2892	struct svm_range_list *svms;
2893	struct svm_range *prange;
2894	struct kfd_process *p;
2895	ktime_t timestamp = ktime_get_boottime();
2896	struct kfd_node *node;
2897	int32_t best_loc;
2898	int32_t gpuidx = MAX_GPU_INSTANCE;
2899	bool write_locked = false;
2900	struct vm_area_struct *vma;
2901	bool migration = false;
2902	int r = 0;
2903
2904	if (!KFD_IS_SVM_API_SUPPORTED(adev)) {
2905		pr_debug("device does not support SVM\n");
2906		return -EFAULT;
2907	}
2908
2909	p = kfd_lookup_process_by_pasid(pasid);
2910	if (!p) {
2911		pr_debug("kfd process not founded pasid 0x%x\n", pasid);
2912		return 0;
2913	}
2914	svms = &p->svms;
2915
2916	pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2917
2918	if (atomic_read(&svms->drain_pagefaults)) {
2919		pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
2920		r = 0;
2921		goto out;
2922	}
2923
2924	if (!p->xnack_enabled) {
2925		pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
2926		r = -EFAULT;
2927		goto out;
2928	}
2929
	/* p->lead_thread is available as kfd_process_wq_release flushes the
	 * work before releasing the task ref.
2932	 */
2933	mm = get_task_mm(p->lead_thread);
2934	if (!mm) {
2935		pr_debug("svms 0x%p failed to get mm\n", svms);
2936		r = 0;
2937		goto out;
2938	}
2939
2940	node = kfd_node_by_irq_ids(adev, node_id, vmid);
2941	if (!node) {
2942		pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
2943			 vmid);
2944		r = -EFAULT;
2945		goto out;
2946	}
2947	mmap_read_lock(mm);
2948retry_write_locked:
2949	mutex_lock(&svms->lock);
2950	prange = svm_range_from_addr(svms, addr, NULL);
2951	if (!prange) {
2952		pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
2953			 svms, addr);
2954		if (!write_locked) {
2955			/* Need the write lock to create new range with MMU notifier.
2956			 * Also flush pending deferred work to make sure the interval
2957			 * tree is up to date before we add a new range
2958			 */
2959			mutex_unlock(&svms->lock);
2960			mmap_read_unlock(mm);
2961			mmap_write_lock(mm);
2962			write_locked = true;
2963			goto retry_write_locked;
2964		}
2965		prange = svm_range_create_unregistered_range(node, p, mm, addr);
2966		if (!prange) {
2967			pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
2968				 svms, addr);
2969			mmap_write_downgrade(mm);
2970			r = -EFAULT;
2971			goto out_unlock_svms;
2972		}
2973	}
2974	if (write_locked)
2975		mmap_write_downgrade(mm);
2976
2977	mutex_lock(&prange->migrate_mutex);
2978
2979	if (svm_range_skip_recover(prange)) {
2980		amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid);
2981		r = 0;
2982		goto out_unlock_range;
2983	}
2984
2985	/* skip duplicate vm fault on different pages of same range */
2986	if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
2987				AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
2988		pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
2989			 svms, prange->start, prange->last);
2990		r = 0;
2991		goto out_unlock_range;
2992	}
2993
	/* The VMA was removed by __do_munmap; return success as we are
	 * handling a stale retry fault.
2996	 */
2997	vma = vma_lookup(mm, addr << PAGE_SHIFT);
2998	if (!vma) {
2999		pr_debug("address 0x%llx VMA is removed\n", addr);
3000		r = 0;
3001		goto out_unlock_range;
3002	}
3003
3004	if (!svm_fault_allowed(vma, write_fault)) {
3005		pr_debug("fault addr 0x%llx no %s permission\n", addr,
3006			write_fault ? "write" : "read");
3007		r = -EPERM;
3008		goto out_unlock_range;
3009	}
3010
3011	best_loc = svm_range_best_restore_location(prange, node, &gpuidx);
3012	if (best_loc == -1) {
3013		pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
3014			 svms, prange->start, prange->last);
3015		r = -EACCES;
3016		goto out_unlock_range;
3017	}
3018
3019	pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
3020		 svms, prange->start, prange->last, best_loc,
3021		 prange->actual_loc);
3022
3023	kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr,
3024				       write_fault, timestamp);
3025
3026	if (prange->actual_loc != best_loc) {
3027		migration = true;
3028		if (best_loc) {
3029			r = svm_migrate_to_vram(prange, best_loc, mm,
3030					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
3031			if (r) {
3032				pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
3033					 r, addr);
3034				/* Fallback to system memory if migration to
3035				 * VRAM failed
3036				 */
3037				if (prange->actual_loc)
3038					r = svm_migrate_vram_to_ram(prange, mm,
3039					   KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
3040					   NULL);
3041				else
3042					r = 0;
3043			}
3044		} else {
3045			r = svm_migrate_vram_to_ram(prange, mm,
3046					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
3047					NULL);
3048		}
3049		if (r) {
3050			pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
3051				 r, svms, prange->start, prange->last);
3052			goto out_unlock_range;
3053		}
3054	}
3055
3056	r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false);
3057	if (r)
3058		pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
3059			 r, svms, prange->start, prange->last);
3060
3061	kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr,
3062				     migration);
3063
3064out_unlock_range:
3065	mutex_unlock(&prange->migrate_mutex);
3066out_unlock_svms:
3067	mutex_unlock(&svms->lock);
3068	mmap_read_unlock(mm);
3069
3070	svm_range_count_fault(node, p, gpuidx);
3071
3072	mmput(mm);
3073out:
3074	kfd_unref_process(p);
3075
3076	if (r == -EAGAIN) {
3077		pr_debug("recover vm fault later\n");
3078		amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid);
3079		r = 0;
3080	}
3081	return r;
3082}
3083
3084int
3085svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled)
3086{
3087	struct svm_range *prange, *pchild;
3088	uint64_t reserved_size = 0;
3089	uint64_t size;
3090	int r = 0;
3091
3092	pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, xnack_enabled);
3093
3094	mutex_lock(&p->svms.lock);
3095
3096	list_for_each_entry(prange, &p->svms.list, list) {
3097		svm_range_lock(prange);
3098		list_for_each_entry(pchild, &prange->child_list, child_list) {
3099			size = (pchild->last - pchild->start + 1) << PAGE_SHIFT;
3100			if (xnack_enabled) {
3101				amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
3102					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3103			} else {
3104				r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
3105					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3106				if (r)
3107					goto out_unlock;
3108				reserved_size += size;
3109			}
3110		}
3111
3112		size = (prange->last - prange->start + 1) << PAGE_SHIFT;
3113		if (xnack_enabled) {
3114			amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
3115					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3116		} else {
3117			r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
3118					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3119			if (r)
3120				goto out_unlock;
3121			reserved_size += size;
3122		}
3123out_unlock:
3124		svm_range_unlock(prange);
3125		if (r)
3126			break;
3127	}
3128
3129	if (r)
3130		amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size,
3131					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3132	else
		/* Changing the xnack mode must be done inside the svms lock, to
		 * avoid racing with svm_range_deferred_list_work unreserving
		 * memory in parallel.
3135		 */
3136		p->xnack_enabled = xnack_enabled;
3137
3138	mutex_unlock(&p->svms.lock);
3139	return r;
3140}
3141
3142void svm_range_list_fini(struct kfd_process *p)
3143{
3144	struct svm_range *prange;
3145	struct svm_range *next;
3146
3147	pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
3148
3149	cancel_delayed_work_sync(&p->svms.restore_work);
3150
3151	/* Ensure list work is finished before process is destroyed */
3152	flush_work(&p->svms.deferred_list_work);
3153
3154	/*
	 * Ensure no retry fault comes in afterwards, as the page fault handler
	 * will no longer be able to find the kfd process or take the mm lock
	 * to recover the fault.
3157	 */
3158	atomic_inc(&p->svms.drain_pagefaults);
3159	svm_range_drain_retry_fault(&p->svms);
3160
3161	list_for_each_entry_safe(prange, next, &p->svms.list, list) {
3162		svm_range_unlink(prange);
3163		svm_range_remove_notifier(prange);
3164		svm_range_free(prange, true);
3165	}
3166
3167	mutex_destroy(&p->svms.lock);
3168
3169	pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
3170}
3171
3172int svm_range_list_init(struct kfd_process *p)
3173{
3174	struct svm_range_list *svms = &p->svms;
3175	int i;
3176
3177	svms->objects = RB_ROOT_CACHED;
3178	mutex_init(&svms->lock);
3179	INIT_LIST_HEAD(&svms->list);
3180	atomic_set(&svms->evicted_ranges, 0);
3181	atomic_set(&svms->drain_pagefaults, 0);
3182	INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
3183	INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
3184	INIT_LIST_HEAD(&svms->deferred_range_list);
3185	INIT_LIST_HEAD(&svms->criu_svm_metadata_list);
3186	spin_lock_init(&svms->deferred_list_lock);
3187
3188	for (i = 0; i < p->n_pdds; i++)
3189		if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev->adev))
3190			bitmap_set(svms->bitmap_supported, i, 1);
3191
3192	return 0;
3193}
3194
3195/**
3196 * svm_range_check_vm - check if virtual address range mapped already
3197 * @p: current kfd_process
3198 * @start: range start address, in pages
3199 * @last: range last address, in pages
3200 * @bo_s: mapping start address in pages if address range already mapped
3201 * @bo_l: mapping last address in pages if address range already mapped
3202 *
 * The purpose is to avoid virtual address ranges already allocated by the
 * kfd_ioctl_alloc_memory_of_gpu ioctl.
 * It checks each pdd in the kfd_process.
3206 *
3207 * Context: Process context
3208 *
3209 * Return 0 - OK, if the range is not mapped.
3210 * Otherwise error code:
3211 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu
3212 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by
3213 * a signal. Release all buffer reservations and return to user-space.
3214 */
3215static int
3216svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
3217		   uint64_t *bo_s, uint64_t *bo_l)
3218{
3219	struct amdgpu_bo_va_mapping *mapping;
3220	struct interval_tree_node *node;
3221	uint32_t i;
3222	int r;
3223
3224	for (i = 0; i < p->n_pdds; i++) {
3225		struct amdgpu_vm *vm;
3226
3227		if (!p->pdds[i]->drm_priv)
3228			continue;
3229
3230		vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
3231		r = amdgpu_bo_reserve(vm->root.bo, false);
3232		if (r)
3233			return r;
3234
3235		node = interval_tree_iter_first(&vm->va, start, last);
3236		if (node) {
3237			pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
3238				 start, last);
3239			mapping = container_of((struct rb_node *)node,
3240					       struct amdgpu_bo_va_mapping, rb);
3241			if (bo_s && bo_l) {
3242				*bo_s = mapping->start;
3243				*bo_l = mapping->last;
3244			}
3245			amdgpu_bo_unreserve(vm->root.bo);
3246			return -EADDRINUSE;
3247		}
3248		amdgpu_bo_unreserve(vm->root.bo);
3249	}
3250
3251	return 0;
3252}
3253
3254/**
3255 * svm_range_is_valid - check if virtual address range is valid
3256 * @p: current kfd_process
3257 * @start: range start address, in pages
3258 * @size: range size, in pages
3259 *
3260 * Valid virtual address range means it belongs to one or more VMAs
3261 *
3262 * Context: Process context
3263 *
3264 * Return:
3265 *  0 - OK, otherwise error code
3266 */
3267static int
3268svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
3269{
3270	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
3271	struct vm_area_struct *vma;
3272	unsigned long end;
3273	unsigned long start_unchg = start;
3274
3275	start <<= PAGE_SHIFT;
3276	end = start + (size << PAGE_SHIFT);
3277	do {
3278		vma = vma_lookup(p->mm, start);
3279		if (!vma || (vma->vm_flags & device_vma))
3280			return -EFAULT;
3281		start = min(end, vma->vm_end);
3282	} while (start < end);
3283
3284	return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
3285				  NULL);
3286}
3287
3288/**
3289 * svm_range_best_prefetch_location - decide the best prefetch location
3290 * @prange: svm range structure
3291 *
 * For xnack off:
 * If the range maps to a single GPU, the best prefetch location is
 * prefetch_loc, which can be CPU or GPU.
 *
 * If the range is ACCESS or ACCESS_IN_PLACE by mGPUs, the best prefetch
 * location is the prefetch_loc GPU only if the mGPUs are connected in the same
 * XGMI hive; otherwise the best prefetch location is always the CPU, because a
 * GPU cannot have a coherent mapping of another GPU's VRAM, even with a
 * large-BAR PCIe connection.
 *
 * For xnack on:
 * If the range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
 * prefetch_loc; access from another GPU will generate a vm fault and trigger
 * migration.
 *
 * If the range is ACCESS_IN_PLACE by mGPUs, the best prefetch location is the
 * prefetch_loc GPU only if the mGPUs are connected in the same XGMI hive;
 * otherwise the best prefetch location is always the CPU.
3308 *
3309 * Context: Process context
3310 *
3311 * Return:
3312 * 0 for CPU or GPU id
3313 */
3314static uint32_t
3315svm_range_best_prefetch_location(struct svm_range *prange)
3316{
3317	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
3318	uint32_t best_loc = prange->prefetch_loc;
3319	struct kfd_process_device *pdd;
3320	struct kfd_node *bo_node;
3321	struct kfd_process *p;
3322	uint32_t gpuidx;
3323
3324	p = container_of(prange->svms, struct kfd_process, svms);
3325
3326	if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
3327		goto out;
3328
3329	bo_node = svm_range_get_node_by_id(prange, best_loc);
3330	if (!bo_node) {
3331		WARN_ONCE(1, "failed to get valid kfd node at id%x\n", best_loc);
3332		best_loc = 0;
3333		goto out;
3334	}
3335
3336	if (bo_node->adev->gmc.is_app_apu) {
3337		best_loc = 0;
3338		goto out;
3339	}
3340
3341	if (p->xnack_enabled)
3342		bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
3343	else
3344		bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
3345			  MAX_GPU_INSTANCE);
3346
3347	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
3348		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
3349		if (!pdd) {
3350			pr_debug("failed to get device by idx 0x%x\n", gpuidx);
3351			continue;
3352		}
3353
3354		if (pdd->dev->adev == bo_node->adev)
3355			continue;
3356
3357		if (!svm_nodes_in_same_hive(pdd->dev, bo_node)) {
3358			best_loc = 0;
3359			break;
3360		}
3361	}
3362
3363out:
3364	pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
3365		 p->xnack_enabled, &p->svms, prange->start, prange->last,
3366		 best_loc);
3367
3368	return best_loc;
3369}
3370
3371/* svm_range_trigger_migration - start page migration if prefetch loc changed
3372 * @mm: current process mm_struct
3373 * @prange: svm range structure
3374 * @migrated: output, true if migration is triggered
3375 *
 * If the range prefetch_loc is a GPU and the actual location is cpu 0, migrate
 * the range from ram to vram.
 * If the range prefetch_loc is cpu 0 and the actual location is a GPU, migrate
 * the range from vram to ram.
 *
 * If GPU vm fault retry is not enabled, migration interacts with the MMU
 * notifier and the restore work:
 * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
 *    svm_range_evict stops all queues and schedules the restore work
 * 2. svm_range_restore_work waits for the migration to finish because
 *    a. svm_range_validate_vram takes prange->migrate_mutex
 *    b. svm_range_validate_ram HMM get pages waits until the CPU fault
 *       handler returns
 * 3. the restore work updates the GPU mappings and resumes all queues.
3389 *
3390 * Context: Process context
3391 *
3392 * Return:
3393 * 0 - OK, otherwise - error code of migration
3394 */
3395static int
3396svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
3397			    bool *migrated)
3398{
3399	uint32_t best_loc;
3400	int r = 0;
3401
3402	*migrated = false;
3403	best_loc = svm_range_best_prefetch_location(prange);
3404
3405	if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3406	    best_loc == prange->actual_loc)
3407		return 0;
3408
3409	if (!best_loc) {
3410		r = svm_migrate_vram_to_ram(prange, mm,
3411					KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
3412		*migrated = !r;
3413		return r;
3414	}
3415
3416	r = svm_migrate_to_vram(prange, best_loc, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
3417	*migrated = !r;
3418
3419	return r;
3420}
3421
3422int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
3423{
3424	/* Dereferencing fence->svm_bo is safe here because the fence hasn't
3425	 * signaled yet and we're under the protection of the fence->lock.
3426	 * After the fence is signaled in svm_range_bo_release, we cannot get
3427	 * here any more.
3428	 *
3429	 * Reference is dropped in svm_range_evict_svm_bo_worker.
3430	 */
3431	if (svm_bo_ref_unless_zero(fence->svm_bo)) {
3432		WRITE_ONCE(fence->svm_bo->evicting, 1);
3433		schedule_work(&fence->svm_bo->eviction_work);
3434	}
3435
3436	return 0;
3437}
3438
3439static void svm_range_evict_svm_bo_worker(struct work_struct *work)
3440{
3441	struct svm_range_bo *svm_bo;
3442	struct mm_struct *mm;
3443	int r = 0;
3444
3445	svm_bo = container_of(work, struct svm_range_bo, eviction_work);
3446
3447	if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
3448		mm = svm_bo->eviction_fence->mm;
3449	} else {
3450		svm_range_bo_unref(svm_bo);
3451		return;
3452	}
3453
3454	mmap_read_lock(mm);
3455	spin_lock(&svm_bo->list_lock);
3456	while (!list_empty(&svm_bo->range_list) && !r) {
3457		struct svm_range *prange =
3458				list_first_entry(&svm_bo->range_list,
3459						struct svm_range, svm_bo_list);
3460		int retries = 3;
3461
3462		list_del_init(&prange->svm_bo_list);
3463		spin_unlock(&svm_bo->list_lock);
3464
3465		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
3466			 prange->start, prange->last);
3467
3468		mutex_lock(&prange->migrate_mutex);
3469		do {
3470			r = svm_migrate_vram_to_ram(prange, mm,
3471					KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
3472		} while (!r && prange->actual_loc && --retries);
3473
3474		if (!r && prange->actual_loc)
3475			pr_info_once("Migration failed during eviction");
3476
3477		if (!prange->actual_loc) {
3478			mutex_lock(&prange->lock);
3479			prange->svm_bo = NULL;
3480			mutex_unlock(&prange->lock);
3481		}
3482		mutex_unlock(&prange->migrate_mutex);
3483
3484		spin_lock(&svm_bo->list_lock);
3485	}
3486	spin_unlock(&svm_bo->list_lock);
3487	mmap_read_unlock(mm);
3488	mmput(mm);
3489
3490	dma_fence_signal(&svm_bo->eviction_fence->base);
3491
3492	/* This is the last reference to svm_bo, after svm_range_vram_node_free
3493	 * has been called in svm_migrate_vram_to_ram
3494	 */
3495	WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
3496	svm_range_bo_unref(svm_bo);
3497}
3498
3499static int
3500svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
3501		   uint64_t start, uint64_t size, uint32_t nattr,
3502		   struct kfd_ioctl_svm_attribute *attrs)
3503{
3504	struct amdkfd_process_info *process_info = p->kgd_process_info;
3505	struct list_head update_list;
3506	struct list_head insert_list;
3507	struct list_head remove_list;
3508	struct svm_range_list *svms;
3509	struct svm_range *prange;
3510	struct svm_range *next;
3511	bool update_mapping = false;
3512	bool flush_tlb;
3513	int r, ret = 0;
3514
3515	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
3516		 p->pasid, &p->svms, start, start + size - 1, size);
3517
3518	r = svm_range_check_attr(p, nattr, attrs);
3519	if (r)
3520		return r;
3521
3522	svms = &p->svms;
3523
3524	mutex_lock(&process_info->lock);
3525
3526	svm_range_list_lock_and_flush_work(svms, mm);
3527
3528	r = svm_range_is_valid(p, start, size);
3529	if (r) {
3530		pr_debug("invalid range r=%d\n", r);
3531		mmap_write_unlock(mm);
3532		goto out;
3533	}
3534
3535	mutex_lock(&svms->lock);
3536
3537	/* Add new range and split existing ranges as needed */
3538	r = svm_range_add(p, start, size, nattr, attrs, &update_list,
3539			  &insert_list, &remove_list);
3540	if (r) {
3541		mutex_unlock(&svms->lock);
3542		mmap_write_unlock(mm);
3543		goto out;
3544	}
3545	/* Apply changes as a transaction */
3546	list_for_each_entry_safe(prange, next, &insert_list, list) {
3547		svm_range_add_to_svms(prange);
3548		svm_range_add_notifier_locked(mm, prange);
3549	}
3550	list_for_each_entry(prange, &update_list, update_list) {
3551		svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping);
3552		/* TODO: unmap ranges from GPU that lost access */
3553	}
3554	list_for_each_entry_safe(prange, next, &remove_list, update_list) {
3555		pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
3556			 prange->svms, prange, prange->start,
3557			 prange->last);
3558		svm_range_unlink(prange);
3559		svm_range_remove_notifier(prange);
3560		svm_range_free(prange, false);
3561	}
3562
3563	mmap_write_downgrade(mm);
3564	/* Trigger migrations and revalidate and map to GPUs as needed. If
3565	 * this fails we may be left with partially completed actions. There
3566	 * is no clean way of rolling back to the previous state in such a
3567	 * case because the rollback wouldn't be guaranteed to work either.
3568	 */
3569	list_for_each_entry(prange, &update_list, update_list) {
3570		bool migrated;
3571
3572		mutex_lock(&prange->migrate_mutex);
3573
3574		r = svm_range_trigger_migration(mm, prange, &migrated);
3575		if (r)
3576			goto out_unlock_range;
3577
3578		if (migrated && (!p->xnack_enabled ||
3579		    (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) &&
3580		    prange->mapped_to_gpu) {
3581			pr_debug("restore_work will update mappings of GPUs\n");
3582			mutex_unlock(&prange->migrate_mutex);
3583			continue;
3584		}
3585
3586		if (!migrated && !update_mapping) {
3587			mutex_unlock(&prange->migrate_mutex);
3588			continue;
3589		}
3590
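		/* Flush GPU TLBs only when an existing GPU mapping is updated
		 * in place, i.e. attributes changed without a migration.
		 */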
3591		flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
3592
3593		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
3594					       true, true, flush_tlb);
3595		if (r)
3596			pr_debug("failed %d to map svm range\n", r);
3597
3598out_unlock_range:
3599		mutex_unlock(&prange->migrate_mutex);
3600		if (r)
3601			ret = r;
3602	}
3603
3604	dynamic_svm_range_dump(svms);
3605
3606	mutex_unlock(&svms->lock);
3607	mmap_read_unlock(mm);
3608out:
3609	mutex_unlock(&process_info->lock);
3610
3611	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
3612		 &p->svms, start, start + size - 1, r);
3613
3614	return ret ? ret : r;
3615}
3616
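/**
 * svm_range_get_attr - query attributes over a range of the SVM address space
 * @p: the process to query
 * @mm: the mm_struct of the process
 * @start: start of the range (in pages)
 * @size: size of the range (in pages)
 * @nattr: number of attributes in @attrs
 * @attrs: array of attributes to query, values are filled in on return
 *
 * Combine the attributes of all svm_ranges overlapping the queried range:
 * preferred and prefetch locations are reported only if they agree across
 * all overlapping ranges, accessibility is the intersection of the per-range
 * bitmaps, flags are accumulated with AND/OR, and the smallest granularity
 * is reported. If no range overlaps, default attribute values are returned.
 * For KFD_IOCTL_SVM_ATTR_ACCESS queries the attribute type is updated to
 * reflect the accessibility of the given GPU.
 *
 * Context: Takes the mmap read lock and svms->lock.
 *
 * Return: 0 on success, negative errno on failure.
 */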
3617static int
3618svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
3619		   uint64_t start, uint64_t size, uint32_t nattr,
3620		   struct kfd_ioctl_svm_attribute *attrs)
3621{
3622	DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
3623	DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
3624	bool get_preferred_loc = false;
3625	bool get_prefetch_loc = false;
3626	bool get_granularity = false;
3627	bool get_accessible = false;
3628	bool get_flags = false;
3629	uint64_t last = start + size - 1UL;
3630	uint8_t granularity = 0xff;
3631	struct interval_tree_node *node;
3632	struct svm_range_list *svms;
3633	struct svm_range *prange;
3634	uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3635	uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3636	uint32_t flags_and = 0xffffffff;
3637	uint32_t flags_or = 0;
3638	int gpuidx;
3639	uint32_t i;
3640	int r = 0;
3641
3642	pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start,
3643		 start + size - 1, nattr);
3644
3645	/* Flush pending deferred work to avoid racing with deferred actions from
3646	 * previous memory map changes (e.g. munmap). Concurrent memory map changes
3647	 * can still race with get_attr because we don't hold the mmap lock. But that
3648	 * would be a race condition in the application anyway, and undefined
3649	 * behaviour is acceptable in that case.
3650	 */
3651	flush_work(&p->svms.deferred_list_work);
3652
3653	mmap_read_lock(mm);
3654	r = svm_range_is_valid(p, start, size);
3655	mmap_read_unlock(mm);
3656	if (r) {
3657		pr_debug("invalid range r=%d\n", r);
3658		return r;
3659	}
3660
3661	for (i = 0; i < nattr; i++) {
3662		switch (attrs[i].type) {
3663		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3664			get_preferred_loc = true;
3665			break;
3666		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3667			get_prefetch_loc = true;
3668			break;
3669		case KFD_IOCTL_SVM_ATTR_ACCESS:
3670			get_accessible = true;
3671			break;
3672		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3673		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3674			get_flags = true;
3675			break;
3676		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3677			get_granularity = true;
3678			break;
3679		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
3680		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
3681			fallthrough;
3682		default:
3683			pr_debug("get invalid attr type 0x%x\n", attrs[i].type);
3684			return -EINVAL;
3685		}
3686	}
3687
3688	svms = &p->svms;
3689
3690	mutex_lock(&svms->lock);
3691
3692	node = interval_tree_iter_first(&svms->objects, start, last);
3693	if (!node) {
3694		pr_debug("range attrs not found return default values\n");
3695		svm_range_set_default_attributes(&location, &prefetch_loc,
3696						 &granularity, &flags_and);
3697		flags_or = flags_and;
3698		if (p->xnack_enabled)
3699			bitmap_copy(bitmap_access, svms->bitmap_supported,
3700				    MAX_GPU_INSTANCE);
3701		else
3702			bitmap_zero(bitmap_access, MAX_GPU_INSTANCE);
3703		bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE);
3704		goto fill_values;
3705	}
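	/* Start from all supported GPUs and intersect with the access bitmaps
	 * of each overlapping range below.
	 */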
3706	bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE);
3707	bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE);
3708
3709	while (node) {
3710		struct interval_tree_node *next;
3711
3712		prange = container_of(node, struct svm_range, it_node);
3713		next = interval_tree_iter_next(node, start, last);
3714
3715		if (get_preferred_loc) {
3716			if (prange->preferred_loc ==
3717					KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3718			    (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3719			     location != prange->preferred_loc)) {
3720				location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3721				get_preferred_loc = false;
3722			} else {
3723				location = prange->preferred_loc;
3724			}
3725		}
3726		if (get_prefetch_loc) {
3727			if (prange->prefetch_loc ==
3728					KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3729			    (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3730			     prefetch_loc != prange->prefetch_loc)) {
3731				prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3732				get_prefetch_loc = false;
3733			} else {
3734				prefetch_loc = prange->prefetch_loc;
3735			}
3736		}
3737		if (get_accessible) {
3738			bitmap_and(bitmap_access, bitmap_access,
3739				   prange->bitmap_access, MAX_GPU_INSTANCE);
3740			bitmap_and(bitmap_aip, bitmap_aip,
3741				   prange->bitmap_aip, MAX_GPU_INSTANCE);
3742		}
3743		if (get_flags) {
3744			flags_and &= prange->flags;
3745			flags_or |= prange->flags;
3746		}
3747
3748		if (get_granularity && prange->granularity < granularity)
3749			granularity = prange->granularity;
3750
3751		node = next;
3752	}
3753fill_values:
3754	mutex_unlock(&svms->lock);
3755
3756	for (i = 0; i < nattr; i++) {
3757		switch (attrs[i].type) {
3758		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3759			attrs[i].value = location;
3760			break;
3761		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3762			attrs[i].value = prefetch_loc;
3763			break;
3764		case KFD_IOCTL_SVM_ATTR_ACCESS:
3765			gpuidx = kfd_process_gpuidx_from_gpuid(p,
3766							       attrs[i].value);
3767			if (gpuidx < 0) {
3768				pr_debug("invalid gpuid %x\n", attrs[i].value);
3769				return -EINVAL;
3770			}
3771			if (test_bit(gpuidx, bitmap_access))
3772				attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
3773			else if (test_bit(gpuidx, bitmap_aip))
3774				attrs[i].type =
3775					KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
3776			else
3777				attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
3778			break;
3779		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3780			attrs[i].value = flags_and;
3781			break;
3782		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3783			attrs[i].value = ~flags_or;
3784			break;
3785		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3786			attrs[i].value = (uint32_t)granularity;
3787			break;
3788		}
3789	}
3790
3791	return 0;
3792}
3793
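/**
 * kfd_criu_resume_svm - restore SVM range attributes after CRIU restore
 * @p: the process being resumed
 *
 * Replay the SVM range attributes collected during CRIU restore stage 2
 * (see kfd_criu_restore_svm) by calling svm_range_set_attr for each saved
 * range, then free the saved metadata.
 *
 * Return: 0 on success, negative errno on failure.
 */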
3794int kfd_criu_resume_svm(struct kfd_process *p)
3795{
3796	struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL;
3797	int nattr_common = 4, nattr_accessibility = 1;
3798	struct criu_svm_metadata *criu_svm_md = NULL;
3799	struct svm_range_list *svms = &p->svms;
3800	struct criu_svm_metadata *next = NULL;
3801	uint32_t set_flags = 0xffffffff;
3802	int i, j, num_attrs, ret = 0;
3803	uint64_t set_attr_size;
3804	struct mm_struct *mm;
3805
3806	if (list_empty(&svms->criu_svm_metadata_list)) {
3807		pr_debug("No SVM data from CRIU restore stage 2\n");
3808		return ret;
3809	}
3810
3811	mm = get_task_mm(p->lead_thread);
3812	if (!mm) {
3813		pr_err("failed to get mm for the target process\n");
3814		return -ESRCH;
3815	}
3816
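	/* Each checkpointed range was saved with nattr_common process-wide
	 * attributes plus one accessibility attribute per device.
	 */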
3817	num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
3818
3819	i = j = 0;
3820	list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
3821		pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
3822			 i, criu_svm_md->data.start_addr, criu_svm_md->data.size);
3823
3824		for (j = 0; j < num_attrs; j++) {
3825			pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
3826				 i, j, criu_svm_md->data.attrs[j].type,
3827				 i, j, criu_svm_md->data.attrs[j].value);
3828			switch (criu_svm_md->data.attrs[j].type) {
			/* During the checkpoint operation, querying the
			 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute may return
			 * KFD_IOCTL_SVM_LOCATION_UNDEFINED if it was not used
			 * by the range that was checkpointed. Take care not to
			 * restore with an invalid value; otherwise the gpuidx
			 * value will be invalid and set_attr would eventually
			 * fail. Replace such entries with a harmless dummy
			 * attribute such as KFD_IOCTL_SVM_ATTR_SET_FLAGS.
			 */
3839			case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3840				if (criu_svm_md->data.attrs[j].value ==
3841				    KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
3842					criu_svm_md->data.attrs[j].type =
3843						KFD_IOCTL_SVM_ATTR_SET_FLAGS;
3844					criu_svm_md->data.attrs[j].value = 0;
3845				}
3846				break;
3847			case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3848				set_flags = criu_svm_md->data.attrs[j].value;
3849				break;
3850			default:
3851				break;
3852			}
3853		}
3854
		/* CLR_FLAGS is not available via get_attr during checkpoint,
		 * but it needs to be inserted before restoring the ranges, so
		 * allocate extra space for it before calling set_attr.
		 */
3859		set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3860						(num_attrs + 1);
3861		set_attr_new = krealloc(set_attr, set_attr_size,
3862					    GFP_KERNEL);
3863		if (!set_attr_new) {
3864			ret = -ENOMEM;
3865			goto exit;
3866		}
3867		set_attr = set_attr_new;
3868
3869		memcpy(set_attr, criu_svm_md->data.attrs, num_attrs *
3870					sizeof(struct kfd_ioctl_svm_attribute));
3871		set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;
3872		set_attr[num_attrs].value = ~set_flags;
3873
3874		ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
3875					 criu_svm_md->data.size, num_attrs + 1,
3876					 set_attr);
3877		if (ret) {
3878			pr_err("CRIU: failed to set range attributes\n");
3879			goto exit;
3880		}
3881
3882		i++;
3883	}
3884exit:
3885	kfree(set_attr);
3886	list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
3887		pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
3888						criu_svm_md->data.start_addr);
3889		kfree(criu_svm_md);
3890	}
3891
3892	mmput(mm);
3893	return ret;
3895}
3896
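/**
 * kfd_criu_restore_svm - read SVM range private data from a CRIU image
 * @p: the process being restored
 * @user_priv_ptr: user pointer to the private data blob
 * @priv_data_offset: offset into the blob, advanced on success
 * @max_priv_data_size: size of the blob, used for bounds checking
 *
 * Copy one SVM range object from the private data and queue it on
 * svms->criu_svm_metadata_list. The saved attributes are applied later by
 * kfd_criu_resume_svm.
 *
 * Return: 0 on success, negative errno on failure.
 */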
3897int kfd_criu_restore_svm(struct kfd_process *p,
3898			 uint8_t __user *user_priv_ptr,
3899			 uint64_t *priv_data_offset,
3900			 uint64_t max_priv_data_size)
3901{
3902	uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size;
3903	int nattr_common = 4, nattr_accessibility = 1;
3904	struct criu_svm_metadata *criu_svm_md = NULL;
3905	struct svm_range_list *svms = &p->svms;
3906	uint32_t num_devices;
3907	int ret = 0;
3908
3909	num_devices = p->n_pdds;
	/* Handle one SVM range object at a time. The number of GPUs is assumed
	 * to be the same on the restore node; this must already have been
	 * checked while evaluating the topology earlier.
	 */
3914
3915	svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) *
3916		(nattr_common + nattr_accessibility * num_devices);
3917	svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size;
3918
3919	svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) +
3920								svm_attrs_size;
3921
3922	criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL);
3923	if (!criu_svm_md) {
3924		pr_err("failed to allocate memory to store svm metadata\n");
3925		return -ENOMEM;
3926	}
3927	if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) {
3928		ret = -EINVAL;
3929		goto exit;
3930	}
3931
3932	ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset,
3933			     svm_priv_data_size);
3934	if (ret) {
3935		ret = -EFAULT;
3936		goto exit;
3937	}
3938	*priv_data_offset += svm_priv_data_size;
3939
3940	list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list);
3941
3942	return 0;
3943
3945exit:
3946	kfree(criu_svm_md);
3947	return ret;
3948}
3949
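/**
 * svm_range_get_info - count SVM ranges and size their checkpoint data
 * @p: the process to query
 * @num_svm_ranges: returns the number of SVM ranges of the process
 * @svm_priv_data_size: returns the total private data size needed to
 *                      checkpoint all SVM ranges of the process
 *
 * Return: 0 on success, -EINVAL if the process has no SVM range list.
 */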
3950int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
3951		       uint64_t *svm_priv_data_size)
3952{
3953	uint64_t total_size, accessibility_size, common_attr_size;
3954	int nattr_common = 4, nattr_accessibility = 1;
3955	int num_devices = p->n_pdds;
3956	struct svm_range_list *svms;
3957	struct svm_range *prange;
3958	uint32_t count = 0;
3959
3960	*svm_priv_data_size = 0;
3961
3962	svms = &p->svms;
3963	if (!svms)
3964		return -EINVAL;
3965
3966	mutex_lock(&svms->lock);
3967	list_for_each_entry(prange, &svms->list, list) {
3968		pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n",
3969			 prange, prange->start, prange->npages,
3970			 prange->start + prange->npages - 1);
3971		count++;
3972	}
3973	mutex_unlock(&svms->lock);
3974
3975	*num_svm_ranges = count;
	/* Only the accessibility attributes need to be queried for each GPU
	 * individually; the remaining ones span the entire process regardless
	 * of the various GPU nodes. Of the remaining attributes,
	 * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved.
3980	 *
3981	 * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC
3982	 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC
3983	 * KFD_IOCTL_SVM_ATTR_SET_FLAGS
3984	 * KFD_IOCTL_SVM_ATTR_GRANULARITY
3985	 *
	 * ** ACCESSIBILITY ATTRIBUTES **
3987	 * (Considered as one, type is altered during query, value is gpuid)
3988	 * KFD_IOCTL_SVM_ATTR_ACCESS
3989	 * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
3990	 * KFD_IOCTL_SVM_ATTR_NO_ACCESS
3991	 */
3992	if (*num_svm_ranges > 0) {
3993		common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3994			nattr_common;
3995		accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) *
3996			nattr_accessibility * num_devices;
3997
3998		total_size = sizeof(struct kfd_criu_svm_range_priv_data) +
3999			common_attr_size + accessibility_size;
4000
4001		*svm_priv_data_size = *num_svm_ranges * total_size;
4002	}
4003
4004	pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
4005		 *svm_priv_data_size);
4006	return 0;
4007}
4008
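/**
 * kfd_criu_checkpoint_svm - write SVM range private data to a CRIU image
 * @p: the process being checkpointed
 * @user_priv_data: user pointer to the private data blob
 * @priv_data_offset: offset into the blob, advanced for each range written
 *
 * For each SVM range of the process, query the common attributes plus one
 * accessibility attribute per device with svm_range_get_attr and copy the
 * result to user space.
 *
 * Return: 0 on success, negative errno on failure.
 */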
4009int kfd_criu_checkpoint_svm(struct kfd_process *p,
4010			    uint8_t __user *user_priv_data,
4011			    uint64_t *priv_data_offset)
4012{
4013	struct kfd_criu_svm_range_priv_data *svm_priv = NULL;
4014	struct kfd_ioctl_svm_attribute *query_attr = NULL;
4015	uint64_t svm_priv_data_size, query_attr_size = 0;
4016	int index, nattr_common = 4, ret = 0;
4017	struct svm_range_list *svms;
4018	int num_devices = p->n_pdds;
4019	struct svm_range *prange;
4020	struct mm_struct *mm;
4021
4022	svms = &p->svms;
4023	if (!svms)
4024		return -EINVAL;
4025
4026	mm = get_task_mm(p->lead_thread);
4027	if (!mm) {
4028		pr_err("failed to get mm for the target process\n");
4029		return -ESRCH;
4030	}
4031
4032	query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
4033				(nattr_common + num_devices);
4034
4035	query_attr = kzalloc(query_attr_size, GFP_KERNEL);
4036	if (!query_attr) {
4037		ret = -ENOMEM;
4038		goto exit;
4039	}
4040
4041	query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC;
4042	query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
4043	query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;
4044	query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY;
4045
4046	for (index = 0; index < num_devices; index++) {
4047		struct kfd_process_device *pdd = p->pdds[index];
4048
4049		query_attr[index + nattr_common].type =
4050			KFD_IOCTL_SVM_ATTR_ACCESS;
4051		query_attr[index + nattr_common].value = pdd->user_gpu_id;
4052	}
4053
4054	svm_priv_data_size = sizeof(*svm_priv) + query_attr_size;
4055
4056	svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL);
4057	if (!svm_priv) {
4058		ret = -ENOMEM;
4059		goto exit_query;
4060	}
4061
4062	index = 0;
4063	list_for_each_entry(prange, &svms->list, list) {
4065		svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE;
4066		svm_priv->start_addr = prange->start;
4067		svm_priv->size = prange->npages;
4068		memcpy(&svm_priv->attrs, query_attr, query_attr_size);
4069		pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n",
4070			 prange, prange->start, prange->npages,
4071			 prange->start + prange->npages - 1,
4072			 prange->npages * PAGE_SIZE);
4073
4074		ret = svm_range_get_attr(p, mm, svm_priv->start_addr,
4075					 svm_priv->size,
4076					 (nattr_common + num_devices),
4077					 svm_priv->attrs);
4078		if (ret) {
4079			pr_err("CRIU: failed to obtain range attributes\n");
4080			goto exit_priv;
4081		}
4082
4083		if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv,
4084				 svm_priv_data_size)) {
4085			pr_err("Failed to copy svm priv to user\n");
4086			ret = -EFAULT;
4087			goto exit_priv;
4088		}
4089
4090		*priv_data_offset += svm_priv_data_size;
4092	}
4093
4095exit_priv:
4096	kfree(svm_priv);
4097exit_query:
4098	kfree(query_attr);
4099exit:
4100	mmput(mm);
4101	return ret;
4102}
4103
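/**
 * svm_ioctl - dispatch a KFD SVM ioctl operation
 * @p: the process that issued the ioctl
 * @op: KFD_IOCTL_SVM_OP_SET_ATTR or KFD_IOCTL_SVM_OP_GET_ATTR
 * @start: start address of the range in bytes
 * @size: size of the range in bytes
 * @nattrs: number of attributes in @attrs
 * @attrs: array of attributes to set or get
 *
 * Convert @start and @size from bytes to pages and call the set_attr or
 * get_attr handler.
 *
 * Return: 0 on success, negative errno on failure.
 */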
4104int
4105svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
4106	  uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
4107{
4108	struct mm_struct *mm = current->mm;
4109	int r;
4110
4111	start >>= PAGE_SHIFT;
4112	size >>= PAGE_SHIFT;
4113
4114	switch (op) {
4115	case KFD_IOCTL_SVM_OP_SET_ATTR:
4116		r = svm_range_set_attr(p, mm, start, size, nattrs, attrs);
4117		break;
4118	case KFD_IOCTL_SVM_OP_GET_ATTR:
4119		r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
4120		break;
4121	default:
		r = -EINVAL;
4123		break;
4124	}
4125
4126	return r;
4127}
4128