1// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */
6
7#include <linux/types.h>
8#include <linux/string.h>
9#include <linux/kvm.h>
10#include <linux/kvm_host.h>
11#include <linux/anon_inodes.h>
12#include <linux/file.h>
13#include <linux/debugfs.h>
14#include <linux/pgtable.h>
15
16#include <asm/kvm_ppc.h>
17#include <asm/kvm_book3s.h>
18#include <asm/page.h>
19#include <asm/mmu.h>
20#include <asm/pgalloc.h>
21#include <asm/pte-walk.h>
22#include <asm/ultravisor.h>
23#include <asm/kvm_book3s_uvmem.h>
24
25/*
26 * Supported radix tree geometry.
 * Like P9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k respectively.
29 */
30static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
31
32unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
33					      gva_t eaddr, void *to, void *from,
34					      unsigned long n)
35{
36	int old_pid, old_lpid;
37	unsigned long quadrant, ret = n;
38	bool is_load = !!to;
39
40	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
41	if (kvmhv_on_pseries())
42		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  (to != NULL) ? __pa(to) : 0,
					  (from != NULL) ? __pa(from) : 0, n);
45
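	/*
	 * Otherwise use the effective-address quadrants to reach guest
	 * memory directly: quadrant 1 translates with the guest's lpid and
	 * pid (a guest process address), quadrant 2 with the guest's lpid
	 * and pid 0.  The quadrant is selected by the top two EA bits.
	 */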
46	quadrant = 1;
47	if (!pid)
48		quadrant = 2;
49	if (is_load)
50		from = (void *) (eaddr | (quadrant << 62));
51	else
52		to = (void *) (eaddr | (quadrant << 62));
53
54	preempt_disable();
55
56	/* switch the lpid first to avoid running host with unallocated pid */
57	old_lpid = mfspr(SPRN_LPID);
58	if (old_lpid != lpid)
59		mtspr(SPRN_LPID, lpid);
60	if (quadrant == 1) {
61		old_pid = mfspr(SPRN_PID);
62		if (old_pid != pid)
63			mtspr(SPRN_PID, pid);
64	}
65	isync();
66
67	pagefault_disable();
68	if (is_load)
69		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
70	else
71		ret = __copy_to_user_inatomic((void __user *)to, from, n);
72	pagefault_enable();
73
74	/* switch the pid first to avoid running host with unallocated pid */
75	if (quadrant == 1 && pid != old_pid)
76		mtspr(SPRN_PID, old_pid);
77	if (lpid != old_lpid)
78		mtspr(SPRN_LPID, old_lpid);
79	isync();
80
81	preempt_enable();
82
83	return ret;
84}
85EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
86
87static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
88					  void *to, void *from, unsigned long n)
89{
90	int lpid = vcpu->kvm->arch.lpid;
91	int pid = vcpu->arch.pid;
92
	/* This would cause a data segment interrupt so don't allow the access */
94	if (eaddr & (0x3FFUL << 52))
95		return -EINVAL;
96
	/* If this vcpu is running a nested guest, use its shadow lpid */
98	if (vcpu->arch.nested)
99		lpid = vcpu->arch.nested->shadow_lpid;
100
101	/* If accessing quadrant 3 then pid is expected to be 0 */
102	if (((eaddr >> 62) & 0x3) == 0x3)
103		pid = 0;
104
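	/*
	 * Strip the top bits of the effective address, including the
	 * quadrant; __kvmhv_copy_tofrom_guest_radix() encodes the quadrant
	 * it needs into the top two bits itself.
	 */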
105	eaddr &= ~(0xFFFUL << 52);
106
107	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
108}
109
110long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
111				 unsigned long n)
112{
113	long ret;
114
115	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
116	if (ret > 0)
117		memset(to + (n - ret), 0, ret);
118
119	return ret;
120}
121EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
122
123long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
124			       unsigned long n)
125{
126	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
127}
128EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
129
130int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
131			       struct kvmppc_pte *gpte, u64 root,
132			       u64 *pte_ret_p)
133{
134	struct kvm *kvm = vcpu->kvm;
135	int ret, level, ps;
136	unsigned long rts, bits, offset, index;
137	u64 pte, base, gpa;
138	__be64 rpte;
139
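	/*
	 * Decode the tree geometry from the root descriptor: RTS (split
	 * across two fields) gives the size of the address space as
	 * 2^(RTS + 31), RPDS the number of address bits resolved by the
	 * top level, and RPDB the base address of the top-level table.
	 */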
140	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
141		((root & RTS2_MASK) >> RTS2_SHIFT);
142	bits = root & RPDS_MASK;
143	base = root & RPDB_MASK;
144
145	offset = rts + 31;
146
147	/* Current implementations only support 52-bit space */
148	if (offset != 52)
149		return -EINVAL;
150
151	/* Walk each level of the radix tree */
152	for (level = 3; level >= 0; --level) {
153		u64 addr;
		/* Check that this level's directory size is valid */
155		if (level && bits != p9_supported_radix_bits[level])
156			return -EINVAL;
157		if (level == 0 && !(bits == 5 || bits == 9))
158			return -EINVAL;
159		offset -= bits;
160		index = (eaddr >> offset) & ((1UL << bits) - 1);
161		/* Check that low bits of page table base are zero */
162		if (base & ((1UL << (bits + 3)) - 1))
163			return -EINVAL;
164		/* Read the entry from guest memory */
165		addr = base + (index * sizeof(rpte));
166		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
167		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
168		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
169		if (ret) {
170			if (pte_ret_p)
171				*pte_ret_p = addr;
172			return ret;
173		}
174		pte = __be64_to_cpu(rpte);
175		if (!(pte & _PAGE_PRESENT))
176			return -ENOENT;
177		/* Check if a leaf entry */
178		if (pte & _PAGE_PTE)
179			break;
180		/* Get ready to walk the next level */
181		base = pte & RPDB_MASK;
182		bits = pte & RPDS_MASK;
183	}
184
185	/* Need a leaf at lowest level; 512GB pages not supported */
186	if (level < 0 || level == 3)
187		return -EINVAL;
188
189	/* We found a valid leaf PTE */
190	/* Offset is now log base 2 of the page size */
191	gpa = pte & 0x01fffffffffff000ul;
192	if (gpa & ((1ul << offset) - 1))
193		return -EINVAL;
194	gpa |= eaddr & ((1ul << offset) - 1);
195	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
196		if (offset == mmu_psize_defs[ps].shift)
197			break;
198	gpte->page_size = ps;
199	gpte->page_shift = offset;
200
201	gpte->eaddr = eaddr;
202	gpte->raddr = gpa;
203
204	/* Work out permissions */
205	gpte->may_read = !!(pte & _PAGE_READ);
206	gpte->may_write = !!(pte & _PAGE_WRITE);
207	gpte->may_execute = !!(pte & _PAGE_EXEC);
208
209	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
210
211	if (pte_ret_p)
212		*pte_ret_p = pte;
213
214	return 0;
215}
216
217/*
218 * Used to walk a partition or process table radix tree in guest memory
219 * Note: We exploit the fact that a partition table and a process
220 * table have the same layout, a partition-scoped page table and a
221 * process-scoped page table have the same layout, and the 2nd
222 * doubleword of a partition table entry has the same layout as
223 * the PTCR register.
224 */
225int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
226				     struct kvmppc_pte *gpte, u64 table,
227				     int table_index, u64 *pte_ret_p)
228{
229	struct kvm *kvm = vcpu->kvm;
230	int ret;
231	unsigned long size, ptbl, root;
232	struct prtb_entry entry;
233
234	if ((table & PRTS_MASK) > 24)
235		return -EINVAL;
236	size = 1ul << ((table & PRTS_MASK) + 12);
237
238	/* Is the table big enough to contain this entry? */
239	if ((table_index * sizeof(entry)) >= size)
240		return -EINVAL;
241
242	/* Read the table to find the root of the radix tree */
243	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
244	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
245	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
246	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
247	if (ret)
248		return ret;
249
	/* Root is stored in the first doubleword */
251	root = be64_to_cpu(entry.prtb0);
252
253	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
254}
255
256int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
257			   struct kvmppc_pte *gpte, bool data, bool iswrite)
258{
259	u32 pid;
260	u64 pte;
261	int ret;
262
263	/* Work out effective PID */
264	switch (eaddr >> 62) {
265	case 0:
266		pid = vcpu->arch.pid;
267		break;
268	case 3:
269		pid = 0;
270		break;
271	default:
272		return -EINVAL;
273	}
274
275	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
276				vcpu->kvm->arch.process_table, pid, &pte);
277	if (ret)
278		return ret;
279
280	/* Check privilege (applies only to process scoped translations) */
281	if (kvmppc_get_msr(vcpu) & MSR_PR) {
282		if (pte & _PAGE_PRIVILEGED) {
283			gpte->may_read = 0;
284			gpte->may_write = 0;
285			gpte->may_execute = 0;
286		}
287	} else {
288		if (!(pte & _PAGE_PRIVILEGED)) {
289			/* Check AMR/IAMR to see if strict mode is in force */
290			if (vcpu->arch.amr & (1ul << 62))
291				gpte->may_read = 0;
292			if (vcpu->arch.amr & (1ul << 63))
293				gpte->may_write = 0;
294			if (vcpu->arch.iamr & (1ul << 62))
295				gpte->may_execute = 0;
296		}
297	}
298
299	return 0;
300}
301
302void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
303			     unsigned int pshift, unsigned int lpid)
304{
305	unsigned long psize = PAGE_SIZE;
306	int psi;
307	long rc;
308	unsigned long rb;
309
310	if (pshift)
311		psize = 1UL << pshift;
312	else
313		pshift = PAGE_SHIFT;
314
315	addr &= ~(psize - 1);
316
317	if (!kvmhv_on_pseries()) {
318		radix__flush_tlb_lpid_page(lpid, addr, psize);
319		return;
320	}
321
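	/*
	 * Running as a nested hypervisor: encode the address and the
	 * actual page size into the tlbie RB value and have the L0
	 * hypervisor do the invalidation via H_TLB_INVALIDATE.
	 */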
322	psi = shift_to_mmu_psize(pshift);
323	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
324	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
325				lpid, rb);
326	if (rc)
327		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
328}
329
330static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
331{
332	long rc;
333
334	if (!kvmhv_on_pseries()) {
335		radix__flush_pwc_lpid(lpid);
336		return;
337	}
338
339	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
340				lpid, TLBIEL_INVAL_SET_LPID);
341	if (rc)
342		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
343}
344
345static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
346				      unsigned long clr, unsigned long set,
347				      unsigned long addr, unsigned int shift)
348{
349	return __radix_pte_update(ptep, clr, set);
350}
351
352static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
353			     pte_t *ptep, pte_t pte)
354{
355	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
356}
357
358static struct kmem_cache *kvm_pte_cache;
359static struct kmem_cache *kvm_pmd_cache;
360
361static pte_t *kvmppc_pte_alloc(void)
362{
363	pte_t *pte;
364
365	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
366	/* pmd_populate() will only reference _pa(pte). */
367	kmemleak_ignore(pte);
368
369	return pte;
370}
371
372static void kvmppc_pte_free(pte_t *ptep)
373{
374	kmem_cache_free(kvm_pte_cache, ptep);
375}
376
377static pmd_t *kvmppc_pmd_alloc(void)
378{
379	pmd_t *pmd;
380
381	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
382	/* pud_populate() will only reference _pa(pmd). */
383	kmemleak_ignore(pmd);
384
385	return pmd;
386}
387
388static void kvmppc_pmd_free(pmd_t *pmdp)
389{
390	kmem_cache_free(kvm_pmd_cache, pmdp);
391}
392
393/* Called with kvm->mmu_lock held */
394void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
395		      unsigned int shift,
396		      const struct kvm_memory_slot *memslot,
397		      unsigned int lpid)
{
400	unsigned long old;
401	unsigned long gfn = gpa >> PAGE_SHIFT;
402	unsigned long page_size = PAGE_SIZE;
403	unsigned long hpa;
404
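	/*
	 * Atomically clear the PTE and invalidate any TLB entries for it;
	 * keep the old contents so that dirty state and the nested rmap
	 * can be updated below.
	 */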
405	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
406	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
407
408	/* The following only applies to L1 entries */
409	if (lpid != kvm->arch.lpid)
410		return;
411
412	if (!memslot) {
413		memslot = gfn_to_memslot(kvm, gfn);
414		if (!memslot)
415			return;
416	}
417	if (shift) { /* 1GB or 2MB page */
418		page_size = 1ul << shift;
419		if (shift == PMD_SHIFT)
420			kvm->stat.num_2M_pages--;
421		else if (shift == PUD_SHIFT)
422			kvm->stat.num_1G_pages--;
423	}
424
425	gpa &= ~(page_size - 1);
426	hpa = old & PTE_RPN_MASK;
427	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
428
429	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
430		kvmppc_update_dirty_map(memslot, gfn, page_size);
431}
432
433/*
 * The kvmppc_unmap_free_p?d functions free existing page tables,
 * recursively descending to clear and free any child tables.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of the page fault path
439 * (full == false), valid ptes are generally not expected; however, there
440 * is one situation where they arise, which is when dirty page logging is
441 * turned off for a memslot while the VM is running.  The new memslot
442 * becomes visible to page faults before the memslot commit function
443 * gets to flush the memslot, which can lead to a 2MB page mapping being
444 * installed for a guest physical address where there are already 64kB
445 * (or 4kB) mappings (of sub-pages of the same 2MB page).
446 */
447static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
448				  unsigned int lpid)
449{
450	if (full) {
451		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
452	} else {
453		pte_t *p = pte;
454		unsigned long it;
455
456		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
457			if (pte_val(*p) == 0)
458				continue;
459			kvmppc_unmap_pte(kvm, p,
460					 pte_pfn(*p) << PAGE_SHIFT,
461					 PAGE_SHIFT, NULL, lpid);
462		}
463	}
464
465	kvmppc_pte_free(pte);
466}
467
468static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
469				  unsigned int lpid)
470{
471	unsigned long im;
472	pmd_t *p = pmd;
473
474	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
475		if (!pmd_present(*p))
476			continue;
477		if (pmd_is_leaf(*p)) {
478			if (full) {
479				pmd_clear(p);
480			} else {
481				WARN_ON_ONCE(1);
482				kvmppc_unmap_pte(kvm, (pte_t *)p,
483					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
484					 PMD_SHIFT, NULL, lpid);
485			}
486		} else {
487			pte_t *pte;
488
489			pte = pte_offset_map(p, 0);
490			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
491			pmd_clear(p);
492		}
493	}
494	kvmppc_pmd_free(pmd);
495}
496
497static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
498				  unsigned int lpid)
499{
500	unsigned long iu;
501	pud_t *p = pud;
502
503	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
504		if (!pud_present(*p))
505			continue;
506		if (pud_is_leaf(*p)) {
507			pud_clear(p);
508		} else {
509			pmd_t *pmd;
510
511			pmd = pmd_offset(p, 0);
512			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
513			pud_clear(p);
514		}
515	}
516	pud_free(kvm->mm, pud);
517}
518
519void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
520{
521	unsigned long ig;
522
523	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
524		p4d_t *p4d = p4d_offset(pgd, 0);
525		pud_t *pud;
526
527		if (!p4d_present(*p4d))
528			continue;
529		pud = pud_offset(p4d, 0);
530		kvmppc_unmap_free_pud(kvm, pud, lpid);
531		p4d_clear(p4d);
532	}
533}
534
535void kvmppc_free_radix(struct kvm *kvm)
536{
537	if (kvm->arch.pgtable) {
538		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
539					  kvm->arch.lpid);
540		pgd_free(kvm->mm, kvm->arch.pgtable);
541		kvm->arch.pgtable = NULL;
542	}
543}
544
545static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
546					unsigned long gpa, unsigned int lpid)
547{
548	pte_t *pte = pte_offset_kernel(pmd, 0);
549
	/*
	 * Clearing the pmd entry and then flushing the PWC ensures that the
	 * pte page will no longer be cached by the MMU, so it can be freed
	 * without flushing the PWC again.
	 */
555	pmd_clear(pmd);
556	kvmppc_radix_flush_pwc(kvm, lpid);
557
558	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
559}
560
561static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
562					unsigned long gpa, unsigned int lpid)
563{
564	pmd_t *pmd = pmd_offset(pud, 0);
565
	/*
	 * Clearing the pud entry and then flushing the PWC ensures that the
	 * pmd page and any child pte pages will no longer be cached by the
	 * MMU, so they can be freed without flushing the PWC again.
	 */
571	pud_clear(pud);
572	kvmppc_radix_flush_pwc(kvm, lpid);
573
574	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
575}
576
/*
 * A number of bits may differ between different faults on the same
 * partition-scoped entry: the R and C bits change in the course of
 * cleaning and aging, and the write bit can change because an access
 * was upgraded or because a read fault raced with a write fault that
 * set those bits first.
 */
584#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
585
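/*
 * Insert a pte into the partition-scoped (2nd-level) radix tree for lpid
 * at gpa.  The level argument selects the leaf size: 0 for a base page,
 * 1 for a 2MB PMD leaf, 2 for a 1GB PUD leaf.  New page-table pages are
 * allocated before taking kvm->mmu_lock and freed at the end if they turn
 * out not to be needed.
 */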
586int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
587		      unsigned long gpa, unsigned int level,
588		      unsigned long mmu_seq, unsigned int lpid,
589		      unsigned long *rmapp, struct rmap_nested **n_rmap)
590{
591	pgd_t *pgd;
592	p4d_t *p4d;
593	pud_t *pud, *new_pud = NULL;
594	pmd_t *pmd, *new_pmd = NULL;
595	pte_t *ptep, *new_ptep = NULL;
596	int ret;
597
	/* Traverse the guest's 2nd-level tree, allocating new levels as needed */
599	pgd = pgtable + pgd_index(gpa);
600	p4d = p4d_offset(pgd, gpa);
601
602	pud = NULL;
603	if (p4d_present(*p4d))
604		pud = pud_offset(p4d, gpa);
605	else
606		new_pud = pud_alloc_one(kvm->mm, gpa);
607
608	pmd = NULL;
609	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
610		pmd = pmd_offset(pud, gpa);
611	else if (level <= 1)
612		new_pmd = kvmppc_pmd_alloc();
613
614	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
615		new_ptep = kvmppc_pte_alloc();
616
617	/* Check if we might have been invalidated; let the guest retry if so */
618	spin_lock(&kvm->mmu_lock);
619	ret = -EAGAIN;
620	if (mmu_notifier_retry(kvm, mmu_seq))
621		goto out_unlock;
622
623	/* Now traverse again under the lock and change the tree */
624	ret = -ENOMEM;
625	if (p4d_none(*p4d)) {
626		if (!new_pud)
627			goto out_unlock;
628		p4d_populate(kvm->mm, p4d, new_pud);
629		new_pud = NULL;
630	}
631	pud = pud_offset(p4d, gpa);
632	if (pud_is_leaf(*pud)) {
633		unsigned long hgpa = gpa & PUD_MASK;
634
635		/* Check if we raced and someone else has set the same thing */
636		if (level == 2) {
637			if (pud_raw(*pud) == pte_raw(pte)) {
638				ret = 0;
639				goto out_unlock;
640			}
641			/* Valid 1GB page here already, add our extra bits */
642			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
643							PTE_BITS_MUST_MATCH);
644			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
645					      0, pte_val(pte), hgpa, PUD_SHIFT);
646			ret = 0;
647			goto out_unlock;
648		}
649		/*
650		 * If we raced with another CPU which has just put
651		 * a 1GB pte in after we saw a pmd page, try again.
652		 */
653		if (!new_pmd) {
654			ret = -EAGAIN;
655			goto out_unlock;
656		}
657		/* Valid 1GB page here already, remove it */
658		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
659				 lpid);
660	}
661	if (level == 2) {
662		if (!pud_none(*pud)) {
663			/*
664			 * There's a page table page here, but we wanted to
665			 * install a large page, so remove and free the page
666			 * table page.
667			 */
668			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
669		}
670		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
671		if (rmapp && n_rmap)
672			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
673		ret = 0;
674		goto out_unlock;
675	}
676	if (pud_none(*pud)) {
677		if (!new_pmd)
678			goto out_unlock;
679		pud_populate(kvm->mm, pud, new_pmd);
680		new_pmd = NULL;
681	}
682	pmd = pmd_offset(pud, gpa);
683	if (pmd_is_leaf(*pmd)) {
684		unsigned long lgpa = gpa & PMD_MASK;
685
686		/* Check if we raced and someone else has set the same thing */
687		if (level == 1) {
688			if (pmd_raw(*pmd) == pte_raw(pte)) {
689				ret = 0;
690				goto out_unlock;
691			}
692			/* Valid 2MB page here already, add our extra bits */
693			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
694							PTE_BITS_MUST_MATCH);
695			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
696					0, pte_val(pte), lgpa, PMD_SHIFT);
697			ret = 0;
698			goto out_unlock;
699		}
700
701		/*
702		 * If we raced with another CPU which has just put
703		 * a 2MB pte in after we saw a pte page, try again.
704		 */
705		if (!new_ptep) {
706			ret = -EAGAIN;
707			goto out_unlock;
708		}
709		/* Valid 2MB page here already, remove it */
710		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
711				 lpid);
712	}
713	if (level == 1) {
714		if (!pmd_none(*pmd)) {
715			/*
716			 * There's a page table page here, but we wanted to
717			 * install a large page, so remove and free the page
718			 * table page.
719			 */
720			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
721		}
722		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
723		if (rmapp && n_rmap)
724			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
725		ret = 0;
726		goto out_unlock;
727	}
728	if (pmd_none(*pmd)) {
729		if (!new_ptep)
730			goto out_unlock;
731		pmd_populate(kvm->mm, pmd, new_ptep);
732		new_ptep = NULL;
733	}
734	ptep = pte_offset_kernel(pmd, gpa);
735	if (pte_present(*ptep)) {
736		/* Check if someone else set the same thing */
737		if (pte_raw(*ptep) == pte_raw(pte)) {
738			ret = 0;
739			goto out_unlock;
740		}
741		/* Valid page here already, add our extra bits */
742		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
743							PTE_BITS_MUST_MATCH);
744		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
745		ret = 0;
746		goto out_unlock;
747	}
748	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
749	if (rmapp && n_rmap)
750		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
751	ret = 0;
752
753 out_unlock:
754	spin_unlock(&kvm->mmu_lock);
755	if (new_pud)
756		pud_free(kvm->mm, new_pud);
757	if (new_pmd)
758		kvmppc_pmd_free(new_pmd);
759	if (new_ptep)
760		kvmppc_pte_free(new_ptep);
761	return ret;
762}
763
764bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
765			     unsigned long gpa, unsigned int lpid)
766{
767	unsigned long pgflags;
768	unsigned int shift;
769	pte_t *ptep;
770
771	/*
772	 * Need to set an R or C bit in the 2nd-level tables;
773	 * since we are just helping out the hardware here,
774	 * it is sufficient to do what the hardware does.
775	 */
776	pgflags = _PAGE_ACCESSED;
777	if (writing)
778		pgflags |= _PAGE_DIRTY;
779
780	if (nested)
781		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
782	else
783		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
784
785	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
786		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
787		return true;
788	}
789	return false;
790}
791
792int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
793				   unsigned long gpa,
794				   struct kvm_memory_slot *memslot,
795				   bool writing, bool kvm_ro,
796				   pte_t *inserted_pte, unsigned int *levelp)
797{
798	struct kvm *kvm = vcpu->kvm;
799	struct page *page = NULL;
800	unsigned long mmu_seq;
801	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
802	bool upgrade_write = false;
803	bool *upgrade_p = &upgrade_write;
804	pte_t pte, *ptep;
805	unsigned int shift, level;
806	int ret;
807	bool large_enable;
808
809	/* used to check for invalidations in progress */
810	mmu_seq = kvm->mmu_notifier_seq;
811	smp_rmb();
812
813	/*
814	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
815	 * do it with !atomic && !async, which is how we call it.
816	 * We always ask for write permission since the common case
817	 * is that the page is writable.
818	 */
819	hva = gfn_to_hva_memslot(memslot, gfn);
820	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
821		upgrade_write = true;
822	} else {
823		unsigned long pfn;
824
825		/* Call KVM generic code to do the slow-path check */
826		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
827					   writing, upgrade_p);
828		if (is_error_noslot_pfn(pfn))
829			return -EFAULT;
830		page = NULL;
831		if (pfn_valid(pfn)) {
832			page = pfn_to_page(pfn);
833			if (PageReserved(page))
834				page = NULL;
835		}
836	}
837
838	/*
839	 * Read the PTE from the process' radix tree and use that
840	 * so we get the shift and attribute bits.
841	 */
842	spin_lock(&kvm->mmu_lock);
843	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
844	pte = __pte(0);
845	if (ptep)
846		pte = READ_ONCE(*ptep);
847	spin_unlock(&kvm->mmu_lock);
848	/*
849	 * If the PTE disappeared temporarily due to a THP
850	 * collapse, just return and let the guest try again.
851	 */
852	if (!pte_present(pte)) {
853		if (page)
854			put_page(page);
855		return RESUME_GUEST;
856	}
857
858	/* If we're logging dirty pages, always map single pages */
859	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
860
861	/* Get pte level from shift/size */
862	if (large_enable && shift == PUD_SHIFT &&
863	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
864	    (hva & (PUD_SIZE - PAGE_SIZE))) {
865		level = 2;
866	} else if (large_enable && shift == PMD_SHIFT &&
867		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
868		   (hva & (PMD_SIZE - PAGE_SIZE))) {
869		level = 1;
870	} else {
871		level = 0;
872		if (shift > PAGE_SHIFT) {
873			/*
874			 * If the pte maps more than one page, bring over
875			 * bits from the virtual address to get the real
876			 * address of the specific single page we want.
877			 */
878			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
879			pte = __pte(pte_val(pte) | (hva & rpnmask));
880		}
881	}
882
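	/*
	 * Build the partition-scoped PTE from the host PTE: always allow
	 * execute and mark it accessed; keep write permission (and pre-set
	 * dirty) only if this is a store or we obtained the page writable,
	 * otherwise strip the write and dirty bits.
	 */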
883	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
884	if (writing || upgrade_write) {
885		if (pte_val(pte) & _PAGE_WRITE)
886			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
887	} else {
888		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
889	}
890
891	/* Allocate space in the tree and write the PTE */
892	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
893				mmu_seq, kvm->arch.lpid, NULL, NULL);
894	if (inserted_pte)
895		*inserted_pte = pte;
896	if (levelp)
897		*levelp = level;
898
899	if (page) {
900		if (!ret && (pte_val(pte) & _PAGE_WRITE))
901			set_page_dirty_lock(page);
902		put_page(page);
903	}
904
905	/* Increment number of large pages if we (successfully) inserted one */
906	if (!ret) {
907		if (level == 1)
908			kvm->stat.num_2M_pages++;
909		else if (level == 2)
910			kvm->stat.num_1G_pages++;
911	}
912
913	return ret;
914}
915
916int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
917				   unsigned long ea, unsigned long dsisr)
918{
919	struct kvm *kvm = vcpu->kvm;
920	unsigned long gpa, gfn;
921	struct kvm_memory_slot *memslot;
922	long ret;
923	bool writing = !!(dsisr & DSISR_ISSTORE);
924	bool kvm_ro = false;
925
926	/* Check for unusual errors */
927	if (dsisr & DSISR_UNSUPP_MMU) {
928		pr_err("KVM: Got unsupported MMU fault\n");
929		return -EFAULT;
930	}
931	if (dsisr & DSISR_BADACCESS) {
932		/* Reflect to the guest as DSI */
933		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
934		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
935		return RESUME_GUEST;
936	}
937
938	/* Translate the logical address */
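	/*
	 * fault_gpa gives the guest-real page address: drop the page offset
	 * and the top nibble, then take the byte offset from the faulting
	 * EA unless this was a fault on the guest's process table.
	 */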
939	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
940	gpa &= ~0xF000000000000000ul;
941	gfn = gpa >> PAGE_SHIFT;
942	if (!(dsisr & DSISR_PRTABLE_FAULT))
943		gpa |= ea & 0xfff;
944
945	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
946		return kvmppc_send_page_to_uv(kvm, gfn);
947
948	/* Get the corresponding memslot */
949	memslot = gfn_to_memslot(kvm, gfn);
950
951	/* No memslot means it's an emulated MMIO region */
952	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
953		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
954			     DSISR_SET_RC)) {
955			/*
956			 * Bad address in guest page table tree, or other
957			 * unusual error - reflect it to the guest as DSI.
958			 */
959			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
960			return RESUME_GUEST;
961		}
962		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
963	}
964
965	if (memslot->flags & KVM_MEM_READONLY) {
966		if (writing) {
967			/* give the guest a DSI */
968			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
969						       DSISR_PROTFAULT);
970			return RESUME_GUEST;
971		}
972		kvm_ro = true;
973	}
974
975	/* Failed to set the reference/change bits */
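	/*
	 * If so, try to set them under the mmu_lock; when nothing else was
	 * wrong with the translation we can go straight back to the guest.
	 */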
976	if (dsisr & DSISR_SET_RC) {
977		spin_lock(&kvm->mmu_lock);
978		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
979					    gpa, kvm->arch.lpid))
980			dsisr &= ~DSISR_SET_RC;
981		spin_unlock(&kvm->mmu_lock);
982
983		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
984			       DSISR_PROTFAULT | DSISR_SET_RC)))
985			return RESUME_GUEST;
986	}
987
988	/* Try to insert a pte */
989	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
990					     kvm_ro, NULL, NULL);
991
992	if (ret == 0 || ret == -EAGAIN)
993		ret = RESUME_GUEST;
994	return ret;
995}
996
997/* Called with kvm->mmu_lock held */
998int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
999		    unsigned long gfn)
1000{
1001	pte_t *ptep;
1002	unsigned long gpa = gfn << PAGE_SHIFT;
1003	unsigned int shift;
1004
1005	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
1006		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
1007		return 0;
1008	}
1009
1010	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1011	if (ptep && pte_present(*ptep))
1012		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1013				 kvm->arch.lpid);
1014	return 0;
1015}
1016
1017/* Called with kvm->mmu_lock held */
1018int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1019		  unsigned long gfn)
1020{
1021	pte_t *ptep;
1022	unsigned long gpa = gfn << PAGE_SHIFT;
1023	unsigned int shift;
1024	int ref = 0;
1025	unsigned long old, *rmapp;
1026
1027	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1028		return ref;
1029
1030	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1031	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
1032		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
1033					      gpa, shift);
1034		/* XXX need to flush tlb here? */
1035		/* Also clear bit in ptes in shadow pgtable for nested guests */
1036		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1037		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
1038					       old & PTE_RPN_MASK,
1039					       1UL << shift);
1040		ref = 1;
1041	}
1042	return ref;
1043}
1044
1045/* Called with kvm->mmu_lock held */
1046int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1047		       unsigned long gfn)
1048{
1049	pte_t *ptep;
1050	unsigned long gpa = gfn << PAGE_SHIFT;
1051	unsigned int shift;
1052	int ref = 0;
1053
1054	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1055		return ref;
1056
1057	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1058	if (ptep && pte_present(*ptep) && pte_young(*ptep))
1059		ref = 1;
1060	return ref;
1061}
1062
1063/* Returns the number of PAGE_SIZE pages that are dirty */
1064static int kvm_radix_test_clear_dirty(struct kvm *kvm,
1065				struct kvm_memory_slot *memslot, int pagenum)
1066{
1067	unsigned long gfn = memslot->base_gfn + pagenum;
1068	unsigned long gpa = gfn << PAGE_SHIFT;
1069	pte_t *ptep, pte;
1070	unsigned int shift;
1071	int ret = 0;
1072	unsigned long old, *rmapp;
1073
1074	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1075		return ret;
1076
1077	/*
1078	 * For performance reasons we don't hold kvm->mmu_lock while walking the
1079	 * partition scoped table.
1080	 */
1081	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
1082	if (!ptep)
1083		return 0;
1084
1085	pte = READ_ONCE(*ptep);
1086	if (pte_present(pte) && pte_dirty(pte)) {
1087		spin_lock(&kvm->mmu_lock);
		/*
		 * Recheck the pte now that we hold the mmu_lock
		 */
1091		if (pte_val(pte) != pte_val(*ptep)) {
			/*
			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled, so we can
			 * only find PAGE_SIZE pte entries here and can keep
			 * using the pte address returned by the page table
			 * walk above.
			 */
1098			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
1099				spin_unlock(&kvm->mmu_lock);
1100				return 0;
1101			}
1102		}
1103
1104		ret = 1;
1105		VM_BUG_ON(shift);
1106		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1107					      gpa, shift);
1108		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1109		/* Also clear bit in ptes in shadow pgtable for nested guests */
1110		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1111		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1112					       old & PTE_RPN_MASK,
1113					       1UL << shift);
1114		spin_unlock(&kvm->mmu_lock);
1115	}
1116	return ret;
1117}
1118
1119long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1120			struct kvm_memory_slot *memslot, unsigned long *map)
1121{
1122	unsigned long i, j;
1123	int npages;
1124
1125	for (i = 0; i < memslot->npages; i = j) {
1126		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1127
1128		/*
1129		 * Note that if npages > 0 then i must be a multiple of npages,
1130		 * since huge pages are only used to back the guest at guest
1131		 * real addresses that are a multiple of their size.
1132		 * Since we have at most one PTE covering any given guest
1133		 * real address, if npages > 1 we can skip to i + npages.
1134		 */
1135		j = i + 1;
1136		if (npages) {
1137			set_dirty_bits(map, i, npages);
1138			j = i + npages;
1139		}
1140	}
1141	return 0;
1142}
1143
1144void kvmppc_radix_flush_memslot(struct kvm *kvm,
1145				const struct kvm_memory_slot *memslot)
1146{
1147	unsigned long n;
1148	pte_t *ptep;
1149	unsigned long gpa;
1150	unsigned int shift;
1151
1152	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
1153		kvmppc_uvmem_drop_pages(memslot, kvm, true);
1154
1155	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1156		return;
1157
1158	gpa = memslot->base_gfn << PAGE_SHIFT;
1159	spin_lock(&kvm->mmu_lock);
1160	for (n = memslot->npages; n; --n) {
1161		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1162		if (ptep && pte_present(*ptep))
1163			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1164					 kvm->arch.lpid);
1165		gpa += PAGE_SIZE;
1166	}
1167	/*
1168	 * Increase the mmu notifier sequence number to prevent any page
1169	 * fault that read the memslot earlier from writing a PTE.
1170	 */
1171	kvm->mmu_notifier_seq++;
1172	spin_unlock(&kvm->mmu_lock);
1173}
1174
1175static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1176				 int psize, int *indexp)
1177{
1178	if (!mmu_psize_defs[psize].shift)
1179		return;
1180	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1181		(mmu_psize_defs[psize].ap << 29);
1182	++(*indexp);
1183}
1184
1185int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1186{
1187	int i;
1188
1189	if (!radix_enabled())
1190		return -EINVAL;
1191	memset(info, 0, sizeof(*info));
1192
1193	/* 4k page size */
1194	info->geometries[0].page_shift = 12;
1195	info->geometries[0].level_bits[0] = 9;
1196	for (i = 1; i < 4; ++i)
1197		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1198	/* 64k page size */
1199	info->geometries[1].page_shift = 16;
1200	for (i = 0; i < 4; ++i)
1201		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
1202
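	/* Advertise the supported page sizes and their AP encodings */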
1203	i = 0;
1204	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1205	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1206	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1207	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1208
1209	return 0;
1210}
1211
1212int kvmppc_init_vm_radix(struct kvm *kvm)
1213{
1214	kvm->arch.pgtable = pgd_alloc(kvm->mm);
1215	if (!kvm->arch.pgtable)
1216		return -ENOMEM;
1217	return 0;
1218}
1219
1220static void pte_ctor(void *addr)
1221{
1222	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1223}
1224
1225static void pmd_ctor(void *addr)
1226{
1227	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1228}
1229
1230struct debugfs_radix_state {
1231	struct kvm	*kvm;
1232	struct mutex	mutex;
1233	unsigned long	gpa;
1234	int		lpid;
1235	int		chars_left;
1236	int		buf_index;
1237	char		buf[128];
1238	u8		hdr;
1239};
1240
1241static int debugfs_radix_open(struct inode *inode, struct file *file)
1242{
1243	struct kvm *kvm = inode->i_private;
1244	struct debugfs_radix_state *p;
1245
1246	p = kzalloc(sizeof(*p), GFP_KERNEL);
1247	if (!p)
1248		return -ENOMEM;
1249
1250	kvm_get_kvm(kvm);
1251	p->kvm = kvm;
1252	mutex_init(&p->mutex);
1253	file->private_data = p;
1254
1255	return nonseekable_open(inode, file);
1256}
1257
1258static int debugfs_radix_release(struct inode *inode, struct file *file)
1259{
1260	struct debugfs_radix_state *p = file->private_data;
1261
1262	kvm_put_kvm(p->kvm);
1263	kfree(p);
1264	return 0;
1265}
1266
1267static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1268				 size_t len, loff_t *ppos)
1269{
1270	struct debugfs_radix_state *p = file->private_data;
1271	ssize_t ret, r;
1272	unsigned long n;
1273	struct kvm *kvm;
1274	unsigned long gpa;
1275	pgd_t *pgt;
1276	struct kvm_nested_guest *nested;
1277	pgd_t *pgdp;
1278	p4d_t p4d, *p4dp;
1279	pud_t pud, *pudp;
1280	pmd_t pmd, *pmdp;
1281	pte_t *ptep;
1282	int shift;
1283	unsigned long pte;
1284
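	/*
	 * Dump the partition-scoped tree(s) as lines of
	 * "<gpa>: <pte> <page shift>", walking this guest's own table
	 * first (lpid 0) and then each nested guest's shadow table,
	 * resuming where the previous read() left off.
	 */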
1285	kvm = p->kvm;
1286	if (!kvm_is_radix(kvm))
1287		return 0;
1288
1289	ret = mutex_lock_interruptible(&p->mutex);
1290	if (ret)
1291		return ret;
1292
1293	if (p->chars_left) {
1294		n = p->chars_left;
1295		if (n > len)
1296			n = len;
1297		r = copy_to_user(buf, p->buf + p->buf_index, n);
1298		n -= r;
1299		p->chars_left -= n;
1300		p->buf_index += n;
1301		buf += n;
1302		len -= n;
1303		ret = n;
1304		if (r) {
1305			if (!n)
1306				ret = -EFAULT;
1307			goto out;
1308		}
1309	}
1310
1311	gpa = p->gpa;
1312	nested = NULL;
1313	pgt = NULL;
1314	while (len != 0 && p->lpid >= 0) {
1315		if (gpa >= RADIX_PGTABLE_RANGE) {
1316			gpa = 0;
1317			pgt = NULL;
1318			if (nested) {
1319				kvmhv_put_nested(nested);
1320				nested = NULL;
1321			}
1322			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1323			p->hdr = 0;
1324			if (p->lpid < 0)
1325				break;
1326		}
1327		if (!pgt) {
1328			if (p->lpid == 0) {
1329				pgt = kvm->arch.pgtable;
1330			} else {
1331				nested = kvmhv_get_nested(kvm, p->lpid, false);
1332				if (!nested) {
1333					gpa = RADIX_PGTABLE_RANGE;
1334					continue;
1335				}
1336				pgt = nested->shadow_pgtable;
1337			}
1338		}
1339		n = 0;
1340		if (!p->hdr) {
1341			if (p->lpid > 0)
1342				n = scnprintf(p->buf, sizeof(p->buf),
1343					      "\nNested LPID %d: ", p->lpid);
1344			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1345				      "pgdir: %lx\n", (unsigned long)pgt);
1346			p->hdr = 1;
1347			goto copy;
1348		}
1349
1350		pgdp = pgt + pgd_index(gpa);
1351		p4dp = p4d_offset(pgdp, gpa);
1352		p4d = READ_ONCE(*p4dp);
1353		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
1354			gpa = (gpa & P4D_MASK) + P4D_SIZE;
1355			continue;
1356		}
1357
1358		pudp = pud_offset(&p4d, gpa);
1359		pud = READ_ONCE(*pudp);
1360		if (!(pud_val(pud) & _PAGE_PRESENT)) {
1361			gpa = (gpa & PUD_MASK) + PUD_SIZE;
1362			continue;
1363		}
1364		if (pud_val(pud) & _PAGE_PTE) {
1365			pte = pud_val(pud);
1366			shift = PUD_SHIFT;
1367			goto leaf;
1368		}
1369
1370		pmdp = pmd_offset(&pud, gpa);
1371		pmd = READ_ONCE(*pmdp);
1372		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1373			gpa = (gpa & PMD_MASK) + PMD_SIZE;
1374			continue;
1375		}
1376		if (pmd_val(pmd) & _PAGE_PTE) {
1377			pte = pmd_val(pmd);
1378			shift = PMD_SHIFT;
1379			goto leaf;
1380		}
1381
1382		ptep = pte_offset_kernel(&pmd, gpa);
1383		pte = pte_val(READ_ONCE(*ptep));
1384		if (!(pte & _PAGE_PRESENT)) {
1385			gpa += PAGE_SIZE;
1386			continue;
1387		}
1388		shift = PAGE_SHIFT;
1389	leaf:
1390		n = scnprintf(p->buf, sizeof(p->buf),
1391			      " %lx: %lx %d\n", gpa, pte, shift);
1392		gpa += 1ul << shift;
1393	copy:
1394		p->chars_left = n;
1395		if (n > len)
1396			n = len;
1397		r = copy_to_user(buf, p->buf, n);
1398		n -= r;
1399		p->chars_left -= n;
1400		p->buf_index = n;
1401		buf += n;
1402		len -= n;
1403		ret += n;
1404		if (r) {
1405			if (!ret)
1406				ret = -EFAULT;
1407			break;
1408		}
1409	}
1410	p->gpa = gpa;
1411	if (nested)
1412		kvmhv_put_nested(nested);
1413
1414 out:
1415	mutex_unlock(&p->mutex);
1416	return ret;
1417}
1418
1419static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1420			   size_t len, loff_t *ppos)
1421{
1422	return -EACCES;
1423}
1424
1425static const struct file_operations debugfs_radix_fops = {
1426	.owner	 = THIS_MODULE,
1427	.open	 = debugfs_radix_open,
1428	.release = debugfs_radix_release,
1429	.read	 = debugfs_radix_read,
1430	.write	 = debugfs_radix_write,
1431	.llseek	 = generic_file_llseek,
1432};
1433
1434void kvmhv_radix_debugfs_init(struct kvm *kvm)
1435{
1436	debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
1437			    &debugfs_radix_fops);
1438}
1439
1440int kvmppc_radix_init(void)
1441{
1442	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1443
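	/*
	 * Size and align each cache object to the full table size so that
	 * every PTE/PMD table handed out is naturally aligned.
	 */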
1444	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1445	if (!kvm_pte_cache)
1446		return -ENOMEM;
1447
1448	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1449
1450	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1451	if (!kvm_pmd_cache) {
1452		kmem_cache_destroy(kvm_pte_cache);
1453		return -ENOMEM;
1454	}
1455
1456	return 0;
1457}
1458
1459void kvmppc_radix_exit(void)
1460{
1461	kmem_cache_destroy(kvm_pte_cache);
1462	kmem_cache_destroy(kvm_pmd_cache);
1463}
1464