xref: /kernel/linux/linux-5.10/arch/loongarch/kvm/mmu.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
4 */
5
6#include <linux/highmem.h>
7#include <linux/hugetlb.h>
8#include <linux/page-flags.h>
9#include <linux/kvm_host.h>
10#include <linux/uaccess.h>
11#include <asm/mmu_context.h>
12#include <asm/pgalloc.h>
13#include <asm/tlb.h>
14#include "kvm_compat.h"
15
16/*
17 * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels
18 * for which pages need to be cached.
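 *
 * With a three level GPA table (PMD not folded) a page table walk may need to
 * allocate both a PMD page and a PTE page below the preallocated PGD, hence
 * two cached pages; with the PMD folded only a PTE page can be needed.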
19 */
20#if defined(__PAGETABLE_PMD_FOLDED)
21#define KVM_MMU_CACHE_MIN_PAGES 1
22#else
23#define KVM_MMU_CACHE_MIN_PAGES 2
24#endif
25
26static inline int kvm_pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_HUGE; }
27static inline pte_t kvm_pte_mksmall(pte_t pte)
28{
29	pte_val(pte) &= ~_PAGE_HUGE;
30	return pte;
31}
32static inline void kvm_set_pte(pte_t *ptep, pte_t val)
33{
34	WRITE_ONCE(*ptep, val);
35}
36
37static int kvm_tlb_flush_gpa(struct kvm_vcpu *vcpu, unsigned long gpa)
38{
39	preempt_disable();
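	/* Hardware TLB entries map even/odd page pairs; align gpa to the pair */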
40	gpa &= (PAGE_MASK << 1);
41	invtlb(INVTLB_GID_ADDR, kvm_read_csr_gstat() & KVM_GSTAT_GID, gpa);
42	preempt_enable();
43	return 0;
44}
45
46static inline pmd_t kvm_pmd_mkhuge(pmd_t pmd)
47{
48#ifdef CONFIG_TRANSPARENT_HUGEPAGE
49	return pmd_mkhuge(pmd);
50#elif defined(CONFIG_HUGETLB_PAGE)
51	pte_t entry;
52
53	pte_val(entry) = pmd_val(pmd);
54	entry = pte_mkhuge(entry);
55	pmd_val(pmd) = pte_val(entry);
56#endif
57	return pmd;
58}
59
60static inline pmd_t kvm_pmd_mkclean(pmd_t pmd)
61{
62#ifdef CONFIG_TRANSPARENT_HUGEPAGE
63	return pmd_mkclean(pmd);
64#elif defined(CONFIG_HUGETLB_PAGE)
65	pte_t entry;
66
67	pte_val(entry) = pmd_val(pmd);
68	entry = pte_mkclean(entry);
69	pmd_val(pmd) = pte_val(entry);
70#endif
71	return pmd;
72}
73
74static inline pmd_t kvm_pmd_mkold(pmd_t pmd)
75{
76#ifdef CONFIG_TRANSPARENT_HUGEPAGE
77	return pmd_mkold(pmd);
78#elif defined(CONFIG_HUGETLB_PAGE)
79	pte_t entry;
80
81	pte_val(entry) = pmd_val(pmd);
82	entry = pte_mkold(entry);
83	pmd_val(pmd) = pte_val(entry);
84#endif
85	return pmd;
86}
87
88void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
89{
90	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
91}
92
93/**
94 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
95 *
96 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
97 * to host physical page mappings.
98 *
99 * Returns:	Pointer to new KVM GPA page directory.
100 *		NULL on allocation failure.
101 */
102pgd_t *kvm_pgd_alloc(void)
103{
104	pgd_t *ret;
105	struct page *page;
106
107	page = alloc_pages(GFP_KERNEL, 0);
108	if (!page)
109		return NULL;
110	ret = (pgd_t *) page_address(page);
111	if (ret)
112		pgd_init((unsigned long)ret);
113
114	return ret;
115}
116
117/**
118 * kvm_walk_pgd() - Walk page table with optional allocation.
119 * @pgd:	Page directory pointer.
120 * @cache:	MMU page cache to allocate new page tables from, or NULL.
121 * @addr:	Address to index page table using.
122 *
123 * Walk the page tables pointed to by @pgd to find the PTE corresponding to the
124 * address @addr. If page tables don't exist for @addr, they will be created
125 * from the MMU cache if @cache is not NULL.
126 *
127 * Returns:	Pointer to pte_t corresponding to @addr.
128 *		NULL if a page table doesn't exist for @addr and !@cache.
129 *		NULL if a page table allocation failed.
130 */
131static pte_t *kvm_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
132				unsigned long addr)
133{
134	p4d_t *p4d;
135	pud_t *pud;
136	pmd_t *pmd;
137
138	pgd += pgd_index(addr);
139	if (pgd_none(*pgd)) {
140		/* Not used yet */
141		BUG();
142		return NULL;
143	}
144	p4d = p4d_offset(pgd, addr);
145	pud = pud_offset(p4d, addr);
146	if (pud_none(*pud)) {
147		pmd_t *new_pmd;
148
149		if (!cache)
150			return NULL;
151		new_pmd = kvm_mmu_memory_cache_alloc(cache);
152		pmd_init((unsigned long)new_pmd,
153			 (unsigned long)invalid_pte_table);
154		pud_populate(NULL, pud, new_pmd);
155	}
156	pmd = pmd_offset(pud, addr);
157#ifdef CONFIG_HUGETLB_PAGE
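	/* A huge PMD maps the whole range; return it to be treated as a PTE */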
158	if (pmd_huge(*pmd)) {
159		return (pte_t *)pmd;
160	}
161#endif
162	if (pmd_none(*pmd)) {
163		pte_t *new_pte;
164
165		if (!cache)
166			return NULL;
167		new_pte = kvm_mmu_memory_cache_alloc(cache);
168		clear_page(new_pte);
169		pmd_populate_kernel(NULL, pmd, new_pte);
170	}
171	return pte_offset_kernel(pmd, addr);
172}
173
174/* Caller must hold kvm->mm_lock */
175static pte_t *kvm_pte_for_gpa(struct kvm *kvm,
176				struct kvm_mmu_memory_cache *cache,
177				unsigned long addr)
178{
179	return kvm_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr);
180}
181
182/*
183 * kvm_flush_gpa_{pte,pmd,pud,pgd,pt}.
184 * Flush a range of guest physical address space from the VM's GPA page tables.
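 *
 * Each helper returns whether the caller may free the table page it walked,
 * i.e. whether the flushed range covered every index of that table.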
185 */
186
187static bool kvm_flush_gpa_pte(pte_t *pte, unsigned long start_gpa,
188				   unsigned long end_gpa, unsigned long *data)
189{
190	int i_min = pte_index(start_gpa);
191	int i_max = pte_index(end_gpa);
192	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
193	int i;
194
195	for (i = i_min; i <= i_max; ++i) {
196		if (!pte_present(pte[i]))
197			continue;
198
199		set_pte(pte + i, __pte(0));
200		if (data)
201			*data = *data + 1;
202	}
203	return safe_to_remove;
204}
205
206static bool kvm_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa,
207				   unsigned long end_gpa, unsigned long *data)
208{
209	pte_t *pte;
210	unsigned long end = ~0ul;
211	int i_min = pmd_index(start_gpa);
212	int i_max = pmd_index(end_gpa);
213	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
214	int i;
215
216	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
217		if (!pmd_present(pmd[i]))
218			continue;
219
220		if (pmd_huge(pmd[i])) {
221			pmd_clear(pmd + i);
222			if (data)
223				*data += PTRS_PER_PMD;
224			continue;
225		}
226
227		pte = pte_offset_kernel(pmd + i, 0);
228		if (i == i_max)
229			end = end_gpa;
230
231		if (kvm_flush_gpa_pte(pte, start_gpa, end, data)) {
232			pmd_clear(pmd + i);
233			pte_free_kernel(NULL, pte);
234		} else {
235			safe_to_remove = false;
236		}
237	}
238	return safe_to_remove;
239}
240
241static bool kvm_flush_gpa_pud(pud_t *pud, unsigned long start_gpa,
242				   unsigned long end_gpa, unsigned long *data)
243{
244	pmd_t *pmd;
245	unsigned long end = ~0ul;
246	int i_min = pud_index(start_gpa);
247	int i_max = pud_index(end_gpa);
248	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
249	int i;
250
251	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
252		if (!pud_present(pud[i]))
253			continue;
254
255		pmd = pmd_offset(pud + i, 0);
256		if (i == i_max)
257			end = end_gpa;
258
259		if (kvm_flush_gpa_pmd(pmd, start_gpa, end, data)) {
260			pud_clear(pud + i);
261			pmd_free(NULL, pmd);
262		} else {
263			safe_to_remove = false;
264		}
265	}
266	return safe_to_remove;
267}
268
269static bool kvm_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa,
270				unsigned long end_gpa, unsigned long *data)
271{
272	p4d_t *p4d;
273	pud_t *pud;
274	unsigned long end = ~0ul;
275	int i_min = pgd_index(start_gpa);
276	int i_max = pgd_index(end_gpa);
277	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
278	int i;
279
280	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
281		if (!pgd_present(pgd[i]))
282			continue;
283
284		p4d = p4d_offset(pgd, 0);
285		pud = pud_offset(p4d + i, 0);
286		if (i == i_max)
287			end = end_gpa;
288
289		if (kvm_flush_gpa_pud(pud, start_gpa, end, data)) {
290			pgd_clear(pgd + i);
291			pud_free(NULL, pud);
292		} else {
293			safe_to_remove = false;
294		}
295	}
296	return safe_to_remove;
297}
298
299/**
300 * kvm_flush_gpa_pt() - Flush a range of guest physical addresses.
301 * @kvm:	KVM pointer.
302 * @start_gfn:	Guest frame number of first page in GPA range to flush.
303 * @end_gfn:	Guest frame number of last page in GPA range to flush.
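 * @data:	Pointer to a running count of flushed pages, or NULL.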
304 *
305 * Flushes a range of GPA mappings from the GPA page tables.
306 *
307 * The caller must hold the @kvm->mmu_lock spinlock.
308 *
309 * Returns:	Whether it is safe to remove the top level page directory because
310 *		all lower levels have been removed.
311 */
312static bool kvm_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, void *data)
313{
314	return kvm_flush_gpa_pgd(kvm->arch.gpa_mm.pgd,
315				start_gfn << PAGE_SHIFT,
316				end_gfn << PAGE_SHIFT, (unsigned long *)data);
317}
318
319/*
320 * kvm_mkclean_gpa_pt.
321 * Mark a range of guest physical address space clean (writes fault) in the VM's
322 * GPA page table to allow dirty page tracking.
323 */
324
325static int kvm_mkclean_pte(pte_t *pte, unsigned long start, unsigned long end)
326{
327	int ret = 0;
328	int i_min = pte_index(start);
329	int i_max = pte_index(end);
330	int i;
331	pte_t val;
332
333	for (i = i_min; i <= i_max; ++i) {
334		val = pte[i];
335		if (pte_present(val) && pte_dirty(val)) {
336			set_pte(pte + i, pte_mkclean(val));
337			ret = 1;
338		}
339	}
340	return ret;
341}
342
343static int kvm_mkclean_pmd(pmd_t *pmd, unsigned long start, unsigned long end)
344{
345	int ret = 0;
346	pte_t *pte;
347	unsigned long cur_end = ~0ul;
348	int i_min = pmd_index(start);
349	int i_max = pmd_index(end);
350	int i;
351	pmd_t old, new;
352
353	for (i = i_min; i <= i_max; ++i, start = 0) {
354		if (!pmd_present(pmd[i]))
355			continue;
356
357		if (pmd_huge(pmd[i])) {
358			old = pmd[i];
359			new = kvm_pmd_mkclean(old);
360			if (pmd_val(new) == pmd_val(old))
361				continue;
362			set_pmd(pmd + i, new);
363			ret = 1;
364			continue;
365		}
366
367		pte = pte_offset_kernel(pmd + i, 0);
368		if (i == i_max)
369			cur_end = end;
370
371		ret |= kvm_mkclean_pte(pte, start, cur_end);
372	}
373
374	return ret;
375}
376
377static int kvm_mkclean_pud(pud_t *pud, unsigned long start, unsigned long end)
378{
379	int ret = 0;
380	pmd_t *pmd;
381	unsigned long cur_end = ~0ul;
382	int i_min = pud_index(start);
383	int i_max = pud_index(end);
384	int i;
385
386	for (i = i_min; i <= i_max; ++i, start = 0) {
387		if (!pud_present(pud[i]))
388			continue;
389
390		pmd = pmd_offset(pud + i, 0);
391		if (i == i_max)
392			cur_end = end;
393
394		ret |= kvm_mkclean_pmd(pmd, start, cur_end);
395	}
396	return ret;
397}
398
399static int kvm_mkclean_pgd(pgd_t *pgd, unsigned long start, unsigned long end)
400{
401	int ret = 0;
402	p4d_t *p4d;
403	pud_t *pud;
404	unsigned long cur_end = ~0ul;
405	int i_min = pgd_index(start);
406	int i_max = pgd_index(end);
407	int i;
408
409	for (i = i_min; i <= i_max; ++i, start = 0) {
410		if (!pgd_present(pgd[i]))
411			continue;
412
413		p4d = p4d_offset(pgd, 0);
414		pud = pud_offset(p4d + i, 0);
415		if (i == i_max)
416			cur_end = end;
417
418		ret |= kvm_mkclean_pud(pud, start, cur_end);
419	}
420	return ret;
421}
422
423/**
424 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
425 * @kvm:	KVM pointer.
426 * @start_gfn:	Guest frame number of first page in GPA range to flush.
427 * @end_gfn:	Guest frame number of last page in GPA range to flush.
428 *
429 * Make a range of GPA mappings clean so that guest writes will fault and
430 * trigger dirty page logging.
431 *
432 * The caller must hold the @kvm->mmu_lock spinlock.
433 *
434 * Returns:	Whether any GPA mappings were modified, which would require
435 *		derived mappings (GVA page tables & TLB entries) to be
436 *		invalidated.
437 */
438static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
439{
440	return kvm_mkclean_pgd(kvm->arch.gpa_mm.pgd, start_gfn << PAGE_SHIFT,
441				end_gfn << PAGE_SHIFT);
442}
443
444/**
445 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
446 * @kvm:	The KVM pointer
447 * @slot:	The memory slot associated with mask
448 * @gfn_offset:	The gfn offset in memory slot
449 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
450 *		slot to be write protected
451 *
452 * Walks the bits set in @mask and write protects the associated PTEs. The
453 * caller must acquire @kvm->mmu_lock.
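 *
 * Note that the whole span from the lowest to the highest set bit of @mask is
 * write protected: e.g. @gfn_offset 0 with @mask 0x32 cleans base_gfn + 1 up
 * to base_gfn + 5 even though bits 2 and 3 of the mask are clear.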
454 */
455void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
456		struct kvm_memory_slot *slot,
457		gfn_t gfn_offset, unsigned long mask)
458{
459	gfn_t base_gfn = slot->base_gfn + gfn_offset;
460	gfn_t start = base_gfn + __ffs(mask);
461	gfn_t end = base_gfn + __fls(mask);
462
463	kvm_mkclean_gpa_pt(kvm, start, end);
464}
465
466void kvm_arch_commit_memory_region(struct kvm *kvm,
467				   const struct kvm_userspace_memory_region *mem,
468				   struct kvm_memory_slot *old,
469				   const struct kvm_memory_slot *new,
470				   enum kvm_mr_change change)
471{
472	int needs_flush;
473
474	kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n",
475		  __func__, kvm, mem->slot, mem->guest_phys_addr,
476		  mem->memory_size, mem->userspace_addr);
477
478	/*
479	 * If dirty page logging is enabled, write protect all pages in the slot
480	 * ready for dirty logging.
481	 *
482	 * There is no need to do this in any of the following cases:
483	 * CREATE:	No dirty mappings will already exist.
484	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
485	 *		kvm_arch_flush_shadow_memslot()
486	 */
487	if (change == KVM_MR_FLAGS_ONLY &&
488	    (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
489	     new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
490		spin_lock(&kvm->mmu_lock);
491		/* Write protect GPA page table entries */
492		needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
493					new->base_gfn + new->npages - 1);
494		/* Let implementation do the rest */
495		if (needs_flush)
496			kvm_flush_remote_tlbs(kvm);
497		spin_unlock(&kvm->mmu_lock);
498	}
499}
500
501void kvm_arch_flush_shadow_all(struct kvm *kvm)
502{
503	/* Flush whole GPA */
504	kvm_flush_gpa_pt(kvm, 0, ~0UL, NULL);
505
506	/* Flush vpid for each VCPU individually */
507	kvm_flush_remote_tlbs(kvm);
508}
509
510void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
511		struct kvm_memory_slot *slot)
512{
513	unsigned long npages;
514
515	/*
516	 * The slot has been made invalid (ready for moving or deletion), so we
517	 * need to ensure that it can no longer be accessed by any guest VCPUs.
518	 */
519
520	npages = 0;
521	spin_lock(&kvm->mmu_lock);
522	/* Flush slot from GPA */
523	kvm_flush_gpa_pt(kvm, slot->base_gfn,
524			slot->base_gfn + slot->npages - 1, &npages);
525	/* Let implementation do the rest */
526	if (npages)
527		kvm_flush_remote_tlbs(kvm);
528	spin_unlock(&kvm->mmu_lock);
529}
530
531void _kvm_destroy_mm(struct kvm *kvm)
532{
533	/* It should always be safe to remove after flushing the whole range */
534	WARN_ON(!kvm_flush_gpa_pt(kvm, 0, ~0UL, NULL));
535	pgd_free(NULL, kvm->arch.gpa_mm.pgd);
536	kvm->arch.gpa_mm.pgd = NULL;
537}
538
539/*
540 * Mark a range of guest physical address space old (all accesses fault) in the
541 * VM's GPA page table to allow detection of commonly used pages.
542 */
543
544static int kvm_mkold_pte(pte_t *pte, unsigned long start,
545				 unsigned long end)
546{
547	int ret = 0;
548	int i_min = pte_index(start);
549	int i_max = pte_index(end);
550	int i;
551	pte_t old, new;
552
553	for (i = i_min; i <= i_max; ++i) {
554		if (!pte_present(pte[i]))
555			continue;
556
557		old = pte[i];
558		new = pte_mkold(old);
559		if (pte_val(new) == pte_val(old))
560			continue;
561		set_pte(pte + i, new);
562		ret = 1;
563	}
564
565	return ret;
566}
567
568static int kvm_mkold_pmd(pmd_t *pmd, unsigned long start, unsigned long end)
569{
570	int ret = 0;
571	pte_t *pte;
572	unsigned long cur_end = ~0ul;
573	int i_min = pmd_index(start);
574	int i_max = pmd_index(end);
575	int i;
576	pmd_t old, new;
577
578	for (i = i_min; i <= i_max; ++i, start = 0) {
579		if (!pmd_present(pmd[i]))
580			continue;
581
582		if (pmd_huge(pmd[i])) {
583			old = pmd[i];
584			new = kvm_pmd_mkold(old);
585			if (pmd_val(new) == pmd_val(old))
586				continue;
587			set_pmd(pmd + i, new);
588			ret = 1;
589			continue;
590		}
591
592		pte = pte_offset_kernel(pmd + i, 0);
593		if (i == i_max)
594			cur_end = end;
595
596		ret |= kvm_mkold_pte(pte, start, cur_end);
597	}
598
599	return ret;
600}
601
602static int kvm_mkold_pud(pud_t *pud, unsigned long start, unsigned long end)
603{
604	int ret = 0;
605	pmd_t *pmd;
606	unsigned long cur_end = ~0ul;
607	int i_min = pud_index(start);
608	int i_max = pud_index(end);
609	int i;
610
611	for (i = i_min; i <= i_max; ++i, start = 0) {
612		if (!pud_present(pud[i]))
613			continue;
614
615		pmd = pmd_offset(pud + i, 0);
616		if (i == i_max)
617			cur_end = end;
618
619		ret |= kvm_mkold_pmd(pmd, start, cur_end);
620	}
621
622	return ret;
623}
624
625static int kvm_mkold_pgd(pgd_t *pgd, unsigned long start, unsigned long end)
626{
627	int ret = 0;
628	p4d_t *p4d;
629	pud_t *pud;
630	unsigned long cur_end = ~0ul;
631	int i_min = pgd_index(start);
632	int i_max = pgd_index(end);
633	int i;
634
635	for (i = i_min; i <= i_max; ++i, start = 0) {
636		if (!pgd_present(pgd[i]))
637			continue;
638
639		p4d = p4d_offset(pgd, 0);
640		pud = pud_offset(p4d + i, 0);
641		if (i == i_max)
642			cur_end = end;
643
644		ret |= kvm_mkold_pud(pud, start, cur_end);
645	}
646
647	return ret;
648}
649
650static int handle_hva_to_gpa(struct kvm *kvm,
651			     unsigned long start,
652			     unsigned long end,
653			     int (*handler)(struct kvm *kvm, gfn_t gfn,
654					    gpa_t gfn_end,
655					    struct kvm_memory_slot *memslot,
656					    void *data),
657			     void *data)
658{
659	struct kvm_memslots *slots;
660	struct kvm_memory_slot *memslot;
661	int ret = 0;
662
663	slots = kvm_memslots(kvm);
664
665	/* we only care about the pages that the guest sees */
666	kvm_for_each_memslot(memslot, slots) {
667		unsigned long hva_start, hva_end;
668		gfn_t gfn, gfn_end;
669
670		hva_start = max(start, memslot->userspace_addr);
671		hva_end = min(end, memslot->userspace_addr +
672					(memslot->npages << PAGE_SHIFT));
673		if (hva_start >= hva_end)
674			continue;
675
676		/*
677		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
678		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
679		 */
680		gfn = hva_to_gfn_memslot(hva_start, memslot);
681		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
682		ret |= handler(kvm, gfn, gfn_end, memslot, data);
683	}
684
685	return ret;
686}
687
688
689static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
690				 struct kvm_memory_slot *memslot, void *data)
691{
692	unsigned long npages;
693
694	npages = 0;
695	kvm_flush_gpa_pt(kvm, gfn, gfn_end - 1, &npages);
696	*(unsigned long *)data = *(unsigned long *)data + npages;
697
698	return npages > 0;
699}
700
701int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable)
702{
703	unsigned long npages;
704
705	npages = 0;
706	return handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &npages);
707}
708
709static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
710				struct kvm_memory_slot *memslot, void *data)
711{
712	gpa_t gpa = gfn << PAGE_SHIFT;
713	pte_t hva_pte = *(pte_t *)data;
714	pte_t *gpa_pte = kvm_pte_for_gpa(kvm, NULL, gpa);
715	pte_t old_pte;
716
717	if (!gpa_pte)
718		return 0;
719
720	/* Mapping may need adjusting depending on memslot flags */
721	old_pte = *gpa_pte;
722	if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
723		hva_pte = pte_mkclean(hva_pte);
724	else if (memslot->flags & KVM_MEM_READONLY)
725		hva_pte = pte_wrprotect(hva_pte);
726
727	set_pte(gpa_pte, hva_pte);
728
729	/* Replacing an absent or old page doesn't need flushes */
730	if (!pte_present(old_pte) || !pte_young(old_pte))
731		return 0;
732
733	/* Pages swapped, aged, moved, or cleaned require flushes */
734	return !pte_present(hva_pte) ||
735	       !pte_young(hva_pte) ||
736	       pte_pfn(old_pte) != pte_pfn(hva_pte) ||
737	       (pte_dirty(old_pte) && !pte_dirty(hva_pte));
738}
739
740int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
741{
742	unsigned long end = hva + PAGE_SIZE;
743	int ret;
744
745	ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
746	if (ret)
747		/* Flush vpid for each VCPU individually */
748		kvm_flush_remote_tlbs(kvm);
749	return 0;
750}
751
752static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
753			       struct kvm_memory_slot *memslot, void *data)
754{
755	return kvm_mkold_pgd(kvm->arch.gpa_mm.pgd, gfn << PAGE_SHIFT,
756				gfn_end << PAGE_SHIFT);
757}
758
759static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
760				    struct kvm_memory_slot *memslot, void *data)
761{
762	gpa_t gpa = gfn << PAGE_SHIFT;
763	pte_t *gpa_pte = kvm_pte_for_gpa(kvm, NULL, gpa);
764
765	if (!gpa_pte)
766		return 0;
767	return pte_young(*gpa_pte);
768}
769
770int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
771{
772	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
773}
774
775int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
776{
777	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
778}
779
780static pud_t *kvm_get_pud(struct kvm *kvm,
781		 struct kvm_mmu_memory_cache *cache, phys_addr_t addr)
782{
783	pgd_t *pgd;
784
785	pgd = kvm->arch.gpa_mm.pgd + pgd_index(addr);
786	if (pgd_none(*pgd)) {
787		/* Not used yet */
788		BUG();
789		return NULL;
790	}
791
792	return pud_offset(p4d_offset(pgd, addr), addr);
793}
794
795static pmd_t *kvm_get_pmd(struct kvm *kvm,
796		 struct kvm_mmu_memory_cache *cache, phys_addr_t addr)
797{
798	pud_t *pud;
799	pmd_t *pmd;
800
801	pud = kvm_get_pud(kvm, cache, addr);
802	if (!pud || pud_huge(*pud))
803		return NULL;
804
805	if (pud_none(*pud)) {
806		if (!cache)
807			return NULL;
808		pmd = kvm_mmu_memory_cache_alloc(cache);
809		pmd_init((unsigned long)pmd,
810				(unsigned long)invalid_pte_table);
811		pud_populate(NULL, pud, pmd);
812	}
813
814	return pmd_offset(pud, addr);
815}
816
817static int kvm_set_pmd_huge(struct kvm_vcpu *vcpu, struct kvm_mmu_memory_cache
818			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
819{
820	pmd_t *pmd, old_pmd;
821
822retry:
823	pmd = kvm_get_pmd(vcpu->kvm, cache, addr);
824	VM_BUG_ON(!pmd);
825
826	old_pmd = *pmd;
827	/*
828	 * Multiple vcpus faulting on the same PMD entry, can
829	 * lead to them sequentially updating the PMD with the
830	 * same value. Following the break-before-make
831	 * (pmd_clear() followed by tlb_flush()) process can
832	 * hinder forward progress due to refaults generated
833	 * on missing translations.
834	 *
835	 * Skip updating the page table if the entry is
836	 * unchanged.
837	 */
838	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
839		return 0;
840
841	if (pmd_present(old_pmd)) {
842		/*
843		 * If we already have PTE level mapping for this block,
844		 * we must unmap it to avoid inconsistent TLB state and
845		 * leaking the table page. We could end up in this situation
846		 * if the memory slot was marked for dirty logging and was
847		 * reverted, leaving PTE level mappings for the pages accessed
848		 * during the period. So, unmap the PTE level mapping for this
849		 * block and retry, as we could have released the upper level
850		 * table in the process.
851		 *
852		 * Normal THP split/merge follows the mmu_notifier callbacks and
853		 * gets handled accordingly.
854		 */
855		if (!pmd_huge(old_pmd)) {
856			++vcpu->stat.huge_merge_exits;
857			kvm_flush_gpa_pt(vcpu->kvm,
858				(addr & PMD_MASK) >> PAGE_SHIFT,
859				((addr & PMD_MASK) + PMD_SIZE - 1) >> PAGE_SHIFT, NULL);
860			goto retry;
861		}
862		/*
863		 * Mapping in huge pages should only happen through a
864		 * fault.  If a page is merged into a transparent huge
865		 * page, the individual subpages of that huge page
866		 * should be unmapped through MMU notifiers before we
867		 * get here.
868		 *
869		 * Merging of CompoundPages is not supported; they
870		 * should first be split, unmapped, merged,
871		 * and mapped back in on-demand.
872		 */
873#ifdef CONFIG_TRANSPARENT_HUGEPAGE
874		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
875#endif
876		pmd_clear(pmd);
877	}
878
879	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
880	set_pmd(pmd, *new_pmd);
881	return 0;
882}
883
884/*
885 * Adjust the pfn start boundary when the fault is backed by a transparent hugepage
886 */
887static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, unsigned long *gpap)
888{
889	kvm_pfn_t pfn = *pfnp;
890	gfn_t gfn = *gpap >> PAGE_SHIFT;
891	struct page *page = pfn_to_page(pfn);
892
893	/*
894	 * PageTransCompound() returns true for both THP and
895	 * hugetlbfs. Make sure the adjustment is done only for THP
896	 * pages.
897	 */
898	if ((!PageHuge(page)) && PageTransCompound(page) &&
899			 (atomic_read(&page->_mapcount) < 0)) {
900		unsigned long mask;
901		/*
902		 * The address we faulted on is backed by a transparent huge
903		 * page.  However, because we map the compound huge page and
904		 * not the individual tail page, we need to transfer the
905		 * refcount to the head page.  We have to be careful that the
906		 * THP doesn't start to split while we are adjusting the
907		 * refcounts.
908		 *
909		 * We are sure this doesn't happen, because mmu_notifier_retry
910		 * was successful and we are holding the mmu_lock, so if this
911		 * THP is trying to split, it will be blocked in the mmu
912		 * notifier before touching any of the pages, specifically
913		 * before being able to call __split_huge_page_refcount().
914		 *
915		 * We can therefore safely transfer the refcount from PG_tail
916		 * to PG_head and switch the pfn from a tail page to the head
917		 * page accordingly.
918		 */
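		/*
		 * E.g. a fault on head_pfn + 3 drops that tail pfn reference,
		 * takes one on head_pfn instead, and rounds *gpap down to the
		 * PMD-sized block so the whole huge page gets mapped.
		 */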
919		mask = PTRS_PER_PMD - 1;
920		VM_BUG_ON((gfn & mask) != (pfn & mask));
921		if (pfn & mask) {
922			*gpap &= PMD_MASK;
923			kvm_release_pfn_clean(pfn);
924			pfn &= ~mask;
925			kvm_get_pfn(pfn);
926			*pfnp = pfn;
927		}
928
929		return true;
930	}
931
932	return false;
933}
934
935static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
936					unsigned long hva, bool write)
937{
938	gpa_t gpa_start;
939	hva_t uaddr_start, uaddr_end;
940	unsigned long map_size;
941	size_t size;
942
943	map_size = PMD_SIZE;
944	/* Disable dirty logging on HugePages */
945	if ((memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) && write)
946		return false;
947
948	size = memslot->npages * PAGE_SIZE;
949	gpa_start = memslot->base_gfn << PAGE_SHIFT;
950	uaddr_start = memslot->userspace_addr;
951	uaddr_end = uaddr_start + size;
952
953	/*
954	 * Pages belonging to memslots that don't have the same alignment
955	 * within a PMD/PUD for userspace and GPA cannot be mapped with stage-2
956	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
957	 *
958	 * Consider a layout like the following:
959	 *
960	 *    memslot->userspace_addr:
961	 *    +-----+--------------------+--------------------+---+
962	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
963	 *    +-----+--------------------+--------------------+---+
964	 *
965	 *    memslot->base_gfn << PAGE_SHIFT:
966	 *      +---+--------------------+--------------------+-----+
967	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
968	 *      +---+--------------------+--------------------+-----+
969	 *
970	 * If we create those stage-2 blocks, we'll end up with this incorrect
971	 * mapping:
972	 *   d -> f
973	 *   e -> g
974	 *   f -> h
975	 */
976	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
977		return false;
978
979	/*
980	 * Next, let's make sure we're not trying to map anything not covered
981	 * by the memslot. This means we have to prohibit block size mappings
982	 * for the beginning and end of a non-block aligned and non-block sized
983	 * memory slot (illustrated by the head and tail parts of the
984	 * userspace view above containing pages 'abcde' and 'xyz',
985	 * respectively).
986	 *
987	 * Note that it doesn't matter if we do the check using the
988	 * userspace_addr or the base_gfn, as both are equally aligned (per
989	 * the check above) and equally sized.
990	 */
991	return (hva & ~(map_size - 1)) >= uaddr_start &&
992	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
993}
994
995/**
996 * kvm_map_page_fast() - Fast path GPA fault handler.
997 * @vcpu:		VCPU pointer.
998 * @gpa:		Guest physical address of fault.
999 * @write:	Whether the fault was due to a write.
1000 *
1001 * Perform fast path GPA fault handling, doing all that can be done without
1002 * calling into KVM. This handles marking old pages young (for idle page
1003 * tracking), and dirtying of clean pages (for dirty page logging).
1004 *
1005 * Returns:	0 on success, in which case we can update derived mappings and
1006 *		resume guest execution.
1007 *		-EFAULT on failure due to absent GPA mapping or write to
1008 *		read-only page, in which case KVM must be consulted.
1009 */
1010static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa,
1011				   bool write)
1012{
1013	struct kvm *kvm = vcpu->kvm;
1014	gfn_t gfn = gpa >> PAGE_SHIFT;
1015	pte_t *ptep;
1016	kvm_pfn_t pfn = 0;	/* silence bogus GCC warning */
1017	bool pfn_valid = false;
1018	int ret = 0;
1019	struct kvm_memory_slot *slot;
1020
1021	spin_lock(&kvm->mmu_lock);
1022
1023	/* Fast path - just check GPA page table for an existing entry */
1024	ptep = kvm_pte_for_gpa(kvm, NULL, gpa);
1025	if (!ptep || !pte_present(*ptep)) {
1026		ret = -EFAULT;
1027		goto out;
1028	}
1029
1030	/* Track access to pages marked old */
1031	if (!pte_young(*ptep)) {
1032		set_pte(ptep, pte_mkyoung(*ptep));
1033		pfn = pte_pfn(*ptep);
1034		pfn_valid = true;
1035		/* call kvm_set_pfn_accessed() after unlock */
1036	}
1037	if (write && !pte_dirty(*ptep)) {
1038		if (!pte_write(*ptep)) {
1039			ret = -EFAULT;
1040			goto out;
1041		}
1042
1043		if (kvm_pte_huge(*ptep)) {
1044			/*
1045			 * Do not set write permission when dirty logging is
1046			 * enabled for HugePages
1047			 */
1048			slot = gfn_to_memslot(kvm, gfn);
1049			if (slot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1050				ret = -EFAULT;
1051				goto out;
1052			}
1053		}
1054
1055		/* Track dirtying of writeable pages */
1056		set_pte(ptep, pte_mkdirty(*ptep));
1057		pfn = pte_pfn(*ptep);
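		/* A huge mapping dirties every base page it covers in the log */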
1058		if (pmd_huge(*((pmd_t *)ptep))) {
1059			int i;
1060			gfn_t base_gfn = (gpa & PMD_MASK) >> PAGE_SHIFT;
1061
1062			for (i = 0; i < PTRS_PER_PTE; i++)
1063				mark_page_dirty(kvm, base_gfn + i);
1064		} else
1065			mark_page_dirty(kvm, gfn);
1066		kvm_set_pfn_dirty(pfn);
1067	}
1068
1069out:
1070	spin_unlock(&kvm->mmu_lock);
1071	if (pfn_valid)
1072		kvm_set_pfn_accessed(pfn);
1073	return ret;
1074}
1075
1076/*
1077 * Split huge page
1078 */
1079static pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, pte_t *ptep, gfn_t gfn,
1080		struct vm_area_struct *vma, unsigned long hva)
1081{
1082	int i;
1083	pte_t val, *child;
1084	struct kvm_mmu_memory_cache *memcache;
1085
1086	memcache = &vcpu->arch.mmu_page_cache;
1087	child = kvm_mmu_memory_cache_alloc(memcache);
1088	val = kvm_pte_mksmall(*ptep);
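	/*
	 * Fill the child table with base page mappings covering the old huge
	 * mapping; each entry advances the physical address by one page.
	 */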
1089	for (i = 0; i < PTRS_PER_PTE; i++) {
1090		kvm_set_pte(child + i, val);
1091		pte_val(val) += PAGE_SIZE;
1092	}
1093
1094	/* The later kvm_tlb_flush_gpa() will flush the huge page TLB entry */
1095	pte_val(val) = (unsigned long)child;
1096	kvm_set_pte(ptep, val);
1097	return child + (gfn & (PTRS_PER_PTE - 1));
1098}
1099
1100/**
1101 * kvm_map_page() - Map a guest physical page.
1102 * @vcpu:		VCPU pointer.
1103 * @gpa:		Guest physical address of fault.
1104 * @write:	Whether the fault was due to a write.
1105 *
1106 * Handle GPA faults by creating a new GPA mapping (or updating an existing
1107 * one).
1108 *
1109 * This takes care of marking pages young or dirty (idle/dirty page tracking),
1110 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
1111 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
1112 * caller.
1113 *
1114 * Returns:	0 on success, in which case the GPA page table has been updated
1115 *		and the caller may update derived mappings and resume guest
1116 *		execution.
1117 *		-EFAULT if there is no memory region at @gpa or a write was
1118 *		attempted to a read-only memory region. This is usually handled
1119 *		as an MMIO access.
1120 */
1121static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
1122			     bool write)
1123{
1124	bool writeable;
1125	bool force_pte = false;
1126	int i, srcu_idx, err = 0, retry_no = 0;
1127	unsigned long hva;
1128	unsigned long mmu_seq;
1129	unsigned long prot_bits;
1130	unsigned long vma_pagesize;
1131	pte_t *ptep;
1132	kvm_pfn_t pfn;
1133	gfn_t gfn = gpa >> PAGE_SHIFT;
1134	struct vm_area_struct *vma;
1135	struct kvm *kvm = vcpu->kvm;
1136	struct kvm_memory_slot *memslot;
1137	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1138
1139	/* Try the fast path to handle old / clean pages */
1140	srcu_idx = srcu_read_lock(&kvm->srcu);
1141	err = kvm_map_page_fast(vcpu, gpa, write);
1142	if (!err)
1143		goto out;
1144
1145	memslot = gfn_to_memslot(kvm, gfn);
1146	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
1147	if (kvm_is_error_hva(hva) || (write && !writeable))
1148		goto out;
1149
1150	/* Let's check if we will get back a huge page backed by hugetlbfs */
1151	mmap_read_lock(current->mm);
1152	vma = find_vma_intersection(current->mm, hva, hva + 1);
1153	if (unlikely(!vma)) {
1154		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1155		mmap_read_unlock(current->mm);
1156		err = -EFAULT;
1157		goto out;
1158	}
1159
1160	vma_pagesize = vma_kernel_pagesize(vma);
1161	if ((vma_pagesize == PMD_SIZE) &&
1162		!fault_supports_huge_mapping(memslot, hva, write)) {
1163		force_pte = true;
1164		vma_pagesize = PAGE_SIZE;
1165		++vcpu->stat.huge_dec_exits;
1166	}
1167
1168	/* PMD is not folded, adjust gfn to new boundary */
1169	if (vma_pagesize == PMD_SIZE)
1170		gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1171
1172	mmap_read_unlock(current->mm);
1173
1174	/* We need a minimum of cached pages ready for page table creation */
1175	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
1176	if (err)
1177		goto out;
1178
1179retry:
1180	/*
1181	 * Used to check for invalidations in progress, of the pfn that is
1182	 * returned by gfn_to_pfn_prot below.
1183	 */
1184	mmu_seq = kvm->mmu_notifier_seq;
1185	/*
1186	 * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
1187	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
1188	 * risk the page we get a reference to getting unmapped before we have a
1189	 * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
1190	 *
1191	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
1192	 * of the pte_unmap_unlock() after the PTE is zapped, and the
1193	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
1194	 * mmu_notifier_seq is incremented.
1195	 */
1196	smp_rmb();
1197
1198	/* Slow path - ask KVM core whether we can access this GPA */
1199	pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
1200	if (is_error_noslot_pfn(pfn)) {
1201		err = -EFAULT;
1202		goto out;
1203	}
1204
1205	spin_lock(&kvm->mmu_lock);
1206	/* Check if an invalidation has taken place since we got pfn */
1207	if (mmu_notifier_retry(kvm, mmu_seq)) {
1208		/*
1209		 * This can happen when mappings are changed asynchronously, but
1210		 * also synchronously if a COW is triggered by
1211		 * gfn_to_pfn_prot().
1212		 */
1213		spin_unlock(&kvm->mmu_lock);
1214		kvm_set_pfn_accessed(pfn);
1215		kvm_release_pfn_clean(pfn);
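		/* Yield occasionally if we keep racing with invalidations */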
1216		if (retry_no > 100) {
1217			retry_no = 0;
1218			schedule();
1219		}
1220		retry_no++;
1221		goto retry;
1222	}
1223
1224	if (vma_pagesize == PAGE_SIZE && !force_pte) {
1225		/*
1226		 * Only PMD_SIZE transparent hugepages(THP) are
1227		 * Only PMD_SIZE transparent hugepages (THP) are
1228		 * updated to support other THP sizes.
1229		 *
1230		 * Make sure the host VA and the guest IPA are sufficiently
1231		 * aligned and that the block is contained within the memslot.
1232		 */
1233		++vcpu->stat.huge_thp_exits;
1234		if (fault_supports_huge_mapping(memslot, hva, write) &&
1235		    transparent_hugepage_adjust(&pfn, &gpa)) {
1236			++vcpu->stat.huge_adjust_exits;
1237			vma_pagesize = PMD_SIZE;
1238		}
1239	}
1240
1241	/* Set up the prot bits */
1242	prot_bits = _PAGE_PRESENT | __READABLE;
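	/* MMIO-like VMAs get an uncached mapping, normal memory coherent cached */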
1243	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1244		prot_bits |= _CACHE_SUC;
1245	else
1246		prot_bits |= _CACHE_CC;
1247
1248	if (writeable) {
1249		prot_bits |= _PAGE_WRITE;
1250		if (write) {
1251			prot_bits |= __WRITEABLE;
1252			mark_page_dirty(kvm, gfn);
1253			kvm_set_pfn_dirty(pfn);
1254		}
1255	}
1256
1257	if (vma_pagesize == PMD_SIZE) {
1258		pmd_t new_pmd = pfn_pmd(pfn, __pgprot(prot_bits));
1259		new_pmd = pmd_mkhuge(new_pmd);
1260		if (writeable && write) {
1261			gfn_t base_gfn = (gpa & PMD_MASK) >> PAGE_SHIFT;
1262			for (i = 0; i < PTRS_PER_PTE; i++)
1263				mark_page_dirty(kvm, base_gfn + i);
1264		}
1265
1266		++vcpu->stat.huge_set_exits;
1267		kvm_set_pmd_huge(vcpu, memcache, gpa, &new_pmd);
1268	} else {
1269		pte_t new_pte = pfn_pte(pfn, __pgprot(prot_bits));
1270		if (writeable && write)
1271			mark_page_dirty(kvm, gfn);
1272
1273		/* Ensure page tables are allocated */
1274		ptep = kvm_pte_for_gpa(kvm, memcache, gpa);
1275		if (ptep && kvm_pte_huge(*ptep) && write)
1276			ptep = kvm_split_huge(vcpu, ptep, gfn, vma, hva);
1277
1278		set_pte(ptep, new_pte);
1279		err = 0;
1280	}
1281
1282	spin_unlock(&kvm->mmu_lock);
1283	kvm_release_pfn_clean(pfn);
1284	kvm_set_pfn_accessed(pfn);
1285out:
1286	srcu_read_unlock(&kvm->srcu, srcu_idx);
1287	return err;
1288}
1289
1290int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv,
1291				      bool write)
1292{
1293	int ret;
1294
1295	ret = kvm_map_page(vcpu, badv, write);
1296	if (ret)
1297		return ret;
1298
1299	/* Invalidate this entry in the TLB */
1300	return kvm_tlb_flush_gpa(vcpu, badv);
1301}
1302
1303/**
1304 * kvm_flush_tlb_all() - Flush all root TLB entries for
1305 * guests.
1306 *
1307 * Invalidate all entries including GVA-->GPA and GPA-->HPA mappings.
1308 */
1309void kvm_flush_tlb_all(void)
1310{
1311	unsigned long flags;
1312
1313	local_irq_save(flags);
1314	invtlb_all(INVTLB_ALLGID, 0, 0);
1315	local_irq_restore(flags);
1316}
1317