xref: /kernel/linux/linux-6.6/arch/x86/kvm/mmu/mmu.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * MMU support
9 *
10 * Copyright (C) 2006 Qumranet, Inc.
11 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12 *
13 * Authors:
14 *   Yaniv Kamay  <yaniv@qumranet.com>
15 *   Avi Kivity   <avi@qumranet.com>
16 */
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19#include "irq.h"
20#include "ioapic.h"
21#include "mmu.h"
22#include "mmu_internal.h"
23#include "tdp_mmu.h"
24#include "x86.h"
25#include "kvm_cache_regs.h"
26#include "smm.h"
27#include "kvm_emulate.h"
28#include "page_track.h"
29#include "cpuid.h"
30#include "spte.h"
31
32#include <linux/kvm_host.h>
33#include <linux/types.h>
34#include <linux/string.h>
35#include <linux/mm.h>
36#include <linux/highmem.h>
37#include <linux/moduleparam.h>
38#include <linux/export.h>
39#include <linux/swap.h>
40#include <linux/hugetlb.h>
41#include <linux/compiler.h>
42#include <linux/srcu.h>
43#include <linux/slab.h>
44#include <linux/sched/signal.h>
45#include <linux/uaccess.h>
46#include <linux/hash.h>
47#include <linux/kern_levels.h>
48#include <linux/kstrtox.h>
49#include <linux/kthread.h>
50
51#include <asm/page.h>
52#include <asm/memtype.h>
53#include <asm/cmpxchg.h>
54#include <asm/io.h>
55#include <asm/set_memory.h>
56#include <asm/vmx.h>
57
58#include "trace.h"
59
60extern bool itlb_multihit_kvm_mitigation;
61
62static bool nx_hugepage_mitigation_hard_disabled;
63
64int __read_mostly nx_huge_pages = -1;
65static uint __read_mostly nx_huge_pages_recovery_period_ms;
66#ifdef CONFIG_PREEMPT_RT
67/* Recovery can cause latency spikes; disable it for PREEMPT_RT. */
68static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
69#else
70static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
71#endif
72
73static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
74static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
75static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
76
77static const struct kernel_param_ops nx_huge_pages_ops = {
78	.set = set_nx_huge_pages,
79	.get = get_nx_huge_pages,
80};
81
82static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
83	.set = set_nx_huge_pages_recovery_param,
84	.get = param_get_uint,
85};
86
87module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
88__MODULE_PARM_TYPE(nx_huge_pages, "bool");
89module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
90		&nx_huge_pages_recovery_ratio, 0644);
91__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
92module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
93		&nx_huge_pages_recovery_period_ms, 0644);
94__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
95
96static bool __read_mostly force_flush_and_sync_on_reuse;
97module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
98
99/*
100 * Setting this variable to true enables Two-Dimensional Paging, where the
101 * hardware walks 2 page tables:
102 * 1. the guest-virtual to guest-physical
103 * 2. while doing 1., it also walks guest-physical to host-physical
104 * If the hardware supports that, we don't need to do shadow paging.
105 */
106bool tdp_enabled = false;
107
108static bool __ro_after_init tdp_mmu_allowed;
109
110#ifdef CONFIG_X86_64
111bool __read_mostly tdp_mmu_enabled = true;
112module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
113#endif
114
115static int max_huge_page_level __read_mostly;
116static int tdp_root_level __read_mostly;
117static int max_tdp_level __read_mostly;
118
119#define PTE_PREFETCH_NUM		8
120
121#include <trace/events/kvm.h>
122
123/* make pte_list_desc fit well in cache lines */
124#define PTE_LIST_EXT 14
125
126/*
127 * struct pte_list_desc is the core data structure used to implement a custom
128 * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
129 * given GFN when used in the context of rmaps.  Using a custom list allows KVM
130 * to optimize for the common case where many GFNs will have at most a handful
131 * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
132 * memory footprint, which in turn improves runtime performance by exploiting
133 * cache locality.
134 *
135 * A list is comprised of one or more pte_list_desc objects (descriptors).
136 * Each individual descriptor stores up to PTE_LIST_EXT SPTEs.  If a descriptor
137 * is full and a new SPTEs needs to be added, a new descriptor is allocated and
138 * becomes the head of the list.  This means that by definitions, all tail
139 * descriptors are full.
140 *
141 * Note, the metadata fields are deliberately placed at the start of the
142 * structure to optimize the cacheline layout; accessing the descriptor will
143 * touch only a single cacheline so long as @spte_count <= 6 (or if only the
144 * descriptor's metadata is accessed).
145 */
146struct pte_list_desc {
147	struct pte_list_desc *more;
148	/* The number of PTEs stored in _this_ descriptor. */
149	u32 spte_count;
150	/* The number of PTEs stored in all tails of this descriptor. */
151	u32 tail_count;
152	u64 *sptes[PTE_LIST_EXT];
153};
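/*
 * For illustration, assuming 64-bit pointers and 64-byte cachelines: the
 * descriptor is 8 (more) + 4 (spte_count) + 4 (tail_count) + 14 * 8 (sptes)
 * = 128 bytes, i.e. exactly two cachelines, and the 16 bytes of metadata plus
 * the first six SPTE pointers (48 bytes) fill the first cacheline, which is
 * where the "@spte_count <= 6" remark above comes from.  A hypothetical
 * compile-time check of this layout could be:
 *
 *	BUILD_BUG_ON(sizeof(struct pte_list_desc) != 2 * L1_CACHE_BYTES);
 */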
154
155struct kvm_shadow_walk_iterator {
156	u64 addr;
157	hpa_t shadow_addr;
158	u64 *sptep;
159	int level;
160	unsigned index;
161};
162
163#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
164	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
165					 (_root), (_addr));                \
166	     shadow_walk_okay(&(_walker));			           \
167	     shadow_walk_next(&(_walker)))
168
169#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
170	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
171	     shadow_walk_okay(&(_walker));			\
172	     shadow_walk_next(&(_walker)))
173
174#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
175	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
176	     shadow_walk_okay(&(_walker)) &&				\
177		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
178	     __shadow_walk_next(&(_walker), spte))
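/*
 * A hypothetical usage sketch of the walkers above (not taken from this
 * file): walking the shadow page tables for a guest address, typically under
 * mmu_lock, looks like
 *
 *	struct kvm_shadow_walk_iterator it;
 *
 *	for_each_shadow_entry(vcpu, addr, it) {
 *		u64 spte = *it.sptep;
 *
 *		if (!is_shadow_present_pte(spte))
 *			break;
 *	}
 *
 * where "addr" and the loop body are placeholders.
 */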
179
180static struct kmem_cache *pte_list_desc_cache;
181struct kmem_cache *mmu_page_header_cache;
182static struct percpu_counter kvm_total_used_mmu_pages;
183
184static void mmu_spte_set(u64 *sptep, u64 spte);
185
186struct kvm_mmu_role_regs {
187	const unsigned long cr0;
188	const unsigned long cr4;
189	const u64 efer;
190};
191
192#define CREATE_TRACE_POINTS
193#include "mmutrace.h"
194
195/*
196 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
197 * reading from the role_regs.  Once the root_role is constructed, it becomes
198 * the single source of truth for the MMU's state.
199 */
200#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
201static inline bool __maybe_unused					\
202____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
203{									\
204	return !!(regs->reg & flag);					\
205}
206BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
207BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
208BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
209BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
210BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
211BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
212BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
213BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
214BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
215BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
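/*
 * For reference, each invocation above expands mechanically to an accessor;
 * taking BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG) as the example:
 *
 *	static inline bool __maybe_unused
 *	____is_cr0_pg(const struct kvm_mmu_role_regs *regs)
 *	{
 *		return !!(regs->cr0 & X86_CR0_PG);
 *	}
 */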
216
217/*
218 * The MMU itself (with a valid role) is the single source of truth for the
219 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
220 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
221 * and the vCPU may be incorrect/irrelevant.
222 */
223#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)		\
224static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
225{								\
226	return !!(mmu->cpu_role. base_or_ext . reg##_##name);	\
227}
228BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
229BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
230BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
231BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
232BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
233BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
234BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
235BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
236
237static inline bool is_cr0_pg(struct kvm_mmu *mmu)
238{
239        return mmu->cpu_role.base.level > 0;
240}
241
242static inline bool is_cr4_pae(struct kvm_mmu *mmu)
243{
244        return !mmu->cpu_role.base.has_4_byte_gpte;
245}
246
247static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
248{
249	struct kvm_mmu_role_regs regs = {
250		.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
251		.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
252		.efer = vcpu->arch.efer,
253	};
254
255	return regs;
256}
257
258static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
259{
260	return kvm_read_cr3(vcpu);
261}
262
263static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
264						  struct kvm_mmu *mmu)
265{
266	if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
267		return kvm_read_cr3(vcpu);
268
269	return mmu->get_guest_pgd(vcpu);
270}
271
272static inline bool kvm_available_flush_remote_tlbs_range(void)
273{
274	return kvm_x86_ops.flush_remote_tlbs_range;
275}
276
277int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
278{
279	if (!kvm_x86_ops.flush_remote_tlbs_range)
280		return -EOPNOTSUPP;
281
282	return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
283}
284
285static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
286
287/* Flush the range of guest memory mapped by the given SPTE. */
288static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
289{
290	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
291	gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
292
293	kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
294}
295
296static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
297			   unsigned int access)
298{
299	u64 spte = make_mmio_spte(vcpu, gfn, access);
300
301	trace_mark_mmio_spte(sptep, gfn, spte);
302	mmu_spte_set(sptep, spte);
303}
304
305static gfn_t get_mmio_spte_gfn(u64 spte)
306{
307	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
308
309	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
310	       & shadow_nonpresent_or_rsvd_mask;
311
312	return gpa >> PAGE_SHIFT;
313}
314
315static unsigned get_mmio_spte_access(u64 spte)
316{
317	return spte & shadow_mmio_access_mask;
318}
319
320static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
321{
322	u64 kvm_gen, spte_gen, gen;
323
324	gen = kvm_vcpu_memslots(vcpu)->generation;
325	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
326		return false;
327
328	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
329	spte_gen = get_mmio_spte_generation(spte);
330
331	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
332	return likely(kvm_gen == spte_gen);
333}
334
335static int is_cpuid_PSE36(void)
336{
337	return 1;
338}
339
340#ifdef CONFIG_X86_64
341static void __set_spte(u64 *sptep, u64 spte)
342{
343	WRITE_ONCE(*sptep, spte);
344}
345
346static void __update_clear_spte_fast(u64 *sptep, u64 spte)
347{
348	WRITE_ONCE(*sptep, spte);
349}
350
351static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
352{
353	return xchg(sptep, spte);
354}
355
356static u64 __get_spte_lockless(u64 *sptep)
357{
358	return READ_ONCE(*sptep);
359}
360#else
361union split_spte {
362	struct {
363		u32 spte_low;
364		u32 spte_high;
365	};
366	u64 spte;
367};
368
369static void count_spte_clear(u64 *sptep, u64 spte)
370{
371	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
372
373	if (is_shadow_present_pte(spte))
374		return;
375
376	/* Ensure the spte is completely set before we increase the count */
377	smp_wmb();
378	sp->clear_spte_count++;
379}
380
381static void __set_spte(u64 *sptep, u64 spte)
382{
383	union split_spte *ssptep, sspte;
384
385	ssptep = (union split_spte *)sptep;
386	sspte = (union split_spte)spte;
387
388	ssptep->spte_high = sspte.spte_high;
389
390	/*
391	 * If we map the spte from nonpresent to present, we should store
392	 * the high bits first, then set the present bit, so the CPU cannot
393	 * fetch this spte while we are setting it.
394	 */
395	smp_wmb();
396
397	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
398}
399
400static void __update_clear_spte_fast(u64 *sptep, u64 spte)
401{
402	union split_spte *ssptep, sspte;
403
404	ssptep = (union split_spte *)sptep;
405	sspte = (union split_spte)spte;
406
407	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
408
409	/*
410	 * If we map the spte from present to nonpresent, we should clear the
411	 * present bit first to avoid the vCPU fetching the stale high bits.
412	 */
413	smp_wmb();
414
415	ssptep->spte_high = sspte.spte_high;
416	count_spte_clear(sptep, spte);
417}
418
419static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
420{
421	union split_spte *ssptep, sspte, orig;
422
423	ssptep = (union split_spte *)sptep;
424	sspte = (union split_spte)spte;
425
426	/* xchg acts as a barrier before the setting of the high bits */
427	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
428	orig.spte_high = ssptep->spte_high;
429	ssptep->spte_high = sspte.spte_high;
430	count_spte_clear(sptep, spte);
431
432	return orig.spte;
433}
434
435/*
436 * The idea of using a lightweight way to get the spte on x86_32 comes from
437 * gup_get_pte (mm/gup.c).
438 *
439 * An spte TLB flush may be pending, because kvm_set_pte_rmap
440 * coalesces them and we are running outside of the MMU lock.  Therefore
441 * we need to protect against in-progress updates of the spte.
442 *
443 * Reading the spte while an update is in progress may get the old value
444 * for the high part of the spte.  The race is fine for a present->non-present
445 * change (because the high part of the spte is ignored for non-present spte),
446 * but for a present->present change we must reread the spte.
447 *
448 * All such changes are done in two steps (present->non-present and
449 * non-present->present), hence it is enough to count the number of
450 * present->non-present updates: if it changed while reading the spte,
451 * we might have hit the race.  This is done using clear_spte_count.
452 */
453static u64 __get_spte_lockless(u64 *sptep)
454{
455	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
456	union split_spte spte, *orig = (union split_spte *)sptep;
457	int count;
458
459retry:
460	count = sp->clear_spte_count;
461	smp_rmb();
462
463	spte.spte_low = orig->spte_low;
464	smp_rmb();
465
466	spte.spte_high = orig->spte_high;
467	smp_rmb();
468
469	if (unlikely(spte.spte_low != orig->spte_low ||
470	      count != sp->clear_spte_count))
471		goto retry;
472
473	return spte.spte;
474}
475#endif
476
477/* Rules for using mmu_spte_set:
478 * Set the sptep from nonpresent to present.
479 * Note: the sptep being assigned *must* be either not present
480 * or in a state where the hardware will not attempt to update
481 * the spte.
482 */
483static void mmu_spte_set(u64 *sptep, u64 new_spte)
484{
485	WARN_ON_ONCE(is_shadow_present_pte(*sptep));
486	__set_spte(sptep, new_spte);
487}
488
489/*
490 * Update the SPTE (excluding the PFN), but do not track changes in its
491 * accessed/dirty status.
492 */
493static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
494{
495	u64 old_spte = *sptep;
496
497	WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
498	check_spte_writable_invariants(new_spte);
499
500	if (!is_shadow_present_pte(old_spte)) {
501		mmu_spte_set(sptep, new_spte);
502		return old_spte;
503	}
504
505	if (!spte_has_volatile_bits(old_spte))
506		__update_clear_spte_fast(sptep, new_spte);
507	else
508		old_spte = __update_clear_spte_slow(sptep, new_spte);
509
510	WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
511
512	return old_spte;
513}
514
515/* Rules for using mmu_spte_update:
516 * Update the state bits; the mapped pfn is not changed.
517 *
518 * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
519 * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
520 * spte, even though the writable spte might be cached on a CPU's TLB.
521 *
522 * Returns true if the TLB needs to be flushed
523 */
524static bool mmu_spte_update(u64 *sptep, u64 new_spte)
525{
526	bool flush = false;
527	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
528
529	if (!is_shadow_present_pte(old_spte))
530		return false;
531
532	/*
533	 * Updating the spte outside of the mmu-lock is safe, since
534	 * we always update it atomically; see the comments in
535	 * spte_has_volatile_bits().
536	 */
537	if (is_mmu_writable_spte(old_spte) &&
538	      !is_writable_pte(new_spte))
539		flush = true;
540
541	/*
542	 * Flush TLB when accessed/dirty states are changed in the page tables,
543	 * to guarantee consistency between TLB and page tables.
544	 */
545
546	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
547		flush = true;
548		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
549	}
550
551	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
552		flush = true;
553		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
554	}
555
556	return flush;
557}
558
559/*
560 * Rules for using mmu_spte_clear_track_bits:
561 * It sets the sptep from present to nonpresent and tracks the
562 * state bits; it is used to clear the last-level sptep.
563 * Returns the old PTE.
564 */
565static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
566{
567	kvm_pfn_t pfn;
568	u64 old_spte = *sptep;
569	int level = sptep_to_sp(sptep)->role.level;
570	struct page *page;
571
572	if (!is_shadow_present_pte(old_spte) ||
573	    !spte_has_volatile_bits(old_spte))
574		__update_clear_spte_fast(sptep, 0ull);
575	else
576		old_spte = __update_clear_spte_slow(sptep, 0ull);
577
578	if (!is_shadow_present_pte(old_spte))
579		return old_spte;
580
581	kvm_update_page_stats(kvm, level, -1);
582
583	pfn = spte_to_pfn(old_spte);
584
585	/*
586	 * KVM doesn't hold a reference to any pages mapped into the guest, and
587	 * instead uses the mmu_notifier to ensure that KVM unmaps any pages
588	 * before they are reclaimed.  Sanity check that, if the pfn is backed
589	 * by a refcounted page, the refcount is elevated.
590	 */
591	page = kvm_pfn_to_refcounted_page(pfn);
592	WARN_ON_ONCE(page && !page_count(page));
593
594	if (is_accessed_spte(old_spte))
595		kvm_set_pfn_accessed(pfn);
596
597	if (is_dirty_spte(old_spte))
598		kvm_set_pfn_dirty(pfn);
599
600	return old_spte;
601}
602
603/*
604 * Rules for using mmu_spte_clear_no_track:
605 * Directly clear the spte without caring about the state bits of the sptep;
606 * it is used to set the upper-level spte.
607 */
608static void mmu_spte_clear_no_track(u64 *sptep)
609{
610	__update_clear_spte_fast(sptep, 0ull);
611}
612
613static u64 mmu_spte_get_lockless(u64 *sptep)
614{
615	return __get_spte_lockless(sptep);
616}
617
618/* Returns the Accessed status of the PTE and resets it at the same time. */
619static bool mmu_spte_age(u64 *sptep)
620{
621	u64 spte = mmu_spte_get_lockless(sptep);
622
623	if (!is_accessed_spte(spte))
624		return false;
625
626	if (spte_ad_enabled(spte)) {
627		clear_bit((ffs(shadow_accessed_mask) - 1),
628			  (unsigned long *)sptep);
629	} else {
630		/*
631		 * Capture the dirty status of the page, so that it doesn't get
632		 * lost when the SPTE is marked for access tracking.
633		 */
634		if (is_writable_pte(spte))
635			kvm_set_pfn_dirty(spte_to_pfn(spte));
636
637		spte = mark_spte_for_access_track(spte);
638		mmu_spte_update_no_track(sptep, spte);
639	}
640
641	return true;
642}
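/*
 * Aside on the clear_bit() above: ffs() returns the 1-based index of the
 * lowest set bit, so for a hypothetical shadow_accessed_mask of BIT(5) the
 * call becomes clear_bit(5, (unsigned long *)sptep), i.e. an atomic clear of
 * just the Accessed bit without disturbing the rest of the SPTE.
 */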
643
644static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
645{
646	return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
647}
648
649static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
650{
651	if (is_tdp_mmu_active(vcpu)) {
652		kvm_tdp_mmu_walk_lockless_begin();
653	} else {
654		/*
655		 * Prevent page table teardown by making any freer wait during the
656		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
657		 */
658		local_irq_disable();
659
660		/*
661		 * Make sure a following spte read is not reordered ahead of the write
662		 * to vcpu->mode.
663		 */
664		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
665	}
666}
667
668static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
669{
670	if (is_tdp_mmu_active(vcpu)) {
671		kvm_tdp_mmu_walk_lockless_end();
672	} else {
673		/*
674		 * Make sure the write to vcpu->mode is not reordered in front of
675		 * reads of sptes.  If it is, kvm_mmu_commit_zap_page() can see us
676		 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
677		 */
678		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
679		local_irq_enable();
680	}
681}
682
683static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
684{
685	int r;
686
687	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
688	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
689				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
690	if (r)
691		return r;
692	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
693				       PT64_ROOT_MAX_LEVEL);
694	if (r)
695		return r;
696	if (maybe_indirect) {
697		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
698					       PT64_ROOT_MAX_LEVEL);
699		if (r)
700			return r;
701	}
702	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
703					  PT64_ROOT_MAX_LEVEL);
704}
705
706static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
707{
708	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
709	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
710	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
711	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
712}
713
714static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
715{
716	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
717}
718
719static bool sp_has_gptes(struct kvm_mmu_page *sp);
720
721static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
722{
723	if (sp->role.passthrough)
724		return sp->gfn;
725
726	if (!sp->role.direct)
727		return sp->shadowed_translation[index] >> PAGE_SHIFT;
728
729	return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
730}
731
732/*
733 * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
734 * that the SPTE itself may have more constrained access permissions than
735 * what the guest enforces. For example, a guest may create an executable
736 * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
737 */
738static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
739{
740	if (sp_has_gptes(sp))
741		return sp->shadowed_translation[index] & ACC_ALL;
742
743	/*
744	 * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
745	 * KVM is not shadowing any guest page tables, so the "guest access
746	 * permissions" are just ACC_ALL.
747	 *
748	 * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
749	 * is shadowing a guest huge page with small pages, the guest access
750	 * permissions being shadowed are the access permissions of the huge
751	 * page.
752	 *
753	 * In both cases, sp->role.access contains the correct access bits.
754	 */
755	return sp->role.access;
756}
757
758static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
759					 gfn_t gfn, unsigned int access)
760{
761	if (sp_has_gptes(sp)) {
762		sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
763		return;
764	}
765
766	WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
767	          "access mismatch under %s page %llx (expected %u, got %u)\n",
768	          sp->role.passthrough ? "passthrough" : "direct",
769	          sp->gfn, kvm_mmu_page_get_access(sp, index), access);
770
771	WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
772	          "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
773	          sp->role.passthrough ? "passthrough" : "direct",
774	          sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
775}
776
777static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
778				    unsigned int access)
779{
780	gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
781
782	kvm_mmu_page_set_translation(sp, index, gfn, access);
783}
784
785/*
786 * Return the pointer to the large page information for a given gfn,
787 * handling slots that are not large page aligned.
788 */
789static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
790		const struct kvm_memory_slot *slot, int level)
791{
792	unsigned long idx;
793
794	idx = gfn_to_index(gfn, slot->base_gfn, level);
795	return &slot->arch.lpage_info[level - 2][idx];
796}
797
798static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
799					    gfn_t gfn, int count)
800{
801	struct kvm_lpage_info *linfo;
802	int i;
803
804	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
805		linfo = lpage_info_slot(gfn, slot, i);
806		linfo->disallow_lpage += count;
807		WARN_ON_ONCE(linfo->disallow_lpage < 0);
808	}
809}
810
811void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
812{
813	update_gfn_disallow_lpage_count(slot, gfn, 1);
814}
815
816void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
817{
818	update_gfn_disallow_lpage_count(slot, gfn, -1);
819}
820
821static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
822{
823	struct kvm_memslots *slots;
824	struct kvm_memory_slot *slot;
825	gfn_t gfn;
826
827	kvm->arch.indirect_shadow_pages++;
828	gfn = sp->gfn;
829	slots = kvm_memslots_for_spte_role(kvm, sp->role);
830	slot = __gfn_to_memslot(slots, gfn);
831
832	/* Non-leaf shadow pages are kept read-only. */
833	if (sp->role.level > PG_LEVEL_4K)
834		return __kvm_write_track_add_gfn(kvm, slot, gfn);
835
836	kvm_mmu_gfn_disallow_lpage(slot, gfn);
837
838	if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
839		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
840}
841
842void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
843{
844	/*
845	 * If it's possible to replace the shadow page with an NX huge page,
846	 * i.e. if the shadow page is the only thing currently preventing KVM
847	 * from using a huge page, add the shadow page to the list of "to be
848	 * zapped for NX recovery" pages.  Note, the shadow page can already be
849	 * on the list if KVM is reusing an existing shadow page, i.e. if KVM
850	 * links a shadow page at multiple points.
851	 */
852	if (!list_empty(&sp->possible_nx_huge_page_link))
853		return;
854
855	++kvm->stat.nx_lpage_splits;
856	list_add_tail(&sp->possible_nx_huge_page_link,
857		      &kvm->arch.possible_nx_huge_pages);
858}
859
860static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
861				 bool nx_huge_page_possible)
862{
863	sp->nx_huge_page_disallowed = true;
864
865	if (nx_huge_page_possible)
866		track_possible_nx_huge_page(kvm, sp);
867}
868
869static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
870{
871	struct kvm_memslots *slots;
872	struct kvm_memory_slot *slot;
873	gfn_t gfn;
874
875	kvm->arch.indirect_shadow_pages--;
876	gfn = sp->gfn;
877	slots = kvm_memslots_for_spte_role(kvm, sp->role);
878	slot = __gfn_to_memslot(slots, gfn);
879	if (sp->role.level > PG_LEVEL_4K)
880		return __kvm_write_track_remove_gfn(kvm, slot, gfn);
881
882	kvm_mmu_gfn_allow_lpage(slot, gfn);
883}
884
885void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
886{
887	if (list_empty(&sp->possible_nx_huge_page_link))
888		return;
889
890	--kvm->stat.nx_lpage_splits;
891	list_del_init(&sp->possible_nx_huge_page_link);
892}
893
894static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
895{
896	sp->nx_huge_page_disallowed = false;
897
898	untrack_possible_nx_huge_page(kvm, sp);
899}
900
901static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
902							   gfn_t gfn,
903							   bool no_dirty_log)
904{
905	struct kvm_memory_slot *slot;
906
907	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
908	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
909		return NULL;
910	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
911		return NULL;
912
913	return slot;
914}
915
916/*
917 * About rmap_head encoding:
918 *
919 * If the bit zero of rmap_head->val is clear, then it points to the only spte
920 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
921 * pte_list_desc containing more mappings.
922 */
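/*
 * A minimal decoding sketch of the encoding described above (the real logic
 * lives in pte_list_add(), pte_list_remove() and rmap_get_first() below):
 *
 *	if (!rmap_head->val)
 *		return NULL;				// empty chain
 *	if (!(rmap_head->val & 1))
 *		return (u64 *)rmap_head->val;		// exactly one spte
 *	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);	// chain of descriptors
 */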
923
924/*
925 * Returns the number of pointers in the rmap chain, not counting the new one.
926 */
927static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
928			struct kvm_rmap_head *rmap_head)
929{
930	struct pte_list_desc *desc;
931	int count = 0;
932
933	if (!rmap_head->val) {
934		rmap_head->val = (unsigned long)spte;
935	} else if (!(rmap_head->val & 1)) {
936		desc = kvm_mmu_memory_cache_alloc(cache);
937		desc->sptes[0] = (u64 *)rmap_head->val;
938		desc->sptes[1] = spte;
939		desc->spte_count = 2;
940		desc->tail_count = 0;
941		rmap_head->val = (unsigned long)desc | 1;
942		++count;
943	} else {
944		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
945		count = desc->tail_count + desc->spte_count;
946
947		/*
948		 * If the previous head is full, allocate a new head descriptor
949		 * as tail descriptors are always kept full.
950		 */
951		if (desc->spte_count == PTE_LIST_EXT) {
952			desc = kvm_mmu_memory_cache_alloc(cache);
953			desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
954			desc->spte_count = 0;
955			desc->tail_count = count;
956			rmap_head->val = (unsigned long)desc | 1;
957		}
958		desc->sptes[desc->spte_count++] = spte;
959	}
960	return count;
961}
962
963static void pte_list_desc_remove_entry(struct kvm *kvm,
964				       struct kvm_rmap_head *rmap_head,
965				       struct pte_list_desc *desc, int i)
966{
967	struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
968	int j = head_desc->spte_count - 1;
969
970	/*
971	 * The head descriptor should never be empty.  A new head is added only
972	 * when adding an entry and the previous head is full, and heads are
973	 * removed (this flow) when they become empty.
974	 */
975	KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
976
977	/*
978	 * Replace the to-be-freed SPTE with the last valid entry from the head
979	 * descriptor to ensure that tail descriptors are full at all times.
980	 * Note, this also means that tail_count is stable for each descriptor.
981	 */
982	desc->sptes[i] = head_desc->sptes[j];
983	head_desc->sptes[j] = NULL;
984	head_desc->spte_count--;
985	if (head_desc->spte_count)
986		return;
987
988	/*
989	 * The head descriptor is empty.  If there are no tail descriptors,
990	 * nullify the rmap head to mark the list as empty, else point the rmap
991	 * head at the next descriptor, i.e. the new head.
992	 */
993	if (!head_desc->more)
994		rmap_head->val = 0;
995	else
996		rmap_head->val = (unsigned long)head_desc->more | 1;
997	mmu_free_pte_list_desc(head_desc);
998}
999
1000static void pte_list_remove(struct kvm *kvm, u64 *spte,
1001			    struct kvm_rmap_head *rmap_head)
1002{
1003	struct pte_list_desc *desc;
1004	int i;
1005
1006	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
1007		return;
1008
1009	if (!(rmap_head->val & 1)) {
1010		if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
1011			return;
1012
1013		rmap_head->val = 0;
1014	} else {
1015		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1016		while (desc) {
1017			for (i = 0; i < desc->spte_count; ++i) {
1018				if (desc->sptes[i] == spte) {
1019					pte_list_desc_remove_entry(kvm, rmap_head,
1020								   desc, i);
1021					return;
1022				}
1023			}
1024			desc = desc->more;
1025		}
1026
1027		KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
1028	}
1029}
1030
1031static void kvm_zap_one_rmap_spte(struct kvm *kvm,
1032				  struct kvm_rmap_head *rmap_head, u64 *sptep)
1033{
1034	mmu_spte_clear_track_bits(kvm, sptep);
1035	pte_list_remove(kvm, sptep, rmap_head);
1036}
1037
1038/* Return true if at least one SPTE was zapped, false otherwise */
1039static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
1040				   struct kvm_rmap_head *rmap_head)
1041{
1042	struct pte_list_desc *desc, *next;
1043	int i;
1044
1045	if (!rmap_head->val)
1046		return false;
1047
1048	if (!(rmap_head->val & 1)) {
1049		mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
1050		goto out;
1051	}
1052
1053	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1054
1055	for (; desc; desc = next) {
1056		for (i = 0; i < desc->spte_count; i++)
1057			mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
1058		next = desc->more;
1059		mmu_free_pte_list_desc(desc);
1060	}
1061out:
1062	/* rmap_head is meaningless now, remember to reset it */
1063	rmap_head->val = 0;
1064	return true;
1065}
1066
1067unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
1068{
1069	struct pte_list_desc *desc;
1070
1071	if (!rmap_head->val)
1072		return 0;
1073	else if (!(rmap_head->val & 1))
1074		return 1;
1075
1076	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1077	return desc->tail_count + desc->spte_count;
1078}
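/*
 * Worked example of the arithmetic above, with PTE_LIST_EXT == 14: a chain
 * tracking 30 SPTEs ends up as a head descriptor holding 2 SPTEs
 * (spte_count == 2) in front of two full tail descriptors of 14 SPTEs each
 * (tail_count == 28), so pte_list_count() returns 28 + 2 == 30.
 */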
1079
1080static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1081					 const struct kvm_memory_slot *slot)
1082{
1083	unsigned long idx;
1084
1085	idx = gfn_to_index(gfn, slot->base_gfn, level);
1086	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1087}
1088
1089static void rmap_remove(struct kvm *kvm, u64 *spte)
1090{
1091	struct kvm_memslots *slots;
1092	struct kvm_memory_slot *slot;
1093	struct kvm_mmu_page *sp;
1094	gfn_t gfn;
1095	struct kvm_rmap_head *rmap_head;
1096
1097	sp = sptep_to_sp(spte);
1098	gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
1099
1100	/*
1101	 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1102	 * so we have to determine which memslots to use based on context
1103	 * information in sp->role.
1104	 */
1105	slots = kvm_memslots_for_spte_role(kvm, sp->role);
1106
1107	slot = __gfn_to_memslot(slots, gfn);
1108	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1109
1110	pte_list_remove(kvm, spte, rmap_head);
1111}
1112
1113/*
1114 * Used by the following functions to iterate through the sptes linked by a
1115 * rmap.  All fields are private and not assumed to be used outside.
1116 */
1117struct rmap_iterator {
1118	/* private fields */
1119	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
1120	int pos;			/* index of the sptep */
1121};
1122
1123/*
1124 * Iteration must be started by this function.  This should also be used after
1125 * removing/dropping sptes from the rmap link because in such cases the
1126 * information in the iterator may not be valid.
1127 *
1128 * Returns sptep if found, NULL otherwise.
1129 */
1130static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1131			   struct rmap_iterator *iter)
1132{
1133	u64 *sptep;
1134
1135	if (!rmap_head->val)
1136		return NULL;
1137
1138	if (!(rmap_head->val & 1)) {
1139		iter->desc = NULL;
1140		sptep = (u64 *)rmap_head->val;
1141		goto out;
1142	}
1143
1144	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1145	iter->pos = 0;
1146	sptep = iter->desc->sptes[iter->pos];
1147out:
1148	BUG_ON(!is_shadow_present_pte(*sptep));
1149	return sptep;
1150}
1151
1152/*
1153 * Must be used with a valid iterator: e.g. after rmap_get_first().
1154 *
1155 * Returns sptep if found, NULL otherwise.
1156 */
1157static u64 *rmap_get_next(struct rmap_iterator *iter)
1158{
1159	u64 *sptep;
1160
1161	if (iter->desc) {
1162		if (iter->pos < PTE_LIST_EXT - 1) {
1163			++iter->pos;
1164			sptep = iter->desc->sptes[iter->pos];
1165			if (sptep)
1166				goto out;
1167		}
1168
1169		iter->desc = iter->desc->more;
1170
1171		if (iter->desc) {
1172			iter->pos = 0;
1173			/* desc->sptes[0] cannot be NULL */
1174			sptep = iter->desc->sptes[iter->pos];
1175			goto out;
1176		}
1177	}
1178
1179	return NULL;
1180out:
1181	BUG_ON(!is_shadow_present_pte(*sptep));
1182	return sptep;
1183}
1184
1185#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
1186	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
1187	     _spte_; _spte_ = rmap_get_next(_iter_))
1188
1189static void drop_spte(struct kvm *kvm, u64 *sptep)
1190{
1191	u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1192
1193	if (is_shadow_present_pte(old_spte))
1194		rmap_remove(kvm, sptep);
1195}
1196
1197static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
1198{
1199	struct kvm_mmu_page *sp;
1200
1201	sp = sptep_to_sp(sptep);
1202	WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
1203
1204	drop_spte(kvm, sptep);
1205
1206	if (flush)
1207		kvm_flush_remote_tlbs_sptep(kvm, sptep);
1208}
1209
1210/*
1211 * Write-protect the specified @sptep.  @pt_protect indicates whether the
1212 * spte write-protection is caused by protecting the shadow page table.
1213 *
1214 * Note: write protection differs between dirty logging and spte
1215 * protection:
1216 * - for dirty logging, the spte can be set to writable at any time if
1217 *   its dirty bitmap is properly set.
1218 * - for spte protection, the spte can be made writable only after
1219 *   unsync-ing the shadow page.
1220 *
1221 * Return true if the TLB needs to be flushed.
1222 */
1223static bool spte_write_protect(u64 *sptep, bool pt_protect)
1224{
1225	u64 spte = *sptep;
1226
1227	if (!is_writable_pte(spte) &&
1228	    !(pt_protect && is_mmu_writable_spte(spte)))
1229		return false;
1230
1231	if (pt_protect)
1232		spte &= ~shadow_mmu_writable_mask;
1233	spte = spte & ~PT_WRITABLE_MASK;
1234
1235	return mmu_spte_update(sptep, spte);
1236}
1237
1238static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1239			       bool pt_protect)
1240{
1241	u64 *sptep;
1242	struct rmap_iterator iter;
1243	bool flush = false;
1244
1245	for_each_rmap_spte(rmap_head, &iter, sptep)
1246		flush |= spte_write_protect(sptep, pt_protect);
1247
1248	return flush;
1249}
1250
1251static bool spte_clear_dirty(u64 *sptep)
1252{
1253	u64 spte = *sptep;
1254
1255	KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
1256	spte &= ~shadow_dirty_mask;
1257	return mmu_spte_update(sptep, spte);
1258}
1259
1260static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1261{
1262	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1263					       (unsigned long *)sptep);
1264	if (was_writable && !spte_ad_enabled(*sptep))
1265		kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1266
1267	return was_writable;
1268}
1269
1270/*
1271 * Gets the GFN ready for another round of dirty logging by clearing the
1272 *	- D bit on ad-enabled SPTEs, and
1273 *	- W bit on ad-disabled SPTEs.
1274 * Returns true iff any D or W bits were cleared.
1275 */
1276static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1277			       const struct kvm_memory_slot *slot)
1278{
1279	u64 *sptep;
1280	struct rmap_iterator iter;
1281	bool flush = false;
1282
1283	for_each_rmap_spte(rmap_head, &iter, sptep)
1284		if (spte_ad_need_write_protect(*sptep))
1285			flush |= spte_wrprot_for_clear_dirty(sptep);
1286		else
1287			flush |= spte_clear_dirty(sptep);
1288
1289	return flush;
1290}
1291
1292/**
1293 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1294 * @kvm: kvm instance
1295 * @slot: slot to protect
1296 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1297 * @mask: indicates which pages we should protect
1298 *
1299 * Used when we do not need to care about huge page mappings.
1300 */
1301static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1302				     struct kvm_memory_slot *slot,
1303				     gfn_t gfn_offset, unsigned long mask)
1304{
1305	struct kvm_rmap_head *rmap_head;
1306
1307	if (tdp_mmu_enabled)
1308		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1309				slot->base_gfn + gfn_offset, mask, true);
1310
1311	if (!kvm_memslots_have_rmaps(kvm))
1312		return;
1313
1314	while (mask) {
1315		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1316					PG_LEVEL_4K, slot);
1317		rmap_write_protect(rmap_head, false);
1318
1319		/* clear the first set bit */
1320		mask &= mask - 1;
1321	}
1322}
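/*
 * The loop above iterates over the set bits of @mask: __ffs() yields the
 * index of the lowest set bit and "mask &= mask - 1" clears it.  E.g. for a
 * hypothetical mask of 0b10011, the rmaps for gfn_offset + 0, + 1 and + 4 are
 * write-protected, one bit per iteration, until mask reaches zero.
 */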
1323
1324/**
1325 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1326 * protect the page if the D-bit isn't supported.
1327 * @kvm: kvm instance
1328 * @slot: slot to clear D-bit
1329 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1330 * @mask: indicates which pages we should clear D-bit
1331 *
1332 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1333 */
1334static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1335					 struct kvm_memory_slot *slot,
1336					 gfn_t gfn_offset, unsigned long mask)
1337{
1338	struct kvm_rmap_head *rmap_head;
1339
1340	if (tdp_mmu_enabled)
1341		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1342				slot->base_gfn + gfn_offset, mask, false);
1343
1344	if (!kvm_memslots_have_rmaps(kvm))
1345		return;
1346
1347	while (mask) {
1348		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1349					PG_LEVEL_4K, slot);
1350		__rmap_clear_dirty(kvm, rmap_head, slot);
1351
1352		/* clear the first set bit */
1353		mask &= mask - 1;
1354	}
1355}
1356
1357/**
1358 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1359 * PT level pages.
1360 *
1361 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1362 * enable dirty logging for them.
1363 *
1364 * We need to care about huge page mappings: e.g. during dirty logging we may
1365 * have such mappings.
1366 */
1367void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1368				struct kvm_memory_slot *slot,
1369				gfn_t gfn_offset, unsigned long mask)
1370{
1371	/*
1372	 * Huge pages are NOT write protected when we start dirty logging in
1373	 * initially-all-set mode; must write protect them here so that they
1374	 * are split to 4K on the first write.
1375	 *
1376	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1377	 * of memslot has no such restriction, so the range can cross two large
1378	 * pages.
1379	 */
1380	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1381		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1382		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1383
1384		if (READ_ONCE(eager_page_split))
1385			kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
1386
1387		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1388
1389		/* Cross two large pages? */
1390		if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1391		    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1392			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1393						       PG_LEVEL_2M);
1394	}
1395
1396	/* Now handle 4K PTEs.  */
1397	if (kvm_x86_ops.cpu_dirty_log_size)
1398		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1399	else
1400		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1401}
1402
1403int kvm_cpu_dirty_log_size(void)
1404{
1405	return kvm_x86_ops.cpu_dirty_log_size;
1406}
1407
1408bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1409				    struct kvm_memory_slot *slot, u64 gfn,
1410				    int min_level)
1411{
1412	struct kvm_rmap_head *rmap_head;
1413	int i;
1414	bool write_protected = false;
1415
1416	if (kvm_memslots_have_rmaps(kvm)) {
1417		for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1418			rmap_head = gfn_to_rmap(gfn, i, slot);
1419			write_protected |= rmap_write_protect(rmap_head, true);
1420		}
1421	}
1422
1423	if (tdp_mmu_enabled)
1424		write_protected |=
1425			kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1426
1427	return write_protected;
1428}
1429
1430static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1431{
1432	struct kvm_memory_slot *slot;
1433
1434	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1435	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1436}
1437
1438static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1439			   const struct kvm_memory_slot *slot)
1440{
1441	return kvm_zap_all_rmap_sptes(kvm, rmap_head);
1442}
1443
1444static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1445			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1446			 pte_t unused)
1447{
1448	return __kvm_zap_rmap(kvm, rmap_head, slot);
1449}
1450
1451static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1452			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
1453			     pte_t pte)
1454{
1455	u64 *sptep;
1456	struct rmap_iterator iter;
1457	bool need_flush = false;
1458	u64 new_spte;
1459	kvm_pfn_t new_pfn;
1460
1461	WARN_ON_ONCE(pte_huge(pte));
1462	new_pfn = pte_pfn(pte);
1463
1464restart:
1465	for_each_rmap_spte(rmap_head, &iter, sptep) {
1466		need_flush = true;
1467
1468		if (pte_write(pte)) {
1469			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
1470			goto restart;
1471		} else {
1472			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1473					*sptep, new_pfn);
1474
1475			mmu_spte_clear_track_bits(kvm, sptep);
1476			mmu_spte_set(sptep, new_spte);
1477		}
1478	}
1479
1480	if (need_flush && kvm_available_flush_remote_tlbs_range()) {
1481		kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
1482		return false;
1483	}
1484
1485	return need_flush;
1486}
1487
1488struct slot_rmap_walk_iterator {
1489	/* input fields. */
1490	const struct kvm_memory_slot *slot;
1491	gfn_t start_gfn;
1492	gfn_t end_gfn;
1493	int start_level;
1494	int end_level;
1495
1496	/* output fields. */
1497	gfn_t gfn;
1498	struct kvm_rmap_head *rmap;
1499	int level;
1500
1501	/* private field. */
1502	struct kvm_rmap_head *end_rmap;
1503};
1504
1505static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
1506				 int level)
1507{
1508	iterator->level = level;
1509	iterator->gfn = iterator->start_gfn;
1510	iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1511	iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1512}
1513
1514static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1515				const struct kvm_memory_slot *slot,
1516				int start_level, int end_level,
1517				gfn_t start_gfn, gfn_t end_gfn)
1518{
1519	iterator->slot = slot;
1520	iterator->start_level = start_level;
1521	iterator->end_level = end_level;
1522	iterator->start_gfn = start_gfn;
1523	iterator->end_gfn = end_gfn;
1524
1525	rmap_walk_init_level(iterator, iterator->start_level);
1526}
1527
1528static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1529{
1530	return !!iterator->rmap;
1531}
1532
1533static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1534{
1535	while (++iterator->rmap <= iterator->end_rmap) {
1536		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1537
1538		if (iterator->rmap->val)
1539			return;
1540	}
1541
1542	if (++iterator->level > iterator->end_level) {
1543		iterator->rmap = NULL;
1544		return;
1545	}
1546
1547	rmap_walk_init_level(iterator, iterator->level);
1548}
1549
1550#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
1551	   _start_gfn, _end_gfn, _iter_)				\
1552	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
1553				 _end_level_, _start_gfn, _end_gfn);	\
1554	     slot_rmap_walk_okay(_iter_);				\
1555	     slot_rmap_walk_next(_iter_))
1556
1557typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1558			       struct kvm_memory_slot *slot, gfn_t gfn,
1559			       int level, pte_t pte);
1560
1561static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1562						 struct kvm_gfn_range *range,
1563						 rmap_handler_t handler)
1564{
1565	struct slot_rmap_walk_iterator iterator;
1566	bool ret = false;
1567
1568	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1569				 range->start, range->end - 1, &iterator)
1570		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1571			       iterator.level, range->arg.pte);
1572
1573	return ret;
1574}
1575
1576bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1577{
1578	bool flush = false;
1579
1580	if (kvm_memslots_have_rmaps(kvm))
1581		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
1582
1583	if (tdp_mmu_enabled)
1584		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1585
1586	if (kvm_x86_ops.set_apic_access_page_addr &&
1587	    range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
1588		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
1589
1590	return flush;
1591}
1592
1593bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1594{
1595	bool flush = false;
1596
1597	if (kvm_memslots_have_rmaps(kvm))
1598		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
1599
1600	if (tdp_mmu_enabled)
1601		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1602
1603	return flush;
1604}
1605
1606static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1607			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1608			 pte_t unused)
1609{
1610	u64 *sptep;
1611	struct rmap_iterator iter;
1612	int young = 0;
1613
1614	for_each_rmap_spte(rmap_head, &iter, sptep)
1615		young |= mmu_spte_age(sptep);
1616
1617	return young;
1618}
1619
1620static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1621			      struct kvm_memory_slot *slot, gfn_t gfn,
1622			      int level, pte_t unused)
1623{
1624	u64 *sptep;
1625	struct rmap_iterator iter;
1626
1627	for_each_rmap_spte(rmap_head, &iter, sptep)
1628		if (is_accessed_spte(*sptep))
1629			return true;
1630	return false;
1631}
1632
1633#define RMAP_RECYCLE_THRESHOLD 1000
1634
1635static void __rmap_add(struct kvm *kvm,
1636		       struct kvm_mmu_memory_cache *cache,
1637		       const struct kvm_memory_slot *slot,
1638		       u64 *spte, gfn_t gfn, unsigned int access)
1639{
1640	struct kvm_mmu_page *sp;
1641	struct kvm_rmap_head *rmap_head;
1642	int rmap_count;
1643
1644	sp = sptep_to_sp(spte);
1645	kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
1646	kvm_update_page_stats(kvm, sp->role.level, 1);
1647
1648	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1649	rmap_count = pte_list_add(cache, spte, rmap_head);
1650
1651	if (rmap_count > kvm->stat.max_mmu_rmap_size)
1652		kvm->stat.max_mmu_rmap_size = rmap_count;
1653	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1654		kvm_zap_all_rmap_sptes(kvm, rmap_head);
1655		kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
1656	}
1657}
1658
1659static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
1660		     u64 *spte, gfn_t gfn, unsigned int access)
1661{
1662	struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
1663
1664	__rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
1665}
1666
1667bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1668{
1669	bool young = false;
1670
1671	if (kvm_memslots_have_rmaps(kvm))
1672		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
1673
1674	if (tdp_mmu_enabled)
1675		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1676
1677	return young;
1678}
1679
1680bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1681{
1682	bool young = false;
1683
1684	if (kvm_memslots_have_rmaps(kvm))
1685		young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
1686
1687	if (tdp_mmu_enabled)
1688		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1689
1690	return young;
1691}
1692
1693static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
1694{
1695#ifdef CONFIG_KVM_PROVE_MMU
1696	int i;
1697
1698	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
1699		if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
1700			pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
1701					   sp->spt[i], &sp->spt[i],
1702					   kvm_mmu_page_get_gfn(sp, i));
1703	}
1704#endif
1705}
1706
1707/*
1708 * This value is the sum of all of the kvm instances'
1709 * kvm->arch.n_used_mmu_pages values.  We need a global,
1710 * aggregate version in order to make the slab shrinker
1711 * faster.
1712 */
1713static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1714{
1715	kvm->arch.n_used_mmu_pages += nr;
1716	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1717}
1718
1719static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1720{
1721	kvm_mod_used_mmu_pages(kvm, +1);
1722	kvm_account_pgtable_pages((void *)sp->spt, +1);
1723}
1724
1725static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1726{
1727	kvm_mod_used_mmu_pages(kvm, -1);
1728	kvm_account_pgtable_pages((void *)sp->spt, -1);
1729}
1730
1731static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
1732{
1733	kvm_mmu_check_sptes_at_free(sp);
1734
1735	hlist_del(&sp->hash_link);
1736	list_del(&sp->link);
1737	free_page((unsigned long)sp->spt);
1738	if (!sp->role.direct)
1739		free_page((unsigned long)sp->shadowed_translation);
1740	kmem_cache_free(mmu_page_header_cache, sp);
1741}
1742
1743static unsigned kvm_page_table_hashfn(gfn_t gfn)
1744{
1745	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1746}
1747
1748static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
1749				    struct kvm_mmu_page *sp, u64 *parent_pte)
1750{
1751	if (!parent_pte)
1752		return;
1753
1754	pte_list_add(cache, parent_pte, &sp->parent_ptes);
1755}
1756
1757static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1758				       u64 *parent_pte)
1759{
1760	pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
1761}
1762
1763static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1764			    u64 *parent_pte)
1765{
1766	mmu_page_remove_parent_pte(kvm, sp, parent_pte);
1767	mmu_spte_clear_no_track(parent_pte);
1768}
1769
1770static void mark_unsync(u64 *spte);
1771static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1772{
1773	u64 *sptep;
1774	struct rmap_iterator iter;
1775
1776	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1777		mark_unsync(sptep);
1778	}
1779}
1780
1781static void mark_unsync(u64 *spte)
1782{
1783	struct kvm_mmu_page *sp;
1784
1785	sp = sptep_to_sp(spte);
1786	if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
1787		return;
1788	if (sp->unsync_children++)
1789		return;
1790	kvm_mmu_mark_parents_unsync(sp);
1791}
1792
1793#define KVM_PAGE_ARRAY_NR 16
1794
1795struct kvm_mmu_pages {
1796	struct mmu_page_and_offset {
1797		struct kvm_mmu_page *sp;
1798		unsigned int idx;
1799	} page[KVM_PAGE_ARRAY_NR];
1800	unsigned int nr;
1801};
1802
1803static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1804			 int idx)
1805{
1806	int i;
1807
1808	if (sp->unsync)
1809		for (i = 0; i < pvec->nr; i++)
1810			if (pvec->page[i].sp == sp)
1811				return 0;
1812
1813	pvec->page[pvec->nr].sp = sp;
1814	pvec->page[pvec->nr].idx = idx;
1815	pvec->nr++;
1816	return (pvec->nr == KVM_PAGE_ARRAY_NR);
1817}
1818
1819static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1820{
1821	--sp->unsync_children;
1822	WARN_ON_ONCE((int)sp->unsync_children < 0);
1823	__clear_bit(idx, sp->unsync_child_bitmap);
1824}
1825
1826static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1827			   struct kvm_mmu_pages *pvec)
1828{
1829	int i, ret, nr_unsync_leaf = 0;
1830
1831	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1832		struct kvm_mmu_page *child;
1833		u64 ent = sp->spt[i];
1834
1835		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1836			clear_unsync_child_bit(sp, i);
1837			continue;
1838		}
1839
1840		child = spte_to_child_sp(ent);
1841
1842		if (child->unsync_children) {
1843			if (mmu_pages_add(pvec, child, i))
1844				return -ENOSPC;
1845
1846			ret = __mmu_unsync_walk(child, pvec);
1847			if (!ret) {
1848				clear_unsync_child_bit(sp, i);
1849				continue;
1850			} else if (ret > 0) {
1851				nr_unsync_leaf += ret;
1852			} else
1853				return ret;
1854		} else if (child->unsync) {
1855			nr_unsync_leaf++;
1856			if (mmu_pages_add(pvec, child, i))
1857				return -ENOSPC;
1858		} else
1859			clear_unsync_child_bit(sp, i);
1860	}
1861
1862	return nr_unsync_leaf;
1863}
1864
1865#define INVALID_INDEX (-1)
1866
1867static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1868			   struct kvm_mmu_pages *pvec)
1869{
1870	pvec->nr = 0;
1871	if (!sp->unsync_children)
1872		return 0;
1873
1874	mmu_pages_add(pvec, sp, INVALID_INDEX);
1875	return __mmu_unsync_walk(sp, pvec);
1876}
1877
1878static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1879{
1880	WARN_ON_ONCE(!sp->unsync);
1881	trace_kvm_mmu_sync_page(sp);
1882	sp->unsync = 0;
1883	--kvm->stat.mmu_unsync;
1884}
1885
1886static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1887				     struct list_head *invalid_list);
1888static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1889				    struct list_head *invalid_list);
1890
1891static bool sp_has_gptes(struct kvm_mmu_page *sp)
1892{
1893	if (sp->role.direct)
1894		return false;
1895
1896	if (sp->role.passthrough)
1897		return false;
1898
1899	return true;
1900}
1901
1902#define for_each_valid_sp(_kvm, _sp, _list)				\
1903	hlist_for_each_entry(_sp, _list, hash_link)			\
1904		if (is_obsolete_sp((_kvm), (_sp))) {			\
1905		} else
1906
1907#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)		\
1908	for_each_valid_sp(_kvm, _sp,					\
1909	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
1910		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
1911
1912static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1913{
1914	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
1915
1916	/*
1917	 * Ignore various flags when verifying that it's safe to sync a shadow
1918	 * page using the current MMU context.
1919	 *
1920	 *  - level: not part of the overall MMU role and will never match as the MMU's
1921	 *           level tracks the root level
1922	 *  - access: updated based on the new guest PTE
1923	 *  - quadrant: not part of the overall MMU role (similar to level)
1924	 */
1925	const union kvm_mmu_page_role sync_role_ign = {
1926		.level = 0xf,
1927		.access = 0x7,
1928		.quadrant = 0x3,
1929		.passthrough = 0x1,
1930	};
1931
1932	/*
1933	 * Direct pages can never be unsync, and KVM should never attempt to
1934	 * sync a shadow page for a different MMU context, e.g. if the role
1935	 * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
1936	 * reserved bits checks will be wrong, etc...
1937	 */
1938	if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
1939			 (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
1940		return false;
1941
1942	return true;
1943}
1944
1945static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
1946{
1947	if (!sp->spt[i])
1948		return 0;
1949
1950	return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
1951}
1952
1953static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
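/*
 * Returns -1 if the page cannot be synced (the caller is expected to zap it),
 * otherwise returns non-zero if any SPTE was modified such that a remote TLB
 * flush is needed.
 */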
1954{
1955	int flush = 0;
1956	int i;
1957
1958	if (!kvm_sync_page_check(vcpu, sp))
1959		return -1;
1960
1961	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
1962		int ret = kvm_sync_spte(vcpu, sp, i);
1963
1964		if (ret < 0)
1965			return -1;
1966		flush |= ret;
1967	}
1968
1969	/*
1970	 * Note, any flush is purely for KVM's correctness, e.g. when dropping
1971	 * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
1972	 * unmap or dirty logging event doesn't fail to flush.  The guest is
1973	 * responsible for flushing the TLB to ensure any changes in protection
1974	 * bits are recognized, i.e. until the guest flushes or page faults on
1975	 * a relevant address, KVM is architecturally allowed to let vCPUs use
1976	 * cached translations with the old protection bits.
1977	 */
1978	return flush;
1979}
1980
1981static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1982			 struct list_head *invalid_list)
1983{
1984	int ret = __kvm_sync_page(vcpu, sp);
1985
1986	if (ret < 0)
1987		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1988	return ret;
1989}
1990
1991static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1992					struct list_head *invalid_list,
1993					bool remote_flush)
1994{
1995	if (!remote_flush && list_empty(invalid_list))
1996		return false;
1997
1998	if (!list_empty(invalid_list))
1999		kvm_mmu_commit_zap_page(kvm, invalid_list);
2000	else
2001		kvm_flush_remote_tlbs(kvm);
2002	return true;
2003}
2004
2005static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2006{
2007	if (sp->role.invalid)
2008		return true;
2009
2010	/* TDP MMU pages do not use the MMU generation. */
2011	return !is_tdp_mmu_page(sp) &&
2012	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2013}
2014
2015struct mmu_page_path {
2016	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2017	unsigned int idx[PT64_ROOT_MAX_LEVEL];
2018};
2019
2020#define for_each_sp(pvec, sp, parents, i)			\
2021		for (i = mmu_pages_first(&pvec, &parents);	\
2022			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
2023			i = mmu_pages_next(&pvec, &parents, i))
2024
2025static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2026			  struct mmu_page_path *parents,
2027			  int i)
2028{
2029	int n;
2030
2031	for (n = i+1; n < pvec->nr; n++) {
2032		struct kvm_mmu_page *sp = pvec->page[n].sp;
2033		unsigned idx = pvec->page[n].idx;
2034		int level = sp->role.level;
2035
2036		parents->idx[level-1] = idx;
2037		if (level == PG_LEVEL_4K)
2038			break;
2039
2040		parents->parent[level-2] = sp;
2041	}
2042
2043	return n;
2044}
2045
2046static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2047			   struct mmu_page_path *parents)
2048{
2049	struct kvm_mmu_page *sp;
2050	int level;
2051
2052	if (pvec->nr == 0)
2053		return 0;
2054
2055	WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
2056
2057	sp = pvec->page[0].sp;
2058	level = sp->role.level;
2059	WARN_ON_ONCE(level == PG_LEVEL_4K);
2060
2061	parents->parent[level-2] = sp;
2062
2063	/* Also set up a sentinel.  Further entries in pvec are all
2064	 * children of sp, so this element is never overwritten.
2065	 */
2066	parents->parent[level-1] = NULL;
2067	return mmu_pages_next(pvec, parents, 0);
2068}
2069
2070static void mmu_pages_clear_parents(struct mmu_page_path *parents)
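/*
 * Clear the just-processed entry's unsync-child bit in each recorded
 * ancestor, walking upward until an ancestor that still has other unsync
 * children (or the NULL sentinel set up by mmu_pages_first()) is reached.
 */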
2071{
2072	struct kvm_mmu_page *sp;
2073	unsigned int level = 0;
2074
2075	do {
2076		unsigned int idx = parents->idx[level];
2077		sp = parents->parent[level];
2078		if (!sp)
2079			return;
2080
2081		WARN_ON_ONCE(idx == INVALID_INDEX);
2082		clear_unsync_child_bit(sp, idx);
2083		level++;
2084	} while (!sp->unsync_children);
2085}
2086
2087static int mmu_sync_children(struct kvm_vcpu *vcpu,
2088			     struct kvm_mmu_page *parent, bool can_yield)
2089{
2090	int i;
2091	struct kvm_mmu_page *sp;
2092	struct mmu_page_path parents;
2093	struct kvm_mmu_pages pages;
2094	LIST_HEAD(invalid_list);
2095	bool flush = false;
2096
2097	while (mmu_unsync_walk(parent, &pages)) {
2098		bool protected = false;
2099
2100		for_each_sp(pages, sp, parents, i)
2101			protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
2102
2103		if (protected) {
2104			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
2105			flush = false;
2106		}
2107
2108		for_each_sp(pages, sp, parents, i) {
2109			kvm_unlink_unsync_page(vcpu->kvm, sp);
2110			flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
2111			mmu_pages_clear_parents(&parents);
2112		}
2113		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
2114			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2115			if (!can_yield) {
2116				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2117				return -EINTR;
2118			}
2119
2120			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
2121			flush = false;
2122		}
2123	}
2124
2125	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2126	return 0;
2127}
2128
2129static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2130{
2131	atomic_set(&sp->write_flooding_count, 0);
2132}
2133
2134static void clear_sp_write_flooding_count(u64 *spte)
2135{
2136	__clear_sp_write_flooding_count(sptep_to_sp(spte));
2137}
2138
2139/*
2140 * The vCPU is required when finding indirect shadow pages; the shadow
2141 * page may already exist and syncing it needs the vCPU pointer in
2142 * order to read guest page tables.  Direct shadow pages are never
2143 * unsync, thus @vcpu can be NULL if @role.direct is true.
2144 */
2145static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
2146						     struct kvm_vcpu *vcpu,
2147						     gfn_t gfn,
2148						     struct hlist_head *sp_list,
2149						     union kvm_mmu_page_role role)
2150{
2151	struct kvm_mmu_page *sp;
2152	int ret;
2153	int collisions = 0;
2154	LIST_HEAD(invalid_list);
2155
2156	for_each_valid_sp(kvm, sp, sp_list) {
2157		if (sp->gfn != gfn) {
2158			collisions++;
2159			continue;
2160		}
2161
2162		if (sp->role.word != role.word) {
2163			/*
2164			 * If the guest is creating an upper-level page, zap
2165			 * unsync pages for the same gfn.  While it's possible
2166			 * the guest is using recursive page tables, in all
2167			 * likelihood the guest has stopped using the unsync
2168			 * page and is installing a completely unrelated page.
2169			 * Unsync pages must not be left as is, because the new
2170			 * upper-level page will be write-protected.
2171			 */
2172			if (role.level > PG_LEVEL_4K && sp->unsync)
2173				kvm_mmu_prepare_zap_page(kvm, sp,
2174							 &invalid_list);
2175			continue;
2176		}
2177
2178		/* unsync and write-flooding only apply to indirect SPs. */
2179		if (sp->role.direct)
2180			goto out;
2181
2182		if (sp->unsync) {
2183			if (KVM_BUG_ON(!vcpu, kvm))
2184				break;
2185
2186			/*
2187			 * The page is good, but is stale.  kvm_sync_page does
2188			 * get the latest guest state, but (unlike mmu_unsync_children)
2189			 * it doesn't write-protect the page or mark it synchronized!
2190			 * This way the validity of the mapping is ensured, but the
2191			 * overhead of write protection is not incurred until the
2192			 * guest invalidates the TLB mapping.  This allows multiple
2193			 * SPs for a single gfn to be unsync.
2194			 *
2195			 * If the sync fails, the page is zapped.  If so, break
2196			 * in order to rebuild it.
2197			 */
2198			ret = kvm_sync_page(vcpu, sp, &invalid_list);
2199			if (ret < 0)
2200				break;
2201
2202			WARN_ON_ONCE(!list_empty(&invalid_list));
2203			if (ret > 0)
2204				kvm_flush_remote_tlbs(kvm);
2205		}
2206
2207		__clear_sp_write_flooding_count(sp);
2208
2209		goto out;
2210	}
2211
2212	sp = NULL;
2213	++kvm->stat.mmu_cache_miss;
2214
2215out:
2216	kvm_mmu_commit_zap_page(kvm, &invalid_list);
2217
2218	if (collisions > kvm->stat.max_mmu_page_hash_collisions)
2219		kvm->stat.max_mmu_page_hash_collisions = collisions;
2220	return sp;
2221}
2222
2223/* Caches used when allocating a new shadow page. */
2224struct shadow_page_caches {
2225	struct kvm_mmu_memory_cache *page_header_cache;
2226	struct kvm_mmu_memory_cache *shadow_page_cache;
2227	struct kvm_mmu_memory_cache *shadowed_info_cache;
2228};
2229
2230static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
2231						      struct shadow_page_caches *caches,
2232						      gfn_t gfn,
2233						      struct hlist_head *sp_list,
2234						      union kvm_mmu_page_role role)
2235{
2236	struct kvm_mmu_page *sp;
2237
2238	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
2239	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
2240	if (!role.direct)
2241		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
2242
2243	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2244
2245	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
2246
2247	/*
2248	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2249	 * depends on valid pages being added to the head of the list.  See
2250	 * comments in kvm_zap_obsolete_pages().
2251	 */
2252	sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
2253	list_add(&sp->link, &kvm->arch.active_mmu_pages);
2254	kvm_account_mmu_page(kvm, sp);
2255
2256	sp->gfn = gfn;
2257	sp->role = role;
2258	hlist_add_head(&sp->hash_link, sp_list);
2259	if (sp_has_gptes(sp))
2260		account_shadowed(kvm, sp);
2261
2262	return sp;
2263}
2264
2265/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
2266static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
2267						      struct kvm_vcpu *vcpu,
2268						      struct shadow_page_caches *caches,
2269						      gfn_t gfn,
2270						      union kvm_mmu_page_role role)
2271{
2272	struct hlist_head *sp_list;
2273	struct kvm_mmu_page *sp;
2274	bool created = false;
2275
2276	sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2277
2278	sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
2279	if (!sp) {
2280		created = true;
2281		sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
2282	}
2283
2284	trace_kvm_mmu_get_page(sp, created);
2285	return sp;
2286}
2287
2288static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
2289						    gfn_t gfn,
2290						    union kvm_mmu_page_role role)
2291{
2292	struct shadow_page_caches caches = {
2293		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
2294		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
2295		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
2296	};
2297
2298	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
2299}
2300
2301static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
2302						  unsigned int access)
2303{
2304	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
2305	union kvm_mmu_page_role role;
2306
2307	role = parent_sp->role;
2308	role.level--;
2309	role.access = access;
2310	role.direct = direct;
2311	role.passthrough = 0;
2312
2313	/*
2314	 * If the guest has 4-byte PTEs then that means it's using 32-bit,
2315	 * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
2316	 * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
2317	 * shadow each guest page table with multiple shadow page tables, which
2318	 * requires extra bookkeeping in the role.
2319	 *
2320	 * Specifically, to shadow the guest's page directory (which covers a
2321	 * 4GiB address space), KVM uses 4 PAE page directories, each mapping
2322	 * 1GiB of the address space. @role.quadrant encodes which quarter of
2323	 * the address space each maps.
2324	 *
2325	 * To shadow the guest's page tables (which each map a 4MiB region), KVM
2326	 * uses 2 PAE page tables, each mapping a 2MiB region. For these,
2327	 * @role.quadrant encodes which half of the region they map.
2328	 *
2329	 * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
2330	 * consumes bits 29:21.  To consume bits 31:30, KVM uses 4 shadow
2331	 * PDPTEs; those 4 PAE page directories are pre-allocated and their
2332	 * quadrant is assigned in mmu_alloc_root().   A 4-byte PTE consumes
2333	 * bits 21:12, while an 8-byte PTE consumes bits 20:12.  To consume
2334	 * bit 21 in the PTE (the child here), KVM propagates that bit to the
2335	 * quadrant, i.e. sets quadrant to '0' or '1'.  The parent 8-byte PDE
2336	 * covers bit 21 (see above), thus the quadrant is calculated from the
2337	 * _least_ significant bit of the PDE index.
2338	 */
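	/*
	 * For example (hypothetical values): a guest linear address of
	 * 0x00300000 has bit 21 set, so spte_index(sptep) is odd and the
	 * child's quadrant is 1, i.e. the child shadow page table covers the
	 * upper 2MiB half of the guest's 4MiB page table.
	 */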
2339	if (role.has_4_byte_gpte) {
2340		WARN_ON_ONCE(role.level != PG_LEVEL_4K);
2341		role.quadrant = spte_index(sptep) & 1;
2342	}
2343
2344	return role;
2345}
2346
2347static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
2348						 u64 *sptep, gfn_t gfn,
2349						 bool direct, unsigned int access)
2350{
2351	union kvm_mmu_page_role role;
2352
2353	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
2354		return ERR_PTR(-EEXIST);
2355
2356	role = kvm_mmu_child_role(sptep, direct, access);
2357	return kvm_mmu_get_shadow_page(vcpu, gfn, role);
2358}
2359
2360static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2361					struct kvm_vcpu *vcpu, hpa_t root,
2362					u64 addr)
2363{
2364	iterator->addr = addr;
2365	iterator->shadow_addr = root;
2366	iterator->level = vcpu->arch.mmu->root_role.level;
2367
2368	if (iterator->level >= PT64_ROOT_4LEVEL &&
2369	    vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
2370	    !vcpu->arch.mmu->root_role.direct)
2371		iterator->level = PT32E_ROOT_LEVEL;
2372
2373	if (iterator->level == PT32E_ROOT_LEVEL) {
2374		/*
2375		 * prev_root is currently only used for 64-bit hosts. So only
2376		 * the active root_hpa is valid here.
2377		 */
2378		BUG_ON(root != vcpu->arch.mmu->root.hpa);
2379
2380		iterator->shadow_addr
2381			= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2382		iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
2383		--iterator->level;
2384		if (!iterator->shadow_addr)
2385			iterator->level = 0;
2386	}
2387}
2388
2389static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2390			     struct kvm_vcpu *vcpu, u64 addr)
2391{
2392	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2393				    addr);
2394}
2395
2396static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2397{
2398	if (iterator->level < PG_LEVEL_4K)
2399		return false;
2400
2401	iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
2402	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2403	return true;
2404}
2405
2406static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2407			       u64 spte)
2408{
2409	if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2410		iterator->level = 0;
2411		return;
2412	}
2413
2414	iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
2415	--iterator->level;
2416}
2417
2418static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2419{
2420	__shadow_walk_next(iterator, *iterator->sptep);
2421}
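
/*
 * Illustrative note: the for_each_shadow_entry() macro used below, e.g. by
 * direct_map(), boils down to roughly the following (a sketch, not the
 * literal macro expansion):
 *
 *	for (shadow_walk_init(&it, vcpu, addr);
 *	     shadow_walk_okay(&it);
 *	     shadow_walk_next(&it))
 *		... examine *it.sptep at it.level ...
 */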
2422
2423static void __link_shadow_page(struct kvm *kvm,
2424			       struct kvm_mmu_memory_cache *cache, u64 *sptep,
2425			       struct kvm_mmu_page *sp, bool flush)
2426{
2427	u64 spte;
2428
2429	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2430
2431	/*
2432	 * If an SPTE is present already, it must be a leaf and therefore
2433	 * a large one.  Drop it, and flush the TLB if needed, before
2434	 * installing sp.
2435	 */
2436	if (is_shadow_present_pte(*sptep))
2437		drop_large_spte(kvm, sptep, flush);
2438
2439	spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2440
2441	mmu_spte_set(sptep, spte);
2442
2443	mmu_page_add_parent_pte(cache, sp, sptep);
2444
2445	/*
2446	 * The non-direct sub-pagetable must be updated before linking.  For
2447	 * L1 sp, the pagetable is updated via kvm_sync_page() in
2448	 * kvm_mmu_find_shadow_page() without write-protecting the gfn,
2449	 * so sp->unsync can be true or false.  For higher level non-direct
2450	 * sp, the pagetable is updated/synced via mmu_sync_children() in
2451	 * FNAME(fetch)(), so sp->unsync_children can only be false.
2452	 * WARN_ON_ONCE() if anything happens unexpectedly.
2453	 */
2454	if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
2455		mark_unsync(sptep);
2456}
2457
2458static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2459			     struct kvm_mmu_page *sp)
2460{
2461	__link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
2462}
2463
2464static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2465				   unsigned direct_access)
2466{
2467	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2468		struct kvm_mmu_page *child;
2469
2470		/*
2471		 * For a direct sp, if the guest pte's dirty bit
2472		 * changed from clean to dirty, it will corrupt the
2473		 * sp's access, i.e. allow writes via a read-only sp,
2474		 * so the spte must be updated at this point to get
2475		 * a new sp with the correct access.
2476		 */
2477		child = spte_to_child_sp(*sptep);
2478		if (child->role.access == direct_access)
2479			return;
2480
2481		drop_parent_pte(vcpu->kvm, child, sptep);
2482		kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
2483	}
2484}
2485
2486/* Returns the number of zapped non-leaf child shadow pages. */
2487static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2488			    u64 *spte, struct list_head *invalid_list)
2489{
2490	u64 pte;
2491	struct kvm_mmu_page *child;
2492
2493	pte = *spte;
2494	if (is_shadow_present_pte(pte)) {
2495		if (is_last_spte(pte, sp->role.level)) {
2496			drop_spte(kvm, spte);
2497		} else {
2498			child = spte_to_child_sp(pte);
2499			drop_parent_pte(kvm, child, spte);
2500
2501			/*
2502			 * Recursively zap nested TDP SPs; parentless SPs are
2503			 * unlikely to be used again in the near future.  This
2504			 * avoids retaining a large number of stale nested SPs.
2505			 */
2506			if (tdp_enabled && invalid_list &&
2507			    child->role.guest_mode && !child->parent_ptes.val)
2508				return kvm_mmu_prepare_zap_page(kvm, child,
2509								invalid_list);
2510		}
2511	} else if (is_mmio_spte(pte)) {
2512		mmu_spte_clear_no_track(spte);
2513	}
2514	return 0;
2515}
2516
2517static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2518					struct kvm_mmu_page *sp,
2519					struct list_head *invalid_list)
2520{
2521	int zapped = 0;
2522	unsigned i;
2523
2524	for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
2525		zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2526
2527	return zapped;
2528}
2529
2530static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2531{
2532	u64 *sptep;
2533	struct rmap_iterator iter;
2534
2535	while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2536		drop_parent_pte(kvm, sp, sptep);
2537}
2538
2539static int mmu_zap_unsync_children(struct kvm *kvm,
2540				   struct kvm_mmu_page *parent,
2541				   struct list_head *invalid_list)
2542{
2543	int i, zapped = 0;
2544	struct mmu_page_path parents;
2545	struct kvm_mmu_pages pages;
2546
2547	if (parent->role.level == PG_LEVEL_4K)
2548		return 0;
2549
2550	while (mmu_unsync_walk(parent, &pages)) {
2551		struct kvm_mmu_page *sp;
2552
2553		for_each_sp(pages, sp, parents, i) {
2554			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2555			mmu_pages_clear_parents(&parents);
2556			zapped++;
2557		}
2558	}
2559
2560	return zapped;
2561}
2562
2563static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2564				       struct kvm_mmu_page *sp,
2565				       struct list_head *invalid_list,
2566				       int *nr_zapped)
2567{
2568	bool list_unstable, zapped_root = false;
2569
2570	lockdep_assert_held_write(&kvm->mmu_lock);
2571	trace_kvm_mmu_prepare_zap_page(sp);
2572	++kvm->stat.mmu_shadow_zapped;
2573	*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2574	*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2575	kvm_mmu_unlink_parents(kvm, sp);
2576
2577	/* Zapping children means active_mmu_pages has become unstable. */
2578	list_unstable = *nr_zapped;
2579
2580	if (!sp->role.invalid && sp_has_gptes(sp))
2581		unaccount_shadowed(kvm, sp);
2582
2583	if (sp->unsync)
2584		kvm_unlink_unsync_page(kvm, sp);
2585	if (!sp->root_count) {
2586		/* Count self */
2587		(*nr_zapped)++;
2588
2589		/*
2590		 * Already invalid pages (previously active roots) are not on
2591		 * the active page list.  See list_del() in the "else" case of
2592		 * !sp->root_count.
2593		 */
2594		if (sp->role.invalid)
2595			list_add(&sp->link, invalid_list);
2596		else
2597			list_move(&sp->link, invalid_list);
2598		kvm_unaccount_mmu_page(kvm, sp);
2599	} else {
2600		/*
2601		 * Remove the active root from the active page list, the root
2602		 * will be explicitly freed when the root_count hits zero.
2603		 */
2604		list_del(&sp->link);
2605
2606		/*
2607		 * Obsolete pages cannot be used on any vCPUs, see the comment
2608		 * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2609		 * treats invalid shadow pages as being obsolete.
2610		 */
2611		zapped_root = !is_obsolete_sp(kvm, sp);
2612	}
2613
2614	if (sp->nx_huge_page_disallowed)
2615		unaccount_nx_huge_page(kvm, sp);
2616
2617	sp->role.invalid = 1;
2618
2619	/*
2620	 * Make the request to free obsolete roots after marking the root
2621	 * invalid, otherwise other vCPUs may not see it as invalid.
2622	 */
2623	if (zapped_root)
2624		kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
2625	return list_unstable;
2626}
2627
2628static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2629				     struct list_head *invalid_list)
2630{
2631	int nr_zapped;
2632
2633	__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2634	return nr_zapped;
2635}
2636
2637static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2638				    struct list_head *invalid_list)
2639{
2640	struct kvm_mmu_page *sp, *nsp;
2641
2642	if (list_empty(invalid_list))
2643		return;
2644
2645	/*
2646	 * We need to make sure everyone sees our modifications to
2647	 * the page tables and sees changes to vcpu->mode here.  The barrier
2648	 * in kvm_flush_remote_tlbs() achieves this.  This pairs
2649	 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2650	 *
2651	 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2652	 * guest mode and/or lockless shadow page table walks.
2653	 */
2654	kvm_flush_remote_tlbs(kvm);
2655
2656	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2657		WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
2658		kvm_mmu_free_shadow_page(sp);
2659	}
2660}
2661
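/*
 * Zap up to @nr_to_zap pages, starting from the tail (oldest) of
 * active_mmu_pages and restarting the walk whenever zapping a page's unsync
 * children makes the list unstable.
 */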
2662static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2663						  unsigned long nr_to_zap)
2664{
2665	unsigned long total_zapped = 0;
2666	struct kvm_mmu_page *sp, *tmp;
2667	LIST_HEAD(invalid_list);
2668	bool unstable;
2669	int nr_zapped;
2670
2671	if (list_empty(&kvm->arch.active_mmu_pages))
2672		return 0;
2673
2674restart:
2675	list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2676		/*
2677		 * Don't zap active root pages, the page itself can't be freed
2678		 * and zapping it will just force vCPUs to realloc and reload.
2679		 */
2680		if (sp->root_count)
2681			continue;
2682
2683		unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2684						      &nr_zapped);
2685		total_zapped += nr_zapped;
2686		if (total_zapped >= nr_to_zap)
2687			break;
2688
2689		if (unstable)
2690			goto restart;
2691	}
2692
2693	kvm_mmu_commit_zap_page(kvm, &invalid_list);
2694
2695	kvm->stat.mmu_recycled += total_zapped;
2696	return total_zapped;
2697}
2698
2699static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2700{
2701	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2702		return kvm->arch.n_max_mmu_pages -
2703			kvm->arch.n_used_mmu_pages;
2704
2705	return 0;
2706}
2707
2708static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2709{
2710	unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2711
2712	if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2713		return 0;
2714
2715	kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2716
2717	/*
2718	 * Note, this check is intentionally soft, it only guarantees that one
2719	 * page is available, while the caller may end up allocating as many as
2720	 * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
2721	 * exceeding the (arbitrary by default) limit will not harm the host,
2722	 * being too aggressive may unnecessarily kill the guest, and getting an
2723	 * exact count is far more trouble than it's worth, especially in the
2724	 * page fault paths.
2725	 */
2726	if (!kvm_mmu_available_pages(vcpu->kvm))
2727		return -ENOSPC;
2728	return 0;
2729}
2730
2731/*
2732 * Change the number of MMU pages allocated to the VM.
2733 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2734 */
2735void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2736{
2737	write_lock(&kvm->mmu_lock);
2738
2739	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2740		kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2741						  goal_nr_mmu_pages);
2742
2743		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2744	}
2745
2746	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2747
2748	write_unlock(&kvm->mmu_lock);
2749}
2750
2751int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2752{
2753	struct kvm_mmu_page *sp;
2754	LIST_HEAD(invalid_list);
2755	int r;
2756
2757	r = 0;
2758	write_lock(&kvm->mmu_lock);
2759	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2760		r = 1;
2761		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2762	}
2763	kvm_mmu_commit_zap_page(kvm, &invalid_list);
2764	write_unlock(&kvm->mmu_lock);
2765
2766	return r;
2767}
2768
2769static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2770{
2771	gpa_t gpa;
2772	int r;
2773
2774	if (vcpu->arch.mmu->root_role.direct)
2775		return 0;
2776
2777	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2778
2779	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2780
2781	return r;
2782}
2783
2784static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2785{
2786	trace_kvm_mmu_unsync_page(sp);
2787	++kvm->stat.mmu_unsync;
2788	sp->unsync = 1;
2789
2790	kvm_mmu_mark_parents_unsync(sp);
2791}
2792
2793/*
2794 * Attempt to unsync any shadow pages that can be reached by the specified gfn,
2795 * KVM is creating a writable mapping for said gfn.  Returns 0 if all pages
2796 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2797 * be write-protected.
2798 */
2799int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2800			    gfn_t gfn, bool can_unsync, bool prefetch)
2801{
2802	struct kvm_mmu_page *sp;
2803	bool locked = false;
2804
2805	/*
2806	 * Force write-protection if the page is being tracked.  Note, the page
2807	 * track machinery is used to write-protect upper-level shadow pages,
2808	 * i.e. this guards the role.level == 4K assertion below!
2809	 */
2810	if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
2811		return -EPERM;
2812
2813	/*
2814	 * The page is not write-tracked, mark existing shadow pages unsync
2815	 * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
2816	 * that case, KVM must complete emulation of the guest TLB flush before
2817	 * allowing shadow pages to become unsync (writable by the guest).
2818	 */
2819	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2820		if (!can_unsync)
2821			return -EPERM;
2822
2823		if (sp->unsync)
2824			continue;
2825
2826		if (prefetch)
2827			return -EEXIST;
2828
2829		/*
2830		 * TDP MMU page faults require an additional spinlock as they
2831		 * run with mmu_lock held for read, not write, and the unsync
2832		 * logic is not thread safe.  Take the spinlock regardless of
2833		 * the MMU type to avoid extra conditionals/parameters, there's
2834		 * no meaningful penalty if mmu_lock is held for write.
2835		 */
2836		if (!locked) {
2837			locked = true;
2838			spin_lock(&kvm->arch.mmu_unsync_pages_lock);
2839
2840			/*
2841			 * Recheck after taking the spinlock, a different vCPU
2842			 * may have since marked the page unsync.  A false
2843			 * positive on the unprotected check above is not
2844			 * possible as clearing sp->unsync _must_ hold mmu_lock
2845			 * for write, i.e. unsync cannot transition from 0->1
2846			 * while this CPU holds mmu_lock for read (or write).
2847			 */
2848			if (READ_ONCE(sp->unsync))
2849				continue;
2850		}
2851
2852		WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
2853		kvm_unsync_page(kvm, sp);
2854	}
2855	if (locked)
2856		spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
2857
2858	/*
2859	 * We need to ensure that the marking of unsync pages is visible
2860	 * before the SPTE is updated to allow writes because
2861	 * kvm_mmu_sync_roots() checks the unsync flags without holding
2862	 * the MMU lock and so can race with this. If the SPTE was updated
2863	 * before the page had been marked as unsync-ed, something like the
2864	 * following could happen:
2865	 *
2866	 * CPU 1                    CPU 2
2867	 * ---------------------------------------------------------------------
2868	 * 1.2 Host updates SPTE
2869	 *     to be writable
2870	 *                      2.1 Guest writes a GPTE for GVA X.
2871	 *                          (GPTE being in the guest page table shadowed
2872	 *                           by the SP from CPU 1.)
2873	 *                          This reads SPTE during the page table walk.
2874	 *                          Since SPTE.W is read as 1, there is no
2875	 *                          fault.
2876	 *
2877	 *                      2.2 Guest issues TLB flush.
2878	 *                          That causes a VM Exit.
2879	 *
2880	 *                      2.3 Walking of unsync pages sees sp->unsync is
2881	 *                          false and skips the page.
2882	 *
2883	 *                      2.4 Guest accesses GVA X.
2884	 *                          Since the mapping in the SP was not updated,
2885	 *                          the old mapping for GVA X incorrectly
2886	 *                          gets used.
2887	 * 1.1 Host marks SP
2888	 *     as unsync
2889	 *     (sp->unsync = true)
2890	 *
2891	 * The write barrier below ensures that 1.1 happens before 1.2 and thus
2892	 * the situation in 2.4 does not arise.  It pairs with the read barrier
2893	 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2894	 */
2895	smp_wmb();
2896
2897	return 0;
2898}
2899
2900static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2901			u64 *sptep, unsigned int pte_access, gfn_t gfn,
2902			kvm_pfn_t pfn, struct kvm_page_fault *fault)
2903{
2904	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2905	int level = sp->role.level;
2906	int was_rmapped = 0;
2907	int ret = RET_PF_FIXED;
2908	bool flush = false;
2909	bool wrprot;
2910	u64 spte;
2911
2912	/* Prefetching always gets a writable pfn.  */
2913	bool host_writable = !fault || fault->map_writable;
2914	bool prefetch = !fault || fault->prefetch;
2915	bool write_fault = fault && fault->write;
2916
2917	if (unlikely(is_noslot_pfn(pfn))) {
2918		vcpu->stat.pf_mmio_spte_created++;
2919		mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2920		return RET_PF_EMULATE;
2921	}
2922
2923	if (is_shadow_present_pte(*sptep)) {
2924		/*
2925		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2926		 * the parent of the now unreachable PTE.
2927		 */
2928		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
2929			struct kvm_mmu_page *child;
2930			u64 pte = *sptep;
2931
2932			child = spte_to_child_sp(pte);
2933			drop_parent_pte(vcpu->kvm, child, sptep);
2934			flush = true;
2935		} else if (pfn != spte_to_pfn(*sptep)) {
2936			drop_spte(vcpu->kvm, sptep);
2937			flush = true;
2938		} else
2939			was_rmapped = 1;
2940	}
2941
2942	wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
2943			   true, host_writable, &spte);
2944
2945	if (*sptep == spte) {
2946		ret = RET_PF_SPURIOUS;
2947	} else {
2948		flush |= mmu_spte_update(sptep, spte);
2949		trace_kvm_mmu_set_spte(level, gfn, sptep);
2950	}
2951
2952	if (wrprot) {
2953		if (write_fault)
2954			ret = RET_PF_EMULATE;
2955	}
2956
2957	if (flush)
2958		kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
2959
2960	if (!was_rmapped) {
2961		WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2962		rmap_add(vcpu, slot, sptep, gfn, pte_access);
2963	} else {
2964		/* Already rmapped but the pte_access bits may have changed. */
2965		kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
2966	}
2967
2968	return ret;
2969}
2970
2971static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2972				    struct kvm_mmu_page *sp,
2973				    u64 *start, u64 *end)
2974{
2975	struct page *pages[PTE_PREFETCH_NUM];
2976	struct kvm_memory_slot *slot;
2977	unsigned int access = sp->role.access;
2978	int i, ret;
2979	gfn_t gfn;
2980
2981	gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
2982	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2983	if (!slot)
2984		return -1;
2985
2986	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
2987	if (ret <= 0)
2988		return -1;
2989
2990	for (i = 0; i < ret; i++, gfn++, start++) {
2991		mmu_set_spte(vcpu, slot, start, access, gfn,
2992			     page_to_pfn(pages[i]), NULL);
2993		put_page(pages[i]);
2994	}
2995
2996	return 0;
2997}
2998
2999static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3000				  struct kvm_mmu_page *sp, u64 *sptep)
3001{
3002	u64 *spte, *start = NULL;
3003	int i;
3004
3005	WARN_ON_ONCE(!sp->role.direct);
3006
3007	i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
3008	spte = sp->spt + i;
3009
3010	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3011		if (is_shadow_present_pte(*spte) || spte == sptep) {
3012			if (!start)
3013				continue;
3014			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3015				return;
3016			start = NULL;
3017		} else if (!start)
3018			start = spte;
3019	}
3020	if (start)
3021		direct_pte_prefetch_many(vcpu, sp, start, spte);
3022}
3023
3024static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3025{
3026	struct kvm_mmu_page *sp;
3027
3028	sp = sptep_to_sp(sptep);
3029
3030	/*
3031	 * Without accessed bits, there's no way to distinguish between
3032	 * actually accessed translations and prefetched, so disable pte
3033	 * prefetch if accessed bits aren't available.
3034	 */
3035	if (sp_ad_disabled(sp))
3036		return;
3037
3038	if (sp->role.level > PG_LEVEL_4K)
3039		return;
3040
3041	/*
3042	 * If addresses are being invalidated, skip prefetching to avoid
3043	 * accidentally prefetching those addresses.
3044	 */
3045	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
3046		return;
3047
3048	__direct_pte_prefetch(vcpu, sp, sptep);
3049}
3050
3051/*
3052 * Lookup the mapping level for @gfn in the current mm.
3053 *
3054 * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
3055 * consumer to be tied into KVM's handlers for MMU notifier events!
3056 *
3057 * There are several ways to safely use this helper:
3058 *
3059 * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
3060 *   consuming it.  In this case, mmu_lock doesn't need to be held during the
3061 *   lookup, but it does need to be held while checking the MMU notifier.
3062 *
3063 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
3064 *   event for the hva.  This can be done by explicitly checking the MMU notifier
3065 *   or by ensuring that KVM already has a valid mapping that covers the hva.
3066 *
3067 * - Do not use the result to install new mappings, e.g. use the host mapping
3068 *   level only to decide whether or not to zap an entry.  In this case, it's
3069 *   not required to hold mmu_lock (though it's highly likely the caller will
3070 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
3071 *
3072 * Note!  The lookup can still race with modifications to host page tables, but
3073 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
3074 * race with the primary MMU occurs.
3075 */
3076static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
3077				  const struct kvm_memory_slot *slot)
3078{
3079	int level = PG_LEVEL_4K;
3080	unsigned long hva;
3081	unsigned long flags;
3082	pgd_t pgd;
3083	p4d_t p4d;
3084	pud_t pud;
3085	pmd_t pmd;
3086
3087	/*
3088	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
3089	 * is not solely for performance, it's also necessary to avoid the
3090	 * "writable" check in __gfn_to_hva_many(), which will always fail on
3091	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
3092	 * page fault steps have already verified the guest isn't writing a
3093	 * read-only memslot.
3094	 */
3095	hva = __gfn_to_hva_memslot(slot, gfn);
3096
3097	/*
3098	 * Disable IRQs to prevent concurrent tear down of host page tables,
3099	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
3100	 * the original page table.
3101	 */
3102	local_irq_save(flags);
3103
3104	/*
3105	 * Read each entry once.  As above, a non-leaf entry can be promoted to
3106	 * a huge page _during_ this walk.  Re-reading the entry could send the
3107	 * walk into the weeks, e.g. p*d_large() returns false (sees the old
3108	 * value) and then p*d_offset() walks into the target huge page instead
3109	 * of the old page table (sees the new value).
3110	 */
3111	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
3112	if (pgd_none(pgd))
3113		goto out;
3114
3115	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
3116	if (p4d_none(p4d) || !p4d_present(p4d))
3117		goto out;
3118
3119	pud = READ_ONCE(*pud_offset(&p4d, hva));
3120	if (pud_none(pud) || !pud_present(pud))
3121		goto out;
3122
3123	if (pud_large(pud)) {
3124		level = PG_LEVEL_1G;
3125		goto out;
3126	}
3127
3128	pmd = READ_ONCE(*pmd_offset(&pud, hva));
3129	if (pmd_none(pmd) || !pmd_present(pmd))
3130		goto out;
3131
3132	if (pmd_large(pmd))
3133		level = PG_LEVEL_2M;
3134
3135out:
3136	local_irq_restore(flags);
3137	return level;
3138}
3139
3140int kvm_mmu_max_mapping_level(struct kvm *kvm,
3141			      const struct kvm_memory_slot *slot, gfn_t gfn,
3142			      int max_level)
3143{
3144	struct kvm_lpage_info *linfo;
3145	int host_level;
3146
3147	max_level = min(max_level, max_huge_page_level);
3148	for ( ; max_level > PG_LEVEL_4K; max_level--) {
3149		linfo = lpage_info_slot(gfn, slot, max_level);
3150		if (!linfo->disallow_lpage)
3151			break;
3152	}
3153
3154	if (max_level == PG_LEVEL_4K)
3155		return PG_LEVEL_4K;
3156
3157	host_level = host_pfn_mapping_level(kvm, gfn, slot);
3158	return min(host_level, max_level);
3159}
3160
3161void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3162{
3163	struct kvm_memory_slot *slot = fault->slot;
3164	kvm_pfn_t mask;
3165
3166	fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
3167
3168	if (unlikely(fault->max_level == PG_LEVEL_4K))
3169		return;
3170
3171	if (is_error_noslot_pfn(fault->pfn))
3172		return;
3173
3174	if (kvm_slot_dirty_track_enabled(slot))
3175		return;
3176
3177	/*
3178	 * Enforce the iTLB multihit workaround after capturing the requested
3179	 * level, which will be used to do precise, accurate accounting.
3180	 */
3181	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
3182						     fault->gfn, fault->max_level);
3183	if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
3184		return;
3185
3186	/*
3187	 * mmu_invalidate_retry() was successful and mmu_lock is held, so
3188	 * the pmd can't be split from under us.
3189	 */
3190	fault->goal_level = fault->req_level;
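	/*
	 * For example (hypothetical values): for a 2MiB goal level, mask is
	 * 511, so a fault with gfn 0x12345 and pfn 0xabd45 (congruent modulo
	 * 512, as the VM_BUG_ON below checks) ends up with pfn 0xabc00, i.e.
	 * the huge SPTE will map a naturally aligned 512-page region.
	 */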
3191	mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
3192	VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
3193	fault->pfn &= ~mask;
3194}
3195
3196void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
3197{
3198	if (cur_level > PG_LEVEL_4K &&
3199	    cur_level == fault->goal_level &&
3200	    is_shadow_present_pte(spte) &&
3201	    !is_large_pte(spte) &&
3202	    spte_to_child_sp(spte)->nx_huge_page_disallowed) {
3203		/*
3204		 * A small SPTE exists for this pfn, but FNAME(fetch),
3205		 * direct_map(), or kvm_tdp_mmu_map() would like to create a
3206		 * large PTE instead: just force them to go down another level,
3207		 * patching back for them into pfn the next 9 bits of the
3208		 * address.
3209		 */
3210		u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
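		/*
		 * For example (hypothetical values): with cur_level at 2M,
		 * page_mask is 511, so the pfn regains bits 8:0 of the gfn
		 * that kvm_mmu_hugepage_adjust() masked off, and goal_level
		 * drops from 2M to 4K.
		 */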
3211				KVM_PAGES_PER_HPAGE(cur_level - 1);
3212		fault->pfn |= fault->gfn & page_mask;
3213		fault->goal_level--;
3214	}
3215}
3216
3217static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3218{
3219	struct kvm_shadow_walk_iterator it;
3220	struct kvm_mmu_page *sp;
3221	int ret;
3222	gfn_t base_gfn = fault->gfn;
3223
3224	kvm_mmu_hugepage_adjust(vcpu, fault);
3225
3226	trace_kvm_mmu_spte_requested(fault);
3227	for_each_shadow_entry(vcpu, fault->addr, it) {
3228		/*
3229		 * We cannot overwrite existing page tables with an NX
3230		 * large page, as the leaf could be executable.
3231		 */
3232		if (fault->nx_huge_page_workaround_enabled)
3233			disallowed_hugepage_adjust(fault, *it.sptep, it.level);
3234
3235		base_gfn = gfn_round_for_level(fault->gfn, it.level);
3236		if (it.level == fault->goal_level)
3237			break;
3238
3239		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
3240		if (sp == ERR_PTR(-EEXIST))
3241			continue;
3242
3243		link_shadow_page(vcpu, it.sptep, sp);
3244		if (fault->huge_page_disallowed)
3245			account_nx_huge_page(vcpu->kvm, sp,
3246					     fault->req_level >= it.level);
3247	}
3248
3249	if (WARN_ON_ONCE(it.level != fault->goal_level))
3250		return -EFAULT;
3251
3252	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
3253			   base_gfn, fault->pfn, fault);
3254	if (ret == RET_PF_SPURIOUS)
3255		return ret;
3256
3257	direct_pte_prefetch(vcpu, it.sptep);
3258	return ret;
3259}
3260
3261static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
3262{
3263	unsigned long hva = gfn_to_hva_memslot(slot, gfn);
3264
3265	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
3266}
3267
3268static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3269{
3270	if (is_sigpending_pfn(fault->pfn)) {
3271		kvm_handle_signal_exit(vcpu);
3272		return -EINTR;
3273	}
3274
3275	/*
3276	 * Do not cache the mmio info caused by writing the readonly gfn
3277	 * into the spte, otherwise read accesses to the readonly gfn would
3278	 * also cause an mmio page fault and be treated as an mmio access.
3279	 */
3280	if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
3281		return RET_PF_EMULATE;
3282
3283	if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
3284		kvm_send_hwpoison_signal(fault->slot, fault->gfn);
3285		return RET_PF_RETRY;
3286	}
3287
3288	return -EFAULT;
3289}
3290
3291static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
3292				   struct kvm_page_fault *fault,
3293				   unsigned int access)
3294{
3295	gva_t gva = fault->is_tdp ? 0 : fault->addr;
3296
3297	vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3298			     access & shadow_mmio_access_mask);
3299
3300	/*
3301	 * If MMIO caching is disabled, emulate immediately without
3302	 * touching the shadow page tables as attempting to install an
3303	 * MMIO SPTE will just be an expensive nop.
3304	 */
3305	if (unlikely(!enable_mmio_caching))
3306		return RET_PF_EMULATE;
3307
3308	/*
3309	 * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
3310	 * any guest that generates such gfns is running nested and is being
3311	 * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
3312	 * only if L1's MAXPHYADDR is inaccurate with respect to the
3313	 * hardware's).
3314	 */
3315	if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
3316		return RET_PF_EMULATE;
3317
3318	return RET_PF_CONTINUE;
3319}
3320
3321static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3322{
3323	/*
3324	 * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
3325	 * reach the common page fault handler if the SPTE has an invalid MMIO
3326	 * generation number.  Refreshing the MMIO generation needs to go down
3327	 * the slow path.  Note, EPT Misconfigs do NOT set the PRESENT flag!
3328	 */
3329	if (fault->rsvd)
3330		return false;
3331
3332	/*
3333	 * #PF can be fast if:
3334	 *
3335	 * 1. The shadow page table entry is not present and A/D bits are
3336	 *    disabled _by KVM_, which could mean that the fault is potentially
3337	 *    caused by access tracking (if enabled).  If A/D bits are enabled
3338	 *    by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
3339	 *    bits for L2 and employ access tracking, but the fast page fault
3340	 *    mechanism only supports direct MMUs.
3341	 * 2. The shadow page table entry is present, the access is a write,
3342	 *    and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
3343	 *    the fault was caused by a write-protection violation.  If the
3344	 *    SPTE is MMU-writable (determined later), the fault can be fixed
3345	 *    by setting the Writable bit, which can be done out of mmu_lock.
3346	 */
3347	if (!fault->present)
3348		return !kvm_ad_enabled();
3349
3350	/*
3351	 * Note, instruction fetches and writes are mutually exclusive, ignore
3352	 * the "exec" flag.
3353	 */
3354	return fault->write;
3355}
3356
3357/*
3358 * Returns true if the SPTE was fixed successfully. Otherwise,
3359 * someone else modified the SPTE from its original value.
3360 */
3361static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
3362				    struct kvm_page_fault *fault,
3363				    u64 *sptep, u64 old_spte, u64 new_spte)
3364{
3365	/*
3366	 * Theoretically we could also set dirty bit (and flush TLB) here in
3367	 * order to eliminate unnecessary PML logging. See comments in
3368	 * set_spte. But fast_page_fault is very unlikely to happen with PML
3369	 * enabled, so we do not do this.  This might result in the same GPA
3370	 * being logged in the PML buffer again when the write really happens,
3371	 * and eventually in mark_page_dirty being called twice.  But that is
3372	 * harmless.  This also avoids the TLB flush needed after setting the dirty bit
3373	 * so non-PML cases won't be impacted.
3374	 *
3375	 * Compare with set_spte where instead shadow_dirty_mask is set.
3376	 */
3377	if (!try_cmpxchg64(sptep, &old_spte, new_spte))
3378		return false;
3379
3380	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3381		mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3382
3383	return true;
3384}
3385
3386static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3387{
3388	if (fault->exec)
3389		return is_executable_pte(spte);
3390
3391	if (fault->write)
3392		return is_writable_pte(spte);
3393
3394	/* Fault was on Read access */
3395	return spte & PT_PRESENT_MASK;
3396}
3397
3398/*
3399 * Returns the last level spte pointer of the shadow page walk for the given
3400 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
3401 * walk could be performed, returns NULL and *spte does not contain valid data.
3402 *
3403 * Contract:
3404 *  - Must be called between walk_shadow_page_lockless_{begin,end}.
3405 *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
3406 */
3407static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3408{
3409	struct kvm_shadow_walk_iterator iterator;
3410	u64 old_spte;
3411	u64 *sptep = NULL;
3412
3413	for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3414		sptep = iterator.sptep;
3415		*spte = old_spte;
3416	}
3417
3418	return sptep;
3419}
3420
3421/*
3422 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3423 */
3424static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3425{
3426	struct kvm_mmu_page *sp;
3427	int ret = RET_PF_INVALID;
3428	u64 spte = 0ull;
3429	u64 *sptep = NULL;
3430	uint retry_count = 0;
3431
3432	if (!page_fault_can_be_fast(fault))
3433		return ret;
3434
3435	walk_shadow_page_lockless_begin(vcpu);
3436
3437	do {
3438		u64 new_spte;
3439
3440		if (tdp_mmu_enabled)
3441			sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3442		else
3443			sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3444
3445		if (!is_shadow_present_pte(spte))
3446			break;
3447
3448		sp = sptep_to_sp(sptep);
3449		if (!is_last_spte(spte, sp->role.level))
3450			break;
3451
3452		/*
3453		 * Check whether the memory access that caused the fault would
3454		 * still cause it if it were to be performed right now. If not,
3455		 * then this is a spurious fault caused by a lazily flushed TLB,
3456		 * or some other CPU has already fixed the PTE after the
3457		 * current CPU took the fault.
3458		 *
3459		 * Need not check the access of upper level table entries since
3460		 * they are always ACC_ALL.
3461		 */
3462		if (is_access_allowed(fault, spte)) {
3463			ret = RET_PF_SPURIOUS;
3464			break;
3465		}
3466
3467		new_spte = spte;
3468
3469		/*
3470		 * KVM only supports fixing page faults outside of MMU lock for
3471		 * direct MMUs, nested MMUs are always indirect, and KVM always
3472		 * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
3473		 * enabled, the SPTE can't be an access-tracked SPTE.
3474		 */
3475		if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
3476			new_spte = restore_acc_track_spte(new_spte);
3477
3478		/*
3479		 * To keep things simple, only SPTEs that are MMU-writable can
3480		 * be made fully writable outside of mmu_lock, e.g. only SPTEs
3481		 * that were write-protected for dirty-logging or access
3482		 * tracking are handled here.  Don't bother checking if the
3483		 * SPTE is writable to prioritize running with A/D bits enabled.
3484		 * The is_access_allowed() check above handles the common case
3485		 * of the fault being spurious, and the SPTE is known to be
3486		 * shadow-present, i.e. except for access tracking restoration
3487		 * making the new SPTE writable, the check is wasteful.
3488		 */
3489		if (fault->write && is_mmu_writable_spte(spte)) {
3490			new_spte |= PT_WRITABLE_MASK;
3491
3492			/*
3493			 * Do not fix write-permission on the large spte when
3494			 * dirty logging is enabled.  Since only the first page
3495			 * is marked dirty in the dirty-bitmap by
3496			 * fast_pf_fix_direct_spte(), the other pages would be
3497			 * missed if the slot has dirty logging enabled.
3498			 *
3499			 * Instead, we let the slow page fault path create a
3500			 * normal spte to fix the access.
3501			 */
3502			if (sp->role.level > PG_LEVEL_4K &&
3503			    kvm_slot_dirty_track_enabled(fault->slot))
3504				break;
3505		}
3506
3507		/* Verify that the fault can be handled in the fast path */
3508		if (new_spte == spte ||
3509		    !is_access_allowed(fault, new_spte))
3510			break;
3511
3512		/*
3513		 * Currently, fast page fault only works for direct mapping
3514		 * since the gfn is not stable for indirect shadow page. See
3515		 * Documentation/virt/kvm/locking.rst to get more detail.
3516		 */
3517		if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3518			ret = RET_PF_FIXED;
3519			break;
3520		}
3521
3522		if (++retry_count > 4) {
3523			pr_warn_once("Fast #PF retrying more than 4 times.\n");
3524			break;
3525		}
3526
3527	} while (true);
3528
3529	trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3530	walk_shadow_page_lockless_end(vcpu);
3531
3532	if (ret != RET_PF_INVALID)
3533		vcpu->stat.pf_fast++;
3534
3535	return ret;
3536}
3537
3538static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3539			       struct list_head *invalid_list)
3540{
3541	struct kvm_mmu_page *sp;
3542
3543	if (!VALID_PAGE(*root_hpa))
3544		return;
3545
3546	sp = root_to_sp(*root_hpa);
3547	if (WARN_ON_ONCE(!sp))
3548		return;
3549
3550	if (is_tdp_mmu_page(sp))
3551		kvm_tdp_mmu_put_root(kvm, sp, false);
3552	else if (!--sp->root_count && sp->role.invalid)
3553		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3554
3555	*root_hpa = INVALID_PAGE;
3556}
3557
3558/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3559void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3560			ulong roots_to_free)
3561{
3562	int i;
3563	LIST_HEAD(invalid_list);
3564	bool free_active_root;
3565
3566	WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
3567
3568	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3569
3570	/* Before acquiring the MMU lock, see if we need to do any real work. */
3571	free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3572		&& VALID_PAGE(mmu->root.hpa);
3573
3574	if (!free_active_root) {
3575		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3576			if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3577			    VALID_PAGE(mmu->prev_roots[i].hpa))
3578				break;
3579
3580		if (i == KVM_MMU_NUM_PREV_ROOTS)
3581			return;
3582	}
3583
3584	write_lock(&kvm->mmu_lock);
3585
3586	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3587		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3588			mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3589					   &invalid_list);
3590
3591	if (free_active_root) {
3592		if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
3593			/* Nothing to cleanup for dummy roots. */
3594		} else if (root_to_sp(mmu->root.hpa)) {
3595			mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3596		} else if (mmu->pae_root) {
3597			for (i = 0; i < 4; ++i) {
3598				if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3599					continue;
3600
3601				mmu_free_root_page(kvm, &mmu->pae_root[i],
3602						   &invalid_list);
3603				mmu->pae_root[i] = INVALID_PAE_ROOT;
3604			}
3605		}
3606		mmu->root.hpa = INVALID_PAGE;
3607		mmu->root.pgd = 0;
3608	}
3609
3610	kvm_mmu_commit_zap_page(kvm, &invalid_list);
3611	write_unlock(&kvm->mmu_lock);
3612}
3613EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3614
3615void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3616{
3617	unsigned long roots_to_free = 0;
3618	struct kvm_mmu_page *sp;
3619	hpa_t root_hpa;
3620	int i;
3621
3622	/*
3623	 * This should not be called while L2 is active; L2 can't invalidate
3624	 * _only_ its own roots, e.g. INVVPID unconditionally exits.
3625	 */
3626	WARN_ON_ONCE(mmu->root_role.guest_mode);
3627
3628	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3629		root_hpa = mmu->prev_roots[i].hpa;
3630		if (!VALID_PAGE(root_hpa))
3631			continue;
3632
3633		sp = root_to_sp(root_hpa);
3634		if (!sp || sp->role.guest_mode)
3635			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3636	}
3637
3638	kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3639}
3640EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3641
3642static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
3643			    u8 level)
3644{
3645	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
3646	struct kvm_mmu_page *sp;
3647
3648	role.level = level;
3649	role.quadrant = quadrant;
3650
3651	WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
3652	WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
3653
3654	sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
3655	++sp->root_count;
3656
3657	return __pa(sp->spt);
3658}
3659
3660static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3661{
3662	struct kvm_mmu *mmu = vcpu->arch.mmu;
3663	u8 shadow_root_level = mmu->root_role.level;
3664	hpa_t root;
3665	unsigned i;
3666	int r;
3667
3668	write_lock(&vcpu->kvm->mmu_lock);
3669	r = make_mmu_pages_available(vcpu);
3670	if (r < 0)
3671		goto out_unlock;
3672
3673	if (tdp_mmu_enabled) {
3674		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3675		mmu->root.hpa = root;
3676	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3677		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
3678		mmu->root.hpa = root;
3679	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3680		if (WARN_ON_ONCE(!mmu->pae_root)) {
3681			r = -EIO;
3682			goto out_unlock;
3683		}
3684
3685		for (i = 0; i < 4; ++i) {
3686			WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3687
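			/*
			 * Each PAE page directory covers 1GiB of guest
			 * physical address space, so the i'th root starts at
			 * gfn i << (30 - PAGE_SHIFT).
			 */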
3688			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
3689					      PT32_ROOT_LEVEL);
3690			mmu->pae_root[i] = root | PT_PRESENT_MASK |
3691					   shadow_me_value;
3692		}
3693		mmu->root.hpa = __pa(mmu->pae_root);
3694	} else {
3695		WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3696		r = -EIO;
3697		goto out_unlock;
3698	}
3699
3700	/* root.pgd is ignored for direct MMUs. */
3701	mmu->root.pgd = 0;
3702out_unlock:
3703	write_unlock(&vcpu->kvm->mmu_lock);
3704	return r;
3705}
3706
3707static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3708{
3709	struct kvm_memslots *slots;
3710	struct kvm_memory_slot *slot;
3711	int r = 0, i, bkt;
3712
3713	/*
3714	 * Check if this is the first shadow root being allocated before
3715	 * taking the lock.
3716	 */
3717	if (kvm_shadow_root_allocated(kvm))
3718		return 0;
3719
3720	mutex_lock(&kvm->slots_arch_lock);
3721
3722	/* Recheck, under the lock, whether this is the first shadow root. */
3723	if (kvm_shadow_root_allocated(kvm))
3724		goto out_unlock;
3725
3726	/*
3727	 * Check if anything actually needs to be allocated, e.g. all metadata
3728	 * will be allocated upfront if TDP is disabled.
3729	 */
3730	if (kvm_memslots_have_rmaps(kvm) &&
3731	    kvm_page_track_write_tracking_enabled(kvm))
3732		goto out_success;
3733
3734	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3735		slots = __kvm_memslots(kvm, i);
3736		kvm_for_each_memslot(slot, bkt, slots) {
3737			/*
3738			 * Both of these functions are no-ops if the target is
3739			 * already allocated, so unconditionally calling both
3740			 * is safe.  Intentionally do NOT free allocations on
3741			 * failure to avoid having to track which allocations
3742			 * were made now versus when the memslot was created.
3743			 * The metadata is guaranteed to be freed when the slot
3744			 * is freed, and will be kept/used if userspace retries
3745			 * KVM_RUN instead of killing the VM.
3746			 */
3747			r = memslot_rmap_alloc(slot, slot->npages);
3748			if (r)
3749				goto out_unlock;
3750			r = kvm_page_track_write_tracking_alloc(slot);
3751			if (r)
3752				goto out_unlock;
3753		}
3754	}
3755
3756	/*
3757	 * Ensure that shadow_root_allocated becomes true strictly after
3758	 * all the related pointers are set.
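	 * Pairs with the corresponding read in kvm_shadow_root_allocated().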
3759	 */
3760out_success:
3761	smp_store_release(&kvm->arch.shadow_root_allocated, true);
3762
3763out_unlock:
3764	mutex_unlock(&kvm->slots_arch_lock);
3765	return r;
3766}
3767
3768static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3769{
3770	struct kvm_mmu *mmu = vcpu->arch.mmu;
3771	u64 pdptrs[4], pm_mask;
3772	gfn_t root_gfn, root_pgd;
3773	int quadrant, i, r;
3774	hpa_t root;
3775
3776	root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
3777	root_gfn = root_pgd >> PAGE_SHIFT;
3778
3779	if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3780		mmu->root.hpa = kvm_mmu_get_dummy_root();
3781		return 0;
3782	}
3783
3784	/*
3785	 * On SVM, reading PDPTRs might access guest memory, which might fault
3786	 * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
3787	 */
3788	if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3789		for (i = 0; i < 4; ++i) {
3790			pdptrs[i] = mmu->get_pdptr(vcpu, i);
3791			if (!(pdptrs[i] & PT_PRESENT_MASK))
3792				continue;
3793
3794			if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
3795				pdptrs[i] = 0;
3796		}
3797	}
3798
3799	r = mmu_first_shadow_root_alloc(vcpu->kvm);
3800	if (r)
3801		return r;
3802
3803	write_lock(&vcpu->kvm->mmu_lock);
3804	r = make_mmu_pages_available(vcpu);
3805	if (r < 0)
3806		goto out_unlock;
3807
3808	/*
3809	 * Do we shadow a long mode page table? If so, we need to
3810	 * write-protect the guest's page table root.
3811	 */
3812	if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3813		root = mmu_alloc_root(vcpu, root_gfn, 0,
3814				      mmu->root_role.level);
3815		mmu->root.hpa = root;
3816		goto set_root_pgd;
3817	}
3818
3819	if (WARN_ON_ONCE(!mmu->pae_root)) {
3820		r = -EIO;
3821		goto out_unlock;
3822	}
3823
3824	/*
3825	 * We shadow a 32-bit page table. This may be a legacy 2-level
3826	 * or a PAE 3-level page table. In either case we need to be aware that
3827	 * the shadow page table may be a PAE or a long mode page table.
3828	 */
3829	pm_mask = PT_PRESENT_MASK | shadow_me_value;
3830	if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
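		/*
		 * These upper levels are synthesized by KVM and have no guest
		 * counterpart, so make the entries fully permissive; the
		 * guest's permissions are enforced by the shadow page tables
		 * below.
		 */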
3831		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3832
3833		if (WARN_ON_ONCE(!mmu->pml4_root)) {
3834			r = -EIO;
3835			goto out_unlock;
3836		}
3837		mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3838
3839		if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
3840			if (WARN_ON_ONCE(!mmu->pml5_root)) {
3841				r = -EIO;
3842				goto out_unlock;
3843			}
3844			mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3845		}
3846	}
3847
3848	for (i = 0; i < 4; ++i) {
3849		WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3850
3851		if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3852			if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3853				mmu->pae_root[i] = INVALID_PAE_ROOT;
3854				continue;
3855			}
3856			root_gfn = pdptrs[i] >> PAGE_SHIFT;
3857		}
3858
3859		/*
3860		 * If shadowing 32-bit non-PAE page tables, each PAE page
3861		 * directory maps one quarter of the guest's non-PAE page
3862		 * directory. Otherwise, each PAE page directory shadows one guest
3863		 * PAE page directory, so the quadrant is always 0.
3864		 */
3865		quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
3866
3867		root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
3868		mmu->pae_root[i] = root | pm_mask;
3869	}
3870
3871	if (mmu->root_role.level == PT64_ROOT_5LEVEL)
3872		mmu->root.hpa = __pa(mmu->pml5_root);
3873	else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
3874		mmu->root.hpa = __pa(mmu->pml4_root);
3875	else
3876		mmu->root.hpa = __pa(mmu->pae_root);
3877
3878set_root_pgd:
3879	mmu->root.pgd = root_pgd;
3880out_unlock:
3881	write_unlock(&vcpu->kvm->mmu_lock);
3882
3883	return r;
3884}
3885
3886static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3887{
3888	struct kvm_mmu *mmu = vcpu->arch.mmu;
3889	bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
3890	u64 *pml5_root = NULL;
3891	u64 *pml4_root = NULL;
3892	u64 *pae_root;
3893
3894	/*
3895	 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3896	 * tables are allocated and initialized at root creation as there is no
3897	 * equivalent level in the guest's NPT to shadow.  Allocate the tables
3898	 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3899	 */
3900	if (mmu->root_role.direct ||
3901	    mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
3902	    mmu->root_role.level < PT64_ROOT_4LEVEL)
3903		return 0;
3904
3905	/*
3906	 * NPT, the only paging mode that uses this horror, uses a fixed number
3907	 * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3908	 * all MMUs are 5-level.  Thus, this can safely require that pml5_root
3909	 * is allocated if the other roots are valid and pml5 is needed, as any
3910	 * prior MMU would also have required pml5.
3911	 */
3912	if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3913		return 0;
3914
3915	/*
3916	 * The special roots should always be allocated in concert.  Yell and
3917	 * bail if KVM ends up in a state where only one of the roots is valid.
3918	 */
3919	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3920			 (need_pml5 && mmu->pml5_root)))
3921		return -EIO;
3922
3923	/*
3924	 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3925	 * doesn't need to be decrypted.
3926	 */
3927	pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3928	if (!pae_root)
3929		return -ENOMEM;
3930
3931#ifdef CONFIG_X86_64
3932	pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3933	if (!pml4_root)
3934		goto err_pml4;
3935
3936	if (need_pml5) {
3937		pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3938		if (!pml5_root)
3939			goto err_pml5;
3940	}
3941#endif
3942
3943	mmu->pae_root = pae_root;
3944	mmu->pml4_root = pml4_root;
3945	mmu->pml5_root = pml5_root;
3946
3947	return 0;
3948
3949#ifdef CONFIG_X86_64
3950err_pml5:
3951	free_page((unsigned long)pml4_root);
3952err_pml4:
3953	free_page((unsigned long)pae_root);
3954	return -ENOMEM;
3955#endif
3956}
3957
3958static bool is_unsync_root(hpa_t root)
3959{
3960	struct kvm_mmu_page *sp;
3961
3962	if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
3963		return false;
3964
3965	/*
3966	 * The read barrier orders the CPU's read of SPTE.W during the page table
3967	 * walk before the reads of sp->unsync/sp->unsync_children here.
3968	 *
3969	 * Even if another CPU was marking the SP as unsync-ed simultaneously,
3970	 * any guest page table changes are not guaranteed to be visible anyway
3971	 * until this VCPU issues a TLB flush strictly after those changes are
3972	 * made.  We only need to ensure that the other CPU sets these flags
3973	 * before any actual changes to the page tables are made.  The comments
3974	 * in mmu_try_to_unsync_pages() describe what could go wrong if this
3975	 * requirement isn't satisfied.
3976	 */
3977	smp_rmb();
3978	sp = root_to_sp(root);
3979
3980	/*
3981	 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages; the
3982	 * PDPTEs for a given PAE root need to be synchronized individually.
3983	 */
3984	if (WARN_ON_ONCE(!sp))
3985		return false;
3986
3987	if (sp->unsync || sp->unsync_children)
3988		return true;
3989
3990	return false;
3991}
3992
3993void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3994{
3995	int i;
3996	struct kvm_mmu_page *sp;
3997
3998	if (vcpu->arch.mmu->root_role.direct)
3999		return;
4000
4001	if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
4002		return;
4003
4004	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4005
4006	if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
4007		hpa_t root = vcpu->arch.mmu->root.hpa;
4008
4009		if (!is_unsync_root(root))
4010			return;
4011
4012		sp = root_to_sp(root);
4013
4014		write_lock(&vcpu->kvm->mmu_lock);
4015		mmu_sync_children(vcpu, sp, true);
4016		write_unlock(&vcpu->kvm->mmu_lock);
4017		return;
4018	}
4019
4020	write_lock(&vcpu->kvm->mmu_lock);
4021
4022	for (i = 0; i < 4; ++i) {
4023		hpa_t root = vcpu->arch.mmu->pae_root[i];
4024
4025		if (IS_VALID_PAE_ROOT(root)) {
4026			sp = spte_to_child_sp(root);
4027			mmu_sync_children(vcpu, sp, true);
4028		}
4029	}
4030
4031	write_unlock(&vcpu->kvm->mmu_lock);
4032}
4033
4034void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
4035{
4036	unsigned long roots_to_free = 0;
4037	int i;
4038
4039	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4040		if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
4041			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
4042
4043	/* sync prev_roots by simply freeing them */
4044	kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
4045}
4046
4047static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4048				  gpa_t vaddr, u64 access,
4049				  struct x86_exception *exception)
4050{
4051	if (exception)
4052		exception->error_code = 0;
4053	return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
4054}
4055
4056static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4057{
4058	/*
4059	 * A nested guest cannot use the MMIO cache if it is using nested
4060	 * page tables, because cr2 is an nGPA while the cache stores GPAs.
4061	 */
4062	if (mmu_is_nested(vcpu))
4063		return false;
4064
4065	if (direct)
4066		return vcpu_match_mmio_gpa(vcpu, addr);
4067
4068	return vcpu_match_mmio_gva(vcpu, addr);
4069}
4070
4071/*
4072 * Return the level of the lowest level SPTE added to sptes.
4073 * That SPTE may be non-present.
4074 *
4075 * Must be called between walk_shadow_page_lockless_{begin,end}.
4076 */
4077static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
4078{
4079	struct kvm_shadow_walk_iterator iterator;
4080	int leaf = -1;
4081	u64 spte;
4082
4083	for (shadow_walk_init(&iterator, vcpu, addr),
4084	     *root_level = iterator.level;
4085	     shadow_walk_okay(&iterator);
4086	     __shadow_walk_next(&iterator, spte)) {
4087		leaf = iterator.level;
4088		spte = mmu_spte_get_lockless(iterator.sptep);
4089
4090		sptes[leaf] = spte;
4091	}
4092
4093	return leaf;
4094}
4095
4096/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
4097static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
4098{
4099	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
4100	struct rsvd_bits_validate *rsvd_check;
4101	int root, leaf, level;
4102	bool reserved = false;
4103
4104	walk_shadow_page_lockless_begin(vcpu);
4105
4106	if (is_tdp_mmu_active(vcpu))
4107		leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
4108	else
4109		leaf = get_walk(vcpu, addr, sptes, &root);
4110
4111	walk_shadow_page_lockless_end(vcpu);
4112
4113	if (unlikely(leaf < 0)) {
4114		*sptep = 0ull;
4115		return reserved;
4116	}
4117
4118	*sptep = sptes[leaf];
4119
4120	/*
4121	 * Skip reserved bits checks on the terminal leaf if it's not a valid
4122	 * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
4123	 * design, always have reserved bits set.  The purpose of the checks is
4124	 * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
4125	 */
4126	if (!is_shadow_present_pte(sptes[leaf]))
4127		leaf++;
4128
4129	rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
4130
4131	for (level = root; level >= leaf; level--)
4132		reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
4133
4134	if (reserved) {
4135		pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
4136		       __func__, addr);
4137		for (level = root; level >= leaf; level--)
4138			pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
4139			       sptes[level], level,
4140			       get_rsvd_bits(rsvd_check, sptes[level], level));
4141	}
4142
4143	return reserved;
4144}
4145
4146static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4147{
4148	u64 spte;
4149	bool reserved;
4150
4151	if (mmio_info_in_cache(vcpu, addr, direct))
4152		return RET_PF_EMULATE;
4153
4154	reserved = get_mmio_spte(vcpu, addr, &spte);
4155	if (WARN_ON_ONCE(reserved))
4156		return -EINVAL;
4157
4158	if (is_mmio_spte(spte)) {
4159		gfn_t gfn = get_mmio_spte_gfn(spte);
4160		unsigned int access = get_mmio_spte_access(spte);
4161
4162		if (!check_mmio_spte(vcpu, spte))
4163			return RET_PF_INVALID;
4164
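		/*
		 * "addr" is a GPA when the MMU is direct, i.e. there is no
		 * GVA to cache; pass 0 so only the gfn and access are cached.
		 */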
4165		if (direct)
4166			addr = 0;
4167
4168		trace_handle_mmio_page_fault(addr, gfn, access);
4169		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4170		return RET_PF_EMULATE;
4171	}
4172
4173	/*
4174	 * If the page table is zapped by other CPUs, let the CPU fault again
4175	 * on the address.
4176	 */
4177	return RET_PF_RETRY;
4178}
4179
4180static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4181					 struct kvm_page_fault *fault)
4182{
4183	if (unlikely(fault->rsvd))
4184		return false;
4185
4186	if (!fault->present || !fault->write)
4187		return false;
4188
4189	/*
4190	 * The guest is writing a page that is write-tracked, which cannot
4191	 * be fixed by the page fault handler.
4192	 */
4193	if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
4194		return true;
4195
4196	return false;
4197}
4198
4199static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4200{
4201	struct kvm_shadow_walk_iterator iterator;
4202	u64 spte;
4203
4204	walk_shadow_page_lockless_begin(vcpu);
4205	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
4206		clear_sp_write_flooding_count(iterator.sptep);
4207	walk_shadow_page_lockless_end(vcpu);
4208}
4209
4210static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
4211{
4212	/* make sure the token value is not 0 */
4213	u32 id = vcpu->arch.apf.id;
4214
4215	if (id << 12 == 0)
4216		vcpu->arch.apf.id = 1;
4217
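	/*
	 * The token is the per-vCPU counter shifted left by 12, OR'd with
	 * the vcpu_id; resetting the counter above when the shifted value
	 * hits zero keeps the token non-zero.
	 */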
4218	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4219}
4220
4221static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4222				    gfn_t gfn)
4223{
4224	struct kvm_arch_async_pf arch;
4225
4226	arch.token = alloc_apf_token(vcpu);
4227	arch.gfn = gfn;
4228	arch.direct_map = vcpu->arch.mmu->root_role.direct;
4229	arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
4230
4231	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4232				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4233}
4234
4235void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
4236{
4237	int r;
4238
4239	if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
4240	      work->wakeup_all)
4241		return;
4242
4243	r = kvm_mmu_reload(vcpu);
4244	if (unlikely(r))
4245		return;
4246
4247	if (!vcpu->arch.mmu->root_role.direct &&
4248	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
4249		return;
4250
4251	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
4252}
4253
4254static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4255{
4256	struct kvm_memory_slot *slot = fault->slot;
4257	bool async;
4258
4259	/*
4260	 * Retry the page fault if the gfn hit a memslot that is being deleted
4261	 * or moved.  This ensures any existing SPTEs for the old memslot will
4262	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
4263	 */
4264	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
4265		return RET_PF_RETRY;
4266
4267	if (!kvm_is_visible_memslot(slot)) {
4268		/* Don't expose private memslots to L2. */
4269		if (is_guest_mode(vcpu)) {
4270			fault->slot = NULL;
4271			fault->pfn = KVM_PFN_NOSLOT;
4272			fault->map_writable = false;
4273			return RET_PF_CONTINUE;
4274		}
4275		/*
4276		 * If the APIC access page exists but is disabled, go directly
4277		 * to emulation without caching the MMIO access or creating a
4278		 * MMIO SPTE.  That way the cache doesn't need to be purged
4279		 * when the AVIC is re-enabled.
4280		 */
4281		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
4282		    !kvm_apicv_activated(vcpu->kvm))
4283			return RET_PF_EMULATE;
4284	}
4285
4286	async = false;
4287	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
4288					  fault->write, &fault->map_writable,
4289					  &fault->hva);
4290	if (!async)
4291		return RET_PF_CONTINUE; /* fault->pfn already holds the correct page */
4292
4293	if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
4294		trace_kvm_try_async_get_page(fault->addr, fault->gfn);
4295		if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
4296			trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
4297			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4298			return RET_PF_RETRY;
4299		} else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
4300			return RET_PF_RETRY;
4301		}
4302	}
4303
4304	/*
4305	 * Allow gup to bail on pending non-fatal signals when it's also allowed
4306	 * to wait for IO.  Note, gup always bails if it is unable to quickly
4307	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
4308	 */
4309	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
4310					  fault->write, &fault->map_writable,
4311					  &fault->hva);
4312	return RET_PF_CONTINUE;
4313}
4314
4315static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
4316			   unsigned int access)
4317{
4318	int ret;
4319
4320	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
4321	smp_rmb();
4322
4323	ret = __kvm_faultin_pfn(vcpu, fault);
4324	if (ret != RET_PF_CONTINUE)
4325		return ret;
4326
4327	if (unlikely(is_error_pfn(fault->pfn)))
4328		return kvm_handle_error_pfn(vcpu, fault);
4329
4330	if (unlikely(!fault->slot))
4331		return kvm_handle_noslot_fault(vcpu, fault, access);
4332
4333	return RET_PF_CONTINUE;
4334}
4335
4336/*
4337 * Returns true if the page fault is stale and needs to be retried, i.e. if the
4338 * root was invalidated by a memslot update or a relevant mmu_notifier fired.
4339 */
4340static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
4341				struct kvm_page_fault *fault)
4342{
4343	struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
4344
4345	/* Special roots, e.g. pae_root, are not backed by shadow pages. */
4346	if (sp && is_obsolete_sp(vcpu->kvm, sp))
4347		return true;
4348
4349	/*
4350	 * Roots without an associated shadow page are considered invalid if
4351	 * there is a pending request to free obsolete roots.  The request is
4352	 * only a hint that the current root _may_ be obsolete and needs to be
4353	 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
4354	 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
4355	 * to reload even if no vCPU is actively using the root.
4356	 */
4357	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
4358		return true;
4359
4360	return fault->slot &&
4361	       mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
4362}
4363
4364static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4365{
4366	int r;
4367
4368	/* Dummy roots are used only for shadowing bad guest roots. */
4369	if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
4370		return RET_PF_RETRY;
4371
4372	if (page_fault_handle_page_track(vcpu, fault))
4373		return RET_PF_EMULATE;
4374
4375	r = fast_page_fault(vcpu, fault);
4376	if (r != RET_PF_INVALID)
4377		return r;
4378
4379	r = mmu_topup_memory_caches(vcpu, false);
4380	if (r)
4381		return r;
4382
4383	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
4384	if (r != RET_PF_CONTINUE)
4385		return r;
4386
4387	r = RET_PF_RETRY;
4388	write_lock(&vcpu->kvm->mmu_lock);
4389
4390	if (is_page_fault_stale(vcpu, fault))
4391		goto out_unlock;
4392
4393	r = make_mmu_pages_available(vcpu);
4394	if (r)
4395		goto out_unlock;
4396
4397	r = direct_map(vcpu, fault);
4398
4399out_unlock:
4400	write_unlock(&vcpu->kvm->mmu_lock);
4401	kvm_release_pfn_clean(fault->pfn);
4402	return r;
4403}
4404
4405static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4406				struct kvm_page_fault *fault)
4407{
4408	/* This path builds a PAE page table; 2MB is the largest page we can map. */
4409	fault->max_level = PG_LEVEL_2M;
4410	return direct_page_fault(vcpu, fault);
4411}
4412
4413int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4414				u64 fault_address, char *insn, int insn_len)
4415{
4416	int r = 1;
4417	u32 flags = vcpu->arch.apf.host_apf_flags;
4418
4419#ifndef CONFIG_X86_64
4420	/* A 64-bit CR2 should be impossible on 32-bit KVM. */
4421	if (WARN_ON_ONCE(fault_address >> 32))
4422		return -EFAULT;
4423#endif
4424
4425	vcpu->arch.l1tf_flush_l1d = true;
4426	if (!flags) {
4427		trace_kvm_page_fault(vcpu, fault_address, error_code);
4428
4429		if (kvm_event_needs_reinjection(vcpu))
4430			kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4431		r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4432				insn_len);
4433	} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4434		vcpu->arch.apf.host_apf_flags = 0;
4435		local_irq_disable();
4436		kvm_async_pf_task_wait_schedule(fault_address);
4437		local_irq_enable();
4438	} else {
4439		WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4440	}
4441
4442	return r;
4443}
4444EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4445
4446#ifdef CONFIG_X86_64
4447static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
4448				  struct kvm_page_fault *fault)
4449{
4450	int r;
4451
4452	if (page_fault_handle_page_track(vcpu, fault))
4453		return RET_PF_EMULATE;
4454
4455	r = fast_page_fault(vcpu, fault);
4456	if (r != RET_PF_INVALID)
4457		return r;
4458
4459	r = mmu_topup_memory_caches(vcpu, false);
4460	if (r)
4461		return r;
4462
4463	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
4464	if (r != RET_PF_CONTINUE)
4465		return r;
4466
4467	r = RET_PF_RETRY;
4468	read_lock(&vcpu->kvm->mmu_lock);
4469
4470	if (is_page_fault_stale(vcpu, fault))
4471		goto out_unlock;
4472
4473	r = kvm_tdp_mmu_map(vcpu, fault);
4474
4475out_unlock:
4476	read_unlock(&vcpu->kvm->mmu_lock);
4477	kvm_release_pfn_clean(fault->pfn);
4478	return r;
4479}
4480#endif
4481
4482int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4483{
4484	/*
4485	 * If the guest's MTRRs may be used to compute the "real" memtype,
4486	 * restrict the mapping level to ensure KVM uses a consistent memtype
4487	 * across the entire mapping.  If the host MTRRs are ignored by TDP
4488	 * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
4489	 * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
4490	 * from the guest's MTRRs so that guest accesses to memory that is
4491	 * DMA'd aren't cached against the guest's wishes.
4492	 *
4493	 * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
4494	 * e.g. KVM will force UC memtype for host MMIO.
4495	 */
4496	if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
4497		for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
4498			int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4499			gfn_t base = gfn_round_for_level(fault->gfn,
4500							 fault->max_level);
4501
4502			if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4503				break;
4504		}
4505	}
4506
4507#ifdef CONFIG_X86_64
4508	if (tdp_mmu_enabled)
4509		return kvm_tdp_mmu_page_fault(vcpu, fault);
4510#endif
4511
4512	return direct_page_fault(vcpu, fault);
4513}
4514
4515static void nonpaging_init_context(struct kvm_mmu *context)
4516{
4517	context->page_fault = nonpaging_page_fault;
4518	context->gva_to_gpa = nonpaging_gva_to_gpa;
4519	context->sync_spte = NULL;
4520}
4521
4522static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4523				  union kvm_mmu_page_role role)
4524{
4525	struct kvm_mmu_page *sp;
4526
4527	if (!VALID_PAGE(root->hpa))
4528		return false;
4529
4530	if (!role.direct && pgd != root->pgd)
4531		return false;
4532
4533	sp = root_to_sp(root->hpa);
4534	if (WARN_ON_ONCE(!sp))
4535		return false;
4536
4537	return role.word == sp->role.word;
4538}
4539
4540/*
4541 * Find out if a previously cached root matching the new pgd/role is available,
4542 * and insert the current root as the MRU in the cache.
4543 * If a matching root is found, it is assigned to kvm_mmu->root and
4544 * true is returned.
4545 * If no match is found, kvm_mmu->root is left invalid, the LRU root is
4546 * evicted to make room for the current root, and false is returned.
4547 */
4548static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4549					      gpa_t new_pgd,
4550					      union kvm_mmu_page_role new_role)
4551{
4552	uint i;
4553
4554	if (is_root_usable(&mmu->root, new_pgd, new_role))
4555		return true;
4556
4557	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4558		/*
4559		 * The swaps end up rotating the cache like this:
4560		 *   C   0 1 2 3   (on entry to the function)
4561		 *   0   C 1 2 3
4562		 *   1   C 0 2 3
4563		 *   2   C 0 1 3
4564		 *   3   C 0 1 2   (on exit from the loop)
4565		 */
4566		swap(mmu->root, mmu->prev_roots[i]);
4567		if (is_root_usable(&mmu->root, new_pgd, new_role))
4568			return true;
4569	}
4570
4571	kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4572	return false;
4573}
4574
4575/*
4576 * Find out if a previously cached root matching the new pgd/role is available.
4577 * On entry, mmu->root is invalid.
4578 * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4579 * of the cache becomes invalid, and true is returned.
4580 * If no match is found, kvm_mmu->root is left invalid and false is returned.
4581 */
4582static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4583					     gpa_t new_pgd,
4584					     union kvm_mmu_page_role new_role)
4585{
4586	uint i;
4587
4588	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4589		if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4590			goto hit;
4591
4592	return false;
4593
4594hit:
4595	swap(mmu->root, mmu->prev_roots[i]);
4596	/* Bubble up the remaining roots.  */
4597	for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4598		mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4599	mmu->prev_roots[i].hpa = INVALID_PAGE;
4600	return true;
4601}
4602
4603static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4604			    gpa_t new_pgd, union kvm_mmu_page_role new_role)
4605{
4606	/*
4607	 * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
4608	 * avoid having to deal with PDPTEs and other complexities.
4609	 */
4610	if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
4611		kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4612
4613	if (VALID_PAGE(mmu->root.hpa))
4614		return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4615	else
4616		return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4617}
4618
4619void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4620{
4621	struct kvm_mmu *mmu = vcpu->arch.mmu;
4622	union kvm_mmu_page_role new_role = mmu->root_role;
4623
4624	/*
4625	 * Return immediately if no usable root was found, kvm_mmu_reload()
4626	 * will establish a valid root prior to the next VM-Enter.
4627	 */
4628	if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
4629		return;
4630
4631	/*
4632	 * It's possible that the cached previous root page is obsolete because
4633	 * of a change in the MMU generation number. However, changing the
4634	 * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4635	 * which will free the root set here and allocate a new one.
4636	 */
4637	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4638
4639	if (force_flush_and_sync_on_reuse) {
4640		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4641		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4642	}
4643
4644	/*
4645	 * The last MMIO access's GVA and GPA are cached in the VCPU. When
4646	 * switching to a new CR3, that GVA->GPA mapping may no longer be
4647	 * valid. So clear any cached MMIO info even when we don't need to sync
4648	 * the shadow page tables.
4649	 */
4650	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4651
4652	/*
4653	 * If this is a direct root page, it doesn't have a write flooding
4654	 * count. Otherwise, clear the write flooding count.
4655	 */
4656	if (!new_role.direct) {
4657		struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
4658
4659		if (!WARN_ON_ONCE(!sp))
4660			__clear_sp_write_flooding_count(sp);
4661	}
4662}
4663EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4664
4665static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4666			   unsigned int access)
4667{
4668	if (unlikely(is_mmio_spte(*sptep))) {
4669		if (gfn != get_mmio_spte_gfn(*sptep)) {
4670			mmu_spte_clear_no_track(sptep);
4671			return true;
4672		}
4673
4674		mark_mmio_spte(vcpu, sptep, gfn, access);
4675		return true;
4676	}
4677
4678	return false;
4679}
4680
4681#define PTTYPE_EPT 18 /* arbitrary */
4682#define PTTYPE PTTYPE_EPT
4683#include "paging_tmpl.h"
4684#undef PTTYPE
4685
4686#define PTTYPE 64
4687#include "paging_tmpl.h"
4688#undef PTTYPE
4689
4690#define PTTYPE 32
4691#include "paging_tmpl.h"
4692#undef PTTYPE
4693
4694static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4695				    u64 pa_bits_rsvd, int level, bool nx,
4696				    bool gbpages, bool pse, bool amd)
4697{
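	/*
	 * rsvd_bits_mask is indexed first by bit 7 of the entry (the
	 * large-page/PS bit) and then by level - 1; see __is_rsvd_bits_set().
	 */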
4698	u64 gbpages_bit_rsvd = 0;
4699	u64 nonleaf_bit8_rsvd = 0;
4700	u64 high_bits_rsvd;
4701
4702	rsvd_check->bad_mt_xwr = 0;
4703
4704	if (!gbpages)
4705		gbpages_bit_rsvd = rsvd_bits(7, 7);
4706
4707	if (level == PT32E_ROOT_LEVEL)
4708		high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4709	else
4710		high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4711
4712	/* Note, NX doesn't exist in PDPTEs; this is handled below. */
4713	if (!nx)
4714		high_bits_rsvd |= rsvd_bits(63, 63);
4715
4716	/*
4717	 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4718	 * leaf entries) on AMD CPUs only.
4719	 */
4720	if (amd)
4721		nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4722
4723	switch (level) {
4724	case PT32_ROOT_LEVEL:
4725		/* no rsvd bits for 2 level 4K page table entries */
4726		rsvd_check->rsvd_bits_mask[0][1] = 0;
4727		rsvd_check->rsvd_bits_mask[0][0] = 0;
4728		rsvd_check->rsvd_bits_mask[1][0] =
4729			rsvd_check->rsvd_bits_mask[0][0];
4730
4731		if (!pse) {
4732			rsvd_check->rsvd_bits_mask[1][1] = 0;
4733			break;
4734		}
4735
4736		if (is_cpuid_PSE36())
4737			/* 36-bit PSE 4MB page */
4738			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4739		else
4740			/* 32-bit PSE 4MB page */
4741			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4742		break;
4743	case PT32E_ROOT_LEVEL:
4744		rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4745						   high_bits_rsvd |
4746						   rsvd_bits(5, 8) |
4747						   rsvd_bits(1, 2);	/* PDPTE */
4748		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;	/* PDE */
4749		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;	/* PTE */
4750		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4751						   rsvd_bits(13, 20);	/* large page */
4752		rsvd_check->rsvd_bits_mask[1][0] =
4753			rsvd_check->rsvd_bits_mask[0][0];
4754		break;
4755	case PT64_ROOT_5LEVEL:
4756		rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4757						   nonleaf_bit8_rsvd |
4758						   rsvd_bits(7, 7);
4759		rsvd_check->rsvd_bits_mask[1][4] =
4760			rsvd_check->rsvd_bits_mask[0][4];
4761		fallthrough;
4762	case PT64_ROOT_4LEVEL:
4763		rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4764						   nonleaf_bit8_rsvd |
4765						   rsvd_bits(7, 7);
4766		rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4767						   gbpages_bit_rsvd;
4768		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4769		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4770		rsvd_check->rsvd_bits_mask[1][3] =
4771			rsvd_check->rsvd_bits_mask[0][3];
4772		rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4773						   gbpages_bit_rsvd |
4774						   rsvd_bits(13, 29);
4775		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4776						   rsvd_bits(13, 20); /* large page */
4777		rsvd_check->rsvd_bits_mask[1][0] =
4778			rsvd_check->rsvd_bits_mask[0][0];
4779		break;
4780	}
4781}
4782
4783static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4784					struct kvm_mmu *context)
4785{
4786	__reset_rsvds_bits_mask(&context->guest_rsvd_check,
4787				vcpu->arch.reserved_gpa_bits,
4788				context->cpu_role.base.level, is_efer_nx(context),
4789				guest_can_use(vcpu, X86_FEATURE_GBPAGES),
4790				is_cr4_pse(context),
4791				guest_cpuid_is_amd_or_hygon(vcpu));
4792}
4793
4794static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4795					u64 pa_bits_rsvd, bool execonly,
4796					int huge_page_level)
4797{
4798	u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4799	u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
4800	u64 bad_mt_xwr;
4801
4802	if (huge_page_level < PG_LEVEL_1G)
4803		large_1g_rsvd = rsvd_bits(7, 7);
4804	if (huge_page_level < PG_LEVEL_2M)
4805		large_2m_rsvd = rsvd_bits(7, 7);
4806
4807	rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4808	rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
4809	rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4810	rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
4811	rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4812
4813	/* large page */
4814	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4815	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4816	rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4817	rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
4818	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4819
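	/*
	 * bad_mt_xwr is indexed by bits 5:0 of an EPT entry: byte N covers
	 * memtype N (bits 5:3), and each bit within a byte covers an XWR
	 * combination (bits 2:0).  A set bit marks the combination reserved.
	 */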
4820	bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
4821	bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
4822	bad_mt_xwr |= 0xFFull << (7 * 8);	/* bits 3..5 must not be 7 */
4823	bad_mt_xwr |= REPEAT_BYTE(1ull << 2);	/* bits 0..2 must not be 010 */
4824	bad_mt_xwr |= REPEAT_BYTE(1ull << 6);	/* bits 0..2 must not be 110 */
4825	if (!execonly) {
4826		/* bits 0..2 must not be 100 unless VMX capabilities allow it */
4827		bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4828	}
4829	rsvd_check->bad_mt_xwr = bad_mt_xwr;
4830}
4831
4832static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4833		struct kvm_mmu *context, bool execonly, int huge_page_level)
4834{
4835	__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4836				    vcpu->arch.reserved_gpa_bits, execonly,
4837				    huge_page_level);
4838}
4839
4840static inline u64 reserved_hpa_bits(void)
4841{
4842	return rsvd_bits(shadow_phys_bits, 63);
4843}
4844
4845/*
4846 * The page table on the host is the shadow page table for the page
4847 * table in the guest or AMD nested guest; its MMU features completely
4848 * follow the features in the guest.
4849 */
4850static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4851					struct kvm_mmu *context)
4852{
4853	/* @amd adds a check on bit 8 of SPTEs, which KVM shouldn't use anyway. */
4854	bool is_amd = true;
4855	/* KVM doesn't use 2-level page tables for the shadow MMU. */
4856	bool is_pse = false;
4857	struct rsvd_bits_validate *shadow_zero_check;
4858	int i;
4859
4860	WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
4861
4862	shadow_zero_check = &context->shadow_zero_check;
4863	__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4864				context->root_role.level,
4865				context->root_role.efer_nx,
4866				guest_can_use(vcpu, X86_FEATURE_GBPAGES),
4867				is_pse, is_amd);
4868
4869	if (!shadow_me_mask)
4870		return;
4871
4872	for (i = context->root_role.level; --i >= 0;) {
4873		/*
4874		 * So far shadow_me_value is a constant during KVM's lifetime.
4875		 * Bits in shadow_me_value are allowed to be set.
4876		 * Bits in shadow_me_mask but not in shadow_me_value are
4877		 * not allowed to be set.
4878		 */
4879		shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
4880		shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
4881		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
4882		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
4883	}
4885}
4886
4887static inline bool boot_cpu_is_amd(void)
4888{
4889	WARN_ON_ONCE(!tdp_enabled);
4890	return shadow_x_mask == 0;
4891}
4892
4893/*
4894 * The direct page table on the host uses as many MMU features as
4895 * possible; however, KVM currently does not do execution-protection.
4896 */
4897static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
4898{
4899	struct rsvd_bits_validate *shadow_zero_check;
4900	int i;
4901
4902	shadow_zero_check = &context->shadow_zero_check;
4903
4904	if (boot_cpu_is_amd())
4905		__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4906					context->root_role.level, true,
4907					boot_cpu_has(X86_FEATURE_GBPAGES),
4908					false, true);
4909	else
4910		__reset_rsvds_bits_mask_ept(shadow_zero_check,
4911					    reserved_hpa_bits(), false,
4912					    max_huge_page_level);
4913
4914	if (!shadow_me_mask)
4915		return;
4916
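	/* Memory encryption bits are never treated as reserved in TDP SPTEs. */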
4917	for (i = context->root_role.level; --i >= 0;) {
4918		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4919		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4920	}
4921}
4922
4923/*
4924 * Same as the comments in reset_shadow_zero_bits_mask(), except this is
4925 * the shadow page table for an Intel nested guest.
4926 */
4927static void
4928reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
4929{
4930	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4931				    reserved_hpa_bits(), execonly,
4932				    max_huge_page_level);
4933}
4934
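/*
 * BYTE_MASK(access) builds a byte in which bit N (N = 1..7, a combination of
 * ACC_EXEC_MASK, ACC_WRITE_MASK and ACC_USER_MASK) is set iff combination N
 * includes at least one bit of @access.  Bit 0 is never set.
 */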
4935#define BYTE_MASK(access) \
4936	((1 & (access) ? 2 : 0) | \
4937	 (2 & (access) ? 4 : 0) | \
4938	 (3 & (access) ? 8 : 0) | \
4939	 (4 & (access) ? 16 : 0) | \
4940	 (5 & (access) ? 32 : 0) | \
4941	 (6 & (access) ? 64 : 0) | \
4942	 (7 & (access) ? 128 : 0))
4943
4944
4945static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
4946{
4947	unsigned byte;
4948
4949	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4950	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4951	const u8 u = BYTE_MASK(ACC_USER_MASK);
4952
4953	bool cr4_smep = is_cr4_smep(mmu);
4954	bool cr4_smap = is_cr4_smap(mmu);
4955	bool cr0_wp = is_cr0_wp(mmu);
4956	bool efer_nx = is_efer_nx(mmu);
4957
4958	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
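		/*
		 * permissions[] is indexed by the page fault error code
		 * shifted right by one, i.e. with the Present bit dropped;
		 * reconstruct the PFEC here.
		 */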
4959		unsigned pfec = byte << 1;
4960
4961		/*
4962		 * Each "*f" variable has a 1 bit for each UWX value
4963		 * that causes a fault with the given PFEC.
4964		 */
4965
4966		/* Faults from writes to non-writable pages */
4967		u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4968		/* Faults from user mode accesses to supervisor pages */
4969		u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4970		/* Faults from fetches of non-executable pages */
4971		u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4972		/* Faults from kernel mode fetches of user pages */
4973		u8 smepf = 0;
4974		/* Faults from kernel mode accesses of user pages */
4975		u8 smapf = 0;
4976
4977		if (!ept) {
4978			/* Faults from kernel mode accesses to user pages */
4979			u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4980
4981			/* Not really needed: !nx will cause pte.nx to fault */
4982			if (!efer_nx)
4983				ff = 0;
4984
4985			/* Allow supervisor writes if !cr0.wp */
4986			if (!cr0_wp)
4987				wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4988
4989			/* Disallow supervisor fetches of user code if cr4.smep */
4990			if (cr4_smep)
4991				smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4992
4993			/*
4994			 * SMAP:kernel-mode data accesses from user-mode
4995			 * mappings should fault. A fault is considered
4996			 * as a SMAP violation if all of the following
4997			 * conditions are true:
4998			 *   - X86_CR4_SMAP is set in CR4
4999			 *   - A user page is accessed
5000			 *   - The access is not a fetch
5001			 *   - The access is supervisor mode
5002			 *   - If implicit supervisor access or X86_EFLAGS_AC is clear
5003			 *
5004			 * Here, we cover the first four conditions.
5005			 * The fifth is computed dynamically in permission_fault();
5006			 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
5007			 * *not* subject to SMAP restrictions.
5008			 */
5009			if (cr4_smap)
5010				smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
5011		}
5012
5013		mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
5014	}
5015}
5016
5017/*
5018* PKU is an additional mechanism by which paging controls access to
5019* user-mode addresses based on the value in the PKRU register.  Protection
5020* key violations are reported through a bit in the page fault error code.
5021* Unlike other bits of the error code, the PK bit is not known at the
5022* call site of e.g. gva_to_gpa; it must be computed directly in
5023* permission_fault based on two bits of PKRU, on some machine state (CR4,
5024* CR0, EFER, CPL), and on other bits of the error code and the page tables.
5025*
5026* In particular the following conditions come from the error code, the
5027* page tables and the machine state:
5028* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
5029* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
5030* - PK is always zero if U=0 in the page tables
5031* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
5032*
5033* The PKRU bitmask caches the result of these four conditions.  The error
5034* code (minus the P bit) and the page table's U bit form an index into the
5035* PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
5036* with the two bits of the PKRU register corresponding to the protection key.
5037* For the first three conditions above the bits will be 00, thus masking
5038* away both AD and WD.  For all reads or if the last condition holds, WD
5039* only will be masked away.
5040*/
5041static void update_pkru_bitmask(struct kvm_mmu *mmu)
5042{
5043	unsigned bit;
5044	bool wp;
5045
5046	mmu->pkru_mask = 0;
5047
5048	if (!is_cr4_pke(mmu))
5049		return;
5050
5051	wp = is_cr0_wp(mmu);
5052
5053	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
5054		unsigned pfec, pkey_bits;
5055		bool check_pkey, check_write, ff, uf, wf, pte_user;
5056
5057		pfec = bit << 1;
5058		ff = pfec & PFERR_FETCH_MASK;
5059		uf = pfec & PFERR_USER_MASK;
5060		wf = pfec & PFERR_WRITE_MASK;
5061
5062		/* PFEC.RSVD is replaced by ACC_USER_MASK. */
5063		pte_user = pfec & PFERR_RSVD_MASK;
5064
5065		/*
5066		 * Only need to check accesses that are not instruction
5067		 * fetches and are to a user page.
5068		 */
5069		check_pkey = (!ff && pte_user);
5070		/*
5071		 * write access is controlled by PKRU if it is a
5072		 * user access or CR0.WP = 1.
5073		 */
5074		check_write = check_pkey && wf && (uf || wp);
5075
5076		/* PKRU.AD stops both read and write access. */
5077		pkey_bits = !!check_pkey;
5078		/* PKRU.WD stops write access. */
5079		pkey_bits |= (!!check_write) << 1;
5080
5081		mmu->pkru_mask |= (pkey_bits & 3) << pfec;
5082	}
5083}
5084
5085static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
5086					struct kvm_mmu *mmu)
5087{
5088	if (!is_cr0_pg(mmu))
5089		return;
5090
5091	reset_guest_rsvds_bits_mask(vcpu, mmu);
5092	update_permission_bitmask(mmu, false);
5093	update_pkru_bitmask(mmu);
5094}
5095
5096static void paging64_init_context(struct kvm_mmu *context)
5097{
5098	context->page_fault = paging64_page_fault;
5099	context->gva_to_gpa = paging64_gva_to_gpa;
5100	context->sync_spte = paging64_sync_spte;
5101}
5102
5103static void paging32_init_context(struct kvm_mmu *context)
5104{
5105	context->page_fault = paging32_page_fault;
5106	context->gva_to_gpa = paging32_gva_to_gpa;
5107	context->sync_spte = paging32_sync_spte;
5108}
5109
5110static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
5111					    const struct kvm_mmu_role_regs *regs)
5112{
5113	union kvm_cpu_role role = {0};
5114
5115	role.base.access = ACC_ALL;
5116	role.base.smm = is_smm(vcpu);
5117	role.base.guest_mode = is_guest_mode(vcpu);
5118	role.ext.valid = 1;
5119
5120	if (!____is_cr0_pg(regs)) {
5121		role.base.direct = 1;
5122		return role;
5123	}
5124
5125	role.base.efer_nx = ____is_efer_nx(regs);
5126	role.base.cr0_wp = ____is_cr0_wp(regs);
5127	role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
5128	role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
5129	role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
5130
5131	if (____is_efer_lma(regs))
5132		role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
5133							: PT64_ROOT_4LEVEL;
5134	else if (____is_cr4_pae(regs))
5135		role.base.level = PT32E_ROOT_LEVEL;
5136	else
5137		role.base.level = PT32_ROOT_LEVEL;
5138
5139	role.ext.cr4_smep = ____is_cr4_smep(regs);
5140	role.ext.cr4_smap = ____is_cr4_smap(regs);
5141	role.ext.cr4_pse = ____is_cr4_pse(regs);
5142
5143	/* PKEY and LA57 are active iff long mode is active. */
5144	role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
5145	role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
5146	role.ext.efer_lma = ____is_efer_lma(regs);
5147	return role;
5148}
5149
5150void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
5151					struct kvm_mmu *mmu)
5152{
5153	const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
5154
5155	BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
5156	BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
5157
5158	if (is_cr0_wp(mmu) == cr0_wp)
5159		return;
5160
5161	mmu->cpu_role.base.cr0_wp = cr0_wp;
5162	reset_guest_paging_metadata(vcpu, mmu);
5163}
5164
5165static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
5166{
5167	/* tdp_root_level is the architecture-forced level; use it if nonzero. */
5168	if (tdp_root_level)
5169		return tdp_root_level;
5170
5171	/* Use 5-level TDP if and only if it's useful/necessary. */
5172	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
5173		return 4;
5174
5175	return max_tdp_level;
5176}
5177
5178static union kvm_mmu_page_role
5179kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
5180				union kvm_cpu_role cpu_role)
5181{
5182	union kvm_mmu_page_role role = {0};
5183
5184	role.access = ACC_ALL;
5185	role.cr0_wp = true;
5186	role.efer_nx = true;
5187	role.smm = cpu_role.base.smm;
5188	role.guest_mode = cpu_role.base.guest_mode;
5189	role.ad_disabled = !kvm_ad_enabled();
5190	role.level = kvm_mmu_get_tdp_level(vcpu);
5191	role.direct = true;
5192	role.has_4_byte_gpte = false;
5193
5194	return role;
5195}
5196
5197static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
5198			     union kvm_cpu_role cpu_role)
5199{
5200	struct kvm_mmu *context = &vcpu->arch.root_mmu;
5201	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
5202
5203	if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5204	    root_role.word == context->root_role.word)
5205		return;
5206
5207	context->cpu_role.as_u64 = cpu_role.as_u64;
5208	context->root_role.word = root_role.word;
5209	context->page_fault = kvm_tdp_page_fault;
5210	context->sync_spte = NULL;
5211	context->get_guest_pgd = get_guest_cr3;
5212	context->get_pdptr = kvm_pdptr_read;
5213	context->inject_page_fault = kvm_inject_page_fault;
5214
5215	if (!is_cr0_pg(context))
5216		context->gva_to_gpa = nonpaging_gva_to_gpa;
5217	else if (is_cr4_pae(context))
5218		context->gva_to_gpa = paging64_gva_to_gpa;
5219	else
5220		context->gva_to_gpa = paging32_gva_to_gpa;
5221
5222	reset_guest_paging_metadata(vcpu, context);
5223	reset_tdp_shadow_zero_bits_mask(context);
5224}
5225
5226static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
5227				    union kvm_cpu_role cpu_role,
5228				    union kvm_mmu_page_role root_role)
5229{
5230	if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5231	    root_role.word == context->root_role.word)
5232		return;
5233
5234	context->cpu_role.as_u64 = cpu_role.as_u64;
5235	context->root_role.word = root_role.word;
5236
5237	if (!is_cr0_pg(context))
5238		nonpaging_init_context(context);
5239	else if (is_cr4_pae(context))
5240		paging64_init_context(context);
5241	else
5242		paging32_init_context(context);
5243
5244	reset_guest_paging_metadata(vcpu, context);
5245	reset_shadow_zero_bits_mask(vcpu, context);
5246}
5247
5248static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
5249				union kvm_cpu_role cpu_role)
5250{
5251	struct kvm_mmu *context = &vcpu->arch.root_mmu;
5252	union kvm_mmu_page_role root_role;
5253
5254	root_role = cpu_role.base;
5255
5256	/* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
5257	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
5258
5259	/*
5260	 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
5261	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
5262	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
5263	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
5264	 * The iTLB multi-hit workaround can be toggled at any time, so assume
5265	 * NX can be used by any non-nested shadow MMU to avoid having to reset
5266	 * MMU contexts.
5267	 */
5268	root_role.efer_nx = true;
5269
5270	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5271}
5272
5273void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
5274			     unsigned long cr4, u64 efer, gpa_t nested_cr3)
5275{
5276	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5277	struct kvm_mmu_role_regs regs = {
5278		.cr0 = cr0,
5279		.cr4 = cr4 & ~X86_CR4_PKE,
5280		.efer = efer,
5281	};
5282	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5283	union kvm_mmu_page_role root_role;
5284
5285	/* NPT requires CR0.PG=1. */
5286	WARN_ON_ONCE(cpu_role.base.direct);
5287
5288	root_role = cpu_role.base;
5289	root_role.level = kvm_mmu_get_tdp_level(vcpu);
5290	if (root_role.level == PT64_ROOT_5LEVEL &&
5291	    cpu_role.base.level == PT64_ROOT_4LEVEL)
5292		root_role.passthrough = 1;
5293
5294	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5295	kvm_mmu_new_pgd(vcpu, nested_cr3);
5296}
5297EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
5298
5299static union kvm_cpu_role
5300kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5301				   bool execonly, u8 level)
5302{
5303	union kvm_cpu_role role = {0};
5304
5305	/*
5306	 * KVM does not support SMM transfer monitors, and consequently does not
5307	 * support the "entry to SMM" control either.  role.base.smm is always 0.
5308	 */
5309	WARN_ON_ONCE(is_smm(vcpu));
5310	role.base.level = level;
5311	role.base.has_4_byte_gpte = false;
5312	role.base.direct = false;
5313	role.base.ad_disabled = !accessed_dirty;
5314	role.base.guest_mode = true;
5315	role.base.access = ACC_ALL;
5316
5317	role.ext.word = 0;
5318	role.ext.execonly = execonly;
5319	role.ext.valid = 1;
5320
5321	return role;
5322}
5323
5324void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5325			     int huge_page_level, bool accessed_dirty,
5326			     gpa_t new_eptp)
5327{
5328	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5329	u8 level = vmx_eptp_page_walk_level(new_eptp);
5330	union kvm_cpu_role new_mode =
5331		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5332						   execonly, level);
5333
5334	if (new_mode.as_u64 != context->cpu_role.as_u64) {
5335		/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
5336		context->cpu_role.as_u64 = new_mode.as_u64;
5337		context->root_role.word = new_mode.base.word;
5338
5339		context->page_fault = ept_page_fault;
5340		context->gva_to_gpa = ept_gva_to_gpa;
5341		context->sync_spte = ept_sync_spte;
5342
5343		update_permission_bitmask(context, true);
5344		context->pkru_mask = 0;
5345		reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
5346		reset_ept_shadow_zero_bits_mask(context, execonly);
5347	}
5348
5349	kvm_mmu_new_pgd(vcpu, new_eptp);
5350}
5351EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5352
5353static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
5354			     union kvm_cpu_role cpu_role)
5355{
5356	struct kvm_mmu *context = &vcpu->arch.root_mmu;
5357
5358	kvm_init_shadow_mmu(vcpu, cpu_role);
5359
5360	context->get_guest_pgd     = get_guest_cr3;
5361	context->get_pdptr         = kvm_pdptr_read;
5362	context->inject_page_fault = kvm_inject_page_fault;
5363}
5364
5365static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
5366				union kvm_cpu_role new_mode)
5367{
5368	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5369
5370	if (new_mode.as_u64 == g_context->cpu_role.as_u64)
5371		return;
5372
5373	g_context->cpu_role.as_u64   = new_mode.as_u64;
5374	g_context->get_guest_pgd     = get_guest_cr3;
5375	g_context->get_pdptr         = kvm_pdptr_read;
5376	g_context->inject_page_fault = kvm_inject_page_fault;
5377
5378	/*
5379	 * L2 page tables are never shadowed, so there is no need to sync
5380	 * SPTEs.
5381	 */
5382	g_context->sync_spte         = NULL;
5383
5384	/*
5385	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5386	 * L1's nested page tables (e.g. EPT12). The nested translation
5387	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5388	 * L2's page tables as the first level of translation and L1's
5389	 * nested page tables as the second level of translation. Basically
5390	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5391	 */
5392	if (!is_paging(vcpu))
5393		g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5394	else if (is_long_mode(vcpu))
5395		g_context->gva_to_gpa = paging64_gva_to_gpa;
5396	else if (is_pae(vcpu))
5397		g_context->gva_to_gpa = paging64_gva_to_gpa;
5398	else
5399		g_context->gva_to_gpa = paging32_gva_to_gpa;
5400
5401	reset_guest_paging_metadata(vcpu, g_context);
5402}
5403
5404void kvm_init_mmu(struct kvm_vcpu *vcpu)
5405{
5406	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5407	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5408
5409	if (mmu_is_nested(vcpu))
5410		init_kvm_nested_mmu(vcpu, cpu_role);
5411	else if (tdp_enabled)
5412		init_kvm_tdp_mmu(vcpu, cpu_role);
5413	else
5414		init_kvm_softmmu(vcpu, cpu_role);
5415}
5416EXPORT_SYMBOL_GPL(kvm_init_mmu);
5417
5418void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5419{
5420	/*
5421	 * Invalidate all MMU roles to force them to reinitialize as CPUID
5422	 * information is factored into reserved bit calculations.
5423	 *
5424	 * Correctly handling multiple vCPU models (with respect to paging and
5425	 * physical address properties) in a single VM would require tracking
5426	 * all relevant CPUID information in kvm_mmu_page_role. That is very
5427	 * undesirable as it would increase the memory requirements for
5428	 * gfn_write_track (see struct kvm_mmu_page_role comments).  For now
5429	 * that problem is swept under the rug; KVM's CPUID API is horrific and
5430	 * it's all but impossible to solve it without introducing a new API.
5431	 */
5432	vcpu->arch.root_mmu.root_role.word = 0;
5433	vcpu->arch.guest_mmu.root_role.word = 0;
5434	vcpu->arch.nested_mmu.root_role.word = 0;
5435	vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
5436	vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
5437	vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
5438	kvm_mmu_reset_context(vcpu);
5439
5440	/*
5441	 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5442	 * kvm_arch_vcpu_ioctl().
5443	 */
5444	KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
5445}
5446
5447void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5448{
5449	kvm_mmu_unload(vcpu);
5450	kvm_init_mmu(vcpu);
5451}
5452EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5453
5454int kvm_mmu_load(struct kvm_vcpu *vcpu)
5455{
5456	int r;
5457
5458	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
5459	if (r)
5460		goto out;
5461	r = mmu_alloc_special_roots(vcpu);
5462	if (r)
5463		goto out;
5464	if (vcpu->arch.mmu->root_role.direct)
5465		r = mmu_alloc_direct_roots(vcpu);
5466	else
5467		r = mmu_alloc_shadow_roots(vcpu);
5468	if (r)
5469		goto out;
5470
5471	kvm_mmu_sync_roots(vcpu);
5472
5473	kvm_mmu_load_pgd(vcpu);
5474
5475	/*
5476	 * Flush any TLB entries for the new root, as the provenance of the root
5477	 * is unknown.  Even if KVM ensures there are no stale TLB entries
5478	 * for a freed root, in theory another hypervisor could have left
5479	 * stale entries.  Flushing on alloc also allows KVM to skip the TLB
5480	 * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5481	 */
5482	static_call(kvm_x86_flush_tlb_current)(vcpu);
5483out:
5484	return r;
5485}
5486
5487void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5488{
5489	struct kvm *kvm = vcpu->kvm;
5490
5491	kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5492	WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5493	kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5494	WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5495	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5496}
5497
5498static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5499{
5500	struct kvm_mmu_page *sp;
5501
5502	if (!VALID_PAGE(root_hpa))
5503		return false;
5504
5505	/*
5506	 * When freeing obsolete roots, treat roots as obsolete if they don't
5507	 * have an associated shadow page, as it's impossible to determine if
5508	 * such roots are fresh or stale.  This does mean KVM will get false
5509	 * positives and free roots that don't strictly need to be freed, but
5510	 * such false positives are relatively rare:
5511	 *
5512	 *  (a) only PAE paging and nested NPT have roots without shadow pages
5513	 *      (or any shadow paging flavor with a dummy root, see note below)
5514	 *  (b) remote reloads due to a memslot update obsoletes _all_ roots
5515	 *  (c) KVM doesn't track previous roots for PAE paging, and the guest
5516	 *      is unlikely to zap an in-use PGD.
5517	 *
5518	 * Note!  Dummy roots are unique in that they are obsoleted by memslot
5519	 * _creation_!  See also FNAME(fetch).
5520	 */
5521	sp = root_to_sp(root_hpa);
5522	return !sp || is_obsolete_sp(kvm, sp);
5523}
5524
5525static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5526{
5527	unsigned long roots_to_free = 0;
5528	int i;
5529
5530	if (is_obsolete_root(kvm, mmu->root.hpa))
5531		roots_to_free |= KVM_MMU_ROOT_CURRENT;
5532
5533	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5534		if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
5535			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5536	}
5537
5538	if (roots_to_free)
5539		kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5540}
5541
5542void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5543{
5544	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5545	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5546}
5547
5548static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5549				    int *bytes)
5550{
5551	u64 gentry = 0;
5552	int r;
5553
5554	/*
5555	 * Assume that the pte write is on a page table of the same type
5556	 * as the current vcpu paging mode, since the sptes are only updated
5557	 * when the modes match.
5558	 */
5559	if (is_pae(vcpu) && *bytes == 4) {
5560		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5561		*gpa &= ~(gpa_t)7;
5562		*bytes = 8;
5563	}
5564
5565	if (*bytes == 4 || *bytes == 8) {
5566		r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5567		if (r)
5568			gentry = 0;
5569	}
5570
5571	return gentry;
5572}
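
/*
 * Worked example of the widening above (values illustrative): a 32-bit PAE
 * guest that updates the low half of a 64-bit gpte with a 4-byte write to,
 * e.g., gpa 0x1004 has the gpa rounded down to 0x1000 and *bytes bumped to 8,
 * so kvm_vcpu_read_guest_atomic() fetches the whole, naturally aligned gpte
 * rather than only the half that was just written.
 */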
5573
5574/*
5575 * If we're seeing too many writes to a page, it may no longer be a page table,
5576 * or we may be forking, in which case it is better to unmap the page.
5577 */
5578static bool detect_write_flooding(struct kvm_mmu_page *sp)
5579{
5580	/*
5581	 * Skip write-flooding detection for an sp whose level is 1: it can
5582	 * become unsync, and then the guest page is no longer write-protected.
5583	 */
5584	if (sp->role.level == PG_LEVEL_4K)
5585		return false;
5586
5587	atomic_inc(&sp->write_flooding_count);
5588	return atomic_read(&sp->write_flooding_count) >= 3;
5589}
5590
5591/*
5592 * Misaligned accesses are too much trouble to fix up; also, they usually
5593 * indicate a page is not used as a page table.
5594 */
5595static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5596				    int bytes)
5597{
5598	unsigned offset, pte_size, misaligned;
5599
5600	offset = offset_in_page(gpa);
5601	pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5602
5603	/*
5604	 * Sometimes, the OS only writes the last byte to update status
5605	 * bits; for example, in Linux the andb instruction is used in clear_bit().
5606	 */
5607	if (!(offset & (pte_size - 1)) && bytes == 1)
5608		return false;
5609
5610	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5611	misaligned |= bytes < 4;
5612
5613	return misaligned;
5614}
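
/*
 * Worked examples of the checks above (offsets illustrative, pte_size == 8):
 * a 1-byte write at offset 8 hits the early return (a status-bit update such
 * as clear_bit()); a 4-byte write at offset 6 yields (6 ^ 9) & ~7 == 8, i.e.
 * the write straddles two gptes and is flagged as misaligned; and any other
 * write shorter than 4 bytes is flagged via the "bytes < 4" term.
 */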
5615
5616static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5617{
5618	unsigned page_offset, quadrant;
5619	u64 *spte;
5620	int level;
5621
5622	page_offset = offset_in_page(gpa);
5623	level = sp->role.level;
5624	*nspte = 1;
5625	if (sp->role.has_4_byte_gpte) {
5626		page_offset <<= 1;	/* 32->64 */
5627		/*
5628		 * A 32-bit pde maps 4MB while the shadow pdes map
5629		 * only 2MB.  So we need to double the offset again
5630		 * and zap two pdes instead of one.
5631		 */
5632		if (level == PT32_ROOT_LEVEL) {
5633			page_offset &= ~7; /* kill rounding error */
5634			page_offset <<= 1;
5635			*nspte = 2;
5636		}
5637		quadrant = page_offset >> PAGE_SHIFT;
5638		page_offset &= ~PAGE_MASK;
5639		if (quadrant != sp->role.quadrant)
5640			return NULL;
5641	}
5642
5643	spte = &sp->spt[page_offset / sizeof(*spte)];
5644	return spte;
5645}
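
/*
 * Worked example for the 4-byte-gpte path above (offsets illustrative): a
 * guest write to the gpte at page offset 0x804 is doubled to 0x1008, giving
 * quadrant 1 and an in-page offset of 0x8, i.e. spte index 1.  The write is
 * applied only to the shadow page whose role.quadrant == 1; checking the same
 * gpa against a sibling shadow page with quadrant 0 returns NULL.  At
 * PT32_ROOT_LEVEL the offset is doubled a second time and *nspte == 2,
 * because one 32-bit pde covers 4MB while each shadow pde covers only 2MB.
 */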
5646
5647void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
5648			 int bytes)
5649{
5650	gfn_t gfn = gpa >> PAGE_SHIFT;
5651	struct kvm_mmu_page *sp;
5652	LIST_HEAD(invalid_list);
5653	u64 entry, gentry, *spte;
5654	int npte;
5655	bool flush = false;
5656
5657	/*
5658	 * If we don't have indirect shadow pages, it means no page is
5659	 * write-protected, so we can exit simply.
5660	 * write-protected, so we can simply exit.
5661	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5662		return;
5663
5664	write_lock(&vcpu->kvm->mmu_lock);
5665
5666	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5667
5668	++vcpu->kvm->stat.mmu_pte_write;
5669
5670	for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
5671		if (detect_write_misaligned(sp, gpa, bytes) ||
5672		      detect_write_flooding(sp)) {
5673			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5674			++vcpu->kvm->stat.mmu_flooded;
5675			continue;
5676		}
5677
5678		spte = get_written_sptes(sp, gpa, &npte);
5679		if (!spte)
5680			continue;
5681
5682		while (npte--) {
5683			entry = *spte;
5684			mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5685			if (gentry && sp->role.level != PG_LEVEL_4K)
5686				++vcpu->kvm->stat.mmu_pde_zapped;
5687			if (is_shadow_present_pte(entry))
5688				flush = true;
5689			++spte;
5690		}
5691	}
5692	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5693	write_unlock(&vcpu->kvm->mmu_lock);
5694}
5695
5696int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5697		       void *insn, int insn_len)
5698{
5699	int r, emulation_type = EMULTYPE_PF;
5700	bool direct = vcpu->arch.mmu->root_role.direct;
5701
5702	/*
5703	 * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
5704	 * checks when emulating instructions that trigger implicit accesses.
5705	 * WARN if hardware generates a fault with an error code that collides
5706	 * with the KVM-defined value.  Clear the flag and continue on, i.e.
5707	 * don't terminate the VM, as KVM can't possibly be relying on a flag
5708	 * that KVM doesn't know about.
5709	 */
5710	if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
5711		error_code &= ~PFERR_IMPLICIT_ACCESS;
5712
5713	if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5714		return RET_PF_RETRY;
5715
5716	r = RET_PF_INVALID;
5717	if (unlikely(error_code & PFERR_RSVD_MASK)) {
5718		r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5719		if (r == RET_PF_EMULATE)
5720			goto emulate;
5721	}
5722
5723	if (r == RET_PF_INVALID) {
5724		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5725					  lower_32_bits(error_code), false,
5726					  &emulation_type);
5727		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5728			return -EIO;
5729	}
5730
5731	if (r < 0)
5732		return r;
5733	if (r != RET_PF_EMULATE)
5734		return 1;
5735
5736	/*
5737	 * Before emulating the instruction, check if the error code
5738	 * was due to a RO violation while translating the guest page.
5739	 * This can occur when using nested virtualization with nested
5740	 * paging in both guests. If true, we simply unprotect the page
5741	 * and resume the guest.
5742	 */
5743	if (vcpu->arch.mmu->root_role.direct &&
5744	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5745		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5746		return 1;
5747	}
5748
5749	/*
5750	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5751	 * optimistically try to just unprotect the page and let the processor
5752	 * re-execute the instruction that caused the page fault.  Do not allow
5753	 * retrying MMIO emulation, as it's not only pointless but could also
5754	 * cause us to enter an infinite loop because the processor will keep
5755	 * faulting on the non-existent MMIO address.  Retrying an instruction
5756	 * from a nested guest is also pointless and dangerous as we are only
5757	 * explicitly shadowing L1's page tables, i.e. unprotecting something
5758	 * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5759	 */
5760	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5761		emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5762emulate:
5763	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5764				       insn_len);
5765}
5766EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5767
5768static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5769				      u64 addr, hpa_t root_hpa)
5770{
5771	struct kvm_shadow_walk_iterator iterator;
5772
5773	vcpu_clear_mmio_info(vcpu, addr);
5774
5775	/*
5776	 * Walking and synchronizing SPTEs both assume they are operating in
5777	 * the context of the current MMU, and would need to be reworked if
5778	 * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
5779	 */
5780	if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
5781		return;
5782
5783	if (!VALID_PAGE(root_hpa))
5784		return;
5785
5786	write_lock(&vcpu->kvm->mmu_lock);
5787	for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
5788		struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
5789
5790		if (sp->unsync) {
5791			int ret = kvm_sync_spte(vcpu, sp, iterator.index);
5792
5793			if (ret < 0)
5794				mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
5795			if (ret)
5796				kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
5797		}
5798
5799		if (!sp->unsync_children)
5800			break;
5801	}
5802	write_unlock(&vcpu->kvm->mmu_lock);
5803}
5804
5805void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5806			     u64 addr, unsigned long roots)
5807{
5808	int i;
5809
5810	WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
5811
5812	/* It's actually a GPA for vcpu->arch.guest_mmu.  */
5813	if (mmu != &vcpu->arch.guest_mmu) {
5814		/* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5815		if (is_noncanonical_address(addr, vcpu))
5816			return;
5817
5818		static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
5819	}
5820
5821	if (!mmu->sync_spte)
5822		return;
5823
5824	if (roots & KVM_MMU_ROOT_CURRENT)
5825		__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
5826
5827	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5828		if (roots & KVM_MMU_ROOT_PREVIOUS(i))
5829			__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
5830	}
5831}
5832EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
5833
5834void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5835{
5836	/*
5837	 * INVLPG is required to invalidate any global mappings for the VA,
5838	 * irrespective of PCID.  Blindly sync all roots as it would take
5839	 * roughly the same amount of work/time to determine whether any of the
5840	 * previous roots have a global mapping.
5841	 *
5842	 * Mappings not reachable via the current or previous cached roots will
5843	 * be synced when switching to that new cr3, so nothing needs to be
5844	 * done here for them.
5845	 */
5846	kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
5847	++vcpu->stat.invlpg;
5848}
5849EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5850
5851
5852void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5853{
5854	struct kvm_mmu *mmu = vcpu->arch.mmu;
5855	unsigned long roots = 0;
5856	uint i;
5857
5858	if (pcid == kvm_get_active_pcid(vcpu))
5859		roots |= KVM_MMU_ROOT_CURRENT;
5860
5861	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5862		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5863		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
5864			roots |= KVM_MMU_ROOT_PREVIOUS(i);
5865	}
5866
5867	if (roots)
5868		kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
5869	++vcpu->stat.invlpg;
5870
5871	/*
5872	 * Mappings not reachable via the current cr3 or the prev_roots will be
5873	 * synced when switching to that cr3, so nothing needs to be done here
5874	 * for them.
5875	 */
5876}
5877
5878void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5879		       int tdp_max_root_level, int tdp_huge_page_level)
5880{
5881	tdp_enabled = enable_tdp;
5882	tdp_root_level = tdp_forced_root_level;
5883	max_tdp_level = tdp_max_root_level;
5884
5885#ifdef CONFIG_X86_64
5886	tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
5887#endif
5888	/*
5889	 * max_huge_page_level reflects KVM's MMU capabilities irrespective
5890	 * of kernel support, e.g. KVM may be capable of using 1GB pages when
5891	 * the kernel is not.  But, KVM never creates a page size greater than
5892	 * what is used by the kernel for any given HVA, i.e. the kernel's
5893	 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5894	 */
5895	if (tdp_enabled)
5896		max_huge_page_level = tdp_huge_page_level;
5897	else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5898		max_huge_page_level = PG_LEVEL_1G;
5899	else
5900		max_huge_page_level = PG_LEVEL_2M;
5901}
5902EXPORT_SYMBOL_GPL(kvm_configure_mmu);
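
/*
 * A rough usage sketch (parameter values are illustrative only, not taken
 * from any particular vendor module): a vendor module enabling TDP with a
 * forced 4-level root and 2MB huge pages would call something like
 *
 *	kvm_configure_mmu(true, PT64_ROOT_4LEVEL, PT64_ROOT_4LEVEL, PG_LEVEL_2M);
 *
 * whereas a module that lets KVM choose the root level would pass 0 for
 * tdp_forced_root_level (assuming the convention that 0 means "no forced
 * level") and only cap tdp_max_root_level.
 */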
5903
5904/* The return value indicates if tlb flush on all vcpus is needed. */
5905typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
5906				    struct kvm_rmap_head *rmap_head,
5907				    const struct kvm_memory_slot *slot);
5908
5909static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
5910					      const struct kvm_memory_slot *slot,
5911					      slot_rmaps_handler fn,
5912					      int start_level, int end_level,
5913					      gfn_t start_gfn, gfn_t end_gfn,
5914					      bool flush_on_yield, bool flush)
5915{
5916	struct slot_rmap_walk_iterator iterator;
5917
5918	lockdep_assert_held_write(&kvm->mmu_lock);
5919
5920	for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
5921			end_gfn, &iterator) {
5922		if (iterator.rmap)
5923			flush |= fn(kvm, iterator.rmap, slot);
5924
5925		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5926			if (flush && flush_on_yield) {
5927				kvm_flush_remote_tlbs_range(kvm, start_gfn,
5928							    iterator.gfn - start_gfn + 1);
5929				flush = false;
5930			}
5931			cond_resched_rwlock_write(&kvm->mmu_lock);
5932		}
5933	}
5934
5935	return flush;
5936}
5937
5938static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
5939					    const struct kvm_memory_slot *slot,
5940					    slot_rmaps_handler fn,
5941					    int start_level, int end_level,
5942					    bool flush_on_yield)
5943{
5944	return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
5945				 slot->base_gfn, slot->base_gfn + slot->npages - 1,
5946				 flush_on_yield, false);
5947}
5948
5949static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
5950					       const struct kvm_memory_slot *slot,
5951					       slot_rmaps_handler fn,
5952					       bool flush_on_yield)
5953{
5954	return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
5955}
5956
5957static void free_mmu_pages(struct kvm_mmu *mmu)
5958{
5959	if (!tdp_enabled && mmu->pae_root)
5960		set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5961	free_page((unsigned long)mmu->pae_root);
5962	free_page((unsigned long)mmu->pml4_root);
5963	free_page((unsigned long)mmu->pml5_root);
5964}
5965
5966static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5967{
5968	struct page *page;
5969	int i;
5970
5971	mmu->root.hpa = INVALID_PAGE;
5972	mmu->root.pgd = 0;
5973	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5974		mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5975
5976	/* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
5977	if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5978		return 0;
5979
5980	/*
5981	 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5982	 * while the PDP table is a per-vCPU construct that's allocated at MMU
5983	 * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5984	 * x86_64.  Therefore we need to allocate the PDP table in the first
5985	 * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
5986	 * generally doesn't use PAE paging and can skip allocating the PDP
5987	 * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
5988	 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
5989	 * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
5990	 */
5991	if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5992		return 0;
5993
5994	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5995	if (!page)
5996		return -ENOMEM;
5997
5998	mmu->pae_root = page_address(page);
5999
6000	/*
6001	 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
6002	 * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
6003	 * that KVM's writes and the CPU's reads get along.  Note, this is
6004	 * only necessary when using shadow paging, as 64-bit NPT can get at
6005	 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
6006	 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
6007	 */
6008	if (!tdp_enabled)
6009		set_memory_decrypted((unsigned long)mmu->pae_root, 1);
6010	else
6011		WARN_ON_ONCE(shadow_me_value);
6012
6013	for (i = 0; i < 4; ++i)
6014		mmu->pae_root[i] = INVALID_PAE_ROOT;
6015
6016	return 0;
6017}
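
/*
 * For orientation (indexing per the PAE paging format, not specific to this
 * function): each of the four pae_root entries allocated above serves as one
 * PDPTE, so bits 31:30 of a guest virtual address select pae_root[0..3] and
 * each entry covers 1GiB of the 32-bit address space.  The entries start out
 * as INVALID_PAE_ROOT and are filled in when roots are allocated.
 */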
6018
6019int kvm_mmu_create(struct kvm_vcpu *vcpu)
6020{
6021	int ret;
6022
6023	vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
6024	vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
6025
6026	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
6027	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
6028
6029	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
6030
6031	vcpu->arch.mmu = &vcpu->arch.root_mmu;
6032	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
6033
6034	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
6035	if (ret)
6036		return ret;
6037
6038	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
6039	if (ret)
6040		goto fail_allocate_root;
6041
6042	return ret;
6043 fail_allocate_root:
6044	free_mmu_pages(&vcpu->arch.guest_mmu);
6045	return ret;
6046}
6047
6048#define BATCH_ZAP_PAGES	10
6049static void kvm_zap_obsolete_pages(struct kvm *kvm)
6050{
6051	struct kvm_mmu_page *sp, *node;
6052	int nr_zapped, batch = 0;
6053	bool unstable;
6054
6055restart:
6056	list_for_each_entry_safe_reverse(sp, node,
6057	      &kvm->arch.active_mmu_pages, link) {
6058		/*
6059		 * No obsolete valid page exists before a newly created page
6060		 * since active_mmu_pages is a FIFO list.
6061		 */
6062		if (!is_obsolete_sp(kvm, sp))
6063			break;
6064
6065		/*
6066		 * Invalid pages should never land back on the list of active
6067		 * pages.  Skip the bogus page, otherwise we'll get stuck in an
6068		 * infinite loop if the page gets put back on the list (again).
6069		 */
6070		if (WARN_ON_ONCE(sp->role.invalid))
6071			continue;
6072
6073		/*
6074		 * No need to flush the TLB since we're only zapping shadow
6075		 * pages with an obsolete generation number and all vCPUS have
6076		 * loaded a new root, i.e. the shadow pages being zapped cannot
6077		 * be in active use by the guest.
6078		 */
6079		if (batch >= BATCH_ZAP_PAGES &&
6080		    cond_resched_rwlock_write(&kvm->mmu_lock)) {
6081			batch = 0;
6082			goto restart;
6083		}
6084
6085		unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
6086				&kvm->arch.zapped_obsolete_pages, &nr_zapped);
6087		batch += nr_zapped;
6088
6089		if (unstable)
6090			goto restart;
6091	}
6092
6093	/*
6094	 * Kick all vCPUs (via remote TLB flush) before freeing the page tables
6095	 * to ensure KVM is not in the middle of a lockless shadow page table
6096	 * walk, which may reference the pages.  The remote TLB flush itself is
6097	 * not required and is simply a convenient way to kick vCPUs as needed.
6098	 * KVM performs a local TLB flush when allocating a new root (see
6099	 * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
6100	 * running with an obsolete MMU.
6101	 */
6102	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
6103}
6104
6105/*
6106 * Fast invalidate all shadow pages and use lock-break technique
6107 * to zap obsolete pages.
6108 *
6109 * It's required when a memslot is being deleted or the VM is being
6110 * destroyed; in those cases, we must ensure that the KVM MMU does
6111 * not use any resource of the slot being deleted, or of any slot,
6112 * after this function returns.
6113 */
6114static void kvm_mmu_zap_all_fast(struct kvm *kvm)
6115{
6116	lockdep_assert_held(&kvm->slots_lock);
6117
6118	write_lock(&kvm->mmu_lock);
6119	trace_kvm_mmu_zap_all_fast(kvm);
6120
6121	/*
6122	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
6123	 * held for the entire duration of zapping obsolete pages, it's
6124	 * impossible for there to be multiple invalid generations associated
6125	 * with *valid* shadow pages at any given time, i.e. there is exactly
6126	 * one valid generation and (at most) one invalid generation.
6127	 */
6128	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
6129
6130	/*
6131	 * In order to ensure all vCPUs drop their soon-to-be invalid roots,
6132	 * invalidating TDP MMU roots must be done while holding mmu_lock for
6133	 * write and in the same critical section as making the reload request,
6134	 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
6135	 */
6136	if (tdp_mmu_enabled)
6137		kvm_tdp_mmu_invalidate_all_roots(kvm);
6138
6139	/*
6140	 * Notify all vcpus to reload their shadow page tables and flush their TLBs.
6141	 * Then all vcpus will switch to new shadow page table with the new
6142	 * mmu_valid_gen.
6143	 *
6144	 * Note: we need to do this under the protection of mmu_lock,
6145	 * otherwise a vcpu could purge a shadow page but miss the tlb flush.
6146	 */
6147	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
6148
6149	kvm_zap_obsolete_pages(kvm);
6150
6151	write_unlock(&kvm->mmu_lock);
6152
6153	/*
6154	 * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
6155	 * returning to the caller, e.g. if the zap is in response to a memslot
6156	 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
6157	 * associated with the deleted memslot once the update completes.
6158	 * Deferring the zap until the final reference to the root is put would
6159	 * lead to use-after-free.
6160	 */
6161	if (tdp_mmu_enabled)
6162		kvm_tdp_mmu_zap_invalidated_roots(kvm);
6163}
6164
6165static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
6166{
6167	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
6168}
6169
6170void kvm_mmu_init_vm(struct kvm *kvm)
6171{
6172	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6173	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
6174	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
6175	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
6176
6177	if (tdp_mmu_enabled)
6178		kvm_mmu_init_tdp_mmu(kvm);
6179
6180	kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
6181	kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
6182
6183	kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
6184
6185	kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
6186	kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
6187}
6188
6189static void mmu_free_vm_memory_caches(struct kvm *kvm)
6190{
6191	kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
6192	kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
6193	kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
6194}
6195
6196void kvm_mmu_uninit_vm(struct kvm *kvm)
6197{
6198	if (tdp_mmu_enabled)
6199		kvm_mmu_uninit_tdp_mmu(kvm);
6200
6201	mmu_free_vm_memory_caches(kvm);
6202}
6203
6204static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6205{
6206	const struct kvm_memory_slot *memslot;
6207	struct kvm_memslots *slots;
6208	struct kvm_memslot_iter iter;
6209	bool flush = false;
6210	gfn_t start, end;
6211	int i;
6212
6213	if (!kvm_memslots_have_rmaps(kvm))
6214		return flush;
6215
6216	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6217		slots = __kvm_memslots(kvm, i);
6218
6219		kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
6220			memslot = iter.slot;
6221			start = max(gfn_start, memslot->base_gfn);
6222			end = min(gfn_end, memslot->base_gfn + memslot->npages);
6223			if (WARN_ON_ONCE(start >= end))
6224				continue;
6225
6226			flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
6227						  PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
6228						  start, end - 1, true, flush);
6229		}
6230	}
6231
6232	return flush;
6233}
6234
6235/*
6236 * Invalidate (zap) SPTEs that cover GFNs from gfn_start up to, but not
6237 * including, gfn_end.
6238 */
6239void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6240{
6241	bool flush;
6242
6243	if (WARN_ON_ONCE(gfn_end <= gfn_start))
6244		return;
6245
6246	write_lock(&kvm->mmu_lock);
6247
6248	kvm_mmu_invalidate_begin(kvm, 0, -1ul);
6249
6250	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
6251
6252	if (tdp_mmu_enabled)
6253		flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
6254
6255	if (flush)
6256		kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
6257
6258	kvm_mmu_invalidate_end(kvm, 0, -1ul);
6259
6260	write_unlock(&kvm->mmu_lock);
6261}
6262
6263static bool slot_rmap_write_protect(struct kvm *kvm,
6264				    struct kvm_rmap_head *rmap_head,
6265				    const struct kvm_memory_slot *slot)
6266{
6267	return rmap_write_protect(rmap_head, false);
6268}
6269
6270void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
6271				      const struct kvm_memory_slot *memslot,
6272				      int start_level)
6273{
6274	if (kvm_memslots_have_rmaps(kvm)) {
6275		write_lock(&kvm->mmu_lock);
6276		walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
6277				start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
6278		write_unlock(&kvm->mmu_lock);
6279	}
6280
6281	if (tdp_mmu_enabled) {
6282		read_lock(&kvm->mmu_lock);
6283		kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
6284		read_unlock(&kvm->mmu_lock);
6285	}
6286}
6287
6288static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
6289{
6290	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
6291}
6292
6293static bool need_topup_split_caches_or_resched(struct kvm *kvm)
6294{
6295	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
6296		return true;
6297
6298	/*
6299	 * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
6300	 * to split a single huge page. Calculating how many are actually needed
6301	 * is possible but not worth the complexity.
6302	 */
6303	return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
6304	       need_topup(&kvm->arch.split_page_header_cache, 1) ||
6305	       need_topup(&kvm->arch.split_shadow_page_cache, 1);
6306}
6307
6308static int topup_split_caches(struct kvm *kvm)
6309{
6310	/*
6311	 * Allocating rmap list entries when splitting huge pages for nested
6312	 * MMUs is uncommon as KVM needs to use a list if and only if there is
6313	 * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
6314	 * aliased by multiple L2 gfns and/or from multiple nested roots with
6315	 * different roles.  Aliasing gfns when using TDP is atypical for VMMs;
6316	 * a few gfns are often aliased during boot, e.g. when remapping BIOS,
6317	 * but aliasing rarely occurs post-boot or for many gfns.  If there is
6318	 * only one rmap entry, rmap->val points directly at that one entry and
6319	 * doesn't need to allocate a list.  Buffer the cache by the default
6320	 * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
6321	 * encounters an aliased gfn or two.
6322	 */
6323	const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
6324			     KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
6325	int r;
6326
6327	lockdep_assert_held(&kvm->slots_lock);
6328
6329	r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
6330					 SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
6331	if (r)
6332		return r;
6333
6334	r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
6335	if (r)
6336		return r;
6337
6338	return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
6339}
6340
6341static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
6342{
6343	struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6344	struct shadow_page_caches caches = {};
6345	union kvm_mmu_page_role role;
6346	unsigned int access;
6347	gfn_t gfn;
6348
6349	gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6350	access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
6351
6352	/*
6353	 * Note, huge page splitting always uses direct shadow pages, regardless
6354	 * of whether the huge page itself is mapped by a direct or indirect
6355	 * shadow page, since the huge page region itself is being directly
6356	 * mapped with smaller pages.
6357	 */
6358	role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
6359
6360	/* Direct SPs do not require a shadowed_info_cache. */
6361	caches.page_header_cache = &kvm->arch.split_page_header_cache;
6362	caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
6363
6364	/* Safe to pass NULL for vCPU since requesting a direct SP. */
6365	return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
6366}
6367
6368static void shadow_mmu_split_huge_page(struct kvm *kvm,
6369				       const struct kvm_memory_slot *slot,
6370				       u64 *huge_sptep)
6372{
6373	struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
6374	u64 huge_spte = READ_ONCE(*huge_sptep);
6375	struct kvm_mmu_page *sp;
6376	bool flush = false;
6377	u64 *sptep, spte;
6378	gfn_t gfn;
6379	int index;
6380
6381	sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
6382
6383	for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
6384		sptep = &sp->spt[index];
6385		gfn = kvm_mmu_page_get_gfn(sp, index);
6386
6387		/*
6388		 * The SP may already have populated SPTEs, e.g. if this huge
6389		 * page is aliased by multiple sptes with the same access
6390		 * permissions. These entries are guaranteed to map the same
6391		 * gfn-to-pfn translation since the SP is direct, so no need to
6392		 * modify them.
6393		 *
6394		 * However, if a given SPTE points to a lower level page table,
6395		 * that lower level page table may only be partially populated.
6396		 * Installing such SPTEs would effectively unmap a portion of the
6397		 * huge page. Unmapping guest memory always requires a TLB flush
6398		 * since a subsequent operation on the unmapped regions would
6399		 * fail to detect the need to flush.
6400		 */
6401		if (is_shadow_present_pte(*sptep)) {
6402			flush |= !is_last_spte(*sptep, sp->role.level);
6403			continue;
6404		}
6405
6406		spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
6407		mmu_spte_set(sptep, spte);
6408		__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
6409	}
6410
6411	__link_shadow_page(kvm, cache, huge_sptep, sp, flush);
6412}
6413
6414static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
6415					  const struct kvm_memory_slot *slot,
6416					  u64 *huge_sptep)
6417{
6418	struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6419	int level, r = 0;
6420	gfn_t gfn;
6421	u64 spte;
6422
6423	/* Grab information for the tracepoint before dropping the MMU lock. */
6424	gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6425	level = huge_sp->role.level;
6426	spte = *huge_sptep;
6427
6428	if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
6429		r = -ENOSPC;
6430		goto out;
6431	}
6432
6433	if (need_topup_split_caches_or_resched(kvm)) {
6434		write_unlock(&kvm->mmu_lock);
6435		cond_resched();
6436		/*
6437		 * If the topup succeeds, return -EAGAIN to indicate that the
6438		 * rmap iterator should be restarted because the MMU lock was
6439		 * dropped.
6440		 */
6441		r = topup_split_caches(kvm) ?: -EAGAIN;
6442		write_lock(&kvm->mmu_lock);
6443		goto out;
6444	}
6445
6446	shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
6447
6448out:
6449	trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
6450	return r;
6451}
6452
6453static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6454					    struct kvm_rmap_head *rmap_head,
6455					    const struct kvm_memory_slot *slot)
6456{
6457	struct rmap_iterator iter;
6458	struct kvm_mmu_page *sp;
6459	u64 *huge_sptep;
6460	int r;
6461
6462restart:
6463	for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
6464		sp = sptep_to_sp(huge_sptep);
6465
6466		/* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
6467		if (WARN_ON_ONCE(!sp->role.guest_mode))
6468			continue;
6469
6470		/* The rmaps should never contain non-leaf SPTEs. */
6471		if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
6472			continue;
6473
6474		/* SPs with level >PG_LEVEL_4K should never be unsync. */
6475		if (WARN_ON_ONCE(sp->unsync))
6476			continue;
6477
6478		/* Don't bother splitting huge pages on invalid SPs. */
6479		if (sp->role.invalid)
6480			continue;
6481
6482		r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
6483
6484		/*
6485		 * The split succeeded or needs to be retried because the MMU
6486		 * lock was dropped. Either way, restart the iterator to get it
6487		 * back into a consistent state.
6488		 */
6489		if (!r || r == -EAGAIN)
6490			goto restart;
6491
6492		/* The split failed and shouldn't be retried (e.g. -ENOMEM). */
6493		break;
6494	}
6495
6496	return false;
6497}
6498
6499static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6500						const struct kvm_memory_slot *slot,
6501						gfn_t start, gfn_t end,
6502						int target_level)
6503{
6504	int level;
6505
6506	/*
6507	 * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
6508	 * down to the target level. This ensures pages are recursively split
6509	 * all the way to the target level. There's no need to split pages
6510	 * already at the target level.
6511	 */
6512	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
6513		__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
6514				  level, level, start, end - 1, true, false);
6515}
6516
6517/* Must be called with the mmu_lock held in write-mode. */
6518void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
6519				   const struct kvm_memory_slot *memslot,
6520				   u64 start, u64 end,
6521				   int target_level)
6522{
6523	if (!tdp_mmu_enabled)
6524		return;
6525
6526	if (kvm_memslots_have_rmaps(kvm))
6527		kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6528
6529	kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
6530
6531	/*
6532	 * A TLB flush is unnecessary at this point for the same reasons as in
6533	 * kvm_mmu_slot_try_split_huge_pages().
6534	 */
6535}
6536
6537void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
6538					const struct kvm_memory_slot *memslot,
6539					int target_level)
6540{
6541	u64 start = memslot->base_gfn;
6542	u64 end = start + memslot->npages;
6543
6544	if (!tdp_mmu_enabled)
6545		return;
6546
6547	if (kvm_memslots_have_rmaps(kvm)) {
6548		write_lock(&kvm->mmu_lock);
6549		kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6550		write_unlock(&kvm->mmu_lock);
6551	}
6552
6553	read_lock(&kvm->mmu_lock);
6554	kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
6555	read_unlock(&kvm->mmu_lock);
6556
6557	/*
6558	 * No TLB flush is necessary here. KVM will flush TLBs after
6559	 * write-protecting and/or clearing dirty on the newly split SPTEs to
6560	 * ensure that guest writes are reflected in the dirty log before the
6561	 * ioctl to enable dirty logging on this memslot completes. Since the
6562	 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
6563	 * safe for KVM to decide if a TLB flush is necessary based on the split
6564	 * SPTEs.
6565	 */
6566}
6567
6568static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6569					 struct kvm_rmap_head *rmap_head,
6570					 const struct kvm_memory_slot *slot)
6571{
6572	u64 *sptep;
6573	struct rmap_iterator iter;
6574	int need_tlb_flush = 0;
6575	struct kvm_mmu_page *sp;
6576
6577restart:
6578	for_each_rmap_spte(rmap_head, &iter, sptep) {
6579		sp = sptep_to_sp(sptep);
6580
6581		/*
6582		 * We cannot do huge page mapping for indirect shadow pages,
6583		 * which are found on the last rmap (level = 1) when not using
6584		 * tdp; such shadow pages are synced with the page table in
6585		 * the guest, and the guest page table uses a 4K page size
6586		 * mapping if the indirect sp has level = 1.
6587		 */
6588		if (sp->role.direct &&
6589		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
6590							       PG_LEVEL_NUM)) {
6591			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
6592
6593			if (kvm_available_flush_remote_tlbs_range())
6594				kvm_flush_remote_tlbs_sptep(kvm, sptep);
6595			else
6596				need_tlb_flush = 1;
6597
6598			goto restart;
6599		}
6600	}
6601
6602	return need_tlb_flush;
6603}
6604
6605static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
6606					   const struct kvm_memory_slot *slot)
6607{
6608	/*
6609	 * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
6610	 * pages that are already mapped at the maximum hugepage level.
6611	 */
6612	if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
6613			    PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
6614		kvm_flush_remote_tlbs_memslot(kvm, slot);
6615}
6616
6617void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6618				   const struct kvm_memory_slot *slot)
6619{
6620	if (kvm_memslots_have_rmaps(kvm)) {
6621		write_lock(&kvm->mmu_lock);
6622		kvm_rmap_zap_collapsible_sptes(kvm, slot);
6623		write_unlock(&kvm->mmu_lock);
6624	}
6625
6626	if (tdp_mmu_enabled) {
6627		read_lock(&kvm->mmu_lock);
6628		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
6629		read_unlock(&kvm->mmu_lock);
6630	}
6631}
6632
6633void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6634				   const struct kvm_memory_slot *memslot)
6635{
6636	if (kvm_memslots_have_rmaps(kvm)) {
6637		write_lock(&kvm->mmu_lock);
6638		/*
6639		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
6640		 * supports dirty logging at a 4k granularity.
6641		 */
6642		walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
6643		write_unlock(&kvm->mmu_lock);
6644	}
6645
6646	if (tdp_mmu_enabled) {
6647		read_lock(&kvm->mmu_lock);
6648		kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6649		read_unlock(&kvm->mmu_lock);
6650	}
6651
6652	/*
6653	 * The caller will flush the TLBs after this function returns.
6654	 *
6655	 * It's also safe to flush TLBs out of mmu lock here as currently this
6656	 * function is only used for dirty logging, in which case flushing TLB
6657	 * out of mmu lock also guarantees no dirty pages will be lost in
6658	 * dirty_bitmap.
6659	 */
6660}
6661
6662static void kvm_mmu_zap_all(struct kvm *kvm)
6663{
6664	struct kvm_mmu_page *sp, *node;
6665	LIST_HEAD(invalid_list);
6666	int ign;
6667
6668	write_lock(&kvm->mmu_lock);
6669restart:
6670	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6671		if (WARN_ON_ONCE(sp->role.invalid))
6672			continue;
6673		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6674			goto restart;
6675		if (cond_resched_rwlock_write(&kvm->mmu_lock))
6676			goto restart;
6677	}
6678
6679	kvm_mmu_commit_zap_page(kvm, &invalid_list);
6680
6681	if (tdp_mmu_enabled)
6682		kvm_tdp_mmu_zap_all(kvm);
6683
6684	write_unlock(&kvm->mmu_lock);
6685}
6686
6687void kvm_arch_flush_shadow_all(struct kvm *kvm)
6688{
6689	kvm_mmu_zap_all(kvm);
6690}
6691
6692void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
6693				   struct kvm_memory_slot *slot)
6694{
6695	kvm_mmu_zap_all_fast(kvm);
6696}
6697
6698void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6699{
6700	WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6701
6702	gen &= MMIO_SPTE_GEN_MASK;
6703
6704	/*
6705	 * Generation numbers are incremented in multiples of the number of
6706	 * address spaces in order to provide unique generations across all
6707	 * address spaces.  Strip what is effectively the address space
6708	 * modifier prior to checking for a wrap of the MMIO generation so
6709	 * that a wrap in any address space is detected.
6710	 */
6711	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6712
6713	/*
6714	 * The very rare case: if the MMIO generation number has wrapped,
6715	 * zap all shadow pages.
6716	 */
6717	if (unlikely(gen == 0)) {
6718		kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
6719		kvm_mmu_zap_all_fast(kvm);
6720	}
6721}
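
/*
 * Worked example of the wrap check above (assuming two address spaces): with
 * KVM_ADDRESS_SPACE_NUM == 2, the low bit of the generation is the address
 * space modifier and is stripped, so generations 2 and 3 both reduce to 2.  A
 * wrap is declared only when the stripped value reaches 0.
 */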
6722
6723static unsigned long mmu_shrink_scan(struct shrinker *shrink,
6724				     struct shrink_control *sc)
6725{
6726	struct kvm *kvm;
6727	int nr_to_scan = sc->nr_to_scan;
6728	unsigned long freed = 0;
6729
6730	mutex_lock(&kvm_lock);
6731
6732	list_for_each_entry(kvm, &vm_list, vm_list) {
6733		int idx;
6734		LIST_HEAD(invalid_list);
6735
6736		/*
6737		 * Never scan more than sc->nr_to_scan VM instances.
6738		 * In practice this condition is never hit, since we do not try
6739		 * to shrink more than one VM and it is very unlikely to see
6740		 * !n_used_mmu_pages so many times.
6741		 */
6742		if (!nr_to_scan--)
6743			break;
6744		/*
6745		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6746		 * here. We may skip a VM instance erroneously, but we do not
6747		 * want to shrink a VM that only started to populate its MMU
6748		 * anyway.
6749		 */
6750		if (!kvm->arch.n_used_mmu_pages &&
6751		    !kvm_has_zapped_obsolete_pages(kvm))
6752			continue;
6753
6754		idx = srcu_read_lock(&kvm->srcu);
6755		write_lock(&kvm->mmu_lock);
6756
6757		if (kvm_has_zapped_obsolete_pages(kvm)) {
6758			kvm_mmu_commit_zap_page(kvm,
6759			      &kvm->arch.zapped_obsolete_pages);
6760			goto unlock;
6761		}
6762
6763		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6764
6765unlock:
6766		write_unlock(&kvm->mmu_lock);
6767		srcu_read_unlock(&kvm->srcu, idx);
6768
6769		/*
6770		 * unfair on small ones
6771		 * per-vm shrinkers cry out
6772		 * sadness comes quickly
6773		 */
6774		list_move_tail(&kvm->vm_list, &vm_list);
6775		break;
6776	}
6777
6778	mutex_unlock(&kvm_lock);
6779	return freed;
6780}
6781
6782static unsigned long mmu_shrink_count(struct shrinker *shrink,
6783				      struct shrink_control *sc)
6784{
6785	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6786}
6787
6788static struct shrinker mmu_shrinker = {
6789	.count_objects = mmu_shrink_count,
6790	.scan_objects = mmu_shrink_scan,
6791	.seeks = DEFAULT_SEEKS * 10,
6792};
6793
6794static void mmu_destroy_caches(void)
6795{
6796	kmem_cache_destroy(pte_list_desc_cache);
6797	kmem_cache_destroy(mmu_page_header_cache);
6798}
6799
6800static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
6801{
6802	if (nx_hugepage_mitigation_hard_disabled)
6803		return sysfs_emit(buffer, "never\n");
6804
6805	return param_get_bool(buffer, kp);
6806}
6807
6808static bool get_nx_auto_mode(void)
6809{
6810	/* Return true when CPU has the bug, and mitigations are ON */
6811	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6812}
6813
6814static void __set_nx_huge_pages(bool val)
6815{
6816	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6817}
6818
6819static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6820{
6821	bool old_val = nx_huge_pages;
6822	bool new_val;
6823
6824	if (nx_hugepage_mitigation_hard_disabled)
6825		return -EPERM;
6826
6827	/* In "auto" mode deploy workaround only if CPU has the bug. */
6828	if (sysfs_streq(val, "off")) {
6829		new_val = 0;
6830	} else if (sysfs_streq(val, "force")) {
6831		new_val = 1;
6832	} else if (sysfs_streq(val, "auto")) {
6833		new_val = get_nx_auto_mode();
6834	} else if (sysfs_streq(val, "never")) {
6835		new_val = 0;
6836
6837		mutex_lock(&kvm_lock);
6838		if (!list_empty(&vm_list)) {
6839			mutex_unlock(&kvm_lock);
6840			return -EBUSY;
6841		}
6842		nx_hugepage_mitigation_hard_disabled = true;
6843		mutex_unlock(&kvm_lock);
6844	} else if (kstrtobool(val, &new_val) < 0) {
6845		return -EINVAL;
6846	}
6847
6848	__set_nx_huge_pages(new_val);
6849
6850	if (new_val != old_val) {
6851		struct kvm *kvm;
6852
6853		mutex_lock(&kvm_lock);
6854
6855		list_for_each_entry(kvm, &vm_list, vm_list) {
6856			mutex_lock(&kvm->slots_lock);
6857			kvm_mmu_zap_all_fast(kvm);
6858			mutex_unlock(&kvm->slots_lock);
6859
6860			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
6861		}
6862		mutex_unlock(&kvm_lock);
6863	}
6864
6865	return 0;
6866}
6867
6868/*
6869 * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
6870 * its default value of -1 is technically undefined behavior for a boolean.
6871 * Forward the module init call to SPTE code so that it too can handle module
6872 * params that need to be resolved/snapshot.
6873 */
6874void __init kvm_mmu_x86_module_init(void)
6875{
6876	if (nx_huge_pages == -1)
6877		__set_nx_huge_pages(get_nx_auto_mode());
6878
6879	/*
6880	 * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
6881	 * TDP MMU is actually enabled is determined in kvm_configure_mmu()
6882	 * when the vendor module is loaded.
6883	 */
6884	tdp_mmu_allowed = tdp_mmu_enabled;
6885
6886	kvm_mmu_spte_module_init();
6887}
6888
6889/*
6890 * The bulk of the MMU initialization is deferred until the vendor module is
6891 * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
6892 * to be reset when a potentially different vendor module is loaded.
6893 */
6894int kvm_mmu_vendor_module_init(void)
6895{
6896	int ret = -ENOMEM;
6897
6898	/*
6899	 * MMU roles use union aliasing which is, generally speaking,
6900	 * undefined behavior. However, we supposedly know how compilers behave
6901	 * and the current status quo is unlikely to change. Guardians below are
6902	 * supposed to let us know if the assumption becomes false.
6903	 */
6904	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6905	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6906	BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
6907
6908	kvm_mmu_reset_all_pte_masks();
6909
6910	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6911					    sizeof(struct pte_list_desc),
6912					    0, SLAB_ACCOUNT, NULL);
6913	if (!pte_list_desc_cache)
6914		goto out;
6915
6916	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6917						  sizeof(struct kvm_mmu_page),
6918						  0, SLAB_ACCOUNT, NULL);
6919	if (!mmu_page_header_cache)
6920		goto out;
6921
6922	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6923		goto out;
6924
6925	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
6926	if (ret)
6927		goto out_shrinker;
6928
6929	return 0;
6930
6931out_shrinker:
6932	percpu_counter_destroy(&kvm_total_used_mmu_pages);
6933out:
6934	mmu_destroy_caches();
6935	return ret;
6936}
6937
6938void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6939{
6940	kvm_mmu_unload(vcpu);
6941	free_mmu_pages(&vcpu->arch.root_mmu);
6942	free_mmu_pages(&vcpu->arch.guest_mmu);
6943	mmu_free_memory_caches(vcpu);
6944}
6945
6946void kvm_mmu_vendor_module_exit(void)
6947{
6948	mmu_destroy_caches();
6949	percpu_counter_destroy(&kvm_total_used_mmu_pages);
6950	unregister_shrinker(&mmu_shrinker);
6951}
6952
6953/*
6954 * Calculate the effective recovery period, accounting for '0' meaning "let KVM
6955 * select a halving time of 1 hour".  Returns true if recovery is enabled.
6956 */
6957static bool calc_nx_huge_pages_recovery_period(uint *period)
6958{
6959	/*
6960	 * Use READ_ONCE to get the params, as this may be called outside of the
6961	 * param setters, e.g. by the kthread to compute its next timeout.
6962	 */
6963	bool enabled = READ_ONCE(nx_huge_pages);
6964	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6965
6966	if (!enabled || !ratio)
6967		return false;
6968
6969	*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6970	if (!*period) {
6971		/* Make sure the period is not less than one second.  */
6972		ratio = min(ratio, 3600u);
6973		*period = 60 * 60 * 1000 / ratio;
6974	}
6975	return true;
6976}
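
/*
 * Worked example of the defaulting above (values illustrative): with a
 * recovery ratio of 60 and an unset (zero) period, the period becomes
 * 60 * 60 * 1000 / 60 = 60000 ms, i.e. one recovery pass per minute.  A ratio
 * larger than 3600 is clamped so the computed period never drops below
 * 3600000 / 3600 = 1000 ms.
 */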
6977
6978static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
6979{
6980	bool was_recovery_enabled, is_recovery_enabled;
6981	uint old_period, new_period;
6982	int err;
6983
6984	if (nx_hugepage_mitigation_hard_disabled)
6985		return -EPERM;
6986
6987	was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
6988
6989	err = param_set_uint(val, kp);
6990	if (err)
6991		return err;
6992
6993	is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
6994
6995	if (is_recovery_enabled &&
6996	    (!was_recovery_enabled || old_period > new_period)) {
6997		struct kvm *kvm;
6998
6999		mutex_lock(&kvm_lock);
7000
7001		list_for_each_entry(kvm, &vm_list, vm_list)
7002			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
7003
7004		mutex_unlock(&kvm_lock);
7005	}
7006
7007	return err;
7008}
7009
7010static void kvm_recover_nx_huge_pages(struct kvm *kvm)
7011{
7012	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
7013	struct kvm_memory_slot *slot;
7014	int rcu_idx;
7015	struct kvm_mmu_page *sp;
7016	unsigned int ratio;
7017	LIST_HEAD(invalid_list);
7018	bool flush = false;
7019	ulong to_zap;
7020
7021	rcu_idx = srcu_read_lock(&kvm->srcu);
7022	write_lock(&kvm->mmu_lock);
7023
7024	/*
7025	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
7026	 * be done under RCU protection, because the pages are freed via RCU
7027	 * callback.
7028	 */
7029	rcu_read_lock();
7030
7031	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
7032	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
7033	for ( ; to_zap; --to_zap) {
7034		if (list_empty(&kvm->arch.possible_nx_huge_pages))
7035			break;
7036
7037		/*
7038		 * We use a separate list instead of just using active_mmu_pages
7039		 * because the number of shadow pages that can be replaced with an
7040		 * NX huge page is expected to be relatively small compared to
7041		 * the total number of shadow pages.  And because the TDP MMU
7042		 * doesn't use active_mmu_pages.
7043		 */
7044		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
7045				      struct kvm_mmu_page,
7046				      possible_nx_huge_page_link);
7047		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
7048		WARN_ON_ONCE(!sp->role.direct);
7049
7050		/*
7051		 * Unaccount and do not attempt to recover any NX Huge Pages
7052		 * that are being dirty tracked, as they would just be faulted
7053		 * back in as 4KiB pages. The NX Huge Pages in this slot will be
7054		 * recovered, along with all the other huge pages in the slot,
7055		 * when dirty logging is disabled.
7056		 *
7057		 * Since gfn_to_memslot() is relatively expensive, it helps to
7058		 * skip it if the test cannot possibly return true.  On the
7059		 * other hand, if any memslot has logging enabled, chances are
7060		 * good that all of them do, in which case unaccount_nx_huge_page()
7061		 * is much cheaper than zapping the page.
7062		 *
7063		 * If a memslot update is in progress, reading an incorrect value
7064		 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
7065		 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
7066		 * it is becoming nonzero, the page will be zapped unnecessarily.
7067		 * Either way, this only affects efficiency in racy situations,
7068		 * and not correctness.
7069		 */
7070		slot = NULL;
7071		if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
7072			struct kvm_memslots *slots;
7073
7074			slots = kvm_memslots_for_spte_role(kvm, sp->role);
7075			slot = __gfn_to_memslot(slots, sp->gfn);
7076			WARN_ON_ONCE(!slot);
7077		}
7078
7079		if (slot && kvm_slot_dirty_track_enabled(slot))
7080			unaccount_nx_huge_page(kvm, sp);
7081		else if (is_tdp_mmu_page(sp))
7082			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
7083		else
7084			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
7085		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
7086
7087		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
7088			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
7089			rcu_read_unlock();
7090
7091			cond_resched_rwlock_write(&kvm->mmu_lock);
7092			flush = false;
7093
7094			rcu_read_lock();
7095		}
7096	}
7097	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
7098
7099	rcu_read_unlock();
7100
7101	write_unlock(&kvm->mmu_lock);
7102	srcu_read_unlock(&kvm->srcu, rcu_idx);
7103}
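
/*
 * Worked example of the to_zap computation above (numbers illustrative): with
 * kvm->stat.nx_lpage_splits == 1000 and a recovery ratio of 60, to_zap ==
 * DIV_ROUND_UP(1000, 60) == 17, so at most 17 shadow pages are reclaimed in
 * one pass; a ratio of 0 yields to_zap == 0 and disables recovery.
 */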
7104
7105static long get_nx_huge_page_recovery_timeout(u64 start_time)
7106{
7107	bool enabled;
7108	uint period;
7109
7110	enabled = calc_nx_huge_pages_recovery_period(&period);
7111
7112	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
7113		       : MAX_SCHEDULE_TIMEOUT;
7114}
7115
7116static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
7117{
7118	u64 start_time;
7119	long remaining_time;
7120
7121	while (true) {
7122		start_time = get_jiffies_64();
7123		remaining_time = get_nx_huge_page_recovery_timeout(start_time);
7124
7125		set_current_state(TASK_INTERRUPTIBLE);
7126		while (!kthread_should_stop() && remaining_time > 0) {
7127			schedule_timeout(remaining_time);
7128			remaining_time = get_nx_huge_page_recovery_timeout(start_time);
7129			set_current_state(TASK_INTERRUPTIBLE);
7130		}
7131
7132		set_current_state(TASK_RUNNING);
7133
7134		if (kthread_should_stop())
7135			return 0;
7136
7137		kvm_recover_nx_huge_pages(kvm);
7138	}
7139}
7140
7141int kvm_mmu_post_init_vm(struct kvm *kvm)
7142{
7143	int err;
7144
7145	if (nx_hugepage_mitigation_hard_disabled)
7146		return 0;
7147
7148	err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
7149					  "kvm-nx-lpage-recovery",
7150					  &kvm->arch.nx_huge_page_recovery_thread);
7151	if (!err)
7152		kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
7153
7154	return err;
7155}
7156
7157void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
7158{
7159	if (kvm->arch.nx_huge_page_recovery_thread)
7160		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
7161}
7162