xref: /kernel/linux/linux-6.6/arch/x86/kvm/mmu/mmu.c (revision 62306a36)
162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Kernel-based Virtual Machine driver for Linux
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * This module enables machines with Intel VT-x extensions to run virtual
662306a36Sopenharmony_ci * machines without emulation or binary translation.
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci * MMU support
962306a36Sopenharmony_ci *
1062306a36Sopenharmony_ci * Copyright (C) 2006 Qumranet, Inc.
1162306a36Sopenharmony_ci * Copyright 2010 Red Hat, Inc. and/or its affiliates.
1262306a36Sopenharmony_ci *
1362306a36Sopenharmony_ci * Authors:
1462306a36Sopenharmony_ci *   Yaniv Kamay  <yaniv@qumranet.com>
1562306a36Sopenharmony_ci *   Avi Kivity   <avi@qumranet.com>
1662306a36Sopenharmony_ci */
1762306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
1862306a36Sopenharmony_ci
1962306a36Sopenharmony_ci#include "irq.h"
2062306a36Sopenharmony_ci#include "ioapic.h"
2162306a36Sopenharmony_ci#include "mmu.h"
2262306a36Sopenharmony_ci#include "mmu_internal.h"
2362306a36Sopenharmony_ci#include "tdp_mmu.h"
2462306a36Sopenharmony_ci#include "x86.h"
2562306a36Sopenharmony_ci#include "kvm_cache_regs.h"
2662306a36Sopenharmony_ci#include "smm.h"
2762306a36Sopenharmony_ci#include "kvm_emulate.h"
2862306a36Sopenharmony_ci#include "page_track.h"
2962306a36Sopenharmony_ci#include "cpuid.h"
3062306a36Sopenharmony_ci#include "spte.h"
3162306a36Sopenharmony_ci
3262306a36Sopenharmony_ci#include <linux/kvm_host.h>
3362306a36Sopenharmony_ci#include <linux/types.h>
3462306a36Sopenharmony_ci#include <linux/string.h>
3562306a36Sopenharmony_ci#include <linux/mm.h>
3662306a36Sopenharmony_ci#include <linux/highmem.h>
3762306a36Sopenharmony_ci#include <linux/moduleparam.h>
3862306a36Sopenharmony_ci#include <linux/export.h>
3962306a36Sopenharmony_ci#include <linux/swap.h>
4062306a36Sopenharmony_ci#include <linux/hugetlb.h>
4162306a36Sopenharmony_ci#include <linux/compiler.h>
4262306a36Sopenharmony_ci#include <linux/srcu.h>
4362306a36Sopenharmony_ci#include <linux/slab.h>
4462306a36Sopenharmony_ci#include <linux/sched/signal.h>
4562306a36Sopenharmony_ci#include <linux/uaccess.h>
4662306a36Sopenharmony_ci#include <linux/hash.h>
4762306a36Sopenharmony_ci#include <linux/kern_levels.h>
4862306a36Sopenharmony_ci#include <linux/kstrtox.h>
4962306a36Sopenharmony_ci#include <linux/kthread.h>
5062306a36Sopenharmony_ci
5162306a36Sopenharmony_ci#include <asm/page.h>
5262306a36Sopenharmony_ci#include <asm/memtype.h>
5362306a36Sopenharmony_ci#include <asm/cmpxchg.h>
5462306a36Sopenharmony_ci#include <asm/io.h>
5562306a36Sopenharmony_ci#include <asm/set_memory.h>
5662306a36Sopenharmony_ci#include <asm/vmx.h>
5762306a36Sopenharmony_ci
5862306a36Sopenharmony_ci#include "trace.h"
5962306a36Sopenharmony_ci
6062306a36Sopenharmony_ciextern bool itlb_multihit_kvm_mitigation;
6162306a36Sopenharmony_ci
6262306a36Sopenharmony_cistatic bool nx_hugepage_mitigation_hard_disabled;
6362306a36Sopenharmony_ci
6462306a36Sopenharmony_ciint __read_mostly nx_huge_pages = -1;
6562306a36Sopenharmony_cistatic uint __read_mostly nx_huge_pages_recovery_period_ms;
6662306a36Sopenharmony_ci#ifdef CONFIG_PREEMPT_RT
6762306a36Sopenharmony_ci/* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
6862306a36Sopenharmony_cistatic uint __read_mostly nx_huge_pages_recovery_ratio = 0;
6962306a36Sopenharmony_ci#else
7062306a36Sopenharmony_cistatic uint __read_mostly nx_huge_pages_recovery_ratio = 60;
7162306a36Sopenharmony_ci#endif
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_cistatic int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
7462306a36Sopenharmony_cistatic int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
7562306a36Sopenharmony_cistatic int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_cistatic const struct kernel_param_ops nx_huge_pages_ops = {
7862306a36Sopenharmony_ci	.set = set_nx_huge_pages,
7962306a36Sopenharmony_ci	.get = get_nx_huge_pages,
8062306a36Sopenharmony_ci};
8162306a36Sopenharmony_ci
8262306a36Sopenharmony_cistatic const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
8362306a36Sopenharmony_ci	.set = set_nx_huge_pages_recovery_param,
8462306a36Sopenharmony_ci	.get = param_get_uint,
8562306a36Sopenharmony_ci};
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_cimodule_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
8862306a36Sopenharmony_ci__MODULE_PARM_TYPE(nx_huge_pages, "bool");
8962306a36Sopenharmony_cimodule_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
9062306a36Sopenharmony_ci		&nx_huge_pages_recovery_ratio, 0644);
9162306a36Sopenharmony_ci__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
9262306a36Sopenharmony_cimodule_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
9362306a36Sopenharmony_ci		&nx_huge_pages_recovery_period_ms, 0644);
9462306a36Sopenharmony_ci__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_cistatic bool __read_mostly force_flush_and_sync_on_reuse;
9762306a36Sopenharmony_cimodule_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci/*
10062306a36Sopenharmony_ci * When setting this variable to true it enables Two-Dimensional-Paging
10162306a36Sopenharmony_ci * where the hardware walks 2 page tables:
10262306a36Sopenharmony_ci * 1. the guest-virtual to guest-physical
10362306a36Sopenharmony_ci * 2. while doing 1. it walks guest-physical to host-physical
10462306a36Sopenharmony_ci * If the hardware supports that we don't need to do shadow paging.
10562306a36Sopenharmony_ci */
10662306a36Sopenharmony_cibool tdp_enabled = false;
10762306a36Sopenharmony_ci
10862306a36Sopenharmony_cistatic bool __ro_after_init tdp_mmu_allowed;
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci#ifdef CONFIG_X86_64
11162306a36Sopenharmony_cibool __read_mostly tdp_mmu_enabled = true;
11262306a36Sopenharmony_cimodule_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
11362306a36Sopenharmony_ci#endif
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_cistatic int max_huge_page_level __read_mostly;
11662306a36Sopenharmony_cistatic int tdp_root_level __read_mostly;
11762306a36Sopenharmony_cistatic int max_tdp_level __read_mostly;
11862306a36Sopenharmony_ci
11962306a36Sopenharmony_ci#define PTE_PREFETCH_NUM		8
12062306a36Sopenharmony_ci
12162306a36Sopenharmony_ci#include <trace/events/kvm.h>
12262306a36Sopenharmony_ci
12362306a36Sopenharmony_ci/* make pte_list_desc fit well in cache lines */
12462306a36Sopenharmony_ci#define PTE_LIST_EXT 14
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci/*
12762306a36Sopenharmony_ci * struct pte_list_desc is the core data structure used to implement a custom
12862306a36Sopenharmony_ci * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
12962306a36Sopenharmony_ci * given GFN when used in the context of rmaps.  Using a custom list allows KVM
13062306a36Sopenharmony_ci * to optimize for the common case where many GFNs will have at most a handful
13162306a36Sopenharmony_ci * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
13262306a36Sopenharmony_ci * memory footprint, which in turn improves runtime performance by exploiting
13362306a36Sopenharmony_ci * cache locality.
13462306a36Sopenharmony_ci *
13562306a36Sopenharmony_ci * A list is comprised of one or more pte_list_desc objects (descriptors).
13662306a36Sopenharmony_ci * Each individual descriptor stores up to PTE_LIST_EXT SPTEs.  If a descriptor
13762306a36Sopenharmony_ci * is full and a new SPTEs needs to be added, a new descriptor is allocated and
13862306a36Sopenharmony_ci * becomes the head of the list.  This means that by definitions, all tail
13962306a36Sopenharmony_ci * descriptors are full.
14062306a36Sopenharmony_ci *
14162306a36Sopenharmony_ci * Note, the meta data fields are deliberately placed at the start of the
14262306a36Sopenharmony_ci * structure to optimize the cacheline layout; accessing the descriptor will
14362306a36Sopenharmony_ci * touch only a single cacheline so long as @spte_count<=6 (or if only the
14462306a36Sopenharmony_ci * descriptors metadata is accessed).
14562306a36Sopenharmony_ci */
14662306a36Sopenharmony_cistruct pte_list_desc {
14762306a36Sopenharmony_ci	struct pte_list_desc *more;
14862306a36Sopenharmony_ci	/* The number of PTEs stored in _this_ descriptor. */
14962306a36Sopenharmony_ci	u32 spte_count;
15062306a36Sopenharmony_ci	/* The number of PTEs stored in all tails of this descriptor. */
15162306a36Sopenharmony_ci	u32 tail_count;
15262306a36Sopenharmony_ci	u64 *sptes[PTE_LIST_EXT];
15362306a36Sopenharmony_ci};
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_cistruct kvm_shadow_walk_iterator {
15662306a36Sopenharmony_ci	u64 addr;
15762306a36Sopenharmony_ci	hpa_t shadow_addr;
15862306a36Sopenharmony_ci	u64 *sptep;
15962306a36Sopenharmony_ci	int level;
16062306a36Sopenharmony_ci	unsigned index;
16162306a36Sopenharmony_ci};
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
16462306a36Sopenharmony_ci	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
16562306a36Sopenharmony_ci					 (_root), (_addr));                \
16662306a36Sopenharmony_ci	     shadow_walk_okay(&(_walker));			           \
16762306a36Sopenharmony_ci	     shadow_walk_next(&(_walker)))
16862306a36Sopenharmony_ci
16962306a36Sopenharmony_ci#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
17062306a36Sopenharmony_ci	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
17162306a36Sopenharmony_ci	     shadow_walk_okay(&(_walker));			\
17262306a36Sopenharmony_ci	     shadow_walk_next(&(_walker)))
17362306a36Sopenharmony_ci
17462306a36Sopenharmony_ci#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
17562306a36Sopenharmony_ci	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
17662306a36Sopenharmony_ci	     shadow_walk_okay(&(_walker)) &&				\
17762306a36Sopenharmony_ci		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
17862306a36Sopenharmony_ci	     __shadow_walk_next(&(_walker), spte))
17962306a36Sopenharmony_ci
18062306a36Sopenharmony_cistatic struct kmem_cache *pte_list_desc_cache;
18162306a36Sopenharmony_cistruct kmem_cache *mmu_page_header_cache;
18262306a36Sopenharmony_cistatic struct percpu_counter kvm_total_used_mmu_pages;
18362306a36Sopenharmony_ci
18462306a36Sopenharmony_cistatic void mmu_spte_set(u64 *sptep, u64 spte);
18562306a36Sopenharmony_ci
18662306a36Sopenharmony_cistruct kvm_mmu_role_regs {
18762306a36Sopenharmony_ci	const unsigned long cr0;
18862306a36Sopenharmony_ci	const unsigned long cr4;
18962306a36Sopenharmony_ci	const u64 efer;
19062306a36Sopenharmony_ci};
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci#define CREATE_TRACE_POINTS
19362306a36Sopenharmony_ci#include "mmutrace.h"
19462306a36Sopenharmony_ci
19562306a36Sopenharmony_ci/*
19662306a36Sopenharmony_ci * Yes, lot's of underscores.  They're a hint that you probably shouldn't be
19762306a36Sopenharmony_ci * reading from the role_regs.  Once the root_role is constructed, it becomes
19862306a36Sopenharmony_ci * the single source of truth for the MMU's state.
19962306a36Sopenharmony_ci */
20062306a36Sopenharmony_ci#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
20162306a36Sopenharmony_cistatic inline bool __maybe_unused					\
20262306a36Sopenharmony_ci____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
20362306a36Sopenharmony_ci{									\
20462306a36Sopenharmony_ci	return !!(regs->reg & flag);					\
20562306a36Sopenharmony_ci}
20662306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
20762306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
20862306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
20962306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
21062306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
21162306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
21262306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
21362306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
21462306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
21562306a36Sopenharmony_ciBUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci/*
21862306a36Sopenharmony_ci * The MMU itself (with a valid role) is the single source of truth for the
21962306a36Sopenharmony_ci * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
22062306a36Sopenharmony_ci * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
22162306a36Sopenharmony_ci * and the vCPU may be incorrect/irrelevant.
22262306a36Sopenharmony_ci */
22362306a36Sopenharmony_ci#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)		\
22462306a36Sopenharmony_cistatic inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
22562306a36Sopenharmony_ci{								\
22662306a36Sopenharmony_ci	return !!(mmu->cpu_role. base_or_ext . reg##_##name);	\
22762306a36Sopenharmony_ci}
22862306a36Sopenharmony_ciBUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
22962306a36Sopenharmony_ciBUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
23062306a36Sopenharmony_ciBUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
23162306a36Sopenharmony_ciBUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
23262306a36Sopenharmony_ciBUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
23362306a36Sopenharmony_ciBUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
23462306a36Sopenharmony_ciBUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
23562306a36Sopenharmony_ciBUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
23662306a36Sopenharmony_ci
23762306a36Sopenharmony_cistatic inline bool is_cr0_pg(struct kvm_mmu *mmu)
23862306a36Sopenharmony_ci{
23962306a36Sopenharmony_ci        return mmu->cpu_role.base.level > 0;
24062306a36Sopenharmony_ci}
24162306a36Sopenharmony_ci
24262306a36Sopenharmony_cistatic inline bool is_cr4_pae(struct kvm_mmu *mmu)
24362306a36Sopenharmony_ci{
24462306a36Sopenharmony_ci        return !mmu->cpu_role.base.has_4_byte_gpte;
24562306a36Sopenharmony_ci}
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_cistatic struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
24862306a36Sopenharmony_ci{
24962306a36Sopenharmony_ci	struct kvm_mmu_role_regs regs = {
25062306a36Sopenharmony_ci		.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
25162306a36Sopenharmony_ci		.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
25262306a36Sopenharmony_ci		.efer = vcpu->arch.efer,
25362306a36Sopenharmony_ci	};
25462306a36Sopenharmony_ci
25562306a36Sopenharmony_ci	return regs;
25662306a36Sopenharmony_ci}
25762306a36Sopenharmony_ci
25862306a36Sopenharmony_cistatic unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
25962306a36Sopenharmony_ci{
26062306a36Sopenharmony_ci	return kvm_read_cr3(vcpu);
26162306a36Sopenharmony_ci}
26262306a36Sopenharmony_ci
26362306a36Sopenharmony_cistatic inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
26462306a36Sopenharmony_ci						  struct kvm_mmu *mmu)
26562306a36Sopenharmony_ci{
26662306a36Sopenharmony_ci	if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
26762306a36Sopenharmony_ci		return kvm_read_cr3(vcpu);
26862306a36Sopenharmony_ci
26962306a36Sopenharmony_ci	return mmu->get_guest_pgd(vcpu);
27062306a36Sopenharmony_ci}
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_cistatic inline bool kvm_available_flush_remote_tlbs_range(void)
27362306a36Sopenharmony_ci{
27462306a36Sopenharmony_ci	return kvm_x86_ops.flush_remote_tlbs_range;
27562306a36Sopenharmony_ci}
27662306a36Sopenharmony_ci
27762306a36Sopenharmony_ciint kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
27862306a36Sopenharmony_ci{
27962306a36Sopenharmony_ci	if (!kvm_x86_ops.flush_remote_tlbs_range)
28062306a36Sopenharmony_ci		return -EOPNOTSUPP;
28162306a36Sopenharmony_ci
28262306a36Sopenharmony_ci	return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
28362306a36Sopenharmony_ci}
28462306a36Sopenharmony_ci
28562306a36Sopenharmony_cistatic gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
28662306a36Sopenharmony_ci
28762306a36Sopenharmony_ci/* Flush the range of guest memory mapped by the given SPTE. */
28862306a36Sopenharmony_cistatic void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
28962306a36Sopenharmony_ci{
29062306a36Sopenharmony_ci	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
29162306a36Sopenharmony_ci	gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci	kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
29462306a36Sopenharmony_ci}
29562306a36Sopenharmony_ci
29662306a36Sopenharmony_cistatic void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
29762306a36Sopenharmony_ci			   unsigned int access)
29862306a36Sopenharmony_ci{
29962306a36Sopenharmony_ci	u64 spte = make_mmio_spte(vcpu, gfn, access);
30062306a36Sopenharmony_ci
30162306a36Sopenharmony_ci	trace_mark_mmio_spte(sptep, gfn, spte);
30262306a36Sopenharmony_ci	mmu_spte_set(sptep, spte);
30362306a36Sopenharmony_ci}
30462306a36Sopenharmony_ci
30562306a36Sopenharmony_cistatic gfn_t get_mmio_spte_gfn(u64 spte)
30662306a36Sopenharmony_ci{
30762306a36Sopenharmony_ci	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
30862306a36Sopenharmony_ci
30962306a36Sopenharmony_ci	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
31062306a36Sopenharmony_ci	       & shadow_nonpresent_or_rsvd_mask;
31162306a36Sopenharmony_ci
31262306a36Sopenharmony_ci	return gpa >> PAGE_SHIFT;
31362306a36Sopenharmony_ci}
31462306a36Sopenharmony_ci
31562306a36Sopenharmony_cistatic unsigned get_mmio_spte_access(u64 spte)
31662306a36Sopenharmony_ci{
31762306a36Sopenharmony_ci	return spte & shadow_mmio_access_mask;
31862306a36Sopenharmony_ci}
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_cistatic bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
32162306a36Sopenharmony_ci{
32262306a36Sopenharmony_ci	u64 kvm_gen, spte_gen, gen;
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_ci	gen = kvm_vcpu_memslots(vcpu)->generation;
32562306a36Sopenharmony_ci	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
32662306a36Sopenharmony_ci		return false;
32762306a36Sopenharmony_ci
32862306a36Sopenharmony_ci	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
32962306a36Sopenharmony_ci	spte_gen = get_mmio_spte_generation(spte);
33062306a36Sopenharmony_ci
33162306a36Sopenharmony_ci	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
33262306a36Sopenharmony_ci	return likely(kvm_gen == spte_gen);
33362306a36Sopenharmony_ci}
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_cistatic int is_cpuid_PSE36(void)
33662306a36Sopenharmony_ci{
33762306a36Sopenharmony_ci	return 1;
33862306a36Sopenharmony_ci}
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci#ifdef CONFIG_X86_64
34162306a36Sopenharmony_cistatic void __set_spte(u64 *sptep, u64 spte)
34262306a36Sopenharmony_ci{
34362306a36Sopenharmony_ci	WRITE_ONCE(*sptep, spte);
34462306a36Sopenharmony_ci}
34562306a36Sopenharmony_ci
34662306a36Sopenharmony_cistatic void __update_clear_spte_fast(u64 *sptep, u64 spte)
34762306a36Sopenharmony_ci{
34862306a36Sopenharmony_ci	WRITE_ONCE(*sptep, spte);
34962306a36Sopenharmony_ci}
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_cistatic u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
35262306a36Sopenharmony_ci{
35362306a36Sopenharmony_ci	return xchg(sptep, spte);
35462306a36Sopenharmony_ci}
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_cistatic u64 __get_spte_lockless(u64 *sptep)
35762306a36Sopenharmony_ci{
35862306a36Sopenharmony_ci	return READ_ONCE(*sptep);
35962306a36Sopenharmony_ci}
36062306a36Sopenharmony_ci#else
36162306a36Sopenharmony_ciunion split_spte {
36262306a36Sopenharmony_ci	struct {
36362306a36Sopenharmony_ci		u32 spte_low;
36462306a36Sopenharmony_ci		u32 spte_high;
36562306a36Sopenharmony_ci	};
36662306a36Sopenharmony_ci	u64 spte;
36762306a36Sopenharmony_ci};
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_cistatic void count_spte_clear(u64 *sptep, u64 spte)
37062306a36Sopenharmony_ci{
37162306a36Sopenharmony_ci	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
37262306a36Sopenharmony_ci
37362306a36Sopenharmony_ci	if (is_shadow_present_pte(spte))
37462306a36Sopenharmony_ci		return;
37562306a36Sopenharmony_ci
37662306a36Sopenharmony_ci	/* Ensure the spte is completely set before we increase the count */
37762306a36Sopenharmony_ci	smp_wmb();
37862306a36Sopenharmony_ci	sp->clear_spte_count++;
37962306a36Sopenharmony_ci}
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_cistatic void __set_spte(u64 *sptep, u64 spte)
38262306a36Sopenharmony_ci{
38362306a36Sopenharmony_ci	union split_spte *ssptep, sspte;
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_ci	ssptep = (union split_spte *)sptep;
38662306a36Sopenharmony_ci	sspte = (union split_spte)spte;
38762306a36Sopenharmony_ci
38862306a36Sopenharmony_ci	ssptep->spte_high = sspte.spte_high;
38962306a36Sopenharmony_ci
39062306a36Sopenharmony_ci	/*
39162306a36Sopenharmony_ci	 * If we map the spte from nonpresent to present, We should store
39262306a36Sopenharmony_ci	 * the high bits firstly, then set present bit, so cpu can not
39362306a36Sopenharmony_ci	 * fetch this spte while we are setting the spte.
39462306a36Sopenharmony_ci	 */
39562306a36Sopenharmony_ci	smp_wmb();
39662306a36Sopenharmony_ci
39762306a36Sopenharmony_ci	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
39862306a36Sopenharmony_ci}
39962306a36Sopenharmony_ci
40062306a36Sopenharmony_cistatic void __update_clear_spte_fast(u64 *sptep, u64 spte)
40162306a36Sopenharmony_ci{
40262306a36Sopenharmony_ci	union split_spte *ssptep, sspte;
40362306a36Sopenharmony_ci
40462306a36Sopenharmony_ci	ssptep = (union split_spte *)sptep;
40562306a36Sopenharmony_ci	sspte = (union split_spte)spte;
40662306a36Sopenharmony_ci
40762306a36Sopenharmony_ci	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci	/*
41062306a36Sopenharmony_ci	 * If we map the spte from present to nonpresent, we should clear
41162306a36Sopenharmony_ci	 * present bit firstly to avoid vcpu fetch the old high bits.
41262306a36Sopenharmony_ci	 */
41362306a36Sopenharmony_ci	smp_wmb();
41462306a36Sopenharmony_ci
41562306a36Sopenharmony_ci	ssptep->spte_high = sspte.spte_high;
41662306a36Sopenharmony_ci	count_spte_clear(sptep, spte);
41762306a36Sopenharmony_ci}
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_cistatic u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
42062306a36Sopenharmony_ci{
42162306a36Sopenharmony_ci	union split_spte *ssptep, sspte, orig;
42262306a36Sopenharmony_ci
42362306a36Sopenharmony_ci	ssptep = (union split_spte *)sptep;
42462306a36Sopenharmony_ci	sspte = (union split_spte)spte;
42562306a36Sopenharmony_ci
42662306a36Sopenharmony_ci	/* xchg acts as a barrier before the setting of the high bits */
42762306a36Sopenharmony_ci	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
42862306a36Sopenharmony_ci	orig.spte_high = ssptep->spte_high;
42962306a36Sopenharmony_ci	ssptep->spte_high = sspte.spte_high;
43062306a36Sopenharmony_ci	count_spte_clear(sptep, spte);
43162306a36Sopenharmony_ci
43262306a36Sopenharmony_ci	return orig.spte;
43362306a36Sopenharmony_ci}
43462306a36Sopenharmony_ci
43562306a36Sopenharmony_ci/*
43662306a36Sopenharmony_ci * The idea using the light way get the spte on x86_32 guest is from
43762306a36Sopenharmony_ci * gup_get_pte (mm/gup.c).
43862306a36Sopenharmony_ci *
43962306a36Sopenharmony_ci * An spte tlb flush may be pending, because kvm_set_pte_rmap
44062306a36Sopenharmony_ci * coalesces them and we are running out of the MMU lock.  Therefore
44162306a36Sopenharmony_ci * we need to protect against in-progress updates of the spte.
44262306a36Sopenharmony_ci *
44362306a36Sopenharmony_ci * Reading the spte while an update is in progress may get the old value
44462306a36Sopenharmony_ci * for the high part of the spte.  The race is fine for a present->non-present
44562306a36Sopenharmony_ci * change (because the high part of the spte is ignored for non-present spte),
44662306a36Sopenharmony_ci * but for a present->present change we must reread the spte.
44762306a36Sopenharmony_ci *
44862306a36Sopenharmony_ci * All such changes are done in two steps (present->non-present and
44962306a36Sopenharmony_ci * non-present->present), hence it is enough to count the number of
45062306a36Sopenharmony_ci * present->non-present updates: if it changed while reading the spte,
45162306a36Sopenharmony_ci * we might have hit the race.  This is done using clear_spte_count.
45262306a36Sopenharmony_ci */
45362306a36Sopenharmony_cistatic u64 __get_spte_lockless(u64 *sptep)
45462306a36Sopenharmony_ci{
45562306a36Sopenharmony_ci	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
45662306a36Sopenharmony_ci	union split_spte spte, *orig = (union split_spte *)sptep;
45762306a36Sopenharmony_ci	int count;
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_ciretry:
46062306a36Sopenharmony_ci	count = sp->clear_spte_count;
46162306a36Sopenharmony_ci	smp_rmb();
46262306a36Sopenharmony_ci
46362306a36Sopenharmony_ci	spte.spte_low = orig->spte_low;
46462306a36Sopenharmony_ci	smp_rmb();
46562306a36Sopenharmony_ci
46662306a36Sopenharmony_ci	spte.spte_high = orig->spte_high;
46762306a36Sopenharmony_ci	smp_rmb();
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	if (unlikely(spte.spte_low != orig->spte_low ||
47062306a36Sopenharmony_ci	      count != sp->clear_spte_count))
47162306a36Sopenharmony_ci		goto retry;
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci	return spte.spte;
47462306a36Sopenharmony_ci}
47562306a36Sopenharmony_ci#endif
47662306a36Sopenharmony_ci
47762306a36Sopenharmony_ci/* Rules for using mmu_spte_set:
47862306a36Sopenharmony_ci * Set the sptep from nonpresent to present.
47962306a36Sopenharmony_ci * Note: the sptep being assigned *must* be either not present
48062306a36Sopenharmony_ci * or in a state where the hardware will not attempt to update
48162306a36Sopenharmony_ci * the spte.
48262306a36Sopenharmony_ci */
48362306a36Sopenharmony_cistatic void mmu_spte_set(u64 *sptep, u64 new_spte)
48462306a36Sopenharmony_ci{
48562306a36Sopenharmony_ci	WARN_ON_ONCE(is_shadow_present_pte(*sptep));
48662306a36Sopenharmony_ci	__set_spte(sptep, new_spte);
48762306a36Sopenharmony_ci}
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci/*
49062306a36Sopenharmony_ci * Update the SPTE (excluding the PFN), but do not track changes in its
49162306a36Sopenharmony_ci * accessed/dirty status.
49262306a36Sopenharmony_ci */
49362306a36Sopenharmony_cistatic u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
49462306a36Sopenharmony_ci{
49562306a36Sopenharmony_ci	u64 old_spte = *sptep;
49662306a36Sopenharmony_ci
49762306a36Sopenharmony_ci	WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
49862306a36Sopenharmony_ci	check_spte_writable_invariants(new_spte);
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci	if (!is_shadow_present_pte(old_spte)) {
50162306a36Sopenharmony_ci		mmu_spte_set(sptep, new_spte);
50262306a36Sopenharmony_ci		return old_spte;
50362306a36Sopenharmony_ci	}
50462306a36Sopenharmony_ci
50562306a36Sopenharmony_ci	if (!spte_has_volatile_bits(old_spte))
50662306a36Sopenharmony_ci		__update_clear_spte_fast(sptep, new_spte);
50762306a36Sopenharmony_ci	else
50862306a36Sopenharmony_ci		old_spte = __update_clear_spte_slow(sptep, new_spte);
50962306a36Sopenharmony_ci
51062306a36Sopenharmony_ci	WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ci	return old_spte;
51362306a36Sopenharmony_ci}
51462306a36Sopenharmony_ci
51562306a36Sopenharmony_ci/* Rules for using mmu_spte_update:
51662306a36Sopenharmony_ci * Update the state bits, it means the mapped pfn is not changed.
51762306a36Sopenharmony_ci *
51862306a36Sopenharmony_ci * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
51962306a36Sopenharmony_ci * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
52062306a36Sopenharmony_ci * spte, even though the writable spte might be cached on a CPU's TLB.
52162306a36Sopenharmony_ci *
52262306a36Sopenharmony_ci * Returns true if the TLB needs to be flushed
52362306a36Sopenharmony_ci */
52462306a36Sopenharmony_cistatic bool mmu_spte_update(u64 *sptep, u64 new_spte)
52562306a36Sopenharmony_ci{
52662306a36Sopenharmony_ci	bool flush = false;
52762306a36Sopenharmony_ci	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
52862306a36Sopenharmony_ci
52962306a36Sopenharmony_ci	if (!is_shadow_present_pte(old_spte))
53062306a36Sopenharmony_ci		return false;
53162306a36Sopenharmony_ci
53262306a36Sopenharmony_ci	/*
53362306a36Sopenharmony_ci	 * For the spte updated out of mmu-lock is safe, since
53462306a36Sopenharmony_ci	 * we always atomically update it, see the comments in
53562306a36Sopenharmony_ci	 * spte_has_volatile_bits().
53662306a36Sopenharmony_ci	 */
53762306a36Sopenharmony_ci	if (is_mmu_writable_spte(old_spte) &&
53862306a36Sopenharmony_ci	      !is_writable_pte(new_spte))
53962306a36Sopenharmony_ci		flush = true;
54062306a36Sopenharmony_ci
54162306a36Sopenharmony_ci	/*
54262306a36Sopenharmony_ci	 * Flush TLB when accessed/dirty states are changed in the page tables,
54362306a36Sopenharmony_ci	 * to guarantee consistency between TLB and page tables.
54462306a36Sopenharmony_ci	 */
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
54762306a36Sopenharmony_ci		flush = true;
54862306a36Sopenharmony_ci		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
54962306a36Sopenharmony_ci	}
55062306a36Sopenharmony_ci
55162306a36Sopenharmony_ci	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
55262306a36Sopenharmony_ci		flush = true;
55362306a36Sopenharmony_ci		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
55462306a36Sopenharmony_ci	}
55562306a36Sopenharmony_ci
55662306a36Sopenharmony_ci	return flush;
55762306a36Sopenharmony_ci}
55862306a36Sopenharmony_ci
55962306a36Sopenharmony_ci/*
56062306a36Sopenharmony_ci * Rules for using mmu_spte_clear_track_bits:
56162306a36Sopenharmony_ci * It sets the sptep from present to nonpresent, and track the
56262306a36Sopenharmony_ci * state bits, it is used to clear the last level sptep.
56362306a36Sopenharmony_ci * Returns the old PTE.
56462306a36Sopenharmony_ci */
56562306a36Sopenharmony_cistatic u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
56662306a36Sopenharmony_ci{
56762306a36Sopenharmony_ci	kvm_pfn_t pfn;
56862306a36Sopenharmony_ci	u64 old_spte = *sptep;
56962306a36Sopenharmony_ci	int level = sptep_to_sp(sptep)->role.level;
57062306a36Sopenharmony_ci	struct page *page;
57162306a36Sopenharmony_ci
57262306a36Sopenharmony_ci	if (!is_shadow_present_pte(old_spte) ||
57362306a36Sopenharmony_ci	    !spte_has_volatile_bits(old_spte))
57462306a36Sopenharmony_ci		__update_clear_spte_fast(sptep, 0ull);
57562306a36Sopenharmony_ci	else
57662306a36Sopenharmony_ci		old_spte = __update_clear_spte_slow(sptep, 0ull);
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_ci	if (!is_shadow_present_pte(old_spte))
57962306a36Sopenharmony_ci		return old_spte;
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci	kvm_update_page_stats(kvm, level, -1);
58262306a36Sopenharmony_ci
58362306a36Sopenharmony_ci	pfn = spte_to_pfn(old_spte);
58462306a36Sopenharmony_ci
58562306a36Sopenharmony_ci	/*
58662306a36Sopenharmony_ci	 * KVM doesn't hold a reference to any pages mapped into the guest, and
58762306a36Sopenharmony_ci	 * instead uses the mmu_notifier to ensure that KVM unmaps any pages
58862306a36Sopenharmony_ci	 * before they are reclaimed.  Sanity check that, if the pfn is backed
58962306a36Sopenharmony_ci	 * by a refcounted page, the refcount is elevated.
59062306a36Sopenharmony_ci	 */
59162306a36Sopenharmony_ci	page = kvm_pfn_to_refcounted_page(pfn);
59262306a36Sopenharmony_ci	WARN_ON_ONCE(page && !page_count(page));
59362306a36Sopenharmony_ci
59462306a36Sopenharmony_ci	if (is_accessed_spte(old_spte))
59562306a36Sopenharmony_ci		kvm_set_pfn_accessed(pfn);
59662306a36Sopenharmony_ci
59762306a36Sopenharmony_ci	if (is_dirty_spte(old_spte))
59862306a36Sopenharmony_ci		kvm_set_pfn_dirty(pfn);
59962306a36Sopenharmony_ci
60062306a36Sopenharmony_ci	return old_spte;
60162306a36Sopenharmony_ci}
60262306a36Sopenharmony_ci
60362306a36Sopenharmony_ci/*
60462306a36Sopenharmony_ci * Rules for using mmu_spte_clear_no_track:
60562306a36Sopenharmony_ci * Directly clear spte without caring the state bits of sptep,
60662306a36Sopenharmony_ci * it is used to set the upper level spte.
60762306a36Sopenharmony_ci */
60862306a36Sopenharmony_cistatic void mmu_spte_clear_no_track(u64 *sptep)
60962306a36Sopenharmony_ci{
61062306a36Sopenharmony_ci	__update_clear_spte_fast(sptep, 0ull);
61162306a36Sopenharmony_ci}
61262306a36Sopenharmony_ci
61362306a36Sopenharmony_cistatic u64 mmu_spte_get_lockless(u64 *sptep)
61462306a36Sopenharmony_ci{
61562306a36Sopenharmony_ci	return __get_spte_lockless(sptep);
61662306a36Sopenharmony_ci}
61762306a36Sopenharmony_ci
61862306a36Sopenharmony_ci/* Returns the Accessed status of the PTE and resets it at the same time. */
61962306a36Sopenharmony_cistatic bool mmu_spte_age(u64 *sptep)
62062306a36Sopenharmony_ci{
62162306a36Sopenharmony_ci	u64 spte = mmu_spte_get_lockless(sptep);
62262306a36Sopenharmony_ci
62362306a36Sopenharmony_ci	if (!is_accessed_spte(spte))
62462306a36Sopenharmony_ci		return false;
62562306a36Sopenharmony_ci
62662306a36Sopenharmony_ci	if (spte_ad_enabled(spte)) {
62762306a36Sopenharmony_ci		clear_bit((ffs(shadow_accessed_mask) - 1),
62862306a36Sopenharmony_ci			  (unsigned long *)sptep);
62962306a36Sopenharmony_ci	} else {
63062306a36Sopenharmony_ci		/*
63162306a36Sopenharmony_ci		 * Capture the dirty status of the page, so that it doesn't get
63262306a36Sopenharmony_ci		 * lost when the SPTE is marked for access tracking.
63362306a36Sopenharmony_ci		 */
63462306a36Sopenharmony_ci		if (is_writable_pte(spte))
63562306a36Sopenharmony_ci			kvm_set_pfn_dirty(spte_to_pfn(spte));
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ci		spte = mark_spte_for_access_track(spte);
63862306a36Sopenharmony_ci		mmu_spte_update_no_track(sptep, spte);
63962306a36Sopenharmony_ci	}
64062306a36Sopenharmony_ci
64162306a36Sopenharmony_ci	return true;
64262306a36Sopenharmony_ci}
64362306a36Sopenharmony_ci
64462306a36Sopenharmony_cistatic inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
64562306a36Sopenharmony_ci{
64662306a36Sopenharmony_ci	return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
64762306a36Sopenharmony_ci}
64862306a36Sopenharmony_ci
64962306a36Sopenharmony_cistatic void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
65062306a36Sopenharmony_ci{
65162306a36Sopenharmony_ci	if (is_tdp_mmu_active(vcpu)) {
65262306a36Sopenharmony_ci		kvm_tdp_mmu_walk_lockless_begin();
65362306a36Sopenharmony_ci	} else {
65462306a36Sopenharmony_ci		/*
65562306a36Sopenharmony_ci		 * Prevent page table teardown by making any free-er wait during
65662306a36Sopenharmony_ci		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
65762306a36Sopenharmony_ci		 */
65862306a36Sopenharmony_ci		local_irq_disable();
65962306a36Sopenharmony_ci
66062306a36Sopenharmony_ci		/*
66162306a36Sopenharmony_ci		 * Make sure a following spte read is not reordered ahead of the write
66262306a36Sopenharmony_ci		 * to vcpu->mode.
66362306a36Sopenharmony_ci		 */
66462306a36Sopenharmony_ci		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
66562306a36Sopenharmony_ci	}
66662306a36Sopenharmony_ci}
66762306a36Sopenharmony_ci
66862306a36Sopenharmony_cistatic void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
66962306a36Sopenharmony_ci{
67062306a36Sopenharmony_ci	if (is_tdp_mmu_active(vcpu)) {
67162306a36Sopenharmony_ci		kvm_tdp_mmu_walk_lockless_end();
67262306a36Sopenharmony_ci	} else {
67362306a36Sopenharmony_ci		/*
67462306a36Sopenharmony_ci		 * Make sure the write to vcpu->mode is not reordered in front of
67562306a36Sopenharmony_ci		 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
67662306a36Sopenharmony_ci		 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
67762306a36Sopenharmony_ci		 */
67862306a36Sopenharmony_ci		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
67962306a36Sopenharmony_ci		local_irq_enable();
68062306a36Sopenharmony_ci	}
68162306a36Sopenharmony_ci}
68262306a36Sopenharmony_ci
68362306a36Sopenharmony_cistatic int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
68462306a36Sopenharmony_ci{
68562306a36Sopenharmony_ci	int r;
68662306a36Sopenharmony_ci
68762306a36Sopenharmony_ci	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
68862306a36Sopenharmony_ci	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
68962306a36Sopenharmony_ci				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
69062306a36Sopenharmony_ci	if (r)
69162306a36Sopenharmony_ci		return r;
69262306a36Sopenharmony_ci	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
69362306a36Sopenharmony_ci				       PT64_ROOT_MAX_LEVEL);
69462306a36Sopenharmony_ci	if (r)
69562306a36Sopenharmony_ci		return r;
69662306a36Sopenharmony_ci	if (maybe_indirect) {
69762306a36Sopenharmony_ci		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
69862306a36Sopenharmony_ci					       PT64_ROOT_MAX_LEVEL);
69962306a36Sopenharmony_ci		if (r)
70062306a36Sopenharmony_ci			return r;
70162306a36Sopenharmony_ci	}
70262306a36Sopenharmony_ci	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
70362306a36Sopenharmony_ci					  PT64_ROOT_MAX_LEVEL);
70462306a36Sopenharmony_ci}
70562306a36Sopenharmony_ci
70662306a36Sopenharmony_cistatic void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
70762306a36Sopenharmony_ci{
70862306a36Sopenharmony_ci	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
70962306a36Sopenharmony_ci	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
71062306a36Sopenharmony_ci	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
71162306a36Sopenharmony_ci	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
71262306a36Sopenharmony_ci}
71362306a36Sopenharmony_ci
71462306a36Sopenharmony_cistatic void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
71562306a36Sopenharmony_ci{
71662306a36Sopenharmony_ci	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
71762306a36Sopenharmony_ci}
71862306a36Sopenharmony_ci
71962306a36Sopenharmony_cistatic bool sp_has_gptes(struct kvm_mmu_page *sp);
72062306a36Sopenharmony_ci
72162306a36Sopenharmony_cistatic gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
72262306a36Sopenharmony_ci{
72362306a36Sopenharmony_ci	if (sp->role.passthrough)
72462306a36Sopenharmony_ci		return sp->gfn;
72562306a36Sopenharmony_ci
72662306a36Sopenharmony_ci	if (!sp->role.direct)
72762306a36Sopenharmony_ci		return sp->shadowed_translation[index] >> PAGE_SHIFT;
72862306a36Sopenharmony_ci
72962306a36Sopenharmony_ci	return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
73062306a36Sopenharmony_ci}
73162306a36Sopenharmony_ci
73262306a36Sopenharmony_ci/*
73362306a36Sopenharmony_ci * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
73462306a36Sopenharmony_ci * that the SPTE itself may have a more constrained access permissions that
73562306a36Sopenharmony_ci * what the guest enforces. For example, a guest may create an executable
73662306a36Sopenharmony_ci * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
73762306a36Sopenharmony_ci */
73862306a36Sopenharmony_cistatic u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
73962306a36Sopenharmony_ci{
74062306a36Sopenharmony_ci	if (sp_has_gptes(sp))
74162306a36Sopenharmony_ci		return sp->shadowed_translation[index] & ACC_ALL;
74262306a36Sopenharmony_ci
74362306a36Sopenharmony_ci	/*
74462306a36Sopenharmony_ci	 * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
74562306a36Sopenharmony_ci	 * KVM is not shadowing any guest page tables, so the "guest access
74662306a36Sopenharmony_ci	 * permissions" are just ACC_ALL.
74762306a36Sopenharmony_ci	 *
74862306a36Sopenharmony_ci	 * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
74962306a36Sopenharmony_ci	 * is shadowing a guest huge page with small pages, the guest access
75062306a36Sopenharmony_ci	 * permissions being shadowed are the access permissions of the huge
75162306a36Sopenharmony_ci	 * page.
75262306a36Sopenharmony_ci	 *
75362306a36Sopenharmony_ci	 * In both cases, sp->role.access contains the correct access bits.
75462306a36Sopenharmony_ci	 */
75562306a36Sopenharmony_ci	return sp->role.access;
75662306a36Sopenharmony_ci}
75762306a36Sopenharmony_ci
75862306a36Sopenharmony_cistatic void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
75962306a36Sopenharmony_ci					 gfn_t gfn, unsigned int access)
76062306a36Sopenharmony_ci{
76162306a36Sopenharmony_ci	if (sp_has_gptes(sp)) {
76262306a36Sopenharmony_ci		sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
76362306a36Sopenharmony_ci		return;
76462306a36Sopenharmony_ci	}
76562306a36Sopenharmony_ci
76662306a36Sopenharmony_ci	WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
76762306a36Sopenharmony_ci	          "access mismatch under %s page %llx (expected %u, got %u)\n",
76862306a36Sopenharmony_ci	          sp->role.passthrough ? "passthrough" : "direct",
76962306a36Sopenharmony_ci	          sp->gfn, kvm_mmu_page_get_access(sp, index), access);
77062306a36Sopenharmony_ci
77162306a36Sopenharmony_ci	WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
77262306a36Sopenharmony_ci	          "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
77362306a36Sopenharmony_ci	          sp->role.passthrough ? "passthrough" : "direct",
77462306a36Sopenharmony_ci	          sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
77562306a36Sopenharmony_ci}
77662306a36Sopenharmony_ci
77762306a36Sopenharmony_cistatic void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
77862306a36Sopenharmony_ci				    unsigned int access)
77962306a36Sopenharmony_ci{
78062306a36Sopenharmony_ci	gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
78162306a36Sopenharmony_ci
78262306a36Sopenharmony_ci	kvm_mmu_page_set_translation(sp, index, gfn, access);
78362306a36Sopenharmony_ci}
78462306a36Sopenharmony_ci
78562306a36Sopenharmony_ci/*
78662306a36Sopenharmony_ci * Return the pointer to the large page information for a given gfn,
78762306a36Sopenharmony_ci * handling slots that are not large page aligned.
78862306a36Sopenharmony_ci */
78962306a36Sopenharmony_cistatic struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
79062306a36Sopenharmony_ci		const struct kvm_memory_slot *slot, int level)
79162306a36Sopenharmony_ci{
79262306a36Sopenharmony_ci	unsigned long idx;
79362306a36Sopenharmony_ci
79462306a36Sopenharmony_ci	idx = gfn_to_index(gfn, slot->base_gfn, level);
79562306a36Sopenharmony_ci	return &slot->arch.lpage_info[level - 2][idx];
79662306a36Sopenharmony_ci}
79762306a36Sopenharmony_ci
79862306a36Sopenharmony_cistatic void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
79962306a36Sopenharmony_ci					    gfn_t gfn, int count)
80062306a36Sopenharmony_ci{
80162306a36Sopenharmony_ci	struct kvm_lpage_info *linfo;
80262306a36Sopenharmony_ci	int i;
80362306a36Sopenharmony_ci
80462306a36Sopenharmony_ci	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
80562306a36Sopenharmony_ci		linfo = lpage_info_slot(gfn, slot, i);
80662306a36Sopenharmony_ci		linfo->disallow_lpage += count;
80762306a36Sopenharmony_ci		WARN_ON_ONCE(linfo->disallow_lpage < 0);
80862306a36Sopenharmony_ci	}
80962306a36Sopenharmony_ci}
81062306a36Sopenharmony_ci
81162306a36Sopenharmony_civoid kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
81262306a36Sopenharmony_ci{
81362306a36Sopenharmony_ci	update_gfn_disallow_lpage_count(slot, gfn, 1);
81462306a36Sopenharmony_ci}
81562306a36Sopenharmony_ci
81662306a36Sopenharmony_civoid kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
81762306a36Sopenharmony_ci{
81862306a36Sopenharmony_ci	update_gfn_disallow_lpage_count(slot, gfn, -1);
81962306a36Sopenharmony_ci}
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_cistatic void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
82262306a36Sopenharmony_ci{
82362306a36Sopenharmony_ci	struct kvm_memslots *slots;
82462306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
82562306a36Sopenharmony_ci	gfn_t gfn;
82662306a36Sopenharmony_ci
82762306a36Sopenharmony_ci	kvm->arch.indirect_shadow_pages++;
82862306a36Sopenharmony_ci	gfn = sp->gfn;
82962306a36Sopenharmony_ci	slots = kvm_memslots_for_spte_role(kvm, sp->role);
83062306a36Sopenharmony_ci	slot = __gfn_to_memslot(slots, gfn);
83162306a36Sopenharmony_ci
83262306a36Sopenharmony_ci	/* the non-leaf shadow pages are keeping readonly. */
83362306a36Sopenharmony_ci	if (sp->role.level > PG_LEVEL_4K)
83462306a36Sopenharmony_ci		return __kvm_write_track_add_gfn(kvm, slot, gfn);
83562306a36Sopenharmony_ci
83662306a36Sopenharmony_ci	kvm_mmu_gfn_disallow_lpage(slot, gfn);
83762306a36Sopenharmony_ci
83862306a36Sopenharmony_ci	if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
83962306a36Sopenharmony_ci		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
84062306a36Sopenharmony_ci}
84162306a36Sopenharmony_ci
84262306a36Sopenharmony_civoid track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
84362306a36Sopenharmony_ci{
84462306a36Sopenharmony_ci	/*
84562306a36Sopenharmony_ci	 * If it's possible to replace the shadow page with an NX huge page,
84662306a36Sopenharmony_ci	 * i.e. if the shadow page is the only thing currently preventing KVM
84762306a36Sopenharmony_ci	 * from using a huge page, add the shadow page to the list of "to be
84862306a36Sopenharmony_ci	 * zapped for NX recovery" pages.  Note, the shadow page can already be
84962306a36Sopenharmony_ci	 * on the list if KVM is reusing an existing shadow page, i.e. if KVM
85062306a36Sopenharmony_ci	 * links a shadow page at multiple points.
85162306a36Sopenharmony_ci	 */
85262306a36Sopenharmony_ci	if (!list_empty(&sp->possible_nx_huge_page_link))
85362306a36Sopenharmony_ci		return;
85462306a36Sopenharmony_ci
85562306a36Sopenharmony_ci	++kvm->stat.nx_lpage_splits;
85662306a36Sopenharmony_ci	list_add_tail(&sp->possible_nx_huge_page_link,
85762306a36Sopenharmony_ci		      &kvm->arch.possible_nx_huge_pages);
85862306a36Sopenharmony_ci}
85962306a36Sopenharmony_ci
86062306a36Sopenharmony_cistatic void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
86162306a36Sopenharmony_ci				 bool nx_huge_page_possible)
86262306a36Sopenharmony_ci{
86362306a36Sopenharmony_ci	sp->nx_huge_page_disallowed = true;
86462306a36Sopenharmony_ci
86562306a36Sopenharmony_ci	if (nx_huge_page_possible)
86662306a36Sopenharmony_ci		track_possible_nx_huge_page(kvm, sp);
86762306a36Sopenharmony_ci}
86862306a36Sopenharmony_ci
86962306a36Sopenharmony_cistatic void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
87062306a36Sopenharmony_ci{
87162306a36Sopenharmony_ci	struct kvm_memslots *slots;
87262306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
87362306a36Sopenharmony_ci	gfn_t gfn;
87462306a36Sopenharmony_ci
87562306a36Sopenharmony_ci	kvm->arch.indirect_shadow_pages--;
87662306a36Sopenharmony_ci	gfn = sp->gfn;
87762306a36Sopenharmony_ci	slots = kvm_memslots_for_spte_role(kvm, sp->role);
87862306a36Sopenharmony_ci	slot = __gfn_to_memslot(slots, gfn);
87962306a36Sopenharmony_ci	if (sp->role.level > PG_LEVEL_4K)
88062306a36Sopenharmony_ci		return __kvm_write_track_remove_gfn(kvm, slot, gfn);
88162306a36Sopenharmony_ci
88262306a36Sopenharmony_ci	kvm_mmu_gfn_allow_lpage(slot, gfn);
88362306a36Sopenharmony_ci}
88462306a36Sopenharmony_ci
88562306a36Sopenharmony_civoid untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
88662306a36Sopenharmony_ci{
88762306a36Sopenharmony_ci	if (list_empty(&sp->possible_nx_huge_page_link))
88862306a36Sopenharmony_ci		return;
88962306a36Sopenharmony_ci
89062306a36Sopenharmony_ci	--kvm->stat.nx_lpage_splits;
89162306a36Sopenharmony_ci	list_del_init(&sp->possible_nx_huge_page_link);
89262306a36Sopenharmony_ci}
89362306a36Sopenharmony_ci
89462306a36Sopenharmony_cistatic void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
89562306a36Sopenharmony_ci{
89662306a36Sopenharmony_ci	sp->nx_huge_page_disallowed = false;
89762306a36Sopenharmony_ci
89862306a36Sopenharmony_ci	untrack_possible_nx_huge_page(kvm, sp);
89962306a36Sopenharmony_ci}
90062306a36Sopenharmony_ci
90162306a36Sopenharmony_cistatic struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
90262306a36Sopenharmony_ci							   gfn_t gfn,
90362306a36Sopenharmony_ci							   bool no_dirty_log)
90462306a36Sopenharmony_ci{
90562306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
90662306a36Sopenharmony_ci
90762306a36Sopenharmony_ci	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
90862306a36Sopenharmony_ci	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
90962306a36Sopenharmony_ci		return NULL;
91062306a36Sopenharmony_ci	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
91162306a36Sopenharmony_ci		return NULL;
91262306a36Sopenharmony_ci
91362306a36Sopenharmony_ci	return slot;
91462306a36Sopenharmony_ci}
91562306a36Sopenharmony_ci
91662306a36Sopenharmony_ci/*
91762306a36Sopenharmony_ci * About rmap_head encoding:
91862306a36Sopenharmony_ci *
91962306a36Sopenharmony_ci * If the bit zero of rmap_head->val is clear, then it points to the only spte
92062306a36Sopenharmony_ci * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
92162306a36Sopenharmony_ci * pte_list_desc containing more mappings.
92262306a36Sopenharmony_ci */
92362306a36Sopenharmony_ci
92462306a36Sopenharmony_ci/*
92562306a36Sopenharmony_ci * Returns the number of pointers in the rmap chain, not counting the new one.
92662306a36Sopenharmony_ci */
92762306a36Sopenharmony_cistatic int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
92862306a36Sopenharmony_ci			struct kvm_rmap_head *rmap_head)
92962306a36Sopenharmony_ci{
93062306a36Sopenharmony_ci	struct pte_list_desc *desc;
93162306a36Sopenharmony_ci	int count = 0;
93262306a36Sopenharmony_ci
93362306a36Sopenharmony_ci	if (!rmap_head->val) {
93462306a36Sopenharmony_ci		rmap_head->val = (unsigned long)spte;
93562306a36Sopenharmony_ci	} else if (!(rmap_head->val & 1)) {
93662306a36Sopenharmony_ci		desc = kvm_mmu_memory_cache_alloc(cache);
93762306a36Sopenharmony_ci		desc->sptes[0] = (u64 *)rmap_head->val;
93862306a36Sopenharmony_ci		desc->sptes[1] = spte;
93962306a36Sopenharmony_ci		desc->spte_count = 2;
94062306a36Sopenharmony_ci		desc->tail_count = 0;
94162306a36Sopenharmony_ci		rmap_head->val = (unsigned long)desc | 1;
94262306a36Sopenharmony_ci		++count;
94362306a36Sopenharmony_ci	} else {
94462306a36Sopenharmony_ci		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
94562306a36Sopenharmony_ci		count = desc->tail_count + desc->spte_count;
94662306a36Sopenharmony_ci
94762306a36Sopenharmony_ci		/*
94862306a36Sopenharmony_ci		 * If the previous head is full, allocate a new head descriptor
94962306a36Sopenharmony_ci		 * as tail descriptors are always kept full.
95062306a36Sopenharmony_ci		 */
95162306a36Sopenharmony_ci		if (desc->spte_count == PTE_LIST_EXT) {
95262306a36Sopenharmony_ci			desc = kvm_mmu_memory_cache_alloc(cache);
95362306a36Sopenharmony_ci			desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
95462306a36Sopenharmony_ci			desc->spte_count = 0;
95562306a36Sopenharmony_ci			desc->tail_count = count;
95662306a36Sopenharmony_ci			rmap_head->val = (unsigned long)desc | 1;
95762306a36Sopenharmony_ci		}
95862306a36Sopenharmony_ci		desc->sptes[desc->spte_count++] = spte;
95962306a36Sopenharmony_ci	}
96062306a36Sopenharmony_ci	return count;
96162306a36Sopenharmony_ci}
96262306a36Sopenharmony_ci
96362306a36Sopenharmony_cistatic void pte_list_desc_remove_entry(struct kvm *kvm,
96462306a36Sopenharmony_ci				       struct kvm_rmap_head *rmap_head,
96562306a36Sopenharmony_ci				       struct pte_list_desc *desc, int i)
96662306a36Sopenharmony_ci{
96762306a36Sopenharmony_ci	struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
96862306a36Sopenharmony_ci	int j = head_desc->spte_count - 1;
96962306a36Sopenharmony_ci
97062306a36Sopenharmony_ci	/*
97162306a36Sopenharmony_ci	 * The head descriptor should never be empty.  A new head is added only
97262306a36Sopenharmony_ci	 * when adding an entry and the previous head is full, and heads are
97362306a36Sopenharmony_ci	 * removed (this flow) when they become empty.
97462306a36Sopenharmony_ci	 */
97562306a36Sopenharmony_ci	KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
97662306a36Sopenharmony_ci
97762306a36Sopenharmony_ci	/*
97862306a36Sopenharmony_ci	 * Replace the to-be-freed SPTE with the last valid entry from the head
97962306a36Sopenharmony_ci	 * descriptor to ensure that tail descriptors are full at all times.
98062306a36Sopenharmony_ci	 * Note, this also means that tail_count is stable for each descriptor.
98162306a36Sopenharmony_ci	 */
98262306a36Sopenharmony_ci	desc->sptes[i] = head_desc->sptes[j];
98362306a36Sopenharmony_ci	head_desc->sptes[j] = NULL;
98462306a36Sopenharmony_ci	head_desc->spte_count--;
98562306a36Sopenharmony_ci	if (head_desc->spte_count)
98662306a36Sopenharmony_ci		return;
98762306a36Sopenharmony_ci
98862306a36Sopenharmony_ci	/*
98962306a36Sopenharmony_ci	 * The head descriptor is empty.  If there are no tail descriptors,
99062306a36Sopenharmony_ci	 * nullify the rmap head to mark the list as emtpy, else point the rmap
99162306a36Sopenharmony_ci	 * head at the next descriptor, i.e. the new head.
99262306a36Sopenharmony_ci	 */
99362306a36Sopenharmony_ci	if (!head_desc->more)
99462306a36Sopenharmony_ci		rmap_head->val = 0;
99562306a36Sopenharmony_ci	else
99662306a36Sopenharmony_ci		rmap_head->val = (unsigned long)head_desc->more | 1;
99762306a36Sopenharmony_ci	mmu_free_pte_list_desc(head_desc);
99862306a36Sopenharmony_ci}
99962306a36Sopenharmony_ci
100062306a36Sopenharmony_cistatic void pte_list_remove(struct kvm *kvm, u64 *spte,
100162306a36Sopenharmony_ci			    struct kvm_rmap_head *rmap_head)
100262306a36Sopenharmony_ci{
100362306a36Sopenharmony_ci	struct pte_list_desc *desc;
100462306a36Sopenharmony_ci	int i;
100562306a36Sopenharmony_ci
100662306a36Sopenharmony_ci	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
100762306a36Sopenharmony_ci		return;
100862306a36Sopenharmony_ci
100962306a36Sopenharmony_ci	if (!(rmap_head->val & 1)) {
101062306a36Sopenharmony_ci		if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
101162306a36Sopenharmony_ci			return;
101262306a36Sopenharmony_ci
101362306a36Sopenharmony_ci		rmap_head->val = 0;
101462306a36Sopenharmony_ci	} else {
101562306a36Sopenharmony_ci		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
101662306a36Sopenharmony_ci		while (desc) {
101762306a36Sopenharmony_ci			for (i = 0; i < desc->spte_count; ++i) {
101862306a36Sopenharmony_ci				if (desc->sptes[i] == spte) {
101962306a36Sopenharmony_ci					pte_list_desc_remove_entry(kvm, rmap_head,
102062306a36Sopenharmony_ci								   desc, i);
102162306a36Sopenharmony_ci					return;
102262306a36Sopenharmony_ci				}
102362306a36Sopenharmony_ci			}
102462306a36Sopenharmony_ci			desc = desc->more;
102562306a36Sopenharmony_ci		}
102662306a36Sopenharmony_ci
102762306a36Sopenharmony_ci		KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
102862306a36Sopenharmony_ci	}
102962306a36Sopenharmony_ci}
103062306a36Sopenharmony_ci
103162306a36Sopenharmony_cistatic void kvm_zap_one_rmap_spte(struct kvm *kvm,
103262306a36Sopenharmony_ci				  struct kvm_rmap_head *rmap_head, u64 *sptep)
103362306a36Sopenharmony_ci{
103462306a36Sopenharmony_ci	mmu_spte_clear_track_bits(kvm, sptep);
103562306a36Sopenharmony_ci	pte_list_remove(kvm, sptep, rmap_head);
103662306a36Sopenharmony_ci}
103762306a36Sopenharmony_ci
103862306a36Sopenharmony_ci/* Return true if at least one SPTE was zapped, false otherwise */
103962306a36Sopenharmony_cistatic bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
104062306a36Sopenharmony_ci				   struct kvm_rmap_head *rmap_head)
104162306a36Sopenharmony_ci{
104262306a36Sopenharmony_ci	struct pte_list_desc *desc, *next;
104362306a36Sopenharmony_ci	int i;
104462306a36Sopenharmony_ci
104562306a36Sopenharmony_ci	if (!rmap_head->val)
104662306a36Sopenharmony_ci		return false;
104762306a36Sopenharmony_ci
104862306a36Sopenharmony_ci	if (!(rmap_head->val & 1)) {
104962306a36Sopenharmony_ci		mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
105062306a36Sopenharmony_ci		goto out;
105162306a36Sopenharmony_ci	}
105262306a36Sopenharmony_ci
105362306a36Sopenharmony_ci	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
105462306a36Sopenharmony_ci
105562306a36Sopenharmony_ci	for (; desc; desc = next) {
105662306a36Sopenharmony_ci		for (i = 0; i < desc->spte_count; i++)
105762306a36Sopenharmony_ci			mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
105862306a36Sopenharmony_ci		next = desc->more;
105962306a36Sopenharmony_ci		mmu_free_pte_list_desc(desc);
106062306a36Sopenharmony_ci	}
106162306a36Sopenharmony_ciout:
106262306a36Sopenharmony_ci	/* rmap_head is meaningless now, remember to reset it */
106362306a36Sopenharmony_ci	rmap_head->val = 0;
106462306a36Sopenharmony_ci	return true;
106562306a36Sopenharmony_ci}
106662306a36Sopenharmony_ci
106762306a36Sopenharmony_ciunsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
106862306a36Sopenharmony_ci{
106962306a36Sopenharmony_ci	struct pte_list_desc *desc;
107062306a36Sopenharmony_ci
107162306a36Sopenharmony_ci	if (!rmap_head->val)
107262306a36Sopenharmony_ci		return 0;
107362306a36Sopenharmony_ci	else if (!(rmap_head->val & 1))
107462306a36Sopenharmony_ci		return 1;
107562306a36Sopenharmony_ci
107662306a36Sopenharmony_ci	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
107762306a36Sopenharmony_ci	return desc->tail_count + desc->spte_count;
107862306a36Sopenharmony_ci}
107962306a36Sopenharmony_ci
108062306a36Sopenharmony_cistatic struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
108162306a36Sopenharmony_ci					 const struct kvm_memory_slot *slot)
108262306a36Sopenharmony_ci{
108362306a36Sopenharmony_ci	unsigned long idx;
108462306a36Sopenharmony_ci
108562306a36Sopenharmony_ci	idx = gfn_to_index(gfn, slot->base_gfn, level);
108662306a36Sopenharmony_ci	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
108762306a36Sopenharmony_ci}
108862306a36Sopenharmony_ci
108962306a36Sopenharmony_cistatic void rmap_remove(struct kvm *kvm, u64 *spte)
109062306a36Sopenharmony_ci{
109162306a36Sopenharmony_ci	struct kvm_memslots *slots;
109262306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
109362306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
109462306a36Sopenharmony_ci	gfn_t gfn;
109562306a36Sopenharmony_ci	struct kvm_rmap_head *rmap_head;
109662306a36Sopenharmony_ci
109762306a36Sopenharmony_ci	sp = sptep_to_sp(spte);
109862306a36Sopenharmony_ci	gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
109962306a36Sopenharmony_ci
110062306a36Sopenharmony_ci	/*
110162306a36Sopenharmony_ci	 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
110262306a36Sopenharmony_ci	 * so we have to determine which memslots to use based on context
110362306a36Sopenharmony_ci	 * information in sp->role.
110462306a36Sopenharmony_ci	 */
110562306a36Sopenharmony_ci	slots = kvm_memslots_for_spte_role(kvm, sp->role);
110662306a36Sopenharmony_ci
110762306a36Sopenharmony_ci	slot = __gfn_to_memslot(slots, gfn);
110862306a36Sopenharmony_ci	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
110962306a36Sopenharmony_ci
111062306a36Sopenharmony_ci	pte_list_remove(kvm, spte, rmap_head);
111162306a36Sopenharmony_ci}
111262306a36Sopenharmony_ci
111362306a36Sopenharmony_ci/*
111462306a36Sopenharmony_ci * Used by the following functions to iterate through the sptes linked by a
111562306a36Sopenharmony_ci * rmap.  All fields are private and not assumed to be used outside.
111662306a36Sopenharmony_ci */
111762306a36Sopenharmony_cistruct rmap_iterator {
111862306a36Sopenharmony_ci	/* private fields */
111962306a36Sopenharmony_ci	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
112062306a36Sopenharmony_ci	int pos;			/* index of the sptep */
112162306a36Sopenharmony_ci};
112262306a36Sopenharmony_ci
112362306a36Sopenharmony_ci/*
112462306a36Sopenharmony_ci * Iteration must be started by this function.  This should also be used after
112562306a36Sopenharmony_ci * removing/dropping sptes from the rmap link because in such cases the
112662306a36Sopenharmony_ci * information in the iterator may not be valid.
112762306a36Sopenharmony_ci *
112862306a36Sopenharmony_ci * Returns sptep if found, NULL otherwise.
112962306a36Sopenharmony_ci */
113062306a36Sopenharmony_cistatic u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
113162306a36Sopenharmony_ci			   struct rmap_iterator *iter)
113262306a36Sopenharmony_ci{
113362306a36Sopenharmony_ci	u64 *sptep;
113462306a36Sopenharmony_ci
113562306a36Sopenharmony_ci	if (!rmap_head->val)
113662306a36Sopenharmony_ci		return NULL;
113762306a36Sopenharmony_ci
113862306a36Sopenharmony_ci	if (!(rmap_head->val & 1)) {
113962306a36Sopenharmony_ci		iter->desc = NULL;
114062306a36Sopenharmony_ci		sptep = (u64 *)rmap_head->val;
114162306a36Sopenharmony_ci		goto out;
114262306a36Sopenharmony_ci	}
114362306a36Sopenharmony_ci
114462306a36Sopenharmony_ci	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
114562306a36Sopenharmony_ci	iter->pos = 0;
114662306a36Sopenharmony_ci	sptep = iter->desc->sptes[iter->pos];
114762306a36Sopenharmony_ciout:
114862306a36Sopenharmony_ci	BUG_ON(!is_shadow_present_pte(*sptep));
114962306a36Sopenharmony_ci	return sptep;
115062306a36Sopenharmony_ci}
115162306a36Sopenharmony_ci
115262306a36Sopenharmony_ci/*
115362306a36Sopenharmony_ci * Must be used with a valid iterator: e.g. after rmap_get_first().
115462306a36Sopenharmony_ci *
115562306a36Sopenharmony_ci * Returns sptep if found, NULL otherwise.
115662306a36Sopenharmony_ci */
115762306a36Sopenharmony_cistatic u64 *rmap_get_next(struct rmap_iterator *iter)
115862306a36Sopenharmony_ci{
115962306a36Sopenharmony_ci	u64 *sptep;
116062306a36Sopenharmony_ci
116162306a36Sopenharmony_ci	if (iter->desc) {
116262306a36Sopenharmony_ci		if (iter->pos < PTE_LIST_EXT - 1) {
116362306a36Sopenharmony_ci			++iter->pos;
116462306a36Sopenharmony_ci			sptep = iter->desc->sptes[iter->pos];
116562306a36Sopenharmony_ci			if (sptep)
116662306a36Sopenharmony_ci				goto out;
116762306a36Sopenharmony_ci		}
116862306a36Sopenharmony_ci
116962306a36Sopenharmony_ci		iter->desc = iter->desc->more;
117062306a36Sopenharmony_ci
117162306a36Sopenharmony_ci		if (iter->desc) {
117262306a36Sopenharmony_ci			iter->pos = 0;
117362306a36Sopenharmony_ci			/* desc->sptes[0] cannot be NULL */
117462306a36Sopenharmony_ci			sptep = iter->desc->sptes[iter->pos];
117562306a36Sopenharmony_ci			goto out;
117662306a36Sopenharmony_ci		}
117762306a36Sopenharmony_ci	}
117862306a36Sopenharmony_ci
117962306a36Sopenharmony_ci	return NULL;
118062306a36Sopenharmony_ciout:
118162306a36Sopenharmony_ci	BUG_ON(!is_shadow_present_pte(*sptep));
118262306a36Sopenharmony_ci	return sptep;
118362306a36Sopenharmony_ci}
118462306a36Sopenharmony_ci
118562306a36Sopenharmony_ci#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
118662306a36Sopenharmony_ci	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
118762306a36Sopenharmony_ci	     _spte_; _spte_ = rmap_get_next(_iter_))
118862306a36Sopenharmony_ci
118962306a36Sopenharmony_cistatic void drop_spte(struct kvm *kvm, u64 *sptep)
119062306a36Sopenharmony_ci{
119162306a36Sopenharmony_ci	u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
119262306a36Sopenharmony_ci
119362306a36Sopenharmony_ci	if (is_shadow_present_pte(old_spte))
119462306a36Sopenharmony_ci		rmap_remove(kvm, sptep);
119562306a36Sopenharmony_ci}
119662306a36Sopenharmony_ci
119762306a36Sopenharmony_cistatic void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
119862306a36Sopenharmony_ci{
119962306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
120062306a36Sopenharmony_ci
120162306a36Sopenharmony_ci	sp = sptep_to_sp(sptep);
120262306a36Sopenharmony_ci	WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
120362306a36Sopenharmony_ci
120462306a36Sopenharmony_ci	drop_spte(kvm, sptep);
120562306a36Sopenharmony_ci
120662306a36Sopenharmony_ci	if (flush)
120762306a36Sopenharmony_ci		kvm_flush_remote_tlbs_sptep(kvm, sptep);
120862306a36Sopenharmony_ci}
120962306a36Sopenharmony_ci
121062306a36Sopenharmony_ci/*
121162306a36Sopenharmony_ci * Write-protect on the specified @sptep, @pt_protect indicates whether
121262306a36Sopenharmony_ci * spte write-protection is caused by protecting shadow page table.
121362306a36Sopenharmony_ci *
121462306a36Sopenharmony_ci * Note: write protection is difference between dirty logging and spte
121562306a36Sopenharmony_ci * protection:
121662306a36Sopenharmony_ci * - for dirty logging, the spte can be set to writable at anytime if
121762306a36Sopenharmony_ci *   its dirty bitmap is properly set.
121862306a36Sopenharmony_ci * - for spte protection, the spte can be writable only after unsync-ing
121962306a36Sopenharmony_ci *   shadow page.
122062306a36Sopenharmony_ci *
122162306a36Sopenharmony_ci * Return true if tlb need be flushed.
122262306a36Sopenharmony_ci */
122362306a36Sopenharmony_cistatic bool spte_write_protect(u64 *sptep, bool pt_protect)
122462306a36Sopenharmony_ci{
122562306a36Sopenharmony_ci	u64 spte = *sptep;
122662306a36Sopenharmony_ci
122762306a36Sopenharmony_ci	if (!is_writable_pte(spte) &&
122862306a36Sopenharmony_ci	    !(pt_protect && is_mmu_writable_spte(spte)))
122962306a36Sopenharmony_ci		return false;
123062306a36Sopenharmony_ci
123162306a36Sopenharmony_ci	if (pt_protect)
123262306a36Sopenharmony_ci		spte &= ~shadow_mmu_writable_mask;
123362306a36Sopenharmony_ci	spte = spte & ~PT_WRITABLE_MASK;
123462306a36Sopenharmony_ci
123562306a36Sopenharmony_ci	return mmu_spte_update(sptep, spte);
123662306a36Sopenharmony_ci}
123762306a36Sopenharmony_ci
123862306a36Sopenharmony_cistatic bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
123962306a36Sopenharmony_ci			       bool pt_protect)
124062306a36Sopenharmony_ci{
124162306a36Sopenharmony_ci	u64 *sptep;
124262306a36Sopenharmony_ci	struct rmap_iterator iter;
124362306a36Sopenharmony_ci	bool flush = false;
124462306a36Sopenharmony_ci
124562306a36Sopenharmony_ci	for_each_rmap_spte(rmap_head, &iter, sptep)
124662306a36Sopenharmony_ci		flush |= spte_write_protect(sptep, pt_protect);
124762306a36Sopenharmony_ci
124862306a36Sopenharmony_ci	return flush;
124962306a36Sopenharmony_ci}
125062306a36Sopenharmony_ci
125162306a36Sopenharmony_cistatic bool spte_clear_dirty(u64 *sptep)
125262306a36Sopenharmony_ci{
125362306a36Sopenharmony_ci	u64 spte = *sptep;
125462306a36Sopenharmony_ci
125562306a36Sopenharmony_ci	KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
125662306a36Sopenharmony_ci	spte &= ~shadow_dirty_mask;
125762306a36Sopenharmony_ci	return mmu_spte_update(sptep, spte);
125862306a36Sopenharmony_ci}
125962306a36Sopenharmony_ci
126062306a36Sopenharmony_cistatic bool spte_wrprot_for_clear_dirty(u64 *sptep)
126162306a36Sopenharmony_ci{
126262306a36Sopenharmony_ci	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
126362306a36Sopenharmony_ci					       (unsigned long *)sptep);
126462306a36Sopenharmony_ci	if (was_writable && !spte_ad_enabled(*sptep))
126562306a36Sopenharmony_ci		kvm_set_pfn_dirty(spte_to_pfn(*sptep));
126662306a36Sopenharmony_ci
126762306a36Sopenharmony_ci	return was_writable;
126862306a36Sopenharmony_ci}
126962306a36Sopenharmony_ci
127062306a36Sopenharmony_ci/*
127162306a36Sopenharmony_ci * Gets the GFN ready for another round of dirty logging by clearing the
127262306a36Sopenharmony_ci *	- D bit on ad-enabled SPTEs, and
127362306a36Sopenharmony_ci *	- W bit on ad-disabled SPTEs.
127462306a36Sopenharmony_ci * Returns true iff any D or W bits were cleared.
127562306a36Sopenharmony_ci */
127662306a36Sopenharmony_cistatic bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
127762306a36Sopenharmony_ci			       const struct kvm_memory_slot *slot)
127862306a36Sopenharmony_ci{
127962306a36Sopenharmony_ci	u64 *sptep;
128062306a36Sopenharmony_ci	struct rmap_iterator iter;
128162306a36Sopenharmony_ci	bool flush = false;
128262306a36Sopenharmony_ci
128362306a36Sopenharmony_ci	for_each_rmap_spte(rmap_head, &iter, sptep)
128462306a36Sopenharmony_ci		if (spte_ad_need_write_protect(*sptep))
128562306a36Sopenharmony_ci			flush |= spte_wrprot_for_clear_dirty(sptep);
128662306a36Sopenharmony_ci		else
128762306a36Sopenharmony_ci			flush |= spte_clear_dirty(sptep);
128862306a36Sopenharmony_ci
128962306a36Sopenharmony_ci	return flush;
129062306a36Sopenharmony_ci}
129162306a36Sopenharmony_ci
129262306a36Sopenharmony_ci/**
129362306a36Sopenharmony_ci * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
129462306a36Sopenharmony_ci * @kvm: kvm instance
129562306a36Sopenharmony_ci * @slot: slot to protect
129662306a36Sopenharmony_ci * @gfn_offset: start of the BITS_PER_LONG pages we care about
129762306a36Sopenharmony_ci * @mask: indicates which pages we should protect
129862306a36Sopenharmony_ci *
129962306a36Sopenharmony_ci * Used when we do not need to care about huge page mappings.
130062306a36Sopenharmony_ci */
130162306a36Sopenharmony_cistatic void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
130262306a36Sopenharmony_ci				     struct kvm_memory_slot *slot,
130362306a36Sopenharmony_ci				     gfn_t gfn_offset, unsigned long mask)
130462306a36Sopenharmony_ci{
130562306a36Sopenharmony_ci	struct kvm_rmap_head *rmap_head;
130662306a36Sopenharmony_ci
130762306a36Sopenharmony_ci	if (tdp_mmu_enabled)
130862306a36Sopenharmony_ci		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
130962306a36Sopenharmony_ci				slot->base_gfn + gfn_offset, mask, true);
131062306a36Sopenharmony_ci
131162306a36Sopenharmony_ci	if (!kvm_memslots_have_rmaps(kvm))
131262306a36Sopenharmony_ci		return;
131362306a36Sopenharmony_ci
131462306a36Sopenharmony_ci	while (mask) {
131562306a36Sopenharmony_ci		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
131662306a36Sopenharmony_ci					PG_LEVEL_4K, slot);
131762306a36Sopenharmony_ci		rmap_write_protect(rmap_head, false);
131862306a36Sopenharmony_ci
131962306a36Sopenharmony_ci		/* clear the first set bit */
132062306a36Sopenharmony_ci		mask &= mask - 1;
132162306a36Sopenharmony_ci	}
132262306a36Sopenharmony_ci}
132362306a36Sopenharmony_ci
132462306a36Sopenharmony_ci/**
132562306a36Sopenharmony_ci * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
132662306a36Sopenharmony_ci * protect the page if the D-bit isn't supported.
132762306a36Sopenharmony_ci * @kvm: kvm instance
132862306a36Sopenharmony_ci * @slot: slot to clear D-bit
132962306a36Sopenharmony_ci * @gfn_offset: start of the BITS_PER_LONG pages we care about
133062306a36Sopenharmony_ci * @mask: indicates which pages we should clear D-bit
133162306a36Sopenharmony_ci *
133262306a36Sopenharmony_ci * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
133362306a36Sopenharmony_ci */
133462306a36Sopenharmony_cistatic void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
133562306a36Sopenharmony_ci					 struct kvm_memory_slot *slot,
133662306a36Sopenharmony_ci					 gfn_t gfn_offset, unsigned long mask)
133762306a36Sopenharmony_ci{
133862306a36Sopenharmony_ci	struct kvm_rmap_head *rmap_head;
133962306a36Sopenharmony_ci
134062306a36Sopenharmony_ci	if (tdp_mmu_enabled)
134162306a36Sopenharmony_ci		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
134262306a36Sopenharmony_ci				slot->base_gfn + gfn_offset, mask, false);
134362306a36Sopenharmony_ci
134462306a36Sopenharmony_ci	if (!kvm_memslots_have_rmaps(kvm))
134562306a36Sopenharmony_ci		return;
134662306a36Sopenharmony_ci
134762306a36Sopenharmony_ci	while (mask) {
134862306a36Sopenharmony_ci		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
134962306a36Sopenharmony_ci					PG_LEVEL_4K, slot);
135062306a36Sopenharmony_ci		__rmap_clear_dirty(kvm, rmap_head, slot);
135162306a36Sopenharmony_ci
135262306a36Sopenharmony_ci		/* clear the first set bit */
135362306a36Sopenharmony_ci		mask &= mask - 1;
135462306a36Sopenharmony_ci	}
135562306a36Sopenharmony_ci}
135662306a36Sopenharmony_ci
135762306a36Sopenharmony_ci/**
135862306a36Sopenharmony_ci * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
135962306a36Sopenharmony_ci * PT level pages.
136062306a36Sopenharmony_ci *
136162306a36Sopenharmony_ci * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
136262306a36Sopenharmony_ci * enable dirty logging for them.
136362306a36Sopenharmony_ci *
136462306a36Sopenharmony_ci * We need to care about huge page mappings: e.g. during dirty logging we may
136562306a36Sopenharmony_ci * have such mappings.
136662306a36Sopenharmony_ci */
136762306a36Sopenharmony_civoid kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
136862306a36Sopenharmony_ci				struct kvm_memory_slot *slot,
136962306a36Sopenharmony_ci				gfn_t gfn_offset, unsigned long mask)
137062306a36Sopenharmony_ci{
137162306a36Sopenharmony_ci	/*
137262306a36Sopenharmony_ci	 * Huge pages are NOT write protected when we start dirty logging in
137362306a36Sopenharmony_ci	 * initially-all-set mode; must write protect them here so that they
137462306a36Sopenharmony_ci	 * are split to 4K on the first write.
137562306a36Sopenharmony_ci	 *
137662306a36Sopenharmony_ci	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
137762306a36Sopenharmony_ci	 * of memslot has no such restriction, so the range can cross two large
137862306a36Sopenharmony_ci	 * pages.
137962306a36Sopenharmony_ci	 */
138062306a36Sopenharmony_ci	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
138162306a36Sopenharmony_ci		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
138262306a36Sopenharmony_ci		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
138362306a36Sopenharmony_ci
138462306a36Sopenharmony_ci		if (READ_ONCE(eager_page_split))
138562306a36Sopenharmony_ci			kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
138662306a36Sopenharmony_ci
138762306a36Sopenharmony_ci		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
138862306a36Sopenharmony_ci
138962306a36Sopenharmony_ci		/* Cross two large pages? */
139062306a36Sopenharmony_ci		if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
139162306a36Sopenharmony_ci		    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
139262306a36Sopenharmony_ci			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
139362306a36Sopenharmony_ci						       PG_LEVEL_2M);
139462306a36Sopenharmony_ci	}
139562306a36Sopenharmony_ci
139662306a36Sopenharmony_ci	/* Now handle 4K PTEs.  */
139762306a36Sopenharmony_ci	if (kvm_x86_ops.cpu_dirty_log_size)
139862306a36Sopenharmony_ci		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
139962306a36Sopenharmony_ci	else
140062306a36Sopenharmony_ci		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
140162306a36Sopenharmony_ci}
140262306a36Sopenharmony_ci
140362306a36Sopenharmony_ciint kvm_cpu_dirty_log_size(void)
140462306a36Sopenharmony_ci{
140562306a36Sopenharmony_ci	return kvm_x86_ops.cpu_dirty_log_size;
140662306a36Sopenharmony_ci}
140762306a36Sopenharmony_ci
140862306a36Sopenharmony_cibool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
140962306a36Sopenharmony_ci				    struct kvm_memory_slot *slot, u64 gfn,
141062306a36Sopenharmony_ci				    int min_level)
141162306a36Sopenharmony_ci{
141262306a36Sopenharmony_ci	struct kvm_rmap_head *rmap_head;
141362306a36Sopenharmony_ci	int i;
141462306a36Sopenharmony_ci	bool write_protected = false;
141562306a36Sopenharmony_ci
141662306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm)) {
141762306a36Sopenharmony_ci		for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
141862306a36Sopenharmony_ci			rmap_head = gfn_to_rmap(gfn, i, slot);
141962306a36Sopenharmony_ci			write_protected |= rmap_write_protect(rmap_head, true);
142062306a36Sopenharmony_ci		}
142162306a36Sopenharmony_ci	}
142262306a36Sopenharmony_ci
142362306a36Sopenharmony_ci	if (tdp_mmu_enabled)
142462306a36Sopenharmony_ci		write_protected |=
142562306a36Sopenharmony_ci			kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
142662306a36Sopenharmony_ci
142762306a36Sopenharmony_ci	return write_protected;
142862306a36Sopenharmony_ci}
142962306a36Sopenharmony_ci
143062306a36Sopenharmony_cistatic bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
143162306a36Sopenharmony_ci{
143262306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
143362306a36Sopenharmony_ci
143462306a36Sopenharmony_ci	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
143562306a36Sopenharmony_ci	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
143662306a36Sopenharmony_ci}
143762306a36Sopenharmony_ci
143862306a36Sopenharmony_cistatic bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
143962306a36Sopenharmony_ci			   const struct kvm_memory_slot *slot)
144062306a36Sopenharmony_ci{
144162306a36Sopenharmony_ci	return kvm_zap_all_rmap_sptes(kvm, rmap_head);
144262306a36Sopenharmony_ci}
144362306a36Sopenharmony_ci
144462306a36Sopenharmony_cistatic bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
144562306a36Sopenharmony_ci			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
144662306a36Sopenharmony_ci			 pte_t unused)
144762306a36Sopenharmony_ci{
144862306a36Sopenharmony_ci	return __kvm_zap_rmap(kvm, rmap_head, slot);
144962306a36Sopenharmony_ci}
145062306a36Sopenharmony_ci
145162306a36Sopenharmony_cistatic bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
145262306a36Sopenharmony_ci			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
145362306a36Sopenharmony_ci			     pte_t pte)
145462306a36Sopenharmony_ci{
145562306a36Sopenharmony_ci	u64 *sptep;
145662306a36Sopenharmony_ci	struct rmap_iterator iter;
145762306a36Sopenharmony_ci	bool need_flush = false;
145862306a36Sopenharmony_ci	u64 new_spte;
145962306a36Sopenharmony_ci	kvm_pfn_t new_pfn;
146062306a36Sopenharmony_ci
146162306a36Sopenharmony_ci	WARN_ON_ONCE(pte_huge(pte));
146262306a36Sopenharmony_ci	new_pfn = pte_pfn(pte);
146362306a36Sopenharmony_ci
146462306a36Sopenharmony_cirestart:
146562306a36Sopenharmony_ci	for_each_rmap_spte(rmap_head, &iter, sptep) {
146662306a36Sopenharmony_ci		need_flush = true;
146762306a36Sopenharmony_ci
146862306a36Sopenharmony_ci		if (pte_write(pte)) {
146962306a36Sopenharmony_ci			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
147062306a36Sopenharmony_ci			goto restart;
147162306a36Sopenharmony_ci		} else {
147262306a36Sopenharmony_ci			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
147362306a36Sopenharmony_ci					*sptep, new_pfn);
147462306a36Sopenharmony_ci
147562306a36Sopenharmony_ci			mmu_spte_clear_track_bits(kvm, sptep);
147662306a36Sopenharmony_ci			mmu_spte_set(sptep, new_spte);
147762306a36Sopenharmony_ci		}
147862306a36Sopenharmony_ci	}
147962306a36Sopenharmony_ci
148062306a36Sopenharmony_ci	if (need_flush && kvm_available_flush_remote_tlbs_range()) {
148162306a36Sopenharmony_ci		kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
148262306a36Sopenharmony_ci		return false;
148362306a36Sopenharmony_ci	}
148462306a36Sopenharmony_ci
148562306a36Sopenharmony_ci	return need_flush;
148662306a36Sopenharmony_ci}
148762306a36Sopenharmony_ci
148862306a36Sopenharmony_cistruct slot_rmap_walk_iterator {
148962306a36Sopenharmony_ci	/* input fields. */
149062306a36Sopenharmony_ci	const struct kvm_memory_slot *slot;
149162306a36Sopenharmony_ci	gfn_t start_gfn;
149262306a36Sopenharmony_ci	gfn_t end_gfn;
149362306a36Sopenharmony_ci	int start_level;
149462306a36Sopenharmony_ci	int end_level;
149562306a36Sopenharmony_ci
149662306a36Sopenharmony_ci	/* output fields. */
149762306a36Sopenharmony_ci	gfn_t gfn;
149862306a36Sopenharmony_ci	struct kvm_rmap_head *rmap;
149962306a36Sopenharmony_ci	int level;
150062306a36Sopenharmony_ci
150162306a36Sopenharmony_ci	/* private field. */
150262306a36Sopenharmony_ci	struct kvm_rmap_head *end_rmap;
150362306a36Sopenharmony_ci};
150462306a36Sopenharmony_ci
150562306a36Sopenharmony_cistatic void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
150662306a36Sopenharmony_ci				 int level)
150762306a36Sopenharmony_ci{
150862306a36Sopenharmony_ci	iterator->level = level;
150962306a36Sopenharmony_ci	iterator->gfn = iterator->start_gfn;
151062306a36Sopenharmony_ci	iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
151162306a36Sopenharmony_ci	iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
151262306a36Sopenharmony_ci}
151362306a36Sopenharmony_ci
151462306a36Sopenharmony_cistatic void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
151562306a36Sopenharmony_ci				const struct kvm_memory_slot *slot,
151662306a36Sopenharmony_ci				int start_level, int end_level,
151762306a36Sopenharmony_ci				gfn_t start_gfn, gfn_t end_gfn)
151862306a36Sopenharmony_ci{
151962306a36Sopenharmony_ci	iterator->slot = slot;
152062306a36Sopenharmony_ci	iterator->start_level = start_level;
152162306a36Sopenharmony_ci	iterator->end_level = end_level;
152262306a36Sopenharmony_ci	iterator->start_gfn = start_gfn;
152362306a36Sopenharmony_ci	iterator->end_gfn = end_gfn;
152462306a36Sopenharmony_ci
152562306a36Sopenharmony_ci	rmap_walk_init_level(iterator, iterator->start_level);
152662306a36Sopenharmony_ci}
152762306a36Sopenharmony_ci
152862306a36Sopenharmony_cistatic bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
152962306a36Sopenharmony_ci{
153062306a36Sopenharmony_ci	return !!iterator->rmap;
153162306a36Sopenharmony_ci}
153262306a36Sopenharmony_ci
153362306a36Sopenharmony_cistatic void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
153462306a36Sopenharmony_ci{
153562306a36Sopenharmony_ci	while (++iterator->rmap <= iterator->end_rmap) {
153662306a36Sopenharmony_ci		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
153762306a36Sopenharmony_ci
153862306a36Sopenharmony_ci		if (iterator->rmap->val)
153962306a36Sopenharmony_ci			return;
154062306a36Sopenharmony_ci	}
154162306a36Sopenharmony_ci
154262306a36Sopenharmony_ci	if (++iterator->level > iterator->end_level) {
154362306a36Sopenharmony_ci		iterator->rmap = NULL;
154462306a36Sopenharmony_ci		return;
154562306a36Sopenharmony_ci	}
154662306a36Sopenharmony_ci
154762306a36Sopenharmony_ci	rmap_walk_init_level(iterator, iterator->level);
154862306a36Sopenharmony_ci}
154962306a36Sopenharmony_ci
155062306a36Sopenharmony_ci#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
155162306a36Sopenharmony_ci	   _start_gfn, _end_gfn, _iter_)				\
155262306a36Sopenharmony_ci	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
155362306a36Sopenharmony_ci				 _end_level_, _start_gfn, _end_gfn);	\
155462306a36Sopenharmony_ci	     slot_rmap_walk_okay(_iter_);				\
155562306a36Sopenharmony_ci	     slot_rmap_walk_next(_iter_))
155662306a36Sopenharmony_ci
155762306a36Sopenharmony_citypedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
155862306a36Sopenharmony_ci			       struct kvm_memory_slot *slot, gfn_t gfn,
155962306a36Sopenharmony_ci			       int level, pte_t pte);
156062306a36Sopenharmony_ci
156162306a36Sopenharmony_cistatic __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
156262306a36Sopenharmony_ci						 struct kvm_gfn_range *range,
156362306a36Sopenharmony_ci						 rmap_handler_t handler)
156462306a36Sopenharmony_ci{
156562306a36Sopenharmony_ci	struct slot_rmap_walk_iterator iterator;
156662306a36Sopenharmony_ci	bool ret = false;
156762306a36Sopenharmony_ci
156862306a36Sopenharmony_ci	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
156962306a36Sopenharmony_ci				 range->start, range->end - 1, &iterator)
157062306a36Sopenharmony_ci		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
157162306a36Sopenharmony_ci			       iterator.level, range->arg.pte);
157262306a36Sopenharmony_ci
157362306a36Sopenharmony_ci	return ret;
157462306a36Sopenharmony_ci}
157562306a36Sopenharmony_ci
157662306a36Sopenharmony_cibool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
157762306a36Sopenharmony_ci{
157862306a36Sopenharmony_ci	bool flush = false;
157962306a36Sopenharmony_ci
158062306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm))
158162306a36Sopenharmony_ci		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
158262306a36Sopenharmony_ci
158362306a36Sopenharmony_ci	if (tdp_mmu_enabled)
158462306a36Sopenharmony_ci		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
158562306a36Sopenharmony_ci
158662306a36Sopenharmony_ci	if (kvm_x86_ops.set_apic_access_page_addr &&
158762306a36Sopenharmony_ci	    range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
158862306a36Sopenharmony_ci		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
158962306a36Sopenharmony_ci
159062306a36Sopenharmony_ci	return flush;
159162306a36Sopenharmony_ci}
159262306a36Sopenharmony_ci
159362306a36Sopenharmony_cibool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
159462306a36Sopenharmony_ci{
159562306a36Sopenharmony_ci	bool flush = false;
159662306a36Sopenharmony_ci
159762306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm))
159862306a36Sopenharmony_ci		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
159962306a36Sopenharmony_ci
160062306a36Sopenharmony_ci	if (tdp_mmu_enabled)
160162306a36Sopenharmony_ci		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
160262306a36Sopenharmony_ci
160362306a36Sopenharmony_ci	return flush;
160462306a36Sopenharmony_ci}
160562306a36Sopenharmony_ci
160662306a36Sopenharmony_cistatic bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
160762306a36Sopenharmony_ci			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
160862306a36Sopenharmony_ci			 pte_t unused)
160962306a36Sopenharmony_ci{
161062306a36Sopenharmony_ci	u64 *sptep;
161162306a36Sopenharmony_ci	struct rmap_iterator iter;
161262306a36Sopenharmony_ci	int young = 0;
161362306a36Sopenharmony_ci
161462306a36Sopenharmony_ci	for_each_rmap_spte(rmap_head, &iter, sptep)
161562306a36Sopenharmony_ci		young |= mmu_spte_age(sptep);
161662306a36Sopenharmony_ci
161762306a36Sopenharmony_ci	return young;
161862306a36Sopenharmony_ci}
161962306a36Sopenharmony_ci
162062306a36Sopenharmony_cistatic bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
162162306a36Sopenharmony_ci			      struct kvm_memory_slot *slot, gfn_t gfn,
162262306a36Sopenharmony_ci			      int level, pte_t unused)
162362306a36Sopenharmony_ci{
162462306a36Sopenharmony_ci	u64 *sptep;
162562306a36Sopenharmony_ci	struct rmap_iterator iter;
162662306a36Sopenharmony_ci
162762306a36Sopenharmony_ci	for_each_rmap_spte(rmap_head, &iter, sptep)
162862306a36Sopenharmony_ci		if (is_accessed_spte(*sptep))
162962306a36Sopenharmony_ci			return true;
163062306a36Sopenharmony_ci	return false;
163162306a36Sopenharmony_ci}
163262306a36Sopenharmony_ci
163362306a36Sopenharmony_ci#define RMAP_RECYCLE_THRESHOLD 1000
163462306a36Sopenharmony_ci
163562306a36Sopenharmony_cistatic void __rmap_add(struct kvm *kvm,
163662306a36Sopenharmony_ci		       struct kvm_mmu_memory_cache *cache,
163762306a36Sopenharmony_ci		       const struct kvm_memory_slot *slot,
163862306a36Sopenharmony_ci		       u64 *spte, gfn_t gfn, unsigned int access)
163962306a36Sopenharmony_ci{
164062306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
164162306a36Sopenharmony_ci	struct kvm_rmap_head *rmap_head;
164262306a36Sopenharmony_ci	int rmap_count;
164362306a36Sopenharmony_ci
164462306a36Sopenharmony_ci	sp = sptep_to_sp(spte);
164562306a36Sopenharmony_ci	kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
164662306a36Sopenharmony_ci	kvm_update_page_stats(kvm, sp->role.level, 1);
164762306a36Sopenharmony_ci
164862306a36Sopenharmony_ci	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
164962306a36Sopenharmony_ci	rmap_count = pte_list_add(cache, spte, rmap_head);
165062306a36Sopenharmony_ci
165162306a36Sopenharmony_ci	if (rmap_count > kvm->stat.max_mmu_rmap_size)
165262306a36Sopenharmony_ci		kvm->stat.max_mmu_rmap_size = rmap_count;
165362306a36Sopenharmony_ci	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
165462306a36Sopenharmony_ci		kvm_zap_all_rmap_sptes(kvm, rmap_head);
165562306a36Sopenharmony_ci		kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
165662306a36Sopenharmony_ci	}
165762306a36Sopenharmony_ci}
165862306a36Sopenharmony_ci
165962306a36Sopenharmony_cistatic void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
166062306a36Sopenharmony_ci		     u64 *spte, gfn_t gfn, unsigned int access)
166162306a36Sopenharmony_ci{
166262306a36Sopenharmony_ci	struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
166362306a36Sopenharmony_ci
166462306a36Sopenharmony_ci	__rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
166562306a36Sopenharmony_ci}
166662306a36Sopenharmony_ci
166762306a36Sopenharmony_cibool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
166862306a36Sopenharmony_ci{
166962306a36Sopenharmony_ci	bool young = false;
167062306a36Sopenharmony_ci
167162306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm))
167262306a36Sopenharmony_ci		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
167362306a36Sopenharmony_ci
167462306a36Sopenharmony_ci	if (tdp_mmu_enabled)
167562306a36Sopenharmony_ci		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
167662306a36Sopenharmony_ci
167762306a36Sopenharmony_ci	return young;
167862306a36Sopenharmony_ci}
167962306a36Sopenharmony_ci
168062306a36Sopenharmony_cibool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
168162306a36Sopenharmony_ci{
168262306a36Sopenharmony_ci	bool young = false;
168362306a36Sopenharmony_ci
168462306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm))
168562306a36Sopenharmony_ci		young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
168662306a36Sopenharmony_ci
168762306a36Sopenharmony_ci	if (tdp_mmu_enabled)
168862306a36Sopenharmony_ci		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
168962306a36Sopenharmony_ci
169062306a36Sopenharmony_ci	return young;
169162306a36Sopenharmony_ci}
169262306a36Sopenharmony_ci
169362306a36Sopenharmony_cistatic void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
169462306a36Sopenharmony_ci{
169562306a36Sopenharmony_ci#ifdef CONFIG_KVM_PROVE_MMU
169662306a36Sopenharmony_ci	int i;
169762306a36Sopenharmony_ci
169862306a36Sopenharmony_ci	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
169962306a36Sopenharmony_ci		if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
170062306a36Sopenharmony_ci			pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
170162306a36Sopenharmony_ci					   sp->spt[i], &sp->spt[i],
170262306a36Sopenharmony_ci					   kvm_mmu_page_get_gfn(sp, i));
170362306a36Sopenharmony_ci	}
170462306a36Sopenharmony_ci#endif
170562306a36Sopenharmony_ci}
170662306a36Sopenharmony_ci
170762306a36Sopenharmony_ci/*
170862306a36Sopenharmony_ci * This value is the sum of all of the kvm instances's
170962306a36Sopenharmony_ci * kvm->arch.n_used_mmu_pages values.  We need a global,
171062306a36Sopenharmony_ci * aggregate version in order to make the slab shrinker
171162306a36Sopenharmony_ci * faster
171262306a36Sopenharmony_ci */
171362306a36Sopenharmony_cistatic inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
171462306a36Sopenharmony_ci{
171562306a36Sopenharmony_ci	kvm->arch.n_used_mmu_pages += nr;
171662306a36Sopenharmony_ci	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
171762306a36Sopenharmony_ci}
171862306a36Sopenharmony_ci
171962306a36Sopenharmony_cistatic void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
172062306a36Sopenharmony_ci{
172162306a36Sopenharmony_ci	kvm_mod_used_mmu_pages(kvm, +1);
172262306a36Sopenharmony_ci	kvm_account_pgtable_pages((void *)sp->spt, +1);
172362306a36Sopenharmony_ci}
172462306a36Sopenharmony_ci
172562306a36Sopenharmony_cistatic void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
172662306a36Sopenharmony_ci{
172762306a36Sopenharmony_ci	kvm_mod_used_mmu_pages(kvm, -1);
172862306a36Sopenharmony_ci	kvm_account_pgtable_pages((void *)sp->spt, -1);
172962306a36Sopenharmony_ci}
173062306a36Sopenharmony_ci
173162306a36Sopenharmony_cistatic void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
173262306a36Sopenharmony_ci{
173362306a36Sopenharmony_ci	kvm_mmu_check_sptes_at_free(sp);
173462306a36Sopenharmony_ci
173562306a36Sopenharmony_ci	hlist_del(&sp->hash_link);
173662306a36Sopenharmony_ci	list_del(&sp->link);
173762306a36Sopenharmony_ci	free_page((unsigned long)sp->spt);
173862306a36Sopenharmony_ci	if (!sp->role.direct)
173962306a36Sopenharmony_ci		free_page((unsigned long)sp->shadowed_translation);
174062306a36Sopenharmony_ci	kmem_cache_free(mmu_page_header_cache, sp);
174162306a36Sopenharmony_ci}
174262306a36Sopenharmony_ci
174362306a36Sopenharmony_cistatic unsigned kvm_page_table_hashfn(gfn_t gfn)
174462306a36Sopenharmony_ci{
174562306a36Sopenharmony_ci	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
174662306a36Sopenharmony_ci}
174762306a36Sopenharmony_ci
174862306a36Sopenharmony_cistatic void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
174962306a36Sopenharmony_ci				    struct kvm_mmu_page *sp, u64 *parent_pte)
175062306a36Sopenharmony_ci{
175162306a36Sopenharmony_ci	if (!parent_pte)
175262306a36Sopenharmony_ci		return;
175362306a36Sopenharmony_ci
175462306a36Sopenharmony_ci	pte_list_add(cache, parent_pte, &sp->parent_ptes);
175562306a36Sopenharmony_ci}
175662306a36Sopenharmony_ci
175762306a36Sopenharmony_cistatic void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
175862306a36Sopenharmony_ci				       u64 *parent_pte)
175962306a36Sopenharmony_ci{
176062306a36Sopenharmony_ci	pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
176162306a36Sopenharmony_ci}
176262306a36Sopenharmony_ci
176362306a36Sopenharmony_cistatic void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
176462306a36Sopenharmony_ci			    u64 *parent_pte)
176562306a36Sopenharmony_ci{
176662306a36Sopenharmony_ci	mmu_page_remove_parent_pte(kvm, sp, parent_pte);
176762306a36Sopenharmony_ci	mmu_spte_clear_no_track(parent_pte);
176862306a36Sopenharmony_ci}
176962306a36Sopenharmony_ci
177062306a36Sopenharmony_cistatic void mark_unsync(u64 *spte);
177162306a36Sopenharmony_cistatic void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
177262306a36Sopenharmony_ci{
177362306a36Sopenharmony_ci	u64 *sptep;
177462306a36Sopenharmony_ci	struct rmap_iterator iter;
177562306a36Sopenharmony_ci
177662306a36Sopenharmony_ci	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
177762306a36Sopenharmony_ci		mark_unsync(sptep);
177862306a36Sopenharmony_ci	}
177962306a36Sopenharmony_ci}
178062306a36Sopenharmony_ci
178162306a36Sopenharmony_cistatic void mark_unsync(u64 *spte)
178262306a36Sopenharmony_ci{
178362306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
178462306a36Sopenharmony_ci
178562306a36Sopenharmony_ci	sp = sptep_to_sp(spte);
178662306a36Sopenharmony_ci	if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
178762306a36Sopenharmony_ci		return;
178862306a36Sopenharmony_ci	if (sp->unsync_children++)
178962306a36Sopenharmony_ci		return;
179062306a36Sopenharmony_ci	kvm_mmu_mark_parents_unsync(sp);
179162306a36Sopenharmony_ci}
179262306a36Sopenharmony_ci
179362306a36Sopenharmony_ci#define KVM_PAGE_ARRAY_NR 16
179462306a36Sopenharmony_ci
179562306a36Sopenharmony_cistruct kvm_mmu_pages {
179662306a36Sopenharmony_ci	struct mmu_page_and_offset {
179762306a36Sopenharmony_ci		struct kvm_mmu_page *sp;
179862306a36Sopenharmony_ci		unsigned int idx;
179962306a36Sopenharmony_ci	} page[KVM_PAGE_ARRAY_NR];
180062306a36Sopenharmony_ci	unsigned int nr;
180162306a36Sopenharmony_ci};
180262306a36Sopenharmony_ci
180362306a36Sopenharmony_cistatic int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
180462306a36Sopenharmony_ci			 int idx)
180562306a36Sopenharmony_ci{
180662306a36Sopenharmony_ci	int i;
180762306a36Sopenharmony_ci
180862306a36Sopenharmony_ci	if (sp->unsync)
180962306a36Sopenharmony_ci		for (i=0; i < pvec->nr; i++)
181062306a36Sopenharmony_ci			if (pvec->page[i].sp == sp)
181162306a36Sopenharmony_ci				return 0;
181262306a36Sopenharmony_ci
181362306a36Sopenharmony_ci	pvec->page[pvec->nr].sp = sp;
181462306a36Sopenharmony_ci	pvec->page[pvec->nr].idx = idx;
181562306a36Sopenharmony_ci	pvec->nr++;
181662306a36Sopenharmony_ci	return (pvec->nr == KVM_PAGE_ARRAY_NR);
181762306a36Sopenharmony_ci}
181862306a36Sopenharmony_ci
181962306a36Sopenharmony_cistatic inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
182062306a36Sopenharmony_ci{
182162306a36Sopenharmony_ci	--sp->unsync_children;
182262306a36Sopenharmony_ci	WARN_ON_ONCE((int)sp->unsync_children < 0);
182362306a36Sopenharmony_ci	__clear_bit(idx, sp->unsync_child_bitmap);
182462306a36Sopenharmony_ci}
182562306a36Sopenharmony_ci
182662306a36Sopenharmony_cistatic int __mmu_unsync_walk(struct kvm_mmu_page *sp,
182762306a36Sopenharmony_ci			   struct kvm_mmu_pages *pvec)
182862306a36Sopenharmony_ci{
182962306a36Sopenharmony_ci	int i, ret, nr_unsync_leaf = 0;
183062306a36Sopenharmony_ci
183162306a36Sopenharmony_ci	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
183262306a36Sopenharmony_ci		struct kvm_mmu_page *child;
183362306a36Sopenharmony_ci		u64 ent = sp->spt[i];
183462306a36Sopenharmony_ci
183562306a36Sopenharmony_ci		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
183662306a36Sopenharmony_ci			clear_unsync_child_bit(sp, i);
183762306a36Sopenharmony_ci			continue;
183862306a36Sopenharmony_ci		}
183962306a36Sopenharmony_ci
184062306a36Sopenharmony_ci		child = spte_to_child_sp(ent);
184162306a36Sopenharmony_ci
184262306a36Sopenharmony_ci		if (child->unsync_children) {
184362306a36Sopenharmony_ci			if (mmu_pages_add(pvec, child, i))
184462306a36Sopenharmony_ci				return -ENOSPC;
184562306a36Sopenharmony_ci
184662306a36Sopenharmony_ci			ret = __mmu_unsync_walk(child, pvec);
184762306a36Sopenharmony_ci			if (!ret) {
184862306a36Sopenharmony_ci				clear_unsync_child_bit(sp, i);
184962306a36Sopenharmony_ci				continue;
185062306a36Sopenharmony_ci			} else if (ret > 0) {
185162306a36Sopenharmony_ci				nr_unsync_leaf += ret;
185262306a36Sopenharmony_ci			} else
185362306a36Sopenharmony_ci				return ret;
185462306a36Sopenharmony_ci		} else if (child->unsync) {
185562306a36Sopenharmony_ci			nr_unsync_leaf++;
185662306a36Sopenharmony_ci			if (mmu_pages_add(pvec, child, i))
185762306a36Sopenharmony_ci				return -ENOSPC;
185862306a36Sopenharmony_ci		} else
185962306a36Sopenharmony_ci			clear_unsync_child_bit(sp, i);
186062306a36Sopenharmony_ci	}
186162306a36Sopenharmony_ci
186262306a36Sopenharmony_ci	return nr_unsync_leaf;
186362306a36Sopenharmony_ci}
186462306a36Sopenharmony_ci
186562306a36Sopenharmony_ci#define INVALID_INDEX (-1)
186662306a36Sopenharmony_ci
186762306a36Sopenharmony_cistatic int mmu_unsync_walk(struct kvm_mmu_page *sp,
186862306a36Sopenharmony_ci			   struct kvm_mmu_pages *pvec)
186962306a36Sopenharmony_ci{
187062306a36Sopenharmony_ci	pvec->nr = 0;
187162306a36Sopenharmony_ci	if (!sp->unsync_children)
187262306a36Sopenharmony_ci		return 0;
187362306a36Sopenharmony_ci
187462306a36Sopenharmony_ci	mmu_pages_add(pvec, sp, INVALID_INDEX);
187562306a36Sopenharmony_ci	return __mmu_unsync_walk(sp, pvec);
187662306a36Sopenharmony_ci}
187762306a36Sopenharmony_ci
187862306a36Sopenharmony_cistatic void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
187962306a36Sopenharmony_ci{
188062306a36Sopenharmony_ci	WARN_ON_ONCE(!sp->unsync);
188162306a36Sopenharmony_ci	trace_kvm_mmu_sync_page(sp);
188262306a36Sopenharmony_ci	sp->unsync = 0;
188362306a36Sopenharmony_ci	--kvm->stat.mmu_unsync;
188462306a36Sopenharmony_ci}
188562306a36Sopenharmony_ci
188662306a36Sopenharmony_cistatic bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
188762306a36Sopenharmony_ci				     struct list_head *invalid_list);
188862306a36Sopenharmony_cistatic void kvm_mmu_commit_zap_page(struct kvm *kvm,
188962306a36Sopenharmony_ci				    struct list_head *invalid_list);
189062306a36Sopenharmony_ci
189162306a36Sopenharmony_cistatic bool sp_has_gptes(struct kvm_mmu_page *sp)
189262306a36Sopenharmony_ci{
189362306a36Sopenharmony_ci	if (sp->role.direct)
189462306a36Sopenharmony_ci		return false;
189562306a36Sopenharmony_ci
189662306a36Sopenharmony_ci	if (sp->role.passthrough)
189762306a36Sopenharmony_ci		return false;
189862306a36Sopenharmony_ci
189962306a36Sopenharmony_ci	return true;
190062306a36Sopenharmony_ci}
190162306a36Sopenharmony_ci
190262306a36Sopenharmony_ci#define for_each_valid_sp(_kvm, _sp, _list)				\
190362306a36Sopenharmony_ci	hlist_for_each_entry(_sp, _list, hash_link)			\
190462306a36Sopenharmony_ci		if (is_obsolete_sp((_kvm), (_sp))) {			\
190562306a36Sopenharmony_ci		} else
190662306a36Sopenharmony_ci
190762306a36Sopenharmony_ci#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)		\
190862306a36Sopenharmony_ci	for_each_valid_sp(_kvm, _sp,					\
190962306a36Sopenharmony_ci	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
191062306a36Sopenharmony_ci		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
191162306a36Sopenharmony_ci
191262306a36Sopenharmony_cistatic bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
191362306a36Sopenharmony_ci{
191462306a36Sopenharmony_ci	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
191562306a36Sopenharmony_ci
191662306a36Sopenharmony_ci	/*
191762306a36Sopenharmony_ci	 * Ignore various flags when verifying that it's safe to sync a shadow
191862306a36Sopenharmony_ci	 * page using the current MMU context.
191962306a36Sopenharmony_ci	 *
192062306a36Sopenharmony_ci	 *  - level: not part of the overall MMU role and will never match as the MMU's
192162306a36Sopenharmony_ci	 *           level tracks the root level
192262306a36Sopenharmony_ci	 *  - access: updated based on the new guest PTE
192362306a36Sopenharmony_ci	 *  - quadrant: not part of the overall MMU role (similar to level)
192462306a36Sopenharmony_ci	 */
192562306a36Sopenharmony_ci	const union kvm_mmu_page_role sync_role_ign = {
192662306a36Sopenharmony_ci		.level = 0xf,
192762306a36Sopenharmony_ci		.access = 0x7,
192862306a36Sopenharmony_ci		.quadrant = 0x3,
192962306a36Sopenharmony_ci		.passthrough = 0x1,
193062306a36Sopenharmony_ci	};
193162306a36Sopenharmony_ci
193262306a36Sopenharmony_ci	/*
193362306a36Sopenharmony_ci	 * Direct pages can never be unsync, and KVM should never attempt to
193462306a36Sopenharmony_ci	 * sync a shadow page for a different MMU context, e.g. if the role
193562306a36Sopenharmony_ci	 * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
193662306a36Sopenharmony_ci	 * reserved bits checks will be wrong, etc...
193762306a36Sopenharmony_ci	 */
193862306a36Sopenharmony_ci	if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
193962306a36Sopenharmony_ci			 (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
194062306a36Sopenharmony_ci		return false;
194162306a36Sopenharmony_ci
194262306a36Sopenharmony_ci	return true;
194362306a36Sopenharmony_ci}
194462306a36Sopenharmony_ci
194562306a36Sopenharmony_cistatic int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
194662306a36Sopenharmony_ci{
194762306a36Sopenharmony_ci	if (!sp->spt[i])
194862306a36Sopenharmony_ci		return 0;
194962306a36Sopenharmony_ci
195062306a36Sopenharmony_ci	return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
195162306a36Sopenharmony_ci}
195262306a36Sopenharmony_ci
195362306a36Sopenharmony_cistatic int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
195462306a36Sopenharmony_ci{
195562306a36Sopenharmony_ci	int flush = 0;
195662306a36Sopenharmony_ci	int i;
195762306a36Sopenharmony_ci
195862306a36Sopenharmony_ci	if (!kvm_sync_page_check(vcpu, sp))
195962306a36Sopenharmony_ci		return -1;
196062306a36Sopenharmony_ci
196162306a36Sopenharmony_ci	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
196262306a36Sopenharmony_ci		int ret = kvm_sync_spte(vcpu, sp, i);
196362306a36Sopenharmony_ci
196462306a36Sopenharmony_ci		if (ret < -1)
196562306a36Sopenharmony_ci			return -1;
196662306a36Sopenharmony_ci		flush |= ret;
196762306a36Sopenharmony_ci	}
196862306a36Sopenharmony_ci
196962306a36Sopenharmony_ci	/*
197062306a36Sopenharmony_ci	 * Note, any flush is purely for KVM's correctness, e.g. when dropping
197162306a36Sopenharmony_ci	 * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
197262306a36Sopenharmony_ci	 * unmap or dirty logging event doesn't fail to flush.  The guest is
197362306a36Sopenharmony_ci	 * responsible for flushing the TLB to ensure any changes in protection
197462306a36Sopenharmony_ci	 * bits are recognized, i.e. until the guest flushes or page faults on
197562306a36Sopenharmony_ci	 * a relevant address, KVM is architecturally allowed to let vCPUs use
197662306a36Sopenharmony_ci	 * cached translations with the old protection bits.
197762306a36Sopenharmony_ci	 */
197862306a36Sopenharmony_ci	return flush;
197962306a36Sopenharmony_ci}
198062306a36Sopenharmony_ci
198162306a36Sopenharmony_cistatic int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
198262306a36Sopenharmony_ci			 struct list_head *invalid_list)
198362306a36Sopenharmony_ci{
198462306a36Sopenharmony_ci	int ret = __kvm_sync_page(vcpu, sp);
198562306a36Sopenharmony_ci
198662306a36Sopenharmony_ci	if (ret < 0)
198762306a36Sopenharmony_ci		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
198862306a36Sopenharmony_ci	return ret;
198962306a36Sopenharmony_ci}
199062306a36Sopenharmony_ci
199162306a36Sopenharmony_cistatic bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
199262306a36Sopenharmony_ci					struct list_head *invalid_list,
199362306a36Sopenharmony_ci					bool remote_flush)
199462306a36Sopenharmony_ci{
199562306a36Sopenharmony_ci	if (!remote_flush && list_empty(invalid_list))
199662306a36Sopenharmony_ci		return false;
199762306a36Sopenharmony_ci
199862306a36Sopenharmony_ci	if (!list_empty(invalid_list))
199962306a36Sopenharmony_ci		kvm_mmu_commit_zap_page(kvm, invalid_list);
200062306a36Sopenharmony_ci	else
200162306a36Sopenharmony_ci		kvm_flush_remote_tlbs(kvm);
200262306a36Sopenharmony_ci	return true;
200362306a36Sopenharmony_ci}
200462306a36Sopenharmony_ci
200562306a36Sopenharmony_cistatic bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
200662306a36Sopenharmony_ci{
200762306a36Sopenharmony_ci	if (sp->role.invalid)
200862306a36Sopenharmony_ci		return true;
200962306a36Sopenharmony_ci
201062306a36Sopenharmony_ci	/* TDP MMU pages do not use the MMU generation. */
201162306a36Sopenharmony_ci	return !is_tdp_mmu_page(sp) &&
201262306a36Sopenharmony_ci	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
201362306a36Sopenharmony_ci}
201462306a36Sopenharmony_ci
201562306a36Sopenharmony_cistruct mmu_page_path {
201662306a36Sopenharmony_ci	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
201762306a36Sopenharmony_ci	unsigned int idx[PT64_ROOT_MAX_LEVEL];
201862306a36Sopenharmony_ci};
201962306a36Sopenharmony_ci
202062306a36Sopenharmony_ci#define for_each_sp(pvec, sp, parents, i)			\
202162306a36Sopenharmony_ci		for (i = mmu_pages_first(&pvec, &parents);	\
202262306a36Sopenharmony_ci			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
202362306a36Sopenharmony_ci			i = mmu_pages_next(&pvec, &parents, i))
202462306a36Sopenharmony_ci
202562306a36Sopenharmony_cistatic int mmu_pages_next(struct kvm_mmu_pages *pvec,
202662306a36Sopenharmony_ci			  struct mmu_page_path *parents,
202762306a36Sopenharmony_ci			  int i)
202862306a36Sopenharmony_ci{
202962306a36Sopenharmony_ci	int n;
203062306a36Sopenharmony_ci
203162306a36Sopenharmony_ci	for (n = i+1; n < pvec->nr; n++) {
203262306a36Sopenharmony_ci		struct kvm_mmu_page *sp = pvec->page[n].sp;
203362306a36Sopenharmony_ci		unsigned idx = pvec->page[n].idx;
203462306a36Sopenharmony_ci		int level = sp->role.level;
203562306a36Sopenharmony_ci
203662306a36Sopenharmony_ci		parents->idx[level-1] = idx;
203762306a36Sopenharmony_ci		if (level == PG_LEVEL_4K)
203862306a36Sopenharmony_ci			break;
203962306a36Sopenharmony_ci
204062306a36Sopenharmony_ci		parents->parent[level-2] = sp;
204162306a36Sopenharmony_ci	}
204262306a36Sopenharmony_ci
204362306a36Sopenharmony_ci	return n;
204462306a36Sopenharmony_ci}
204562306a36Sopenharmony_ci
204662306a36Sopenharmony_cistatic int mmu_pages_first(struct kvm_mmu_pages *pvec,
204762306a36Sopenharmony_ci			   struct mmu_page_path *parents)
204862306a36Sopenharmony_ci{
204962306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
205062306a36Sopenharmony_ci	int level;
205162306a36Sopenharmony_ci
205262306a36Sopenharmony_ci	if (pvec->nr == 0)
205362306a36Sopenharmony_ci		return 0;
205462306a36Sopenharmony_ci
205562306a36Sopenharmony_ci	WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
205662306a36Sopenharmony_ci
205762306a36Sopenharmony_ci	sp = pvec->page[0].sp;
205862306a36Sopenharmony_ci	level = sp->role.level;
205962306a36Sopenharmony_ci	WARN_ON_ONCE(level == PG_LEVEL_4K);
206062306a36Sopenharmony_ci
206162306a36Sopenharmony_ci	parents->parent[level-2] = sp;
206262306a36Sopenharmony_ci
206362306a36Sopenharmony_ci	/* Also set up a sentinel.  Further entries in pvec are all
206462306a36Sopenharmony_ci	 * children of sp, so this element is never overwritten.
206562306a36Sopenharmony_ci	 */
206662306a36Sopenharmony_ci	parents->parent[level-1] = NULL;
206762306a36Sopenharmony_ci	return mmu_pages_next(pvec, parents, 0);
206862306a36Sopenharmony_ci}
206962306a36Sopenharmony_ci
207062306a36Sopenharmony_cistatic void mmu_pages_clear_parents(struct mmu_page_path *parents)
207162306a36Sopenharmony_ci{
207262306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
207362306a36Sopenharmony_ci	unsigned int level = 0;
207462306a36Sopenharmony_ci
207562306a36Sopenharmony_ci	do {
207662306a36Sopenharmony_ci		unsigned int idx = parents->idx[level];
207762306a36Sopenharmony_ci		sp = parents->parent[level];
207862306a36Sopenharmony_ci		if (!sp)
207962306a36Sopenharmony_ci			return;
208062306a36Sopenharmony_ci
208162306a36Sopenharmony_ci		WARN_ON_ONCE(idx == INVALID_INDEX);
208262306a36Sopenharmony_ci		clear_unsync_child_bit(sp, idx);
208362306a36Sopenharmony_ci		level++;
208462306a36Sopenharmony_ci	} while (!sp->unsync_children);
208562306a36Sopenharmony_ci}
208662306a36Sopenharmony_ci
208762306a36Sopenharmony_cistatic int mmu_sync_children(struct kvm_vcpu *vcpu,
208862306a36Sopenharmony_ci			     struct kvm_mmu_page *parent, bool can_yield)
208962306a36Sopenharmony_ci{
209062306a36Sopenharmony_ci	int i;
209162306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
209262306a36Sopenharmony_ci	struct mmu_page_path parents;
209362306a36Sopenharmony_ci	struct kvm_mmu_pages pages;
209462306a36Sopenharmony_ci	LIST_HEAD(invalid_list);
209562306a36Sopenharmony_ci	bool flush = false;
209662306a36Sopenharmony_ci
209762306a36Sopenharmony_ci	while (mmu_unsync_walk(parent, &pages)) {
209862306a36Sopenharmony_ci		bool protected = false;
209962306a36Sopenharmony_ci
210062306a36Sopenharmony_ci		for_each_sp(pages, sp, parents, i)
210162306a36Sopenharmony_ci			protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
210262306a36Sopenharmony_ci
210362306a36Sopenharmony_ci		if (protected) {
210462306a36Sopenharmony_ci			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
210562306a36Sopenharmony_ci			flush = false;
210662306a36Sopenharmony_ci		}
210762306a36Sopenharmony_ci
210862306a36Sopenharmony_ci		for_each_sp(pages, sp, parents, i) {
210962306a36Sopenharmony_ci			kvm_unlink_unsync_page(vcpu->kvm, sp);
211062306a36Sopenharmony_ci			flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
211162306a36Sopenharmony_ci			mmu_pages_clear_parents(&parents);
211262306a36Sopenharmony_ci		}
211362306a36Sopenharmony_ci		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
211462306a36Sopenharmony_ci			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
211562306a36Sopenharmony_ci			if (!can_yield) {
211662306a36Sopenharmony_ci				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
211762306a36Sopenharmony_ci				return -EINTR;
211862306a36Sopenharmony_ci			}
211962306a36Sopenharmony_ci
212062306a36Sopenharmony_ci			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
212162306a36Sopenharmony_ci			flush = false;
212262306a36Sopenharmony_ci		}
212362306a36Sopenharmony_ci	}
212462306a36Sopenharmony_ci
212562306a36Sopenharmony_ci	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
212662306a36Sopenharmony_ci	return 0;
212762306a36Sopenharmony_ci}
212862306a36Sopenharmony_ci
212962306a36Sopenharmony_cistatic void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
213062306a36Sopenharmony_ci{
213162306a36Sopenharmony_ci	atomic_set(&sp->write_flooding_count,  0);
213262306a36Sopenharmony_ci}
213362306a36Sopenharmony_ci
213462306a36Sopenharmony_cistatic void clear_sp_write_flooding_count(u64 *spte)
213562306a36Sopenharmony_ci{
213662306a36Sopenharmony_ci	__clear_sp_write_flooding_count(sptep_to_sp(spte));
213762306a36Sopenharmony_ci}
213862306a36Sopenharmony_ci
213962306a36Sopenharmony_ci/*
214062306a36Sopenharmony_ci * The vCPU is required when finding indirect shadow pages; the shadow
214162306a36Sopenharmony_ci * page may already exist and syncing it needs the vCPU pointer in
214262306a36Sopenharmony_ci * order to read guest page tables.  Direct shadow pages are never
214362306a36Sopenharmony_ci * unsync, thus @vcpu can be NULL if @role.direct is true.
214462306a36Sopenharmony_ci */
214562306a36Sopenharmony_cistatic struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
214662306a36Sopenharmony_ci						     struct kvm_vcpu *vcpu,
214762306a36Sopenharmony_ci						     gfn_t gfn,
214862306a36Sopenharmony_ci						     struct hlist_head *sp_list,
214962306a36Sopenharmony_ci						     union kvm_mmu_page_role role)
215062306a36Sopenharmony_ci{
215162306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
215262306a36Sopenharmony_ci	int ret;
215362306a36Sopenharmony_ci	int collisions = 0;
215462306a36Sopenharmony_ci	LIST_HEAD(invalid_list);
215562306a36Sopenharmony_ci
215662306a36Sopenharmony_ci	for_each_valid_sp(kvm, sp, sp_list) {
215762306a36Sopenharmony_ci		if (sp->gfn != gfn) {
215862306a36Sopenharmony_ci			collisions++;
215962306a36Sopenharmony_ci			continue;
216062306a36Sopenharmony_ci		}
216162306a36Sopenharmony_ci
216262306a36Sopenharmony_ci		if (sp->role.word != role.word) {
216362306a36Sopenharmony_ci			/*
216462306a36Sopenharmony_ci			 * If the guest is creating an upper-level page, zap
216562306a36Sopenharmony_ci			 * unsync pages for the same gfn.  While it's possible
216662306a36Sopenharmony_ci			 * the guest is using recursive page tables, in all
216762306a36Sopenharmony_ci			 * likelihood the guest has stopped using the unsync
216862306a36Sopenharmony_ci			 * page and is installing a completely unrelated page.
216962306a36Sopenharmony_ci			 * Unsync pages must not be left as is, because the new
217062306a36Sopenharmony_ci			 * upper-level page will be write-protected.
217162306a36Sopenharmony_ci			 */
217262306a36Sopenharmony_ci			if (role.level > PG_LEVEL_4K && sp->unsync)
217362306a36Sopenharmony_ci				kvm_mmu_prepare_zap_page(kvm, sp,
217462306a36Sopenharmony_ci							 &invalid_list);
217562306a36Sopenharmony_ci			continue;
217662306a36Sopenharmony_ci		}
217762306a36Sopenharmony_ci
217862306a36Sopenharmony_ci		/* unsync and write-flooding only apply to indirect SPs. */
217962306a36Sopenharmony_ci		if (sp->role.direct)
218062306a36Sopenharmony_ci			goto out;
218162306a36Sopenharmony_ci
218262306a36Sopenharmony_ci		if (sp->unsync) {
218362306a36Sopenharmony_ci			if (KVM_BUG_ON(!vcpu, kvm))
218462306a36Sopenharmony_ci				break;
218562306a36Sopenharmony_ci
218662306a36Sopenharmony_ci			/*
218762306a36Sopenharmony_ci			 * The page is good, but is stale.  kvm_sync_page does
218862306a36Sopenharmony_ci			 * get the latest guest state, but (unlike mmu_unsync_children)
218962306a36Sopenharmony_ci			 * it doesn't write-protect the page or mark it synchronized!
219062306a36Sopenharmony_ci			 * This way the validity of the mapping is ensured, but the
219162306a36Sopenharmony_ci			 * overhead of write protection is not incurred until the
219262306a36Sopenharmony_ci			 * guest invalidates the TLB mapping.  This allows multiple
219362306a36Sopenharmony_ci			 * SPs for a single gfn to be unsync.
219462306a36Sopenharmony_ci			 *
219562306a36Sopenharmony_ci			 * If the sync fails, the page is zapped.  If so, break
219662306a36Sopenharmony_ci			 * in order to rebuild it.
219762306a36Sopenharmony_ci			 */
219862306a36Sopenharmony_ci			ret = kvm_sync_page(vcpu, sp, &invalid_list);
219962306a36Sopenharmony_ci			if (ret < 0)
220062306a36Sopenharmony_ci				break;
220162306a36Sopenharmony_ci
220262306a36Sopenharmony_ci			WARN_ON_ONCE(!list_empty(&invalid_list));
220362306a36Sopenharmony_ci			if (ret > 0)
220462306a36Sopenharmony_ci				kvm_flush_remote_tlbs(kvm);
220562306a36Sopenharmony_ci		}
220662306a36Sopenharmony_ci
220762306a36Sopenharmony_ci		__clear_sp_write_flooding_count(sp);
220862306a36Sopenharmony_ci
220962306a36Sopenharmony_ci		goto out;
221062306a36Sopenharmony_ci	}
221162306a36Sopenharmony_ci
221262306a36Sopenharmony_ci	sp = NULL;
221362306a36Sopenharmony_ci	++kvm->stat.mmu_cache_miss;
221462306a36Sopenharmony_ci
221562306a36Sopenharmony_ciout:
221662306a36Sopenharmony_ci	kvm_mmu_commit_zap_page(kvm, &invalid_list);
221762306a36Sopenharmony_ci
221862306a36Sopenharmony_ci	if (collisions > kvm->stat.max_mmu_page_hash_collisions)
221962306a36Sopenharmony_ci		kvm->stat.max_mmu_page_hash_collisions = collisions;
222062306a36Sopenharmony_ci	return sp;
222162306a36Sopenharmony_ci}
222262306a36Sopenharmony_ci
222362306a36Sopenharmony_ci/* Caches used when allocating a new shadow page. */
222462306a36Sopenharmony_cistruct shadow_page_caches {
222562306a36Sopenharmony_ci	struct kvm_mmu_memory_cache *page_header_cache;
222662306a36Sopenharmony_ci	struct kvm_mmu_memory_cache *shadow_page_cache;
222762306a36Sopenharmony_ci	struct kvm_mmu_memory_cache *shadowed_info_cache;
222862306a36Sopenharmony_ci};
222962306a36Sopenharmony_ci
223062306a36Sopenharmony_cistatic struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
223162306a36Sopenharmony_ci						      struct shadow_page_caches *caches,
223262306a36Sopenharmony_ci						      gfn_t gfn,
223362306a36Sopenharmony_ci						      struct hlist_head *sp_list,
223462306a36Sopenharmony_ci						      union kvm_mmu_page_role role)
223562306a36Sopenharmony_ci{
223662306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
223762306a36Sopenharmony_ci
223862306a36Sopenharmony_ci	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
223962306a36Sopenharmony_ci	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
224062306a36Sopenharmony_ci	if (!role.direct)
224162306a36Sopenharmony_ci		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
224262306a36Sopenharmony_ci
224362306a36Sopenharmony_ci	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
224462306a36Sopenharmony_ci
224562306a36Sopenharmony_ci	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
224662306a36Sopenharmony_ci
224762306a36Sopenharmony_ci	/*
224862306a36Sopenharmony_ci	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
224962306a36Sopenharmony_ci	 * depends on valid pages being added to the head of the list.  See
225062306a36Sopenharmony_ci	 * comments in kvm_zap_obsolete_pages().
225162306a36Sopenharmony_ci	 */
225262306a36Sopenharmony_ci	sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
225362306a36Sopenharmony_ci	list_add(&sp->link, &kvm->arch.active_mmu_pages);
225462306a36Sopenharmony_ci	kvm_account_mmu_page(kvm, sp);
225562306a36Sopenharmony_ci
225662306a36Sopenharmony_ci	sp->gfn = gfn;
225762306a36Sopenharmony_ci	sp->role = role;
225862306a36Sopenharmony_ci	hlist_add_head(&sp->hash_link, sp_list);
225962306a36Sopenharmony_ci	if (sp_has_gptes(sp))
226062306a36Sopenharmony_ci		account_shadowed(kvm, sp);
226162306a36Sopenharmony_ci
226262306a36Sopenharmony_ci	return sp;
226362306a36Sopenharmony_ci}
226462306a36Sopenharmony_ci
226562306a36Sopenharmony_ci/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
226662306a36Sopenharmony_cistatic struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
226762306a36Sopenharmony_ci						      struct kvm_vcpu *vcpu,
226862306a36Sopenharmony_ci						      struct shadow_page_caches *caches,
226962306a36Sopenharmony_ci						      gfn_t gfn,
227062306a36Sopenharmony_ci						      union kvm_mmu_page_role role)
227162306a36Sopenharmony_ci{
227262306a36Sopenharmony_ci	struct hlist_head *sp_list;
227362306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
227462306a36Sopenharmony_ci	bool created = false;
227562306a36Sopenharmony_ci
227662306a36Sopenharmony_ci	sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
227762306a36Sopenharmony_ci
227862306a36Sopenharmony_ci	sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
227962306a36Sopenharmony_ci	if (!sp) {
228062306a36Sopenharmony_ci		created = true;
228162306a36Sopenharmony_ci		sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
228262306a36Sopenharmony_ci	}
228362306a36Sopenharmony_ci
228462306a36Sopenharmony_ci	trace_kvm_mmu_get_page(sp, created);
228562306a36Sopenharmony_ci	return sp;
228662306a36Sopenharmony_ci}
228762306a36Sopenharmony_ci
228862306a36Sopenharmony_cistatic struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
228962306a36Sopenharmony_ci						    gfn_t gfn,
229062306a36Sopenharmony_ci						    union kvm_mmu_page_role role)
229162306a36Sopenharmony_ci{
229262306a36Sopenharmony_ci	struct shadow_page_caches caches = {
229362306a36Sopenharmony_ci		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
229462306a36Sopenharmony_ci		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
229562306a36Sopenharmony_ci		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
229662306a36Sopenharmony_ci	};
229762306a36Sopenharmony_ci
229862306a36Sopenharmony_ci	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
229962306a36Sopenharmony_ci}
230062306a36Sopenharmony_ci
230162306a36Sopenharmony_cistatic union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
230262306a36Sopenharmony_ci						  unsigned int access)
230362306a36Sopenharmony_ci{
230462306a36Sopenharmony_ci	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
230562306a36Sopenharmony_ci	union kvm_mmu_page_role role;
230662306a36Sopenharmony_ci
230762306a36Sopenharmony_ci	role = parent_sp->role;
230862306a36Sopenharmony_ci	role.level--;
230962306a36Sopenharmony_ci	role.access = access;
231062306a36Sopenharmony_ci	role.direct = direct;
231162306a36Sopenharmony_ci	role.passthrough = 0;
231262306a36Sopenharmony_ci
231362306a36Sopenharmony_ci	/*
231462306a36Sopenharmony_ci	 * If the guest has 4-byte PTEs then that means it's using 32-bit,
231562306a36Sopenharmony_ci	 * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
231662306a36Sopenharmony_ci	 * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
231762306a36Sopenharmony_ci	 * shadow each guest page table with multiple shadow page tables, which
231862306a36Sopenharmony_ci	 * requires extra bookkeeping in the role.
231962306a36Sopenharmony_ci	 *
232062306a36Sopenharmony_ci	 * Specifically, to shadow the guest's page directory (which covers a
232162306a36Sopenharmony_ci	 * 4GiB address space), KVM uses 4 PAE page directories, each mapping
232262306a36Sopenharmony_ci	 * 1GiB of the address space. @role.quadrant encodes which quarter of
232362306a36Sopenharmony_ci	 * the address space each maps.
232462306a36Sopenharmony_ci	 *
232562306a36Sopenharmony_ci	 * To shadow the guest's page tables (which each map a 4MiB region), KVM
232662306a36Sopenharmony_ci	 * uses 2 PAE page tables, each mapping a 2MiB region. For these,
232762306a36Sopenharmony_ci	 * @role.quadrant encodes which half of the region they map.
232862306a36Sopenharmony_ci	 *
232962306a36Sopenharmony_ci	 * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
233062306a36Sopenharmony_ci	 * consumes bits 29:21.  To consume bits 31:30, KVM's uses 4 shadow
233162306a36Sopenharmony_ci	 * PDPTEs; those 4 PAE page directories are pre-allocated and their
233262306a36Sopenharmony_ci	 * quadrant is assigned in mmu_alloc_root().   A 4-byte PTE consumes
233362306a36Sopenharmony_ci	 * bits 21:12, while an 8-byte PTE consumes bits 20:12.  To consume
233462306a36Sopenharmony_ci	 * bit 21 in the PTE (the child here), KVM propagates that bit to the
233562306a36Sopenharmony_ci	 * quadrant, i.e. sets quadrant to '0' or '1'.  The parent 8-byte PDE
233662306a36Sopenharmony_ci	 * covers bit 21 (see above), thus the quadrant is calculated from the
233762306a36Sopenharmony_ci	 * _least_ significant bit of the PDE index.
233862306a36Sopenharmony_ci	 */
233962306a36Sopenharmony_ci	if (role.has_4_byte_gpte) {
234062306a36Sopenharmony_ci		WARN_ON_ONCE(role.level != PG_LEVEL_4K);
234162306a36Sopenharmony_ci		role.quadrant = spte_index(sptep) & 1;
234262306a36Sopenharmony_ci	}
234362306a36Sopenharmony_ci
234462306a36Sopenharmony_ci	return role;
234562306a36Sopenharmony_ci}
234662306a36Sopenharmony_ci
234762306a36Sopenharmony_cistatic struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
234862306a36Sopenharmony_ci						 u64 *sptep, gfn_t gfn,
234962306a36Sopenharmony_ci						 bool direct, unsigned int access)
235062306a36Sopenharmony_ci{
235162306a36Sopenharmony_ci	union kvm_mmu_page_role role;
235262306a36Sopenharmony_ci
235362306a36Sopenharmony_ci	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
235462306a36Sopenharmony_ci		return ERR_PTR(-EEXIST);
235562306a36Sopenharmony_ci
235662306a36Sopenharmony_ci	role = kvm_mmu_child_role(sptep, direct, access);
235762306a36Sopenharmony_ci	return kvm_mmu_get_shadow_page(vcpu, gfn, role);
235862306a36Sopenharmony_ci}
235962306a36Sopenharmony_ci
236062306a36Sopenharmony_cistatic void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
236162306a36Sopenharmony_ci					struct kvm_vcpu *vcpu, hpa_t root,
236262306a36Sopenharmony_ci					u64 addr)
236362306a36Sopenharmony_ci{
236462306a36Sopenharmony_ci	iterator->addr = addr;
236562306a36Sopenharmony_ci	iterator->shadow_addr = root;
236662306a36Sopenharmony_ci	iterator->level = vcpu->arch.mmu->root_role.level;
236762306a36Sopenharmony_ci
236862306a36Sopenharmony_ci	if (iterator->level >= PT64_ROOT_4LEVEL &&
236962306a36Sopenharmony_ci	    vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
237062306a36Sopenharmony_ci	    !vcpu->arch.mmu->root_role.direct)
237162306a36Sopenharmony_ci		iterator->level = PT32E_ROOT_LEVEL;
237262306a36Sopenharmony_ci
237362306a36Sopenharmony_ci	if (iterator->level == PT32E_ROOT_LEVEL) {
237462306a36Sopenharmony_ci		/*
237562306a36Sopenharmony_ci		 * prev_root is currently only used for 64-bit hosts. So only
237662306a36Sopenharmony_ci		 * the active root_hpa is valid here.
237762306a36Sopenharmony_ci		 */
237862306a36Sopenharmony_ci		BUG_ON(root != vcpu->arch.mmu->root.hpa);
237962306a36Sopenharmony_ci
238062306a36Sopenharmony_ci		iterator->shadow_addr
238162306a36Sopenharmony_ci			= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
238262306a36Sopenharmony_ci		iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
238362306a36Sopenharmony_ci		--iterator->level;
238462306a36Sopenharmony_ci		if (!iterator->shadow_addr)
238562306a36Sopenharmony_ci			iterator->level = 0;
238662306a36Sopenharmony_ci	}
238762306a36Sopenharmony_ci}
238862306a36Sopenharmony_ci
238962306a36Sopenharmony_cistatic void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
239062306a36Sopenharmony_ci			     struct kvm_vcpu *vcpu, u64 addr)
239162306a36Sopenharmony_ci{
239262306a36Sopenharmony_ci	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
239362306a36Sopenharmony_ci				    addr);
239462306a36Sopenharmony_ci}
239562306a36Sopenharmony_ci
239662306a36Sopenharmony_cistatic bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
239762306a36Sopenharmony_ci{
239862306a36Sopenharmony_ci	if (iterator->level < PG_LEVEL_4K)
239962306a36Sopenharmony_ci		return false;
240062306a36Sopenharmony_ci
240162306a36Sopenharmony_ci	iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
240262306a36Sopenharmony_ci	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
240362306a36Sopenharmony_ci	return true;
240462306a36Sopenharmony_ci}
240562306a36Sopenharmony_ci
240662306a36Sopenharmony_cistatic void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
240762306a36Sopenharmony_ci			       u64 spte)
240862306a36Sopenharmony_ci{
240962306a36Sopenharmony_ci	if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
241062306a36Sopenharmony_ci		iterator->level = 0;
241162306a36Sopenharmony_ci		return;
241262306a36Sopenharmony_ci	}
241362306a36Sopenharmony_ci
241462306a36Sopenharmony_ci	iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
241562306a36Sopenharmony_ci	--iterator->level;
241662306a36Sopenharmony_ci}
241762306a36Sopenharmony_ci
241862306a36Sopenharmony_cistatic void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
241962306a36Sopenharmony_ci{
242062306a36Sopenharmony_ci	__shadow_walk_next(iterator, *iterator->sptep);
242162306a36Sopenharmony_ci}
242262306a36Sopenharmony_ci
242362306a36Sopenharmony_cistatic void __link_shadow_page(struct kvm *kvm,
242462306a36Sopenharmony_ci			       struct kvm_mmu_memory_cache *cache, u64 *sptep,
242562306a36Sopenharmony_ci			       struct kvm_mmu_page *sp, bool flush)
242662306a36Sopenharmony_ci{
242762306a36Sopenharmony_ci	u64 spte;
242862306a36Sopenharmony_ci
242962306a36Sopenharmony_ci	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
243062306a36Sopenharmony_ci
243162306a36Sopenharmony_ci	/*
243262306a36Sopenharmony_ci	 * If an SPTE is present already, it must be a leaf and therefore
243362306a36Sopenharmony_ci	 * a large one.  Drop it, and flush the TLB if needed, before
243462306a36Sopenharmony_ci	 * installing sp.
243562306a36Sopenharmony_ci	 */
243662306a36Sopenharmony_ci	if (is_shadow_present_pte(*sptep))
243762306a36Sopenharmony_ci		drop_large_spte(kvm, sptep, flush);
243862306a36Sopenharmony_ci
243962306a36Sopenharmony_ci	spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
244062306a36Sopenharmony_ci
244162306a36Sopenharmony_ci	mmu_spte_set(sptep, spte);
244262306a36Sopenharmony_ci
244362306a36Sopenharmony_ci	mmu_page_add_parent_pte(cache, sp, sptep);
244462306a36Sopenharmony_ci
244562306a36Sopenharmony_ci	/*
244662306a36Sopenharmony_ci	 * The non-direct sub-pagetable must be updated before linking.  For
244762306a36Sopenharmony_ci	 * L1 sp, the pagetable is updated via kvm_sync_page() in
244862306a36Sopenharmony_ci	 * kvm_mmu_find_shadow_page() without write-protecting the gfn,
244962306a36Sopenharmony_ci	 * so sp->unsync can be true or false.  For higher level non-direct
245062306a36Sopenharmony_ci	 * sp, the pagetable is updated/synced via mmu_sync_children() in
245162306a36Sopenharmony_ci	 * FNAME(fetch)(), so sp->unsync_children can only be false.
245262306a36Sopenharmony_ci	 * WARN_ON_ONCE() if anything happens unexpectedly.
245362306a36Sopenharmony_ci	 */
245462306a36Sopenharmony_ci	if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
245562306a36Sopenharmony_ci		mark_unsync(sptep);
245662306a36Sopenharmony_ci}
245762306a36Sopenharmony_ci
245862306a36Sopenharmony_cistatic void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
245962306a36Sopenharmony_ci			     struct kvm_mmu_page *sp)
246062306a36Sopenharmony_ci{
246162306a36Sopenharmony_ci	__link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
246262306a36Sopenharmony_ci}
246362306a36Sopenharmony_ci
246462306a36Sopenharmony_cistatic void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
246562306a36Sopenharmony_ci				   unsigned direct_access)
246662306a36Sopenharmony_ci{
246762306a36Sopenharmony_ci	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
246862306a36Sopenharmony_ci		struct kvm_mmu_page *child;
246962306a36Sopenharmony_ci
247062306a36Sopenharmony_ci		/*
247162306a36Sopenharmony_ci		 * For the direct sp, if the guest pte's dirty bit
247262306a36Sopenharmony_ci		 * changed form clean to dirty, it will corrupt the
247362306a36Sopenharmony_ci		 * sp's access: allow writable in the read-only sp,
247462306a36Sopenharmony_ci		 * so we should update the spte at this point to get
247562306a36Sopenharmony_ci		 * a new sp with the correct access.
247662306a36Sopenharmony_ci		 */
247762306a36Sopenharmony_ci		child = spte_to_child_sp(*sptep);
247862306a36Sopenharmony_ci		if (child->role.access == direct_access)
247962306a36Sopenharmony_ci			return;
248062306a36Sopenharmony_ci
248162306a36Sopenharmony_ci		drop_parent_pte(vcpu->kvm, child, sptep);
248262306a36Sopenharmony_ci		kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
248362306a36Sopenharmony_ci	}
248462306a36Sopenharmony_ci}
248562306a36Sopenharmony_ci
248662306a36Sopenharmony_ci/* Returns the number of zapped non-leaf child shadow pages. */
248762306a36Sopenharmony_cistatic int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
248862306a36Sopenharmony_ci			    u64 *spte, struct list_head *invalid_list)
248962306a36Sopenharmony_ci{
249062306a36Sopenharmony_ci	u64 pte;
249162306a36Sopenharmony_ci	struct kvm_mmu_page *child;
249262306a36Sopenharmony_ci
249362306a36Sopenharmony_ci	pte = *spte;
249462306a36Sopenharmony_ci	if (is_shadow_present_pte(pte)) {
249562306a36Sopenharmony_ci		if (is_last_spte(pte, sp->role.level)) {
249662306a36Sopenharmony_ci			drop_spte(kvm, spte);
249762306a36Sopenharmony_ci		} else {
249862306a36Sopenharmony_ci			child = spte_to_child_sp(pte);
249962306a36Sopenharmony_ci			drop_parent_pte(kvm, child, spte);
250062306a36Sopenharmony_ci
250162306a36Sopenharmony_ci			/*
250262306a36Sopenharmony_ci			 * Recursively zap nested TDP SPs, parentless SPs are
250362306a36Sopenharmony_ci			 * unlikely to be used again in the near future.  This
250462306a36Sopenharmony_ci			 * avoids retaining a large number of stale nested SPs.
250562306a36Sopenharmony_ci			 */
250662306a36Sopenharmony_ci			if (tdp_enabled && invalid_list &&
250762306a36Sopenharmony_ci			    child->role.guest_mode && !child->parent_ptes.val)
250862306a36Sopenharmony_ci				return kvm_mmu_prepare_zap_page(kvm, child,
250962306a36Sopenharmony_ci								invalid_list);
251062306a36Sopenharmony_ci		}
251162306a36Sopenharmony_ci	} else if (is_mmio_spte(pte)) {
251262306a36Sopenharmony_ci		mmu_spte_clear_no_track(spte);
251362306a36Sopenharmony_ci	}
251462306a36Sopenharmony_ci	return 0;
251562306a36Sopenharmony_ci}
251662306a36Sopenharmony_ci
251762306a36Sopenharmony_cistatic int kvm_mmu_page_unlink_children(struct kvm *kvm,
251862306a36Sopenharmony_ci					struct kvm_mmu_page *sp,
251962306a36Sopenharmony_ci					struct list_head *invalid_list)
252062306a36Sopenharmony_ci{
252162306a36Sopenharmony_ci	int zapped = 0;
252262306a36Sopenharmony_ci	unsigned i;
252362306a36Sopenharmony_ci
252462306a36Sopenharmony_ci	for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
252562306a36Sopenharmony_ci		zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
252662306a36Sopenharmony_ci
252762306a36Sopenharmony_ci	return zapped;
252862306a36Sopenharmony_ci}
252962306a36Sopenharmony_ci
253062306a36Sopenharmony_cistatic void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
253162306a36Sopenharmony_ci{
253262306a36Sopenharmony_ci	u64 *sptep;
253362306a36Sopenharmony_ci	struct rmap_iterator iter;
253462306a36Sopenharmony_ci
253562306a36Sopenharmony_ci	while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
253662306a36Sopenharmony_ci		drop_parent_pte(kvm, sp, sptep);
253762306a36Sopenharmony_ci}
253862306a36Sopenharmony_ci
253962306a36Sopenharmony_cistatic int mmu_zap_unsync_children(struct kvm *kvm,
254062306a36Sopenharmony_ci				   struct kvm_mmu_page *parent,
254162306a36Sopenharmony_ci				   struct list_head *invalid_list)
254262306a36Sopenharmony_ci{
254362306a36Sopenharmony_ci	int i, zapped = 0;
254462306a36Sopenharmony_ci	struct mmu_page_path parents;
254562306a36Sopenharmony_ci	struct kvm_mmu_pages pages;
254662306a36Sopenharmony_ci
254762306a36Sopenharmony_ci	if (parent->role.level == PG_LEVEL_4K)
254862306a36Sopenharmony_ci		return 0;
254962306a36Sopenharmony_ci
255062306a36Sopenharmony_ci	while (mmu_unsync_walk(parent, &pages)) {
255162306a36Sopenharmony_ci		struct kvm_mmu_page *sp;
255262306a36Sopenharmony_ci
255362306a36Sopenharmony_ci		for_each_sp(pages, sp, parents, i) {
255462306a36Sopenharmony_ci			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
255562306a36Sopenharmony_ci			mmu_pages_clear_parents(&parents);
255662306a36Sopenharmony_ci			zapped++;
255762306a36Sopenharmony_ci		}
255862306a36Sopenharmony_ci	}
255962306a36Sopenharmony_ci
256062306a36Sopenharmony_ci	return zapped;
256162306a36Sopenharmony_ci}
256262306a36Sopenharmony_ci
256362306a36Sopenharmony_cistatic bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
256462306a36Sopenharmony_ci				       struct kvm_mmu_page *sp,
256562306a36Sopenharmony_ci				       struct list_head *invalid_list,
256662306a36Sopenharmony_ci				       int *nr_zapped)
256762306a36Sopenharmony_ci{
256862306a36Sopenharmony_ci	bool list_unstable, zapped_root = false;
256962306a36Sopenharmony_ci
257062306a36Sopenharmony_ci	lockdep_assert_held_write(&kvm->mmu_lock);
257162306a36Sopenharmony_ci	trace_kvm_mmu_prepare_zap_page(sp);
257262306a36Sopenharmony_ci	++kvm->stat.mmu_shadow_zapped;
257362306a36Sopenharmony_ci	*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
257462306a36Sopenharmony_ci	*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
257562306a36Sopenharmony_ci	kvm_mmu_unlink_parents(kvm, sp);
257662306a36Sopenharmony_ci
257762306a36Sopenharmony_ci	/* Zapping children means active_mmu_pages has become unstable. */
257862306a36Sopenharmony_ci	list_unstable = *nr_zapped;
257962306a36Sopenharmony_ci
258062306a36Sopenharmony_ci	if (!sp->role.invalid && sp_has_gptes(sp))
258162306a36Sopenharmony_ci		unaccount_shadowed(kvm, sp);
258262306a36Sopenharmony_ci
258362306a36Sopenharmony_ci	if (sp->unsync)
258462306a36Sopenharmony_ci		kvm_unlink_unsync_page(kvm, sp);
258562306a36Sopenharmony_ci	if (!sp->root_count) {
258662306a36Sopenharmony_ci		/* Count self */
258762306a36Sopenharmony_ci		(*nr_zapped)++;
258862306a36Sopenharmony_ci
258962306a36Sopenharmony_ci		/*
259062306a36Sopenharmony_ci		 * Already invalid pages (previously active roots) are not on
259162306a36Sopenharmony_ci		 * the active page list.  See list_del() in the "else" case of
259262306a36Sopenharmony_ci		 * !sp->root_count.
259362306a36Sopenharmony_ci		 */
259462306a36Sopenharmony_ci		if (sp->role.invalid)
259562306a36Sopenharmony_ci			list_add(&sp->link, invalid_list);
259662306a36Sopenharmony_ci		else
259762306a36Sopenharmony_ci			list_move(&sp->link, invalid_list);
259862306a36Sopenharmony_ci		kvm_unaccount_mmu_page(kvm, sp);
259962306a36Sopenharmony_ci	} else {
260062306a36Sopenharmony_ci		/*
260162306a36Sopenharmony_ci		 * Remove the active root from the active page list, the root
260262306a36Sopenharmony_ci		 * will be explicitly freed when the root_count hits zero.
260362306a36Sopenharmony_ci		 */
260462306a36Sopenharmony_ci		list_del(&sp->link);
260562306a36Sopenharmony_ci
260662306a36Sopenharmony_ci		/*
260762306a36Sopenharmony_ci		 * Obsolete pages cannot be used on any vCPUs, see the comment
260862306a36Sopenharmony_ci		 * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
260962306a36Sopenharmony_ci		 * treats invalid shadow pages as being obsolete.
261062306a36Sopenharmony_ci		 */
261162306a36Sopenharmony_ci		zapped_root = !is_obsolete_sp(kvm, sp);
261262306a36Sopenharmony_ci	}
261362306a36Sopenharmony_ci
261462306a36Sopenharmony_ci	if (sp->nx_huge_page_disallowed)
261562306a36Sopenharmony_ci		unaccount_nx_huge_page(kvm, sp);
261662306a36Sopenharmony_ci
261762306a36Sopenharmony_ci	sp->role.invalid = 1;
261862306a36Sopenharmony_ci
261962306a36Sopenharmony_ci	/*
262062306a36Sopenharmony_ci	 * Make the request to free obsolete roots after marking the root
262162306a36Sopenharmony_ci	 * invalid, otherwise other vCPUs may not see it as invalid.
262262306a36Sopenharmony_ci	 */
262362306a36Sopenharmony_ci	if (zapped_root)
262462306a36Sopenharmony_ci		kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
262562306a36Sopenharmony_ci	return list_unstable;
262662306a36Sopenharmony_ci}
262762306a36Sopenharmony_ci
262862306a36Sopenharmony_cistatic bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
262962306a36Sopenharmony_ci				     struct list_head *invalid_list)
263062306a36Sopenharmony_ci{
263162306a36Sopenharmony_ci	int nr_zapped;
263262306a36Sopenharmony_ci
263362306a36Sopenharmony_ci	__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
263462306a36Sopenharmony_ci	return nr_zapped;
263562306a36Sopenharmony_ci}
263662306a36Sopenharmony_ci
263762306a36Sopenharmony_cistatic void kvm_mmu_commit_zap_page(struct kvm *kvm,
263862306a36Sopenharmony_ci				    struct list_head *invalid_list)
263962306a36Sopenharmony_ci{
264062306a36Sopenharmony_ci	struct kvm_mmu_page *sp, *nsp;
264162306a36Sopenharmony_ci
264262306a36Sopenharmony_ci	if (list_empty(invalid_list))
264362306a36Sopenharmony_ci		return;
264462306a36Sopenharmony_ci
264562306a36Sopenharmony_ci	/*
264662306a36Sopenharmony_ci	 * We need to make sure everyone sees our modifications to
264762306a36Sopenharmony_ci	 * the page tables and see changes to vcpu->mode here. The barrier
264862306a36Sopenharmony_ci	 * in the kvm_flush_remote_tlbs() achieves this. This pairs
264962306a36Sopenharmony_ci	 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
265062306a36Sopenharmony_ci	 *
265162306a36Sopenharmony_ci	 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
265262306a36Sopenharmony_ci	 * guest mode and/or lockless shadow page table walks.
265362306a36Sopenharmony_ci	 */
265462306a36Sopenharmony_ci	kvm_flush_remote_tlbs(kvm);
265562306a36Sopenharmony_ci
265662306a36Sopenharmony_ci	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
265762306a36Sopenharmony_ci		WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
265862306a36Sopenharmony_ci		kvm_mmu_free_shadow_page(sp);
265962306a36Sopenharmony_ci	}
266062306a36Sopenharmony_ci}
266162306a36Sopenharmony_ci
266262306a36Sopenharmony_cistatic unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
266362306a36Sopenharmony_ci						  unsigned long nr_to_zap)
266462306a36Sopenharmony_ci{
266562306a36Sopenharmony_ci	unsigned long total_zapped = 0;
266662306a36Sopenharmony_ci	struct kvm_mmu_page *sp, *tmp;
266762306a36Sopenharmony_ci	LIST_HEAD(invalid_list);
266862306a36Sopenharmony_ci	bool unstable;
266962306a36Sopenharmony_ci	int nr_zapped;
267062306a36Sopenharmony_ci
267162306a36Sopenharmony_ci	if (list_empty(&kvm->arch.active_mmu_pages))
267262306a36Sopenharmony_ci		return 0;
267362306a36Sopenharmony_ci
267462306a36Sopenharmony_cirestart:
267562306a36Sopenharmony_ci	list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
267662306a36Sopenharmony_ci		/*
267762306a36Sopenharmony_ci		 * Don't zap active root pages, the page itself can't be freed
267862306a36Sopenharmony_ci		 * and zapping it will just force vCPUs to realloc and reload.
267962306a36Sopenharmony_ci		 */
268062306a36Sopenharmony_ci		if (sp->root_count)
268162306a36Sopenharmony_ci			continue;
268262306a36Sopenharmony_ci
268362306a36Sopenharmony_ci		unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
268462306a36Sopenharmony_ci						      &nr_zapped);
268562306a36Sopenharmony_ci		total_zapped += nr_zapped;
268662306a36Sopenharmony_ci		if (total_zapped >= nr_to_zap)
268762306a36Sopenharmony_ci			break;
268862306a36Sopenharmony_ci
268962306a36Sopenharmony_ci		if (unstable)
269062306a36Sopenharmony_ci			goto restart;
269162306a36Sopenharmony_ci	}
269262306a36Sopenharmony_ci
269362306a36Sopenharmony_ci	kvm_mmu_commit_zap_page(kvm, &invalid_list);
269462306a36Sopenharmony_ci
269562306a36Sopenharmony_ci	kvm->stat.mmu_recycled += total_zapped;
269662306a36Sopenharmony_ci	return total_zapped;
269762306a36Sopenharmony_ci}
269862306a36Sopenharmony_ci
269962306a36Sopenharmony_cistatic inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
270062306a36Sopenharmony_ci{
270162306a36Sopenharmony_ci	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
270262306a36Sopenharmony_ci		return kvm->arch.n_max_mmu_pages -
270362306a36Sopenharmony_ci			kvm->arch.n_used_mmu_pages;
270462306a36Sopenharmony_ci
270562306a36Sopenharmony_ci	return 0;
270662306a36Sopenharmony_ci}
270762306a36Sopenharmony_ci
270862306a36Sopenharmony_cistatic int make_mmu_pages_available(struct kvm_vcpu *vcpu)
270962306a36Sopenharmony_ci{
271062306a36Sopenharmony_ci	unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
271162306a36Sopenharmony_ci
271262306a36Sopenharmony_ci	if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
271362306a36Sopenharmony_ci		return 0;
271462306a36Sopenharmony_ci
271562306a36Sopenharmony_ci	kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
271662306a36Sopenharmony_ci
271762306a36Sopenharmony_ci	/*
271862306a36Sopenharmony_ci	 * Note, this check is intentionally soft, it only guarantees that one
271962306a36Sopenharmony_ci	 * page is available, while the caller may end up allocating as many as
272062306a36Sopenharmony_ci	 * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
272162306a36Sopenharmony_ci	 * exceeding the (arbitrary by default) limit will not harm the host,
272262306a36Sopenharmony_ci	 * being too aggressive may unnecessarily kill the guest, and getting an
272362306a36Sopenharmony_ci	 * exact count is far more trouble than it's worth, especially in the
272462306a36Sopenharmony_ci	 * page fault paths.
272562306a36Sopenharmony_ci	 */
272662306a36Sopenharmony_ci	if (!kvm_mmu_available_pages(vcpu->kvm))
272762306a36Sopenharmony_ci		return -ENOSPC;
272862306a36Sopenharmony_ci	return 0;
272962306a36Sopenharmony_ci}
273062306a36Sopenharmony_ci
273162306a36Sopenharmony_ci/*
273262306a36Sopenharmony_ci * Changing the number of mmu pages allocated to the vm
273362306a36Sopenharmony_ci * Note: if goal_nr_mmu_pages is too small, you will get dead lock
273462306a36Sopenharmony_ci */
273562306a36Sopenharmony_civoid kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
273662306a36Sopenharmony_ci{
273762306a36Sopenharmony_ci	write_lock(&kvm->mmu_lock);
273862306a36Sopenharmony_ci
273962306a36Sopenharmony_ci	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
274062306a36Sopenharmony_ci		kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
274162306a36Sopenharmony_ci						  goal_nr_mmu_pages);
274262306a36Sopenharmony_ci
274362306a36Sopenharmony_ci		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
274462306a36Sopenharmony_ci	}
274562306a36Sopenharmony_ci
274662306a36Sopenharmony_ci	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
274762306a36Sopenharmony_ci
274862306a36Sopenharmony_ci	write_unlock(&kvm->mmu_lock);
274962306a36Sopenharmony_ci}
275062306a36Sopenharmony_ci
275162306a36Sopenharmony_ciint kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
275262306a36Sopenharmony_ci{
275362306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
275462306a36Sopenharmony_ci	LIST_HEAD(invalid_list);
275562306a36Sopenharmony_ci	int r;
275662306a36Sopenharmony_ci
275762306a36Sopenharmony_ci	r = 0;
275862306a36Sopenharmony_ci	write_lock(&kvm->mmu_lock);
275962306a36Sopenharmony_ci	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
276062306a36Sopenharmony_ci		r = 1;
276162306a36Sopenharmony_ci		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
276262306a36Sopenharmony_ci	}
276362306a36Sopenharmony_ci	kvm_mmu_commit_zap_page(kvm, &invalid_list);
276462306a36Sopenharmony_ci	write_unlock(&kvm->mmu_lock);
276562306a36Sopenharmony_ci
276662306a36Sopenharmony_ci	return r;
276762306a36Sopenharmony_ci}
276862306a36Sopenharmony_ci
276962306a36Sopenharmony_cistatic int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
277062306a36Sopenharmony_ci{
277162306a36Sopenharmony_ci	gpa_t gpa;
277262306a36Sopenharmony_ci	int r;
277362306a36Sopenharmony_ci
277462306a36Sopenharmony_ci	if (vcpu->arch.mmu->root_role.direct)
277562306a36Sopenharmony_ci		return 0;
277662306a36Sopenharmony_ci
277762306a36Sopenharmony_ci	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
277862306a36Sopenharmony_ci
277962306a36Sopenharmony_ci	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
278062306a36Sopenharmony_ci
278162306a36Sopenharmony_ci	return r;
278262306a36Sopenharmony_ci}
278362306a36Sopenharmony_ci
278462306a36Sopenharmony_cistatic void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
278562306a36Sopenharmony_ci{
278662306a36Sopenharmony_ci	trace_kvm_mmu_unsync_page(sp);
278762306a36Sopenharmony_ci	++kvm->stat.mmu_unsync;
278862306a36Sopenharmony_ci	sp->unsync = 1;
278962306a36Sopenharmony_ci
279062306a36Sopenharmony_ci	kvm_mmu_mark_parents_unsync(sp);
279162306a36Sopenharmony_ci}
279262306a36Sopenharmony_ci
279362306a36Sopenharmony_ci/*
279462306a36Sopenharmony_ci * Attempt to unsync any shadow pages that can be reached by the specified gfn,
279562306a36Sopenharmony_ci * KVM is creating a writable mapping for said gfn.  Returns 0 if all pages
279662306a36Sopenharmony_ci * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
279762306a36Sopenharmony_ci * be write-protected.
279862306a36Sopenharmony_ci */
279962306a36Sopenharmony_ciint mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
280062306a36Sopenharmony_ci			    gfn_t gfn, bool can_unsync, bool prefetch)
280162306a36Sopenharmony_ci{
280262306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
280362306a36Sopenharmony_ci	bool locked = false;
280462306a36Sopenharmony_ci
280562306a36Sopenharmony_ci	/*
280662306a36Sopenharmony_ci	 * Force write-protection if the page is being tracked.  Note, the page
280762306a36Sopenharmony_ci	 * track machinery is used to write-protect upper-level shadow pages,
280862306a36Sopenharmony_ci	 * i.e. this guards the role.level == 4K assertion below!
280962306a36Sopenharmony_ci	 */
281062306a36Sopenharmony_ci	if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
281162306a36Sopenharmony_ci		return -EPERM;
281262306a36Sopenharmony_ci
281362306a36Sopenharmony_ci	/*
281462306a36Sopenharmony_ci	 * The page is not write-tracked, mark existing shadow pages unsync
281562306a36Sopenharmony_ci	 * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
281662306a36Sopenharmony_ci	 * that case, KVM must complete emulation of the guest TLB flush before
281762306a36Sopenharmony_ci	 * allowing shadow pages to become unsync (writable by the guest).
281862306a36Sopenharmony_ci	 */
281962306a36Sopenharmony_ci	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
282062306a36Sopenharmony_ci		if (!can_unsync)
282162306a36Sopenharmony_ci			return -EPERM;
282262306a36Sopenharmony_ci
282362306a36Sopenharmony_ci		if (sp->unsync)
282462306a36Sopenharmony_ci			continue;
282562306a36Sopenharmony_ci
282662306a36Sopenharmony_ci		if (prefetch)
282762306a36Sopenharmony_ci			return -EEXIST;
282862306a36Sopenharmony_ci
282962306a36Sopenharmony_ci		/*
283062306a36Sopenharmony_ci		 * TDP MMU page faults require an additional spinlock as they
283162306a36Sopenharmony_ci		 * run with mmu_lock held for read, not write, and the unsync
283262306a36Sopenharmony_ci		 * logic is not thread safe.  Take the spinklock regardless of
283362306a36Sopenharmony_ci		 * the MMU type to avoid extra conditionals/parameters, there's
283462306a36Sopenharmony_ci		 * no meaningful penalty if mmu_lock is held for write.
283562306a36Sopenharmony_ci		 */
283662306a36Sopenharmony_ci		if (!locked) {
283762306a36Sopenharmony_ci			locked = true;
283862306a36Sopenharmony_ci			spin_lock(&kvm->arch.mmu_unsync_pages_lock);
283962306a36Sopenharmony_ci
284062306a36Sopenharmony_ci			/*
284162306a36Sopenharmony_ci			 * Recheck after taking the spinlock, a different vCPU
284262306a36Sopenharmony_ci			 * may have since marked the page unsync.  A false
284362306a36Sopenharmony_ci			 * positive on the unprotected check above is not
284462306a36Sopenharmony_ci			 * possible as clearing sp->unsync _must_ hold mmu_lock
284562306a36Sopenharmony_ci			 * for write, i.e. unsync cannot transition from 0->1
284662306a36Sopenharmony_ci			 * while this CPU holds mmu_lock for read (or write).
284762306a36Sopenharmony_ci			 */
284862306a36Sopenharmony_ci			if (READ_ONCE(sp->unsync))
284962306a36Sopenharmony_ci				continue;
285062306a36Sopenharmony_ci		}
285162306a36Sopenharmony_ci
285262306a36Sopenharmony_ci		WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
285362306a36Sopenharmony_ci		kvm_unsync_page(kvm, sp);
285462306a36Sopenharmony_ci	}
285562306a36Sopenharmony_ci	if (locked)
285662306a36Sopenharmony_ci		spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
285762306a36Sopenharmony_ci
285862306a36Sopenharmony_ci	/*
285962306a36Sopenharmony_ci	 * We need to ensure that the marking of unsync pages is visible
286062306a36Sopenharmony_ci	 * before the SPTE is updated to allow writes because
286162306a36Sopenharmony_ci	 * kvm_mmu_sync_roots() checks the unsync flags without holding
286262306a36Sopenharmony_ci	 * the MMU lock and so can race with this. If the SPTE was updated
286362306a36Sopenharmony_ci	 * before the page had been marked as unsync-ed, something like the
286462306a36Sopenharmony_ci	 * following could happen:
286562306a36Sopenharmony_ci	 *
286662306a36Sopenharmony_ci	 * CPU 1                    CPU 2
286762306a36Sopenharmony_ci	 * ---------------------------------------------------------------------
286862306a36Sopenharmony_ci	 * 1.2 Host updates SPTE
286962306a36Sopenharmony_ci	 *     to be writable
287062306a36Sopenharmony_ci	 *                      2.1 Guest writes a GPTE for GVA X.
287162306a36Sopenharmony_ci	 *                          (GPTE being in the guest page table shadowed
287262306a36Sopenharmony_ci	 *                           by the SP from CPU 1.)
287362306a36Sopenharmony_ci	 *                          This reads SPTE during the page table walk.
287462306a36Sopenharmony_ci	 *                          Since SPTE.W is read as 1, there is no
287562306a36Sopenharmony_ci	 *                          fault.
287662306a36Sopenharmony_ci	 *
287762306a36Sopenharmony_ci	 *                      2.2 Guest issues TLB flush.
287862306a36Sopenharmony_ci	 *                          That causes a VM Exit.
287962306a36Sopenharmony_ci	 *
288062306a36Sopenharmony_ci	 *                      2.3 Walking of unsync pages sees sp->unsync is
288162306a36Sopenharmony_ci	 *                          false and skips the page.
288262306a36Sopenharmony_ci	 *
288362306a36Sopenharmony_ci	 *                      2.4 Guest accesses GVA X.
288462306a36Sopenharmony_ci	 *                          Since the mapping in the SP was not updated,
288562306a36Sopenharmony_ci	 *                          so the old mapping for GVA X incorrectly
288662306a36Sopenharmony_ci	 *                          gets used.
288762306a36Sopenharmony_ci	 * 1.1 Host marks SP
288862306a36Sopenharmony_ci	 *     as unsync
288962306a36Sopenharmony_ci	 *     (sp->unsync = true)
289062306a36Sopenharmony_ci	 *
289162306a36Sopenharmony_ci	 * The write barrier below ensures that 1.1 happens before 1.2 and thus
289262306a36Sopenharmony_ci	 * the situation in 2.4 does not arise.  It pairs with the read barrier
289362306a36Sopenharmony_ci	 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
289462306a36Sopenharmony_ci	 */
289562306a36Sopenharmony_ci	smp_wmb();
289662306a36Sopenharmony_ci
289762306a36Sopenharmony_ci	return 0;
289862306a36Sopenharmony_ci}
289962306a36Sopenharmony_ci
290062306a36Sopenharmony_cistatic int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
290162306a36Sopenharmony_ci			u64 *sptep, unsigned int pte_access, gfn_t gfn,
290262306a36Sopenharmony_ci			kvm_pfn_t pfn, struct kvm_page_fault *fault)
290362306a36Sopenharmony_ci{
290462306a36Sopenharmony_ci	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
290562306a36Sopenharmony_ci	int level = sp->role.level;
290662306a36Sopenharmony_ci	int was_rmapped = 0;
290762306a36Sopenharmony_ci	int ret = RET_PF_FIXED;
290862306a36Sopenharmony_ci	bool flush = false;
290962306a36Sopenharmony_ci	bool wrprot;
291062306a36Sopenharmony_ci	u64 spte;
291162306a36Sopenharmony_ci
291262306a36Sopenharmony_ci	/* Prefetching always gets a writable pfn.  */
291362306a36Sopenharmony_ci	bool host_writable = !fault || fault->map_writable;
291462306a36Sopenharmony_ci	bool prefetch = !fault || fault->prefetch;
291562306a36Sopenharmony_ci	bool write_fault = fault && fault->write;
291662306a36Sopenharmony_ci
291762306a36Sopenharmony_ci	if (unlikely(is_noslot_pfn(pfn))) {
291862306a36Sopenharmony_ci		vcpu->stat.pf_mmio_spte_created++;
291962306a36Sopenharmony_ci		mark_mmio_spte(vcpu, sptep, gfn, pte_access);
292062306a36Sopenharmony_ci		return RET_PF_EMULATE;
292162306a36Sopenharmony_ci	}
292262306a36Sopenharmony_ci
292362306a36Sopenharmony_ci	if (is_shadow_present_pte(*sptep)) {
292462306a36Sopenharmony_ci		/*
292562306a36Sopenharmony_ci		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
292662306a36Sopenharmony_ci		 * the parent of the now unreachable PTE.
292762306a36Sopenharmony_ci		 */
292862306a36Sopenharmony_ci		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
292962306a36Sopenharmony_ci			struct kvm_mmu_page *child;
293062306a36Sopenharmony_ci			u64 pte = *sptep;
293162306a36Sopenharmony_ci
293262306a36Sopenharmony_ci			child = spte_to_child_sp(pte);
293362306a36Sopenharmony_ci			drop_parent_pte(vcpu->kvm, child, sptep);
293462306a36Sopenharmony_ci			flush = true;
293562306a36Sopenharmony_ci		} else if (pfn != spte_to_pfn(*sptep)) {
293662306a36Sopenharmony_ci			drop_spte(vcpu->kvm, sptep);
293762306a36Sopenharmony_ci			flush = true;
293862306a36Sopenharmony_ci		} else
293962306a36Sopenharmony_ci			was_rmapped = 1;
294062306a36Sopenharmony_ci	}
294162306a36Sopenharmony_ci
294262306a36Sopenharmony_ci	wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
294362306a36Sopenharmony_ci			   true, host_writable, &spte);
294462306a36Sopenharmony_ci
294562306a36Sopenharmony_ci	if (*sptep == spte) {
294662306a36Sopenharmony_ci		ret = RET_PF_SPURIOUS;
294762306a36Sopenharmony_ci	} else {
294862306a36Sopenharmony_ci		flush |= mmu_spte_update(sptep, spte);
294962306a36Sopenharmony_ci		trace_kvm_mmu_set_spte(level, gfn, sptep);
295062306a36Sopenharmony_ci	}
295162306a36Sopenharmony_ci
295262306a36Sopenharmony_ci	if (wrprot) {
295362306a36Sopenharmony_ci		if (write_fault)
295462306a36Sopenharmony_ci			ret = RET_PF_EMULATE;
295562306a36Sopenharmony_ci	}
295662306a36Sopenharmony_ci
295762306a36Sopenharmony_ci	if (flush)
295862306a36Sopenharmony_ci		kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
295962306a36Sopenharmony_ci
296062306a36Sopenharmony_ci	if (!was_rmapped) {
296162306a36Sopenharmony_ci		WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
296262306a36Sopenharmony_ci		rmap_add(vcpu, slot, sptep, gfn, pte_access);
296362306a36Sopenharmony_ci	} else {
296462306a36Sopenharmony_ci		/* Already rmapped but the pte_access bits may have changed. */
296562306a36Sopenharmony_ci		kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
296662306a36Sopenharmony_ci	}
296762306a36Sopenharmony_ci
296862306a36Sopenharmony_ci	return ret;
296962306a36Sopenharmony_ci}
297062306a36Sopenharmony_ci
297162306a36Sopenharmony_cistatic int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
297262306a36Sopenharmony_ci				    struct kvm_mmu_page *sp,
297362306a36Sopenharmony_ci				    u64 *start, u64 *end)
297462306a36Sopenharmony_ci{
297562306a36Sopenharmony_ci	struct page *pages[PTE_PREFETCH_NUM];
297662306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
297762306a36Sopenharmony_ci	unsigned int access = sp->role.access;
297862306a36Sopenharmony_ci	int i, ret;
297962306a36Sopenharmony_ci	gfn_t gfn;
298062306a36Sopenharmony_ci
298162306a36Sopenharmony_ci	gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
298262306a36Sopenharmony_ci	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
298362306a36Sopenharmony_ci	if (!slot)
298462306a36Sopenharmony_ci		return -1;
298562306a36Sopenharmony_ci
298662306a36Sopenharmony_ci	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
298762306a36Sopenharmony_ci	if (ret <= 0)
298862306a36Sopenharmony_ci		return -1;
298962306a36Sopenharmony_ci
299062306a36Sopenharmony_ci	for (i = 0; i < ret; i++, gfn++, start++) {
299162306a36Sopenharmony_ci		mmu_set_spte(vcpu, slot, start, access, gfn,
299262306a36Sopenharmony_ci			     page_to_pfn(pages[i]), NULL);
299362306a36Sopenharmony_ci		put_page(pages[i]);
299462306a36Sopenharmony_ci	}
299562306a36Sopenharmony_ci
299662306a36Sopenharmony_ci	return 0;
299762306a36Sopenharmony_ci}
299862306a36Sopenharmony_ci
299962306a36Sopenharmony_cistatic void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
300062306a36Sopenharmony_ci				  struct kvm_mmu_page *sp, u64 *sptep)
300162306a36Sopenharmony_ci{
300262306a36Sopenharmony_ci	u64 *spte, *start = NULL;
300362306a36Sopenharmony_ci	int i;
300462306a36Sopenharmony_ci
300562306a36Sopenharmony_ci	WARN_ON_ONCE(!sp->role.direct);
300662306a36Sopenharmony_ci
300762306a36Sopenharmony_ci	i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
300862306a36Sopenharmony_ci	spte = sp->spt + i;
300962306a36Sopenharmony_ci
301062306a36Sopenharmony_ci	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
301162306a36Sopenharmony_ci		if (is_shadow_present_pte(*spte) || spte == sptep) {
301262306a36Sopenharmony_ci			if (!start)
301362306a36Sopenharmony_ci				continue;
301462306a36Sopenharmony_ci			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
301562306a36Sopenharmony_ci				return;
301662306a36Sopenharmony_ci			start = NULL;
301762306a36Sopenharmony_ci		} else if (!start)
301862306a36Sopenharmony_ci			start = spte;
301962306a36Sopenharmony_ci	}
302062306a36Sopenharmony_ci	if (start)
302162306a36Sopenharmony_ci		direct_pte_prefetch_many(vcpu, sp, start, spte);
302262306a36Sopenharmony_ci}
302362306a36Sopenharmony_ci
302462306a36Sopenharmony_cistatic void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
302562306a36Sopenharmony_ci{
302662306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
302762306a36Sopenharmony_ci
302862306a36Sopenharmony_ci	sp = sptep_to_sp(sptep);
302962306a36Sopenharmony_ci
303062306a36Sopenharmony_ci	/*
303162306a36Sopenharmony_ci	 * Without accessed bits, there's no way to distinguish between
303262306a36Sopenharmony_ci	 * actually accessed translations and prefetched, so disable pte
303362306a36Sopenharmony_ci	 * prefetch if accessed bits aren't available.
303462306a36Sopenharmony_ci	 */
303562306a36Sopenharmony_ci	if (sp_ad_disabled(sp))
303662306a36Sopenharmony_ci		return;
303762306a36Sopenharmony_ci
303862306a36Sopenharmony_ci	if (sp->role.level > PG_LEVEL_4K)
303962306a36Sopenharmony_ci		return;
304062306a36Sopenharmony_ci
304162306a36Sopenharmony_ci	/*
304262306a36Sopenharmony_ci	 * If addresses are being invalidated, skip prefetching to avoid
304362306a36Sopenharmony_ci	 * accidentally prefetching those addresses.
304462306a36Sopenharmony_ci	 */
304562306a36Sopenharmony_ci	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
304662306a36Sopenharmony_ci		return;
304762306a36Sopenharmony_ci
304862306a36Sopenharmony_ci	__direct_pte_prefetch(vcpu, sp, sptep);
304962306a36Sopenharmony_ci}
305062306a36Sopenharmony_ci
305162306a36Sopenharmony_ci/*
305262306a36Sopenharmony_ci * Lookup the mapping level for @gfn in the current mm.
305362306a36Sopenharmony_ci *
305462306a36Sopenharmony_ci * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
305562306a36Sopenharmony_ci * consumer to be tied into KVM's handlers for MMU notifier events!
305662306a36Sopenharmony_ci *
305762306a36Sopenharmony_ci * There are several ways to safely use this helper:
305862306a36Sopenharmony_ci *
305962306a36Sopenharmony_ci * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
306062306a36Sopenharmony_ci *   consuming it.  In this case, mmu_lock doesn't need to be held during the
306162306a36Sopenharmony_ci *   lookup, but it does need to be held while checking the MMU notifier.
306262306a36Sopenharmony_ci *
306362306a36Sopenharmony_ci * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
306462306a36Sopenharmony_ci *   event for the hva.  This can be done by explicit checking the MMU notifier
306562306a36Sopenharmony_ci *   or by ensuring that KVM already has a valid mapping that covers the hva.
306662306a36Sopenharmony_ci *
306762306a36Sopenharmony_ci * - Do not use the result to install new mappings, e.g. use the host mapping
306862306a36Sopenharmony_ci *   level only to decide whether or not to zap an entry.  In this case, it's
306962306a36Sopenharmony_ci *   not required to hold mmu_lock (though it's highly likely the caller will
307062306a36Sopenharmony_ci *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
307162306a36Sopenharmony_ci *
307262306a36Sopenharmony_ci * Note!  The lookup can still race with modifications to host page tables, but
307362306a36Sopenharmony_ci * the above "rules" ensure KVM will not _consume_ the result of the walk if a
307462306a36Sopenharmony_ci * race with the primary MMU occurs.
307562306a36Sopenharmony_ci */
307662306a36Sopenharmony_cistatic int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
307762306a36Sopenharmony_ci				  const struct kvm_memory_slot *slot)
307862306a36Sopenharmony_ci{
307962306a36Sopenharmony_ci	int level = PG_LEVEL_4K;
308062306a36Sopenharmony_ci	unsigned long hva;
308162306a36Sopenharmony_ci	unsigned long flags;
308262306a36Sopenharmony_ci	pgd_t pgd;
308362306a36Sopenharmony_ci	p4d_t p4d;
308462306a36Sopenharmony_ci	pud_t pud;
308562306a36Sopenharmony_ci	pmd_t pmd;
308662306a36Sopenharmony_ci
308762306a36Sopenharmony_ci	/*
308862306a36Sopenharmony_ci	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
308962306a36Sopenharmony_ci	 * is not solely for performance, it's also necessary to avoid the
309062306a36Sopenharmony_ci	 * "writable" check in __gfn_to_hva_many(), which will always fail on
309162306a36Sopenharmony_ci	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
309262306a36Sopenharmony_ci	 * page fault steps have already verified the guest isn't writing a
309362306a36Sopenharmony_ci	 * read-only memslot.
309462306a36Sopenharmony_ci	 */
309562306a36Sopenharmony_ci	hva = __gfn_to_hva_memslot(slot, gfn);
309662306a36Sopenharmony_ci
309762306a36Sopenharmony_ci	/*
309862306a36Sopenharmony_ci	 * Disable IRQs to prevent concurrent tear down of host page tables,
309962306a36Sopenharmony_ci	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
310062306a36Sopenharmony_ci	 * the original page table.
310162306a36Sopenharmony_ci	 */
310262306a36Sopenharmony_ci	local_irq_save(flags);
310362306a36Sopenharmony_ci
310462306a36Sopenharmony_ci	/*
310562306a36Sopenharmony_ci	 * Read each entry once.  As above, a non-leaf entry can be promoted to
310662306a36Sopenharmony_ci	 * a huge page _during_ this walk.  Re-reading the entry could send the
310762306a36Sopenharmony_ci	 * walk into the weeks, e.g. p*d_large() returns false (sees the old
310862306a36Sopenharmony_ci	 * value) and then p*d_offset() walks into the target huge page instead
310962306a36Sopenharmony_ci	 * of the old page table (sees the new value).
311062306a36Sopenharmony_ci	 */
311162306a36Sopenharmony_ci	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
311262306a36Sopenharmony_ci	if (pgd_none(pgd))
311362306a36Sopenharmony_ci		goto out;
311462306a36Sopenharmony_ci
311562306a36Sopenharmony_ci	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
311662306a36Sopenharmony_ci	if (p4d_none(p4d) || !p4d_present(p4d))
311762306a36Sopenharmony_ci		goto out;
311862306a36Sopenharmony_ci
311962306a36Sopenharmony_ci	pud = READ_ONCE(*pud_offset(&p4d, hva));
312062306a36Sopenharmony_ci	if (pud_none(pud) || !pud_present(pud))
312162306a36Sopenharmony_ci		goto out;
312262306a36Sopenharmony_ci
312362306a36Sopenharmony_ci	if (pud_large(pud)) {
312462306a36Sopenharmony_ci		level = PG_LEVEL_1G;
312562306a36Sopenharmony_ci		goto out;
312662306a36Sopenharmony_ci	}
312762306a36Sopenharmony_ci
312862306a36Sopenharmony_ci	pmd = READ_ONCE(*pmd_offset(&pud, hva));
312962306a36Sopenharmony_ci	if (pmd_none(pmd) || !pmd_present(pmd))
313062306a36Sopenharmony_ci		goto out;
313162306a36Sopenharmony_ci
313262306a36Sopenharmony_ci	if (pmd_large(pmd))
313362306a36Sopenharmony_ci		level = PG_LEVEL_2M;
313462306a36Sopenharmony_ci
313562306a36Sopenharmony_ciout:
313662306a36Sopenharmony_ci	local_irq_restore(flags);
313762306a36Sopenharmony_ci	return level;
313862306a36Sopenharmony_ci}
313962306a36Sopenharmony_ci
314062306a36Sopenharmony_ciint kvm_mmu_max_mapping_level(struct kvm *kvm,
314162306a36Sopenharmony_ci			      const struct kvm_memory_slot *slot, gfn_t gfn,
314262306a36Sopenharmony_ci			      int max_level)
314362306a36Sopenharmony_ci{
314462306a36Sopenharmony_ci	struct kvm_lpage_info *linfo;
314562306a36Sopenharmony_ci	int host_level;
314662306a36Sopenharmony_ci
314762306a36Sopenharmony_ci	max_level = min(max_level, max_huge_page_level);
314862306a36Sopenharmony_ci	for ( ; max_level > PG_LEVEL_4K; max_level--) {
314962306a36Sopenharmony_ci		linfo = lpage_info_slot(gfn, slot, max_level);
315062306a36Sopenharmony_ci		if (!linfo->disallow_lpage)
315162306a36Sopenharmony_ci			break;
315262306a36Sopenharmony_ci	}
315362306a36Sopenharmony_ci
315462306a36Sopenharmony_ci	if (max_level == PG_LEVEL_4K)
315562306a36Sopenharmony_ci		return PG_LEVEL_4K;
315662306a36Sopenharmony_ci
315762306a36Sopenharmony_ci	host_level = host_pfn_mapping_level(kvm, gfn, slot);
315862306a36Sopenharmony_ci	return min(host_level, max_level);
315962306a36Sopenharmony_ci}
316062306a36Sopenharmony_ci
316162306a36Sopenharmony_civoid kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
316262306a36Sopenharmony_ci{
316362306a36Sopenharmony_ci	struct kvm_memory_slot *slot = fault->slot;
316462306a36Sopenharmony_ci	kvm_pfn_t mask;
316562306a36Sopenharmony_ci
316662306a36Sopenharmony_ci	fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
316762306a36Sopenharmony_ci
316862306a36Sopenharmony_ci	if (unlikely(fault->max_level == PG_LEVEL_4K))
316962306a36Sopenharmony_ci		return;
317062306a36Sopenharmony_ci
317162306a36Sopenharmony_ci	if (is_error_noslot_pfn(fault->pfn))
317262306a36Sopenharmony_ci		return;
317362306a36Sopenharmony_ci
317462306a36Sopenharmony_ci	if (kvm_slot_dirty_track_enabled(slot))
317562306a36Sopenharmony_ci		return;
317662306a36Sopenharmony_ci
317762306a36Sopenharmony_ci	/*
317862306a36Sopenharmony_ci	 * Enforce the iTLB multihit workaround after capturing the requested
317962306a36Sopenharmony_ci	 * level, which will be used to do precise, accurate accounting.
318062306a36Sopenharmony_ci	 */
318162306a36Sopenharmony_ci	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
318262306a36Sopenharmony_ci						     fault->gfn, fault->max_level);
318362306a36Sopenharmony_ci	if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
318462306a36Sopenharmony_ci		return;
318562306a36Sopenharmony_ci
318662306a36Sopenharmony_ci	/*
318762306a36Sopenharmony_ci	 * mmu_invalidate_retry() was successful and mmu_lock is held, so
318862306a36Sopenharmony_ci	 * the pmd can't be split from under us.
318962306a36Sopenharmony_ci	 */
319062306a36Sopenharmony_ci	fault->goal_level = fault->req_level;
319162306a36Sopenharmony_ci	mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
319262306a36Sopenharmony_ci	VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
319362306a36Sopenharmony_ci	fault->pfn &= ~mask;
319462306a36Sopenharmony_ci}
319562306a36Sopenharmony_ci
319662306a36Sopenharmony_civoid disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
319762306a36Sopenharmony_ci{
319862306a36Sopenharmony_ci	if (cur_level > PG_LEVEL_4K &&
319962306a36Sopenharmony_ci	    cur_level == fault->goal_level &&
320062306a36Sopenharmony_ci	    is_shadow_present_pte(spte) &&
320162306a36Sopenharmony_ci	    !is_large_pte(spte) &&
320262306a36Sopenharmony_ci	    spte_to_child_sp(spte)->nx_huge_page_disallowed) {
320362306a36Sopenharmony_ci		/*
320462306a36Sopenharmony_ci		 * A small SPTE exists for this pfn, but FNAME(fetch),
320562306a36Sopenharmony_ci		 * direct_map(), or kvm_tdp_mmu_map() would like to create a
320662306a36Sopenharmony_ci		 * large PTE instead: just force them to go down another level,
320762306a36Sopenharmony_ci		 * patching back for them into pfn the next 9 bits of the
320862306a36Sopenharmony_ci		 * address.
320962306a36Sopenharmony_ci		 */
321062306a36Sopenharmony_ci		u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
321162306a36Sopenharmony_ci				KVM_PAGES_PER_HPAGE(cur_level - 1);
321262306a36Sopenharmony_ci		fault->pfn |= fault->gfn & page_mask;
321362306a36Sopenharmony_ci		fault->goal_level--;
321462306a36Sopenharmony_ci	}
321562306a36Sopenharmony_ci}
321662306a36Sopenharmony_ci
321762306a36Sopenharmony_cistatic int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
321862306a36Sopenharmony_ci{
321962306a36Sopenharmony_ci	struct kvm_shadow_walk_iterator it;
322062306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
322162306a36Sopenharmony_ci	int ret;
322262306a36Sopenharmony_ci	gfn_t base_gfn = fault->gfn;
322362306a36Sopenharmony_ci
322462306a36Sopenharmony_ci	kvm_mmu_hugepage_adjust(vcpu, fault);
322562306a36Sopenharmony_ci
322662306a36Sopenharmony_ci	trace_kvm_mmu_spte_requested(fault);
322762306a36Sopenharmony_ci	for_each_shadow_entry(vcpu, fault->addr, it) {
322862306a36Sopenharmony_ci		/*
322962306a36Sopenharmony_ci		 * We cannot overwrite existing page tables with an NX
323062306a36Sopenharmony_ci		 * large page, as the leaf could be executable.
323162306a36Sopenharmony_ci		 */
323262306a36Sopenharmony_ci		if (fault->nx_huge_page_workaround_enabled)
323362306a36Sopenharmony_ci			disallowed_hugepage_adjust(fault, *it.sptep, it.level);
323462306a36Sopenharmony_ci
323562306a36Sopenharmony_ci		base_gfn = gfn_round_for_level(fault->gfn, it.level);
323662306a36Sopenharmony_ci		if (it.level == fault->goal_level)
323762306a36Sopenharmony_ci			break;
323862306a36Sopenharmony_ci
323962306a36Sopenharmony_ci		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
324062306a36Sopenharmony_ci		if (sp == ERR_PTR(-EEXIST))
324162306a36Sopenharmony_ci			continue;
324262306a36Sopenharmony_ci
324362306a36Sopenharmony_ci		link_shadow_page(vcpu, it.sptep, sp);
324462306a36Sopenharmony_ci		if (fault->huge_page_disallowed)
324562306a36Sopenharmony_ci			account_nx_huge_page(vcpu->kvm, sp,
324662306a36Sopenharmony_ci					     fault->req_level >= it.level);
324762306a36Sopenharmony_ci	}
324862306a36Sopenharmony_ci
324962306a36Sopenharmony_ci	if (WARN_ON_ONCE(it.level != fault->goal_level))
325062306a36Sopenharmony_ci		return -EFAULT;
325162306a36Sopenharmony_ci
325262306a36Sopenharmony_ci	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
325362306a36Sopenharmony_ci			   base_gfn, fault->pfn, fault);
325462306a36Sopenharmony_ci	if (ret == RET_PF_SPURIOUS)
325562306a36Sopenharmony_ci		return ret;
325662306a36Sopenharmony_ci
325762306a36Sopenharmony_ci	direct_pte_prefetch(vcpu, it.sptep);
325862306a36Sopenharmony_ci	return ret;
325962306a36Sopenharmony_ci}
326062306a36Sopenharmony_ci
326162306a36Sopenharmony_cistatic void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
326262306a36Sopenharmony_ci{
326362306a36Sopenharmony_ci	unsigned long hva = gfn_to_hva_memslot(slot, gfn);
326462306a36Sopenharmony_ci
326562306a36Sopenharmony_ci	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
326662306a36Sopenharmony_ci}
326762306a36Sopenharmony_ci
326862306a36Sopenharmony_cistatic int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
326962306a36Sopenharmony_ci{
327062306a36Sopenharmony_ci	if (is_sigpending_pfn(fault->pfn)) {
327162306a36Sopenharmony_ci		kvm_handle_signal_exit(vcpu);
327262306a36Sopenharmony_ci		return -EINTR;
327362306a36Sopenharmony_ci	}
327462306a36Sopenharmony_ci
327562306a36Sopenharmony_ci	/*
327662306a36Sopenharmony_ci	 * Do not cache the mmio info caused by writing the readonly gfn
327762306a36Sopenharmony_ci	 * into the spte otherwise read access on readonly gfn also can
327862306a36Sopenharmony_ci	 * caused mmio page fault and treat it as mmio access.
327962306a36Sopenharmony_ci	 */
328062306a36Sopenharmony_ci	if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
328162306a36Sopenharmony_ci		return RET_PF_EMULATE;
328262306a36Sopenharmony_ci
328362306a36Sopenharmony_ci	if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
328462306a36Sopenharmony_ci		kvm_send_hwpoison_signal(fault->slot, fault->gfn);
328562306a36Sopenharmony_ci		return RET_PF_RETRY;
328662306a36Sopenharmony_ci	}
328762306a36Sopenharmony_ci
328862306a36Sopenharmony_ci	return -EFAULT;
328962306a36Sopenharmony_ci}
329062306a36Sopenharmony_ci
329162306a36Sopenharmony_cistatic int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
329262306a36Sopenharmony_ci				   struct kvm_page_fault *fault,
329362306a36Sopenharmony_ci				   unsigned int access)
329462306a36Sopenharmony_ci{
329562306a36Sopenharmony_ci	gva_t gva = fault->is_tdp ? 0 : fault->addr;
329662306a36Sopenharmony_ci
329762306a36Sopenharmony_ci	vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
329862306a36Sopenharmony_ci			     access & shadow_mmio_access_mask);
329962306a36Sopenharmony_ci
330062306a36Sopenharmony_ci	/*
330162306a36Sopenharmony_ci	 * If MMIO caching is disabled, emulate immediately without
330262306a36Sopenharmony_ci	 * touching the shadow page tables as attempting to install an
330362306a36Sopenharmony_ci	 * MMIO SPTE will just be an expensive nop.
330462306a36Sopenharmony_ci	 */
330562306a36Sopenharmony_ci	if (unlikely(!enable_mmio_caching))
330662306a36Sopenharmony_ci		return RET_PF_EMULATE;
330762306a36Sopenharmony_ci
330862306a36Sopenharmony_ci	/*
330962306a36Sopenharmony_ci	 * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
331062306a36Sopenharmony_ci	 * any guest that generates such gfns is running nested and is being
331162306a36Sopenharmony_ci	 * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
331262306a36Sopenharmony_ci	 * only if L1's MAXPHYADDR is inaccurate with respect to the
331362306a36Sopenharmony_ci	 * hardware's).
331462306a36Sopenharmony_ci	 */
331562306a36Sopenharmony_ci	if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
331662306a36Sopenharmony_ci		return RET_PF_EMULATE;
331762306a36Sopenharmony_ci
331862306a36Sopenharmony_ci	return RET_PF_CONTINUE;
331962306a36Sopenharmony_ci}
332062306a36Sopenharmony_ci
332162306a36Sopenharmony_cistatic bool page_fault_can_be_fast(struct kvm_page_fault *fault)
332262306a36Sopenharmony_ci{
332362306a36Sopenharmony_ci	/*
332462306a36Sopenharmony_ci	 * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
332562306a36Sopenharmony_ci	 * reach the common page fault handler if the SPTE has an invalid MMIO
332662306a36Sopenharmony_ci	 * generation number.  Refreshing the MMIO generation needs to go down
332762306a36Sopenharmony_ci	 * the slow path.  Note, EPT Misconfigs do NOT set the PRESENT flag!
332862306a36Sopenharmony_ci	 */
332962306a36Sopenharmony_ci	if (fault->rsvd)
333062306a36Sopenharmony_ci		return false;
333162306a36Sopenharmony_ci
333262306a36Sopenharmony_ci	/*
333362306a36Sopenharmony_ci	 * #PF can be fast if:
333462306a36Sopenharmony_ci	 *
333562306a36Sopenharmony_ci	 * 1. The shadow page table entry is not present and A/D bits are
333662306a36Sopenharmony_ci	 *    disabled _by KVM_, which could mean that the fault is potentially
333762306a36Sopenharmony_ci	 *    caused by access tracking (if enabled).  If A/D bits are enabled
333862306a36Sopenharmony_ci	 *    by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
333962306a36Sopenharmony_ci	 *    bits for L2 and employ access tracking, but the fast page fault
334062306a36Sopenharmony_ci	 *    mechanism only supports direct MMUs.
334162306a36Sopenharmony_ci	 * 2. The shadow page table entry is present, the access is a write,
334262306a36Sopenharmony_ci	 *    and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
334362306a36Sopenharmony_ci	 *    the fault was caused by a write-protection violation.  If the
334462306a36Sopenharmony_ci	 *    SPTE is MMU-writable (determined later), the fault can be fixed
334562306a36Sopenharmony_ci	 *    by setting the Writable bit, which can be done out of mmu_lock.
334662306a36Sopenharmony_ci	 */
334762306a36Sopenharmony_ci	if (!fault->present)
334862306a36Sopenharmony_ci		return !kvm_ad_enabled();
334962306a36Sopenharmony_ci
335062306a36Sopenharmony_ci	/*
335162306a36Sopenharmony_ci	 * Note, instruction fetches and writes are mutually exclusive, ignore
335262306a36Sopenharmony_ci	 * the "exec" flag.
335362306a36Sopenharmony_ci	 */
335462306a36Sopenharmony_ci	return fault->write;
335562306a36Sopenharmony_ci}
335662306a36Sopenharmony_ci
335762306a36Sopenharmony_ci/*
335862306a36Sopenharmony_ci * Returns true if the SPTE was fixed successfully. Otherwise,
335962306a36Sopenharmony_ci * someone else modified the SPTE from its original value.
336062306a36Sopenharmony_ci */
336162306a36Sopenharmony_cistatic bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
336262306a36Sopenharmony_ci				    struct kvm_page_fault *fault,
336362306a36Sopenharmony_ci				    u64 *sptep, u64 old_spte, u64 new_spte)
336462306a36Sopenharmony_ci{
336562306a36Sopenharmony_ci	/*
336662306a36Sopenharmony_ci	 * Theoretically we could also set dirty bit (and flush TLB) here in
336762306a36Sopenharmony_ci	 * order to eliminate unnecessary PML logging. See comments in
336862306a36Sopenharmony_ci	 * set_spte. But fast_page_fault is very unlikely to happen with PML
336962306a36Sopenharmony_ci	 * enabled, so we do not do this. This might result in the same GPA
337062306a36Sopenharmony_ci	 * to be logged in PML buffer again when the write really happens, and
337162306a36Sopenharmony_ci	 * eventually to be called by mark_page_dirty twice. But it's also no
337262306a36Sopenharmony_ci	 * harm. This also avoids the TLB flush needed after setting dirty bit
337362306a36Sopenharmony_ci	 * so non-PML cases won't be impacted.
337462306a36Sopenharmony_ci	 *
337562306a36Sopenharmony_ci	 * Compare with set_spte where instead shadow_dirty_mask is set.
337662306a36Sopenharmony_ci	 */
337762306a36Sopenharmony_ci	if (!try_cmpxchg64(sptep, &old_spte, new_spte))
337862306a36Sopenharmony_ci		return false;
337962306a36Sopenharmony_ci
338062306a36Sopenharmony_ci	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
338162306a36Sopenharmony_ci		mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
338262306a36Sopenharmony_ci
338362306a36Sopenharmony_ci	return true;
338462306a36Sopenharmony_ci}
338562306a36Sopenharmony_ci
338662306a36Sopenharmony_cistatic bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
338762306a36Sopenharmony_ci{
338862306a36Sopenharmony_ci	if (fault->exec)
338962306a36Sopenharmony_ci		return is_executable_pte(spte);
339062306a36Sopenharmony_ci
339162306a36Sopenharmony_ci	if (fault->write)
339262306a36Sopenharmony_ci		return is_writable_pte(spte);
339362306a36Sopenharmony_ci
339462306a36Sopenharmony_ci	/* Fault was on Read access */
339562306a36Sopenharmony_ci	return spte & PT_PRESENT_MASK;
339662306a36Sopenharmony_ci}
339762306a36Sopenharmony_ci
339862306a36Sopenharmony_ci/*
339962306a36Sopenharmony_ci * Returns the last level spte pointer of the shadow page walk for the given
340062306a36Sopenharmony_ci * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
340162306a36Sopenharmony_ci * walk could be performed, returns NULL and *spte does not contain valid data.
340262306a36Sopenharmony_ci *
340362306a36Sopenharmony_ci * Contract:
340462306a36Sopenharmony_ci *  - Must be called between walk_shadow_page_lockless_{begin,end}.
340562306a36Sopenharmony_ci *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
340662306a36Sopenharmony_ci */
340762306a36Sopenharmony_cistatic u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
340862306a36Sopenharmony_ci{
340962306a36Sopenharmony_ci	struct kvm_shadow_walk_iterator iterator;
341062306a36Sopenharmony_ci	u64 old_spte;
341162306a36Sopenharmony_ci	u64 *sptep = NULL;
341262306a36Sopenharmony_ci
341362306a36Sopenharmony_ci	for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
341462306a36Sopenharmony_ci		sptep = iterator.sptep;
341562306a36Sopenharmony_ci		*spte = old_spte;
341662306a36Sopenharmony_ci	}
341762306a36Sopenharmony_ci
341862306a36Sopenharmony_ci	return sptep;
341962306a36Sopenharmony_ci}
342062306a36Sopenharmony_ci
342162306a36Sopenharmony_ci/*
342262306a36Sopenharmony_ci * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
342362306a36Sopenharmony_ci */
342462306a36Sopenharmony_cistatic int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
342562306a36Sopenharmony_ci{
342662306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
342762306a36Sopenharmony_ci	int ret = RET_PF_INVALID;
342862306a36Sopenharmony_ci	u64 spte = 0ull;
342962306a36Sopenharmony_ci	u64 *sptep = NULL;
343062306a36Sopenharmony_ci	uint retry_count = 0;
343162306a36Sopenharmony_ci
343262306a36Sopenharmony_ci	if (!page_fault_can_be_fast(fault))
343362306a36Sopenharmony_ci		return ret;
343462306a36Sopenharmony_ci
343562306a36Sopenharmony_ci	walk_shadow_page_lockless_begin(vcpu);
343662306a36Sopenharmony_ci
343762306a36Sopenharmony_ci	do {
343862306a36Sopenharmony_ci		u64 new_spte;
343962306a36Sopenharmony_ci
344062306a36Sopenharmony_ci		if (tdp_mmu_enabled)
344162306a36Sopenharmony_ci			sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
344262306a36Sopenharmony_ci		else
344362306a36Sopenharmony_ci			sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
344462306a36Sopenharmony_ci
344562306a36Sopenharmony_ci		if (!is_shadow_present_pte(spte))
344662306a36Sopenharmony_ci			break;
344762306a36Sopenharmony_ci
344862306a36Sopenharmony_ci		sp = sptep_to_sp(sptep);
344962306a36Sopenharmony_ci		if (!is_last_spte(spte, sp->role.level))
345062306a36Sopenharmony_ci			break;
345162306a36Sopenharmony_ci
345262306a36Sopenharmony_ci		/*
345362306a36Sopenharmony_ci		 * Check whether the memory access that caused the fault would
345462306a36Sopenharmony_ci		 * still cause it if it were to be performed right now. If not,
345562306a36Sopenharmony_ci		 * then this is a spurious fault caused by TLB lazily flushed,
345662306a36Sopenharmony_ci		 * or some other CPU has already fixed the PTE after the
345762306a36Sopenharmony_ci		 * current CPU took the fault.
345862306a36Sopenharmony_ci		 *
345962306a36Sopenharmony_ci		 * Need not check the access of upper level table entries since
346062306a36Sopenharmony_ci		 * they are always ACC_ALL.
346162306a36Sopenharmony_ci		 */
346262306a36Sopenharmony_ci		if (is_access_allowed(fault, spte)) {
346362306a36Sopenharmony_ci			ret = RET_PF_SPURIOUS;
346462306a36Sopenharmony_ci			break;
346562306a36Sopenharmony_ci		}
346662306a36Sopenharmony_ci
346762306a36Sopenharmony_ci		new_spte = spte;
346862306a36Sopenharmony_ci
346962306a36Sopenharmony_ci		/*
347062306a36Sopenharmony_ci		 * KVM only supports fixing page faults outside of MMU lock for
347162306a36Sopenharmony_ci		 * direct MMUs, nested MMUs are always indirect, and KVM always
347262306a36Sopenharmony_ci		 * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
347362306a36Sopenharmony_ci		 * enabled, the SPTE can't be an access-tracked SPTE.
347462306a36Sopenharmony_ci		 */
347562306a36Sopenharmony_ci		if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
347662306a36Sopenharmony_ci			new_spte = restore_acc_track_spte(new_spte);
347762306a36Sopenharmony_ci
347862306a36Sopenharmony_ci		/*
347962306a36Sopenharmony_ci		 * To keep things simple, only SPTEs that are MMU-writable can
348062306a36Sopenharmony_ci		 * be made fully writable outside of mmu_lock, e.g. only SPTEs
348162306a36Sopenharmony_ci		 * that were write-protected for dirty-logging or access
348262306a36Sopenharmony_ci		 * tracking are handled here.  Don't bother checking if the
348362306a36Sopenharmony_ci		 * SPTE is writable to prioritize running with A/D bits enabled.
348462306a36Sopenharmony_ci		 * The is_access_allowed() check above handles the common case
348562306a36Sopenharmony_ci		 * of the fault being spurious, and the SPTE is known to be
348662306a36Sopenharmony_ci		 * shadow-present, i.e. except for access tracking restoration
348762306a36Sopenharmony_ci		 * making the new SPTE writable, the check is wasteful.
348862306a36Sopenharmony_ci		 */
348962306a36Sopenharmony_ci		if (fault->write && is_mmu_writable_spte(spte)) {
349062306a36Sopenharmony_ci			new_spte |= PT_WRITABLE_MASK;
349162306a36Sopenharmony_ci
349262306a36Sopenharmony_ci			/*
349362306a36Sopenharmony_ci			 * Do not fix write-permission on the large spte when
349462306a36Sopenharmony_ci			 * dirty logging is enabled. Since we only dirty the
349562306a36Sopenharmony_ci			 * first page into the dirty-bitmap in
349662306a36Sopenharmony_ci			 * fast_pf_fix_direct_spte(), other pages are missed
349762306a36Sopenharmony_ci			 * if its slot has dirty logging enabled.
349862306a36Sopenharmony_ci			 *
349962306a36Sopenharmony_ci			 * Instead, we let the slow page fault path create a
350062306a36Sopenharmony_ci			 * normal spte to fix the access.
350162306a36Sopenharmony_ci			 */
350262306a36Sopenharmony_ci			if (sp->role.level > PG_LEVEL_4K &&
350362306a36Sopenharmony_ci			    kvm_slot_dirty_track_enabled(fault->slot))
350462306a36Sopenharmony_ci				break;
350562306a36Sopenharmony_ci		}
350662306a36Sopenharmony_ci
350762306a36Sopenharmony_ci		/* Verify that the fault can be handled in the fast path */
350862306a36Sopenharmony_ci		if (new_spte == spte ||
350962306a36Sopenharmony_ci		    !is_access_allowed(fault, new_spte))
351062306a36Sopenharmony_ci			break;
351162306a36Sopenharmony_ci
351262306a36Sopenharmony_ci		/*
351362306a36Sopenharmony_ci		 * Currently, fast page fault only works for direct mapping
351462306a36Sopenharmony_ci		 * since the gfn is not stable for indirect shadow page. See
351562306a36Sopenharmony_ci		 * Documentation/virt/kvm/locking.rst to get more detail.
351662306a36Sopenharmony_ci		 */
351762306a36Sopenharmony_ci		if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
351862306a36Sopenharmony_ci			ret = RET_PF_FIXED;
351962306a36Sopenharmony_ci			break;
352062306a36Sopenharmony_ci		}
352162306a36Sopenharmony_ci
352262306a36Sopenharmony_ci		if (++retry_count > 4) {
352362306a36Sopenharmony_ci			pr_warn_once("Fast #PF retrying more than 4 times.\n");
352462306a36Sopenharmony_ci			break;
352562306a36Sopenharmony_ci		}
352662306a36Sopenharmony_ci
352762306a36Sopenharmony_ci	} while (true);
352862306a36Sopenharmony_ci
352962306a36Sopenharmony_ci	trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
353062306a36Sopenharmony_ci	walk_shadow_page_lockless_end(vcpu);
353162306a36Sopenharmony_ci
353262306a36Sopenharmony_ci	if (ret != RET_PF_INVALID)
353362306a36Sopenharmony_ci		vcpu->stat.pf_fast++;
353462306a36Sopenharmony_ci
353562306a36Sopenharmony_ci	return ret;
353662306a36Sopenharmony_ci}
353762306a36Sopenharmony_ci
353862306a36Sopenharmony_cistatic void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
353962306a36Sopenharmony_ci			       struct list_head *invalid_list)
354062306a36Sopenharmony_ci{
354162306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
354262306a36Sopenharmony_ci
354362306a36Sopenharmony_ci	if (!VALID_PAGE(*root_hpa))
354462306a36Sopenharmony_ci		return;
354562306a36Sopenharmony_ci
354662306a36Sopenharmony_ci	sp = root_to_sp(*root_hpa);
354762306a36Sopenharmony_ci	if (WARN_ON_ONCE(!sp))
354862306a36Sopenharmony_ci		return;
354962306a36Sopenharmony_ci
355062306a36Sopenharmony_ci	if (is_tdp_mmu_page(sp))
355162306a36Sopenharmony_ci		kvm_tdp_mmu_put_root(kvm, sp, false);
355262306a36Sopenharmony_ci	else if (!--sp->root_count && sp->role.invalid)
355362306a36Sopenharmony_ci		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
355462306a36Sopenharmony_ci
355562306a36Sopenharmony_ci	*root_hpa = INVALID_PAGE;
355662306a36Sopenharmony_ci}
355762306a36Sopenharmony_ci
355862306a36Sopenharmony_ci/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
355962306a36Sopenharmony_civoid kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
356062306a36Sopenharmony_ci			ulong roots_to_free)
356162306a36Sopenharmony_ci{
356262306a36Sopenharmony_ci	int i;
356362306a36Sopenharmony_ci	LIST_HEAD(invalid_list);
356462306a36Sopenharmony_ci	bool free_active_root;
356562306a36Sopenharmony_ci
356662306a36Sopenharmony_ci	WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
356762306a36Sopenharmony_ci
356862306a36Sopenharmony_ci	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
356962306a36Sopenharmony_ci
357062306a36Sopenharmony_ci	/* Before acquiring the MMU lock, see if we need to do any real work. */
357162306a36Sopenharmony_ci	free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
357262306a36Sopenharmony_ci		&& VALID_PAGE(mmu->root.hpa);
357362306a36Sopenharmony_ci
357462306a36Sopenharmony_ci	if (!free_active_root) {
357562306a36Sopenharmony_ci		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
357662306a36Sopenharmony_ci			if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
357762306a36Sopenharmony_ci			    VALID_PAGE(mmu->prev_roots[i].hpa))
357862306a36Sopenharmony_ci				break;
357962306a36Sopenharmony_ci
358062306a36Sopenharmony_ci		if (i == KVM_MMU_NUM_PREV_ROOTS)
358162306a36Sopenharmony_ci			return;
358262306a36Sopenharmony_ci	}
358362306a36Sopenharmony_ci
358462306a36Sopenharmony_ci	write_lock(&kvm->mmu_lock);
358562306a36Sopenharmony_ci
358662306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
358762306a36Sopenharmony_ci		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
358862306a36Sopenharmony_ci			mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
358962306a36Sopenharmony_ci					   &invalid_list);
359062306a36Sopenharmony_ci
359162306a36Sopenharmony_ci	if (free_active_root) {
359262306a36Sopenharmony_ci		if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
359362306a36Sopenharmony_ci			/* Nothing to cleanup for dummy roots. */
359462306a36Sopenharmony_ci		} else if (root_to_sp(mmu->root.hpa)) {
359562306a36Sopenharmony_ci			mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
359662306a36Sopenharmony_ci		} else if (mmu->pae_root) {
359762306a36Sopenharmony_ci			for (i = 0; i < 4; ++i) {
359862306a36Sopenharmony_ci				if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
359962306a36Sopenharmony_ci					continue;
360062306a36Sopenharmony_ci
360162306a36Sopenharmony_ci				mmu_free_root_page(kvm, &mmu->pae_root[i],
360262306a36Sopenharmony_ci						   &invalid_list);
360362306a36Sopenharmony_ci				mmu->pae_root[i] = INVALID_PAE_ROOT;
360462306a36Sopenharmony_ci			}
360562306a36Sopenharmony_ci		}
360662306a36Sopenharmony_ci		mmu->root.hpa = INVALID_PAGE;
360762306a36Sopenharmony_ci		mmu->root.pgd = 0;
360862306a36Sopenharmony_ci	}
360962306a36Sopenharmony_ci
361062306a36Sopenharmony_ci	kvm_mmu_commit_zap_page(kvm, &invalid_list);
361162306a36Sopenharmony_ci	write_unlock(&kvm->mmu_lock);
361262306a36Sopenharmony_ci}
361362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
361462306a36Sopenharmony_ci
361562306a36Sopenharmony_civoid kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
361662306a36Sopenharmony_ci{
361762306a36Sopenharmony_ci	unsigned long roots_to_free = 0;
361862306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
361962306a36Sopenharmony_ci	hpa_t root_hpa;
362062306a36Sopenharmony_ci	int i;
362162306a36Sopenharmony_ci
362262306a36Sopenharmony_ci	/*
362362306a36Sopenharmony_ci	 * This should not be called while L2 is active, L2 can't invalidate
362462306a36Sopenharmony_ci	 * _only_ its own roots, e.g. INVVPID unconditionally exits.
362562306a36Sopenharmony_ci	 */
362662306a36Sopenharmony_ci	WARN_ON_ONCE(mmu->root_role.guest_mode);
362762306a36Sopenharmony_ci
362862306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
362962306a36Sopenharmony_ci		root_hpa = mmu->prev_roots[i].hpa;
363062306a36Sopenharmony_ci		if (!VALID_PAGE(root_hpa))
363162306a36Sopenharmony_ci			continue;
363262306a36Sopenharmony_ci
363362306a36Sopenharmony_ci		sp = root_to_sp(root_hpa);
363462306a36Sopenharmony_ci		if (!sp || sp->role.guest_mode)
363562306a36Sopenharmony_ci			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
363662306a36Sopenharmony_ci	}
363762306a36Sopenharmony_ci
363862306a36Sopenharmony_ci	kvm_mmu_free_roots(kvm, mmu, roots_to_free);
363962306a36Sopenharmony_ci}
364062306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
364162306a36Sopenharmony_ci
364262306a36Sopenharmony_cistatic hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
364362306a36Sopenharmony_ci			    u8 level)
364462306a36Sopenharmony_ci{
364562306a36Sopenharmony_ci	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
364662306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
364762306a36Sopenharmony_ci
364862306a36Sopenharmony_ci	role.level = level;
364962306a36Sopenharmony_ci	role.quadrant = quadrant;
365062306a36Sopenharmony_ci
365162306a36Sopenharmony_ci	WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
365262306a36Sopenharmony_ci	WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
365362306a36Sopenharmony_ci
365462306a36Sopenharmony_ci	sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
365562306a36Sopenharmony_ci	++sp->root_count;
365662306a36Sopenharmony_ci
365762306a36Sopenharmony_ci	return __pa(sp->spt);
365862306a36Sopenharmony_ci}
365962306a36Sopenharmony_ci
366062306a36Sopenharmony_cistatic int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
366162306a36Sopenharmony_ci{
366262306a36Sopenharmony_ci	struct kvm_mmu *mmu = vcpu->arch.mmu;
366362306a36Sopenharmony_ci	u8 shadow_root_level = mmu->root_role.level;
366462306a36Sopenharmony_ci	hpa_t root;
366562306a36Sopenharmony_ci	unsigned i;
366662306a36Sopenharmony_ci	int r;
366762306a36Sopenharmony_ci
366862306a36Sopenharmony_ci	write_lock(&vcpu->kvm->mmu_lock);
366962306a36Sopenharmony_ci	r = make_mmu_pages_available(vcpu);
367062306a36Sopenharmony_ci	if (r < 0)
367162306a36Sopenharmony_ci		goto out_unlock;
367262306a36Sopenharmony_ci
367362306a36Sopenharmony_ci	if (tdp_mmu_enabled) {
367462306a36Sopenharmony_ci		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
367562306a36Sopenharmony_ci		mmu->root.hpa = root;
367662306a36Sopenharmony_ci	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
367762306a36Sopenharmony_ci		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
367862306a36Sopenharmony_ci		mmu->root.hpa = root;
367962306a36Sopenharmony_ci	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
368062306a36Sopenharmony_ci		if (WARN_ON_ONCE(!mmu->pae_root)) {
368162306a36Sopenharmony_ci			r = -EIO;
368262306a36Sopenharmony_ci			goto out_unlock;
368362306a36Sopenharmony_ci		}
368462306a36Sopenharmony_ci
368562306a36Sopenharmony_ci		for (i = 0; i < 4; ++i) {
368662306a36Sopenharmony_ci			WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
368762306a36Sopenharmony_ci
368862306a36Sopenharmony_ci			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
368962306a36Sopenharmony_ci					      PT32_ROOT_LEVEL);
369062306a36Sopenharmony_ci			mmu->pae_root[i] = root | PT_PRESENT_MASK |
369162306a36Sopenharmony_ci					   shadow_me_value;
369262306a36Sopenharmony_ci		}
369362306a36Sopenharmony_ci		mmu->root.hpa = __pa(mmu->pae_root);
369462306a36Sopenharmony_ci	} else {
369562306a36Sopenharmony_ci		WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
369662306a36Sopenharmony_ci		r = -EIO;
369762306a36Sopenharmony_ci		goto out_unlock;
369862306a36Sopenharmony_ci	}
369962306a36Sopenharmony_ci
370062306a36Sopenharmony_ci	/* root.pgd is ignored for direct MMUs. */
370162306a36Sopenharmony_ci	mmu->root.pgd = 0;
370262306a36Sopenharmony_ciout_unlock:
370362306a36Sopenharmony_ci	write_unlock(&vcpu->kvm->mmu_lock);
370462306a36Sopenharmony_ci	return r;
370562306a36Sopenharmony_ci}
370662306a36Sopenharmony_ci
370762306a36Sopenharmony_cistatic int mmu_first_shadow_root_alloc(struct kvm *kvm)
370862306a36Sopenharmony_ci{
370962306a36Sopenharmony_ci	struct kvm_memslots *slots;
371062306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
371162306a36Sopenharmony_ci	int r = 0, i, bkt;
371262306a36Sopenharmony_ci
371362306a36Sopenharmony_ci	/*
371462306a36Sopenharmony_ci	 * Check if this is the first shadow root being allocated before
371562306a36Sopenharmony_ci	 * taking the lock.
371662306a36Sopenharmony_ci	 */
371762306a36Sopenharmony_ci	if (kvm_shadow_root_allocated(kvm))
371862306a36Sopenharmony_ci		return 0;
371962306a36Sopenharmony_ci
372062306a36Sopenharmony_ci	mutex_lock(&kvm->slots_arch_lock);
372162306a36Sopenharmony_ci
372262306a36Sopenharmony_ci	/* Recheck, under the lock, whether this is the first shadow root. */
372362306a36Sopenharmony_ci	if (kvm_shadow_root_allocated(kvm))
372462306a36Sopenharmony_ci		goto out_unlock;
372562306a36Sopenharmony_ci
372662306a36Sopenharmony_ci	/*
372762306a36Sopenharmony_ci	 * Check if anything actually needs to be allocated, e.g. all metadata
372862306a36Sopenharmony_ci	 * will be allocated upfront if TDP is disabled.
372962306a36Sopenharmony_ci	 */
373062306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm) &&
373162306a36Sopenharmony_ci	    kvm_page_track_write_tracking_enabled(kvm))
373262306a36Sopenharmony_ci		goto out_success;
373362306a36Sopenharmony_ci
373462306a36Sopenharmony_ci	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
373562306a36Sopenharmony_ci		slots = __kvm_memslots(kvm, i);
373662306a36Sopenharmony_ci		kvm_for_each_memslot(slot, bkt, slots) {
373762306a36Sopenharmony_ci			/*
373862306a36Sopenharmony_ci			 * Both of these functions are no-ops if the target is
373962306a36Sopenharmony_ci			 * already allocated, so unconditionally calling both
374062306a36Sopenharmony_ci			 * is safe.  Intentionally do NOT free allocations on
374162306a36Sopenharmony_ci			 * failure to avoid having to track which allocations
374262306a36Sopenharmony_ci			 * were made now versus when the memslot was created.
374362306a36Sopenharmony_ci			 * The metadata is guaranteed to be freed when the slot
374462306a36Sopenharmony_ci			 * is freed, and will be kept/used if userspace retries
374562306a36Sopenharmony_ci			 * KVM_RUN instead of killing the VM.
374662306a36Sopenharmony_ci			 */
374762306a36Sopenharmony_ci			r = memslot_rmap_alloc(slot, slot->npages);
374862306a36Sopenharmony_ci			if (r)
374962306a36Sopenharmony_ci				goto out_unlock;
375062306a36Sopenharmony_ci			r = kvm_page_track_write_tracking_alloc(slot);
375162306a36Sopenharmony_ci			if (r)
375262306a36Sopenharmony_ci				goto out_unlock;
375362306a36Sopenharmony_ci		}
375462306a36Sopenharmony_ci	}
375562306a36Sopenharmony_ci
375662306a36Sopenharmony_ci	/*
375762306a36Sopenharmony_ci	 * Ensure that shadow_root_allocated becomes true strictly after
375862306a36Sopenharmony_ci	 * all the related pointers are set.
375962306a36Sopenharmony_ci	 */
376062306a36Sopenharmony_ciout_success:
376162306a36Sopenharmony_ci	smp_store_release(&kvm->arch.shadow_root_allocated, true);
376262306a36Sopenharmony_ci
376362306a36Sopenharmony_ciout_unlock:
376462306a36Sopenharmony_ci	mutex_unlock(&kvm->slots_arch_lock);
376562306a36Sopenharmony_ci	return r;
376662306a36Sopenharmony_ci}
376762306a36Sopenharmony_ci
376862306a36Sopenharmony_cistatic int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
376962306a36Sopenharmony_ci{
377062306a36Sopenharmony_ci	struct kvm_mmu *mmu = vcpu->arch.mmu;
377162306a36Sopenharmony_ci	u64 pdptrs[4], pm_mask;
377262306a36Sopenharmony_ci	gfn_t root_gfn, root_pgd;
377362306a36Sopenharmony_ci	int quadrant, i, r;
377462306a36Sopenharmony_ci	hpa_t root;
377562306a36Sopenharmony_ci
377662306a36Sopenharmony_ci	root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
377762306a36Sopenharmony_ci	root_gfn = root_pgd >> PAGE_SHIFT;
377862306a36Sopenharmony_ci
377962306a36Sopenharmony_ci	if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
378062306a36Sopenharmony_ci		mmu->root.hpa = kvm_mmu_get_dummy_root();
378162306a36Sopenharmony_ci		return 0;
378262306a36Sopenharmony_ci	}
378362306a36Sopenharmony_ci
378462306a36Sopenharmony_ci	/*
378562306a36Sopenharmony_ci	 * On SVM, reading PDPTRs might access guest memory, which might fault
378662306a36Sopenharmony_ci	 * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
378762306a36Sopenharmony_ci	 */
378862306a36Sopenharmony_ci	if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
378962306a36Sopenharmony_ci		for (i = 0; i < 4; ++i) {
379062306a36Sopenharmony_ci			pdptrs[i] = mmu->get_pdptr(vcpu, i);
379162306a36Sopenharmony_ci			if (!(pdptrs[i] & PT_PRESENT_MASK))
379262306a36Sopenharmony_ci				continue;
379362306a36Sopenharmony_ci
379462306a36Sopenharmony_ci			if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
379562306a36Sopenharmony_ci				pdptrs[i] = 0;
379662306a36Sopenharmony_ci		}
379762306a36Sopenharmony_ci	}
379862306a36Sopenharmony_ci
379962306a36Sopenharmony_ci	r = mmu_first_shadow_root_alloc(vcpu->kvm);
380062306a36Sopenharmony_ci	if (r)
380162306a36Sopenharmony_ci		return r;
380262306a36Sopenharmony_ci
380362306a36Sopenharmony_ci	write_lock(&vcpu->kvm->mmu_lock);
380462306a36Sopenharmony_ci	r = make_mmu_pages_available(vcpu);
380562306a36Sopenharmony_ci	if (r < 0)
380662306a36Sopenharmony_ci		goto out_unlock;
380762306a36Sopenharmony_ci
380862306a36Sopenharmony_ci	/*
380962306a36Sopenharmony_ci	 * Do we shadow a long mode page table? If so we need to
381062306a36Sopenharmony_ci	 * write-protect the guests page table root.
381162306a36Sopenharmony_ci	 */
381262306a36Sopenharmony_ci	if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
381362306a36Sopenharmony_ci		root = mmu_alloc_root(vcpu, root_gfn, 0,
381462306a36Sopenharmony_ci				      mmu->root_role.level);
381562306a36Sopenharmony_ci		mmu->root.hpa = root;
381662306a36Sopenharmony_ci		goto set_root_pgd;
381762306a36Sopenharmony_ci	}
381862306a36Sopenharmony_ci
381962306a36Sopenharmony_ci	if (WARN_ON_ONCE(!mmu->pae_root)) {
382062306a36Sopenharmony_ci		r = -EIO;
382162306a36Sopenharmony_ci		goto out_unlock;
382262306a36Sopenharmony_ci	}
382362306a36Sopenharmony_ci
382462306a36Sopenharmony_ci	/*
382562306a36Sopenharmony_ci	 * We shadow a 32 bit page table. This may be a legacy 2-level
382662306a36Sopenharmony_ci	 * or a PAE 3-level page table. In either case we need to be aware that
382762306a36Sopenharmony_ci	 * the shadow page table may be a PAE or a long mode page table.
382862306a36Sopenharmony_ci	 */
382962306a36Sopenharmony_ci	pm_mask = PT_PRESENT_MASK | shadow_me_value;
383062306a36Sopenharmony_ci	if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
383162306a36Sopenharmony_ci		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
383262306a36Sopenharmony_ci
383362306a36Sopenharmony_ci		if (WARN_ON_ONCE(!mmu->pml4_root)) {
383462306a36Sopenharmony_ci			r = -EIO;
383562306a36Sopenharmony_ci			goto out_unlock;
383662306a36Sopenharmony_ci		}
383762306a36Sopenharmony_ci		mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
383862306a36Sopenharmony_ci
383962306a36Sopenharmony_ci		if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
384062306a36Sopenharmony_ci			if (WARN_ON_ONCE(!mmu->pml5_root)) {
384162306a36Sopenharmony_ci				r = -EIO;
384262306a36Sopenharmony_ci				goto out_unlock;
384362306a36Sopenharmony_ci			}
384462306a36Sopenharmony_ci			mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
384562306a36Sopenharmony_ci		}
384662306a36Sopenharmony_ci	}
384762306a36Sopenharmony_ci
384862306a36Sopenharmony_ci	for (i = 0; i < 4; ++i) {
384962306a36Sopenharmony_ci		WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
385062306a36Sopenharmony_ci
385162306a36Sopenharmony_ci		if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
385262306a36Sopenharmony_ci			if (!(pdptrs[i] & PT_PRESENT_MASK)) {
385362306a36Sopenharmony_ci				mmu->pae_root[i] = INVALID_PAE_ROOT;
385462306a36Sopenharmony_ci				continue;
385562306a36Sopenharmony_ci			}
385662306a36Sopenharmony_ci			root_gfn = pdptrs[i] >> PAGE_SHIFT;
385762306a36Sopenharmony_ci		}
385862306a36Sopenharmony_ci
385962306a36Sopenharmony_ci		/*
386062306a36Sopenharmony_ci		 * If shadowing 32-bit non-PAE page tables, each PAE page
386162306a36Sopenharmony_ci		 * directory maps one quarter of the guest's non-PAE page
386262306a36Sopenharmony_ci		 * directory. Othwerise each PAE page direct shadows one guest
386362306a36Sopenharmony_ci		 * PAE page directory so that quadrant should be 0.
386462306a36Sopenharmony_ci		 */
386562306a36Sopenharmony_ci		quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
386662306a36Sopenharmony_ci
386762306a36Sopenharmony_ci		root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
386862306a36Sopenharmony_ci		mmu->pae_root[i] = root | pm_mask;
386962306a36Sopenharmony_ci	}
387062306a36Sopenharmony_ci
387162306a36Sopenharmony_ci	if (mmu->root_role.level == PT64_ROOT_5LEVEL)
387262306a36Sopenharmony_ci		mmu->root.hpa = __pa(mmu->pml5_root);
387362306a36Sopenharmony_ci	else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
387462306a36Sopenharmony_ci		mmu->root.hpa = __pa(mmu->pml4_root);
387562306a36Sopenharmony_ci	else
387662306a36Sopenharmony_ci		mmu->root.hpa = __pa(mmu->pae_root);
387762306a36Sopenharmony_ci
387862306a36Sopenharmony_ciset_root_pgd:
387962306a36Sopenharmony_ci	mmu->root.pgd = root_pgd;
388062306a36Sopenharmony_ciout_unlock:
388162306a36Sopenharmony_ci	write_unlock(&vcpu->kvm->mmu_lock);
388262306a36Sopenharmony_ci
388362306a36Sopenharmony_ci	return r;
388462306a36Sopenharmony_ci}
388562306a36Sopenharmony_ci
388662306a36Sopenharmony_cistatic int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
388762306a36Sopenharmony_ci{
388862306a36Sopenharmony_ci	struct kvm_mmu *mmu = vcpu->arch.mmu;
388962306a36Sopenharmony_ci	bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
389062306a36Sopenharmony_ci	u64 *pml5_root = NULL;
389162306a36Sopenharmony_ci	u64 *pml4_root = NULL;
389262306a36Sopenharmony_ci	u64 *pae_root;
389362306a36Sopenharmony_ci
389462306a36Sopenharmony_ci	/*
389562306a36Sopenharmony_ci	 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
389662306a36Sopenharmony_ci	 * tables are allocated and initialized at root creation as there is no
389762306a36Sopenharmony_ci	 * equivalent level in the guest's NPT to shadow.  Allocate the tables
389862306a36Sopenharmony_ci	 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
389962306a36Sopenharmony_ci	 */
390062306a36Sopenharmony_ci	if (mmu->root_role.direct ||
390162306a36Sopenharmony_ci	    mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
390262306a36Sopenharmony_ci	    mmu->root_role.level < PT64_ROOT_4LEVEL)
390362306a36Sopenharmony_ci		return 0;
390462306a36Sopenharmony_ci
390562306a36Sopenharmony_ci	/*
390662306a36Sopenharmony_ci	 * NPT, the only paging mode that uses this horror, uses a fixed number
390762306a36Sopenharmony_ci	 * of levels for the shadow page tables, e.g. all MMUs are 4-level or
390862306a36Sopenharmony_ci	 * all MMus are 5-level.  Thus, this can safely require that pml5_root
390962306a36Sopenharmony_ci	 * is allocated if the other roots are valid and pml5 is needed, as any
391062306a36Sopenharmony_ci	 * prior MMU would also have required pml5.
391162306a36Sopenharmony_ci	 */
391262306a36Sopenharmony_ci	if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
391362306a36Sopenharmony_ci		return 0;
391462306a36Sopenharmony_ci
391562306a36Sopenharmony_ci	/*
391662306a36Sopenharmony_ci	 * The special roots should always be allocated in concert.  Yell and
391762306a36Sopenharmony_ci	 * bail if KVM ends up in a state where only one of the roots is valid.
391862306a36Sopenharmony_ci	 */
391962306a36Sopenharmony_ci	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
392062306a36Sopenharmony_ci			 (need_pml5 && mmu->pml5_root)))
392162306a36Sopenharmony_ci		return -EIO;
392262306a36Sopenharmony_ci
392362306a36Sopenharmony_ci	/*
392462306a36Sopenharmony_ci	 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
392562306a36Sopenharmony_ci	 * doesn't need to be decrypted.
392662306a36Sopenharmony_ci	 */
392762306a36Sopenharmony_ci	pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
392862306a36Sopenharmony_ci	if (!pae_root)
392962306a36Sopenharmony_ci		return -ENOMEM;
393062306a36Sopenharmony_ci
393162306a36Sopenharmony_ci#ifdef CONFIG_X86_64
393262306a36Sopenharmony_ci	pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
393362306a36Sopenharmony_ci	if (!pml4_root)
393462306a36Sopenharmony_ci		goto err_pml4;
393562306a36Sopenharmony_ci
393662306a36Sopenharmony_ci	if (need_pml5) {
393762306a36Sopenharmony_ci		pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
393862306a36Sopenharmony_ci		if (!pml5_root)
393962306a36Sopenharmony_ci			goto err_pml5;
394062306a36Sopenharmony_ci	}
394162306a36Sopenharmony_ci#endif
394262306a36Sopenharmony_ci
394362306a36Sopenharmony_ci	mmu->pae_root = pae_root;
394462306a36Sopenharmony_ci	mmu->pml4_root = pml4_root;
394562306a36Sopenharmony_ci	mmu->pml5_root = pml5_root;
394662306a36Sopenharmony_ci
394762306a36Sopenharmony_ci	return 0;
394862306a36Sopenharmony_ci
394962306a36Sopenharmony_ci#ifdef CONFIG_X86_64
395062306a36Sopenharmony_cierr_pml5:
395162306a36Sopenharmony_ci	free_page((unsigned long)pml4_root);
395262306a36Sopenharmony_cierr_pml4:
395362306a36Sopenharmony_ci	free_page((unsigned long)pae_root);
395462306a36Sopenharmony_ci	return -ENOMEM;
395562306a36Sopenharmony_ci#endif
395662306a36Sopenharmony_ci}
395762306a36Sopenharmony_ci
395862306a36Sopenharmony_cistatic bool is_unsync_root(hpa_t root)
395962306a36Sopenharmony_ci{
396062306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
396162306a36Sopenharmony_ci
396262306a36Sopenharmony_ci	if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
396362306a36Sopenharmony_ci		return false;
396462306a36Sopenharmony_ci
396562306a36Sopenharmony_ci	/*
396662306a36Sopenharmony_ci	 * The read barrier orders the CPU's read of SPTE.W during the page table
396762306a36Sopenharmony_ci	 * walk before the reads of sp->unsync/sp->unsync_children here.
396862306a36Sopenharmony_ci	 *
396962306a36Sopenharmony_ci	 * Even if another CPU was marking the SP as unsync-ed simultaneously,
397062306a36Sopenharmony_ci	 * any guest page table changes are not guaranteed to be visible anyway
397162306a36Sopenharmony_ci	 * until this VCPU issues a TLB flush strictly after those changes are
397262306a36Sopenharmony_ci	 * made.  We only need to ensure that the other CPU sets these flags
397362306a36Sopenharmony_ci	 * before any actual changes to the page tables are made.  The comments
397462306a36Sopenharmony_ci	 * in mmu_try_to_unsync_pages() describe what could go wrong if this
397562306a36Sopenharmony_ci	 * requirement isn't satisfied.
397662306a36Sopenharmony_ci	 */
397762306a36Sopenharmony_ci	smp_rmb();
397862306a36Sopenharmony_ci	sp = root_to_sp(root);
397962306a36Sopenharmony_ci
398062306a36Sopenharmony_ci	/*
398162306a36Sopenharmony_ci	 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
398262306a36Sopenharmony_ci	 * PDPTEs for a given PAE root need to be synchronized individually.
398362306a36Sopenharmony_ci	 */
398462306a36Sopenharmony_ci	if (WARN_ON_ONCE(!sp))
398562306a36Sopenharmony_ci		return false;
398662306a36Sopenharmony_ci
398762306a36Sopenharmony_ci	if (sp->unsync || sp->unsync_children)
398862306a36Sopenharmony_ci		return true;
398962306a36Sopenharmony_ci
399062306a36Sopenharmony_ci	return false;
399162306a36Sopenharmony_ci}
399262306a36Sopenharmony_ci
399362306a36Sopenharmony_civoid kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
399462306a36Sopenharmony_ci{
399562306a36Sopenharmony_ci	int i;
399662306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
399762306a36Sopenharmony_ci
399862306a36Sopenharmony_ci	if (vcpu->arch.mmu->root_role.direct)
399962306a36Sopenharmony_ci		return;
400062306a36Sopenharmony_ci
400162306a36Sopenharmony_ci	if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
400262306a36Sopenharmony_ci		return;
400362306a36Sopenharmony_ci
400462306a36Sopenharmony_ci	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
400562306a36Sopenharmony_ci
400662306a36Sopenharmony_ci	if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
400762306a36Sopenharmony_ci		hpa_t root = vcpu->arch.mmu->root.hpa;
400862306a36Sopenharmony_ci
400962306a36Sopenharmony_ci		if (!is_unsync_root(root))
401062306a36Sopenharmony_ci			return;
401162306a36Sopenharmony_ci
401262306a36Sopenharmony_ci		sp = root_to_sp(root);
401362306a36Sopenharmony_ci
401462306a36Sopenharmony_ci		write_lock(&vcpu->kvm->mmu_lock);
401562306a36Sopenharmony_ci		mmu_sync_children(vcpu, sp, true);
401662306a36Sopenharmony_ci		write_unlock(&vcpu->kvm->mmu_lock);
401762306a36Sopenharmony_ci		return;
401862306a36Sopenharmony_ci	}
401962306a36Sopenharmony_ci
402062306a36Sopenharmony_ci	write_lock(&vcpu->kvm->mmu_lock);
402162306a36Sopenharmony_ci
402262306a36Sopenharmony_ci	for (i = 0; i < 4; ++i) {
402362306a36Sopenharmony_ci		hpa_t root = vcpu->arch.mmu->pae_root[i];
402462306a36Sopenharmony_ci
402562306a36Sopenharmony_ci		if (IS_VALID_PAE_ROOT(root)) {
402662306a36Sopenharmony_ci			sp = spte_to_child_sp(root);
402762306a36Sopenharmony_ci			mmu_sync_children(vcpu, sp, true);
402862306a36Sopenharmony_ci		}
402962306a36Sopenharmony_ci	}
403062306a36Sopenharmony_ci
403162306a36Sopenharmony_ci	write_unlock(&vcpu->kvm->mmu_lock);
403262306a36Sopenharmony_ci}
403362306a36Sopenharmony_ci
403462306a36Sopenharmony_civoid kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
403562306a36Sopenharmony_ci{
403662306a36Sopenharmony_ci	unsigned long roots_to_free = 0;
403762306a36Sopenharmony_ci	int i;
403862306a36Sopenharmony_ci
403962306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
404062306a36Sopenharmony_ci		if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
404162306a36Sopenharmony_ci			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
404262306a36Sopenharmony_ci
404362306a36Sopenharmony_ci	/* sync prev_roots by simply freeing them */
404462306a36Sopenharmony_ci	kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
404562306a36Sopenharmony_ci}
404662306a36Sopenharmony_ci
404762306a36Sopenharmony_cistatic gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
404862306a36Sopenharmony_ci				  gpa_t vaddr, u64 access,
404962306a36Sopenharmony_ci				  struct x86_exception *exception)
405062306a36Sopenharmony_ci{
405162306a36Sopenharmony_ci	if (exception)
405262306a36Sopenharmony_ci		exception->error_code = 0;
405362306a36Sopenharmony_ci	return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
405462306a36Sopenharmony_ci}
405562306a36Sopenharmony_ci
405662306a36Sopenharmony_cistatic bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
405762306a36Sopenharmony_ci{
405862306a36Sopenharmony_ci	/*
405962306a36Sopenharmony_ci	 * A nested guest cannot use the MMIO cache if it is using nested
406062306a36Sopenharmony_ci	 * page tables, because cr2 is a nGPA while the cache stores GPAs.
406162306a36Sopenharmony_ci	 */
406262306a36Sopenharmony_ci	if (mmu_is_nested(vcpu))
406362306a36Sopenharmony_ci		return false;
406462306a36Sopenharmony_ci
406562306a36Sopenharmony_ci	if (direct)
406662306a36Sopenharmony_ci		return vcpu_match_mmio_gpa(vcpu, addr);
406762306a36Sopenharmony_ci
406862306a36Sopenharmony_ci	return vcpu_match_mmio_gva(vcpu, addr);
406962306a36Sopenharmony_ci}
407062306a36Sopenharmony_ci
407162306a36Sopenharmony_ci/*
407262306a36Sopenharmony_ci * Return the level of the lowest level SPTE added to sptes.
407362306a36Sopenharmony_ci * That SPTE may be non-present.
407462306a36Sopenharmony_ci *
407562306a36Sopenharmony_ci * Must be called between walk_shadow_page_lockless_{begin,end}.
407662306a36Sopenharmony_ci */
407762306a36Sopenharmony_cistatic int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
407862306a36Sopenharmony_ci{
407962306a36Sopenharmony_ci	struct kvm_shadow_walk_iterator iterator;
408062306a36Sopenharmony_ci	int leaf = -1;
408162306a36Sopenharmony_ci	u64 spte;
408262306a36Sopenharmony_ci
408362306a36Sopenharmony_ci	for (shadow_walk_init(&iterator, vcpu, addr),
408462306a36Sopenharmony_ci	     *root_level = iterator.level;
408562306a36Sopenharmony_ci	     shadow_walk_okay(&iterator);
408662306a36Sopenharmony_ci	     __shadow_walk_next(&iterator, spte)) {
408762306a36Sopenharmony_ci		leaf = iterator.level;
408862306a36Sopenharmony_ci		spte = mmu_spte_get_lockless(iterator.sptep);
408962306a36Sopenharmony_ci
409062306a36Sopenharmony_ci		sptes[leaf] = spte;
409162306a36Sopenharmony_ci	}
409262306a36Sopenharmony_ci
409362306a36Sopenharmony_ci	return leaf;
409462306a36Sopenharmony_ci}
409562306a36Sopenharmony_ci
409662306a36Sopenharmony_ci/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
409762306a36Sopenharmony_cistatic bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
409862306a36Sopenharmony_ci{
409962306a36Sopenharmony_ci	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
410062306a36Sopenharmony_ci	struct rsvd_bits_validate *rsvd_check;
410162306a36Sopenharmony_ci	int root, leaf, level;
410262306a36Sopenharmony_ci	bool reserved = false;
410362306a36Sopenharmony_ci
410462306a36Sopenharmony_ci	walk_shadow_page_lockless_begin(vcpu);
410562306a36Sopenharmony_ci
410662306a36Sopenharmony_ci	if (is_tdp_mmu_active(vcpu))
410762306a36Sopenharmony_ci		leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
410862306a36Sopenharmony_ci	else
410962306a36Sopenharmony_ci		leaf = get_walk(vcpu, addr, sptes, &root);
411062306a36Sopenharmony_ci
411162306a36Sopenharmony_ci	walk_shadow_page_lockless_end(vcpu);
411262306a36Sopenharmony_ci
411362306a36Sopenharmony_ci	if (unlikely(leaf < 0)) {
411462306a36Sopenharmony_ci		*sptep = 0ull;
411562306a36Sopenharmony_ci		return reserved;
411662306a36Sopenharmony_ci	}
411762306a36Sopenharmony_ci
411862306a36Sopenharmony_ci	*sptep = sptes[leaf];
411962306a36Sopenharmony_ci
412062306a36Sopenharmony_ci	/*
412162306a36Sopenharmony_ci	 * Skip reserved bits checks on the terminal leaf if it's not a valid
412262306a36Sopenharmony_ci	 * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
412362306a36Sopenharmony_ci	 * design, always have reserved bits set.  The purpose of the checks is
412462306a36Sopenharmony_ci	 * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs.
412562306a36Sopenharmony_ci	 */
412662306a36Sopenharmony_ci	if (!is_shadow_present_pte(sptes[leaf]))
412762306a36Sopenharmony_ci		leaf++;
412862306a36Sopenharmony_ci
412962306a36Sopenharmony_ci	rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
413062306a36Sopenharmony_ci
413162306a36Sopenharmony_ci	for (level = root; level >= leaf; level--)
413262306a36Sopenharmony_ci		reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
413362306a36Sopenharmony_ci
413462306a36Sopenharmony_ci	if (reserved) {
413562306a36Sopenharmony_ci		pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
413662306a36Sopenharmony_ci		       __func__, addr);
413762306a36Sopenharmony_ci		for (level = root; level >= leaf; level--)
413862306a36Sopenharmony_ci			pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
413962306a36Sopenharmony_ci			       sptes[level], level,
414062306a36Sopenharmony_ci			       get_rsvd_bits(rsvd_check, sptes[level], level));
414162306a36Sopenharmony_ci	}
414262306a36Sopenharmony_ci
414362306a36Sopenharmony_ci	return reserved;
414462306a36Sopenharmony_ci}
414562306a36Sopenharmony_ci
414662306a36Sopenharmony_cistatic int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
414762306a36Sopenharmony_ci{
414862306a36Sopenharmony_ci	u64 spte;
414962306a36Sopenharmony_ci	bool reserved;
415062306a36Sopenharmony_ci
415162306a36Sopenharmony_ci	if (mmio_info_in_cache(vcpu, addr, direct))
415262306a36Sopenharmony_ci		return RET_PF_EMULATE;
415362306a36Sopenharmony_ci
415462306a36Sopenharmony_ci	reserved = get_mmio_spte(vcpu, addr, &spte);
415562306a36Sopenharmony_ci	if (WARN_ON_ONCE(reserved))
415662306a36Sopenharmony_ci		return -EINVAL;
415762306a36Sopenharmony_ci
415862306a36Sopenharmony_ci	if (is_mmio_spte(spte)) {
415962306a36Sopenharmony_ci		gfn_t gfn = get_mmio_spte_gfn(spte);
416062306a36Sopenharmony_ci		unsigned int access = get_mmio_spte_access(spte);
416162306a36Sopenharmony_ci
416262306a36Sopenharmony_ci		if (!check_mmio_spte(vcpu, spte))
416362306a36Sopenharmony_ci			return RET_PF_INVALID;
416462306a36Sopenharmony_ci
416562306a36Sopenharmony_ci		if (direct)
416662306a36Sopenharmony_ci			addr = 0;
416762306a36Sopenharmony_ci
416862306a36Sopenharmony_ci		trace_handle_mmio_page_fault(addr, gfn, access);
416962306a36Sopenharmony_ci		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
417062306a36Sopenharmony_ci		return RET_PF_EMULATE;
417162306a36Sopenharmony_ci	}
417262306a36Sopenharmony_ci
417362306a36Sopenharmony_ci	/*
417462306a36Sopenharmony_ci	 * If the page table is zapped by other cpus, let CPU fault again on
417562306a36Sopenharmony_ci	 * the address.
417662306a36Sopenharmony_ci	 */
417762306a36Sopenharmony_ci	return RET_PF_RETRY;
417862306a36Sopenharmony_ci}
417962306a36Sopenharmony_ci
418062306a36Sopenharmony_cistatic bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
418162306a36Sopenharmony_ci					 struct kvm_page_fault *fault)
418262306a36Sopenharmony_ci{
418362306a36Sopenharmony_ci	if (unlikely(fault->rsvd))
418462306a36Sopenharmony_ci		return false;
418562306a36Sopenharmony_ci
418662306a36Sopenharmony_ci	if (!fault->present || !fault->write)
418762306a36Sopenharmony_ci		return false;
418862306a36Sopenharmony_ci
418962306a36Sopenharmony_ci	/*
419062306a36Sopenharmony_ci	 * guest is writing the page which is write tracked which can
419162306a36Sopenharmony_ci	 * not be fixed by page fault handler.
419262306a36Sopenharmony_ci	 */
419362306a36Sopenharmony_ci	if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
419462306a36Sopenharmony_ci		return true;
419562306a36Sopenharmony_ci
419662306a36Sopenharmony_ci	return false;
419762306a36Sopenharmony_ci}
419862306a36Sopenharmony_ci
419962306a36Sopenharmony_cistatic void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
420062306a36Sopenharmony_ci{
420162306a36Sopenharmony_ci	struct kvm_shadow_walk_iterator iterator;
420262306a36Sopenharmony_ci	u64 spte;
420362306a36Sopenharmony_ci
420462306a36Sopenharmony_ci	walk_shadow_page_lockless_begin(vcpu);
420562306a36Sopenharmony_ci	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
420662306a36Sopenharmony_ci		clear_sp_write_flooding_count(iterator.sptep);
420762306a36Sopenharmony_ci	walk_shadow_page_lockless_end(vcpu);
420862306a36Sopenharmony_ci}
420962306a36Sopenharmony_ci
421062306a36Sopenharmony_cistatic u32 alloc_apf_token(struct kvm_vcpu *vcpu)
421162306a36Sopenharmony_ci{
421262306a36Sopenharmony_ci	/* make sure the token value is not 0 */
421362306a36Sopenharmony_ci	u32 id = vcpu->arch.apf.id;
421462306a36Sopenharmony_ci
421562306a36Sopenharmony_ci	if (id << 12 == 0)
421662306a36Sopenharmony_ci		vcpu->arch.apf.id = 1;
421762306a36Sopenharmony_ci
421862306a36Sopenharmony_ci	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
421962306a36Sopenharmony_ci}
422062306a36Sopenharmony_ci
422162306a36Sopenharmony_cistatic bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
422262306a36Sopenharmony_ci				    gfn_t gfn)
422362306a36Sopenharmony_ci{
422462306a36Sopenharmony_ci	struct kvm_arch_async_pf arch;
422562306a36Sopenharmony_ci
422662306a36Sopenharmony_ci	arch.token = alloc_apf_token(vcpu);
422762306a36Sopenharmony_ci	arch.gfn = gfn;
422862306a36Sopenharmony_ci	arch.direct_map = vcpu->arch.mmu->root_role.direct;
422962306a36Sopenharmony_ci	arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
423062306a36Sopenharmony_ci
423162306a36Sopenharmony_ci	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
423262306a36Sopenharmony_ci				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
423362306a36Sopenharmony_ci}
423462306a36Sopenharmony_ci
423562306a36Sopenharmony_civoid kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
423662306a36Sopenharmony_ci{
423762306a36Sopenharmony_ci	int r;
423862306a36Sopenharmony_ci
423962306a36Sopenharmony_ci	if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
424062306a36Sopenharmony_ci	      work->wakeup_all)
424162306a36Sopenharmony_ci		return;
424262306a36Sopenharmony_ci
424362306a36Sopenharmony_ci	r = kvm_mmu_reload(vcpu);
424462306a36Sopenharmony_ci	if (unlikely(r))
424562306a36Sopenharmony_ci		return;
424662306a36Sopenharmony_ci
424762306a36Sopenharmony_ci	if (!vcpu->arch.mmu->root_role.direct &&
424862306a36Sopenharmony_ci	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
424962306a36Sopenharmony_ci		return;
425062306a36Sopenharmony_ci
425162306a36Sopenharmony_ci	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
425262306a36Sopenharmony_ci}
425362306a36Sopenharmony_ci
425462306a36Sopenharmony_cistatic int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
425562306a36Sopenharmony_ci{
425662306a36Sopenharmony_ci	struct kvm_memory_slot *slot = fault->slot;
425762306a36Sopenharmony_ci	bool async;
425862306a36Sopenharmony_ci
425962306a36Sopenharmony_ci	/*
426062306a36Sopenharmony_ci	 * Retry the page fault if the gfn hit a memslot that is being deleted
426162306a36Sopenharmony_ci	 * or moved.  This ensures any existing SPTEs for the old memslot will
426262306a36Sopenharmony_ci	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
426362306a36Sopenharmony_ci	 */
426462306a36Sopenharmony_ci	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
426562306a36Sopenharmony_ci		return RET_PF_RETRY;
426662306a36Sopenharmony_ci
426762306a36Sopenharmony_ci	if (!kvm_is_visible_memslot(slot)) {
426862306a36Sopenharmony_ci		/* Don't expose private memslots to L2. */
426962306a36Sopenharmony_ci		if (is_guest_mode(vcpu)) {
427062306a36Sopenharmony_ci			fault->slot = NULL;
427162306a36Sopenharmony_ci			fault->pfn = KVM_PFN_NOSLOT;
427262306a36Sopenharmony_ci			fault->map_writable = false;
427362306a36Sopenharmony_ci			return RET_PF_CONTINUE;
427462306a36Sopenharmony_ci		}
427562306a36Sopenharmony_ci		/*
427662306a36Sopenharmony_ci		 * If the APIC access page exists but is disabled, go directly
427762306a36Sopenharmony_ci		 * to emulation without caching the MMIO access or creating a
427862306a36Sopenharmony_ci		 * MMIO SPTE.  That way the cache doesn't need to be purged
427962306a36Sopenharmony_ci		 * when the AVIC is re-enabled.
428062306a36Sopenharmony_ci		 */
428162306a36Sopenharmony_ci		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
428262306a36Sopenharmony_ci		    !kvm_apicv_activated(vcpu->kvm))
428362306a36Sopenharmony_ci			return RET_PF_EMULATE;
428462306a36Sopenharmony_ci	}
428562306a36Sopenharmony_ci
428662306a36Sopenharmony_ci	async = false;
428762306a36Sopenharmony_ci	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
428862306a36Sopenharmony_ci					  fault->write, &fault->map_writable,
428962306a36Sopenharmony_ci					  &fault->hva);
429062306a36Sopenharmony_ci	if (!async)
429162306a36Sopenharmony_ci		return RET_PF_CONTINUE; /* *pfn has correct page already */
429262306a36Sopenharmony_ci
429362306a36Sopenharmony_ci	if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
429462306a36Sopenharmony_ci		trace_kvm_try_async_get_page(fault->addr, fault->gfn);
429562306a36Sopenharmony_ci		if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
429662306a36Sopenharmony_ci			trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
429762306a36Sopenharmony_ci			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
429862306a36Sopenharmony_ci			return RET_PF_RETRY;
429962306a36Sopenharmony_ci		} else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
430062306a36Sopenharmony_ci			return RET_PF_RETRY;
430162306a36Sopenharmony_ci		}
430262306a36Sopenharmony_ci	}
430362306a36Sopenharmony_ci
430462306a36Sopenharmony_ci	/*
430562306a36Sopenharmony_ci	 * Allow gup to bail on pending non-fatal signals when it's also allowed
430662306a36Sopenharmony_ci	 * to wait for IO.  Note, gup always bails if it is unable to quickly
430762306a36Sopenharmony_ci	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
430862306a36Sopenharmony_ci	 */
430962306a36Sopenharmony_ci	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
431062306a36Sopenharmony_ci					  fault->write, &fault->map_writable,
431162306a36Sopenharmony_ci					  &fault->hva);
431262306a36Sopenharmony_ci	return RET_PF_CONTINUE;
431362306a36Sopenharmony_ci}
431462306a36Sopenharmony_ci
431562306a36Sopenharmony_cistatic int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
431662306a36Sopenharmony_ci			   unsigned int access)
431762306a36Sopenharmony_ci{
431862306a36Sopenharmony_ci	int ret;
431962306a36Sopenharmony_ci
432062306a36Sopenharmony_ci	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
432162306a36Sopenharmony_ci	smp_rmb();
432262306a36Sopenharmony_ci
432362306a36Sopenharmony_ci	ret = __kvm_faultin_pfn(vcpu, fault);
432462306a36Sopenharmony_ci	if (ret != RET_PF_CONTINUE)
432562306a36Sopenharmony_ci		return ret;
432662306a36Sopenharmony_ci
432762306a36Sopenharmony_ci	if (unlikely(is_error_pfn(fault->pfn)))
432862306a36Sopenharmony_ci		return kvm_handle_error_pfn(vcpu, fault);
432962306a36Sopenharmony_ci
433062306a36Sopenharmony_ci	if (unlikely(!fault->slot))
433162306a36Sopenharmony_ci		return kvm_handle_noslot_fault(vcpu, fault, access);
433262306a36Sopenharmony_ci
433362306a36Sopenharmony_ci	return RET_PF_CONTINUE;
433462306a36Sopenharmony_ci}
433562306a36Sopenharmony_ci
433662306a36Sopenharmony_ci/*
433762306a36Sopenharmony_ci * Returns true if the page fault is stale and needs to be retried, i.e. if the
433862306a36Sopenharmony_ci * root was invalidated by a memslot update or a relevant mmu_notifier fired.
433962306a36Sopenharmony_ci */
434062306a36Sopenharmony_cistatic bool is_page_fault_stale(struct kvm_vcpu *vcpu,
434162306a36Sopenharmony_ci				struct kvm_page_fault *fault)
434262306a36Sopenharmony_ci{
434362306a36Sopenharmony_ci	struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
434462306a36Sopenharmony_ci
434562306a36Sopenharmony_ci	/* Special roots, e.g. pae_root, are not backed by shadow pages. */
434662306a36Sopenharmony_ci	if (sp && is_obsolete_sp(vcpu->kvm, sp))
434762306a36Sopenharmony_ci		return true;
434862306a36Sopenharmony_ci
434962306a36Sopenharmony_ci	/*
435062306a36Sopenharmony_ci	 * Roots without an associated shadow page are considered invalid if
435162306a36Sopenharmony_ci	 * there is a pending request to free obsolete roots.  The request is
435262306a36Sopenharmony_ci	 * only a hint that the current root _may_ be obsolete and needs to be
435362306a36Sopenharmony_ci	 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
435462306a36Sopenharmony_ci	 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
435562306a36Sopenharmony_ci	 * to reload even if no vCPU is actively using the root.
435662306a36Sopenharmony_ci	 */
435762306a36Sopenharmony_ci	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
435862306a36Sopenharmony_ci		return true;
435962306a36Sopenharmony_ci
436062306a36Sopenharmony_ci	return fault->slot &&
436162306a36Sopenharmony_ci	       mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
436262306a36Sopenharmony_ci}
436362306a36Sopenharmony_ci
436462306a36Sopenharmony_cistatic int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
436562306a36Sopenharmony_ci{
436662306a36Sopenharmony_ci	int r;
436762306a36Sopenharmony_ci
436862306a36Sopenharmony_ci	/* Dummy roots are used only for shadowing bad guest roots. */
436962306a36Sopenharmony_ci	if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
437062306a36Sopenharmony_ci		return RET_PF_RETRY;
437162306a36Sopenharmony_ci
437262306a36Sopenharmony_ci	if (page_fault_handle_page_track(vcpu, fault))
437362306a36Sopenharmony_ci		return RET_PF_EMULATE;
437462306a36Sopenharmony_ci
437562306a36Sopenharmony_ci	r = fast_page_fault(vcpu, fault);
437662306a36Sopenharmony_ci	if (r != RET_PF_INVALID)
437762306a36Sopenharmony_ci		return r;
437862306a36Sopenharmony_ci
437962306a36Sopenharmony_ci	r = mmu_topup_memory_caches(vcpu, false);
438062306a36Sopenharmony_ci	if (r)
438162306a36Sopenharmony_ci		return r;
438262306a36Sopenharmony_ci
438362306a36Sopenharmony_ci	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
438462306a36Sopenharmony_ci	if (r != RET_PF_CONTINUE)
438562306a36Sopenharmony_ci		return r;
438662306a36Sopenharmony_ci
438762306a36Sopenharmony_ci	r = RET_PF_RETRY;
438862306a36Sopenharmony_ci	write_lock(&vcpu->kvm->mmu_lock);
438962306a36Sopenharmony_ci
439062306a36Sopenharmony_ci	if (is_page_fault_stale(vcpu, fault))
439162306a36Sopenharmony_ci		goto out_unlock;
439262306a36Sopenharmony_ci
439362306a36Sopenharmony_ci	r = make_mmu_pages_available(vcpu);
439462306a36Sopenharmony_ci	if (r)
439562306a36Sopenharmony_ci		goto out_unlock;
439662306a36Sopenharmony_ci
439762306a36Sopenharmony_ci	r = direct_map(vcpu, fault);
439862306a36Sopenharmony_ci
439962306a36Sopenharmony_ciout_unlock:
440062306a36Sopenharmony_ci	write_unlock(&vcpu->kvm->mmu_lock);
440162306a36Sopenharmony_ci	kvm_release_pfn_clean(fault->pfn);
440262306a36Sopenharmony_ci	return r;
440362306a36Sopenharmony_ci}
440462306a36Sopenharmony_ci
440562306a36Sopenharmony_cistatic int nonpaging_page_fault(struct kvm_vcpu *vcpu,
440662306a36Sopenharmony_ci				struct kvm_page_fault *fault)
440762306a36Sopenharmony_ci{
440862306a36Sopenharmony_ci	/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
440962306a36Sopenharmony_ci	fault->max_level = PG_LEVEL_2M;
441062306a36Sopenharmony_ci	return direct_page_fault(vcpu, fault);
441162306a36Sopenharmony_ci}
441262306a36Sopenharmony_ci
441362306a36Sopenharmony_ciint kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
441462306a36Sopenharmony_ci				u64 fault_address, char *insn, int insn_len)
441562306a36Sopenharmony_ci{
441662306a36Sopenharmony_ci	int r = 1;
441762306a36Sopenharmony_ci	u32 flags = vcpu->arch.apf.host_apf_flags;
441862306a36Sopenharmony_ci
441962306a36Sopenharmony_ci#ifndef CONFIG_X86_64
442062306a36Sopenharmony_ci	/* A 64-bit CR2 should be impossible on 32-bit KVM. */
442162306a36Sopenharmony_ci	if (WARN_ON_ONCE(fault_address >> 32))
442262306a36Sopenharmony_ci		return -EFAULT;
442362306a36Sopenharmony_ci#endif
442462306a36Sopenharmony_ci
442562306a36Sopenharmony_ci	vcpu->arch.l1tf_flush_l1d = true;
442662306a36Sopenharmony_ci	if (!flags) {
442762306a36Sopenharmony_ci		trace_kvm_page_fault(vcpu, fault_address, error_code);
442862306a36Sopenharmony_ci
442962306a36Sopenharmony_ci		if (kvm_event_needs_reinjection(vcpu))
443062306a36Sopenharmony_ci			kvm_mmu_unprotect_page_virt(vcpu, fault_address);
443162306a36Sopenharmony_ci		r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
443262306a36Sopenharmony_ci				insn_len);
443362306a36Sopenharmony_ci	} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
443462306a36Sopenharmony_ci		vcpu->arch.apf.host_apf_flags = 0;
443562306a36Sopenharmony_ci		local_irq_disable();
443662306a36Sopenharmony_ci		kvm_async_pf_task_wait_schedule(fault_address);
443762306a36Sopenharmony_ci		local_irq_enable();
443862306a36Sopenharmony_ci	} else {
443962306a36Sopenharmony_ci		WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
444062306a36Sopenharmony_ci	}
444162306a36Sopenharmony_ci
444262306a36Sopenharmony_ci	return r;
444362306a36Sopenharmony_ci}
444462306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_handle_page_fault);
444562306a36Sopenharmony_ci
444662306a36Sopenharmony_ci#ifdef CONFIG_X86_64
444762306a36Sopenharmony_cistatic int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
444862306a36Sopenharmony_ci				  struct kvm_page_fault *fault)
444962306a36Sopenharmony_ci{
445062306a36Sopenharmony_ci	int r;
445162306a36Sopenharmony_ci
445262306a36Sopenharmony_ci	if (page_fault_handle_page_track(vcpu, fault))
445362306a36Sopenharmony_ci		return RET_PF_EMULATE;
445462306a36Sopenharmony_ci
445562306a36Sopenharmony_ci	r = fast_page_fault(vcpu, fault);
445662306a36Sopenharmony_ci	if (r != RET_PF_INVALID)
445762306a36Sopenharmony_ci		return r;
445862306a36Sopenharmony_ci
445962306a36Sopenharmony_ci	r = mmu_topup_memory_caches(vcpu, false);
446062306a36Sopenharmony_ci	if (r)
446162306a36Sopenharmony_ci		return r;
446262306a36Sopenharmony_ci
446362306a36Sopenharmony_ci	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
446462306a36Sopenharmony_ci	if (r != RET_PF_CONTINUE)
446562306a36Sopenharmony_ci		return r;
446662306a36Sopenharmony_ci
446762306a36Sopenharmony_ci	r = RET_PF_RETRY;
446862306a36Sopenharmony_ci	read_lock(&vcpu->kvm->mmu_lock);
446962306a36Sopenharmony_ci
447062306a36Sopenharmony_ci	if (is_page_fault_stale(vcpu, fault))
447162306a36Sopenharmony_ci		goto out_unlock;
447262306a36Sopenharmony_ci
447362306a36Sopenharmony_ci	r = kvm_tdp_mmu_map(vcpu, fault);
447462306a36Sopenharmony_ci
447562306a36Sopenharmony_ciout_unlock:
447662306a36Sopenharmony_ci	read_unlock(&vcpu->kvm->mmu_lock);
447762306a36Sopenharmony_ci	kvm_release_pfn_clean(fault->pfn);
447862306a36Sopenharmony_ci	return r;
447962306a36Sopenharmony_ci}
448062306a36Sopenharmony_ci#endif
448162306a36Sopenharmony_ci
448262306a36Sopenharmony_ciint kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
448362306a36Sopenharmony_ci{
448462306a36Sopenharmony_ci	/*
448562306a36Sopenharmony_ci	 * If the guest's MTRRs may be used to compute the "real" memtype,
448662306a36Sopenharmony_ci	 * restrict the mapping level to ensure KVM uses a consistent memtype
448762306a36Sopenharmony_ci	 * across the entire mapping.  If the host MTRRs are ignored by TDP
448862306a36Sopenharmony_ci	 * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
448962306a36Sopenharmony_ci	 * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
449062306a36Sopenharmony_ci	 * from the guest's MTRRs so that guest accesses to memory that is
449162306a36Sopenharmony_ci	 * DMA'd aren't cached against the guest's wishes.
449262306a36Sopenharmony_ci	 *
449362306a36Sopenharmony_ci	 * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
449462306a36Sopenharmony_ci	 * e.g. KVM will force UC memtype for host MMIO.
449562306a36Sopenharmony_ci	 */
449662306a36Sopenharmony_ci	if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
449762306a36Sopenharmony_ci		for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
449862306a36Sopenharmony_ci			int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
449962306a36Sopenharmony_ci			gfn_t base = gfn_round_for_level(fault->gfn,
450062306a36Sopenharmony_ci							 fault->max_level);
450162306a36Sopenharmony_ci
450262306a36Sopenharmony_ci			if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
450362306a36Sopenharmony_ci				break;
450462306a36Sopenharmony_ci		}
450562306a36Sopenharmony_ci	}
450662306a36Sopenharmony_ci
450762306a36Sopenharmony_ci#ifdef CONFIG_X86_64
450862306a36Sopenharmony_ci	if (tdp_mmu_enabled)
450962306a36Sopenharmony_ci		return kvm_tdp_mmu_page_fault(vcpu, fault);
451062306a36Sopenharmony_ci#endif
451162306a36Sopenharmony_ci
451262306a36Sopenharmony_ci	return direct_page_fault(vcpu, fault);
451362306a36Sopenharmony_ci}
451462306a36Sopenharmony_ci
451562306a36Sopenharmony_cistatic void nonpaging_init_context(struct kvm_mmu *context)
451662306a36Sopenharmony_ci{
451762306a36Sopenharmony_ci	context->page_fault = nonpaging_page_fault;
451862306a36Sopenharmony_ci	context->gva_to_gpa = nonpaging_gva_to_gpa;
451962306a36Sopenharmony_ci	context->sync_spte = NULL;
452062306a36Sopenharmony_ci}
452162306a36Sopenharmony_ci
452262306a36Sopenharmony_cistatic inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
452362306a36Sopenharmony_ci				  union kvm_mmu_page_role role)
452462306a36Sopenharmony_ci{
452562306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
452662306a36Sopenharmony_ci
452762306a36Sopenharmony_ci	if (!VALID_PAGE(root->hpa))
452862306a36Sopenharmony_ci		return false;
452962306a36Sopenharmony_ci
453062306a36Sopenharmony_ci	if (!role.direct && pgd != root->pgd)
453162306a36Sopenharmony_ci		return false;
453262306a36Sopenharmony_ci
453362306a36Sopenharmony_ci	sp = root_to_sp(root->hpa);
453462306a36Sopenharmony_ci	if (WARN_ON_ONCE(!sp))
453562306a36Sopenharmony_ci		return false;
453662306a36Sopenharmony_ci
453762306a36Sopenharmony_ci	return role.word == sp->role.word;
453862306a36Sopenharmony_ci}
453962306a36Sopenharmony_ci
454062306a36Sopenharmony_ci/*
454162306a36Sopenharmony_ci * Find out if a previously cached root matching the new pgd/role is available,
454262306a36Sopenharmony_ci * and insert the current root as the MRU in the cache.
454362306a36Sopenharmony_ci * If a matching root is found, it is assigned to kvm_mmu->root and
454462306a36Sopenharmony_ci * true is returned.
454562306a36Sopenharmony_ci * If no match is found, kvm_mmu->root is left invalid, the LRU root is
454662306a36Sopenharmony_ci * evicted to make room for the current root, and false is returned.
454762306a36Sopenharmony_ci */
454862306a36Sopenharmony_cistatic bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
454962306a36Sopenharmony_ci					      gpa_t new_pgd,
455062306a36Sopenharmony_ci					      union kvm_mmu_page_role new_role)
455162306a36Sopenharmony_ci{
455262306a36Sopenharmony_ci	uint i;
455362306a36Sopenharmony_ci
455462306a36Sopenharmony_ci	if (is_root_usable(&mmu->root, new_pgd, new_role))
455562306a36Sopenharmony_ci		return true;
455662306a36Sopenharmony_ci
455762306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
455862306a36Sopenharmony_ci		/*
455962306a36Sopenharmony_ci		 * The swaps end up rotating the cache like this:
456062306a36Sopenharmony_ci		 *   C   0 1 2 3   (on entry to the function)
456162306a36Sopenharmony_ci		 *   0   C 1 2 3
456262306a36Sopenharmony_ci		 *   1   C 0 2 3
456362306a36Sopenharmony_ci		 *   2   C 0 1 3
456462306a36Sopenharmony_ci		 *   3   C 0 1 2   (on exit from the loop)
456562306a36Sopenharmony_ci		 */
456662306a36Sopenharmony_ci		swap(mmu->root, mmu->prev_roots[i]);
456762306a36Sopenharmony_ci		if (is_root_usable(&mmu->root, new_pgd, new_role))
456862306a36Sopenharmony_ci			return true;
456962306a36Sopenharmony_ci	}
457062306a36Sopenharmony_ci
457162306a36Sopenharmony_ci	kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
457262306a36Sopenharmony_ci	return false;
457362306a36Sopenharmony_ci}
457462306a36Sopenharmony_ci
457562306a36Sopenharmony_ci/*
457662306a36Sopenharmony_ci * Find out if a previously cached root matching the new pgd/role is available.
457762306a36Sopenharmony_ci * On entry, mmu->root is invalid.
457862306a36Sopenharmony_ci * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
457962306a36Sopenharmony_ci * of the cache becomes invalid, and true is returned.
458062306a36Sopenharmony_ci * If no match is found, kvm_mmu->root is left invalid and false is returned.
458162306a36Sopenharmony_ci */
458262306a36Sopenharmony_cistatic bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
458362306a36Sopenharmony_ci					     gpa_t new_pgd,
458462306a36Sopenharmony_ci					     union kvm_mmu_page_role new_role)
458562306a36Sopenharmony_ci{
458662306a36Sopenharmony_ci	uint i;
458762306a36Sopenharmony_ci
458862306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
458962306a36Sopenharmony_ci		if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
459062306a36Sopenharmony_ci			goto hit;
459162306a36Sopenharmony_ci
459262306a36Sopenharmony_ci	return false;
459362306a36Sopenharmony_ci
459462306a36Sopenharmony_cihit:
459562306a36Sopenharmony_ci	swap(mmu->root, mmu->prev_roots[i]);
459662306a36Sopenharmony_ci	/* Bubble up the remaining roots.  */
459762306a36Sopenharmony_ci	for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
459862306a36Sopenharmony_ci		mmu->prev_roots[i] = mmu->prev_roots[i + 1];
459962306a36Sopenharmony_ci	mmu->prev_roots[i].hpa = INVALID_PAGE;
460062306a36Sopenharmony_ci	return true;
460162306a36Sopenharmony_ci}
460262306a36Sopenharmony_ci
460362306a36Sopenharmony_cistatic bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
460462306a36Sopenharmony_ci			    gpa_t new_pgd, union kvm_mmu_page_role new_role)
460562306a36Sopenharmony_ci{
460662306a36Sopenharmony_ci	/*
460762306a36Sopenharmony_ci	 * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
460862306a36Sopenharmony_ci	 * avoid having to deal with PDPTEs and other complexities.
460962306a36Sopenharmony_ci	 */
461062306a36Sopenharmony_ci	if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
461162306a36Sopenharmony_ci		kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
461262306a36Sopenharmony_ci
461362306a36Sopenharmony_ci	if (VALID_PAGE(mmu->root.hpa))
461462306a36Sopenharmony_ci		return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
461562306a36Sopenharmony_ci	else
461662306a36Sopenharmony_ci		return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
461762306a36Sopenharmony_ci}
461862306a36Sopenharmony_ci
461962306a36Sopenharmony_civoid kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
462062306a36Sopenharmony_ci{
462162306a36Sopenharmony_ci	struct kvm_mmu *mmu = vcpu->arch.mmu;
462262306a36Sopenharmony_ci	union kvm_mmu_page_role new_role = mmu->root_role;
462362306a36Sopenharmony_ci
462462306a36Sopenharmony_ci	/*
462562306a36Sopenharmony_ci	 * Return immediately if no usable root was found, kvm_mmu_reload()
462662306a36Sopenharmony_ci	 * will establish a valid root prior to the next VM-Enter.
462762306a36Sopenharmony_ci	 */
462862306a36Sopenharmony_ci	if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
462962306a36Sopenharmony_ci		return;
463062306a36Sopenharmony_ci
463162306a36Sopenharmony_ci	/*
463262306a36Sopenharmony_ci	 * It's possible that the cached previous root page is obsolete because
463362306a36Sopenharmony_ci	 * of a change in the MMU generation number. However, changing the
463462306a36Sopenharmony_ci	 * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
463562306a36Sopenharmony_ci	 * which will free the root set here and allocate a new one.
463662306a36Sopenharmony_ci	 */
463762306a36Sopenharmony_ci	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
463862306a36Sopenharmony_ci
463962306a36Sopenharmony_ci	if (force_flush_and_sync_on_reuse) {
464062306a36Sopenharmony_ci		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
464162306a36Sopenharmony_ci		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
464262306a36Sopenharmony_ci	}
464362306a36Sopenharmony_ci
464462306a36Sopenharmony_ci	/*
464562306a36Sopenharmony_ci	 * The last MMIO access's GVA and GPA are cached in the VCPU. When
464662306a36Sopenharmony_ci	 * switching to a new CR3, that GVA->GPA mapping may no longer be
464762306a36Sopenharmony_ci	 * valid. So clear any cached MMIO info even when we don't need to sync
464862306a36Sopenharmony_ci	 * the shadow page tables.
464962306a36Sopenharmony_ci	 */
465062306a36Sopenharmony_ci	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
465162306a36Sopenharmony_ci
465262306a36Sopenharmony_ci	/*
465362306a36Sopenharmony_ci	 * If this is a direct root page, it doesn't have a write flooding
465462306a36Sopenharmony_ci	 * count. Otherwise, clear the write flooding count.
465562306a36Sopenharmony_ci	 */
465662306a36Sopenharmony_ci	if (!new_role.direct) {
465762306a36Sopenharmony_ci		struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
465862306a36Sopenharmony_ci
465962306a36Sopenharmony_ci		if (!WARN_ON_ONCE(!sp))
466062306a36Sopenharmony_ci			__clear_sp_write_flooding_count(sp);
466162306a36Sopenharmony_ci	}
466262306a36Sopenharmony_ci}
466362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
466462306a36Sopenharmony_ci
466562306a36Sopenharmony_cistatic bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
466662306a36Sopenharmony_ci			   unsigned int access)
466762306a36Sopenharmony_ci{
466862306a36Sopenharmony_ci	if (unlikely(is_mmio_spte(*sptep))) {
466962306a36Sopenharmony_ci		if (gfn != get_mmio_spte_gfn(*sptep)) {
467062306a36Sopenharmony_ci			mmu_spte_clear_no_track(sptep);
467162306a36Sopenharmony_ci			return true;
467262306a36Sopenharmony_ci		}
467362306a36Sopenharmony_ci
467462306a36Sopenharmony_ci		mark_mmio_spte(vcpu, sptep, gfn, access);
467562306a36Sopenharmony_ci		return true;
467662306a36Sopenharmony_ci	}
467762306a36Sopenharmony_ci
467862306a36Sopenharmony_ci	return false;
467962306a36Sopenharmony_ci}
468062306a36Sopenharmony_ci
468162306a36Sopenharmony_ci#define PTTYPE_EPT 18 /* arbitrary */
468262306a36Sopenharmony_ci#define PTTYPE PTTYPE_EPT
468362306a36Sopenharmony_ci#include "paging_tmpl.h"
468462306a36Sopenharmony_ci#undef PTTYPE
468562306a36Sopenharmony_ci
468662306a36Sopenharmony_ci#define PTTYPE 64
468762306a36Sopenharmony_ci#include "paging_tmpl.h"
468862306a36Sopenharmony_ci#undef PTTYPE
468962306a36Sopenharmony_ci
469062306a36Sopenharmony_ci#define PTTYPE 32
469162306a36Sopenharmony_ci#include "paging_tmpl.h"
469262306a36Sopenharmony_ci#undef PTTYPE
469362306a36Sopenharmony_ci
469462306a36Sopenharmony_cistatic void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
469562306a36Sopenharmony_ci				    u64 pa_bits_rsvd, int level, bool nx,
469662306a36Sopenharmony_ci				    bool gbpages, bool pse, bool amd)
469762306a36Sopenharmony_ci{
469862306a36Sopenharmony_ci	u64 gbpages_bit_rsvd = 0;
469962306a36Sopenharmony_ci	u64 nonleaf_bit8_rsvd = 0;
470062306a36Sopenharmony_ci	u64 high_bits_rsvd;
470162306a36Sopenharmony_ci
470262306a36Sopenharmony_ci	rsvd_check->bad_mt_xwr = 0;
470362306a36Sopenharmony_ci
470462306a36Sopenharmony_ci	if (!gbpages)
470562306a36Sopenharmony_ci		gbpages_bit_rsvd = rsvd_bits(7, 7);
470662306a36Sopenharmony_ci
470762306a36Sopenharmony_ci	if (level == PT32E_ROOT_LEVEL)
470862306a36Sopenharmony_ci		high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
470962306a36Sopenharmony_ci	else
471062306a36Sopenharmony_ci		high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
471162306a36Sopenharmony_ci
471262306a36Sopenharmony_ci	/* Note, NX doesn't exist in PDPTEs, this is handled below. */
471362306a36Sopenharmony_ci	if (!nx)
471462306a36Sopenharmony_ci		high_bits_rsvd |= rsvd_bits(63, 63);
471562306a36Sopenharmony_ci
471662306a36Sopenharmony_ci	/*
471762306a36Sopenharmony_ci	 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
471862306a36Sopenharmony_ci	 * leaf entries) on AMD CPUs only.
471962306a36Sopenharmony_ci	 */
472062306a36Sopenharmony_ci	if (amd)
472162306a36Sopenharmony_ci		nonleaf_bit8_rsvd = rsvd_bits(8, 8);
472262306a36Sopenharmony_ci
472362306a36Sopenharmony_ci	switch (level) {
472462306a36Sopenharmony_ci	case PT32_ROOT_LEVEL:
472562306a36Sopenharmony_ci		/* no rsvd bits for 2 level 4K page table entries */
472662306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][1] = 0;
472762306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][0] = 0;
472862306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[1][0] =
472962306a36Sopenharmony_ci			rsvd_check->rsvd_bits_mask[0][0];
473062306a36Sopenharmony_ci
473162306a36Sopenharmony_ci		if (!pse) {
473262306a36Sopenharmony_ci			rsvd_check->rsvd_bits_mask[1][1] = 0;
473362306a36Sopenharmony_ci			break;
473462306a36Sopenharmony_ci		}
473562306a36Sopenharmony_ci
473662306a36Sopenharmony_ci		if (is_cpuid_PSE36())
473762306a36Sopenharmony_ci			/* 36bits PSE 4MB page */
473862306a36Sopenharmony_ci			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
473962306a36Sopenharmony_ci		else
474062306a36Sopenharmony_ci			/* 32 bits PSE 4MB page */
474162306a36Sopenharmony_ci			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
474262306a36Sopenharmony_ci		break;
474362306a36Sopenharmony_ci	case PT32E_ROOT_LEVEL:
474462306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
474562306a36Sopenharmony_ci						   high_bits_rsvd |
474662306a36Sopenharmony_ci						   rsvd_bits(5, 8) |
474762306a36Sopenharmony_ci						   rsvd_bits(1, 2);	/* PDPTE */
474862306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;	/* PDE */
474962306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;	/* PTE */
475062306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
475162306a36Sopenharmony_ci						   rsvd_bits(13, 20);	/* large page */
475262306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[1][0] =
475362306a36Sopenharmony_ci			rsvd_check->rsvd_bits_mask[0][0];
475462306a36Sopenharmony_ci		break;
475562306a36Sopenharmony_ci	case PT64_ROOT_5LEVEL:
475662306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
475762306a36Sopenharmony_ci						   nonleaf_bit8_rsvd |
475862306a36Sopenharmony_ci						   rsvd_bits(7, 7);
475962306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[1][4] =
476062306a36Sopenharmony_ci			rsvd_check->rsvd_bits_mask[0][4];
476162306a36Sopenharmony_ci		fallthrough;
476262306a36Sopenharmony_ci	case PT64_ROOT_4LEVEL:
476362306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
476462306a36Sopenharmony_ci						   nonleaf_bit8_rsvd |
476562306a36Sopenharmony_ci						   rsvd_bits(7, 7);
476662306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
476762306a36Sopenharmony_ci						   gbpages_bit_rsvd;
476862306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
476962306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
477062306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[1][3] =
477162306a36Sopenharmony_ci			rsvd_check->rsvd_bits_mask[0][3];
477262306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
477362306a36Sopenharmony_ci						   gbpages_bit_rsvd |
477462306a36Sopenharmony_ci						   rsvd_bits(13, 29);
477562306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
477662306a36Sopenharmony_ci						   rsvd_bits(13, 20); /* large page */
477762306a36Sopenharmony_ci		rsvd_check->rsvd_bits_mask[1][0] =
477862306a36Sopenharmony_ci			rsvd_check->rsvd_bits_mask[0][0];
477962306a36Sopenharmony_ci		break;
478062306a36Sopenharmony_ci	}
478162306a36Sopenharmony_ci}
478262306a36Sopenharmony_ci
478362306a36Sopenharmony_cistatic void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
478462306a36Sopenharmony_ci					struct kvm_mmu *context)
478562306a36Sopenharmony_ci{
478662306a36Sopenharmony_ci	__reset_rsvds_bits_mask(&context->guest_rsvd_check,
478762306a36Sopenharmony_ci				vcpu->arch.reserved_gpa_bits,
478862306a36Sopenharmony_ci				context->cpu_role.base.level, is_efer_nx(context),
478962306a36Sopenharmony_ci				guest_can_use(vcpu, X86_FEATURE_GBPAGES),
479062306a36Sopenharmony_ci				is_cr4_pse(context),
479162306a36Sopenharmony_ci				guest_cpuid_is_amd_or_hygon(vcpu));
479262306a36Sopenharmony_ci}
479362306a36Sopenharmony_ci
479462306a36Sopenharmony_cistatic void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
479562306a36Sopenharmony_ci					u64 pa_bits_rsvd, bool execonly,
479662306a36Sopenharmony_ci					int huge_page_level)
479762306a36Sopenharmony_ci{
479862306a36Sopenharmony_ci	u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
479962306a36Sopenharmony_ci	u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
480062306a36Sopenharmony_ci	u64 bad_mt_xwr;
480162306a36Sopenharmony_ci
480262306a36Sopenharmony_ci	if (huge_page_level < PG_LEVEL_1G)
480362306a36Sopenharmony_ci		large_1g_rsvd = rsvd_bits(7, 7);
480462306a36Sopenharmony_ci	if (huge_page_level < PG_LEVEL_2M)
480562306a36Sopenharmony_ci		large_2m_rsvd = rsvd_bits(7, 7);
480662306a36Sopenharmony_ci
480762306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
480862306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
480962306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
481062306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
481162306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
481262306a36Sopenharmony_ci
481362306a36Sopenharmony_ci	/* large page */
481462306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
481562306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
481662306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
481762306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
481862306a36Sopenharmony_ci	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
481962306a36Sopenharmony_ci
482062306a36Sopenharmony_ci	bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
482162306a36Sopenharmony_ci	bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
482262306a36Sopenharmony_ci	bad_mt_xwr |= 0xFFull << (7 * 8);	/* bits 3..5 must not be 7 */
482362306a36Sopenharmony_ci	bad_mt_xwr |= REPEAT_BYTE(1ull << 2);	/* bits 0..2 must not be 010 */
482462306a36Sopenharmony_ci	bad_mt_xwr |= REPEAT_BYTE(1ull << 6);	/* bits 0..2 must not be 110 */
482562306a36Sopenharmony_ci	if (!execonly) {
482662306a36Sopenharmony_ci		/* bits 0..2 must not be 100 unless VMX capabilities allow it */
482762306a36Sopenharmony_ci		bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
482862306a36Sopenharmony_ci	}
482962306a36Sopenharmony_ci	rsvd_check->bad_mt_xwr = bad_mt_xwr;
483062306a36Sopenharmony_ci}
483162306a36Sopenharmony_ci
483262306a36Sopenharmony_cistatic void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
483362306a36Sopenharmony_ci		struct kvm_mmu *context, bool execonly, int huge_page_level)
483462306a36Sopenharmony_ci{
483562306a36Sopenharmony_ci	__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
483662306a36Sopenharmony_ci				    vcpu->arch.reserved_gpa_bits, execonly,
483762306a36Sopenharmony_ci				    huge_page_level);
483862306a36Sopenharmony_ci}
483962306a36Sopenharmony_ci
484062306a36Sopenharmony_cistatic inline u64 reserved_hpa_bits(void)
484162306a36Sopenharmony_ci{
484262306a36Sopenharmony_ci	return rsvd_bits(shadow_phys_bits, 63);
484362306a36Sopenharmony_ci}
484462306a36Sopenharmony_ci
484562306a36Sopenharmony_ci/*
484662306a36Sopenharmony_ci * the page table on host is the shadow page table for the page
484762306a36Sopenharmony_ci * table in guest or amd nested guest, its mmu features completely
484862306a36Sopenharmony_ci * follow the features in guest.
484962306a36Sopenharmony_ci */
485062306a36Sopenharmony_cistatic void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
485162306a36Sopenharmony_ci					struct kvm_mmu *context)
485262306a36Sopenharmony_ci{
485362306a36Sopenharmony_ci	/* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
485462306a36Sopenharmony_ci	bool is_amd = true;
485562306a36Sopenharmony_ci	/* KVM doesn't use 2-level page tables for the shadow MMU. */
485662306a36Sopenharmony_ci	bool is_pse = false;
485762306a36Sopenharmony_ci	struct rsvd_bits_validate *shadow_zero_check;
485862306a36Sopenharmony_ci	int i;
485962306a36Sopenharmony_ci
486062306a36Sopenharmony_ci	WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
486162306a36Sopenharmony_ci
486262306a36Sopenharmony_ci	shadow_zero_check = &context->shadow_zero_check;
486362306a36Sopenharmony_ci	__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
486462306a36Sopenharmony_ci				context->root_role.level,
486562306a36Sopenharmony_ci				context->root_role.efer_nx,
486662306a36Sopenharmony_ci				guest_can_use(vcpu, X86_FEATURE_GBPAGES),
486762306a36Sopenharmony_ci				is_pse, is_amd);
486862306a36Sopenharmony_ci
486962306a36Sopenharmony_ci	if (!shadow_me_mask)
487062306a36Sopenharmony_ci		return;
487162306a36Sopenharmony_ci
487262306a36Sopenharmony_ci	for (i = context->root_role.level; --i >= 0;) {
487362306a36Sopenharmony_ci		/*
487462306a36Sopenharmony_ci		 * So far shadow_me_value is a constant during KVM's life
487562306a36Sopenharmony_ci		 * time.  Bits in shadow_me_value are allowed to be set.
487662306a36Sopenharmony_ci		 * Bits in shadow_me_mask but not in shadow_me_value are
487762306a36Sopenharmony_ci		 * not allowed to be set.
487862306a36Sopenharmony_ci		 */
487962306a36Sopenharmony_ci		shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
488062306a36Sopenharmony_ci		shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
488162306a36Sopenharmony_ci		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
488262306a36Sopenharmony_ci		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
488362306a36Sopenharmony_ci	}
488462306a36Sopenharmony_ci
488562306a36Sopenharmony_ci}
488662306a36Sopenharmony_ci
488762306a36Sopenharmony_cistatic inline bool boot_cpu_is_amd(void)
488862306a36Sopenharmony_ci{
488962306a36Sopenharmony_ci	WARN_ON_ONCE(!tdp_enabled);
489062306a36Sopenharmony_ci	return shadow_x_mask == 0;
489162306a36Sopenharmony_ci}
489262306a36Sopenharmony_ci
489362306a36Sopenharmony_ci/*
489462306a36Sopenharmony_ci * the direct page table on host, use as much mmu features as
489562306a36Sopenharmony_ci * possible, however, kvm currently does not do execution-protection.
489662306a36Sopenharmony_ci */
489762306a36Sopenharmony_cistatic void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
489862306a36Sopenharmony_ci{
489962306a36Sopenharmony_ci	struct rsvd_bits_validate *shadow_zero_check;
490062306a36Sopenharmony_ci	int i;
490162306a36Sopenharmony_ci
490262306a36Sopenharmony_ci	shadow_zero_check = &context->shadow_zero_check;
490362306a36Sopenharmony_ci
490462306a36Sopenharmony_ci	if (boot_cpu_is_amd())
490562306a36Sopenharmony_ci		__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
490662306a36Sopenharmony_ci					context->root_role.level, true,
490762306a36Sopenharmony_ci					boot_cpu_has(X86_FEATURE_GBPAGES),
490862306a36Sopenharmony_ci					false, true);
490962306a36Sopenharmony_ci	else
491062306a36Sopenharmony_ci		__reset_rsvds_bits_mask_ept(shadow_zero_check,
491162306a36Sopenharmony_ci					    reserved_hpa_bits(), false,
491262306a36Sopenharmony_ci					    max_huge_page_level);
491362306a36Sopenharmony_ci
491462306a36Sopenharmony_ci	if (!shadow_me_mask)
491562306a36Sopenharmony_ci		return;
491662306a36Sopenharmony_ci
491762306a36Sopenharmony_ci	for (i = context->root_role.level; --i >= 0;) {
491862306a36Sopenharmony_ci		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
491962306a36Sopenharmony_ci		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
492062306a36Sopenharmony_ci	}
492162306a36Sopenharmony_ci}
492262306a36Sopenharmony_ci
492362306a36Sopenharmony_ci/*
492462306a36Sopenharmony_ci * as the comments in reset_shadow_zero_bits_mask() except it
492562306a36Sopenharmony_ci * is the shadow page table for intel nested guest.
492662306a36Sopenharmony_ci */
492762306a36Sopenharmony_cistatic void
492862306a36Sopenharmony_cireset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
492962306a36Sopenharmony_ci{
493062306a36Sopenharmony_ci	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
493162306a36Sopenharmony_ci				    reserved_hpa_bits(), execonly,
493262306a36Sopenharmony_ci				    max_huge_page_level);
493362306a36Sopenharmony_ci}
493462306a36Sopenharmony_ci
493562306a36Sopenharmony_ci#define BYTE_MASK(access) \
493662306a36Sopenharmony_ci	((1 & (access) ? 2 : 0) | \
493762306a36Sopenharmony_ci	 (2 & (access) ? 4 : 0) | \
493862306a36Sopenharmony_ci	 (3 & (access) ? 8 : 0) | \
493962306a36Sopenharmony_ci	 (4 & (access) ? 16 : 0) | \
494062306a36Sopenharmony_ci	 (5 & (access) ? 32 : 0) | \
494162306a36Sopenharmony_ci	 (6 & (access) ? 64 : 0) | \
494262306a36Sopenharmony_ci	 (7 & (access) ? 128 : 0))
494362306a36Sopenharmony_ci
494462306a36Sopenharmony_ci
494562306a36Sopenharmony_cistatic void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
494662306a36Sopenharmony_ci{
494762306a36Sopenharmony_ci	unsigned byte;
494862306a36Sopenharmony_ci
494962306a36Sopenharmony_ci	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
495062306a36Sopenharmony_ci	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
495162306a36Sopenharmony_ci	const u8 u = BYTE_MASK(ACC_USER_MASK);
495262306a36Sopenharmony_ci
495362306a36Sopenharmony_ci	bool cr4_smep = is_cr4_smep(mmu);
495462306a36Sopenharmony_ci	bool cr4_smap = is_cr4_smap(mmu);
495562306a36Sopenharmony_ci	bool cr0_wp = is_cr0_wp(mmu);
495662306a36Sopenharmony_ci	bool efer_nx = is_efer_nx(mmu);
495762306a36Sopenharmony_ci
495862306a36Sopenharmony_ci	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
495962306a36Sopenharmony_ci		unsigned pfec = byte << 1;
496062306a36Sopenharmony_ci
496162306a36Sopenharmony_ci		/*
496262306a36Sopenharmony_ci		 * Each "*f" variable has a 1 bit for each UWX value
496362306a36Sopenharmony_ci		 * that causes a fault with the given PFEC.
496462306a36Sopenharmony_ci		 */
496562306a36Sopenharmony_ci
496662306a36Sopenharmony_ci		/* Faults from writes to non-writable pages */
496762306a36Sopenharmony_ci		u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
496862306a36Sopenharmony_ci		/* Faults from user mode accesses to supervisor pages */
496962306a36Sopenharmony_ci		u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
497062306a36Sopenharmony_ci		/* Faults from fetches of non-executable pages*/
497162306a36Sopenharmony_ci		u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
497262306a36Sopenharmony_ci		/* Faults from kernel mode fetches of user pages */
497362306a36Sopenharmony_ci		u8 smepf = 0;
497462306a36Sopenharmony_ci		/* Faults from kernel mode accesses of user pages */
497562306a36Sopenharmony_ci		u8 smapf = 0;
497662306a36Sopenharmony_ci
497762306a36Sopenharmony_ci		if (!ept) {
497862306a36Sopenharmony_ci			/* Faults from kernel mode accesses to user pages */
497962306a36Sopenharmony_ci			u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
498062306a36Sopenharmony_ci
498162306a36Sopenharmony_ci			/* Not really needed: !nx will cause pte.nx to fault */
498262306a36Sopenharmony_ci			if (!efer_nx)
498362306a36Sopenharmony_ci				ff = 0;
498462306a36Sopenharmony_ci
498562306a36Sopenharmony_ci			/* Allow supervisor writes if !cr0.wp */
498662306a36Sopenharmony_ci			if (!cr0_wp)
498762306a36Sopenharmony_ci				wf = (pfec & PFERR_USER_MASK) ? wf : 0;
498862306a36Sopenharmony_ci
498962306a36Sopenharmony_ci			/* Disallow supervisor fetches of user code if cr4.smep */
499062306a36Sopenharmony_ci			if (cr4_smep)
499162306a36Sopenharmony_ci				smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
499262306a36Sopenharmony_ci
499362306a36Sopenharmony_ci			/*
499462306a36Sopenharmony_ci			 * SMAP:kernel-mode data accesses from user-mode
499562306a36Sopenharmony_ci			 * mappings should fault. A fault is considered
499662306a36Sopenharmony_ci			 * as a SMAP violation if all of the following
499762306a36Sopenharmony_ci			 * conditions are true:
499862306a36Sopenharmony_ci			 *   - X86_CR4_SMAP is set in CR4
499962306a36Sopenharmony_ci			 *   - A user page is accessed
500062306a36Sopenharmony_ci			 *   - The access is not a fetch
500162306a36Sopenharmony_ci			 *   - The access is supervisor mode
500262306a36Sopenharmony_ci			 *   - If implicit supervisor access or X86_EFLAGS_AC is clear
500362306a36Sopenharmony_ci			 *
500462306a36Sopenharmony_ci			 * Here, we cover the first four conditions.
500562306a36Sopenharmony_ci			 * The fifth is computed dynamically in permission_fault();
500662306a36Sopenharmony_ci			 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
500762306a36Sopenharmony_ci			 * *not* subject to SMAP restrictions.
500862306a36Sopenharmony_ci			 */
500962306a36Sopenharmony_ci			if (cr4_smap)
501062306a36Sopenharmony_ci				smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
501162306a36Sopenharmony_ci		}
501262306a36Sopenharmony_ci
501362306a36Sopenharmony_ci		mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
501462306a36Sopenharmony_ci	}
501562306a36Sopenharmony_ci}
501662306a36Sopenharmony_ci
501762306a36Sopenharmony_ci/*
501862306a36Sopenharmony_ci* PKU is an additional mechanism by which the paging controls access to
501962306a36Sopenharmony_ci* user-mode addresses based on the value in the PKRU register.  Protection
502062306a36Sopenharmony_ci* key violations are reported through a bit in the page fault error code.
502162306a36Sopenharmony_ci* Unlike other bits of the error code, the PK bit is not known at the
502262306a36Sopenharmony_ci* call site of e.g. gva_to_gpa; it must be computed directly in
502362306a36Sopenharmony_ci* permission_fault based on two bits of PKRU, on some machine state (CR4,
502462306a36Sopenharmony_ci* CR0, EFER, CPL), and on other bits of the error code and the page tables.
502562306a36Sopenharmony_ci*
502662306a36Sopenharmony_ci* In particular the following conditions come from the error code, the
502762306a36Sopenharmony_ci* page tables and the machine state:
502862306a36Sopenharmony_ci* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
502962306a36Sopenharmony_ci* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
503062306a36Sopenharmony_ci* - PK is always zero if U=0 in the page tables
503162306a36Sopenharmony_ci* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
503262306a36Sopenharmony_ci*
503362306a36Sopenharmony_ci* The PKRU bitmask caches the result of these four conditions.  The error
503462306a36Sopenharmony_ci* code (minus the P bit) and the page table's U bit form an index into the
503562306a36Sopenharmony_ci* PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
503662306a36Sopenharmony_ci* with the two bits of the PKRU register corresponding to the protection key.
503762306a36Sopenharmony_ci* For the first three conditions above the bits will be 00, thus masking
503862306a36Sopenharmony_ci* away both AD and WD.  For all reads or if the last condition holds, WD
503962306a36Sopenharmony_ci* only will be masked away.
504062306a36Sopenharmony_ci*/
504162306a36Sopenharmony_cistatic void update_pkru_bitmask(struct kvm_mmu *mmu)
504262306a36Sopenharmony_ci{
504362306a36Sopenharmony_ci	unsigned bit;
504462306a36Sopenharmony_ci	bool wp;
504562306a36Sopenharmony_ci
504662306a36Sopenharmony_ci	mmu->pkru_mask = 0;
504762306a36Sopenharmony_ci
504862306a36Sopenharmony_ci	if (!is_cr4_pke(mmu))
504962306a36Sopenharmony_ci		return;
505062306a36Sopenharmony_ci
505162306a36Sopenharmony_ci	wp = is_cr0_wp(mmu);
505262306a36Sopenharmony_ci
505362306a36Sopenharmony_ci	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
505462306a36Sopenharmony_ci		unsigned pfec, pkey_bits;
505562306a36Sopenharmony_ci		bool check_pkey, check_write, ff, uf, wf, pte_user;
505662306a36Sopenharmony_ci
505762306a36Sopenharmony_ci		pfec = bit << 1;
505862306a36Sopenharmony_ci		ff = pfec & PFERR_FETCH_MASK;
505962306a36Sopenharmony_ci		uf = pfec & PFERR_USER_MASK;
506062306a36Sopenharmony_ci		wf = pfec & PFERR_WRITE_MASK;
506162306a36Sopenharmony_ci
506262306a36Sopenharmony_ci		/* PFEC.RSVD is replaced by ACC_USER_MASK. */
506362306a36Sopenharmony_ci		pte_user = pfec & PFERR_RSVD_MASK;
506462306a36Sopenharmony_ci
506562306a36Sopenharmony_ci		/*
506662306a36Sopenharmony_ci		 * Only need to check the access which is not an
506762306a36Sopenharmony_ci		 * instruction fetch and is to a user page.
506862306a36Sopenharmony_ci		 */
506962306a36Sopenharmony_ci		check_pkey = (!ff && pte_user);
507062306a36Sopenharmony_ci		/*
507162306a36Sopenharmony_ci		 * write access is controlled by PKRU if it is a
507262306a36Sopenharmony_ci		 * user access or CR0.WP = 1.
507362306a36Sopenharmony_ci		 */
507462306a36Sopenharmony_ci		check_write = check_pkey && wf && (uf || wp);
507562306a36Sopenharmony_ci
507662306a36Sopenharmony_ci		/* PKRU.AD stops both read and write access. */
507762306a36Sopenharmony_ci		pkey_bits = !!check_pkey;
507862306a36Sopenharmony_ci		/* PKRU.WD stops write access. */
507962306a36Sopenharmony_ci		pkey_bits |= (!!check_write) << 1;
508062306a36Sopenharmony_ci
508162306a36Sopenharmony_ci		mmu->pkru_mask |= (pkey_bits & 3) << pfec;
508262306a36Sopenharmony_ci	}
508362306a36Sopenharmony_ci}
508462306a36Sopenharmony_ci
508562306a36Sopenharmony_cistatic void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
508662306a36Sopenharmony_ci					struct kvm_mmu *mmu)
508762306a36Sopenharmony_ci{
508862306a36Sopenharmony_ci	if (!is_cr0_pg(mmu))
508962306a36Sopenharmony_ci		return;
509062306a36Sopenharmony_ci
509162306a36Sopenharmony_ci	reset_guest_rsvds_bits_mask(vcpu, mmu);
509262306a36Sopenharmony_ci	update_permission_bitmask(mmu, false);
509362306a36Sopenharmony_ci	update_pkru_bitmask(mmu);
509462306a36Sopenharmony_ci}
509562306a36Sopenharmony_ci
509662306a36Sopenharmony_cistatic void paging64_init_context(struct kvm_mmu *context)
509762306a36Sopenharmony_ci{
509862306a36Sopenharmony_ci	context->page_fault = paging64_page_fault;
509962306a36Sopenharmony_ci	context->gva_to_gpa = paging64_gva_to_gpa;
510062306a36Sopenharmony_ci	context->sync_spte = paging64_sync_spte;
510162306a36Sopenharmony_ci}
510262306a36Sopenharmony_ci
510362306a36Sopenharmony_cistatic void paging32_init_context(struct kvm_mmu *context)
510462306a36Sopenharmony_ci{
510562306a36Sopenharmony_ci	context->page_fault = paging32_page_fault;
510662306a36Sopenharmony_ci	context->gva_to_gpa = paging32_gva_to_gpa;
510762306a36Sopenharmony_ci	context->sync_spte = paging32_sync_spte;
510862306a36Sopenharmony_ci}
510962306a36Sopenharmony_ci
511062306a36Sopenharmony_cistatic union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
511162306a36Sopenharmony_ci					    const struct kvm_mmu_role_regs *regs)
511262306a36Sopenharmony_ci{
511362306a36Sopenharmony_ci	union kvm_cpu_role role = {0};
511462306a36Sopenharmony_ci
511562306a36Sopenharmony_ci	role.base.access = ACC_ALL;
511662306a36Sopenharmony_ci	role.base.smm = is_smm(vcpu);
511762306a36Sopenharmony_ci	role.base.guest_mode = is_guest_mode(vcpu);
511862306a36Sopenharmony_ci	role.ext.valid = 1;
511962306a36Sopenharmony_ci
512062306a36Sopenharmony_ci	if (!____is_cr0_pg(regs)) {
512162306a36Sopenharmony_ci		role.base.direct = 1;
512262306a36Sopenharmony_ci		return role;
512362306a36Sopenharmony_ci	}
512462306a36Sopenharmony_ci
512562306a36Sopenharmony_ci	role.base.efer_nx = ____is_efer_nx(regs);
512662306a36Sopenharmony_ci	role.base.cr0_wp = ____is_cr0_wp(regs);
512762306a36Sopenharmony_ci	role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
512862306a36Sopenharmony_ci	role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
512962306a36Sopenharmony_ci	role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
513062306a36Sopenharmony_ci
513162306a36Sopenharmony_ci	if (____is_efer_lma(regs))
513262306a36Sopenharmony_ci		role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
513362306a36Sopenharmony_ci							: PT64_ROOT_4LEVEL;
513462306a36Sopenharmony_ci	else if (____is_cr4_pae(regs))
513562306a36Sopenharmony_ci		role.base.level = PT32E_ROOT_LEVEL;
513662306a36Sopenharmony_ci	else
513762306a36Sopenharmony_ci		role.base.level = PT32_ROOT_LEVEL;
513862306a36Sopenharmony_ci
513962306a36Sopenharmony_ci	role.ext.cr4_smep = ____is_cr4_smep(regs);
514062306a36Sopenharmony_ci	role.ext.cr4_smap = ____is_cr4_smap(regs);
514162306a36Sopenharmony_ci	role.ext.cr4_pse = ____is_cr4_pse(regs);
514262306a36Sopenharmony_ci
514362306a36Sopenharmony_ci	/* PKEY and LA57 are active iff long mode is active. */
514462306a36Sopenharmony_ci	role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
514562306a36Sopenharmony_ci	role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
514662306a36Sopenharmony_ci	role.ext.efer_lma = ____is_efer_lma(regs);
514762306a36Sopenharmony_ci	return role;
514862306a36Sopenharmony_ci}
514962306a36Sopenharmony_ci
515062306a36Sopenharmony_civoid __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
515162306a36Sopenharmony_ci					struct kvm_mmu *mmu)
515262306a36Sopenharmony_ci{
515362306a36Sopenharmony_ci	const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
515462306a36Sopenharmony_ci
515562306a36Sopenharmony_ci	BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
515662306a36Sopenharmony_ci	BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
515762306a36Sopenharmony_ci
515862306a36Sopenharmony_ci	if (is_cr0_wp(mmu) == cr0_wp)
515962306a36Sopenharmony_ci		return;
516062306a36Sopenharmony_ci
516162306a36Sopenharmony_ci	mmu->cpu_role.base.cr0_wp = cr0_wp;
516262306a36Sopenharmony_ci	reset_guest_paging_metadata(vcpu, mmu);
516362306a36Sopenharmony_ci}
516462306a36Sopenharmony_ci
516562306a36Sopenharmony_cistatic inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
516662306a36Sopenharmony_ci{
516762306a36Sopenharmony_ci	/* tdp_root_level is architecture forced level, use it if nonzero */
516862306a36Sopenharmony_ci	if (tdp_root_level)
516962306a36Sopenharmony_ci		return tdp_root_level;
517062306a36Sopenharmony_ci
517162306a36Sopenharmony_ci	/* Use 5-level TDP if and only if it's useful/necessary. */
517262306a36Sopenharmony_ci	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
517362306a36Sopenharmony_ci		return 4;
517462306a36Sopenharmony_ci
517562306a36Sopenharmony_ci	return max_tdp_level;
517662306a36Sopenharmony_ci}
517762306a36Sopenharmony_ci
517862306a36Sopenharmony_cistatic union kvm_mmu_page_role
517962306a36Sopenharmony_cikvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
518062306a36Sopenharmony_ci				union kvm_cpu_role cpu_role)
518162306a36Sopenharmony_ci{
518262306a36Sopenharmony_ci	union kvm_mmu_page_role role = {0};
518362306a36Sopenharmony_ci
518462306a36Sopenharmony_ci	role.access = ACC_ALL;
518562306a36Sopenharmony_ci	role.cr0_wp = true;
518662306a36Sopenharmony_ci	role.efer_nx = true;
518762306a36Sopenharmony_ci	role.smm = cpu_role.base.smm;
518862306a36Sopenharmony_ci	role.guest_mode = cpu_role.base.guest_mode;
518962306a36Sopenharmony_ci	role.ad_disabled = !kvm_ad_enabled();
519062306a36Sopenharmony_ci	role.level = kvm_mmu_get_tdp_level(vcpu);
519162306a36Sopenharmony_ci	role.direct = true;
519262306a36Sopenharmony_ci	role.has_4_byte_gpte = false;
519362306a36Sopenharmony_ci
519462306a36Sopenharmony_ci	return role;
519562306a36Sopenharmony_ci}
519662306a36Sopenharmony_ci
519762306a36Sopenharmony_cistatic void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
519862306a36Sopenharmony_ci			     union kvm_cpu_role cpu_role)
519962306a36Sopenharmony_ci{
520062306a36Sopenharmony_ci	struct kvm_mmu *context = &vcpu->arch.root_mmu;
520162306a36Sopenharmony_ci	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
520262306a36Sopenharmony_ci
520362306a36Sopenharmony_ci	if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
520462306a36Sopenharmony_ci	    root_role.word == context->root_role.word)
520562306a36Sopenharmony_ci		return;
520662306a36Sopenharmony_ci
520762306a36Sopenharmony_ci	context->cpu_role.as_u64 = cpu_role.as_u64;
520862306a36Sopenharmony_ci	context->root_role.word = root_role.word;
520962306a36Sopenharmony_ci	context->page_fault = kvm_tdp_page_fault;
521062306a36Sopenharmony_ci	context->sync_spte = NULL;
521162306a36Sopenharmony_ci	context->get_guest_pgd = get_guest_cr3;
521262306a36Sopenharmony_ci	context->get_pdptr = kvm_pdptr_read;
521362306a36Sopenharmony_ci	context->inject_page_fault = kvm_inject_page_fault;
521462306a36Sopenharmony_ci
521562306a36Sopenharmony_ci	if (!is_cr0_pg(context))
521662306a36Sopenharmony_ci		context->gva_to_gpa = nonpaging_gva_to_gpa;
521762306a36Sopenharmony_ci	else if (is_cr4_pae(context))
521862306a36Sopenharmony_ci		context->gva_to_gpa = paging64_gva_to_gpa;
521962306a36Sopenharmony_ci	else
522062306a36Sopenharmony_ci		context->gva_to_gpa = paging32_gva_to_gpa;
522162306a36Sopenharmony_ci
522262306a36Sopenharmony_ci	reset_guest_paging_metadata(vcpu, context);
522362306a36Sopenharmony_ci	reset_tdp_shadow_zero_bits_mask(context);
522462306a36Sopenharmony_ci}
522562306a36Sopenharmony_ci
522662306a36Sopenharmony_cistatic void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
522762306a36Sopenharmony_ci				    union kvm_cpu_role cpu_role,
522862306a36Sopenharmony_ci				    union kvm_mmu_page_role root_role)
522962306a36Sopenharmony_ci{
523062306a36Sopenharmony_ci	if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
523162306a36Sopenharmony_ci	    root_role.word == context->root_role.word)
523262306a36Sopenharmony_ci		return;
523362306a36Sopenharmony_ci
523462306a36Sopenharmony_ci	context->cpu_role.as_u64 = cpu_role.as_u64;
523562306a36Sopenharmony_ci	context->root_role.word = root_role.word;
523662306a36Sopenharmony_ci
523762306a36Sopenharmony_ci	if (!is_cr0_pg(context))
523862306a36Sopenharmony_ci		nonpaging_init_context(context);
523962306a36Sopenharmony_ci	else if (is_cr4_pae(context))
524062306a36Sopenharmony_ci		paging64_init_context(context);
524162306a36Sopenharmony_ci	else
524262306a36Sopenharmony_ci		paging32_init_context(context);
524362306a36Sopenharmony_ci
524462306a36Sopenharmony_ci	reset_guest_paging_metadata(vcpu, context);
524562306a36Sopenharmony_ci	reset_shadow_zero_bits_mask(vcpu, context);
524662306a36Sopenharmony_ci}
524762306a36Sopenharmony_ci
524862306a36Sopenharmony_cistatic void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
524962306a36Sopenharmony_ci				union kvm_cpu_role cpu_role)
525062306a36Sopenharmony_ci{
525162306a36Sopenharmony_ci	struct kvm_mmu *context = &vcpu->arch.root_mmu;
525262306a36Sopenharmony_ci	union kvm_mmu_page_role root_role;
525362306a36Sopenharmony_ci
525462306a36Sopenharmony_ci	root_role = cpu_role.base;
525562306a36Sopenharmony_ci
525662306a36Sopenharmony_ci	/* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
525762306a36Sopenharmony_ci	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
525862306a36Sopenharmony_ci
525962306a36Sopenharmony_ci	/*
526062306a36Sopenharmony_ci	 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
526162306a36Sopenharmony_ci	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
526262306a36Sopenharmony_ci	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
526362306a36Sopenharmony_ci	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
526462306a36Sopenharmony_ci	 * The iTLB multi-hit workaround can be toggled at any time, so assume
526562306a36Sopenharmony_ci	 * NX can be used by any non-nested shadow MMU to avoid having to reset
526662306a36Sopenharmony_ci	 * MMU contexts.
526762306a36Sopenharmony_ci	 */
526862306a36Sopenharmony_ci	root_role.efer_nx = true;
526962306a36Sopenharmony_ci
527062306a36Sopenharmony_ci	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
527162306a36Sopenharmony_ci}
527262306a36Sopenharmony_ci
527362306a36Sopenharmony_civoid kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
527462306a36Sopenharmony_ci			     unsigned long cr4, u64 efer, gpa_t nested_cr3)
527562306a36Sopenharmony_ci{
527662306a36Sopenharmony_ci	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
527762306a36Sopenharmony_ci	struct kvm_mmu_role_regs regs = {
527862306a36Sopenharmony_ci		.cr0 = cr0,
527962306a36Sopenharmony_ci		.cr4 = cr4 & ~X86_CR4_PKE,
528062306a36Sopenharmony_ci		.efer = efer,
528162306a36Sopenharmony_ci	};
528262306a36Sopenharmony_ci	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
528362306a36Sopenharmony_ci	union kvm_mmu_page_role root_role;
528462306a36Sopenharmony_ci
528562306a36Sopenharmony_ci	/* NPT requires CR0.PG=1. */
528662306a36Sopenharmony_ci	WARN_ON_ONCE(cpu_role.base.direct);
528762306a36Sopenharmony_ci
528862306a36Sopenharmony_ci	root_role = cpu_role.base;
528962306a36Sopenharmony_ci	root_role.level = kvm_mmu_get_tdp_level(vcpu);
529062306a36Sopenharmony_ci	if (root_role.level == PT64_ROOT_5LEVEL &&
529162306a36Sopenharmony_ci	    cpu_role.base.level == PT64_ROOT_4LEVEL)
529262306a36Sopenharmony_ci		root_role.passthrough = 1;
529362306a36Sopenharmony_ci
529462306a36Sopenharmony_ci	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
529562306a36Sopenharmony_ci	kvm_mmu_new_pgd(vcpu, nested_cr3);
529662306a36Sopenharmony_ci}
529762306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
529862306a36Sopenharmony_ci
529962306a36Sopenharmony_cistatic union kvm_cpu_role
530062306a36Sopenharmony_cikvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
530162306a36Sopenharmony_ci				   bool execonly, u8 level)
530262306a36Sopenharmony_ci{
530362306a36Sopenharmony_ci	union kvm_cpu_role role = {0};
530462306a36Sopenharmony_ci
530562306a36Sopenharmony_ci	/*
530662306a36Sopenharmony_ci	 * KVM does not support SMM transfer monitors, and consequently does not
530762306a36Sopenharmony_ci	 * support the "entry to SMM" control either.  role.base.smm is always 0.
530862306a36Sopenharmony_ci	 */
530962306a36Sopenharmony_ci	WARN_ON_ONCE(is_smm(vcpu));
531062306a36Sopenharmony_ci	role.base.level = level;
531162306a36Sopenharmony_ci	role.base.has_4_byte_gpte = false;
531262306a36Sopenharmony_ci	role.base.direct = false;
531362306a36Sopenharmony_ci	role.base.ad_disabled = !accessed_dirty;
531462306a36Sopenharmony_ci	role.base.guest_mode = true;
531562306a36Sopenharmony_ci	role.base.access = ACC_ALL;
531662306a36Sopenharmony_ci
531762306a36Sopenharmony_ci	role.ext.word = 0;
531862306a36Sopenharmony_ci	role.ext.execonly = execonly;
531962306a36Sopenharmony_ci	role.ext.valid = 1;
532062306a36Sopenharmony_ci
532162306a36Sopenharmony_ci	return role;
532262306a36Sopenharmony_ci}
532362306a36Sopenharmony_ci
532462306a36Sopenharmony_civoid kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
532562306a36Sopenharmony_ci			     int huge_page_level, bool accessed_dirty,
532662306a36Sopenharmony_ci			     gpa_t new_eptp)
532762306a36Sopenharmony_ci{
532862306a36Sopenharmony_ci	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
532962306a36Sopenharmony_ci	u8 level = vmx_eptp_page_walk_level(new_eptp);
533062306a36Sopenharmony_ci	union kvm_cpu_role new_mode =
533162306a36Sopenharmony_ci		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
533262306a36Sopenharmony_ci						   execonly, level);
533362306a36Sopenharmony_ci
533462306a36Sopenharmony_ci	if (new_mode.as_u64 != context->cpu_role.as_u64) {
533562306a36Sopenharmony_ci		/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
533662306a36Sopenharmony_ci		context->cpu_role.as_u64 = new_mode.as_u64;
533762306a36Sopenharmony_ci		context->root_role.word = new_mode.base.word;
533862306a36Sopenharmony_ci
533962306a36Sopenharmony_ci		context->page_fault = ept_page_fault;
534062306a36Sopenharmony_ci		context->gva_to_gpa = ept_gva_to_gpa;
534162306a36Sopenharmony_ci		context->sync_spte = ept_sync_spte;
534262306a36Sopenharmony_ci
534362306a36Sopenharmony_ci		update_permission_bitmask(context, true);
534462306a36Sopenharmony_ci		context->pkru_mask = 0;
534562306a36Sopenharmony_ci		reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
534662306a36Sopenharmony_ci		reset_ept_shadow_zero_bits_mask(context, execonly);
534762306a36Sopenharmony_ci	}
534862306a36Sopenharmony_ci
534962306a36Sopenharmony_ci	kvm_mmu_new_pgd(vcpu, new_eptp);
535062306a36Sopenharmony_ci}
535162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
535262306a36Sopenharmony_ci
535362306a36Sopenharmony_cistatic void init_kvm_softmmu(struct kvm_vcpu *vcpu,
535462306a36Sopenharmony_ci			     union kvm_cpu_role cpu_role)
535562306a36Sopenharmony_ci{
535662306a36Sopenharmony_ci	struct kvm_mmu *context = &vcpu->arch.root_mmu;
535762306a36Sopenharmony_ci
535862306a36Sopenharmony_ci	kvm_init_shadow_mmu(vcpu, cpu_role);
535962306a36Sopenharmony_ci
536062306a36Sopenharmony_ci	context->get_guest_pgd     = get_guest_cr3;
536162306a36Sopenharmony_ci	context->get_pdptr         = kvm_pdptr_read;
536262306a36Sopenharmony_ci	context->inject_page_fault = kvm_inject_page_fault;
536362306a36Sopenharmony_ci}
536462306a36Sopenharmony_ci
536562306a36Sopenharmony_cistatic void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
536662306a36Sopenharmony_ci				union kvm_cpu_role new_mode)
536762306a36Sopenharmony_ci{
536862306a36Sopenharmony_ci	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
536962306a36Sopenharmony_ci
537062306a36Sopenharmony_ci	if (new_mode.as_u64 == g_context->cpu_role.as_u64)
537162306a36Sopenharmony_ci		return;
537262306a36Sopenharmony_ci
537362306a36Sopenharmony_ci	g_context->cpu_role.as_u64   = new_mode.as_u64;
537462306a36Sopenharmony_ci	g_context->get_guest_pgd     = get_guest_cr3;
537562306a36Sopenharmony_ci	g_context->get_pdptr         = kvm_pdptr_read;
537662306a36Sopenharmony_ci	g_context->inject_page_fault = kvm_inject_page_fault;
537762306a36Sopenharmony_ci
537862306a36Sopenharmony_ci	/*
537962306a36Sopenharmony_ci	 * L2 page tables are never shadowed, so there is no need to sync
538062306a36Sopenharmony_ci	 * SPTEs.
538162306a36Sopenharmony_ci	 */
538262306a36Sopenharmony_ci	g_context->sync_spte         = NULL;
538362306a36Sopenharmony_ci
538462306a36Sopenharmony_ci	/*
538562306a36Sopenharmony_ci	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
538662306a36Sopenharmony_ci	 * L1's nested page tables (e.g. EPT12). The nested translation
538762306a36Sopenharmony_ci	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
538862306a36Sopenharmony_ci	 * L2's page tables as the first level of translation and L1's
538962306a36Sopenharmony_ci	 * nested page tables as the second level of translation. Basically
539062306a36Sopenharmony_ci	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
539162306a36Sopenharmony_ci	 */
539262306a36Sopenharmony_ci	if (!is_paging(vcpu))
539362306a36Sopenharmony_ci		g_context->gva_to_gpa = nonpaging_gva_to_gpa;
539462306a36Sopenharmony_ci	else if (is_long_mode(vcpu))
539562306a36Sopenharmony_ci		g_context->gva_to_gpa = paging64_gva_to_gpa;
539662306a36Sopenharmony_ci	else if (is_pae(vcpu))
539762306a36Sopenharmony_ci		g_context->gva_to_gpa = paging64_gva_to_gpa;
539862306a36Sopenharmony_ci	else
539962306a36Sopenharmony_ci		g_context->gva_to_gpa = paging32_gva_to_gpa;
540062306a36Sopenharmony_ci
540162306a36Sopenharmony_ci	reset_guest_paging_metadata(vcpu, g_context);
540262306a36Sopenharmony_ci}
540362306a36Sopenharmony_ci
540462306a36Sopenharmony_civoid kvm_init_mmu(struct kvm_vcpu *vcpu)
540562306a36Sopenharmony_ci{
540662306a36Sopenharmony_ci	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
540762306a36Sopenharmony_ci	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
540862306a36Sopenharmony_ci
540962306a36Sopenharmony_ci	if (mmu_is_nested(vcpu))
541062306a36Sopenharmony_ci		init_kvm_nested_mmu(vcpu, cpu_role);
541162306a36Sopenharmony_ci	else if (tdp_enabled)
541262306a36Sopenharmony_ci		init_kvm_tdp_mmu(vcpu, cpu_role);
541362306a36Sopenharmony_ci	else
541462306a36Sopenharmony_ci		init_kvm_softmmu(vcpu, cpu_role);
541562306a36Sopenharmony_ci}
541662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_init_mmu);
541762306a36Sopenharmony_ci
541862306a36Sopenharmony_civoid kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
541962306a36Sopenharmony_ci{
542062306a36Sopenharmony_ci	/*
542162306a36Sopenharmony_ci	 * Invalidate all MMU roles to force them to reinitialize as CPUID
542262306a36Sopenharmony_ci	 * information is factored into reserved bit calculations.
542362306a36Sopenharmony_ci	 *
542462306a36Sopenharmony_ci	 * Correctly handling multiple vCPU models with respect to paging and
542562306a36Sopenharmony_ci	 * physical address properties) in a single VM would require tracking
542662306a36Sopenharmony_ci	 * all relevant CPUID information in kvm_mmu_page_role. That is very
542762306a36Sopenharmony_ci	 * undesirable as it would increase the memory requirements for
542862306a36Sopenharmony_ci	 * gfn_write_track (see struct kvm_mmu_page_role comments).  For now
542962306a36Sopenharmony_ci	 * that problem is swept under the rug; KVM's CPUID API is horrific and
543062306a36Sopenharmony_ci	 * it's all but impossible to solve it without introducing a new API.
543162306a36Sopenharmony_ci	 */
543262306a36Sopenharmony_ci	vcpu->arch.root_mmu.root_role.word = 0;
543362306a36Sopenharmony_ci	vcpu->arch.guest_mmu.root_role.word = 0;
543462306a36Sopenharmony_ci	vcpu->arch.nested_mmu.root_role.word = 0;
543562306a36Sopenharmony_ci	vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
543662306a36Sopenharmony_ci	vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
543762306a36Sopenharmony_ci	vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
543862306a36Sopenharmony_ci	kvm_mmu_reset_context(vcpu);
543962306a36Sopenharmony_ci
544062306a36Sopenharmony_ci	/*
544162306a36Sopenharmony_ci	 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
544262306a36Sopenharmony_ci	 * kvm_arch_vcpu_ioctl().
544362306a36Sopenharmony_ci	 */
544462306a36Sopenharmony_ci	KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
544562306a36Sopenharmony_ci}
544662306a36Sopenharmony_ci
544762306a36Sopenharmony_civoid kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
544862306a36Sopenharmony_ci{
544962306a36Sopenharmony_ci	kvm_mmu_unload(vcpu);
545062306a36Sopenharmony_ci	kvm_init_mmu(vcpu);
545162306a36Sopenharmony_ci}
545262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
545362306a36Sopenharmony_ci
545462306a36Sopenharmony_ciint kvm_mmu_load(struct kvm_vcpu *vcpu)
545562306a36Sopenharmony_ci{
545662306a36Sopenharmony_ci	int r;
545762306a36Sopenharmony_ci
545862306a36Sopenharmony_ci	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
545962306a36Sopenharmony_ci	if (r)
546062306a36Sopenharmony_ci		goto out;
546162306a36Sopenharmony_ci	r = mmu_alloc_special_roots(vcpu);
546262306a36Sopenharmony_ci	if (r)
546362306a36Sopenharmony_ci		goto out;
546462306a36Sopenharmony_ci	if (vcpu->arch.mmu->root_role.direct)
546562306a36Sopenharmony_ci		r = mmu_alloc_direct_roots(vcpu);
546662306a36Sopenharmony_ci	else
546762306a36Sopenharmony_ci		r = mmu_alloc_shadow_roots(vcpu);
546862306a36Sopenharmony_ci	if (r)
546962306a36Sopenharmony_ci		goto out;
547062306a36Sopenharmony_ci
547162306a36Sopenharmony_ci	kvm_mmu_sync_roots(vcpu);
547262306a36Sopenharmony_ci
547362306a36Sopenharmony_ci	kvm_mmu_load_pgd(vcpu);
547462306a36Sopenharmony_ci
547562306a36Sopenharmony_ci	/*
547662306a36Sopenharmony_ci	 * Flush any TLB entries for the new root, the provenance of the root
547762306a36Sopenharmony_ci	 * is unknown.  Even if KVM ensures there are no stale TLB entries
547862306a36Sopenharmony_ci	 * for a freed root, in theory another hypervisor could have left
547962306a36Sopenharmony_ci	 * stale entries.  Flushing on alloc also allows KVM to skip the TLB
548062306a36Sopenharmony_ci	 * flush when freeing a root (see kvm_tdp_mmu_put_root()).
548162306a36Sopenharmony_ci	 */
548262306a36Sopenharmony_ci	static_call(kvm_x86_flush_tlb_current)(vcpu);
548362306a36Sopenharmony_ciout:
548462306a36Sopenharmony_ci	return r;
548562306a36Sopenharmony_ci}
548662306a36Sopenharmony_ci
548762306a36Sopenharmony_civoid kvm_mmu_unload(struct kvm_vcpu *vcpu)
548862306a36Sopenharmony_ci{
548962306a36Sopenharmony_ci	struct kvm *kvm = vcpu->kvm;
549062306a36Sopenharmony_ci
549162306a36Sopenharmony_ci	kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
549262306a36Sopenharmony_ci	WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
549362306a36Sopenharmony_ci	kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
549462306a36Sopenharmony_ci	WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
549562306a36Sopenharmony_ci	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
549662306a36Sopenharmony_ci}
549762306a36Sopenharmony_ci
549862306a36Sopenharmony_cistatic bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
549962306a36Sopenharmony_ci{
550062306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
550162306a36Sopenharmony_ci
550262306a36Sopenharmony_ci	if (!VALID_PAGE(root_hpa))
550362306a36Sopenharmony_ci		return false;
550462306a36Sopenharmony_ci
550562306a36Sopenharmony_ci	/*
550662306a36Sopenharmony_ci	 * When freeing obsolete roots, treat roots as obsolete if they don't
550762306a36Sopenharmony_ci	 * have an associated shadow page, as it's impossible to determine if
550862306a36Sopenharmony_ci	 * such roots are fresh or stale.  This does mean KVM will get false
550962306a36Sopenharmony_ci	 * positives and free roots that don't strictly need to be freed, but
551062306a36Sopenharmony_ci	 * such false positives are relatively rare:
551162306a36Sopenharmony_ci	 *
551262306a36Sopenharmony_ci	 *  (a) only PAE paging and nested NPT have roots without shadow pages
551362306a36Sopenharmony_ci	 *      (or any shadow paging flavor with a dummy root, see note below)
551462306a36Sopenharmony_ci	 *  (b) remote reloads due to a memslot update obsoletes _all_ roots
551562306a36Sopenharmony_ci	 *  (c) KVM doesn't track previous roots for PAE paging, and the guest
551662306a36Sopenharmony_ci	 *      is unlikely to zap an in-use PGD.
551762306a36Sopenharmony_ci	 *
551862306a36Sopenharmony_ci	 * Note!  Dummy roots are unique in that they are obsoleted by memslot
551962306a36Sopenharmony_ci	 * _creation_!  See also FNAME(fetch).
552062306a36Sopenharmony_ci	 */
552162306a36Sopenharmony_ci	sp = root_to_sp(root_hpa);
552262306a36Sopenharmony_ci	return !sp || is_obsolete_sp(kvm, sp);
552362306a36Sopenharmony_ci}
552462306a36Sopenharmony_ci
552562306a36Sopenharmony_cistatic void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
552662306a36Sopenharmony_ci{
552762306a36Sopenharmony_ci	unsigned long roots_to_free = 0;
552862306a36Sopenharmony_ci	int i;
552962306a36Sopenharmony_ci
553062306a36Sopenharmony_ci	if (is_obsolete_root(kvm, mmu->root.hpa))
553162306a36Sopenharmony_ci		roots_to_free |= KVM_MMU_ROOT_CURRENT;
553262306a36Sopenharmony_ci
553362306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
553462306a36Sopenharmony_ci		if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
553562306a36Sopenharmony_ci			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
553662306a36Sopenharmony_ci	}
553762306a36Sopenharmony_ci
553862306a36Sopenharmony_ci	if (roots_to_free)
553962306a36Sopenharmony_ci		kvm_mmu_free_roots(kvm, mmu, roots_to_free);
554062306a36Sopenharmony_ci}
554162306a36Sopenharmony_ci
554262306a36Sopenharmony_civoid kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
554362306a36Sopenharmony_ci{
554462306a36Sopenharmony_ci	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
554562306a36Sopenharmony_ci	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
554662306a36Sopenharmony_ci}
554762306a36Sopenharmony_ci
554862306a36Sopenharmony_cistatic u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
554962306a36Sopenharmony_ci				    int *bytes)
555062306a36Sopenharmony_ci{
555162306a36Sopenharmony_ci	u64 gentry = 0;
555262306a36Sopenharmony_ci	int r;
555362306a36Sopenharmony_ci
555462306a36Sopenharmony_ci	/*
555562306a36Sopenharmony_ci	 * Assume that the pte write on a page table of the same type
555662306a36Sopenharmony_ci	 * as the current vcpu paging mode since we update the sptes only
555762306a36Sopenharmony_ci	 * when they have the same mode.
555862306a36Sopenharmony_ci	 */
555962306a36Sopenharmony_ci	if (is_pae(vcpu) && *bytes == 4) {
556062306a36Sopenharmony_ci		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
556162306a36Sopenharmony_ci		*gpa &= ~(gpa_t)7;
556262306a36Sopenharmony_ci		*bytes = 8;
556362306a36Sopenharmony_ci	}
556462306a36Sopenharmony_ci
556562306a36Sopenharmony_ci	if (*bytes == 4 || *bytes == 8) {
556662306a36Sopenharmony_ci		r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
556762306a36Sopenharmony_ci		if (r)
556862306a36Sopenharmony_ci			gentry = 0;
556962306a36Sopenharmony_ci	}
557062306a36Sopenharmony_ci
557162306a36Sopenharmony_ci	return gentry;
557262306a36Sopenharmony_ci}
557362306a36Sopenharmony_ci
557462306a36Sopenharmony_ci/*
557562306a36Sopenharmony_ci * If we're seeing too many writes to a page, it may no longer be a page table,
557662306a36Sopenharmony_ci * or we may be forking, in which case it is better to unmap the page.
557762306a36Sopenharmony_ci */
557862306a36Sopenharmony_cistatic bool detect_write_flooding(struct kvm_mmu_page *sp)
557962306a36Sopenharmony_ci{
558062306a36Sopenharmony_ci	/*
558162306a36Sopenharmony_ci	 * Skip write-flooding detected for the sp whose level is 1, because
558262306a36Sopenharmony_ci	 * it can become unsync, then the guest page is not write-protected.
558362306a36Sopenharmony_ci	 */
558462306a36Sopenharmony_ci	if (sp->role.level == PG_LEVEL_4K)
558562306a36Sopenharmony_ci		return false;
558662306a36Sopenharmony_ci
558762306a36Sopenharmony_ci	atomic_inc(&sp->write_flooding_count);
558862306a36Sopenharmony_ci	return atomic_read(&sp->write_flooding_count) >= 3;
558962306a36Sopenharmony_ci}
559062306a36Sopenharmony_ci
559162306a36Sopenharmony_ci/*
559262306a36Sopenharmony_ci * Misaligned accesses are too much trouble to fix up; also, they usually
559362306a36Sopenharmony_ci * indicate a page is not used as a page table.
559462306a36Sopenharmony_ci */
559562306a36Sopenharmony_cistatic bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
559662306a36Sopenharmony_ci				    int bytes)
559762306a36Sopenharmony_ci{
559862306a36Sopenharmony_ci	unsigned offset, pte_size, misaligned;
559962306a36Sopenharmony_ci
560062306a36Sopenharmony_ci	offset = offset_in_page(gpa);
560162306a36Sopenharmony_ci	pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
560262306a36Sopenharmony_ci
560362306a36Sopenharmony_ci	/*
560462306a36Sopenharmony_ci	 * Sometimes, the OS only writes the last one bytes to update status
560562306a36Sopenharmony_ci	 * bits, for example, in linux, andb instruction is used in clear_bit().
560662306a36Sopenharmony_ci	 */
560762306a36Sopenharmony_ci	if (!(offset & (pte_size - 1)) && bytes == 1)
560862306a36Sopenharmony_ci		return false;
560962306a36Sopenharmony_ci
561062306a36Sopenharmony_ci	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
561162306a36Sopenharmony_ci	misaligned |= bytes < 4;
561262306a36Sopenharmony_ci
561362306a36Sopenharmony_ci	return misaligned;
561462306a36Sopenharmony_ci}
561562306a36Sopenharmony_ci
561662306a36Sopenharmony_cistatic u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
561762306a36Sopenharmony_ci{
561862306a36Sopenharmony_ci	unsigned page_offset, quadrant;
561962306a36Sopenharmony_ci	u64 *spte;
562062306a36Sopenharmony_ci	int level;
562162306a36Sopenharmony_ci
562262306a36Sopenharmony_ci	page_offset = offset_in_page(gpa);
562362306a36Sopenharmony_ci	level = sp->role.level;
562462306a36Sopenharmony_ci	*nspte = 1;
562562306a36Sopenharmony_ci	if (sp->role.has_4_byte_gpte) {
562662306a36Sopenharmony_ci		page_offset <<= 1;	/* 32->64 */
562762306a36Sopenharmony_ci		/*
562862306a36Sopenharmony_ci		 * A 32-bit pde maps 4MB while the shadow pdes map
562962306a36Sopenharmony_ci		 * only 2MB.  So we need to double the offset again
563062306a36Sopenharmony_ci		 * and zap two pdes instead of one.
563162306a36Sopenharmony_ci		 */
563262306a36Sopenharmony_ci		if (level == PT32_ROOT_LEVEL) {
563362306a36Sopenharmony_ci			page_offset &= ~7; /* kill rounding error */
563462306a36Sopenharmony_ci			page_offset <<= 1;
563562306a36Sopenharmony_ci			*nspte = 2;
563662306a36Sopenharmony_ci		}
563762306a36Sopenharmony_ci		quadrant = page_offset >> PAGE_SHIFT;
563862306a36Sopenharmony_ci		page_offset &= ~PAGE_MASK;
563962306a36Sopenharmony_ci		if (quadrant != sp->role.quadrant)
564062306a36Sopenharmony_ci			return NULL;
564162306a36Sopenharmony_ci	}
564262306a36Sopenharmony_ci
564362306a36Sopenharmony_ci	spte = &sp->spt[page_offset / sizeof(*spte)];
564462306a36Sopenharmony_ci	return spte;
564562306a36Sopenharmony_ci}
564662306a36Sopenharmony_ci
564762306a36Sopenharmony_civoid kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
564862306a36Sopenharmony_ci			 int bytes)
564962306a36Sopenharmony_ci{
565062306a36Sopenharmony_ci	gfn_t gfn = gpa >> PAGE_SHIFT;
565162306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
565262306a36Sopenharmony_ci	LIST_HEAD(invalid_list);
565362306a36Sopenharmony_ci	u64 entry, gentry, *spte;
565462306a36Sopenharmony_ci	int npte;
565562306a36Sopenharmony_ci	bool flush = false;
565662306a36Sopenharmony_ci
565762306a36Sopenharmony_ci	/*
565862306a36Sopenharmony_ci	 * If we don't have indirect shadow pages, it means no page is
565962306a36Sopenharmony_ci	 * write-protected, so we can exit simply.
566062306a36Sopenharmony_ci	 */
566162306a36Sopenharmony_ci	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
566262306a36Sopenharmony_ci		return;
566362306a36Sopenharmony_ci
566462306a36Sopenharmony_ci	write_lock(&vcpu->kvm->mmu_lock);
566562306a36Sopenharmony_ci
566662306a36Sopenharmony_ci	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
566762306a36Sopenharmony_ci
566862306a36Sopenharmony_ci	++vcpu->kvm->stat.mmu_pte_write;
566962306a36Sopenharmony_ci
567062306a36Sopenharmony_ci	for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
567162306a36Sopenharmony_ci		if (detect_write_misaligned(sp, gpa, bytes) ||
567262306a36Sopenharmony_ci		      detect_write_flooding(sp)) {
567362306a36Sopenharmony_ci			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
567462306a36Sopenharmony_ci			++vcpu->kvm->stat.mmu_flooded;
567562306a36Sopenharmony_ci			continue;
567662306a36Sopenharmony_ci		}
567762306a36Sopenharmony_ci
567862306a36Sopenharmony_ci		spte = get_written_sptes(sp, gpa, &npte);
567962306a36Sopenharmony_ci		if (!spte)
568062306a36Sopenharmony_ci			continue;
568162306a36Sopenharmony_ci
568262306a36Sopenharmony_ci		while (npte--) {
568362306a36Sopenharmony_ci			entry = *spte;
568462306a36Sopenharmony_ci			mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
568562306a36Sopenharmony_ci			if (gentry && sp->role.level != PG_LEVEL_4K)
568662306a36Sopenharmony_ci				++vcpu->kvm->stat.mmu_pde_zapped;
568762306a36Sopenharmony_ci			if (is_shadow_present_pte(entry))
568862306a36Sopenharmony_ci				flush = true;
568962306a36Sopenharmony_ci			++spte;
569062306a36Sopenharmony_ci		}
569162306a36Sopenharmony_ci	}
569262306a36Sopenharmony_ci	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
569362306a36Sopenharmony_ci	write_unlock(&vcpu->kvm->mmu_lock);
569462306a36Sopenharmony_ci}
569562306a36Sopenharmony_ci
569662306a36Sopenharmony_ciint noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
569762306a36Sopenharmony_ci		       void *insn, int insn_len)
569862306a36Sopenharmony_ci{
569962306a36Sopenharmony_ci	int r, emulation_type = EMULTYPE_PF;
570062306a36Sopenharmony_ci	bool direct = vcpu->arch.mmu->root_role.direct;
570162306a36Sopenharmony_ci
570262306a36Sopenharmony_ci	/*
570362306a36Sopenharmony_ci	 * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
570462306a36Sopenharmony_ci	 * checks when emulating instructions that triggers implicit access.
570562306a36Sopenharmony_ci	 * WARN if hardware generates a fault with an error code that collides
570662306a36Sopenharmony_ci	 * with the KVM-defined value.  Clear the flag and continue on, i.e.
570762306a36Sopenharmony_ci	 * don't terminate the VM, as KVM can't possibly be relying on a flag
570862306a36Sopenharmony_ci	 * that KVM doesn't know about.
570962306a36Sopenharmony_ci	 */
571062306a36Sopenharmony_ci	if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
571162306a36Sopenharmony_ci		error_code &= ~PFERR_IMPLICIT_ACCESS;
571262306a36Sopenharmony_ci
571362306a36Sopenharmony_ci	if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
571462306a36Sopenharmony_ci		return RET_PF_RETRY;
571562306a36Sopenharmony_ci
571662306a36Sopenharmony_ci	r = RET_PF_INVALID;
571762306a36Sopenharmony_ci	if (unlikely(error_code & PFERR_RSVD_MASK)) {
571862306a36Sopenharmony_ci		r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
571962306a36Sopenharmony_ci		if (r == RET_PF_EMULATE)
572062306a36Sopenharmony_ci			goto emulate;
572162306a36Sopenharmony_ci	}
572262306a36Sopenharmony_ci
572362306a36Sopenharmony_ci	if (r == RET_PF_INVALID) {
572462306a36Sopenharmony_ci		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
572562306a36Sopenharmony_ci					  lower_32_bits(error_code), false,
572662306a36Sopenharmony_ci					  &emulation_type);
572762306a36Sopenharmony_ci		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
572862306a36Sopenharmony_ci			return -EIO;
572962306a36Sopenharmony_ci	}
573062306a36Sopenharmony_ci
573162306a36Sopenharmony_ci	if (r < 0)
573262306a36Sopenharmony_ci		return r;
573362306a36Sopenharmony_ci	if (r != RET_PF_EMULATE)
573462306a36Sopenharmony_ci		return 1;
573562306a36Sopenharmony_ci
573662306a36Sopenharmony_ci	/*
573762306a36Sopenharmony_ci	 * Before emulating the instruction, check if the error code
573862306a36Sopenharmony_ci	 * was due to a RO violation while translating the guest page.
573962306a36Sopenharmony_ci	 * This can occur when using nested virtualization with nested
574062306a36Sopenharmony_ci	 * paging in both guests. If true, we simply unprotect the page
574162306a36Sopenharmony_ci	 * and resume the guest.
574262306a36Sopenharmony_ci	 */
574362306a36Sopenharmony_ci	if (vcpu->arch.mmu->root_role.direct &&
574462306a36Sopenharmony_ci	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
574562306a36Sopenharmony_ci		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
574662306a36Sopenharmony_ci		return 1;
574762306a36Sopenharmony_ci	}
574862306a36Sopenharmony_ci
574962306a36Sopenharmony_ci	/*
575062306a36Sopenharmony_ci	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
575162306a36Sopenharmony_ci	 * optimistically try to just unprotect the page and let the processor
575262306a36Sopenharmony_ci	 * re-execute the instruction that caused the page fault.  Do not allow
575362306a36Sopenharmony_ci	 * retrying MMIO emulation, as it's not only pointless but could also
575462306a36Sopenharmony_ci	 * cause us to enter an infinite loop because the processor will keep
575562306a36Sopenharmony_ci	 * faulting on the non-existent MMIO address.  Retrying an instruction
575662306a36Sopenharmony_ci	 * from a nested guest is also pointless and dangerous as we are only
575762306a36Sopenharmony_ci	 * explicitly shadowing L1's page tables, i.e. unprotecting something
575862306a36Sopenharmony_ci	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
575962306a36Sopenharmony_ci	 */
576062306a36Sopenharmony_ci	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
576162306a36Sopenharmony_ci		emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
576262306a36Sopenharmony_ciemulate:
576362306a36Sopenharmony_ci	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
576462306a36Sopenharmony_ci				       insn_len);
576562306a36Sopenharmony_ci}
576662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
576762306a36Sopenharmony_ci
576862306a36Sopenharmony_cistatic void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
576962306a36Sopenharmony_ci				      u64 addr, hpa_t root_hpa)
577062306a36Sopenharmony_ci{
577162306a36Sopenharmony_ci	struct kvm_shadow_walk_iterator iterator;
577262306a36Sopenharmony_ci
577362306a36Sopenharmony_ci	vcpu_clear_mmio_info(vcpu, addr);
577462306a36Sopenharmony_ci
577562306a36Sopenharmony_ci	/*
577662306a36Sopenharmony_ci	 * Walking and synchronizing SPTEs both assume they are operating in
577762306a36Sopenharmony_ci	 * the context of the current MMU, and would need to be reworked if
577862306a36Sopenharmony_ci	 * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
577962306a36Sopenharmony_ci	 */
578062306a36Sopenharmony_ci	if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
578162306a36Sopenharmony_ci		return;
578262306a36Sopenharmony_ci
578362306a36Sopenharmony_ci	if (!VALID_PAGE(root_hpa))
578462306a36Sopenharmony_ci		return;
578562306a36Sopenharmony_ci
578662306a36Sopenharmony_ci	write_lock(&vcpu->kvm->mmu_lock);
578762306a36Sopenharmony_ci	for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
578862306a36Sopenharmony_ci		struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
578962306a36Sopenharmony_ci
579062306a36Sopenharmony_ci		if (sp->unsync) {
579162306a36Sopenharmony_ci			int ret = kvm_sync_spte(vcpu, sp, iterator.index);
579262306a36Sopenharmony_ci
579362306a36Sopenharmony_ci			if (ret < 0)
579462306a36Sopenharmony_ci				mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
579562306a36Sopenharmony_ci			if (ret)
579662306a36Sopenharmony_ci				kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
579762306a36Sopenharmony_ci		}
579862306a36Sopenharmony_ci
579962306a36Sopenharmony_ci		if (!sp->unsync_children)
580062306a36Sopenharmony_ci			break;
580162306a36Sopenharmony_ci	}
580262306a36Sopenharmony_ci	write_unlock(&vcpu->kvm->mmu_lock);
580362306a36Sopenharmony_ci}
580462306a36Sopenharmony_ci
580562306a36Sopenharmony_civoid kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
580662306a36Sopenharmony_ci			     u64 addr, unsigned long roots)
580762306a36Sopenharmony_ci{
580862306a36Sopenharmony_ci	int i;
580962306a36Sopenharmony_ci
581062306a36Sopenharmony_ci	WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
581162306a36Sopenharmony_ci
581262306a36Sopenharmony_ci	/* It's actually a GPA for vcpu->arch.guest_mmu.  */
581362306a36Sopenharmony_ci	if (mmu != &vcpu->arch.guest_mmu) {
581462306a36Sopenharmony_ci		/* INVLPG on a non-canonical address is a NOP according to the SDM.  */
581562306a36Sopenharmony_ci		if (is_noncanonical_address(addr, vcpu))
581662306a36Sopenharmony_ci			return;
581762306a36Sopenharmony_ci
581862306a36Sopenharmony_ci		static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
581962306a36Sopenharmony_ci	}
582062306a36Sopenharmony_ci
582162306a36Sopenharmony_ci	if (!mmu->sync_spte)
582262306a36Sopenharmony_ci		return;
582362306a36Sopenharmony_ci
582462306a36Sopenharmony_ci	if (roots & KVM_MMU_ROOT_CURRENT)
582562306a36Sopenharmony_ci		__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
582662306a36Sopenharmony_ci
582762306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
582862306a36Sopenharmony_ci		if (roots & KVM_MMU_ROOT_PREVIOUS(i))
582962306a36Sopenharmony_ci			__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
583062306a36Sopenharmony_ci	}
583162306a36Sopenharmony_ci}
583262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
583362306a36Sopenharmony_ci
583462306a36Sopenharmony_civoid kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
583562306a36Sopenharmony_ci{
583662306a36Sopenharmony_ci	/*
583762306a36Sopenharmony_ci	 * INVLPG is required to invalidate any global mappings for the VA,
583862306a36Sopenharmony_ci	 * irrespective of PCID.  Blindly sync all roots as it would take
583962306a36Sopenharmony_ci	 * roughly the same amount of work/time to determine whether any of the
584062306a36Sopenharmony_ci	 * previous roots have a global mapping.
584162306a36Sopenharmony_ci	 *
584262306a36Sopenharmony_ci	 * Mappings not reachable via the current or previous cached roots will
584362306a36Sopenharmony_ci	 * be synced when switching to that new cr3, so nothing needs to be
584462306a36Sopenharmony_ci	 * done here for them.
584562306a36Sopenharmony_ci	 */
584662306a36Sopenharmony_ci	kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
584762306a36Sopenharmony_ci	++vcpu->stat.invlpg;
584862306a36Sopenharmony_ci}
584962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
585062306a36Sopenharmony_ci
585162306a36Sopenharmony_ci
585262306a36Sopenharmony_civoid kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
585362306a36Sopenharmony_ci{
585462306a36Sopenharmony_ci	struct kvm_mmu *mmu = vcpu->arch.mmu;
585562306a36Sopenharmony_ci	unsigned long roots = 0;
585662306a36Sopenharmony_ci	uint i;
585762306a36Sopenharmony_ci
585862306a36Sopenharmony_ci	if (pcid == kvm_get_active_pcid(vcpu))
585962306a36Sopenharmony_ci		roots |= KVM_MMU_ROOT_CURRENT;
586062306a36Sopenharmony_ci
586162306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
586262306a36Sopenharmony_ci		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
586362306a36Sopenharmony_ci		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
586462306a36Sopenharmony_ci			roots |= KVM_MMU_ROOT_PREVIOUS(i);
586562306a36Sopenharmony_ci	}
586662306a36Sopenharmony_ci
586762306a36Sopenharmony_ci	if (roots)
586862306a36Sopenharmony_ci		kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
586962306a36Sopenharmony_ci	++vcpu->stat.invlpg;
587062306a36Sopenharmony_ci
587162306a36Sopenharmony_ci	/*
587262306a36Sopenharmony_ci	 * Mappings not reachable via the current cr3 or the prev_roots will be
587362306a36Sopenharmony_ci	 * synced when switching to that cr3, so nothing needs to be done here
587462306a36Sopenharmony_ci	 * for them.
587562306a36Sopenharmony_ci	 */
587662306a36Sopenharmony_ci}
587762306a36Sopenharmony_ci
587862306a36Sopenharmony_civoid kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
587962306a36Sopenharmony_ci		       int tdp_max_root_level, int tdp_huge_page_level)
588062306a36Sopenharmony_ci{
588162306a36Sopenharmony_ci	tdp_enabled = enable_tdp;
588262306a36Sopenharmony_ci	tdp_root_level = tdp_forced_root_level;
588362306a36Sopenharmony_ci	max_tdp_level = tdp_max_root_level;
588462306a36Sopenharmony_ci
588562306a36Sopenharmony_ci#ifdef CONFIG_X86_64
588662306a36Sopenharmony_ci	tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
588762306a36Sopenharmony_ci#endif
588862306a36Sopenharmony_ci	/*
588962306a36Sopenharmony_ci	 * max_huge_page_level reflects KVM's MMU capabilities irrespective
589062306a36Sopenharmony_ci	 * of kernel support, e.g. KVM may be capable of using 1GB pages when
589162306a36Sopenharmony_ci	 * the kernel is not.  But, KVM never creates a page size greater than
589262306a36Sopenharmony_ci	 * what is used by the kernel for any given HVA, i.e. the kernel's
589362306a36Sopenharmony_ci	 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
589462306a36Sopenharmony_ci	 */
589562306a36Sopenharmony_ci	if (tdp_enabled)
589662306a36Sopenharmony_ci		max_huge_page_level = tdp_huge_page_level;
589762306a36Sopenharmony_ci	else if (boot_cpu_has(X86_FEATURE_GBPAGES))
589862306a36Sopenharmony_ci		max_huge_page_level = PG_LEVEL_1G;
589962306a36Sopenharmony_ci	else
590062306a36Sopenharmony_ci		max_huge_page_level = PG_LEVEL_2M;
590162306a36Sopenharmony_ci}
590262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_configure_mmu);
590362306a36Sopenharmony_ci
590462306a36Sopenharmony_ci/* The return value indicates if tlb flush on all vcpus is needed. */
590562306a36Sopenharmony_citypedef bool (*slot_rmaps_handler) (struct kvm *kvm,
590662306a36Sopenharmony_ci				    struct kvm_rmap_head *rmap_head,
590762306a36Sopenharmony_ci				    const struct kvm_memory_slot *slot);
590862306a36Sopenharmony_ci
590962306a36Sopenharmony_cistatic __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
591062306a36Sopenharmony_ci					      const struct kvm_memory_slot *slot,
591162306a36Sopenharmony_ci					      slot_rmaps_handler fn,
591262306a36Sopenharmony_ci					      int start_level, int end_level,
591362306a36Sopenharmony_ci					      gfn_t start_gfn, gfn_t end_gfn,
591462306a36Sopenharmony_ci					      bool flush_on_yield, bool flush)
591562306a36Sopenharmony_ci{
591662306a36Sopenharmony_ci	struct slot_rmap_walk_iterator iterator;
591762306a36Sopenharmony_ci
591862306a36Sopenharmony_ci	lockdep_assert_held_write(&kvm->mmu_lock);
591962306a36Sopenharmony_ci
592062306a36Sopenharmony_ci	for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
592162306a36Sopenharmony_ci			end_gfn, &iterator) {
592262306a36Sopenharmony_ci		if (iterator.rmap)
592362306a36Sopenharmony_ci			flush |= fn(kvm, iterator.rmap, slot);
592462306a36Sopenharmony_ci
592562306a36Sopenharmony_ci		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
592662306a36Sopenharmony_ci			if (flush && flush_on_yield) {
592762306a36Sopenharmony_ci				kvm_flush_remote_tlbs_range(kvm, start_gfn,
592862306a36Sopenharmony_ci							    iterator.gfn - start_gfn + 1);
592962306a36Sopenharmony_ci				flush = false;
593062306a36Sopenharmony_ci			}
593162306a36Sopenharmony_ci			cond_resched_rwlock_write(&kvm->mmu_lock);
593262306a36Sopenharmony_ci		}
593362306a36Sopenharmony_ci	}
593462306a36Sopenharmony_ci
593562306a36Sopenharmony_ci	return flush;
593662306a36Sopenharmony_ci}
593762306a36Sopenharmony_ci
593862306a36Sopenharmony_cistatic __always_inline bool walk_slot_rmaps(struct kvm *kvm,
593962306a36Sopenharmony_ci					    const struct kvm_memory_slot *slot,
594062306a36Sopenharmony_ci					    slot_rmaps_handler fn,
594162306a36Sopenharmony_ci					    int start_level, int end_level,
594262306a36Sopenharmony_ci					    bool flush_on_yield)
594362306a36Sopenharmony_ci{
594462306a36Sopenharmony_ci	return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
594562306a36Sopenharmony_ci				 slot->base_gfn, slot->base_gfn + slot->npages - 1,
594662306a36Sopenharmony_ci				 flush_on_yield, false);
594762306a36Sopenharmony_ci}
594862306a36Sopenharmony_ci
594962306a36Sopenharmony_cistatic __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
595062306a36Sopenharmony_ci					       const struct kvm_memory_slot *slot,
595162306a36Sopenharmony_ci					       slot_rmaps_handler fn,
595262306a36Sopenharmony_ci					       bool flush_on_yield)
595362306a36Sopenharmony_ci{
595462306a36Sopenharmony_ci	return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
595562306a36Sopenharmony_ci}
595662306a36Sopenharmony_ci
595762306a36Sopenharmony_cistatic void free_mmu_pages(struct kvm_mmu *mmu)
595862306a36Sopenharmony_ci{
595962306a36Sopenharmony_ci	if (!tdp_enabled && mmu->pae_root)
596062306a36Sopenharmony_ci		set_memory_encrypted((unsigned long)mmu->pae_root, 1);
596162306a36Sopenharmony_ci	free_page((unsigned long)mmu->pae_root);
596262306a36Sopenharmony_ci	free_page((unsigned long)mmu->pml4_root);
596362306a36Sopenharmony_ci	free_page((unsigned long)mmu->pml5_root);
596462306a36Sopenharmony_ci}
596562306a36Sopenharmony_ci
596662306a36Sopenharmony_cistatic int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
596762306a36Sopenharmony_ci{
596862306a36Sopenharmony_ci	struct page *page;
596962306a36Sopenharmony_ci	int i;
597062306a36Sopenharmony_ci
597162306a36Sopenharmony_ci	mmu->root.hpa = INVALID_PAGE;
597262306a36Sopenharmony_ci	mmu->root.pgd = 0;
597362306a36Sopenharmony_ci	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
597462306a36Sopenharmony_ci		mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
597562306a36Sopenharmony_ci
597662306a36Sopenharmony_ci	/* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
597762306a36Sopenharmony_ci	if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
597862306a36Sopenharmony_ci		return 0;
597962306a36Sopenharmony_ci
598062306a36Sopenharmony_ci	/*
598162306a36Sopenharmony_ci	 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
598262306a36Sopenharmony_ci	 * while the PDP table is a per-vCPU construct that's allocated at MMU
598362306a36Sopenharmony_ci	 * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
598462306a36Sopenharmony_ci	 * x86_64.  Therefore we need to allocate the PDP table in the first
598562306a36Sopenharmony_ci	 * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
598662306a36Sopenharmony_ci	 * generally doesn't use PAE paging and can skip allocating the PDP
598762306a36Sopenharmony_ci	 * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
598862306a36Sopenharmony_ci	 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
598962306a36Sopenharmony_ci	 * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
599062306a36Sopenharmony_ci	 */
599162306a36Sopenharmony_ci	if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
599262306a36Sopenharmony_ci		return 0;
599362306a36Sopenharmony_ci
599462306a36Sopenharmony_ci	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
599562306a36Sopenharmony_ci	if (!page)
599662306a36Sopenharmony_ci		return -ENOMEM;
599762306a36Sopenharmony_ci
599862306a36Sopenharmony_ci	mmu->pae_root = page_address(page);
599962306a36Sopenharmony_ci
600062306a36Sopenharmony_ci	/*
600162306a36Sopenharmony_ci	 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
600262306a36Sopenharmony_ci	 * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
600362306a36Sopenharmony_ci	 * that KVM's writes and the CPU's reads get along.  Note, this is
600462306a36Sopenharmony_ci	 * only necessary when using shadow paging, as 64-bit NPT can get at
600562306a36Sopenharmony_ci	 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
600662306a36Sopenharmony_ci	 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
600762306a36Sopenharmony_ci	 */
600862306a36Sopenharmony_ci	if (!tdp_enabled)
600962306a36Sopenharmony_ci		set_memory_decrypted((unsigned long)mmu->pae_root, 1);
601062306a36Sopenharmony_ci	else
601162306a36Sopenharmony_ci		WARN_ON_ONCE(shadow_me_value);
601262306a36Sopenharmony_ci
601362306a36Sopenharmony_ci	for (i = 0; i < 4; ++i)
601462306a36Sopenharmony_ci		mmu->pae_root[i] = INVALID_PAE_ROOT;
601562306a36Sopenharmony_ci
601662306a36Sopenharmony_ci	return 0;
601762306a36Sopenharmony_ci}
601862306a36Sopenharmony_ci
601962306a36Sopenharmony_ciint kvm_mmu_create(struct kvm_vcpu *vcpu)
602062306a36Sopenharmony_ci{
602162306a36Sopenharmony_ci	int ret;
602262306a36Sopenharmony_ci
602362306a36Sopenharmony_ci	vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
602462306a36Sopenharmony_ci	vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
602562306a36Sopenharmony_ci
602662306a36Sopenharmony_ci	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
602762306a36Sopenharmony_ci	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
602862306a36Sopenharmony_ci
602962306a36Sopenharmony_ci	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
603062306a36Sopenharmony_ci
603162306a36Sopenharmony_ci	vcpu->arch.mmu = &vcpu->arch.root_mmu;
603262306a36Sopenharmony_ci	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
603362306a36Sopenharmony_ci
603462306a36Sopenharmony_ci	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
603562306a36Sopenharmony_ci	if (ret)
603662306a36Sopenharmony_ci		return ret;
603762306a36Sopenharmony_ci
603862306a36Sopenharmony_ci	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
603962306a36Sopenharmony_ci	if (ret)
604062306a36Sopenharmony_ci		goto fail_allocate_root;
604162306a36Sopenharmony_ci
604262306a36Sopenharmony_ci	return ret;
604362306a36Sopenharmony_ci fail_allocate_root:
604462306a36Sopenharmony_ci	free_mmu_pages(&vcpu->arch.guest_mmu);
604562306a36Sopenharmony_ci	return ret;
604662306a36Sopenharmony_ci}
604762306a36Sopenharmony_ci
604862306a36Sopenharmony_ci#define BATCH_ZAP_PAGES	10
604962306a36Sopenharmony_cistatic void kvm_zap_obsolete_pages(struct kvm *kvm)
605062306a36Sopenharmony_ci{
605162306a36Sopenharmony_ci	struct kvm_mmu_page *sp, *node;
605262306a36Sopenharmony_ci	int nr_zapped, batch = 0;
605362306a36Sopenharmony_ci	bool unstable;
605462306a36Sopenharmony_ci
605562306a36Sopenharmony_cirestart:
605662306a36Sopenharmony_ci	list_for_each_entry_safe_reverse(sp, node,
605762306a36Sopenharmony_ci	      &kvm->arch.active_mmu_pages, link) {
605862306a36Sopenharmony_ci		/*
605962306a36Sopenharmony_ci		 * No obsolete valid page exists before a newly created page
606062306a36Sopenharmony_ci		 * since active_mmu_pages is a FIFO list.
606162306a36Sopenharmony_ci		 */
606262306a36Sopenharmony_ci		if (!is_obsolete_sp(kvm, sp))
606362306a36Sopenharmony_ci			break;
606462306a36Sopenharmony_ci
606562306a36Sopenharmony_ci		/*
606662306a36Sopenharmony_ci		 * Invalid pages should never land back on the list of active
606762306a36Sopenharmony_ci		 * pages.  Skip the bogus page, otherwise we'll get stuck in an
606862306a36Sopenharmony_ci		 * infinite loop if the page gets put back on the list (again).
606962306a36Sopenharmony_ci		 */
607062306a36Sopenharmony_ci		if (WARN_ON_ONCE(sp->role.invalid))
607162306a36Sopenharmony_ci			continue;
607262306a36Sopenharmony_ci
607362306a36Sopenharmony_ci		/*
607462306a36Sopenharmony_ci		 * No need to flush the TLB since we're only zapping shadow
607562306a36Sopenharmony_ci		 * pages with an obsolete generation number and all vCPUS have
607662306a36Sopenharmony_ci		 * loaded a new root, i.e. the shadow pages being zapped cannot
607762306a36Sopenharmony_ci		 * be in active use by the guest.
607862306a36Sopenharmony_ci		 */
607962306a36Sopenharmony_ci		if (batch >= BATCH_ZAP_PAGES &&
608062306a36Sopenharmony_ci		    cond_resched_rwlock_write(&kvm->mmu_lock)) {
608162306a36Sopenharmony_ci			batch = 0;
608262306a36Sopenharmony_ci			goto restart;
608362306a36Sopenharmony_ci		}
608462306a36Sopenharmony_ci
608562306a36Sopenharmony_ci		unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
608662306a36Sopenharmony_ci				&kvm->arch.zapped_obsolete_pages, &nr_zapped);
608762306a36Sopenharmony_ci		batch += nr_zapped;
608862306a36Sopenharmony_ci
608962306a36Sopenharmony_ci		if (unstable)
609062306a36Sopenharmony_ci			goto restart;
609162306a36Sopenharmony_ci	}
609262306a36Sopenharmony_ci
609362306a36Sopenharmony_ci	/*
609462306a36Sopenharmony_ci	 * Kick all vCPUs (via remote TLB flush) before freeing the page tables
609562306a36Sopenharmony_ci	 * to ensure KVM is not in the middle of a lockless shadow page table
609662306a36Sopenharmony_ci	 * walk, which may reference the pages.  The remote TLB flush itself is
609762306a36Sopenharmony_ci	 * not required and is simply a convenient way to kick vCPUs as needed.
609862306a36Sopenharmony_ci	 * KVM performs a local TLB flush when allocating a new root (see
609962306a36Sopenharmony_ci	 * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
610062306a36Sopenharmony_ci	 * running with an obsolete MMU.
610162306a36Sopenharmony_ci	 */
610262306a36Sopenharmony_ci	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
610362306a36Sopenharmony_ci}
610462306a36Sopenharmony_ci
610562306a36Sopenharmony_ci/*
610662306a36Sopenharmony_ci * Fast invalidate all shadow pages and use lock-break technique
610762306a36Sopenharmony_ci * to zap obsolete pages.
610862306a36Sopenharmony_ci *
610962306a36Sopenharmony_ci * It's required when memslot is being deleted or VM is being
611062306a36Sopenharmony_ci * destroyed, in these cases, we should ensure that KVM MMU does
611162306a36Sopenharmony_ci * not use any resource of the being-deleted slot or all slots
611262306a36Sopenharmony_ci * after calling the function.
611362306a36Sopenharmony_ci */
611462306a36Sopenharmony_cistatic void kvm_mmu_zap_all_fast(struct kvm *kvm)
611562306a36Sopenharmony_ci{
611662306a36Sopenharmony_ci	lockdep_assert_held(&kvm->slots_lock);
611762306a36Sopenharmony_ci
611862306a36Sopenharmony_ci	write_lock(&kvm->mmu_lock);
611962306a36Sopenharmony_ci	trace_kvm_mmu_zap_all_fast(kvm);
612062306a36Sopenharmony_ci
612162306a36Sopenharmony_ci	/*
612262306a36Sopenharmony_ci	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
612362306a36Sopenharmony_ci	 * held for the entire duration of zapping obsolete pages, it's
612462306a36Sopenharmony_ci	 * impossible for there to be multiple invalid generations associated
612562306a36Sopenharmony_ci	 * with *valid* shadow pages at any given time, i.e. there is exactly
612662306a36Sopenharmony_ci	 * one valid generation and (at most) one invalid generation.
612762306a36Sopenharmony_ci	 */
612862306a36Sopenharmony_ci	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
612962306a36Sopenharmony_ci
613062306a36Sopenharmony_ci	/*
613162306a36Sopenharmony_ci	 * In order to ensure all vCPUs drop their soon-to-be invalid roots,
613262306a36Sopenharmony_ci	 * invalidating TDP MMU roots must be done while holding mmu_lock for
613362306a36Sopenharmony_ci	 * write and in the same critical section as making the reload request,
613462306a36Sopenharmony_ci	 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
613562306a36Sopenharmony_ci	 */
613662306a36Sopenharmony_ci	if (tdp_mmu_enabled)
613762306a36Sopenharmony_ci		kvm_tdp_mmu_invalidate_all_roots(kvm);
613862306a36Sopenharmony_ci
613962306a36Sopenharmony_ci	/*
614062306a36Sopenharmony_ci	 * Notify all vcpus to reload its shadow page table and flush TLB.
614162306a36Sopenharmony_ci	 * Then all vcpus will switch to new shadow page table with the new
614262306a36Sopenharmony_ci	 * mmu_valid_gen.
614362306a36Sopenharmony_ci	 *
614462306a36Sopenharmony_ci	 * Note: we need to do this under the protection of mmu_lock,
614562306a36Sopenharmony_ci	 * otherwise, vcpu would purge shadow page but miss tlb flush.
614662306a36Sopenharmony_ci	 */
614762306a36Sopenharmony_ci	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
614862306a36Sopenharmony_ci
614962306a36Sopenharmony_ci	kvm_zap_obsolete_pages(kvm);
615062306a36Sopenharmony_ci
615162306a36Sopenharmony_ci	write_unlock(&kvm->mmu_lock);
615262306a36Sopenharmony_ci
615362306a36Sopenharmony_ci	/*
615462306a36Sopenharmony_ci	 * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
615562306a36Sopenharmony_ci	 * returning to the caller, e.g. if the zap is in response to a memslot
615662306a36Sopenharmony_ci	 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
615762306a36Sopenharmony_ci	 * associated with the deleted memslot once the update completes, and
615862306a36Sopenharmony_ci	 * Deferring the zap until the final reference to the root is put would
615962306a36Sopenharmony_ci	 * lead to use-after-free.
616062306a36Sopenharmony_ci	 */
616162306a36Sopenharmony_ci	if (tdp_mmu_enabled)
616262306a36Sopenharmony_ci		kvm_tdp_mmu_zap_invalidated_roots(kvm);
616362306a36Sopenharmony_ci}
616462306a36Sopenharmony_ci
616562306a36Sopenharmony_cistatic bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
616662306a36Sopenharmony_ci{
616762306a36Sopenharmony_ci	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
616862306a36Sopenharmony_ci}
616962306a36Sopenharmony_ci
617062306a36Sopenharmony_civoid kvm_mmu_init_vm(struct kvm *kvm)
617162306a36Sopenharmony_ci{
617262306a36Sopenharmony_ci	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
617362306a36Sopenharmony_ci	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
617462306a36Sopenharmony_ci	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
617562306a36Sopenharmony_ci	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
617662306a36Sopenharmony_ci
617762306a36Sopenharmony_ci	if (tdp_mmu_enabled)
617862306a36Sopenharmony_ci		kvm_mmu_init_tdp_mmu(kvm);
617962306a36Sopenharmony_ci
618062306a36Sopenharmony_ci	kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
618162306a36Sopenharmony_ci	kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
618262306a36Sopenharmony_ci
618362306a36Sopenharmony_ci	kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
618462306a36Sopenharmony_ci
618562306a36Sopenharmony_ci	kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
618662306a36Sopenharmony_ci	kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
618762306a36Sopenharmony_ci}
618862306a36Sopenharmony_ci
618962306a36Sopenharmony_cistatic void mmu_free_vm_memory_caches(struct kvm *kvm)
619062306a36Sopenharmony_ci{
619162306a36Sopenharmony_ci	kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
619262306a36Sopenharmony_ci	kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
619362306a36Sopenharmony_ci	kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
619462306a36Sopenharmony_ci}
619562306a36Sopenharmony_ci
619662306a36Sopenharmony_civoid kvm_mmu_uninit_vm(struct kvm *kvm)
619762306a36Sopenharmony_ci{
619862306a36Sopenharmony_ci	if (tdp_mmu_enabled)
619962306a36Sopenharmony_ci		kvm_mmu_uninit_tdp_mmu(kvm);
620062306a36Sopenharmony_ci
620162306a36Sopenharmony_ci	mmu_free_vm_memory_caches(kvm);
620262306a36Sopenharmony_ci}
620362306a36Sopenharmony_ci
620462306a36Sopenharmony_cistatic bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
620562306a36Sopenharmony_ci{
620662306a36Sopenharmony_ci	const struct kvm_memory_slot *memslot;
620762306a36Sopenharmony_ci	struct kvm_memslots *slots;
620862306a36Sopenharmony_ci	struct kvm_memslot_iter iter;
620962306a36Sopenharmony_ci	bool flush = false;
621062306a36Sopenharmony_ci	gfn_t start, end;
621162306a36Sopenharmony_ci	int i;
621262306a36Sopenharmony_ci
621362306a36Sopenharmony_ci	if (!kvm_memslots_have_rmaps(kvm))
621462306a36Sopenharmony_ci		return flush;
621562306a36Sopenharmony_ci
621662306a36Sopenharmony_ci	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
621762306a36Sopenharmony_ci		slots = __kvm_memslots(kvm, i);
621862306a36Sopenharmony_ci
621962306a36Sopenharmony_ci		kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
622062306a36Sopenharmony_ci			memslot = iter.slot;
622162306a36Sopenharmony_ci			start = max(gfn_start, memslot->base_gfn);
622262306a36Sopenharmony_ci			end = min(gfn_end, memslot->base_gfn + memslot->npages);
622362306a36Sopenharmony_ci			if (WARN_ON_ONCE(start >= end))
622462306a36Sopenharmony_ci				continue;
622562306a36Sopenharmony_ci
622662306a36Sopenharmony_ci			flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
622762306a36Sopenharmony_ci						  PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
622862306a36Sopenharmony_ci						  start, end - 1, true, flush);
622962306a36Sopenharmony_ci		}
623062306a36Sopenharmony_ci	}
623162306a36Sopenharmony_ci
623262306a36Sopenharmony_ci	return flush;
623362306a36Sopenharmony_ci}
623462306a36Sopenharmony_ci
623562306a36Sopenharmony_ci/*
623662306a36Sopenharmony_ci * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
623762306a36Sopenharmony_ci * (not including it)
623862306a36Sopenharmony_ci */
623962306a36Sopenharmony_civoid kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
624062306a36Sopenharmony_ci{
624162306a36Sopenharmony_ci	bool flush;
624262306a36Sopenharmony_ci
624362306a36Sopenharmony_ci	if (WARN_ON_ONCE(gfn_end <= gfn_start))
624462306a36Sopenharmony_ci		return;
624562306a36Sopenharmony_ci
624662306a36Sopenharmony_ci	write_lock(&kvm->mmu_lock);
624762306a36Sopenharmony_ci
624862306a36Sopenharmony_ci	kvm_mmu_invalidate_begin(kvm, 0, -1ul);
624962306a36Sopenharmony_ci
625062306a36Sopenharmony_ci	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
625162306a36Sopenharmony_ci
625262306a36Sopenharmony_ci	if (tdp_mmu_enabled)
625362306a36Sopenharmony_ci		flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
625462306a36Sopenharmony_ci
625562306a36Sopenharmony_ci	if (flush)
625662306a36Sopenharmony_ci		kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
625762306a36Sopenharmony_ci
625862306a36Sopenharmony_ci	kvm_mmu_invalidate_end(kvm, 0, -1ul);
625962306a36Sopenharmony_ci
626062306a36Sopenharmony_ci	write_unlock(&kvm->mmu_lock);
626162306a36Sopenharmony_ci}
626262306a36Sopenharmony_ci
626362306a36Sopenharmony_cistatic bool slot_rmap_write_protect(struct kvm *kvm,
626462306a36Sopenharmony_ci				    struct kvm_rmap_head *rmap_head,
626562306a36Sopenharmony_ci				    const struct kvm_memory_slot *slot)
626662306a36Sopenharmony_ci{
626762306a36Sopenharmony_ci	return rmap_write_protect(rmap_head, false);
626862306a36Sopenharmony_ci}
626962306a36Sopenharmony_ci
627062306a36Sopenharmony_civoid kvm_mmu_slot_remove_write_access(struct kvm *kvm,
627162306a36Sopenharmony_ci				      const struct kvm_memory_slot *memslot,
627262306a36Sopenharmony_ci				      int start_level)
627362306a36Sopenharmony_ci{
627462306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm)) {
627562306a36Sopenharmony_ci		write_lock(&kvm->mmu_lock);
627662306a36Sopenharmony_ci		walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
627762306a36Sopenharmony_ci				start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
627862306a36Sopenharmony_ci		write_unlock(&kvm->mmu_lock);
627962306a36Sopenharmony_ci	}
628062306a36Sopenharmony_ci
628162306a36Sopenharmony_ci	if (tdp_mmu_enabled) {
628262306a36Sopenharmony_ci		read_lock(&kvm->mmu_lock);
628362306a36Sopenharmony_ci		kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
628462306a36Sopenharmony_ci		read_unlock(&kvm->mmu_lock);
628562306a36Sopenharmony_ci	}
628662306a36Sopenharmony_ci}
628762306a36Sopenharmony_ci
628862306a36Sopenharmony_cistatic inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
628962306a36Sopenharmony_ci{
629062306a36Sopenharmony_ci	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
629162306a36Sopenharmony_ci}
629262306a36Sopenharmony_ci
629362306a36Sopenharmony_cistatic bool need_topup_split_caches_or_resched(struct kvm *kvm)
629462306a36Sopenharmony_ci{
629562306a36Sopenharmony_ci	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
629662306a36Sopenharmony_ci		return true;
629762306a36Sopenharmony_ci
629862306a36Sopenharmony_ci	/*
629962306a36Sopenharmony_ci	 * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
630062306a36Sopenharmony_ci	 * to split a single huge page. Calculating how many are actually needed
630162306a36Sopenharmony_ci	 * is possible but not worth the complexity.
630262306a36Sopenharmony_ci	 */
630362306a36Sopenharmony_ci	return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
630462306a36Sopenharmony_ci	       need_topup(&kvm->arch.split_page_header_cache, 1) ||
630562306a36Sopenharmony_ci	       need_topup(&kvm->arch.split_shadow_page_cache, 1);
630662306a36Sopenharmony_ci}
630762306a36Sopenharmony_ci
630862306a36Sopenharmony_cistatic int topup_split_caches(struct kvm *kvm)
630962306a36Sopenharmony_ci{
631062306a36Sopenharmony_ci	/*
631162306a36Sopenharmony_ci	 * Allocating rmap list entries when splitting huge pages for nested
631262306a36Sopenharmony_ci	 * MMUs is uncommon as KVM needs to use a list if and only if there is
631362306a36Sopenharmony_ci	 * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
631462306a36Sopenharmony_ci	 * aliased by multiple L2 gfns and/or from multiple nested roots with
631562306a36Sopenharmony_ci	 * different roles.  Aliasing gfns when using TDP is atypical for VMMs;
631662306a36Sopenharmony_ci	 * a few gfns are often aliased during boot, e.g. when remapping BIOS,
631762306a36Sopenharmony_ci	 * but aliasing rarely occurs post-boot or for many gfns.  If there is
631862306a36Sopenharmony_ci	 * only one rmap entry, rmap->val points directly at that one entry and
631962306a36Sopenharmony_ci	 * doesn't need to allocate a list.  Buffer the cache by the default
632062306a36Sopenharmony_ci	 * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
632162306a36Sopenharmony_ci	 * encounters an aliased gfn or two.
632262306a36Sopenharmony_ci	 */
632362306a36Sopenharmony_ci	const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
632462306a36Sopenharmony_ci			     KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
632562306a36Sopenharmony_ci	int r;
632662306a36Sopenharmony_ci
632762306a36Sopenharmony_ci	lockdep_assert_held(&kvm->slots_lock);
632862306a36Sopenharmony_ci
632962306a36Sopenharmony_ci	r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
633062306a36Sopenharmony_ci					 SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
633162306a36Sopenharmony_ci	if (r)
633262306a36Sopenharmony_ci		return r;
633362306a36Sopenharmony_ci
633462306a36Sopenharmony_ci	r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
633562306a36Sopenharmony_ci	if (r)
633662306a36Sopenharmony_ci		return r;
633762306a36Sopenharmony_ci
633862306a36Sopenharmony_ci	return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
633962306a36Sopenharmony_ci}
634062306a36Sopenharmony_ci
634162306a36Sopenharmony_cistatic struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
634262306a36Sopenharmony_ci{
634362306a36Sopenharmony_ci	struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
634462306a36Sopenharmony_ci	struct shadow_page_caches caches = {};
634562306a36Sopenharmony_ci	union kvm_mmu_page_role role;
634662306a36Sopenharmony_ci	unsigned int access;
634762306a36Sopenharmony_ci	gfn_t gfn;
634862306a36Sopenharmony_ci
634962306a36Sopenharmony_ci	gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
635062306a36Sopenharmony_ci	access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
635162306a36Sopenharmony_ci
635262306a36Sopenharmony_ci	/*
635362306a36Sopenharmony_ci	 * Note, huge page splitting always uses direct shadow pages, regardless
635462306a36Sopenharmony_ci	 * of whether the huge page itself is mapped by a direct or indirect
635562306a36Sopenharmony_ci	 * shadow page, since the huge page region itself is being directly
635662306a36Sopenharmony_ci	 * mapped with smaller pages.
635762306a36Sopenharmony_ci	 */
635862306a36Sopenharmony_ci	role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
635962306a36Sopenharmony_ci
636062306a36Sopenharmony_ci	/* Direct SPs do not require a shadowed_info_cache. */
636162306a36Sopenharmony_ci	caches.page_header_cache = &kvm->arch.split_page_header_cache;
636262306a36Sopenharmony_ci	caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
636362306a36Sopenharmony_ci
636462306a36Sopenharmony_ci	/* Safe to pass NULL for vCPU since requesting a direct SP. */
636562306a36Sopenharmony_ci	return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
636662306a36Sopenharmony_ci}
636762306a36Sopenharmony_ci
636862306a36Sopenharmony_cistatic void shadow_mmu_split_huge_page(struct kvm *kvm,
636962306a36Sopenharmony_ci				       const struct kvm_memory_slot *slot,
637062306a36Sopenharmony_ci				       u64 *huge_sptep)
637162306a36Sopenharmony_ci
637262306a36Sopenharmony_ci{
637362306a36Sopenharmony_ci	struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
637462306a36Sopenharmony_ci	u64 huge_spte = READ_ONCE(*huge_sptep);
637562306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
637662306a36Sopenharmony_ci	bool flush = false;
637762306a36Sopenharmony_ci	u64 *sptep, spte;
637862306a36Sopenharmony_ci	gfn_t gfn;
637962306a36Sopenharmony_ci	int index;
638062306a36Sopenharmony_ci
638162306a36Sopenharmony_ci	sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
638262306a36Sopenharmony_ci
638362306a36Sopenharmony_ci	for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
638462306a36Sopenharmony_ci		sptep = &sp->spt[index];
638562306a36Sopenharmony_ci		gfn = kvm_mmu_page_get_gfn(sp, index);
638662306a36Sopenharmony_ci
638762306a36Sopenharmony_ci		/*
638862306a36Sopenharmony_ci		 * The SP may already have populated SPTEs, e.g. if this huge
638962306a36Sopenharmony_ci		 * page is aliased by multiple sptes with the same access
639062306a36Sopenharmony_ci		 * permissions. These entries are guaranteed to map the same
639162306a36Sopenharmony_ci		 * gfn-to-pfn translation since the SP is direct, so no need to
639262306a36Sopenharmony_ci		 * modify them.
639362306a36Sopenharmony_ci		 *
639462306a36Sopenharmony_ci		 * However, if a given SPTE points to a lower level page table,
639562306a36Sopenharmony_ci		 * that lower level page table may only be partially populated.
639662306a36Sopenharmony_ci		 * Installing such SPTEs would effectively unmap a potion of the
639762306a36Sopenharmony_ci		 * huge page. Unmapping guest memory always requires a TLB flush
639862306a36Sopenharmony_ci		 * since a subsequent operation on the unmapped regions would
639962306a36Sopenharmony_ci		 * fail to detect the need to flush.
640062306a36Sopenharmony_ci		 */
640162306a36Sopenharmony_ci		if (is_shadow_present_pte(*sptep)) {
640262306a36Sopenharmony_ci			flush |= !is_last_spte(*sptep, sp->role.level);
640362306a36Sopenharmony_ci			continue;
640462306a36Sopenharmony_ci		}
640562306a36Sopenharmony_ci
640662306a36Sopenharmony_ci		spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
640762306a36Sopenharmony_ci		mmu_spte_set(sptep, spte);
640862306a36Sopenharmony_ci		__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
640962306a36Sopenharmony_ci	}
641062306a36Sopenharmony_ci
641162306a36Sopenharmony_ci	__link_shadow_page(kvm, cache, huge_sptep, sp, flush);
641262306a36Sopenharmony_ci}
641362306a36Sopenharmony_ci
641462306a36Sopenharmony_cistatic int shadow_mmu_try_split_huge_page(struct kvm *kvm,
641562306a36Sopenharmony_ci					  const struct kvm_memory_slot *slot,
641662306a36Sopenharmony_ci					  u64 *huge_sptep)
641762306a36Sopenharmony_ci{
641862306a36Sopenharmony_ci	struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
641962306a36Sopenharmony_ci	int level, r = 0;
642062306a36Sopenharmony_ci	gfn_t gfn;
642162306a36Sopenharmony_ci	u64 spte;
642262306a36Sopenharmony_ci
642362306a36Sopenharmony_ci	/* Grab information for the tracepoint before dropping the MMU lock. */
642462306a36Sopenharmony_ci	gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
642562306a36Sopenharmony_ci	level = huge_sp->role.level;
642662306a36Sopenharmony_ci	spte = *huge_sptep;
642762306a36Sopenharmony_ci
642862306a36Sopenharmony_ci	if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
642962306a36Sopenharmony_ci		r = -ENOSPC;
643062306a36Sopenharmony_ci		goto out;
643162306a36Sopenharmony_ci	}
643262306a36Sopenharmony_ci
643362306a36Sopenharmony_ci	if (need_topup_split_caches_or_resched(kvm)) {
643462306a36Sopenharmony_ci		write_unlock(&kvm->mmu_lock);
643562306a36Sopenharmony_ci		cond_resched();
643662306a36Sopenharmony_ci		/*
643762306a36Sopenharmony_ci		 * If the topup succeeds, return -EAGAIN to indicate that the
643862306a36Sopenharmony_ci		 * rmap iterator should be restarted because the MMU lock was
643962306a36Sopenharmony_ci		 * dropped.
644062306a36Sopenharmony_ci		 */
644162306a36Sopenharmony_ci		r = topup_split_caches(kvm) ?: -EAGAIN;
644262306a36Sopenharmony_ci		write_lock(&kvm->mmu_lock);
644362306a36Sopenharmony_ci		goto out;
644462306a36Sopenharmony_ci	}
644562306a36Sopenharmony_ci
644662306a36Sopenharmony_ci	shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
644762306a36Sopenharmony_ci
644862306a36Sopenharmony_ciout:
644962306a36Sopenharmony_ci	trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
645062306a36Sopenharmony_ci	return r;
645162306a36Sopenharmony_ci}
645262306a36Sopenharmony_ci
645362306a36Sopenharmony_cistatic bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
645462306a36Sopenharmony_ci					    struct kvm_rmap_head *rmap_head,
645562306a36Sopenharmony_ci					    const struct kvm_memory_slot *slot)
645662306a36Sopenharmony_ci{
645762306a36Sopenharmony_ci	struct rmap_iterator iter;
645862306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
645962306a36Sopenharmony_ci	u64 *huge_sptep;
646062306a36Sopenharmony_ci	int r;
646162306a36Sopenharmony_ci
646262306a36Sopenharmony_cirestart:
646362306a36Sopenharmony_ci	for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
646462306a36Sopenharmony_ci		sp = sptep_to_sp(huge_sptep);
646562306a36Sopenharmony_ci
646662306a36Sopenharmony_ci		/* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
646762306a36Sopenharmony_ci		if (WARN_ON_ONCE(!sp->role.guest_mode))
646862306a36Sopenharmony_ci			continue;
646962306a36Sopenharmony_ci
647062306a36Sopenharmony_ci		/* The rmaps should never contain non-leaf SPTEs. */
647162306a36Sopenharmony_ci		if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
647262306a36Sopenharmony_ci			continue;
647362306a36Sopenharmony_ci
647462306a36Sopenharmony_ci		/* SPs with level >PG_LEVEL_4K should never by unsync. */
647562306a36Sopenharmony_ci		if (WARN_ON_ONCE(sp->unsync))
647662306a36Sopenharmony_ci			continue;
647762306a36Sopenharmony_ci
647862306a36Sopenharmony_ci		/* Don't bother splitting huge pages on invalid SPs. */
647962306a36Sopenharmony_ci		if (sp->role.invalid)
648062306a36Sopenharmony_ci			continue;
648162306a36Sopenharmony_ci
648262306a36Sopenharmony_ci		r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
648362306a36Sopenharmony_ci
648462306a36Sopenharmony_ci		/*
648562306a36Sopenharmony_ci		 * The split succeeded or needs to be retried because the MMU
648662306a36Sopenharmony_ci		 * lock was dropped. Either way, restart the iterator to get it
648762306a36Sopenharmony_ci		 * back into a consistent state.
648862306a36Sopenharmony_ci		 */
648962306a36Sopenharmony_ci		if (!r || r == -EAGAIN)
649062306a36Sopenharmony_ci			goto restart;
649162306a36Sopenharmony_ci
649262306a36Sopenharmony_ci		/* The split failed and shouldn't be retried (e.g. -ENOMEM). */
649362306a36Sopenharmony_ci		break;
649462306a36Sopenharmony_ci	}
649562306a36Sopenharmony_ci
649662306a36Sopenharmony_ci	return false;
649762306a36Sopenharmony_ci}
649862306a36Sopenharmony_ci
649962306a36Sopenharmony_cistatic void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
650062306a36Sopenharmony_ci						const struct kvm_memory_slot *slot,
650162306a36Sopenharmony_ci						gfn_t start, gfn_t end,
650262306a36Sopenharmony_ci						int target_level)
650362306a36Sopenharmony_ci{
650462306a36Sopenharmony_ci	int level;
650562306a36Sopenharmony_ci
650662306a36Sopenharmony_ci	/*
650762306a36Sopenharmony_ci	 * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
650862306a36Sopenharmony_ci	 * down to the target level. This ensures pages are recursively split
650962306a36Sopenharmony_ci	 * all the way to the target level. There's no need to split pages
651062306a36Sopenharmony_ci	 * already at the target level.
651162306a36Sopenharmony_ci	 */
651262306a36Sopenharmony_ci	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
651362306a36Sopenharmony_ci		__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
651462306a36Sopenharmony_ci				  level, level, start, end - 1, true, false);
651562306a36Sopenharmony_ci}
651662306a36Sopenharmony_ci
651762306a36Sopenharmony_ci/* Must be called with the mmu_lock held in write-mode. */
651862306a36Sopenharmony_civoid kvm_mmu_try_split_huge_pages(struct kvm *kvm,
651962306a36Sopenharmony_ci				   const struct kvm_memory_slot *memslot,
652062306a36Sopenharmony_ci				   u64 start, u64 end,
652162306a36Sopenharmony_ci				   int target_level)
652262306a36Sopenharmony_ci{
652362306a36Sopenharmony_ci	if (!tdp_mmu_enabled)
652462306a36Sopenharmony_ci		return;
652562306a36Sopenharmony_ci
652662306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm))
652762306a36Sopenharmony_ci		kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
652862306a36Sopenharmony_ci
652962306a36Sopenharmony_ci	kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
653062306a36Sopenharmony_ci
653162306a36Sopenharmony_ci	/*
653262306a36Sopenharmony_ci	 * A TLB flush is unnecessary at this point for the same resons as in
653362306a36Sopenharmony_ci	 * kvm_mmu_slot_try_split_huge_pages().
653462306a36Sopenharmony_ci	 */
653562306a36Sopenharmony_ci}
653662306a36Sopenharmony_ci
653762306a36Sopenharmony_civoid kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
653862306a36Sopenharmony_ci					const struct kvm_memory_slot *memslot,
653962306a36Sopenharmony_ci					int target_level)
654062306a36Sopenharmony_ci{
654162306a36Sopenharmony_ci	u64 start = memslot->base_gfn;
654262306a36Sopenharmony_ci	u64 end = start + memslot->npages;
654362306a36Sopenharmony_ci
654462306a36Sopenharmony_ci	if (!tdp_mmu_enabled)
654562306a36Sopenharmony_ci		return;
654662306a36Sopenharmony_ci
654762306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm)) {
654862306a36Sopenharmony_ci		write_lock(&kvm->mmu_lock);
654962306a36Sopenharmony_ci		kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
655062306a36Sopenharmony_ci		write_unlock(&kvm->mmu_lock);
655162306a36Sopenharmony_ci	}
655262306a36Sopenharmony_ci
655362306a36Sopenharmony_ci	read_lock(&kvm->mmu_lock);
655462306a36Sopenharmony_ci	kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
655562306a36Sopenharmony_ci	read_unlock(&kvm->mmu_lock);
655662306a36Sopenharmony_ci
655762306a36Sopenharmony_ci	/*
655862306a36Sopenharmony_ci	 * No TLB flush is necessary here. KVM will flush TLBs after
655962306a36Sopenharmony_ci	 * write-protecting and/or clearing dirty on the newly split SPTEs to
656062306a36Sopenharmony_ci	 * ensure that guest writes are reflected in the dirty log before the
656162306a36Sopenharmony_ci	 * ioctl to enable dirty logging on this memslot completes. Since the
656262306a36Sopenharmony_ci	 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
656362306a36Sopenharmony_ci	 * safe for KVM to decide if a TLB flush is necessary based on the split
656462306a36Sopenharmony_ci	 * SPTEs.
656562306a36Sopenharmony_ci	 */
656662306a36Sopenharmony_ci}
656762306a36Sopenharmony_ci
656862306a36Sopenharmony_cistatic bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
656962306a36Sopenharmony_ci					 struct kvm_rmap_head *rmap_head,
657062306a36Sopenharmony_ci					 const struct kvm_memory_slot *slot)
657162306a36Sopenharmony_ci{
657262306a36Sopenharmony_ci	u64 *sptep;
657362306a36Sopenharmony_ci	struct rmap_iterator iter;
657462306a36Sopenharmony_ci	int need_tlb_flush = 0;
657562306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
657662306a36Sopenharmony_ci
657762306a36Sopenharmony_cirestart:
657862306a36Sopenharmony_ci	for_each_rmap_spte(rmap_head, &iter, sptep) {
657962306a36Sopenharmony_ci		sp = sptep_to_sp(sptep);
658062306a36Sopenharmony_ci
658162306a36Sopenharmony_ci		/*
658262306a36Sopenharmony_ci		 * We cannot do huge page mapping for indirect shadow pages,
658362306a36Sopenharmony_ci		 * which are found on the last rmap (level = 1) when not using
658462306a36Sopenharmony_ci		 * tdp; such shadow pages are synced with the page table in
658562306a36Sopenharmony_ci		 * the guest, and the guest page table is using 4K page size
658662306a36Sopenharmony_ci		 * mapping if the indirect sp has level = 1.
658762306a36Sopenharmony_ci		 */
658862306a36Sopenharmony_ci		if (sp->role.direct &&
658962306a36Sopenharmony_ci		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
659062306a36Sopenharmony_ci							       PG_LEVEL_NUM)) {
659162306a36Sopenharmony_ci			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
659262306a36Sopenharmony_ci
659362306a36Sopenharmony_ci			if (kvm_available_flush_remote_tlbs_range())
659462306a36Sopenharmony_ci				kvm_flush_remote_tlbs_sptep(kvm, sptep);
659562306a36Sopenharmony_ci			else
659662306a36Sopenharmony_ci				need_tlb_flush = 1;
659762306a36Sopenharmony_ci
659862306a36Sopenharmony_ci			goto restart;
659962306a36Sopenharmony_ci		}
660062306a36Sopenharmony_ci	}
660162306a36Sopenharmony_ci
660262306a36Sopenharmony_ci	return need_tlb_flush;
660362306a36Sopenharmony_ci}
660462306a36Sopenharmony_ci
660562306a36Sopenharmony_cistatic void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
660662306a36Sopenharmony_ci					   const struct kvm_memory_slot *slot)
660762306a36Sopenharmony_ci{
660862306a36Sopenharmony_ci	/*
660962306a36Sopenharmony_ci	 * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
661062306a36Sopenharmony_ci	 * pages that are already mapped at the maximum hugepage level.
661162306a36Sopenharmony_ci	 */
661262306a36Sopenharmony_ci	if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
661362306a36Sopenharmony_ci			    PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
661462306a36Sopenharmony_ci		kvm_flush_remote_tlbs_memslot(kvm, slot);
661562306a36Sopenharmony_ci}
661662306a36Sopenharmony_ci
661762306a36Sopenharmony_civoid kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
661862306a36Sopenharmony_ci				   const struct kvm_memory_slot *slot)
661962306a36Sopenharmony_ci{
662062306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm)) {
662162306a36Sopenharmony_ci		write_lock(&kvm->mmu_lock);
662262306a36Sopenharmony_ci		kvm_rmap_zap_collapsible_sptes(kvm, slot);
662362306a36Sopenharmony_ci		write_unlock(&kvm->mmu_lock);
662462306a36Sopenharmony_ci	}
662562306a36Sopenharmony_ci
662662306a36Sopenharmony_ci	if (tdp_mmu_enabled) {
662762306a36Sopenharmony_ci		read_lock(&kvm->mmu_lock);
662862306a36Sopenharmony_ci		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
662962306a36Sopenharmony_ci		read_unlock(&kvm->mmu_lock);
663062306a36Sopenharmony_ci	}
663162306a36Sopenharmony_ci}
663262306a36Sopenharmony_ci
663362306a36Sopenharmony_civoid kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
663462306a36Sopenharmony_ci				   const struct kvm_memory_slot *memslot)
663562306a36Sopenharmony_ci{
663662306a36Sopenharmony_ci	if (kvm_memslots_have_rmaps(kvm)) {
663762306a36Sopenharmony_ci		write_lock(&kvm->mmu_lock);
663862306a36Sopenharmony_ci		/*
663962306a36Sopenharmony_ci		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
664062306a36Sopenharmony_ci		 * support dirty logging at a 4k granularity.
664162306a36Sopenharmony_ci		 */
664262306a36Sopenharmony_ci		walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
664362306a36Sopenharmony_ci		write_unlock(&kvm->mmu_lock);
664462306a36Sopenharmony_ci	}
664562306a36Sopenharmony_ci
664662306a36Sopenharmony_ci	if (tdp_mmu_enabled) {
664762306a36Sopenharmony_ci		read_lock(&kvm->mmu_lock);
664862306a36Sopenharmony_ci		kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
664962306a36Sopenharmony_ci		read_unlock(&kvm->mmu_lock);
665062306a36Sopenharmony_ci	}
665162306a36Sopenharmony_ci
665262306a36Sopenharmony_ci	/*
665362306a36Sopenharmony_ci	 * The caller will flush the TLBs after this function returns.
665462306a36Sopenharmony_ci	 *
665562306a36Sopenharmony_ci	 * It's also safe to flush TLBs out of mmu lock here as currently this
665662306a36Sopenharmony_ci	 * function is only used for dirty logging, in which case flushing TLB
665762306a36Sopenharmony_ci	 * out of mmu lock also guarantees no dirty pages will be lost in
665862306a36Sopenharmony_ci	 * dirty_bitmap.
665962306a36Sopenharmony_ci	 */
666062306a36Sopenharmony_ci}
666162306a36Sopenharmony_ci
666262306a36Sopenharmony_cistatic void kvm_mmu_zap_all(struct kvm *kvm)
666362306a36Sopenharmony_ci{
666462306a36Sopenharmony_ci	struct kvm_mmu_page *sp, *node;
666562306a36Sopenharmony_ci	LIST_HEAD(invalid_list);
666662306a36Sopenharmony_ci	int ign;
666762306a36Sopenharmony_ci
666862306a36Sopenharmony_ci	write_lock(&kvm->mmu_lock);
666962306a36Sopenharmony_cirestart:
667062306a36Sopenharmony_ci	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
667162306a36Sopenharmony_ci		if (WARN_ON_ONCE(sp->role.invalid))
667262306a36Sopenharmony_ci			continue;
667362306a36Sopenharmony_ci		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
667462306a36Sopenharmony_ci			goto restart;
667562306a36Sopenharmony_ci		if (cond_resched_rwlock_write(&kvm->mmu_lock))
667662306a36Sopenharmony_ci			goto restart;
667762306a36Sopenharmony_ci	}
667862306a36Sopenharmony_ci
667962306a36Sopenharmony_ci	kvm_mmu_commit_zap_page(kvm, &invalid_list);
668062306a36Sopenharmony_ci
668162306a36Sopenharmony_ci	if (tdp_mmu_enabled)
668262306a36Sopenharmony_ci		kvm_tdp_mmu_zap_all(kvm);
668362306a36Sopenharmony_ci
668462306a36Sopenharmony_ci	write_unlock(&kvm->mmu_lock);
668562306a36Sopenharmony_ci}
668662306a36Sopenharmony_ci
668762306a36Sopenharmony_civoid kvm_arch_flush_shadow_all(struct kvm *kvm)
668862306a36Sopenharmony_ci{
668962306a36Sopenharmony_ci	kvm_mmu_zap_all(kvm);
669062306a36Sopenharmony_ci}
669162306a36Sopenharmony_ci
669262306a36Sopenharmony_civoid kvm_arch_flush_shadow_memslot(struct kvm *kvm,
669362306a36Sopenharmony_ci				   struct kvm_memory_slot *slot)
669462306a36Sopenharmony_ci{
669562306a36Sopenharmony_ci	kvm_mmu_zap_all_fast(kvm);
669662306a36Sopenharmony_ci}
669762306a36Sopenharmony_ci
669862306a36Sopenharmony_civoid kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
669962306a36Sopenharmony_ci{
670062306a36Sopenharmony_ci	WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
670162306a36Sopenharmony_ci
670262306a36Sopenharmony_ci	gen &= MMIO_SPTE_GEN_MASK;
670362306a36Sopenharmony_ci
670462306a36Sopenharmony_ci	/*
670562306a36Sopenharmony_ci	 * Generation numbers are incremented in multiples of the number of
670662306a36Sopenharmony_ci	 * address spaces in order to provide unique generations across all
670762306a36Sopenharmony_ci	 * address spaces.  Strip what is effectively the address space
670862306a36Sopenharmony_ci	 * modifier prior to checking for a wrap of the MMIO generation so
670962306a36Sopenharmony_ci	 * that a wrap in any address space is detected.
671062306a36Sopenharmony_ci	 */
671162306a36Sopenharmony_ci	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
671262306a36Sopenharmony_ci
671362306a36Sopenharmony_ci	/*
671462306a36Sopenharmony_ci	 * The very rare case: if the MMIO generation number has wrapped,
671562306a36Sopenharmony_ci	 * zap all shadow pages.
671662306a36Sopenharmony_ci	 */
671762306a36Sopenharmony_ci	if (unlikely(gen == 0)) {
671862306a36Sopenharmony_ci		kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
671962306a36Sopenharmony_ci		kvm_mmu_zap_all_fast(kvm);
672062306a36Sopenharmony_ci	}
672162306a36Sopenharmony_ci}
672262306a36Sopenharmony_ci
672362306a36Sopenharmony_cistatic unsigned long mmu_shrink_scan(struct shrinker *shrink,
672462306a36Sopenharmony_ci				     struct shrink_control *sc)
672562306a36Sopenharmony_ci{
672662306a36Sopenharmony_ci	struct kvm *kvm;
672762306a36Sopenharmony_ci	int nr_to_scan = sc->nr_to_scan;
672862306a36Sopenharmony_ci	unsigned long freed = 0;
672962306a36Sopenharmony_ci
673062306a36Sopenharmony_ci	mutex_lock(&kvm_lock);
673162306a36Sopenharmony_ci
673262306a36Sopenharmony_ci	list_for_each_entry(kvm, &vm_list, vm_list) {
673362306a36Sopenharmony_ci		int idx;
673462306a36Sopenharmony_ci		LIST_HEAD(invalid_list);
673562306a36Sopenharmony_ci
673662306a36Sopenharmony_ci		/*
673762306a36Sopenharmony_ci		 * Never scan more than sc->nr_to_scan VM instances.
673862306a36Sopenharmony_ci		 * Will not hit this condition practically since we do not try
673962306a36Sopenharmony_ci		 * to shrink more than one VM and it is very unlikely to see
674062306a36Sopenharmony_ci		 * !n_used_mmu_pages so many times.
674162306a36Sopenharmony_ci		 */
674262306a36Sopenharmony_ci		if (!nr_to_scan--)
674362306a36Sopenharmony_ci			break;
674462306a36Sopenharmony_ci		/*
674562306a36Sopenharmony_ci		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
674662306a36Sopenharmony_ci		 * here. We may skip a VM instance errorneosly, but we do not
674762306a36Sopenharmony_ci		 * want to shrink a VM that only started to populate its MMU
674862306a36Sopenharmony_ci		 * anyway.
674962306a36Sopenharmony_ci		 */
675062306a36Sopenharmony_ci		if (!kvm->arch.n_used_mmu_pages &&
675162306a36Sopenharmony_ci		    !kvm_has_zapped_obsolete_pages(kvm))
675262306a36Sopenharmony_ci			continue;
675362306a36Sopenharmony_ci
675462306a36Sopenharmony_ci		idx = srcu_read_lock(&kvm->srcu);
675562306a36Sopenharmony_ci		write_lock(&kvm->mmu_lock);
675662306a36Sopenharmony_ci
675762306a36Sopenharmony_ci		if (kvm_has_zapped_obsolete_pages(kvm)) {
675862306a36Sopenharmony_ci			kvm_mmu_commit_zap_page(kvm,
675962306a36Sopenharmony_ci			      &kvm->arch.zapped_obsolete_pages);
676062306a36Sopenharmony_ci			goto unlock;
676162306a36Sopenharmony_ci		}
676262306a36Sopenharmony_ci
676362306a36Sopenharmony_ci		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
676462306a36Sopenharmony_ci
676562306a36Sopenharmony_ciunlock:
676662306a36Sopenharmony_ci		write_unlock(&kvm->mmu_lock);
676762306a36Sopenharmony_ci		srcu_read_unlock(&kvm->srcu, idx);
676862306a36Sopenharmony_ci
676962306a36Sopenharmony_ci		/*
677062306a36Sopenharmony_ci		 * unfair on small ones
677162306a36Sopenharmony_ci		 * per-vm shrinkers cry out
677262306a36Sopenharmony_ci		 * sadness comes quickly
677362306a36Sopenharmony_ci		 */
677462306a36Sopenharmony_ci		list_move_tail(&kvm->vm_list, &vm_list);
677562306a36Sopenharmony_ci		break;
677662306a36Sopenharmony_ci	}
677762306a36Sopenharmony_ci
677862306a36Sopenharmony_ci	mutex_unlock(&kvm_lock);
677962306a36Sopenharmony_ci	return freed;
678062306a36Sopenharmony_ci}
678162306a36Sopenharmony_ci
678262306a36Sopenharmony_cistatic unsigned long mmu_shrink_count(struct shrinker *shrink,
678362306a36Sopenharmony_ci				      struct shrink_control *sc)
678462306a36Sopenharmony_ci{
678562306a36Sopenharmony_ci	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
678662306a36Sopenharmony_ci}
678762306a36Sopenharmony_ci
678862306a36Sopenharmony_cistatic struct shrinker mmu_shrinker = {
678962306a36Sopenharmony_ci	.count_objects = mmu_shrink_count,
679062306a36Sopenharmony_ci	.scan_objects = mmu_shrink_scan,
679162306a36Sopenharmony_ci	.seeks = DEFAULT_SEEKS * 10,
679262306a36Sopenharmony_ci};
679362306a36Sopenharmony_ci
679462306a36Sopenharmony_cistatic void mmu_destroy_caches(void)
679562306a36Sopenharmony_ci{
679662306a36Sopenharmony_ci	kmem_cache_destroy(pte_list_desc_cache);
679762306a36Sopenharmony_ci	kmem_cache_destroy(mmu_page_header_cache);
679862306a36Sopenharmony_ci}
679962306a36Sopenharmony_ci
680062306a36Sopenharmony_cistatic int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
680162306a36Sopenharmony_ci{
680262306a36Sopenharmony_ci	if (nx_hugepage_mitigation_hard_disabled)
680362306a36Sopenharmony_ci		return sysfs_emit(buffer, "never\n");
680462306a36Sopenharmony_ci
680562306a36Sopenharmony_ci	return param_get_bool(buffer, kp);
680662306a36Sopenharmony_ci}
680762306a36Sopenharmony_ci
680862306a36Sopenharmony_cistatic bool get_nx_auto_mode(void)
680962306a36Sopenharmony_ci{
681062306a36Sopenharmony_ci	/* Return true when CPU has the bug, and mitigations are ON */
681162306a36Sopenharmony_ci	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
681262306a36Sopenharmony_ci}
681362306a36Sopenharmony_ci
681462306a36Sopenharmony_cistatic void __set_nx_huge_pages(bool val)
681562306a36Sopenharmony_ci{
681662306a36Sopenharmony_ci	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
681762306a36Sopenharmony_ci}
681862306a36Sopenharmony_ci
681962306a36Sopenharmony_cistatic int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
682062306a36Sopenharmony_ci{
682162306a36Sopenharmony_ci	bool old_val = nx_huge_pages;
682262306a36Sopenharmony_ci	bool new_val;
682362306a36Sopenharmony_ci
682462306a36Sopenharmony_ci	if (nx_hugepage_mitigation_hard_disabled)
682562306a36Sopenharmony_ci		return -EPERM;
682662306a36Sopenharmony_ci
682762306a36Sopenharmony_ci	/* In "auto" mode deploy workaround only if CPU has the bug. */
682862306a36Sopenharmony_ci	if (sysfs_streq(val, "off")) {
682962306a36Sopenharmony_ci		new_val = 0;
683062306a36Sopenharmony_ci	} else if (sysfs_streq(val, "force")) {
683162306a36Sopenharmony_ci		new_val = 1;
683262306a36Sopenharmony_ci	} else if (sysfs_streq(val, "auto")) {
683362306a36Sopenharmony_ci		new_val = get_nx_auto_mode();
683462306a36Sopenharmony_ci	} else if (sysfs_streq(val, "never")) {
683562306a36Sopenharmony_ci		new_val = 0;
683662306a36Sopenharmony_ci
683762306a36Sopenharmony_ci		mutex_lock(&kvm_lock);
683862306a36Sopenharmony_ci		if (!list_empty(&vm_list)) {
683962306a36Sopenharmony_ci			mutex_unlock(&kvm_lock);
684062306a36Sopenharmony_ci			return -EBUSY;
684162306a36Sopenharmony_ci		}
684262306a36Sopenharmony_ci		nx_hugepage_mitigation_hard_disabled = true;
684362306a36Sopenharmony_ci		mutex_unlock(&kvm_lock);
684462306a36Sopenharmony_ci	} else if (kstrtobool(val, &new_val) < 0) {
684562306a36Sopenharmony_ci		return -EINVAL;
684662306a36Sopenharmony_ci	}
684762306a36Sopenharmony_ci
684862306a36Sopenharmony_ci	__set_nx_huge_pages(new_val);
684962306a36Sopenharmony_ci
685062306a36Sopenharmony_ci	if (new_val != old_val) {
685162306a36Sopenharmony_ci		struct kvm *kvm;
685262306a36Sopenharmony_ci
685362306a36Sopenharmony_ci		mutex_lock(&kvm_lock);
685462306a36Sopenharmony_ci
685562306a36Sopenharmony_ci		list_for_each_entry(kvm, &vm_list, vm_list) {
685662306a36Sopenharmony_ci			mutex_lock(&kvm->slots_lock);
685762306a36Sopenharmony_ci			kvm_mmu_zap_all_fast(kvm);
685862306a36Sopenharmony_ci			mutex_unlock(&kvm->slots_lock);
685962306a36Sopenharmony_ci
686062306a36Sopenharmony_ci			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
686162306a36Sopenharmony_ci		}
686262306a36Sopenharmony_ci		mutex_unlock(&kvm_lock);
686362306a36Sopenharmony_ci	}
686462306a36Sopenharmony_ci
686562306a36Sopenharmony_ci	return 0;
686662306a36Sopenharmony_ci}
686762306a36Sopenharmony_ci
686862306a36Sopenharmony_ci/*
686962306a36Sopenharmony_ci * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
687062306a36Sopenharmony_ci * its default value of -1 is technically undefined behavior for a boolean.
687162306a36Sopenharmony_ci * Forward the module init call to SPTE code so that it too can handle module
687262306a36Sopenharmony_ci * params that need to be resolved/snapshot.
687362306a36Sopenharmony_ci */
687462306a36Sopenharmony_civoid __init kvm_mmu_x86_module_init(void)
687562306a36Sopenharmony_ci{
687662306a36Sopenharmony_ci	if (nx_huge_pages == -1)
687762306a36Sopenharmony_ci		__set_nx_huge_pages(get_nx_auto_mode());
687862306a36Sopenharmony_ci
687962306a36Sopenharmony_ci	/*
688062306a36Sopenharmony_ci	 * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
688162306a36Sopenharmony_ci	 * TDP MMU is actually enabled is determined in kvm_configure_mmu()
688262306a36Sopenharmony_ci	 * when the vendor module is loaded.
688362306a36Sopenharmony_ci	 */
688462306a36Sopenharmony_ci	tdp_mmu_allowed = tdp_mmu_enabled;
688562306a36Sopenharmony_ci
688662306a36Sopenharmony_ci	kvm_mmu_spte_module_init();
688762306a36Sopenharmony_ci}
688862306a36Sopenharmony_ci
688962306a36Sopenharmony_ci/*
689062306a36Sopenharmony_ci * The bulk of the MMU initialization is deferred until the vendor module is
689162306a36Sopenharmony_ci * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
689262306a36Sopenharmony_ci * to be reset when a potentially different vendor module is loaded.
689362306a36Sopenharmony_ci */
689462306a36Sopenharmony_ciint kvm_mmu_vendor_module_init(void)
689562306a36Sopenharmony_ci{
689662306a36Sopenharmony_ci	int ret = -ENOMEM;
689762306a36Sopenharmony_ci
689862306a36Sopenharmony_ci	/*
689962306a36Sopenharmony_ci	 * MMU roles use union aliasing which is, generally speaking, an
690062306a36Sopenharmony_ci	 * undefined behavior. However, we supposedly know how compilers behave
690162306a36Sopenharmony_ci	 * and the current status quo is unlikely to change. Guardians below are
690262306a36Sopenharmony_ci	 * supposed to let us know if the assumption becomes false.
690362306a36Sopenharmony_ci	 */
690462306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
690562306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
690662306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
690762306a36Sopenharmony_ci
690862306a36Sopenharmony_ci	kvm_mmu_reset_all_pte_masks();
690962306a36Sopenharmony_ci
691062306a36Sopenharmony_ci	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
691162306a36Sopenharmony_ci					    sizeof(struct pte_list_desc),
691262306a36Sopenharmony_ci					    0, SLAB_ACCOUNT, NULL);
691362306a36Sopenharmony_ci	if (!pte_list_desc_cache)
691462306a36Sopenharmony_ci		goto out;
691562306a36Sopenharmony_ci
691662306a36Sopenharmony_ci	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
691762306a36Sopenharmony_ci						  sizeof(struct kvm_mmu_page),
691862306a36Sopenharmony_ci						  0, SLAB_ACCOUNT, NULL);
691962306a36Sopenharmony_ci	if (!mmu_page_header_cache)
692062306a36Sopenharmony_ci		goto out;
692162306a36Sopenharmony_ci
692262306a36Sopenharmony_ci	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
692362306a36Sopenharmony_ci		goto out;
692462306a36Sopenharmony_ci
692562306a36Sopenharmony_ci	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
692662306a36Sopenharmony_ci	if (ret)
692762306a36Sopenharmony_ci		goto out_shrinker;
692862306a36Sopenharmony_ci
692962306a36Sopenharmony_ci	return 0;
693062306a36Sopenharmony_ci
693162306a36Sopenharmony_ciout_shrinker:
693262306a36Sopenharmony_ci	percpu_counter_destroy(&kvm_total_used_mmu_pages);
693362306a36Sopenharmony_ciout:
693462306a36Sopenharmony_ci	mmu_destroy_caches();
693562306a36Sopenharmony_ci	return ret;
693662306a36Sopenharmony_ci}
693762306a36Sopenharmony_ci
693862306a36Sopenharmony_civoid kvm_mmu_destroy(struct kvm_vcpu *vcpu)
693962306a36Sopenharmony_ci{
694062306a36Sopenharmony_ci	kvm_mmu_unload(vcpu);
694162306a36Sopenharmony_ci	free_mmu_pages(&vcpu->arch.root_mmu);
694262306a36Sopenharmony_ci	free_mmu_pages(&vcpu->arch.guest_mmu);
694362306a36Sopenharmony_ci	mmu_free_memory_caches(vcpu);
694462306a36Sopenharmony_ci}
694562306a36Sopenharmony_ci
694662306a36Sopenharmony_civoid kvm_mmu_vendor_module_exit(void)
694762306a36Sopenharmony_ci{
694862306a36Sopenharmony_ci	mmu_destroy_caches();
694962306a36Sopenharmony_ci	percpu_counter_destroy(&kvm_total_used_mmu_pages);
695062306a36Sopenharmony_ci	unregister_shrinker(&mmu_shrinker);
695162306a36Sopenharmony_ci}
695262306a36Sopenharmony_ci
695362306a36Sopenharmony_ci/*
695462306a36Sopenharmony_ci * Calculate the effective recovery period, accounting for '0' meaning "let KVM
695562306a36Sopenharmony_ci * select a halving time of 1 hour".  Returns true if recovery is enabled.
695662306a36Sopenharmony_ci */
695762306a36Sopenharmony_cistatic bool calc_nx_huge_pages_recovery_period(uint *period)
695862306a36Sopenharmony_ci{
695962306a36Sopenharmony_ci	/*
696062306a36Sopenharmony_ci	 * Use READ_ONCE to get the params, this may be called outside of the
696162306a36Sopenharmony_ci	 * param setters, e.g. by the kthread to compute its next timeout.
696262306a36Sopenharmony_ci	 */
696362306a36Sopenharmony_ci	bool enabled = READ_ONCE(nx_huge_pages);
696462306a36Sopenharmony_ci	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
696562306a36Sopenharmony_ci
696662306a36Sopenharmony_ci	if (!enabled || !ratio)
696762306a36Sopenharmony_ci		return false;
696862306a36Sopenharmony_ci
696962306a36Sopenharmony_ci	*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
697062306a36Sopenharmony_ci	if (!*period) {
697162306a36Sopenharmony_ci		/* Make sure the period is not less than one second.  */
697262306a36Sopenharmony_ci		ratio = min(ratio, 3600u);
697362306a36Sopenharmony_ci		*period = 60 * 60 * 1000 / ratio;
697462306a36Sopenharmony_ci	}
697562306a36Sopenharmony_ci	return true;
697662306a36Sopenharmony_ci}
697762306a36Sopenharmony_ci
697862306a36Sopenharmony_cistatic int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
697962306a36Sopenharmony_ci{
698062306a36Sopenharmony_ci	bool was_recovery_enabled, is_recovery_enabled;
698162306a36Sopenharmony_ci	uint old_period, new_period;
698262306a36Sopenharmony_ci	int err;
698362306a36Sopenharmony_ci
698462306a36Sopenharmony_ci	if (nx_hugepage_mitigation_hard_disabled)
698562306a36Sopenharmony_ci		return -EPERM;
698662306a36Sopenharmony_ci
698762306a36Sopenharmony_ci	was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
698862306a36Sopenharmony_ci
698962306a36Sopenharmony_ci	err = param_set_uint(val, kp);
699062306a36Sopenharmony_ci	if (err)
699162306a36Sopenharmony_ci		return err;
699262306a36Sopenharmony_ci
699362306a36Sopenharmony_ci	is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
699462306a36Sopenharmony_ci
699562306a36Sopenharmony_ci	if (is_recovery_enabled &&
699662306a36Sopenharmony_ci	    (!was_recovery_enabled || old_period > new_period)) {
699762306a36Sopenharmony_ci		struct kvm *kvm;
699862306a36Sopenharmony_ci
699962306a36Sopenharmony_ci		mutex_lock(&kvm_lock);
700062306a36Sopenharmony_ci
700162306a36Sopenharmony_ci		list_for_each_entry(kvm, &vm_list, vm_list)
700262306a36Sopenharmony_ci			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
700362306a36Sopenharmony_ci
700462306a36Sopenharmony_ci		mutex_unlock(&kvm_lock);
700562306a36Sopenharmony_ci	}
700662306a36Sopenharmony_ci
700762306a36Sopenharmony_ci	return err;
700862306a36Sopenharmony_ci}
700962306a36Sopenharmony_ci
701062306a36Sopenharmony_cistatic void kvm_recover_nx_huge_pages(struct kvm *kvm)
701162306a36Sopenharmony_ci{
701262306a36Sopenharmony_ci	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
701362306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
701462306a36Sopenharmony_ci	int rcu_idx;
701562306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
701662306a36Sopenharmony_ci	unsigned int ratio;
701762306a36Sopenharmony_ci	LIST_HEAD(invalid_list);
701862306a36Sopenharmony_ci	bool flush = false;
701962306a36Sopenharmony_ci	ulong to_zap;
702062306a36Sopenharmony_ci
702162306a36Sopenharmony_ci	rcu_idx = srcu_read_lock(&kvm->srcu);
702262306a36Sopenharmony_ci	write_lock(&kvm->mmu_lock);
702362306a36Sopenharmony_ci
702462306a36Sopenharmony_ci	/*
702562306a36Sopenharmony_ci	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
702662306a36Sopenharmony_ci	 * be done under RCU protection, because the pages are freed via RCU
702762306a36Sopenharmony_ci	 * callback.
702862306a36Sopenharmony_ci	 */
702962306a36Sopenharmony_ci	rcu_read_lock();
703062306a36Sopenharmony_ci
703162306a36Sopenharmony_ci	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
703262306a36Sopenharmony_ci	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
703362306a36Sopenharmony_ci	for ( ; to_zap; --to_zap) {
703462306a36Sopenharmony_ci		if (list_empty(&kvm->arch.possible_nx_huge_pages))
703562306a36Sopenharmony_ci			break;
703662306a36Sopenharmony_ci
703762306a36Sopenharmony_ci		/*
703862306a36Sopenharmony_ci		 * We use a separate list instead of just using active_mmu_pages
703962306a36Sopenharmony_ci		 * because the number of shadow pages that be replaced with an
704062306a36Sopenharmony_ci		 * NX huge page is expected to be relatively small compared to
704162306a36Sopenharmony_ci		 * the total number of shadow pages.  And because the TDP MMU
704262306a36Sopenharmony_ci		 * doesn't use active_mmu_pages.
704362306a36Sopenharmony_ci		 */
704462306a36Sopenharmony_ci		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
704562306a36Sopenharmony_ci				      struct kvm_mmu_page,
704662306a36Sopenharmony_ci				      possible_nx_huge_page_link);
704762306a36Sopenharmony_ci		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
704862306a36Sopenharmony_ci		WARN_ON_ONCE(!sp->role.direct);
704962306a36Sopenharmony_ci
705062306a36Sopenharmony_ci		/*
705162306a36Sopenharmony_ci		 * Unaccount and do not attempt to recover any NX Huge Pages
705262306a36Sopenharmony_ci		 * that are being dirty tracked, as they would just be faulted
705362306a36Sopenharmony_ci		 * back in as 4KiB pages. The NX Huge Pages in this slot will be
705462306a36Sopenharmony_ci		 * recovered, along with all the other huge pages in the slot,
705562306a36Sopenharmony_ci		 * when dirty logging is disabled.
705662306a36Sopenharmony_ci		 *
705762306a36Sopenharmony_ci		 * Since gfn_to_memslot() is relatively expensive, it helps to
705862306a36Sopenharmony_ci		 * skip it if it the test cannot possibly return true.  On the
705962306a36Sopenharmony_ci		 * other hand, if any memslot has logging enabled, chances are
706062306a36Sopenharmony_ci		 * good that all of them do, in which case unaccount_nx_huge_page()
706162306a36Sopenharmony_ci		 * is much cheaper than zapping the page.
706262306a36Sopenharmony_ci		 *
706362306a36Sopenharmony_ci		 * If a memslot update is in progress, reading an incorrect value
706462306a36Sopenharmony_ci		 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
706562306a36Sopenharmony_ci		 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
706662306a36Sopenharmony_ci		 * it is becoming nonzero, the page will be zapped unnecessarily.
706762306a36Sopenharmony_ci		 * Either way, this only affects efficiency in racy situations,
706862306a36Sopenharmony_ci		 * and not correctness.
706962306a36Sopenharmony_ci		 */
707062306a36Sopenharmony_ci		slot = NULL;
707162306a36Sopenharmony_ci		if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
707262306a36Sopenharmony_ci			struct kvm_memslots *slots;
707362306a36Sopenharmony_ci
707462306a36Sopenharmony_ci			slots = kvm_memslots_for_spte_role(kvm, sp->role);
707562306a36Sopenharmony_ci			slot = __gfn_to_memslot(slots, sp->gfn);
707662306a36Sopenharmony_ci			WARN_ON_ONCE(!slot);
707762306a36Sopenharmony_ci		}
707862306a36Sopenharmony_ci
707962306a36Sopenharmony_ci		if (slot && kvm_slot_dirty_track_enabled(slot))
708062306a36Sopenharmony_ci			unaccount_nx_huge_page(kvm, sp);
708162306a36Sopenharmony_ci		else if (is_tdp_mmu_page(sp))
708262306a36Sopenharmony_ci			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
708362306a36Sopenharmony_ci		else
708462306a36Sopenharmony_ci			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
708562306a36Sopenharmony_ci		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
708662306a36Sopenharmony_ci
708762306a36Sopenharmony_ci		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
708862306a36Sopenharmony_ci			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
708962306a36Sopenharmony_ci			rcu_read_unlock();
709062306a36Sopenharmony_ci
709162306a36Sopenharmony_ci			cond_resched_rwlock_write(&kvm->mmu_lock);
709262306a36Sopenharmony_ci			flush = false;
709362306a36Sopenharmony_ci
709462306a36Sopenharmony_ci			rcu_read_lock();
709562306a36Sopenharmony_ci		}
709662306a36Sopenharmony_ci	}
709762306a36Sopenharmony_ci	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
709862306a36Sopenharmony_ci
709962306a36Sopenharmony_ci	rcu_read_unlock();
710062306a36Sopenharmony_ci
710162306a36Sopenharmony_ci	write_unlock(&kvm->mmu_lock);
710262306a36Sopenharmony_ci	srcu_read_unlock(&kvm->srcu, rcu_idx);
710362306a36Sopenharmony_ci}
710462306a36Sopenharmony_ci
710562306a36Sopenharmony_cistatic long get_nx_huge_page_recovery_timeout(u64 start_time)
710662306a36Sopenharmony_ci{
710762306a36Sopenharmony_ci	bool enabled;
710862306a36Sopenharmony_ci	uint period;
710962306a36Sopenharmony_ci
711062306a36Sopenharmony_ci	enabled = calc_nx_huge_pages_recovery_period(&period);
711162306a36Sopenharmony_ci
711262306a36Sopenharmony_ci	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
711362306a36Sopenharmony_ci		       : MAX_SCHEDULE_TIMEOUT;
711462306a36Sopenharmony_ci}
711562306a36Sopenharmony_ci
711662306a36Sopenharmony_cistatic int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
711762306a36Sopenharmony_ci{
711862306a36Sopenharmony_ci	u64 start_time;
711962306a36Sopenharmony_ci	long remaining_time;
712062306a36Sopenharmony_ci
712162306a36Sopenharmony_ci	while (true) {
712262306a36Sopenharmony_ci		start_time = get_jiffies_64();
712362306a36Sopenharmony_ci		remaining_time = get_nx_huge_page_recovery_timeout(start_time);
712462306a36Sopenharmony_ci
712562306a36Sopenharmony_ci		set_current_state(TASK_INTERRUPTIBLE);
712662306a36Sopenharmony_ci		while (!kthread_should_stop() && remaining_time > 0) {
712762306a36Sopenharmony_ci			schedule_timeout(remaining_time);
712862306a36Sopenharmony_ci			remaining_time = get_nx_huge_page_recovery_timeout(start_time);
712962306a36Sopenharmony_ci			set_current_state(TASK_INTERRUPTIBLE);
713062306a36Sopenharmony_ci		}
713162306a36Sopenharmony_ci
713262306a36Sopenharmony_ci		set_current_state(TASK_RUNNING);
713362306a36Sopenharmony_ci
713462306a36Sopenharmony_ci		if (kthread_should_stop())
713562306a36Sopenharmony_ci			return 0;
713662306a36Sopenharmony_ci
713762306a36Sopenharmony_ci		kvm_recover_nx_huge_pages(kvm);
713862306a36Sopenharmony_ci	}
713962306a36Sopenharmony_ci}
714062306a36Sopenharmony_ci
714162306a36Sopenharmony_ciint kvm_mmu_post_init_vm(struct kvm *kvm)
714262306a36Sopenharmony_ci{
714362306a36Sopenharmony_ci	int err;
714462306a36Sopenharmony_ci
714562306a36Sopenharmony_ci	if (nx_hugepage_mitigation_hard_disabled)
714662306a36Sopenharmony_ci		return 0;
714762306a36Sopenharmony_ci
714862306a36Sopenharmony_ci	err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
714962306a36Sopenharmony_ci					  "kvm-nx-lpage-recovery",
715062306a36Sopenharmony_ci					  &kvm->arch.nx_huge_page_recovery_thread);
715162306a36Sopenharmony_ci	if (!err)
715262306a36Sopenharmony_ci		kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
715362306a36Sopenharmony_ci
715462306a36Sopenharmony_ci	return err;
715562306a36Sopenharmony_ci}
715662306a36Sopenharmony_ci
715762306a36Sopenharmony_civoid kvm_mmu_pre_destroy_vm(struct kvm *kvm)
715862306a36Sopenharmony_ci{
715962306a36Sopenharmony_ci	if (kvm->arch.nx_huge_page_recovery_thread)
716062306a36Sopenharmony_ci		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
716162306a36Sopenharmony_ci}
7162