/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

/*
 * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
 * as well as guest EPT tables, so the code in this file is compiled thrice,
 * once per guest PTE type.  The per-type defines are #undef'd at the end.
 */
2362306a36Sopenharmony_ci
#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	/* 512 entries per table => 9 address bits consumed per level. */
	#define PT_LEVEL_BITS 9
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	/* 64-bit page tables always have architectural accessed/dirty bits. */
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
	#else
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	/* 1024 entries per table => 10 address bits consumed per level. */
	#define PT_LEVEL_BITS 10
	#define PT_MAX_FULL_LEVELS 2
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true

	/*
	 * PSE-36: bits 13..16 of a 4MB page directory entry hold physical
	 * address bits 32..35, extending reach beyond 4GB.
	 */
	#define PT32_DIR_PSE36_SIZE 4
	#define PT32_DIR_PSE36_SHIFT 13
	#define PT32_DIR_PSE36_MASK \
		(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#elif PTTYPE == PTTYPE_EPT
	#define pt_element_t u64
	#define guest_walker guest_walkerEPT
	#define FNAME(name) ept_##name
	#define PT_LEVEL_BITS 9
	/* EPT accessed/dirty bits live at fixed bit positions 8 and 9. */
	#define PT_GUEST_DIRTY_SHIFT 9
	#define PT_GUEST_ACCESSED_SHIFT 8
	/* EPT A/D bits are an optional feature and may be disabled. */
	#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
	#error Invalid PTTYPE value
#endif
6362306a36Sopenharmony_ci
/* Common logic, but per-type values.  These also need to be undefined. */
/* Physical-address bits of a gpte: bits 12..51. */
#define PT_BASE_ADDR_MASK	((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
#define PT_LVL_ADDR_MASK(lvl)	__PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_LVL_OFFSET_MASK(lvl)	__PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_INDEX(addr, lvl)	__PT_INDEX(addr, lvl, PT_LEVEL_BITS)

/* Single-bit masks derived from the per-type A/D shifts defined above. */
#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)

/* Per-type name for the gpte -> gfn conversion helpers defined below. */
#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)
7562306a36Sopenharmony_ci
/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	/* Level at which the walk terminated (PG_LEVEL_4K for a 4K leaf). */
	int level;
	/* Level at which the walk started, i.e. the root of the walk. */
	unsigned max_level;
	/* gfn of the guest page table visited at each level. */
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	/* Snapshot of the gpte read at each level. */
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	/* gptes surrounding the faulting one, filled by FNAME(gpte_changed). */
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	/* gpa of the gpte at each level. */
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	/* Host userspace pointer to the gpte at each level. */
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	/* Whether the host mapping allows writing back A/D bits per level. */
	bool pte_writable[PT_MAX_FULL_LEVELS];
	/* ACC_* flags accumulated down to (and including) each level. */
	unsigned int pt_access[PT_MAX_FULL_LEVELS];
	/* ACC_* flags for the final translation. */
	unsigned int pte_access;
	/* Final translated guest frame number. */
	gfn_t gfn;
	/* Fault information, valid only when the walk fails. */
	struct x86_exception fault;
};
9462306a36Sopenharmony_ci
#if PTTYPE == 32
/*
 * Extract the high physical-address bits (32..35) that a PSE-36 4MB
 * page directory entry stores in gpte bits 13..16, returned as a gfn
 * delta to add to the low part of the frame number.
 */
static inline gfn_t pse36_gfn_delta(u32 gpte)
{
	gfn_t hi_bits = gpte & PT32_DIR_PSE36_MASK;

	return hi_bits << (32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT);
}
#endif
10362306a36Sopenharmony_ci
10462306a36Sopenharmony_cistatic gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
10562306a36Sopenharmony_ci{
10662306a36Sopenharmony_ci	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
10762306a36Sopenharmony_ci}
10862306a36Sopenharmony_ci
/*
 * If the gpte's dirty bit is clear, strip ACC_WRITE_MASK from @access so
 * that the first guest write faults, giving KVM the chance to set the
 * dirty bit before making the spte writable.
 */
static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
					     unsigned gpte)
{
	unsigned mask;

	/* dirty bit is not supported, so no need to track it */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return;

	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	mask = (unsigned)~ACC_WRITE_MASK;
	/* Allow write access to dirty gptes */
	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
		PT_WRITABLE_MASK;
	*access &= mask;
}
12662306a36Sopenharmony_ci
/*
 * An EPT entry is "present" if any of its low three permission bits
 * (bits 2:0) is set; other formats have an architectural present bit.
 */
static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
	return pte & PT_PRESENT_MASK;
#else
	return pte & 7;
#endif
}
13562306a36Sopenharmony_ci
/*
 * Check an EPT gpte for an illegal memtype or XWR combination; other
 * paging formats have no such encodings, so this is always false there.
 */
static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
{
#if PTTYPE != PTTYPE_EPT
	return false;
#else
	return __is_bad_mt_xwr(rsvd_check, gpte);
#endif
}
14462306a36Sopenharmony_ci
14562306a36Sopenharmony_cistatic bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
14662306a36Sopenharmony_ci{
14762306a36Sopenharmony_ci	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
14862306a36Sopenharmony_ci	       FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
14962306a36Sopenharmony_ci}
15062306a36Sopenharmony_ci
/*
 * Returns true, and drops the existing spte, if @gpte is not suitable
 * for prefetching: not present, not accessed (when A/D bits are in use),
 * or malformed.  Returns false if the gpte may be prefetched.
 */
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *spte,
				  u64 gpte)
{
	if (!FNAME(is_present_gpte)(gpte))
		goto no_present;

	/* Prefetch only accessed entries (unless A/D bits are disabled). */
	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
	    !(gpte & PT_GUEST_ACCESSED_MASK))
		goto no_present;

	if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}
17262306a36Sopenharmony_ci
/*
 * For PTTYPE_EPT, a page table can be executable but not readable
 * on supported processors. Therefore, set_spte does not automatically
 * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
 * to signify readability since it isn't used in the EPT case
 */
static inline unsigned FNAME(gpte_access)(u64 gpte)
{
	unsigned access;
#if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
	/* W, U and P map 1:1 onto ACC_* flags; exec is derived from NX. */
	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK.  */
	access ^= (gpte >> PT64_NX_SHIFT);
#endif

	return access;
}
19662306a36Sopenharmony_ci
/*
 * Re-walk the gptes recorded in @walker and set their accessed bits
 * (and, for a write fault, the dirty bit of the leaf gpte) via cmpxchg
 * on the host mapping of each guest page table.
 *
 * Returns 0 on success or no-op, a negative error code on failure, and
 * a positive value if a gpte changed under us (the caller re-walks).
 */
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     gpa_t addr, int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	/* dirty/accessed bits are not supported, so no need to update them */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return 0;

	/* Walk from the root down to the level where the walk ended. */
	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_GUEST_ACCESSED_MASK;
		}
		/* Only the leaf gpte gets the dirty bit, and only on a write. */
		if (level == walker->level && write_fault &&
				!(pte & PT_GUEST_DIRTY_MASK)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
			if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
				return -EINVAL;
#endif
			pte |= PT_GUEST_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		/*
		 * If the slot is read-only, simply do not process the accessed
		 * and dirty bits.  This is the correct thing to do if the slot
		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
		 * are only supported if the accessed and dirty bits are already
		 * set in the ROM (so that MMIO writes are never needed).
		 *
		 * Note that NPT does not allow this at all and faults, since
		 * it always wants nested page table entries for the guest
		 * page tables to be writable.  And EPT works but will simply
		 * overwrite the read-only memory to set the accessed and dirty
		 * bits.
		 */
		if (unlikely(!walker->pte_writable[level - 1]))
			continue;

		/* Atomically update the gpte; bail out if it changed under us. */
		ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
		if (ret)
			return ret;

		kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}
25862306a36Sopenharmony_ci
/*
 * Extract the protection-key index from @gpte.  Protection keys only
 * exist in 64-bit page tables; all other formats return 0.
 */
static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned pkeys = 0;
#if PTTYPE == 64
	pte_t pte = {.pte = gpte};

	pkeys = pte_flags_pkey(pte_flags(pte));
#endif
	return pkeys;
}
26962306a36Sopenharmony_ci
/*
 * Returns true if @gpte terminates the walk at @level, i.e. it is either
 * a 4K leaf entry or a valid huge page (PAGE_SIZE bit set at a level
 * where that bit is meaningful).
 */
static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
				       unsigned int level, unsigned int gpte)
{
	/*
	 * For EPT and PAE paging (both variants), bit 7 is either reserved at
	 * all level or indicates a huge page (ignoring CR3/EPTP).  In either
	 * case, bit 7 being set terminates the walk.
	 */
#if PTTYPE == 32
	/*
	 * 32-bit paging requires special handling because bit 7 is ignored if
	 * CR4.PSE=0, not reserved.  Clear bit 7 in the gpte if the level is
	 * greater than the last level for which bit 7 is the PAGE_SIZE bit.
	 *
	 * The RHS has bit 7 set iff level < (2 + PSE).  If it is clear, bit 7
	 * is not reserved and does not indicate a large page at this level,
	 * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
	 */
	gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
#endif
	/*
	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
	 * iff level <= PG_LEVEL_4K, which for our purpose means
	 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
	 */
	gpte |= level - PG_LEVEL_4K - 1;

	return gpte & PT_PAGE_SIZE_MASK;
}
/*
 * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
 *
 * Walks the guest page tables described by @mmu for @addr with the
 * requested @access (PFERR_* flags), filling in @walker.  Returns 1 on
 * success and 0 on failure; on failure walker->fault describes the page
 * fault to inject (unless the failure came from a nested translation,
 * in which case kvm_translate_gpa() already filled it in).
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gpa_t addr, u64 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	u64 pt_access, pte_access;
	unsigned index, accessed_dirty, pte_pkey;
	u64 nested_access;
	gpa_t pte_gpa;
	bool have_ad;
	int offset;
	u64 walk_nx_mask = 0;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->cpu_role.base.level;
	pte           = kvm_mmu_get_guest_pgd(vcpu, mmu);
	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);

#if PTTYPE == 64
	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
	/* PAE paging: the walk starts from one of the four cached PDPTEs. */
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!FNAME(is_present_gpte)(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;

	/*
	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
	 * by the MOV to CR instruction are treated as reads and do not cause the
	 * processor to set the dirty flag in any EPT paging-structure entry.
	 */
	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;

	/* Start with all permissions; each level can only take them away. */
	pte_access = ~0;

	/*
	 * Queue a page fault for injection if this assertion fails, as callers
	 * assume that walker.fault contains sane info on a walk failure.  I.e.
	 * avoid making the situation worse by inducing even worse badness
	 * between when the assertion fails and when KVM kicks the vCPU out to
	 * userspace (because the VM is bugged).
	 */
	if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
		goto error;

	/* Pre-increment to cancel the decrement at the top of the loop. */
	++walker->level;

	do {
		struct kvm_memory_slot *slot;
		unsigned long host_addr;

		pt_access = pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);
		table_gfn = gpte_to_gfn(pte);
		offset    = index * sizeof(pt_element_t);
		pte_gpa   = gfn_to_gpa(table_gfn) + offset;

		BUG_ON(walker->level < 1);
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		/* For nested walks, translate the table's gpa through L1's tables. */
		real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
					     nested_access, &walker->fault);

		/*
		 * FIXME: This can happen if emulation (for of an INS/OUTS
		 * instruction) triggers a nested page fault.  The exit
		 * qualification / exit info field will incorrectly have
		 * "guest page access" as the nested page fault's cause,
		 * instead of "guest page structure access".  To fix this,
		 * the x86_exception struct should be augmented with enough
		 * information to fix the exit_qualification or exit_info_1
		 * fields.
		 */
		if (unlikely(real_gpa == INVALID_GPA))
			return 0;

		slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
		if (!kvm_is_visible_memslot(slot))
			goto error;

		host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
					    &walker->pte_writable[walker->level - 1]);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		/* Read the gpte from the host userspace mapping of guest memory. */
		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__get_user(pte, ptep_user)))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		/*
		 * Inverting the NX it lets us AND it like other
		 * permission bits.
		 */
		pte_access = pt_access & (pte ^ walk_nx_mask);

		if (unlikely(!FNAME(is_present_gpte)(pte)))
			goto error;

		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		walker->ptes[walker->level - 1] = pte;

		/* Convert to ACC_*_MASK flags for struct guest_walker.  */
		walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
	} while (!FNAME(is_last_gpte)(mmu, walker->level, pte));

	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;

	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
	if (unlikely(errcode))
		goto error;

	/* Base frame of the leaf mapping plus the offset within a huge page. */
	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

#if PTTYPE == 32
	if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);
#endif

	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
	if (real_gpa == INVALID_GPA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty.
		 * For modes without A/D bits support accessed_dirty will be
		 * always clear.
		 */
		accessed_dirty &= pte >>
			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

	/* Some gpte needed its accessed (or dirty) bit set; do that now. */
	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
							addr, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;

#if PTTYPE == PTTYPE_EPT
	/*
	 * Use PFERR_RSVD_MASK in error_code to tell if EPT
	 * misconfiguration requires to be injected. The detection is
	 * done by is_rsvd_bits_set() above.
	 *
	 * We set up the value of exit_qualification to inject:
	 * [2:0] - Derive from the access bits. The exit_qualification might be
	 *         out of date if it is serving an EPT misconfiguration.
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [7:8] - Derived from [7:8] of real exit_qualification
	 *
	 * The other bits are set to 0.
	 */
	if (!(errcode & PFERR_RSVD_MASK)) {
		vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
						  EPT_VIOLATION_GVA_TRANSLATED);
		if (write_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
		if (user_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
		if (fetch_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;

		/*
		 * Note, pte_access holds the raw RWX bits from the EPTE, not
		 * ACC_*_MASK flags!
		 */
		vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
						 EPT_VIOLATION_RWX_SHIFT;
	}
#endif
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
	walker->fault.async_page_fault = false;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_cistatic int FNAME(walk_addr)(struct guest_walker *walker,
52662306a36Sopenharmony_ci			    struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
52762306a36Sopenharmony_ci{
52862306a36Sopenharmony_ci	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
52962306a36Sopenharmony_ci					access);
53062306a36Sopenharmony_ci}
53162306a36Sopenharmony_ci
/*
 * Install a shadow PTE for a single prefetched gpte.  Returns false if
 * the gpte is not prefetchable, the gfn has no dirty-loggable memslot,
 * or the pfn cannot be resolved atomically; true if the spte was set.
 */
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte)
{
	struct kvm_memory_slot *slot;
	unsigned pte_access;
	gfn_t gfn;
	kvm_pfn_t pfn;

	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
		return false;

	gfn = gpte_to_gfn(gpte);
	/* Combine the shadow page's access with the gpte's own permissions. */
	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);

	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
	if (!slot)
		return false;

	/* Atomic (non-sleeping) lookup; prefetch is best-effort. */
	pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
	if (is_error_pfn(pfn))
		return false;

	mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
	kvm_release_pfn_clean(pfn);
	return true;
}
56062306a36Sopenharmony_ci
/*
 * Re-read the gpte for @level and compare it against the value cached in
 * @gw during the walk.  At the 4K level the entire aligned prefetch
 * window is read into gw->prefetch_ptes as a side effect, for use by
 * FNAME(pte_prefetch).  Returns true if the read failed or the gpte
 * changed since the walk.
 */
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PG_LEVEL_4K) {
		/* Align down to a PTE_PREFETCH_NUM-sized window of gptes. */
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}
58362306a36Sopenharmony_ci
58462306a36Sopenharmony_cistatic void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
58562306a36Sopenharmony_ci				u64 *sptep)
58662306a36Sopenharmony_ci{
58762306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
58862306a36Sopenharmony_ci	pt_element_t *gptep = gw->prefetch_ptes;
58962306a36Sopenharmony_ci	u64 *spte;
59062306a36Sopenharmony_ci	int i;
59162306a36Sopenharmony_ci
59262306a36Sopenharmony_ci	sp = sptep_to_sp(sptep);
59362306a36Sopenharmony_ci
59462306a36Sopenharmony_ci	if (sp->role.level > PG_LEVEL_4K)
59562306a36Sopenharmony_ci		return;
59662306a36Sopenharmony_ci
59762306a36Sopenharmony_ci	/*
59862306a36Sopenharmony_ci	 * If addresses are being invalidated, skip prefetching to avoid
59962306a36Sopenharmony_ci	 * accidentally prefetching those addresses.
60062306a36Sopenharmony_ci	 */
60162306a36Sopenharmony_ci	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
60262306a36Sopenharmony_ci		return;
60362306a36Sopenharmony_ci
60462306a36Sopenharmony_ci	if (sp->role.direct)
60562306a36Sopenharmony_ci		return __direct_pte_prefetch(vcpu, sp, sptep);
60662306a36Sopenharmony_ci
60762306a36Sopenharmony_ci	i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
60862306a36Sopenharmony_ci	spte = sp->spt + i;
60962306a36Sopenharmony_ci
61062306a36Sopenharmony_ci	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
61162306a36Sopenharmony_ci		if (spte == sptep)
61262306a36Sopenharmony_ci			continue;
61362306a36Sopenharmony_ci
61462306a36Sopenharmony_ci		if (is_shadow_present_pte(*spte))
61562306a36Sopenharmony_ci			continue;
61662306a36Sopenharmony_ci
61762306a36Sopenharmony_ci		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
61862306a36Sopenharmony_ci			break;
61962306a36Sopenharmony_ci	}
62062306a36Sopenharmony_ci}
62162306a36Sopenharmony_ci
62262306a36Sopenharmony_ci/*
62362306a36Sopenharmony_ci * Fetch a shadow pte for a specific level in the paging hierarchy.
62462306a36Sopenharmony_ci * If the guest tries to write a write-protected page, we need to
62562306a36Sopenharmony_ci * emulate this operation, return 1 to indicate this case.
62662306a36Sopenharmony_ci */
62762306a36Sopenharmony_cistatic int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
62862306a36Sopenharmony_ci			 struct guest_walker *gw)
62962306a36Sopenharmony_ci{
63062306a36Sopenharmony_ci	struct kvm_mmu_page *sp = NULL;
63162306a36Sopenharmony_ci	struct kvm_shadow_walk_iterator it;
63262306a36Sopenharmony_ci	unsigned int direct_access, access;
63362306a36Sopenharmony_ci	int top_level, ret;
63462306a36Sopenharmony_ci	gfn_t base_gfn = fault->gfn;
63562306a36Sopenharmony_ci
63662306a36Sopenharmony_ci	WARN_ON_ONCE(gw->gfn != base_gfn);
63762306a36Sopenharmony_ci	direct_access = gw->pte_access;
63862306a36Sopenharmony_ci
63962306a36Sopenharmony_ci	top_level = vcpu->arch.mmu->cpu_role.base.level;
64062306a36Sopenharmony_ci	if (top_level == PT32E_ROOT_LEVEL)
64162306a36Sopenharmony_ci		top_level = PT32_ROOT_LEVEL;
64262306a36Sopenharmony_ci	/*
64362306a36Sopenharmony_ci	 * Verify that the top-level gpte is still there.  Since the page
64462306a36Sopenharmony_ci	 * is a root page, it is either write protected (and cannot be
64562306a36Sopenharmony_ci	 * changed from now on) or it is invalid (in which case, we don't
64662306a36Sopenharmony_ci	 * really care if it changes underneath us after this point).
64762306a36Sopenharmony_ci	 */
64862306a36Sopenharmony_ci	if (FNAME(gpte_changed)(vcpu, gw, top_level))
64962306a36Sopenharmony_ci		goto out_gpte_changed;
65062306a36Sopenharmony_ci
65162306a36Sopenharmony_ci	if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
65262306a36Sopenharmony_ci		goto out_gpte_changed;
65362306a36Sopenharmony_ci
65462306a36Sopenharmony_ci	/*
65562306a36Sopenharmony_ci	 * Load a new root and retry the faulting instruction in the extremely
65662306a36Sopenharmony_ci	 * unlikely scenario that the guest root gfn became visible between
65762306a36Sopenharmony_ci	 * loading a dummy root and handling the resulting page fault, e.g. if
65862306a36Sopenharmony_ci	 * userspace create a memslot in the interim.
65962306a36Sopenharmony_ci	 */
66062306a36Sopenharmony_ci	if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
66162306a36Sopenharmony_ci		kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
66262306a36Sopenharmony_ci		goto out_gpte_changed;
66362306a36Sopenharmony_ci	}
66462306a36Sopenharmony_ci
66562306a36Sopenharmony_ci	for_each_shadow_entry(vcpu, fault->addr, it) {
66662306a36Sopenharmony_ci		gfn_t table_gfn;
66762306a36Sopenharmony_ci
66862306a36Sopenharmony_ci		clear_sp_write_flooding_count(it.sptep);
66962306a36Sopenharmony_ci		if (it.level == gw->level)
67062306a36Sopenharmony_ci			break;
67162306a36Sopenharmony_ci
67262306a36Sopenharmony_ci		table_gfn = gw->table_gfn[it.level - 2];
67362306a36Sopenharmony_ci		access = gw->pt_access[it.level - 2];
67462306a36Sopenharmony_ci		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
67562306a36Sopenharmony_ci					  false, access);
67662306a36Sopenharmony_ci
67762306a36Sopenharmony_ci		if (sp != ERR_PTR(-EEXIST)) {
67862306a36Sopenharmony_ci			/*
67962306a36Sopenharmony_ci			 * We must synchronize the pagetable before linking it
68062306a36Sopenharmony_ci			 * because the guest doesn't need to flush tlb when
68162306a36Sopenharmony_ci			 * the gpte is changed from non-present to present.
68262306a36Sopenharmony_ci			 * Otherwise, the guest may use the wrong mapping.
68362306a36Sopenharmony_ci			 *
68462306a36Sopenharmony_ci			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
68562306a36Sopenharmony_ci			 * synchronized it transiently via kvm_sync_page().
68662306a36Sopenharmony_ci			 *
68762306a36Sopenharmony_ci			 * For higher level pagetable, we synchronize it via
68862306a36Sopenharmony_ci			 * the slower mmu_sync_children().  If it needs to
68962306a36Sopenharmony_ci			 * break, some progress has been made; return
69062306a36Sopenharmony_ci			 * RET_PF_RETRY and retry on the next #PF.
69162306a36Sopenharmony_ci			 * KVM_REQ_MMU_SYNC is not necessary but it
69262306a36Sopenharmony_ci			 * expedites the process.
69362306a36Sopenharmony_ci			 */
69462306a36Sopenharmony_ci			if (sp->unsync_children &&
69562306a36Sopenharmony_ci			    mmu_sync_children(vcpu, sp, false))
69662306a36Sopenharmony_ci				return RET_PF_RETRY;
69762306a36Sopenharmony_ci		}
69862306a36Sopenharmony_ci
69962306a36Sopenharmony_ci		/*
70062306a36Sopenharmony_ci		 * Verify that the gpte in the page we've just write
70162306a36Sopenharmony_ci		 * protected is still there.
70262306a36Sopenharmony_ci		 */
70362306a36Sopenharmony_ci		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
70462306a36Sopenharmony_ci			goto out_gpte_changed;
70562306a36Sopenharmony_ci
70662306a36Sopenharmony_ci		if (sp != ERR_PTR(-EEXIST))
70762306a36Sopenharmony_ci			link_shadow_page(vcpu, it.sptep, sp);
70862306a36Sopenharmony_ci
70962306a36Sopenharmony_ci		if (fault->write && table_gfn == fault->gfn)
71062306a36Sopenharmony_ci			fault->write_fault_to_shadow_pgtable = true;
71162306a36Sopenharmony_ci	}
71262306a36Sopenharmony_ci
71362306a36Sopenharmony_ci	/*
71462306a36Sopenharmony_ci	 * Adjust the hugepage size _after_ resolving indirect shadow pages.
71562306a36Sopenharmony_ci	 * KVM doesn't support mapping hugepages into the guest for gfns that
71662306a36Sopenharmony_ci	 * are being shadowed by KVM, i.e. allocating a new shadow page may
71762306a36Sopenharmony_ci	 * affect the allowed hugepage size.
71862306a36Sopenharmony_ci	 */
71962306a36Sopenharmony_ci	kvm_mmu_hugepage_adjust(vcpu, fault);
72062306a36Sopenharmony_ci
72162306a36Sopenharmony_ci	trace_kvm_mmu_spte_requested(fault);
72262306a36Sopenharmony_ci
72362306a36Sopenharmony_ci	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
72462306a36Sopenharmony_ci		/*
72562306a36Sopenharmony_ci		 * We cannot overwrite existing page tables with an NX
72662306a36Sopenharmony_ci		 * large page, as the leaf could be executable.
72762306a36Sopenharmony_ci		 */
72862306a36Sopenharmony_ci		if (fault->nx_huge_page_workaround_enabled)
72962306a36Sopenharmony_ci			disallowed_hugepage_adjust(fault, *it.sptep, it.level);
73062306a36Sopenharmony_ci
73162306a36Sopenharmony_ci		base_gfn = gfn_round_for_level(fault->gfn, it.level);
73262306a36Sopenharmony_ci		if (it.level == fault->goal_level)
73362306a36Sopenharmony_ci			break;
73462306a36Sopenharmony_ci
73562306a36Sopenharmony_ci		validate_direct_spte(vcpu, it.sptep, direct_access);
73662306a36Sopenharmony_ci
73762306a36Sopenharmony_ci		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
73862306a36Sopenharmony_ci					  true, direct_access);
73962306a36Sopenharmony_ci		if (sp == ERR_PTR(-EEXIST))
74062306a36Sopenharmony_ci			continue;
74162306a36Sopenharmony_ci
74262306a36Sopenharmony_ci		link_shadow_page(vcpu, it.sptep, sp);
74362306a36Sopenharmony_ci		if (fault->huge_page_disallowed)
74462306a36Sopenharmony_ci			account_nx_huge_page(vcpu->kvm, sp,
74562306a36Sopenharmony_ci					     fault->req_level >= it.level);
74662306a36Sopenharmony_ci	}
74762306a36Sopenharmony_ci
74862306a36Sopenharmony_ci	if (WARN_ON_ONCE(it.level != fault->goal_level))
74962306a36Sopenharmony_ci		return -EFAULT;
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
75262306a36Sopenharmony_ci			   base_gfn, fault->pfn, fault);
75362306a36Sopenharmony_ci	if (ret == RET_PF_SPURIOUS)
75462306a36Sopenharmony_ci		return ret;
75562306a36Sopenharmony_ci
75662306a36Sopenharmony_ci	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
75762306a36Sopenharmony_ci	return ret;
75862306a36Sopenharmony_ci
75962306a36Sopenharmony_ciout_gpte_changed:
76062306a36Sopenharmony_ci	return RET_PF_RETRY;
76162306a36Sopenharmony_ci}
76262306a36Sopenharmony_ci
76362306a36Sopenharmony_ci/*
76462306a36Sopenharmony_ci * Page fault handler.  There are several causes for a page fault:
76562306a36Sopenharmony_ci *   - there is no shadow pte for the guest pte
76662306a36Sopenharmony_ci *   - write access through a shadow pte marked read only so that we can set
76762306a36Sopenharmony_ci *     the dirty bit
76862306a36Sopenharmony_ci *   - write access to a shadow pte marked read only so we can update the page
76962306a36Sopenharmony_ci *     dirty bitmap, when userspace requests it
77062306a36Sopenharmony_ci *   - mmio access; in this case we will never install a present shadow pte
77162306a36Sopenharmony_ci *   - normal guest page fault due to the guest pte marked not present, not
77262306a36Sopenharmony_ci *     writable, or not executable
77362306a36Sopenharmony_ci *
77462306a36Sopenharmony_ci *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
77562306a36Sopenharmony_ci *           a negative value on error.
77662306a36Sopenharmony_ci */
77762306a36Sopenharmony_cistatic int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
77862306a36Sopenharmony_ci{
77962306a36Sopenharmony_ci	struct guest_walker walker;
78062306a36Sopenharmony_ci	int r;
78162306a36Sopenharmony_ci
78262306a36Sopenharmony_ci	WARN_ON_ONCE(fault->is_tdp);
78362306a36Sopenharmony_ci
78462306a36Sopenharmony_ci	/*
78562306a36Sopenharmony_ci	 * Look up the guest pte for the faulting address.
78662306a36Sopenharmony_ci	 * If PFEC.RSVD is set, this is a shadow page fault.
78762306a36Sopenharmony_ci	 * The bit needs to be cleared before walking guest page tables.
78862306a36Sopenharmony_ci	 */
78962306a36Sopenharmony_ci	r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
79062306a36Sopenharmony_ci			     fault->error_code & ~PFERR_RSVD_MASK);
79162306a36Sopenharmony_ci
79262306a36Sopenharmony_ci	/*
79362306a36Sopenharmony_ci	 * The page is not mapped by the guest.  Let the guest handle it.
79462306a36Sopenharmony_ci	 */
79562306a36Sopenharmony_ci	if (!r) {
79662306a36Sopenharmony_ci		if (!fault->prefetch)
79762306a36Sopenharmony_ci			kvm_inject_emulated_page_fault(vcpu, &walker.fault);
79862306a36Sopenharmony_ci
79962306a36Sopenharmony_ci		return RET_PF_RETRY;
80062306a36Sopenharmony_ci	}
80162306a36Sopenharmony_ci
80262306a36Sopenharmony_ci	fault->gfn = walker.gfn;
80362306a36Sopenharmony_ci	fault->max_level = walker.level;
80462306a36Sopenharmony_ci	fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
80562306a36Sopenharmony_ci
80662306a36Sopenharmony_ci	if (page_fault_handle_page_track(vcpu, fault)) {
80762306a36Sopenharmony_ci		shadow_page_table_clear_flood(vcpu, fault->addr);
80862306a36Sopenharmony_ci		return RET_PF_EMULATE;
80962306a36Sopenharmony_ci	}
81062306a36Sopenharmony_ci
81162306a36Sopenharmony_ci	r = mmu_topup_memory_caches(vcpu, true);
81262306a36Sopenharmony_ci	if (r)
81362306a36Sopenharmony_ci		return r;
81462306a36Sopenharmony_ci
81562306a36Sopenharmony_ci	r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
81662306a36Sopenharmony_ci	if (r != RET_PF_CONTINUE)
81762306a36Sopenharmony_ci		return r;
81862306a36Sopenharmony_ci
81962306a36Sopenharmony_ci	/*
82062306a36Sopenharmony_ci	 * Do not change pte_access if the pfn is a mmio page, otherwise
82162306a36Sopenharmony_ci	 * we will cache the incorrect access into mmio spte.
82262306a36Sopenharmony_ci	 */
82362306a36Sopenharmony_ci	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
82462306a36Sopenharmony_ci	    !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
82562306a36Sopenharmony_ci		walker.pte_access |= ACC_WRITE_MASK;
82662306a36Sopenharmony_ci		walker.pte_access &= ~ACC_USER_MASK;
82762306a36Sopenharmony_ci
82862306a36Sopenharmony_ci		/*
82962306a36Sopenharmony_ci		 * If we converted a user page to a kernel page,
83062306a36Sopenharmony_ci		 * so that the kernel can write to it when cr0.wp=0,
83162306a36Sopenharmony_ci		 * then we should prevent the kernel from executing it
83262306a36Sopenharmony_ci		 * if SMEP is enabled.
83362306a36Sopenharmony_ci		 */
83462306a36Sopenharmony_ci		if (is_cr4_smep(vcpu->arch.mmu))
83562306a36Sopenharmony_ci			walker.pte_access &= ~ACC_EXEC_MASK;
83662306a36Sopenharmony_ci	}
83762306a36Sopenharmony_ci
83862306a36Sopenharmony_ci	r = RET_PF_RETRY;
83962306a36Sopenharmony_ci	write_lock(&vcpu->kvm->mmu_lock);
84062306a36Sopenharmony_ci
84162306a36Sopenharmony_ci	if (is_page_fault_stale(vcpu, fault))
84262306a36Sopenharmony_ci		goto out_unlock;
84362306a36Sopenharmony_ci
84462306a36Sopenharmony_ci	r = make_mmu_pages_available(vcpu);
84562306a36Sopenharmony_ci	if (r)
84662306a36Sopenharmony_ci		goto out_unlock;
84762306a36Sopenharmony_ci	r = FNAME(fetch)(vcpu, fault, &walker);
84862306a36Sopenharmony_ci
84962306a36Sopenharmony_ciout_unlock:
85062306a36Sopenharmony_ci	write_unlock(&vcpu->kvm->mmu_lock);
85162306a36Sopenharmony_ci	kvm_release_pfn_clean(fault->pfn);
85262306a36Sopenharmony_ci	return r;
85362306a36Sopenharmony_ci}
85462306a36Sopenharmony_ci
85562306a36Sopenharmony_cistatic gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
85662306a36Sopenharmony_ci{
85762306a36Sopenharmony_ci	int offset = 0;
85862306a36Sopenharmony_ci
85962306a36Sopenharmony_ci	WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
86062306a36Sopenharmony_ci
86162306a36Sopenharmony_ci	if (PTTYPE == 32)
86262306a36Sopenharmony_ci		offset = sp->role.quadrant << SPTE_LEVEL_BITS;
86362306a36Sopenharmony_ci
86462306a36Sopenharmony_ci	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
86562306a36Sopenharmony_ci}
86662306a36Sopenharmony_ci
86762306a36Sopenharmony_ci/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
86862306a36Sopenharmony_cistatic gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
86962306a36Sopenharmony_ci			       gpa_t addr, u64 access,
87062306a36Sopenharmony_ci			       struct x86_exception *exception)
87162306a36Sopenharmony_ci{
87262306a36Sopenharmony_ci	struct guest_walker walker;
87362306a36Sopenharmony_ci	gpa_t gpa = INVALID_GPA;
87462306a36Sopenharmony_ci	int r;
87562306a36Sopenharmony_ci
87662306a36Sopenharmony_ci#ifndef CONFIG_X86_64
87762306a36Sopenharmony_ci	/* A 64-bit GVA should be impossible on 32-bit KVM. */
87862306a36Sopenharmony_ci	WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
87962306a36Sopenharmony_ci#endif
88062306a36Sopenharmony_ci
88162306a36Sopenharmony_ci	r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
88262306a36Sopenharmony_ci
88362306a36Sopenharmony_ci	if (r) {
88462306a36Sopenharmony_ci		gpa = gfn_to_gpa(walker.gfn);
88562306a36Sopenharmony_ci		gpa |= addr & ~PAGE_MASK;
88662306a36Sopenharmony_ci	} else if (exception)
88762306a36Sopenharmony_ci		*exception = walker.fault;
88862306a36Sopenharmony_ci
88962306a36Sopenharmony_ci	return gpa;
89062306a36Sopenharmony_ci}
89162306a36Sopenharmony_ci
89262306a36Sopenharmony_ci/*
89362306a36Sopenharmony_ci * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
89462306a36Sopenharmony_ci * safe because:
89562306a36Sopenharmony_ci * - The spte has a reference to the struct page, so the pfn for a given gfn
89662306a36Sopenharmony_ci *   can't change unless all sptes pointing to it are nuked first.
89762306a36Sopenharmony_ci *
89862306a36Sopenharmony_ci * Returns
89962306a36Sopenharmony_ci * < 0: failed to sync spte
90062306a36Sopenharmony_ci *   0: the spte is synced and no tlb flushing is required
90162306a36Sopenharmony_ci * > 0: the spte is synced and tlb flushing is required
90262306a36Sopenharmony_ci */
90362306a36Sopenharmony_cistatic int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
90462306a36Sopenharmony_ci{
90562306a36Sopenharmony_ci	bool host_writable;
90662306a36Sopenharmony_ci	gpa_t first_pte_gpa;
90762306a36Sopenharmony_ci	u64 *sptep, spte;
90862306a36Sopenharmony_ci	struct kvm_memory_slot *slot;
90962306a36Sopenharmony_ci	unsigned pte_access;
91062306a36Sopenharmony_ci	pt_element_t gpte;
91162306a36Sopenharmony_ci	gpa_t pte_gpa;
91262306a36Sopenharmony_ci	gfn_t gfn;
91362306a36Sopenharmony_ci
91462306a36Sopenharmony_ci	if (WARN_ON_ONCE(!sp->spt[i]))
91562306a36Sopenharmony_ci		return 0;
91662306a36Sopenharmony_ci
91762306a36Sopenharmony_ci	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
91862306a36Sopenharmony_ci	pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
91962306a36Sopenharmony_ci
92062306a36Sopenharmony_ci	if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
92162306a36Sopenharmony_ci				       sizeof(pt_element_t)))
92262306a36Sopenharmony_ci		return -1;
92362306a36Sopenharmony_ci
92462306a36Sopenharmony_ci	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
92562306a36Sopenharmony_ci		return 1;
92662306a36Sopenharmony_ci
92762306a36Sopenharmony_ci	gfn = gpte_to_gfn(gpte);
92862306a36Sopenharmony_ci	pte_access = sp->role.access;
92962306a36Sopenharmony_ci	pte_access &= FNAME(gpte_access)(gpte);
93062306a36Sopenharmony_ci	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
93162306a36Sopenharmony_ci
93262306a36Sopenharmony_ci	if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
93362306a36Sopenharmony_ci		return 0;
93462306a36Sopenharmony_ci
93562306a36Sopenharmony_ci	/*
93662306a36Sopenharmony_ci	 * Drop the SPTE if the new protections would result in a RWX=0
93762306a36Sopenharmony_ci	 * SPTE or if the gfn is changing.  The RWX=0 case only affects
93862306a36Sopenharmony_ci	 * EPT with execute-only support, i.e. EPT without an effective
93962306a36Sopenharmony_ci	 * "present" bit, as all other paging modes will create a
94062306a36Sopenharmony_ci	 * read-only SPTE if pte_access is zero.
94162306a36Sopenharmony_ci	 */
94262306a36Sopenharmony_ci	if ((!pte_access && !shadow_present_mask) ||
94362306a36Sopenharmony_ci	    gfn != kvm_mmu_page_get_gfn(sp, i)) {
94462306a36Sopenharmony_ci		drop_spte(vcpu->kvm, &sp->spt[i]);
94562306a36Sopenharmony_ci		return 1;
94662306a36Sopenharmony_ci	}
94762306a36Sopenharmony_ci	/*
94862306a36Sopenharmony_ci	 * Do nothing if the permissions are unchanged.  The existing SPTE is
94962306a36Sopenharmony_ci	 * still, and prefetch_invalid_gpte() has verified that the A/D bits
95062306a36Sopenharmony_ci	 * are set in the "new" gPTE, i.e. there is no danger of missing an A/D
95162306a36Sopenharmony_ci	 * update due to A/D bits being set in the SPTE but not the gPTE.
95262306a36Sopenharmony_ci	 */
95362306a36Sopenharmony_ci	if (kvm_mmu_page_get_access(sp, i) == pte_access)
95462306a36Sopenharmony_ci		return 0;
95562306a36Sopenharmony_ci
95662306a36Sopenharmony_ci	/* Update the shadowed access bits in case they changed. */
95762306a36Sopenharmony_ci	kvm_mmu_page_set_access(sp, i, pte_access);
95862306a36Sopenharmony_ci
95962306a36Sopenharmony_ci	sptep = &sp->spt[i];
96062306a36Sopenharmony_ci	spte = *sptep;
96162306a36Sopenharmony_ci	host_writable = spte & shadow_host_writable_mask;
96262306a36Sopenharmony_ci	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
96362306a36Sopenharmony_ci	make_spte(vcpu, sp, slot, pte_access, gfn,
96462306a36Sopenharmony_ci		  spte_to_pfn(spte), spte, true, false,
96562306a36Sopenharmony_ci		  host_writable, &spte);
96662306a36Sopenharmony_ci
96762306a36Sopenharmony_ci	return mmu_spte_update(sptep, spte);
96862306a36Sopenharmony_ci}
96962306a36Sopenharmony_ci
97062306a36Sopenharmony_ci#undef pt_element_t
97162306a36Sopenharmony_ci#undef guest_walker
97262306a36Sopenharmony_ci#undef FNAME
97362306a36Sopenharmony_ci#undef PT_BASE_ADDR_MASK
97462306a36Sopenharmony_ci#undef PT_INDEX
97562306a36Sopenharmony_ci#undef PT_LVL_ADDR_MASK
97662306a36Sopenharmony_ci#undef PT_LVL_OFFSET_MASK
97762306a36Sopenharmony_ci#undef PT_LEVEL_BITS
97862306a36Sopenharmony_ci#undef PT_MAX_FULL_LEVELS
97962306a36Sopenharmony_ci#undef gpte_to_gfn
98062306a36Sopenharmony_ci#undef gpte_to_gfn_lvl
98162306a36Sopenharmony_ci#undef PT_GUEST_ACCESSED_MASK
98262306a36Sopenharmony_ci#undef PT_GUEST_DIRTY_MASK
98362306a36Sopenharmony_ci#undef PT_GUEST_DIRTY_SHIFT
98462306a36Sopenharmony_ci#undef PT_GUEST_ACCESSED_SHIFT
98562306a36Sopenharmony_ci#undef PT_HAVE_ACCESSED_DIRTY
986