// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which besides the obvious, schedules all roots
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
	 * ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	kvm_tdp_mmu_zap_invalidated_roots(kvm);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

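/* Free a TDP MMU shadow page: its page table page and its header. */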
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	/*
	 * The TDP MMU itself holds a reference to each root until the root is
	 * explicitly invalidated, i.e. the final reference should never be
	 * put for a valid root.
	 */
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared)			\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false);		\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, false))		\
		if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) {		\
		} else

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

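/* Allocate a shadow page and its page table from the vCPU's memory caches. */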
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

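/*
 * Initialize a newly allocated shadow page that will map @gfn at @role.level,
 * and stash a backpointer to it in the struct page of its page table.
 */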
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

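/* Initialize @child_sp as a child of the SPTE at @iter, one level down. */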
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

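/*
 * Get the TDP MMU root for the vCPU's current MMU role, allocating a new root
 * if no usable root exists, and return its physical address.
 */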
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

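/*
 * (Un)account a linked page table page in the kernel's pgtable stats and in
 * the VM's count of TDP MMU pages.
 */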
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * retry until the SPTE transitions from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the removed SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.  Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
				 !is_mmio_spte(new_spte) &&
				 !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);

	if (was_leaf && is_accessed_spte(old_spte) &&
	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.  On failure, i.e. if a different logical
	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
	 * the current value, so the caller operates on fresh data, e.g. if it
	 * retries tdp_mmu_set_spte_atomic()
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return 0;
}

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present.  Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
	return old_spte;
}

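/* Set the SPTE the iterator is currently on and refresh iter->old_spte. */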
static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					  iter->old_spte, new_spte,
					  iter->gfn, iter->level);
}

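/*
 * Iterators over the SPTEs mapping [_start, _end): all SPTEs in @_root, only
 * the present leaf SPTEs in @_root, or the SPTEs reachable from @_mmu's
 * current root, respectively.
 */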
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON_ONCE(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

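/*
 * Zap all present SPTEs at @zap_level in @root.  Zapping a non-leaf SPTE
 * recursively zaps and frees everything it maps.
 */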
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_iter_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes.  On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			 sp->gfn, sp->role.level + 1);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, false)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller.  Zap directly even if the root is
	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
	 * is being destroyed or the userspace VMM has exited.  In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	for_each_tdp_mmu_root_yield_safe(kvm, root, false)
		tdp_mmu_zap_root(kvm, root, false);
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	read_lock(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
		if (!root->tdp_mmu_scheduled_root_to_zap)
			continue;

		root->tdp_mmu_scheduled_root_to_zap = false;
		KVM_BUG_ON(!root->role.invalid, kvm);

		/*
		 * A TLB flush is not necessary as KVM performs a local TLB
		 * flush when allocating a new root (see kvm_mmu_load()), and
		 * when migrating a vCPU to a different pCPU.  Note, the local
		 * TLB flush on reuse also invalidates paging-structure-cache
		 * entries, i.e. TLB entries for intermediate paging structures,
		 * that may be zapped, as such entries are associated with the
		 * ASID on both VMX and SVM.
		 */
		tdp_mmu_zap_root(kvm, root, true);

		/*
		 * The reference needs to be put *after* zapping the root, as
		 * the root must be reachable by mmu_notifiers while it's being
		 * zapped.
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);
	}

	read_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_get_vcpu_root_hpa().
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		/*
		 * Note, invalid roots can outlive a memslot update!  Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
		 * root alive after it's been zapped.
94062306a36Sopenharmony_ci		 */
94162306a36Sopenharmony_ci		if (!root->role.invalid) {
94262306a36Sopenharmony_ci			root->tdp_mmu_scheduled_root_to_zap = true;
94362306a36Sopenharmony_ci			root->role.invalid = true;
94462306a36Sopenharmony_ci		}
94562306a36Sopenharmony_ci	}
94662306a36Sopenharmony_ci}
94762306a36Sopenharmony_ci
94862306a36Sopenharmony_ci/*
94962306a36Sopenharmony_ci * Installs a last-level SPTE to handle a TDP page fault.
95062306a36Sopenharmony_ci * (NPT/EPT violation/misconfiguration)
95162306a36Sopenharmony_ci */
95262306a36Sopenharmony_cistatic int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
95362306a36Sopenharmony_ci					  struct kvm_page_fault *fault,
95462306a36Sopenharmony_ci					  struct tdp_iter *iter)
95562306a36Sopenharmony_ci{
95662306a36Sopenharmony_ci	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
95762306a36Sopenharmony_ci	u64 new_spte;
95862306a36Sopenharmony_ci	int ret = RET_PF_FIXED;
95962306a36Sopenharmony_ci	bool wrprot = false;
96062306a36Sopenharmony_ci
96162306a36Sopenharmony_ci	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
96262306a36Sopenharmony_ci		return RET_PF_RETRY;
96362306a36Sopenharmony_ci
96462306a36Sopenharmony_ci	if (unlikely(!fault->slot))
96562306a36Sopenharmony_ci		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
96662306a36Sopenharmony_ci	else
96762306a36Sopenharmony_ci		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
96862306a36Sopenharmony_ci					 fault->pfn, iter->old_spte, fault->prefetch, true,
96962306a36Sopenharmony_ci					 fault->map_writable, &new_spte);
97062306a36Sopenharmony_ci
97162306a36Sopenharmony_ci	if (new_spte == iter->old_spte)
97262306a36Sopenharmony_ci		ret = RET_PF_SPURIOUS;
97362306a36Sopenharmony_ci	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
97462306a36Sopenharmony_ci		return RET_PF_RETRY;
97562306a36Sopenharmony_ci	else if (is_shadow_present_pte(iter->old_spte) &&
97662306a36Sopenharmony_ci		 !is_last_spte(iter->old_spte, iter->level))
97762306a36Sopenharmony_ci		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
97862306a36Sopenharmony_ci
97962306a36Sopenharmony_ci	/*
98062306a36Sopenharmony_ci	 * If the page fault was caused by a write but the page is write
98162306a36Sopenharmony_ci	 * protected, emulation is needed.  If emulation were skipped, the
98262306a36Sopenharmony_ci	 * vCPU would take the same fault again.
98362306a36Sopenharmony_ci	 */
98462306a36Sopenharmony_ci	if (wrprot) {
98562306a36Sopenharmony_ci		if (fault->write)
98662306a36Sopenharmony_ci			ret = RET_PF_EMULATE;
98762306a36Sopenharmony_ci	}
98862306a36Sopenharmony_ci
98962306a36Sopenharmony_ci	/* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
99062306a36Sopenharmony_ci	if (unlikely(is_mmio_spte(new_spte))) {
99162306a36Sopenharmony_ci		vcpu->stat.pf_mmio_spte_created++;
99262306a36Sopenharmony_ci		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
99362306a36Sopenharmony_ci				     new_spte);
99462306a36Sopenharmony_ci		ret = RET_PF_EMULATE;
99562306a36Sopenharmony_ci	} else {
99662306a36Sopenharmony_ci		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
99762306a36Sopenharmony_ci				       rcu_dereference(iter->sptep));
99862306a36Sopenharmony_ci	}
99962306a36Sopenharmony_ci
100062306a36Sopenharmony_ci	return ret;
100162306a36Sopenharmony_ci}
100262306a36Sopenharmony_ci
100362306a36Sopenharmony_ci/*
100462306a36Sopenharmony_ci * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
100562306a36Sopenharmony_ci * provided page table.
100662306a36Sopenharmony_ci *
100762306a36Sopenharmony_ci * @kvm: kvm instance
100862306a36Sopenharmony_ci * @iter: a tdp_iter instance currently on the SPTE that should be set
100962306a36Sopenharmony_ci * @sp: The new TDP page table to install.
101062306a36Sopenharmony_ci * @shared: This operation is running under the MMU lock in read mode.
101162306a36Sopenharmony_ci *
101262306a36Sopenharmony_ci * Returns: 0 if the new page table was installed. Non-0 if the page table
101362306a36Sopenharmony_ci *          could not be installed (e.g. the atomic compare-exchange failed).
101462306a36Sopenharmony_ci */
101562306a36Sopenharmony_cistatic int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
101662306a36Sopenharmony_ci			   struct kvm_mmu_page *sp, bool shared)
101762306a36Sopenharmony_ci{
101862306a36Sopenharmony_ci	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
101962306a36Sopenharmony_ci	int ret = 0;
102062306a36Sopenharmony_ci
102162306a36Sopenharmony_ci	if (shared) {
102262306a36Sopenharmony_ci		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
102362306a36Sopenharmony_ci		if (ret)
102462306a36Sopenharmony_ci			return ret;
102562306a36Sopenharmony_ci	} else {
102662306a36Sopenharmony_ci		tdp_mmu_iter_set_spte(kvm, iter, spte);
102762306a36Sopenharmony_ci	}
102862306a36Sopenharmony_ci
102962306a36Sopenharmony_ci	tdp_account_mmu_page(kvm, sp);
103062306a36Sopenharmony_ci
103162306a36Sopenharmony_ci	return 0;
103262306a36Sopenharmony_ci}
103362306a36Sopenharmony_ci
103462306a36Sopenharmony_cistatic int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
103562306a36Sopenharmony_ci				   struct kvm_mmu_page *sp, bool shared);
103662306a36Sopenharmony_ci
103762306a36Sopenharmony_ci/*
103862306a36Sopenharmony_ci * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
103962306a36Sopenharmony_ci * page tables and SPTEs to translate the faulting guest physical address.
104062306a36Sopenharmony_ci */
104162306a36Sopenharmony_ciint kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
104262306a36Sopenharmony_ci{
104362306a36Sopenharmony_ci	struct kvm_mmu *mmu = vcpu->arch.mmu;
104462306a36Sopenharmony_ci	struct kvm *kvm = vcpu->kvm;
104562306a36Sopenharmony_ci	struct tdp_iter iter;
104662306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
104762306a36Sopenharmony_ci	int ret = RET_PF_RETRY;
104862306a36Sopenharmony_ci
104962306a36Sopenharmony_ci	kvm_mmu_hugepage_adjust(vcpu, fault);
105062306a36Sopenharmony_ci
105162306a36Sopenharmony_ci	trace_kvm_mmu_spte_requested(fault);
105262306a36Sopenharmony_ci
105362306a36Sopenharmony_ci	rcu_read_lock();
105462306a36Sopenharmony_ci
105562306a36Sopenharmony_ci	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
105662306a36Sopenharmony_ci		int r;
105762306a36Sopenharmony_ci
105862306a36Sopenharmony_ci		if (fault->nx_huge_page_workaround_enabled)
105962306a36Sopenharmony_ci			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
106062306a36Sopenharmony_ci
106162306a36Sopenharmony_ci		/*
106262306a36Sopenharmony_ci		 * If the SPTE has been frozen by another thread, just give up and
106362306a36Sopenharmony_ci		 * retry, avoiding an unnecessary page table allocation and free.
106462306a36Sopenharmony_ci		 */
106562306a36Sopenharmony_ci		if (is_removed_spte(iter.old_spte))
106662306a36Sopenharmony_ci			goto retry;
106762306a36Sopenharmony_ci
106862306a36Sopenharmony_ci		if (iter.level == fault->goal_level)
106962306a36Sopenharmony_ci			goto map_target_level;
107062306a36Sopenharmony_ci
107162306a36Sopenharmony_ci		/* Step down into the lower level page table if it exists. */
107262306a36Sopenharmony_ci		if (is_shadow_present_pte(iter.old_spte) &&
107362306a36Sopenharmony_ci		    !is_large_pte(iter.old_spte))
107462306a36Sopenharmony_ci			continue;
107562306a36Sopenharmony_ci
107662306a36Sopenharmony_ci		/*
107762306a36Sopenharmony_ci		 * The SPTE is either non-present or points to a huge page that
107862306a36Sopenharmony_ci		 * needs to be split.
107962306a36Sopenharmony_ci		 */
108062306a36Sopenharmony_ci		sp = tdp_mmu_alloc_sp(vcpu);
108162306a36Sopenharmony_ci		tdp_mmu_init_child_sp(sp, &iter);
108262306a36Sopenharmony_ci
108362306a36Sopenharmony_ci		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
108462306a36Sopenharmony_ci
108562306a36Sopenharmony_ci		if (is_shadow_present_pte(iter.old_spte))
108662306a36Sopenharmony_ci			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
108762306a36Sopenharmony_ci		else
108862306a36Sopenharmony_ci			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
108962306a36Sopenharmony_ci
109062306a36Sopenharmony_ci		/*
109162306a36Sopenharmony_ci		 * Force the guest to retry if installing an upper level SPTE
109262306a36Sopenharmony_ci		 * failed, e.g. because a different task modified the SPTE.
109362306a36Sopenharmony_ci		 */
109462306a36Sopenharmony_ci		if (r) {
109562306a36Sopenharmony_ci			tdp_mmu_free_sp(sp);
109662306a36Sopenharmony_ci			goto retry;
109762306a36Sopenharmony_ci		}
109862306a36Sopenharmony_ci
109962306a36Sopenharmony_ci		if (fault->huge_page_disallowed &&
110062306a36Sopenharmony_ci		    fault->req_level >= iter.level) {
110162306a36Sopenharmony_ci			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
110262306a36Sopenharmony_ci			if (sp->nx_huge_page_disallowed)
110362306a36Sopenharmony_ci				track_possible_nx_huge_page(kvm, sp);
110462306a36Sopenharmony_ci			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
110562306a36Sopenharmony_ci		}
110662306a36Sopenharmony_ci	}
110762306a36Sopenharmony_ci
110862306a36Sopenharmony_ci	/*
110962306a36Sopenharmony_ci	 * The walk aborted before reaching the target level, e.g. because the
111062306a36Sopenharmony_ci	 * iterator detected an upper level SPTE was frozen during traversal.
111162306a36Sopenharmony_ci	 */
111262306a36Sopenharmony_ci	WARN_ON_ONCE(iter.level == fault->goal_level);
111362306a36Sopenharmony_ci	goto retry;
111462306a36Sopenharmony_ci
111562306a36Sopenharmony_cimap_target_level:
111662306a36Sopenharmony_ci	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
111762306a36Sopenharmony_ci
111862306a36Sopenharmony_ciretry:
111962306a36Sopenharmony_ci	rcu_read_unlock();
112062306a36Sopenharmony_ci	return ret;
112162306a36Sopenharmony_ci}
112262306a36Sopenharmony_ci
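/*
 * Zap leaf SPTEs for the range of GFNs [start, end) in all roots of the
 * range's address space.  Returns true if a TLB flush is needed before
 * releasing the MMU lock, i.e. if @flush was passed in as true or if one
 * or more SPTEs were zapped.
 */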
112362306a36Sopenharmony_cibool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
112462306a36Sopenharmony_ci				 bool flush)
112562306a36Sopenharmony_ci{
112662306a36Sopenharmony_ci	struct kvm_mmu_page *root;
112762306a36Sopenharmony_ci
112862306a36Sopenharmony_ci	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
112962306a36Sopenharmony_ci		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
113062306a36Sopenharmony_ci					  range->may_block, flush);
113162306a36Sopenharmony_ci
113262306a36Sopenharmony_ci	return flush;
113362306a36Sopenharmony_ci}
113462306a36Sopenharmony_ci
113562306a36Sopenharmony_citypedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
113662306a36Sopenharmony_ci			      struct kvm_gfn_range *range);
113762306a36Sopenharmony_ci
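/*
 * Invoke @handler on each leaf SPTE in the notifier range, for every root in
 * the range's address space.  The walk runs under RCU and never yields, so
 * handlers must not block.  Returns true if any invocation of @handler
 * returned true.
 */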
113862306a36Sopenharmony_cistatic __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
113962306a36Sopenharmony_ci						   struct kvm_gfn_range *range,
114062306a36Sopenharmony_ci						   tdp_handler_t handler)
114162306a36Sopenharmony_ci{
114262306a36Sopenharmony_ci	struct kvm_mmu_page *root;
114362306a36Sopenharmony_ci	struct tdp_iter iter;
114462306a36Sopenharmony_ci	bool ret = false;
114562306a36Sopenharmony_ci
114662306a36Sopenharmony_ci	/*
114762306a36Sopenharmony_ci	 * Don't support rescheduling; none of the MMU notifiers that funnel
114862306a36Sopenharmony_ci	 * into this helper allow blocking, so it'd be dead, wasteful code.
114962306a36Sopenharmony_ci	 */
115062306a36Sopenharmony_ci	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
115162306a36Sopenharmony_ci		rcu_read_lock();
115262306a36Sopenharmony_ci
115362306a36Sopenharmony_ci		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
115462306a36Sopenharmony_ci			ret |= handler(kvm, &iter, range);
115562306a36Sopenharmony_ci
115662306a36Sopenharmony_ci		rcu_read_unlock();
115762306a36Sopenharmony_ci	}
115862306a36Sopenharmony_ci
115962306a36Sopenharmony_ci	return ret;
116062306a36Sopenharmony_ci}
116162306a36Sopenharmony_ci
116262306a36Sopenharmony_ci/*
116362306a36Sopenharmony_ci * Mark the SPTEs mapping the range of GFNs [start, end) as unaccessed and
116462306a36Sopenharmony_ci * return true if any of the GFNs in the range have been accessed.
116562306a36Sopenharmony_ci *
116662306a36Sopenharmony_ci * No need to mark the corresponding PFN as accessed as this call is coming
116762306a36Sopenharmony_ci * from the clear_young() or clear_flush_young() notifier, which uses the
116862306a36Sopenharmony_ci * return value to determine if the page has been accessed.
116962306a36Sopenharmony_ci */
117062306a36Sopenharmony_cistatic bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
117162306a36Sopenharmony_ci			  struct kvm_gfn_range *range)
117262306a36Sopenharmony_ci{
117362306a36Sopenharmony_ci	u64 new_spte;
117462306a36Sopenharmony_ci
117562306a36Sopenharmony_ci	/* If we have a non-accessed entry, we don't need to change the PTE. */
117662306a36Sopenharmony_ci	if (!is_accessed_spte(iter->old_spte))
117762306a36Sopenharmony_ci		return false;
117862306a36Sopenharmony_ci
117962306a36Sopenharmony_ci	if (spte_ad_enabled(iter->old_spte)) {
118062306a36Sopenharmony_ci		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
118162306a36Sopenharmony_ci							 iter->old_spte,
118262306a36Sopenharmony_ci							 shadow_accessed_mask,
118362306a36Sopenharmony_ci							 iter->level);
118462306a36Sopenharmony_ci		new_spte = iter->old_spte & ~shadow_accessed_mask;
118562306a36Sopenharmony_ci	} else {
118662306a36Sopenharmony_ci		/*
118762306a36Sopenharmony_ci		 * Capture the dirty status of the page, so that it doesn't get
118862306a36Sopenharmony_ci		 * lost when the SPTE is marked for access tracking.
118962306a36Sopenharmony_ci		 */
119062306a36Sopenharmony_ci		if (is_writable_pte(iter->old_spte))
119162306a36Sopenharmony_ci			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
119262306a36Sopenharmony_ci
119362306a36Sopenharmony_ci		new_spte = mark_spte_for_access_track(iter->old_spte);
119462306a36Sopenharmony_ci		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
119562306a36Sopenharmony_ci							iter->old_spte, new_spte,
119662306a36Sopenharmony_ci							iter->level);
119762306a36Sopenharmony_ci	}
119862306a36Sopenharmony_ci
119962306a36Sopenharmony_ci	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
120062306a36Sopenharmony_ci				       iter->old_spte, new_spte);
120162306a36Sopenharmony_ci	return true;
120262306a36Sopenharmony_ci}
120362306a36Sopenharmony_ci
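/*
 * Age the SPTEs mapping the notifier range, i.e. clear their Accessed state,
 * and return true if any GFN in the range had been accessed.
 */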
120462306a36Sopenharmony_cibool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
120562306a36Sopenharmony_ci{
120662306a36Sopenharmony_ci	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
120762306a36Sopenharmony_ci}
120862306a36Sopenharmony_ci
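/* Return true if the SPTE mapping this GFN is currently marked accessed. */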
120962306a36Sopenharmony_cistatic bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
121062306a36Sopenharmony_ci			 struct kvm_gfn_range *range)
121162306a36Sopenharmony_ci{
121262306a36Sopenharmony_ci	return is_accessed_spte(iter->old_spte);
121362306a36Sopenharmony_ci}
121462306a36Sopenharmony_ci
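/*
 * Check whether any GFN in the notifier range is mapped by an accessed SPTE,
 * without modifying the SPTEs.
 */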
121562306a36Sopenharmony_cibool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
121662306a36Sopenharmony_ci{
121762306a36Sopenharmony_ci	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
121862306a36Sopenharmony_ci}
121962306a36Sopenharmony_ci
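/*
 * Remap a single 4KiB GFN in response to the changed_pte MMU notifier.  The
 * existing SPTE is zapped; a new SPTE is installed only if the new PTE is
 * read-only, otherwise the GFN is left unmapped and will be brought back in
 * by a future fault.
 */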
122062306a36Sopenharmony_cistatic bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
122162306a36Sopenharmony_ci			 struct kvm_gfn_range *range)
122262306a36Sopenharmony_ci{
122362306a36Sopenharmony_ci	u64 new_spte;
122462306a36Sopenharmony_ci
122562306a36Sopenharmony_ci	/* Huge pages aren't expected to be modified without first being zapped. */
122662306a36Sopenharmony_ci	WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
122762306a36Sopenharmony_ci
122862306a36Sopenharmony_ci	if (iter->level != PG_LEVEL_4K ||
122962306a36Sopenharmony_ci	    !is_shadow_present_pte(iter->old_spte))
123062306a36Sopenharmony_ci		return false;
123162306a36Sopenharmony_ci
123262306a36Sopenharmony_ci	/*
123362306a36Sopenharmony_ci	 * Note, when changing a read-only SPTE, it's not strictly necessary to
123462306a36Sopenharmony_ci	 * zero the SPTE before setting the new PFN, but doing so preserves the
123562306a36Sopenharmony_ci	 * invariant that the PFN of a present leaf SPTE can never change.
123662306a36Sopenharmony_ci	 * See handle_changed_spte().
123762306a36Sopenharmony_ci	 */
123862306a36Sopenharmony_ci	tdp_mmu_iter_set_spte(kvm, iter, 0);
123962306a36Sopenharmony_ci
124062306a36Sopenharmony_ci	if (!pte_write(range->arg.pte)) {
124162306a36Sopenharmony_ci		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
124262306a36Sopenharmony_ci								  pte_pfn(range->arg.pte));
124362306a36Sopenharmony_ci
124462306a36Sopenharmony_ci		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
124562306a36Sopenharmony_ci	}
124662306a36Sopenharmony_ci
124762306a36Sopenharmony_ci	return true;
124862306a36Sopenharmony_ci}
124962306a36Sopenharmony_ci
125062306a36Sopenharmony_ci/*
125162306a36Sopenharmony_ci * Handle the changed_pte MMU notifier for the TDP MMU.
125262306a36Sopenharmony_ci * The new PTE mapping the HVA specified by the notifier is provided via
125362306a36Sopenharmony_ci * range->arg.pte.
125462306a36Sopenharmony_ci * Returns true if a flush is needed before releasing the MMU lock.
125562306a36Sopenharmony_ci */
125662306a36Sopenharmony_cibool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
125762306a36Sopenharmony_ci{
125862306a36Sopenharmony_ci	/*
125962306a36Sopenharmony_ci	 * No need to handle the remote TLB flush under RCU protection, the
126062306a36Sopenharmony_ci	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
126162306a36Sopenharmony_ci	 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
126262306a36Sopenharmony_ci	 */
126362306a36Sopenharmony_ci	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
126462306a36Sopenharmony_ci}
126562306a36Sopenharmony_ci
126662306a36Sopenharmony_ci/*
126762306a36Sopenharmony_ci * Remove write access from all SPTEs at or above min_level that map GFNs
126862306a36Sopenharmony_ci * [start, end). Returns true if an SPTE has been changed and the TLBs need to
126962306a36Sopenharmony_ci * be flushed.
127062306a36Sopenharmony_ci */
127162306a36Sopenharmony_cistatic bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
127262306a36Sopenharmony_ci			     gfn_t start, gfn_t end, int min_level)
127362306a36Sopenharmony_ci{
127462306a36Sopenharmony_ci	struct tdp_iter iter;
127562306a36Sopenharmony_ci	u64 new_spte;
127662306a36Sopenharmony_ci	bool spte_set = false;
127762306a36Sopenharmony_ci
127862306a36Sopenharmony_ci	rcu_read_lock();
127962306a36Sopenharmony_ci
128062306a36Sopenharmony_ci	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
128162306a36Sopenharmony_ci
128262306a36Sopenharmony_ci	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
128362306a36Sopenharmony_ciretry:
128462306a36Sopenharmony_ci		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
128562306a36Sopenharmony_ci			continue;
128662306a36Sopenharmony_ci
128762306a36Sopenharmony_ci		if (!is_shadow_present_pte(iter.old_spte) ||
128862306a36Sopenharmony_ci		    !is_last_spte(iter.old_spte, iter.level) ||
128962306a36Sopenharmony_ci		    !(iter.old_spte & PT_WRITABLE_MASK))
129062306a36Sopenharmony_ci			continue;
129162306a36Sopenharmony_ci
129262306a36Sopenharmony_ci		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
129362306a36Sopenharmony_ci
129462306a36Sopenharmony_ci		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
129562306a36Sopenharmony_ci			goto retry;
129662306a36Sopenharmony_ci
129762306a36Sopenharmony_ci		spte_set = true;
129862306a36Sopenharmony_ci	}
129962306a36Sopenharmony_ci
130062306a36Sopenharmony_ci	rcu_read_unlock();
130162306a36Sopenharmony_ci	return spte_set;
130262306a36Sopenharmony_ci}
130362306a36Sopenharmony_ci
130462306a36Sopenharmony_ci/*
130562306a36Sopenharmony_ci * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
130662306a36Sopenharmony_ci * only affect leaf SPTEs down to min_level.
130762306a36Sopenharmony_ci * Returns true if an SPTE has been changed and the TLBs need to be flushed.
130862306a36Sopenharmony_ci */
130962306a36Sopenharmony_cibool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
131062306a36Sopenharmony_ci			     const struct kvm_memory_slot *slot, int min_level)
131162306a36Sopenharmony_ci{
131262306a36Sopenharmony_ci	struct kvm_mmu_page *root;
131362306a36Sopenharmony_ci	bool spte_set = false;
131462306a36Sopenharmony_ci
131562306a36Sopenharmony_ci	lockdep_assert_held_read(&kvm->mmu_lock);
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
131862306a36Sopenharmony_ci		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
131962306a36Sopenharmony_ci			     slot->base_gfn + slot->npages, min_level);
132062306a36Sopenharmony_ci
132162306a36Sopenharmony_ci	return spte_set;
132262306a36Sopenharmony_ci}
132362306a36Sopenharmony_ci
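/*
 * Allocate the page header and page table page for a new shadow page used
 * for eager huge page splitting.  Returns NULL if either allocation fails.
 */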
132462306a36Sopenharmony_cistatic struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
132562306a36Sopenharmony_ci{
132662306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
132762306a36Sopenharmony_ci
132862306a36Sopenharmony_ci	gfp |= __GFP_ZERO;
132962306a36Sopenharmony_ci
133062306a36Sopenharmony_ci	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
133162306a36Sopenharmony_ci	if (!sp)
133262306a36Sopenharmony_ci		return NULL;
133362306a36Sopenharmony_ci
133462306a36Sopenharmony_ci	sp->spt = (void *)__get_free_page(gfp);
133562306a36Sopenharmony_ci	if (!sp->spt) {
133662306a36Sopenharmony_ci		kmem_cache_free(mmu_page_header_cache, sp);
133762306a36Sopenharmony_ci		return NULL;
133862306a36Sopenharmony_ci	}
133962306a36Sopenharmony_ci
134062306a36Sopenharmony_ci	return sp;
134162306a36Sopenharmony_ci}
134262306a36Sopenharmony_ci
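/*
 * Allocate a shadow page for huge page splitting, dropping the MMU lock (and
 * RCU) to retry with direct reclaim if the initial GFP_NOWAIT attempt fails.
 */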
134362306a36Sopenharmony_cistatic struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
134462306a36Sopenharmony_ci						       struct tdp_iter *iter,
134562306a36Sopenharmony_ci						       bool shared)
134662306a36Sopenharmony_ci{
134762306a36Sopenharmony_ci	struct kvm_mmu_page *sp;
134862306a36Sopenharmony_ci
134962306a36Sopenharmony_ci	/*
135062306a36Sopenharmony_ci	 * Since we are allocating while under the MMU lock we have to be
135162306a36Sopenharmony_ci	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
135262306a36Sopenharmony_ci	 * reclaim and to avoid making any filesystem callbacks (which can end
135362306a36Sopenharmony_ci	 * up invoking KVM MMU notifiers, resulting in a deadlock).
135462306a36Sopenharmony_ci	 *
135562306a36Sopenharmony_ci	 * If this allocation fails we drop the lock and retry with reclaim
135662306a36Sopenharmony_ci	 * allowed.
135762306a36Sopenharmony_ci	 */
135862306a36Sopenharmony_ci	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
135962306a36Sopenharmony_ci	if (sp)
136062306a36Sopenharmony_ci		return sp;
136162306a36Sopenharmony_ci
136262306a36Sopenharmony_ci	rcu_read_unlock();
136362306a36Sopenharmony_ci
136462306a36Sopenharmony_ci	if (shared)
136562306a36Sopenharmony_ci		read_unlock(&kvm->mmu_lock);
136662306a36Sopenharmony_ci	else
136762306a36Sopenharmony_ci		write_unlock(&kvm->mmu_lock);
136862306a36Sopenharmony_ci
136962306a36Sopenharmony_ci	iter->yielded = true;
137062306a36Sopenharmony_ci	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
137162306a36Sopenharmony_ci
137262306a36Sopenharmony_ci	if (shared)
137362306a36Sopenharmony_ci		read_lock(&kvm->mmu_lock);
137462306a36Sopenharmony_ci	else
137562306a36Sopenharmony_ci		write_lock(&kvm->mmu_lock);
137662306a36Sopenharmony_ci
137762306a36Sopenharmony_ci	rcu_read_lock();
137862306a36Sopenharmony_ci
137962306a36Sopenharmony_ci	return sp;
138062306a36Sopenharmony_ci}
138162306a36Sopenharmony_ci
138262306a36Sopenharmony_ci/* Note, the caller is responsible for initializing @sp. */
138362306a36Sopenharmony_cistatic int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
138462306a36Sopenharmony_ci				   struct kvm_mmu_page *sp, bool shared)
138562306a36Sopenharmony_ci{
138662306a36Sopenharmony_ci	const u64 huge_spte = iter->old_spte;
138762306a36Sopenharmony_ci	const int level = iter->level;
138862306a36Sopenharmony_ci	int ret, i;
138962306a36Sopenharmony_ci
139062306a36Sopenharmony_ci	/*
139162306a36Sopenharmony_ci	 * No need for atomics when writing to sp->spt since the page table has
139262306a36Sopenharmony_ci	 * not been linked in yet and thus is not reachable from any other CPU.
139362306a36Sopenharmony_ci	 */
139462306a36Sopenharmony_ci	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
139562306a36Sopenharmony_ci		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
139662306a36Sopenharmony_ci
139762306a36Sopenharmony_ci	/*
139862306a36Sopenharmony_ci	 * Replace the huge spte with a pointer to the populated lower level
139962306a36Sopenharmony_ci	 * page table. Since we are making this change without a TLB flush, vCPUs
140062306a36Sopenharmony_ci	 * will see a mix of the split mappings and the original huge mapping,
140162306a36Sopenharmony_ci	 * depending on what's currently in their TLB. This is fine from a
140262306a36Sopenharmony_ci	 * correctness standpoint since the translation will be the same either
140362306a36Sopenharmony_ci	 * way.
140462306a36Sopenharmony_ci	 */
140562306a36Sopenharmony_ci	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
140662306a36Sopenharmony_ci	if (ret)
140762306a36Sopenharmony_ci		goto out;
140862306a36Sopenharmony_ci
140962306a36Sopenharmony_ci	/*
141062306a36Sopenharmony_ci	 * tdp_mmu_link_sp() will handle subtracting the huge page we
141162306a36Sopenharmony_ci	 * are overwriting from the page stats. But we have to manually update
141262306a36Sopenharmony_ci	 * the page stats with the new present child pages.
141362306a36Sopenharmony_ci	 */
141462306a36Sopenharmony_ci	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
141562306a36Sopenharmony_ci
141662306a36Sopenharmony_ciout:
141762306a36Sopenharmony_ci	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
141862306a36Sopenharmony_ci	return ret;
141962306a36Sopenharmony_ci}
142062306a36Sopenharmony_ci
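/*
 * Split all huge pages in the GFN range [start, end) mapped by @root down to
 * @target_level.  Returns 0 on success, or -ENOMEM if allocating a new page
 * table fails.
 */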
142162306a36Sopenharmony_cistatic int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
142262306a36Sopenharmony_ci					 struct kvm_mmu_page *root,
142362306a36Sopenharmony_ci					 gfn_t start, gfn_t end,
142462306a36Sopenharmony_ci					 int target_level, bool shared)
142562306a36Sopenharmony_ci{
142662306a36Sopenharmony_ci	struct kvm_mmu_page *sp = NULL;
142762306a36Sopenharmony_ci	struct tdp_iter iter;
142862306a36Sopenharmony_ci	int ret = 0;
142962306a36Sopenharmony_ci
143062306a36Sopenharmony_ci	rcu_read_lock();
143162306a36Sopenharmony_ci
143262306a36Sopenharmony_ci	/*
143362306a36Sopenharmony_ci	 * Traverse the page table splitting all huge pages above the target
143462306a36Sopenharmony_ci	 * level into one lower level. For example, if we encounter a 1GB page
143562306a36Sopenharmony_ci	 * we split it into 512 2MB pages.
143662306a36Sopenharmony_ci	 *
143762306a36Sopenharmony_ci	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
143862306a36Sopenharmony_ci	 * to visit an SPTE before ever visiting its children, which means we
143962306a36Sopenharmony_ci	 * will correctly recursively split huge pages that are more than one
144062306a36Sopenharmony_ci	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
144162306a36Sopenharmony_ci	 * and then splitting each of those to 512 4KB pages).
144262306a36Sopenharmony_ci	 */
144362306a36Sopenharmony_ci	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
144462306a36Sopenharmony_ciretry:
144562306a36Sopenharmony_ci		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
144662306a36Sopenharmony_ci			continue;
144762306a36Sopenharmony_ci
144862306a36Sopenharmony_ci		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
144962306a36Sopenharmony_ci			continue;
145062306a36Sopenharmony_ci
145162306a36Sopenharmony_ci		if (!sp) {
145262306a36Sopenharmony_ci			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
145362306a36Sopenharmony_ci			if (!sp) {
145462306a36Sopenharmony_ci				ret = -ENOMEM;
145562306a36Sopenharmony_ci				trace_kvm_mmu_split_huge_page(iter.gfn,
145662306a36Sopenharmony_ci							      iter.old_spte,
145762306a36Sopenharmony_ci							      iter.level, ret);
145862306a36Sopenharmony_ci				break;
145962306a36Sopenharmony_ci			}
146062306a36Sopenharmony_ci
146162306a36Sopenharmony_ci			if (iter.yielded)
146262306a36Sopenharmony_ci				continue;
146362306a36Sopenharmony_ci		}
146462306a36Sopenharmony_ci
146562306a36Sopenharmony_ci		tdp_mmu_init_child_sp(sp, &iter);
146662306a36Sopenharmony_ci
146762306a36Sopenharmony_ci		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
146862306a36Sopenharmony_ci			goto retry;
146962306a36Sopenharmony_ci
147062306a36Sopenharmony_ci		sp = NULL;
147162306a36Sopenharmony_ci	}
147262306a36Sopenharmony_ci
147362306a36Sopenharmony_ci	rcu_read_unlock();
147462306a36Sopenharmony_ci
147562306a36Sopenharmony_ci	/*
147662306a36Sopenharmony_ci	 * It's possible to exit the loop having never used the last sp if, for
147762306a36Sopenharmony_ci	 * example, a vCPU doing HugePage NX splitting wins the race and
147862306a36Sopenharmony_ci	 * installs its own sp in place of the last sp we tried to split.
147962306a36Sopenharmony_ci	 */
148062306a36Sopenharmony_ci	if (sp)
148162306a36Sopenharmony_ci		tdp_mmu_free_sp(sp);
148262306a36Sopenharmony_ci
148362306a36Sopenharmony_ci	return ret;
148462306a36Sopenharmony_ci}
148562306a36Sopenharmony_ci
148662306a36Sopenharmony_ci
148762306a36Sopenharmony_ci/*
148862306a36Sopenharmony_ci * Try to split all huge pages mapped by the TDP MMU down to the target level.
148962306a36Sopenharmony_ci */
149062306a36Sopenharmony_civoid kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
149162306a36Sopenharmony_ci				      const struct kvm_memory_slot *slot,
149262306a36Sopenharmony_ci				      gfn_t start, gfn_t end,
149362306a36Sopenharmony_ci				      int target_level, bool shared)
149462306a36Sopenharmony_ci{
149562306a36Sopenharmony_ci	struct kvm_mmu_page *root;
149662306a36Sopenharmony_ci	int r = 0;
149762306a36Sopenharmony_ci
149862306a36Sopenharmony_ci	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
149962306a36Sopenharmony_ci
150062306a36Sopenharmony_ci	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
150162306a36Sopenharmony_ci		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
150262306a36Sopenharmony_ci		if (r) {
150362306a36Sopenharmony_ci			kvm_tdp_mmu_put_root(kvm, root, shared);
150462306a36Sopenharmony_ci			break;
150562306a36Sopenharmony_ci		}
150662306a36Sopenharmony_ci	}
150762306a36Sopenharmony_ci}
150862306a36Sopenharmony_ci
150962306a36Sopenharmony_ci/*
151062306a36Sopenharmony_ci * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
151162306a36Sopenharmony_ci * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
151262306a36Sopenharmony_ci * If AD bits are not enabled, this will require clearing the writable bit on
151362306a36Sopenharmony_ci * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
151462306a36Sopenharmony_ci * be flushed.
151562306a36Sopenharmony_ci */
151662306a36Sopenharmony_cistatic bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
151762306a36Sopenharmony_ci			   gfn_t start, gfn_t end)
151862306a36Sopenharmony_ci{
151962306a36Sopenharmony_ci	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
152062306a36Sopenharmony_ci	struct tdp_iter iter;
152162306a36Sopenharmony_ci	bool spte_set = false;
152262306a36Sopenharmony_ci
152362306a36Sopenharmony_ci	rcu_read_lock();
152462306a36Sopenharmony_ci
152562306a36Sopenharmony_ci	tdp_root_for_each_leaf_pte(iter, root, start, end) {
152662306a36Sopenharmony_ciretry:
152762306a36Sopenharmony_ci		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
152862306a36Sopenharmony_ci			continue;
152962306a36Sopenharmony_ci
153062306a36Sopenharmony_ci		if (!is_shadow_present_pte(iter.old_spte))
153162306a36Sopenharmony_ci			continue;
153262306a36Sopenharmony_ci
153362306a36Sopenharmony_ci		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
153462306a36Sopenharmony_ci				spte_ad_need_write_protect(iter.old_spte));
153562306a36Sopenharmony_ci
153662306a36Sopenharmony_ci		if (!(iter.old_spte & dbit))
153762306a36Sopenharmony_ci			continue;
153862306a36Sopenharmony_ci
153962306a36Sopenharmony_ci		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
154062306a36Sopenharmony_ci			goto retry;
154162306a36Sopenharmony_ci
154262306a36Sopenharmony_ci		spte_set = true;
154362306a36Sopenharmony_ci	}
154462306a36Sopenharmony_ci
154562306a36Sopenharmony_ci	rcu_read_unlock();
154662306a36Sopenharmony_ci	return spte_set;
154762306a36Sopenharmony_ci}
154862306a36Sopenharmony_ci
154962306a36Sopenharmony_ci/*
155062306a36Sopenharmony_ci * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
155162306a36Sopenharmony_ci * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
155262306a36Sopenharmony_ci * If AD bits are not enabled, this will require clearing the writable bit on
155362306a36Sopenharmony_ci * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
155462306a36Sopenharmony_ci * be flushed.
155562306a36Sopenharmony_ci */
155662306a36Sopenharmony_cibool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
155762306a36Sopenharmony_ci				  const struct kvm_memory_slot *slot)
155862306a36Sopenharmony_ci{
155962306a36Sopenharmony_ci	struct kvm_mmu_page *root;
156062306a36Sopenharmony_ci	bool spte_set = false;
156162306a36Sopenharmony_ci
156262306a36Sopenharmony_ci	lockdep_assert_held_read(&kvm->mmu_lock);
156362306a36Sopenharmony_ci
156462306a36Sopenharmony_ci	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
156562306a36Sopenharmony_ci		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
156662306a36Sopenharmony_ci				slot->base_gfn + slot->npages);
156762306a36Sopenharmony_ci
156862306a36Sopenharmony_ci	return spte_set;
156962306a36Sopenharmony_ci}
157062306a36Sopenharmony_ci
157162306a36Sopenharmony_ci/*
157262306a36Sopenharmony_ci * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
157362306a36Sopenharmony_ci * set in mask, starting at gfn. The given memslot is expected to contain all
157462306a36Sopenharmony_ci * the GFNs represented by set bits in the mask. If AD bits are enabled,
157562306a36Sopenharmony_ci * clearing the dirty status will involve clearing the dirty bit on each SPTE
157662306a36Sopenharmony_ci * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
157762306a36Sopenharmony_ci */
157862306a36Sopenharmony_cistatic void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
157962306a36Sopenharmony_ci				  gfn_t gfn, unsigned long mask, bool wrprot)
158062306a36Sopenharmony_ci{
158162306a36Sopenharmony_ci	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
158262306a36Sopenharmony_ci						   shadow_dirty_mask;
158362306a36Sopenharmony_ci	struct tdp_iter iter;
158462306a36Sopenharmony_ci
158562306a36Sopenharmony_ci	lockdep_assert_held_write(&kvm->mmu_lock);
158662306a36Sopenharmony_ci
158762306a36Sopenharmony_ci	rcu_read_lock();
158862306a36Sopenharmony_ci
158962306a36Sopenharmony_ci	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
159062306a36Sopenharmony_ci				    gfn + BITS_PER_LONG) {
159162306a36Sopenharmony_ci		if (!mask)
159262306a36Sopenharmony_ci			break;
159362306a36Sopenharmony_ci
159462306a36Sopenharmony_ci		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
159562306a36Sopenharmony_ci				spte_ad_need_write_protect(iter.old_spte));
159662306a36Sopenharmony_ci
159762306a36Sopenharmony_ci		if (iter.level > PG_LEVEL_4K ||
159862306a36Sopenharmony_ci		    !(mask & (1UL << (iter.gfn - gfn))))
159962306a36Sopenharmony_ci			continue;
160062306a36Sopenharmony_ci
160162306a36Sopenharmony_ci		mask &= ~(1UL << (iter.gfn - gfn));
160262306a36Sopenharmony_ci
160362306a36Sopenharmony_ci		if (!(iter.old_spte & dbit))
160462306a36Sopenharmony_ci			continue;
160562306a36Sopenharmony_ci
160662306a36Sopenharmony_ci		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
160762306a36Sopenharmony_ci							iter.old_spte, dbit,
160862306a36Sopenharmony_ci							iter.level);
160962306a36Sopenharmony_ci
161062306a36Sopenharmony_ci		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
161162306a36Sopenharmony_ci					       iter.old_spte,
161262306a36Sopenharmony_ci					       iter.old_spte & ~dbit);
161362306a36Sopenharmony_ci		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
161462306a36Sopenharmony_ci	}
161562306a36Sopenharmony_ci
161662306a36Sopenharmony_ci	rcu_read_unlock();
161762306a36Sopenharmony_ci}
161862306a36Sopenharmony_ci
161962306a36Sopenharmony_ci/*
162062306a36Sopenharmony_ci * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
162162306a36Sopenharmony_ci * set in mask, starting at gfn. The given memslot is expected to contain all
162262306a36Sopenharmony_ci * the GFNs represented by set bits in the mask. If AD bits are enabled,
162362306a36Sopenharmony_ci * clearing the dirty status will involve clearing the dirty bit on each SPTE
162462306a36Sopenharmony_ci * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
162562306a36Sopenharmony_ci */
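/*
 * For example, with @gfn == 0x1000 and @mask == 0x5, bits 0 and 2 are set,
 * so only the 4KiB SPTEs mapping GFNs 0x1000 and 0x1002 have their dirty
 * state cleared.
 */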
162662306a36Sopenharmony_civoid kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
162762306a36Sopenharmony_ci				       struct kvm_memory_slot *slot,
162862306a36Sopenharmony_ci				       gfn_t gfn, unsigned long mask,
162962306a36Sopenharmony_ci				       bool wrprot)
163062306a36Sopenharmony_ci{
163162306a36Sopenharmony_ci	struct kvm_mmu_page *root;
163262306a36Sopenharmony_ci
163362306a36Sopenharmony_ci	for_each_tdp_mmu_root(kvm, root, slot->as_id)
163462306a36Sopenharmony_ci		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
163562306a36Sopenharmony_ci}
163662306a36Sopenharmony_ci
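/*
 * Zap the non-leaf SPTEs covering GFNs in @slot that could instead be mapped
 * by a huge page, so that a subsequent fault can reinstall a huge mapping.
 */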
163762306a36Sopenharmony_cistatic void zap_collapsible_spte_range(struct kvm *kvm,
163862306a36Sopenharmony_ci				       struct kvm_mmu_page *root,
163962306a36Sopenharmony_ci				       const struct kvm_memory_slot *slot)
164062306a36Sopenharmony_ci{
164162306a36Sopenharmony_ci	gfn_t start = slot->base_gfn;
164262306a36Sopenharmony_ci	gfn_t end = start + slot->npages;
164362306a36Sopenharmony_ci	struct tdp_iter iter;
164462306a36Sopenharmony_ci	int max_mapping_level;
164562306a36Sopenharmony_ci
164662306a36Sopenharmony_ci	rcu_read_lock();
164762306a36Sopenharmony_ci
164862306a36Sopenharmony_ci	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
164962306a36Sopenharmony_ciretry:
165062306a36Sopenharmony_ci		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
165162306a36Sopenharmony_ci			continue;
165262306a36Sopenharmony_ci
165362306a36Sopenharmony_ci		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
165462306a36Sopenharmony_ci		    !is_shadow_present_pte(iter.old_spte))
165562306a36Sopenharmony_ci			continue;
165662306a36Sopenharmony_ci
165762306a36Sopenharmony_ci		/*
165862306a36Sopenharmony_ci		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
165962306a36Sopenharmony_ci		 * a large page size, then its parent would have been zapped
166062306a36Sopenharmony_ci		 * instead of stepping down.
166162306a36Sopenharmony_ci		 */
166262306a36Sopenharmony_ci		if (is_last_spte(iter.old_spte, iter.level))
166362306a36Sopenharmony_ci			continue;
166462306a36Sopenharmony_ci
166562306a36Sopenharmony_ci		/*
166662306a36Sopenharmony_ci		 * If iter.gfn resides outside of the slot, i.e. the page for
166762306a36Sopenharmony_ci		 * the current level overlaps but is not contained by the slot,
166862306a36Sopenharmony_ci		 * then the SPTE can't be made huge.  More importantly, trying
166962306a36Sopenharmony_ci		 * to query that info from slot->arch.lpage_info will cause an
167062306a36Sopenharmony_ci		 * out-of-bounds access.
167162306a36Sopenharmony_ci		 */
167262306a36Sopenharmony_ci		if (iter.gfn < start || iter.gfn >= end)
167362306a36Sopenharmony_ci			continue;
167462306a36Sopenharmony_ci
167562306a36Sopenharmony_ci		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
167662306a36Sopenharmony_ci							      iter.gfn, PG_LEVEL_NUM);
167762306a36Sopenharmony_ci		if (max_mapping_level < iter.level)
167862306a36Sopenharmony_ci			continue;
167962306a36Sopenharmony_ci
168062306a36Sopenharmony_ci		/* Note, a successful atomic zap also does a remote TLB flush. */
168162306a36Sopenharmony_ci		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
168262306a36Sopenharmony_ci			goto retry;
168362306a36Sopenharmony_ci	}
168462306a36Sopenharmony_ci
168562306a36Sopenharmony_ci	rcu_read_unlock();
168662306a36Sopenharmony_ci}
168762306a36Sopenharmony_ci
168862306a36Sopenharmony_ci/*
168962306a36Sopenharmony_ci * Zap non-leaf SPTEs (and free their associated page tables) which could
169062306a36Sopenharmony_ci * be replaced by huge pages, for GFNs within the slot.
169162306a36Sopenharmony_ci */
169262306a36Sopenharmony_civoid kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
169362306a36Sopenharmony_ci				       const struct kvm_memory_slot *slot)
169462306a36Sopenharmony_ci{
169562306a36Sopenharmony_ci	struct kvm_mmu_page *root;
169662306a36Sopenharmony_ci
169762306a36Sopenharmony_ci	lockdep_assert_held_read(&kvm->mmu_lock);
169862306a36Sopenharmony_ci
169962306a36Sopenharmony_ci	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
170062306a36Sopenharmony_ci		zap_collapsible_spte_range(kvm, root, slot);
170162306a36Sopenharmony_ci}
170262306a36Sopenharmony_ci
170362306a36Sopenharmony_ci/*
170462306a36Sopenharmony_ci * Removes write access on the last level SPTE mapping this GFN and unsets the
170562306a36Sopenharmony_ci * MMU-writable bit to ensure future writes continue to be intercepted.
170662306a36Sopenharmony_ci * Returns true if an SPTE was set and a TLB flush is needed.
170762306a36Sopenharmony_ci */
170862306a36Sopenharmony_cistatic bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
170962306a36Sopenharmony_ci			      gfn_t gfn, int min_level)
171062306a36Sopenharmony_ci{
171162306a36Sopenharmony_ci	struct tdp_iter iter;
171262306a36Sopenharmony_ci	u64 new_spte;
171362306a36Sopenharmony_ci	bool spte_set = false;
171462306a36Sopenharmony_ci
171562306a36Sopenharmony_ci	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
171662306a36Sopenharmony_ci
171762306a36Sopenharmony_ci	rcu_read_lock();
171862306a36Sopenharmony_ci
171962306a36Sopenharmony_ci	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
172062306a36Sopenharmony_ci		if (!is_shadow_present_pte(iter.old_spte) ||
172162306a36Sopenharmony_ci		    !is_last_spte(iter.old_spte, iter.level))
172262306a36Sopenharmony_ci			continue;
172362306a36Sopenharmony_ci
172462306a36Sopenharmony_ci		new_spte = iter.old_spte &
172562306a36Sopenharmony_ci			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
172662306a36Sopenharmony_ci
172762306a36Sopenharmony_ci		if (new_spte == iter.old_spte)
172862306a36Sopenharmony_ci			break;
172962306a36Sopenharmony_ci
173062306a36Sopenharmony_ci		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
173162306a36Sopenharmony_ci		spte_set = true;
173262306a36Sopenharmony_ci	}
173362306a36Sopenharmony_ci
173462306a36Sopenharmony_ci	rcu_read_unlock();
173562306a36Sopenharmony_ci
173662306a36Sopenharmony_ci	return spte_set;
173762306a36Sopenharmony_ci}
173862306a36Sopenharmony_ci
173962306a36Sopenharmony_ci/*
174062306a36Sopenharmony_ci * Removes write access on the last level SPTE mapping this GFN and unsets the
174162306a36Sopenharmony_ci * MMU-writable bit to ensure future writes continue to be intercepted.
174262306a36Sopenharmony_ci * Returns true if an SPTE was set and a TLB flush is needed.
174362306a36Sopenharmony_ci */
174462306a36Sopenharmony_cibool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
174562306a36Sopenharmony_ci				   struct kvm_memory_slot *slot, gfn_t gfn,
174662306a36Sopenharmony_ci				   int min_level)
174762306a36Sopenharmony_ci{
174862306a36Sopenharmony_ci	struct kvm_mmu_page *root;
174962306a36Sopenharmony_ci	bool spte_set = false;
175062306a36Sopenharmony_ci
175162306a36Sopenharmony_ci	lockdep_assert_held_write(&kvm->mmu_lock);
175262306a36Sopenharmony_ci	for_each_tdp_mmu_root(kvm, root, slot->as_id)
175362306a36Sopenharmony_ci		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
175462306a36Sopenharmony_ci
175562306a36Sopenharmony_ci	return spte_set;
175662306a36Sopenharmony_ci}
175762306a36Sopenharmony_ci
175862306a36Sopenharmony_ci/*
175962306a36Sopenharmony_ci * Return the level of the lowest level SPTE added to sptes.
176062306a36Sopenharmony_ci * That SPTE may be non-present.
176162306a36Sopenharmony_ci *
176262306a36Sopenharmony_ci * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
176362306a36Sopenharmony_ci */
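/*
 * The sptes[] array is indexed by level, e.g. with a 4-level walk that
 * reaches a 4KiB mapping, sptes[4]..sptes[1] hold the SPTEs from the root
 * page table down to the leaf.
 */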
176462306a36Sopenharmony_ciint kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
176562306a36Sopenharmony_ci			 int *root_level)
176662306a36Sopenharmony_ci{
176762306a36Sopenharmony_ci	struct tdp_iter iter;
176862306a36Sopenharmony_ci	struct kvm_mmu *mmu = vcpu->arch.mmu;
176962306a36Sopenharmony_ci	gfn_t gfn = addr >> PAGE_SHIFT;
177062306a36Sopenharmony_ci	int leaf = -1;
177162306a36Sopenharmony_ci
177262306a36Sopenharmony_ci	*root_level = vcpu->arch.mmu->root_role.level;
177362306a36Sopenharmony_ci
177462306a36Sopenharmony_ci	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
177562306a36Sopenharmony_ci		leaf = iter.level;
177662306a36Sopenharmony_ci		sptes[leaf] = iter.old_spte;
177762306a36Sopenharmony_ci	}
177862306a36Sopenharmony_ci
177962306a36Sopenharmony_ci	return leaf;
178062306a36Sopenharmony_ci}
178162306a36Sopenharmony_ci
178262306a36Sopenharmony_ci/*
178362306a36Sopenharmony_ci * Returns the last level spte pointer of the shadow page walk for the given
178462306a36Sopenharmony_ci * gpa, and sets *spte to the spte value. This spte may be non-present. If no
178562306a36Sopenharmony_ci * walk could be performed, returns NULL and *spte does not contain valid data.
178662306a36Sopenharmony_ci *
178762306a36Sopenharmony_ci * Contract:
178862306a36Sopenharmony_ci *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
178962306a36Sopenharmony_ci *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
179062306a36Sopenharmony_ci *
179162306a36Sopenharmony_ci * WARNING: This function is only intended to be called during fast_page_fault.
179262306a36Sopenharmony_ci */
179362306a36Sopenharmony_ciu64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
179462306a36Sopenharmony_ci					u64 *spte)
179562306a36Sopenharmony_ci{
179662306a36Sopenharmony_ci	struct tdp_iter iter;
179762306a36Sopenharmony_ci	struct kvm_mmu *mmu = vcpu->arch.mmu;
179862306a36Sopenharmony_ci	gfn_t gfn = addr >> PAGE_SHIFT;
179962306a36Sopenharmony_ci	tdp_ptep_t sptep = NULL;
180062306a36Sopenharmony_ci
180162306a36Sopenharmony_ci	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
180262306a36Sopenharmony_ci		*spte = iter.old_spte;
180362306a36Sopenharmony_ci		sptep = iter.sptep;
180462306a36Sopenharmony_ci	}
180562306a36Sopenharmony_ci
180662306a36Sopenharmony_ci	/*
180762306a36Sopenharmony_ci	 * Perform the rcu_dereference to get the raw spte pointer value since
180862306a36Sopenharmony_ci	 * we are passing it up to fast_page_fault, which is shared with the
180962306a36Sopenharmony_ci	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
181062306a36Sopenharmony_ci	 * annotation.
181162306a36Sopenharmony_ci	 *
181262306a36Sopenharmony_ci	 * This is safe since fast_page_fault obeys the contracts of this
181362306a36Sopenharmony_ci	 * function as well as all TDP MMU contracts around modifying SPTEs
181462306a36Sopenharmony_ci	 * outside of mmu_lock.
181562306a36Sopenharmony_ci	 */
181662306a36Sopenharmony_ci	return rcu_dereference(sptep);
181762306a36Sopenharmony_ci}