// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which besides the obvious, schedules all roots
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
	 * ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	kvm_tdp_mmu_zap_invalidated_roots(kvm);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	/*
	 * The TDP MMU itself holds a reference to each root until the root is
	 * explicitly invalidated, i.e. the final reference should never be
	 * put for a valid root.
	 */
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.  In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared)			\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false);		\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, false))		\
		if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) {		\
		} else

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
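
/*
 * Illustrative sketch, not a function in this file: a caller that walks every
 * root under mmu_lock held for write (shared == false) while allowing the
 * walk to yield.  kvm_tdp_mmu_zap_leafs() below follows this shape.  Because
 * the iterator takes a reference on each root it hands out, a caller that
 * breaks out of the loop early owns a reference to the most recent root and
 * must drop it with kvm_tdp_mmu_put_root() (process_root() is a hypothetical
 * helper):
 *
 *	struct kvm_mmu_page *root;
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
 *		if (process_root(kvm, root)) {
 *			kvm_tdp_mmu_put_root(kvm, root, false);
 *			break;
 *		}
 *	}
 */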

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection.  Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed.  Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite.  If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the removed SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.  Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN.  A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed.  In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected.  Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
				 !is_mmio_spte(new_spte) &&
				 !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);

	if (was_leaf && is_accessed_spte(old_spte) &&
	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0 - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set.  In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.  On failure, i.e. if a different logical
	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
	 * the current value, so the caller operates on fresh data, e.g. if it
	 * retries tdp_mmu_set_spte_atomic().
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return 0;
}
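
/*
 * Illustrative sketch, not a function in this file: the -EBUSY contract above
 * is what makes the "goto retry" pattern used by __tdp_mmu_zap_root() and
 * kvm_tdp_mmu_map() work.  On failure, iter->old_spte has already been
 * refreshed by the failed cmpxchg, so the caller can re-evaluate the SPTE and
 * decide whether the update is still needed before trying again:
 *
 *	retry:
 *		...recompute new_spte from iter.old_spte as needed...
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 */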

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value.  This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value.  No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present.  Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used.  If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
	return old_spte;
}

static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					  iter->old_spte, new_spte,
					  iter->gfn, iter->level);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
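
/*
 * Illustrative sketch, not a function in this file: tdp_root_for_each_leaf_pte()
 * skips everything except present, last-level SPTEs, so a walk that only cares
 * about leaf mappings can be written directly against it.  As elsewhere in this
 * file, the walk itself must run inside an RCU read-side critical section
 * (inspect_leaf() is a hypothetical helper):
 *
 *	struct tdp_iter iter;
 *
 *	rcu_read_lock();
 *	tdp_root_for_each_leaf_pte(iter, root, start, end)
 *		inspect_leaf(iter.old_spte);
 *	rcu_read_unlock();
 */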

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON_ONCE(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_iter_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes.  On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			 sp->gfn, sp->role.level + 1);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock.  If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots.  Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, false)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller.  Zap directly even if the root is
	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the
	 * VM is being destroyed or the userspace VMM has exited.  In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	for_each_tdp_mmu_root_yield_safe(kvm, root, false)
		tdp_mmu_zap_root(kvm, root, false);
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	read_lock(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
		if (!root->tdp_mmu_scheduled_root_to_zap)
			continue;

		root->tdp_mmu_scheduled_root_to_zap = false;
		KVM_BUG_ON(!root->role.invalid, kvm);

		/*
		 * A TLB flush is not necessary as KVM performs a local TLB
		 * flush when allocating a new root (see kvm_mmu_load()), and
		 * when migrating a vCPU to a different pCPU.  Note, the local
		 * TLB flush on reuse also invalidates paging-structure-cache
		 * entries, i.e. TLB entries for intermediate paging structures,
		 * that may be zapped, as such entries are associated with the
		 * ASID on both VMX and SVM.
		 */
		tdp_mmu_zap_root(kvm, root, true);

		/*
		 * The reference needs to be put *after* zapping the root, as
		 * the root must be reachable by mmu_notifiers while it's being
		 * zapped.
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);
	}

	read_unlock(&kvm->mmu_lock);
}
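
/*
 * Illustrative sketch, not a function in this file: invalidation and zapping
 * of roots are deliberately split into two phases, and
 * kvm_mmu_uninit_tdp_mmu() above is the simplest caller of the pair:
 *
 *	kvm_tdp_mmu_invalidate_all_roots(kvm);	// mark roots invalid
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm);	// zap them, mmu_lock held for read
 */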

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_get_vcpu_root_hpa().
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		/*
		 * Note, invalid roots can outlive a memslot update!  Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
		 * root alive after it's been zapped.
		 */
		if (!root->role.invalid) {
			root->tdp_mmu_scheduled_root_to_zap = true;
			root->role.invalid = true;
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed.  If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed.  Non-0 if the page table
/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
                           struct kvm_mmu_page *sp, bool shared)
{
        u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
        int ret = 0;

        if (shared) {
                ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
                if (ret)
                        return ret;
        } else {
                tdp_mmu_iter_set_spte(kvm, iter, spte);
        }

        tdp_account_mmu_page(kvm, sp);

        return 0;
}

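/*
 * Note that only the shared path (mmu_lock held for read) can fail: with
 * mmu_lock held for write no other task can modify the SPTE, so
 * tdp_mmu_iter_set_spte() cannot lose a race and the function returns 0.
 */
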
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
                                   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        struct kvm *kvm = vcpu->kvm;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
        int ret = RET_PF_RETRY;

        kvm_mmu_hugepage_adjust(vcpu, fault);

        trace_kvm_mmu_spte_requested(fault);

        rcu_read_lock();

        tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
                int r;

                if (fault->nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

                /*
                 * If SPTE has been frozen by another thread, just give up and
                 * retry, avoiding unnecessary page table allocation and free.
                 */
                if (is_removed_spte(iter.old_spte))
                        goto retry;

                if (iter.level == fault->goal_level)
                        goto map_target_level;

                /* Step down into the lower level page table if it exists. */
                if (is_shadow_present_pte(iter.old_spte) &&
                    !is_large_pte(iter.old_spte))
                        continue;

                /*
                 * The SPTE is either non-present or points to a huge page that
                 * needs to be split.
                 */
                sp = tdp_mmu_alloc_sp(vcpu);
                tdp_mmu_init_child_sp(sp, &iter);

                sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

                if (is_shadow_present_pte(iter.old_spte))
                        r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
                else
                        r = tdp_mmu_link_sp(kvm, &iter, sp, true);

                /*
                 * Force the guest to retry if installing an upper level SPTE
                 * failed, e.g. because a different task modified the SPTE.
                 */
                if (r) {
                        tdp_mmu_free_sp(sp);
                        goto retry;
                }

                if (fault->huge_page_disallowed &&
                    fault->req_level >= iter.level) {
                        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
                        if (sp->nx_huge_page_disallowed)
                                track_possible_nx_huge_page(kvm, sp);
                        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
                }
        }

        /*
         * The walk aborted before reaching the target level, e.g. because the
         * iterator detected an upper level SPTE was frozen during traversal.
         */
        WARN_ON_ONCE(iter.level == fault->goal_level);
        goto retry;

map_target_level:
        ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
        rcu_read_unlock();
        return ret;
}

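/*
 * Example of the walk above, assuming a 4-level TDP root and a completely
 * unmapped GFN: the SPTEs at levels 4, 3 and 2 are non-present, so a child
 * page table is allocated and linked at each of them; the walk then reaches
 * fault->goal_level and tdp_mmu_map_handle_target_level() installs the
 * final leaf SPTE.
 */
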
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
                                 bool flush)
{
        struct kvm_mmu_page *root;

        __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
                flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
                                          range->may_block, flush);

        return flush;
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
                              struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
                                                   struct kvm_gfn_range *range,
                                                   tdp_handler_t handler)
{
        struct kvm_mmu_page *root;
        struct tdp_iter iter;
        bool ret = false;

        /*
         * Don't support rescheduling, none of the MMU notifiers that funnel
         * into this helper allow blocking; it'd be dead, wasteful code.
         */
        for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
                rcu_read_lock();

                tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
                        ret |= handler(kvm, &iter, range);

                rcu_read_unlock();
        }

        return ret;
}

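/*
 * The handlers below (age_gfn_range(), test_age_gfn() and set_spte_gfn())
 * are all invoked through kvm_tdp_mmu_handle_gfn(), i.e. they run on leaf
 * SPTEs under rcu_read_lock() and, per the comment above, must not block.
 */
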
/*
 * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
 * return non-zero if any of the GFNs in the range have been accessed.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
                          struct kvm_gfn_range *range)
{
        u64 new_spte;

        /* If we have a non-accessed entry we don't need to change the pte. */
        if (!is_accessed_spte(iter->old_spte))
                return false;

        if (spte_ad_enabled(iter->old_spte)) {
                iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
                                                         iter->old_spte,
                                                         shadow_accessed_mask,
                                                         iter->level);
                new_spte = iter->old_spte & ~shadow_accessed_mask;
        } else {
                /*
                 * Capture the dirty status of the page, so that it doesn't get
                 * lost when the SPTE is marked for access tracking.
                 */
                if (is_writable_pte(iter->old_spte))
                        kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));

                new_spte = mark_spte_for_access_track(iter->old_spte);
                iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
                                                        iter->old_spte, new_spte,
                                                        iter->level);
        }

        trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
                                       iter->old_spte, new_spte);
        return true;
}

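/*
 * Two aging strategies are used above: with A/D bits enabled, the Accessed
 * bit is simply cleared in place; without A/D bits, the SPTE is converted
 * into an access-tracked SPTE via mark_spte_for_access_track(), presumably
 * so that the next guest access faults and re-establishes the accessed
 * state.
 */
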
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
                         struct kvm_gfn_range *range)
{
        return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
                         struct kvm_gfn_range *range)
{
        u64 new_spte;

        /* Huge pages aren't expected to be modified without first being zapped. */
        WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);

        if (iter->level != PG_LEVEL_4K ||
            !is_shadow_present_pte(iter->old_spte))
                return false;

        /*
         * Note, when changing a read-only SPTE, it's not strictly necessary to
         * zero the SPTE before setting the new PFN, but doing so preserves the
         * invariant that the PFN of a present leaf SPTE can never change.
         * See handle_changed_spte().
         */
        tdp_mmu_iter_set_spte(kvm, iter, 0);

        if (!pte_write(range->arg.pte)) {
                new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
                                                                  pte_pfn(range->arg.pte));

                tdp_mmu_iter_set_spte(kvm, iter, new_spte);
        }

        return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        /*
         * No need to handle the remote TLB flush under RCU protection, the
         * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
         * shadow page. See the WARN on pfn_changed in handle_changed_spte().
         */
        return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                             gfn_t start, gfn_t end, int min_level)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        rcu_read_lock();

        BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

        for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;

                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level) ||
                    !(iter.old_spte & PT_WRITABLE_MASK))
                        continue;

                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

                if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
                        goto retry;

                spte_set = true;
        }

        rcu_read_unlock();
        return spte_set;
}

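/*
 * The retry label above re-processes the same SPTE when
 * tdp_mmu_set_spte_atomic() loses a race with another task; the failed
 * update is assumed to have refreshed iter.old_spte, so the write-protected
 * value is recomputed from the current contents rather than a stale copy.
 */
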
/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
                             const struct kvm_memory_slot *slot, int min_level)
{
        struct kvm_mmu_page *root;
        bool spte_set = false;

        lockdep_assert_held_read(&kvm->mmu_lock);

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                                             slot->base_gfn + slot->npages, min_level);

        return spte_set;
}

static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
        struct kvm_mmu_page *sp;

        gfp |= __GFP_ZERO;

        sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
        if (!sp)
                return NULL;

        sp->spt = (void *)__get_free_page(gfp);
        if (!sp->spt) {
                kmem_cache_free(mmu_page_header_cache, sp);
                return NULL;
        }

        return sp;
}

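/*
 * The allocation helper below may drop mmu_lock (and the RCU read lock) in
 * order to retry the allocation with reclaim allowed.  When that happens it
 * sets iter->yielded so the caller's loop (see
 * tdp_mmu_split_huge_pages_root()) knows the lock was dropped and the
 * current iterator position can no longer be trusted, and moves on instead
 * of immediately splitting with stale state.
 */
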
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
                                                       struct tdp_iter *iter,
                                                       bool shared)
{
        struct kvm_mmu_page *sp;

        /*
         * Since we are allocating while under the MMU lock we have to be
         * careful about GFP flags.  Use GFP_NOWAIT to avoid blocking on direct
         * reclaim and to avoid making any filesystem callbacks (which can end
         * up invoking KVM MMU notifiers, resulting in a deadlock).
         *
         * If this allocation fails we drop the lock and retry with reclaim
         * allowed.
         */
        sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
        if (sp)
                return sp;

        rcu_read_unlock();

        if (shared)
                read_unlock(&kvm->mmu_lock);
        else
                write_unlock(&kvm->mmu_lock);

        iter->yielded = true;
        sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

        if (shared)
                read_lock(&kvm->mmu_lock);
        else
                write_lock(&kvm->mmu_lock);

        rcu_read_lock();

        return sp;
}

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
                                   struct kvm_mmu_page *sp, bool shared)
{
        const u64 huge_spte = iter->old_spte;
        const int level = iter->level;
        int ret, i;

        /*
         * No need for atomics when writing to sp->spt since the page table has
         * not been linked in yet and thus is not reachable from any other CPU.
         */
        for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
                sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);

        /*
         * Replace the huge spte with a pointer to the populated lower level
         * page table.  Since we are making this change without a TLB flush vCPUs
         * will see a mix of the split mappings and the original huge mapping,
         * depending on what's currently in their TLB.  This is fine from a
         * correctness standpoint since the translation will be the same either
         * way.
         */
        ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
        if (ret)
                goto out;

        /*
         * tdp_mmu_link_sp() will handle subtracting the huge page we are
         * overwriting from the page stats.  But we have to manually update
         * the page stats with the new present child pages.
         */
        kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
        trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
        return ret;
}

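/*
 * By way of example for the stats update above: splitting a 1GiB SPTE fills
 * the new table with SPTE_ENT_PER_PAGE (512) 2MiB entries, so
 * kvm_update_page_stats() adds 512 pages at level - 1, while linking the new
 * table accounts for the single huge page being removed.
 */
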
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
                                         struct kvm_mmu_page *root,
                                         gfn_t start, gfn_t end,
                                         int target_level, bool shared)
{
        struct kvm_mmu_page *sp = NULL;
        struct tdp_iter iter;
        int ret = 0;

        rcu_read_lock();

        /*
         * Traverse the page table splitting all huge pages above the target
         * level into one lower level.  For example, if we encounter a 1GB page
         * we split it into 512 2MB pages.
         *
         * Since the TDP iterator uses a pre-order traversal, we are guaranteed
         * to visit an SPTE before ever visiting its children, which means we
         * will correctly recursively split huge pages that are more than one
         * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
         * and then splitting each of those to 512 4KB pages).
         */
        for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
                        continue;

                if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
                        continue;

                if (!sp) {
                        sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
                        if (!sp) {
                                ret = -ENOMEM;
                                trace_kvm_mmu_split_huge_page(iter.gfn,
                                                              iter.old_spte,
                                                              iter.level, ret);
                                break;
                        }

                        if (iter.yielded)
                                continue;
                }

                tdp_mmu_init_child_sp(sp, &iter);

                if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
                        goto retry;

                sp = NULL;
        }

        rcu_read_unlock();

        /*
         * It's possible to exit the loop having never used the last sp if, for
         * example, a vCPU doing HugePage NX splitting wins the race and
         * installs its own sp in place of the last sp we tried to split.
         */
        if (sp)
                tdp_mmu_free_sp(sp);

        return ret;
}

/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                      const struct kvm_memory_slot *slot,
                                      gfn_t start, gfn_t end,
                                      int target_level, bool shared)
{
        struct kvm_mmu_page *root;
        int r = 0;

        kvm_lockdep_assert_mmu_lock_held(kvm, shared);

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
                r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
                if (r) {
                        kvm_tdp_mmu_put_root(kvm, root, shared);
                        break;
                }
        }
}

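/*
 * On failure the loop above bails out early; kvm_tdp_mmu_put_root() drops
 * the reference the yield-safe iterator took on the current root so that
 * breaking out of the walk does not leak that reference.
 */
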
/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t start, gfn_t end)
{
        u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
        struct tdp_iter iter;
        bool spte_set = false;

        rcu_read_lock();

        tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;

                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                KVM_MMU_WARN_ON(kvm_ad_enabled() &&
                                spte_ad_need_write_protect(iter.old_spte));

                if (!(iter.old_spte & dbit))
                        continue;

                if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
                        goto retry;

                spte_set = true;
        }

        rcu_read_unlock();
        return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
                                  const struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        bool spte_set = false;

        lockdep_assert_held_read(&kvm->mmu_lock);

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                                  slot->base_gfn + slot->npages);

        return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t gfn, unsigned long mask, bool wrprot)
{
        u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
                                                   shadow_dirty_mask;
        struct tdp_iter iter;

        lockdep_assert_held_write(&kvm->mmu_lock);

        rcu_read_lock();

        tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
                                   gfn + BITS_PER_LONG) {
                if (!mask)
                        break;

                KVM_MMU_WARN_ON(kvm_ad_enabled() &&
                                spte_ad_need_write_protect(iter.old_spte));

                if (iter.level > PG_LEVEL_4K ||
                    !(mask & (1UL << (iter.gfn - gfn))))
                        continue;

                mask &= ~(1UL << (iter.gfn - gfn));

                if (!(iter.old_spte & dbit))
                        continue;

                iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
                                                        iter.old_spte, dbit,
                                                        iter.level);

                trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
                                               iter.old_spte,
                                               iter.old_spte & ~dbit);
                kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
        }

        rcu_read_unlock();
}

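/*
 * Worked example for the mask handling above: with gfn = 0x1000 and
 * mask = 0b1010, the walk starts at GFN 0x1001 (gfn + __ffs(mask)) and only
 * acts on the 4K SPTEs for GFNs 0x1001 and 0x1003, since bit i of @mask
 * corresponds to gfn + i; the loop stops early once @mask is exhausted.
 */
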
/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot)
{
        struct kvm_mmu_page *root;

        for_each_tdp_mmu_root(kvm, root, slot->as_id)
                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

static void zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
                                       const struct kvm_memory_slot *slot)
{
        gfn_t start = slot->base_gfn;
        gfn_t end = start + slot->npages;
        struct tdp_iter iter;
        int max_mapping_level;

        rcu_read_lock();

        for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;

                if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
                    !is_shadow_present_pte(iter.old_spte))
                        continue;

                /*
                 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
                 * a large page size, then its parent would have been zapped
                 * instead of stepping down.
                 */
                if (is_last_spte(iter.old_spte, iter.level))
                        continue;

                /*
                 * If iter.gfn resides outside of the slot, i.e. the page for
                 * the current level overlaps but is not contained by the slot,
                 * then the SPTE can't be made huge.  More importantly, trying
                 * to query that info from slot->arch.lpage_info will cause an
                 * out-of-bounds access.
                 */
                if (iter.gfn < start || iter.gfn >= end)
                        continue;

                max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
                                                              iter.gfn, PG_LEVEL_NUM);
                if (max_mapping_level < iter.level)
                        continue;

                /* Note, a successful atomic zap also does a remote TLB flush. */
                if (tdp_mmu_zap_spte_atomic(kvm, &iter))
                        goto retry;
        }

        rcu_read_unlock();
}

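/*
 * Zapping a non-leaf SPTE above tears down the entire sub-tree beneath it;
 * nothing is re-mapped eagerly.  The expectation is that the next guest
 * access faults and kvm_tdp_mmu_map() then installs a huge mapping, e.g.
 * after dirty logging has been disabled for the slot.
 */
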
/*
 * Zap non-leaf SPTEs (and free their associated page tables) which could
 * be replaced by huge pages, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                       const struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;

        lockdep_assert_held_read(&kvm->mmu_lock);

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                              gfn_t gfn, int min_level)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

        rcu_read_lock();

        for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                new_spte = iter.old_spte &
                        ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

                if (new_spte == iter.old_spte)
                        break;

                tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
                spte_set = true;
        }

        rcu_read_unlock();

        return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn,
                                   int min_level)
{
        struct kvm_mmu_page *root;
        bool spte_set = false;

        lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root, slot->as_id)
                spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

        return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
                         int *root_level)
{
        struct tdp_iter iter;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        gfn_t gfn = addr >> PAGE_SHIFT;
        int leaf = -1;

        *root_level = vcpu->arch.mmu->root_role.level;

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }

        return leaf;
}

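/*
 * The sptes[] array above is indexed by level: e.g. with a 4-level root, a
 * walk that terminates at a non-present 2MiB entry fills sptes[4..2] and
 * returns 2, leaving sptes[2] as the terminal (non-present) SPTE for the
 * caller to inspect.
 */
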
/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
                                        u64 *spte)
{
        struct tdp_iter iter;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        gfn_t gfn = addr >> PAGE_SHIFT;
        tdp_ptep_t sptep = NULL;

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                *spte = iter.old_spte;
                sptep = iter.sptep;
        }

        /*
         * Perform the rcu_dereference to get the raw spte pointer value since
         * we are passing it up to fast_page_fault, which is shared with the
         * legacy MMU and thus does not retain the TDP MMU-specific __rcu
         * annotation.
         *
         * This is safe since fast_page_fault obeys the contracts of this
         * function as well as all TDP MMU contracts around modifying SPTEs
         * outside of mmu_lock.
         */
        return rcu_dereference(sptep);
}