// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
#include "mmu_internal.h"
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "smm.h"
#include "kvm_emulate.h"
#include "page_track.h"
#include "cpuid.h"
#include "spte.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
#include <linux/kstrtox.h>
#include <linux/kthread.h>

#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/vmx.h>

#include "trace.h"

extern bool itlb_multihit_kvm_mitigation;

static bool nx_hugepage_mitigation_hard_disabled;

int __read_mostly nx_huge_pages = -1;
static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif

static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);

static const struct kernel_param_ops nx_huge_pages_ops = {
	.set = set_nx_huge_pages,
	.get = get_nx_huge_pages,
};

static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
	.set = set_nx_huge_pages_recovery_param,
	.get = param_get_uint,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
		&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
		&nx_huge_pages_recovery_period_ms, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");

static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);

/*
 * Setting this variable to true enables Two-Dimensional Paging, where the
 * hardware walks two page tables:
 * 1. the guest-virtual to guest-physical translation
 * 2. while doing 1., the guest-physical to host-physical translation
 * If the hardware supports that, we don't need to do shadow paging.
 */
bool tdp_enabled = false;

static bool __ro_after_init tdp_mmu_allowed;

#ifdef CONFIG_X86_64
bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
#endif

static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;

#define PTE_PREFETCH_NUM		8

#include <trace/events/kvm.h>

/* make pte_list_desc fit well in cache lines */
#define PTE_LIST_EXT 14

/*
 * struct pte_list_desc is the core data structure used to implement a custom
 * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
 * given GFN when used in the context of rmaps.  Using a custom list allows KVM
 * to optimize for the common case where many GFNs will have at most a handful
 * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
 * memory footprint, which in turn improves runtime performance by exploiting
 * cache locality.
 *
 * A list is comprised of one or more pte_list_desc objects (descriptors).
 * Each individual descriptor stores up to PTE_LIST_EXT SPTEs.  If a descriptor
 * is full and a new SPTE needs to be added, a new descriptor is allocated and
 * becomes the head of the list.  This means that, by definition, all tail
 * descriptors are full.
 *
 * Note, the metadata fields are deliberately placed at the start of the
 * structure to optimize the cacheline layout; accessing the descriptor will
 * touch only a single cacheline so long as @spte_count<=6 (or if only the
 * descriptor's metadata is accessed).
 */
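
/*
 * Worked example (illustrative): with PTE_LIST_EXT == 14, a GFN mapped by
 * 20 SPTEs is tracked by two descriptors.  The first 14 SPTEs fill one
 * descriptor; adding the 15th allocates a new head, so the old descriptor
 * becomes a (full) tail and the head ends up with spte_count == 6 and
 * tail_count == 14, i.e. pte_list_count() == 6 + 14 == 20.
 */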
struct pte_list_desc {
	struct pte_list_desc *more;
	/* The number of PTEs stored in _this_ descriptor. */
	u32 spte_count;
	/* The number of PTEs stored in all tails of this descriptor. */
	u32 tail_count;
	u64 *sptes[PTE_LIST_EXT];
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
					 (_root), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static void mmu_spte_set(u64 *sptep, u64 spte);

struct kvm_mmu_role_regs {
	const unsigned long cr0;
	const unsigned long cr4;
	const u64 efer;
};

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the root_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
static inline bool __maybe_unused					\
____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
{									\
	return !!(regs->reg & flag);					\
}
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);

/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)			\
static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
{									\
	return !!(mmu->cpu_role. base_or_ext . reg##_##name);		\
}
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
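
/*
 * For illustration, BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp) above expands to
 * roughly:
 *
 *	static inline bool is_cr0_wp(struct kvm_mmu *mmu)
 *	{
 *		return !!(mmu->cpu_role.base.cr0_wp);
 *	}
 *
 * i.e. these helpers read the captured role bits, not live vCPU registers.
 */
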
static inline bool is_cr0_pg(struct kvm_mmu *mmu)
{
	return mmu->cpu_role.base.level > 0;
}

static inline bool is_cr4_pae(struct kvm_mmu *mmu)
{
	return !mmu->cpu_role.base.has_4_byte_gpte;
}

static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_role_regs regs = {
		.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
		.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
		.efer = vcpu->arch.efer,
	};

	return regs;
}

static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
{
	return kvm_read_cr3(vcpu);
}

static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
						  struct kvm_mmu *mmu)
{
	if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
		return kvm_read_cr3(vcpu);

	return mmu->get_guest_pgd(vcpu);
}

static inline bool kvm_available_flush_remote_tlbs_range(void)
{
	return kvm_x86_ops.flush_remote_tlbs_range;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
	if (!kvm_x86_ops.flush_remote_tlbs_range)
		return -EOPNOTSUPP;

	return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
}

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);

/* Flush the range of guest memory mapped by the given SPTE. */
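/*
 * For example (illustrative), zapping a level-2 (2MiB) SPTE flushes the
 * 512-GFN range starting at the gfn recorded for that SPTE's index in its
 * parent shadow page, via kvm_flush_remote_tlbs_gfn() below.
 */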
static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));

	kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned int access)
{
	u64 spte = make_mmio_spte(vcpu, gfn, access);

	trace_mark_mmio_spte(sptep, gfn, spte);
	mmu_spte_set(sptep, spte);
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	return spte & shadow_mmio_access_mask;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
#else
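/*
 * On 32-bit hosts a 64-bit SPTE cannot be read or written atomically, so the
 * code below (illustrative summary) accesses it as two 32-bit halves: writers
 * order the halves so that the present bit in the low word only becomes
 * visible together with a consistent high word, and lockless readers retry
 * when they may have observed a torn value (see __get_spte_lockless() and
 * clear_spte_count).
 */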
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, we should store
	 * the high bits first and only then set the present bit, so the CPU
	 * cannot fetch this spte while we are setting it.
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If we map the spte from present to nonpresent, we should clear
	 * the present bit first so that the vCPU cannot fetch the old high
	 * bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}

/*
 * The idea of reading the spte this lightweight way on x86_32 comes from
 * gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmap
 * coalesces them and we are running outside of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present sptes),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
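/*
 * Illustrative interleaving: a reader samples clear_spte_count, then the low
 * word, then the high word.  If a writer completed a present->non-present
 * transition in between (bumping clear_spte_count) or changed the low word,
 * the reader cannot trust the (low, high) pair it assembled and retries.
 */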
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
		     count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON_ONCE(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
	check_spte_writable_invariants(new_spte);

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}
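
/*
 * Note: the "fast" path above is a plain write and is used only when
 * spte_has_volatile_bits() reports that no volatile bits (e.g. Accessed or
 * Dirty bits that hardware may set concurrently) can change under us;
 * otherwise the xchg-based slow path is used so that the returned old SPTE
 * is exact and no concurrent Accessed/Dirty updates are lost.
 */
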
/* Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is not allowed to change.
 *
 * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
 * TLBs must be flushed.  Otherwise rmap_write_protect will find a read-only
 * spte, even though the writable spte might be cached on a CPU's TLB.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	/*
	 * Updating the spte is safe even when it is done outside of mmu_lock,
	 * since we always update it atomically, see the comments in
	 * spte_has_volatile_bits().
	 */
	if (is_mmu_writable_spte(old_spte) &&
	    !is_writable_pte(new_spte))
		flush = true;

	/*
	 * Flush TLB when accessed/dirty states are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}

/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent and tracks the
 * state bits; it is used to clear the last-level sptep.
 * Returns the old PTE.
 */
static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;
	int level = sptep_to_sp(sptep)->role.level;
	struct page *page;

	if (!is_shadow_present_pte(old_spte) ||
	    !spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return old_spte;

	kvm_update_page_stats(kvm, level, -1);

	pfn = spte_to_pfn(old_spte);

	/*
	 * KVM doesn't hold a reference to any pages mapped into the guest, and
	 * instead uses the mmu_notifier to ensure that KVM unmaps any pages
	 * before they are reclaimed.
	 * Sanity check that, if the pfn is backed by a refcounted page, the
	 * refcount is elevated.
	 */
	page = kvm_pfn_to_refcounted_page(pfn);
	WARN_ON_ONCE(page && !page_count(page));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return old_spte;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about its state bits;
 * it is used to clear upper-level sptes.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}
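
/*
 * Note on the two aging paths above: with A/D bits enabled the Accessed bit
 * is simply cleared in place; without A/D bits the SPTE is converted to an
 * access-tracking SPTE via mark_spte_for_access_track(), so the next guest
 * access faults and re-establishes the Accessed state.
 */
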
static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
{
	return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_active(vcpu)) {
		kvm_tdp_mmu_walk_lockless_begin();
	} else {
		/*
		 * Prevent page table teardown by making any free-er wait during
		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
		 */
		local_irq_disable();

		/*
		 * Make sure a following spte read is not reordered ahead of the write
		 * to vcpu->mode.
		 */
		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
	}
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_active(vcpu)) {
		kvm_tdp_mmu_walk_lockless_end();
	} else {
		/*
		 * Make sure the write to vcpu->mode is not reordered in front of
		 * reads to sptes.  If it is, kvm_mmu_commit_zap_page() can see us
		 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
		 */
		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
		local_irq_enable();
	}
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
	int r;

	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
	if (r)
		return r;
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
				       PT64_ROOT_MAX_LEVEL);
	if (r)
		return r;
	if (maybe_indirect) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}
	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
					  PT64_ROOT_MAX_LEVEL);
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static bool sp_has_gptes(struct kvm_mmu_page *sp);
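
/*
 * Resolve the gfn mapped by the SPTE at @index in @sp: passthrough pages
 * simply return sp->gfn, indirect pages read it back from
 * shadowed_translation, and direct pages compute it from sp->gfn and the
 * index.  Illustrative example: for a direct level-2 sp with sp->gfn ==
 * 0x1000, index 3 covers gfn 0x1000 + 3 * 512 = 0x1600 (SPTE_LEVEL_BITS == 9).
 */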
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (sp->role.passthrough)
		return sp->gfn;

	if (!sp->role.direct)
		return sp->shadowed_translation[index] >> PAGE_SHIFT;

	return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
}

/*
 * For leaf SPTEs, fetch the *guest* access permissions being shadowed.  Note
 * that the SPTE itself may have more constrained access permissions than
 * what the guest enforces.  For example, a guest may create an executable
 * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
 */
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
	if (sp_has_gptes(sp))
		return sp->shadowed_translation[index] & ACC_ALL;

	/*
	 * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
	 * KVM is not shadowing any guest page tables, so the "guest access
	 * permissions" are just ACC_ALL.
	 *
	 * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
	 * is shadowing a guest huge page with small pages, the guest access
	 * permissions being shadowed are the access permissions of the huge
	 * page.
	 *
	 * In both cases, sp->role.access contains the correct access bits.
	 */
	return sp->role.access;
}

static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
					 gfn_t gfn, unsigned int access)
{
	if (sp_has_gptes(sp)) {
		/* The entry packs the translation as (gfn << PAGE_SHIFT) | access. */
		sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
		return;
	}

	WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
		  "access mismatch under %s page %llx (expected %u, got %u)\n",
		  sp->role.passthrough ? "passthrough" : "direct",
		  sp->gfn, kvm_mmu_page_get_access(sp, index), access);

	WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
		  "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
		  sp->role.passthrough ? "passthrough" : "direct",
		  sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
}

static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
				    unsigned int access)
{
	gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);

	kvm_mmu_page_set_translation(sp, index, gfn, access);
}

/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
		const struct kvm_memory_slot *slot, int level)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.lpage_info[level - 2][idx];
}

static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
					    gfn_t gfn, int count)
{
	struct kvm_lpage_info *linfo;
	int i;

	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->disallow_lpage += count;
		WARN_ON_ONCE(linfo->disallow_lpage < 0);
	}
}

void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, 1);
}

void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, -1);
}
static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	/* The gfns of non-leaf shadow pages are kept read-only via write tracking. */
	if (sp->role.level > PG_LEVEL_4K)
		return __kvm_write_track_add_gfn(kvm, slot, gfn);

	kvm_mmu_gfn_disallow_lpage(slot, gfn);

	if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}

void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	/*
	 * If it's possible to replace the shadow page with an NX huge page,
	 * i.e. if the shadow page is the only thing currently preventing KVM
	 * from using a huge page, add the shadow page to the list of "to be
	 * zapped for NX recovery" pages.  Note, the shadow page can already be
	 * on the list if KVM is reusing an existing shadow page, i.e. if KVM
	 * links a shadow page at multiple points.
	 */
	if (!list_empty(&sp->possible_nx_huge_page_link))
		return;

	++kvm->stat.nx_lpage_splits;
	list_add_tail(&sp->possible_nx_huge_page_link,
		      &kvm->arch.possible_nx_huge_pages);
}

static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				 bool nx_huge_page_possible)
{
	sp->nx_huge_page_disallowed = true;

	if (nx_huge_page_possible)
		track_possible_nx_huge_page(kvm, sp);
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages--;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	if (sp->role.level > PG_LEVEL_4K)
		return __kvm_write_track_remove_gfn(kvm, slot, gfn);

	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	if (list_empty(&sp->possible_nx_huge_page_link))
		return;

	--kvm->stat.nx_lpage_splits;
	list_del_init(&sp->possible_nx_huge_page_link);
}

static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	sp->nx_huge_page_disallowed = false;

	untrack_possible_nx_huge_page(kvm, sp);
}
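
/*
 * Note: nx_lpage_splits counts shadow pages that are currently the only
 * reason a huge page cannot be used (the "possible NX huge pages" list
 * above).  The nx_huge_pages_recovery_ratio/period module parameters near
 * the top of this file control how aggressively a recovery worker zaps
 * entries from that list so the huge pages can be reinstated.
 */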

static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
							    gfn_t gfn,
							    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return NULL;
	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
		return NULL;

	return slot;
}

/*
 * About rmap_head encoding:
 *
 * If bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain.  Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
			struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int count = 0;

	if (!rmap_head->val) {
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & 1)) {
		desc = kvm_mmu_memory_cache_alloc(cache);
		desc->sptes[0] = (u64 *)rmap_head->val;
		desc->sptes[1] = spte;
		desc->spte_count = 2;
		desc->tail_count = 0;
		rmap_head->val = (unsigned long)desc | 1;
		++count;
	} else {
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		count = desc->tail_count + desc->spte_count;

		/*
		 * If the previous head is full, allocate a new head descriptor
		 * as tail descriptors are always kept full.
		 */
		if (desc->spte_count == PTE_LIST_EXT) {
			desc = kvm_mmu_memory_cache_alloc(cache);
			desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
			desc->spte_count = 0;
			desc->tail_count = count;
			rmap_head->val = (unsigned long)desc | 1;
		}
		desc->sptes[desc->spte_count++] = spte;
	}
	return count;
}

static void pte_list_desc_remove_entry(struct kvm *kvm,
				       struct kvm_rmap_head *rmap_head,
				       struct pte_list_desc *desc, int i)
{
	struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	int j = head_desc->spte_count - 1;

	/*
	 * The head descriptor should never be empty.  A new head is added only
	 * when adding an entry and the previous head is full, and heads are
	 * removed (this flow) when they become empty.
	 */
	KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);

	/*
	 * Replace the to-be-freed SPTE with the last valid entry from the head
	 * descriptor to ensure that tail descriptors are full at all times.
	 * Note, this also means that tail_count is stable for each descriptor.
	 */
	desc->sptes[i] = head_desc->sptes[j];
	head_desc->sptes[j] = NULL;
	head_desc->spte_count--;
	if (head_desc->spte_count)
		return;

	/*
	 * The head descriptor is empty.  If there are no tail descriptors,
	 * nullify the rmap head to mark the list as empty, else point the rmap
	 * head at the next descriptor, i.e. the new head.
	 */
	if (!head_desc->more)
		rmap_head->val = 0;
	else
		rmap_head->val = (unsigned long)head_desc->more | 1;
	mmu_free_pte_list_desc(head_desc);
}

static void pte_list_remove(struct kvm *kvm, u64 *spte,
			    struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int i;

	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
		return;

	if (!(rmap_head->val & 1)) {
		if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
			return;

		rmap_head->val = 0;
	} else {
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		while (desc) {
			for (i = 0; i < desc->spte_count; ++i) {
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(kvm, rmap_head,
								   desc, i);
					return;
				}
			}
			desc = desc->more;
		}

		KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
	}
}

static void kvm_zap_one_rmap_spte(struct kvm *kvm,
				  struct kvm_rmap_head *rmap_head, u64 *sptep)
{
	mmu_spte_clear_track_bits(kvm, sptep);
	pte_list_remove(kvm, sptep, rmap_head);
}

/* Return true if at least one SPTE was zapped, false otherwise */
static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
				   struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc, *next;
	int i;

	if (!rmap_head->val)
		return false;

	if (!(rmap_head->val & 1)) {
		mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
		goto out;
	}

	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);

	for (; desc; desc = next) {
		for (i = 0; i < desc->spte_count; i++)
			mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
		next = desc->more;
		mmu_free_pte_list_desc(desc);
	}
out:
	/* rmap_head is meaningless now, remember to reset it */
	rmap_head->val = 0;
106462306a36Sopenharmony_ci return true; 106562306a36Sopenharmony_ci} 106662306a36Sopenharmony_ci 106762306a36Sopenharmony_ciunsigned int pte_list_count(struct kvm_rmap_head *rmap_head) 106862306a36Sopenharmony_ci{ 106962306a36Sopenharmony_ci struct pte_list_desc *desc; 107062306a36Sopenharmony_ci 107162306a36Sopenharmony_ci if (!rmap_head->val) 107262306a36Sopenharmony_ci return 0; 107362306a36Sopenharmony_ci else if (!(rmap_head->val & 1)) 107462306a36Sopenharmony_ci return 1; 107562306a36Sopenharmony_ci 107662306a36Sopenharmony_ci desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 107762306a36Sopenharmony_ci return desc->tail_count + desc->spte_count; 107862306a36Sopenharmony_ci} 107962306a36Sopenharmony_ci 108062306a36Sopenharmony_cistatic struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, 108162306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 108262306a36Sopenharmony_ci{ 108362306a36Sopenharmony_ci unsigned long idx; 108462306a36Sopenharmony_ci 108562306a36Sopenharmony_ci idx = gfn_to_index(gfn, slot->base_gfn, level); 108662306a36Sopenharmony_ci return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; 108762306a36Sopenharmony_ci} 108862306a36Sopenharmony_ci 108962306a36Sopenharmony_cistatic void rmap_remove(struct kvm *kvm, u64 *spte) 109062306a36Sopenharmony_ci{ 109162306a36Sopenharmony_ci struct kvm_memslots *slots; 109262306a36Sopenharmony_ci struct kvm_memory_slot *slot; 109362306a36Sopenharmony_ci struct kvm_mmu_page *sp; 109462306a36Sopenharmony_ci gfn_t gfn; 109562306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head; 109662306a36Sopenharmony_ci 109762306a36Sopenharmony_ci sp = sptep_to_sp(spte); 109862306a36Sopenharmony_ci gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); 109962306a36Sopenharmony_ci 110062306a36Sopenharmony_ci /* 110162306a36Sopenharmony_ci * Unlike rmap_add, rmap_remove does not run in the context of a vCPU 110262306a36Sopenharmony_ci * so we have to determine which memslots to use based on context 110362306a36Sopenharmony_ci * information in sp->role. 110462306a36Sopenharmony_ci */ 110562306a36Sopenharmony_ci slots = kvm_memslots_for_spte_role(kvm, sp->role); 110662306a36Sopenharmony_ci 110762306a36Sopenharmony_ci slot = __gfn_to_memslot(slots, gfn); 110862306a36Sopenharmony_ci rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); 110962306a36Sopenharmony_ci 111062306a36Sopenharmony_ci pte_list_remove(kvm, spte, rmap_head); 111162306a36Sopenharmony_ci} 111262306a36Sopenharmony_ci 111362306a36Sopenharmony_ci/* 111462306a36Sopenharmony_ci * Used by the following functions to iterate through the sptes linked by a 111562306a36Sopenharmony_ci * rmap. All fields are private and not assumed to be used outside. 111662306a36Sopenharmony_ci */ 111762306a36Sopenharmony_cistruct rmap_iterator { 111862306a36Sopenharmony_ci /* private fields */ 111962306a36Sopenharmony_ci struct pte_list_desc *desc; /* holds the sptep if not NULL */ 112062306a36Sopenharmony_ci int pos; /* index of the sptep */ 112162306a36Sopenharmony_ci}; 112262306a36Sopenharmony_ci 112362306a36Sopenharmony_ci/* 112462306a36Sopenharmony_ci * Iteration must be started by this function. This should also be used after 112562306a36Sopenharmony_ci * removing/dropping sptes from the rmap link because in such cases the 112662306a36Sopenharmony_ci * information in the iterator may not be valid. 112762306a36Sopenharmony_ci * 112862306a36Sopenharmony_ci * Returns sptep if found, NULL otherwise. 
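 *
 * Most walks use the for_each_rmap_spte() macro defined below; a minimal
 * usage sketch:
 *
 *	struct rmap_iterator iter;
 *	u64 *sptep;
 *
 *	for_each_rmap_spte(rmap_head, &iter, sptep)
 *		... read or update *sptep ...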
112962306a36Sopenharmony_ci */ 113062306a36Sopenharmony_cistatic u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, 113162306a36Sopenharmony_ci struct rmap_iterator *iter) 113262306a36Sopenharmony_ci{ 113362306a36Sopenharmony_ci u64 *sptep; 113462306a36Sopenharmony_ci 113562306a36Sopenharmony_ci if (!rmap_head->val) 113662306a36Sopenharmony_ci return NULL; 113762306a36Sopenharmony_ci 113862306a36Sopenharmony_ci if (!(rmap_head->val & 1)) { 113962306a36Sopenharmony_ci iter->desc = NULL; 114062306a36Sopenharmony_ci sptep = (u64 *)rmap_head->val; 114162306a36Sopenharmony_ci goto out; 114262306a36Sopenharmony_ci } 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 114562306a36Sopenharmony_ci iter->pos = 0; 114662306a36Sopenharmony_ci sptep = iter->desc->sptes[iter->pos]; 114762306a36Sopenharmony_ciout: 114862306a36Sopenharmony_ci BUG_ON(!is_shadow_present_pte(*sptep)); 114962306a36Sopenharmony_ci return sptep; 115062306a36Sopenharmony_ci} 115162306a36Sopenharmony_ci 115262306a36Sopenharmony_ci/* 115362306a36Sopenharmony_ci * Must be used with a valid iterator: e.g. after rmap_get_first(). 115462306a36Sopenharmony_ci * 115562306a36Sopenharmony_ci * Returns sptep if found, NULL otherwise. 115662306a36Sopenharmony_ci */ 115762306a36Sopenharmony_cistatic u64 *rmap_get_next(struct rmap_iterator *iter) 115862306a36Sopenharmony_ci{ 115962306a36Sopenharmony_ci u64 *sptep; 116062306a36Sopenharmony_ci 116162306a36Sopenharmony_ci if (iter->desc) { 116262306a36Sopenharmony_ci if (iter->pos < PTE_LIST_EXT - 1) { 116362306a36Sopenharmony_ci ++iter->pos; 116462306a36Sopenharmony_ci sptep = iter->desc->sptes[iter->pos]; 116562306a36Sopenharmony_ci if (sptep) 116662306a36Sopenharmony_ci goto out; 116762306a36Sopenharmony_ci } 116862306a36Sopenharmony_ci 116962306a36Sopenharmony_ci iter->desc = iter->desc->more; 117062306a36Sopenharmony_ci 117162306a36Sopenharmony_ci if (iter->desc) { 117262306a36Sopenharmony_ci iter->pos = 0; 117362306a36Sopenharmony_ci /* desc->sptes[0] cannot be NULL */ 117462306a36Sopenharmony_ci sptep = iter->desc->sptes[iter->pos]; 117562306a36Sopenharmony_ci goto out; 117662306a36Sopenharmony_ci } 117762306a36Sopenharmony_ci } 117862306a36Sopenharmony_ci 117962306a36Sopenharmony_ci return NULL; 118062306a36Sopenharmony_ciout: 118162306a36Sopenharmony_ci BUG_ON(!is_shadow_present_pte(*sptep)); 118262306a36Sopenharmony_ci return sptep; 118362306a36Sopenharmony_ci} 118462306a36Sopenharmony_ci 118562306a36Sopenharmony_ci#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ 118662306a36Sopenharmony_ci for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ 118762306a36Sopenharmony_ci _spte_; _spte_ = rmap_get_next(_iter_)) 118862306a36Sopenharmony_ci 118962306a36Sopenharmony_cistatic void drop_spte(struct kvm *kvm, u64 *sptep) 119062306a36Sopenharmony_ci{ 119162306a36Sopenharmony_ci u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); 119262306a36Sopenharmony_ci 119362306a36Sopenharmony_ci if (is_shadow_present_pte(old_spte)) 119462306a36Sopenharmony_ci rmap_remove(kvm, sptep); 119562306a36Sopenharmony_ci} 119662306a36Sopenharmony_ci 119762306a36Sopenharmony_cistatic void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) 119862306a36Sopenharmony_ci{ 119962306a36Sopenharmony_ci struct kvm_mmu_page *sp; 120062306a36Sopenharmony_ci 120162306a36Sopenharmony_ci sp = sptep_to_sp(sptep); 120262306a36Sopenharmony_ci WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); 120362306a36Sopenharmony_ci 120462306a36Sopenharmony_ci 
drop_spte(kvm, sptep); 120562306a36Sopenharmony_ci 120662306a36Sopenharmony_ci if (flush) 120762306a36Sopenharmony_ci kvm_flush_remote_tlbs_sptep(kvm, sptep); 120862306a36Sopenharmony_ci} 120962306a36Sopenharmony_ci 121062306a36Sopenharmony_ci/* 121162306a36Sopenharmony_ci * Write-protect the specified @sptep; @pt_protect indicates whether 121262306a36Sopenharmony_ci * spte write-protection is caused by protecting a shadow page table. 121362306a36Sopenharmony_ci * 121462306a36Sopenharmony_ci * Note: write protection is different for dirty logging and spte 121562306a36Sopenharmony_ci * protection: 121662306a36Sopenharmony_ci * - for dirty logging, the spte can be set to writable at any time if 121762306a36Sopenharmony_ci * its dirty bitmap is properly set. 121862306a36Sopenharmony_ci * - for spte protection, the spte can be writable only after unsync-ing 121962306a36Sopenharmony_ci * the shadow page. 122062306a36Sopenharmony_ci * 122162306a36Sopenharmony_ci * Return true if the TLB needs to be flushed. 122262306a36Sopenharmony_ci */ 122362306a36Sopenharmony_cistatic bool spte_write_protect(u64 *sptep, bool pt_protect) 122462306a36Sopenharmony_ci{ 122562306a36Sopenharmony_ci u64 spte = *sptep; 122662306a36Sopenharmony_ci 122762306a36Sopenharmony_ci if (!is_writable_pte(spte) && 122862306a36Sopenharmony_ci !(pt_protect && is_mmu_writable_spte(spte))) 122962306a36Sopenharmony_ci return false; 123062306a36Sopenharmony_ci 123162306a36Sopenharmony_ci if (pt_protect) 123262306a36Sopenharmony_ci spte &= ~shadow_mmu_writable_mask; 123362306a36Sopenharmony_ci spte = spte & ~PT_WRITABLE_MASK; 123462306a36Sopenharmony_ci 123562306a36Sopenharmony_ci return mmu_spte_update(sptep, spte); 123662306a36Sopenharmony_ci} 123762306a36Sopenharmony_ci 123862306a36Sopenharmony_cistatic bool rmap_write_protect(struct kvm_rmap_head *rmap_head, 123962306a36Sopenharmony_ci bool pt_protect) 124062306a36Sopenharmony_ci{ 124162306a36Sopenharmony_ci u64 *sptep; 124262306a36Sopenharmony_ci struct rmap_iterator iter; 124362306a36Sopenharmony_ci bool flush = false; 124462306a36Sopenharmony_ci 124562306a36Sopenharmony_ci for_each_rmap_spte(rmap_head, &iter, sptep) 124662306a36Sopenharmony_ci flush |= spte_write_protect(sptep, pt_protect); 124762306a36Sopenharmony_ci 124862306a36Sopenharmony_ci return flush; 124962306a36Sopenharmony_ci} 125062306a36Sopenharmony_ci 125162306a36Sopenharmony_cistatic bool spte_clear_dirty(u64 *sptep) 125262306a36Sopenharmony_ci{ 125362306a36Sopenharmony_ci u64 spte = *sptep; 125462306a36Sopenharmony_ci 125562306a36Sopenharmony_ci KVM_MMU_WARN_ON(!spte_ad_enabled(spte)); 125662306a36Sopenharmony_ci spte &= ~shadow_dirty_mask; 125762306a36Sopenharmony_ci return mmu_spte_update(sptep, spte); 125862306a36Sopenharmony_ci} 125962306a36Sopenharmony_ci 126062306a36Sopenharmony_cistatic bool spte_wrprot_for_clear_dirty(u64 *sptep) 126162306a36Sopenharmony_ci{ 126262306a36Sopenharmony_ci bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, 126362306a36Sopenharmony_ci (unsigned long *)sptep); 126462306a36Sopenharmony_ci if (was_writable && !spte_ad_enabled(*sptep)) 126562306a36Sopenharmony_ci kvm_set_pfn_dirty(spte_to_pfn(*sptep)); 126662306a36Sopenharmony_ci 126762306a36Sopenharmony_ci return was_writable; 126862306a36Sopenharmony_ci} 126962306a36Sopenharmony_ci 127062306a36Sopenharmony_ci/* 127162306a36Sopenharmony_ci * Gets the GFN ready for another round of dirty logging by clearing the 127262306a36Sopenharmony_ci * - D bit on ad-enabled SPTEs, and 127362306a36Sopenharmony_ci * - W bit on ad-disabled 
SPTEs. 127462306a36Sopenharmony_ci * Returns true iff any D or W bits were cleared. 127562306a36Sopenharmony_ci */ 127662306a36Sopenharmony_cistatic bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 127762306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 127862306a36Sopenharmony_ci{ 127962306a36Sopenharmony_ci u64 *sptep; 128062306a36Sopenharmony_ci struct rmap_iterator iter; 128162306a36Sopenharmony_ci bool flush = false; 128262306a36Sopenharmony_ci 128362306a36Sopenharmony_ci for_each_rmap_spte(rmap_head, &iter, sptep) 128462306a36Sopenharmony_ci if (spte_ad_need_write_protect(*sptep)) 128562306a36Sopenharmony_ci flush |= spte_wrprot_for_clear_dirty(sptep); 128662306a36Sopenharmony_ci else 128762306a36Sopenharmony_ci flush |= spte_clear_dirty(sptep); 128862306a36Sopenharmony_ci 128962306a36Sopenharmony_ci return flush; 129062306a36Sopenharmony_ci} 129162306a36Sopenharmony_ci 129262306a36Sopenharmony_ci/** 129362306a36Sopenharmony_ci * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages 129462306a36Sopenharmony_ci * @kvm: kvm instance 129562306a36Sopenharmony_ci * @slot: slot to protect 129662306a36Sopenharmony_ci * @gfn_offset: start of the BITS_PER_LONG pages we care about 129762306a36Sopenharmony_ci * @mask: indicates which pages we should protect 129862306a36Sopenharmony_ci * 129962306a36Sopenharmony_ci * Used when we do not need to care about huge page mappings. 130062306a36Sopenharmony_ci */ 130162306a36Sopenharmony_cistatic void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 130262306a36Sopenharmony_ci struct kvm_memory_slot *slot, 130362306a36Sopenharmony_ci gfn_t gfn_offset, unsigned long mask) 130462306a36Sopenharmony_ci{ 130562306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head; 130662306a36Sopenharmony_ci 130762306a36Sopenharmony_ci if (tdp_mmu_enabled) 130862306a36Sopenharmony_ci kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, 130962306a36Sopenharmony_ci slot->base_gfn + gfn_offset, mask, true); 131062306a36Sopenharmony_ci 131162306a36Sopenharmony_ci if (!kvm_memslots_have_rmaps(kvm)) 131262306a36Sopenharmony_ci return; 131362306a36Sopenharmony_ci 131462306a36Sopenharmony_ci while (mask) { 131562306a36Sopenharmony_ci rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 131662306a36Sopenharmony_ci PG_LEVEL_4K, slot); 131762306a36Sopenharmony_ci rmap_write_protect(rmap_head, false); 131862306a36Sopenharmony_ci 131962306a36Sopenharmony_ci /* clear the first set bit */ 132062306a36Sopenharmony_ci mask &= mask - 1; 132162306a36Sopenharmony_ci } 132262306a36Sopenharmony_ci} 132362306a36Sopenharmony_ci 132462306a36Sopenharmony_ci/** 132562306a36Sopenharmony_ci * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write 132662306a36Sopenharmony_ci * protect the page if the D-bit isn't supported. 132762306a36Sopenharmony_ci * @kvm: kvm instance 132862306a36Sopenharmony_ci * @slot: slot to clear D-bit 132962306a36Sopenharmony_ci * @gfn_offset: start of the BITS_PER_LONG pages we care about 133062306a36Sopenharmony_ci * @mask: indicates which pages we should clear D-bit 133162306a36Sopenharmony_ci * 133262306a36Sopenharmony_ci * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. 
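 *
 * Bit N of @mask corresponds to the page at gfn (slot->base_gfn + gfn_offset
 * + N); e.g. mask == 0x5 covers only offsets 0 and 2. The loop below consumes
 * one set bit per iteration via mask &= mask - 1.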
133362306a36Sopenharmony_ci */ 133462306a36Sopenharmony_cistatic void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, 133562306a36Sopenharmony_ci struct kvm_memory_slot *slot, 133662306a36Sopenharmony_ci gfn_t gfn_offset, unsigned long mask) 133762306a36Sopenharmony_ci{ 133862306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head; 133962306a36Sopenharmony_ci 134062306a36Sopenharmony_ci if (tdp_mmu_enabled) 134162306a36Sopenharmony_ci kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, 134262306a36Sopenharmony_ci slot->base_gfn + gfn_offset, mask, false); 134362306a36Sopenharmony_ci 134462306a36Sopenharmony_ci if (!kvm_memslots_have_rmaps(kvm)) 134562306a36Sopenharmony_ci return; 134662306a36Sopenharmony_ci 134762306a36Sopenharmony_ci while (mask) { 134862306a36Sopenharmony_ci rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 134962306a36Sopenharmony_ci PG_LEVEL_4K, slot); 135062306a36Sopenharmony_ci __rmap_clear_dirty(kvm, rmap_head, slot); 135162306a36Sopenharmony_ci 135262306a36Sopenharmony_ci /* clear the first set bit */ 135362306a36Sopenharmony_ci mask &= mask - 1; 135462306a36Sopenharmony_ci } 135562306a36Sopenharmony_ci} 135662306a36Sopenharmony_ci 135762306a36Sopenharmony_ci/** 135862306a36Sopenharmony_ci * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 135962306a36Sopenharmony_ci * PT level pages. 136062306a36Sopenharmony_ci * 136162306a36Sopenharmony_ci * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 136262306a36Sopenharmony_ci * enable dirty logging for them. 136362306a36Sopenharmony_ci * 136462306a36Sopenharmony_ci * We need to care about huge page mappings: e.g. during dirty logging we may 136562306a36Sopenharmony_ci * have such mappings. 136662306a36Sopenharmony_ci */ 136762306a36Sopenharmony_civoid kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 136862306a36Sopenharmony_ci struct kvm_memory_slot *slot, 136962306a36Sopenharmony_ci gfn_t gfn_offset, unsigned long mask) 137062306a36Sopenharmony_ci{ 137162306a36Sopenharmony_ci /* 137262306a36Sopenharmony_ci * Huge pages are NOT write protected when we start dirty logging in 137362306a36Sopenharmony_ci * initially-all-set mode; must write protect them here so that they 137462306a36Sopenharmony_ci * are split to 4K on the first write. 137562306a36Sopenharmony_ci * 137662306a36Sopenharmony_ci * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn 137762306a36Sopenharmony_ci * of memslot has no such restriction, so the range can cross two large 137862306a36Sopenharmony_ci * pages. 137962306a36Sopenharmony_ci */ 138062306a36Sopenharmony_ci if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { 138162306a36Sopenharmony_ci gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); 138262306a36Sopenharmony_ci gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); 138362306a36Sopenharmony_ci 138462306a36Sopenharmony_ci if (READ_ONCE(eager_page_split)) 138562306a36Sopenharmony_ci kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K); 138662306a36Sopenharmony_ci 138762306a36Sopenharmony_ci kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); 138862306a36Sopenharmony_ci 138962306a36Sopenharmony_ci /* Cross two large pages? 
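	 * If so (per the ALIGN() comparison below), the large page containing
	 * end is write-protected as well.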
*/ 139062306a36Sopenharmony_ci if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != 139162306a36Sopenharmony_ci ALIGN(end << PAGE_SHIFT, PMD_SIZE)) 139262306a36Sopenharmony_ci kvm_mmu_slot_gfn_write_protect(kvm, slot, end, 139362306a36Sopenharmony_ci PG_LEVEL_2M); 139462306a36Sopenharmony_ci } 139562306a36Sopenharmony_ci 139662306a36Sopenharmony_ci /* Now handle 4K PTEs. */ 139762306a36Sopenharmony_ci if (kvm_x86_ops.cpu_dirty_log_size) 139862306a36Sopenharmony_ci kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); 139962306a36Sopenharmony_ci else 140062306a36Sopenharmony_ci kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 140162306a36Sopenharmony_ci} 140262306a36Sopenharmony_ci 140362306a36Sopenharmony_ciint kvm_cpu_dirty_log_size(void) 140462306a36Sopenharmony_ci{ 140562306a36Sopenharmony_ci return kvm_x86_ops.cpu_dirty_log_size; 140662306a36Sopenharmony_ci} 140762306a36Sopenharmony_ci 140862306a36Sopenharmony_cibool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, 140962306a36Sopenharmony_ci struct kvm_memory_slot *slot, u64 gfn, 141062306a36Sopenharmony_ci int min_level) 141162306a36Sopenharmony_ci{ 141262306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head; 141362306a36Sopenharmony_ci int i; 141462306a36Sopenharmony_ci bool write_protected = false; 141562306a36Sopenharmony_ci 141662306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) { 141762306a36Sopenharmony_ci for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { 141862306a36Sopenharmony_ci rmap_head = gfn_to_rmap(gfn, i, slot); 141962306a36Sopenharmony_ci write_protected |= rmap_write_protect(rmap_head, true); 142062306a36Sopenharmony_ci } 142162306a36Sopenharmony_ci } 142262306a36Sopenharmony_ci 142362306a36Sopenharmony_ci if (tdp_mmu_enabled) 142462306a36Sopenharmony_ci write_protected |= 142562306a36Sopenharmony_ci kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); 142662306a36Sopenharmony_ci 142762306a36Sopenharmony_ci return write_protected; 142862306a36Sopenharmony_ci} 142962306a36Sopenharmony_ci 143062306a36Sopenharmony_cistatic bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) 143162306a36Sopenharmony_ci{ 143262306a36Sopenharmony_ci struct kvm_memory_slot *slot; 143362306a36Sopenharmony_ci 143462306a36Sopenharmony_ci slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 143562306a36Sopenharmony_ci return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); 143662306a36Sopenharmony_ci} 143762306a36Sopenharmony_ci 143862306a36Sopenharmony_cistatic bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 143962306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 144062306a36Sopenharmony_ci{ 144162306a36Sopenharmony_ci return kvm_zap_all_rmap_sptes(kvm, rmap_head); 144262306a36Sopenharmony_ci} 144362306a36Sopenharmony_ci 144462306a36Sopenharmony_cistatic bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 144562306a36Sopenharmony_ci struct kvm_memory_slot *slot, gfn_t gfn, int level, 144662306a36Sopenharmony_ci pte_t unused) 144762306a36Sopenharmony_ci{ 144862306a36Sopenharmony_ci return __kvm_zap_rmap(kvm, rmap_head, slot); 144962306a36Sopenharmony_ci} 145062306a36Sopenharmony_ci 145162306a36Sopenharmony_cistatic bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 145262306a36Sopenharmony_ci struct kvm_memory_slot *slot, gfn_t gfn, int level, 145362306a36Sopenharmony_ci pte_t pte) 145462306a36Sopenharmony_ci{ 145562306a36Sopenharmony_ci u64 *sptep; 145662306a36Sopenharmony_ci struct rmap_iterator iter; 145762306a36Sopenharmony_ci 
bool need_flush = false; 145862306a36Sopenharmony_ci u64 new_spte; 145962306a36Sopenharmony_ci kvm_pfn_t new_pfn; 146062306a36Sopenharmony_ci 146162306a36Sopenharmony_ci WARN_ON_ONCE(pte_huge(pte)); 146262306a36Sopenharmony_ci new_pfn = pte_pfn(pte); 146362306a36Sopenharmony_ci 146462306a36Sopenharmony_cirestart: 146562306a36Sopenharmony_ci for_each_rmap_spte(rmap_head, &iter, sptep) { 146662306a36Sopenharmony_ci need_flush = true; 146762306a36Sopenharmony_ci 146862306a36Sopenharmony_ci if (pte_write(pte)) { 146962306a36Sopenharmony_ci kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); 147062306a36Sopenharmony_ci goto restart; 147162306a36Sopenharmony_ci } else { 147262306a36Sopenharmony_ci new_spte = kvm_mmu_changed_pte_notifier_make_spte( 147362306a36Sopenharmony_ci *sptep, new_pfn); 147462306a36Sopenharmony_ci 147562306a36Sopenharmony_ci mmu_spte_clear_track_bits(kvm, sptep); 147662306a36Sopenharmony_ci mmu_spte_set(sptep, new_spte); 147762306a36Sopenharmony_ci } 147862306a36Sopenharmony_ci } 147962306a36Sopenharmony_ci 148062306a36Sopenharmony_ci if (need_flush && kvm_available_flush_remote_tlbs_range()) { 148162306a36Sopenharmony_ci kvm_flush_remote_tlbs_gfn(kvm, gfn, level); 148262306a36Sopenharmony_ci return false; 148362306a36Sopenharmony_ci } 148462306a36Sopenharmony_ci 148562306a36Sopenharmony_ci return need_flush; 148662306a36Sopenharmony_ci} 148762306a36Sopenharmony_ci 148862306a36Sopenharmony_cistruct slot_rmap_walk_iterator { 148962306a36Sopenharmony_ci /* input fields. */ 149062306a36Sopenharmony_ci const struct kvm_memory_slot *slot; 149162306a36Sopenharmony_ci gfn_t start_gfn; 149262306a36Sopenharmony_ci gfn_t end_gfn; 149362306a36Sopenharmony_ci int start_level; 149462306a36Sopenharmony_ci int end_level; 149562306a36Sopenharmony_ci 149662306a36Sopenharmony_ci /* output fields. */ 149762306a36Sopenharmony_ci gfn_t gfn; 149862306a36Sopenharmony_ci struct kvm_rmap_head *rmap; 149962306a36Sopenharmony_ci int level; 150062306a36Sopenharmony_ci 150162306a36Sopenharmony_ci /* private field. 
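	 * end_rmap below is the last rmap visited at the current level; once
	 * the iterator moves past it, slot_rmap_walk_next() advances to the
	 * next level (or ends the walk).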
*/ 150262306a36Sopenharmony_ci struct kvm_rmap_head *end_rmap; 150362306a36Sopenharmony_ci}; 150462306a36Sopenharmony_ci 150562306a36Sopenharmony_cistatic void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, 150662306a36Sopenharmony_ci int level) 150762306a36Sopenharmony_ci{ 150862306a36Sopenharmony_ci iterator->level = level; 150962306a36Sopenharmony_ci iterator->gfn = iterator->start_gfn; 151062306a36Sopenharmony_ci iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); 151162306a36Sopenharmony_ci iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); 151262306a36Sopenharmony_ci} 151362306a36Sopenharmony_ci 151462306a36Sopenharmony_cistatic void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, 151562306a36Sopenharmony_ci const struct kvm_memory_slot *slot, 151662306a36Sopenharmony_ci int start_level, int end_level, 151762306a36Sopenharmony_ci gfn_t start_gfn, gfn_t end_gfn) 151862306a36Sopenharmony_ci{ 151962306a36Sopenharmony_ci iterator->slot = slot; 152062306a36Sopenharmony_ci iterator->start_level = start_level; 152162306a36Sopenharmony_ci iterator->end_level = end_level; 152262306a36Sopenharmony_ci iterator->start_gfn = start_gfn; 152362306a36Sopenharmony_ci iterator->end_gfn = end_gfn; 152462306a36Sopenharmony_ci 152562306a36Sopenharmony_ci rmap_walk_init_level(iterator, iterator->start_level); 152662306a36Sopenharmony_ci} 152762306a36Sopenharmony_ci 152862306a36Sopenharmony_cistatic bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) 152962306a36Sopenharmony_ci{ 153062306a36Sopenharmony_ci return !!iterator->rmap; 153162306a36Sopenharmony_ci} 153262306a36Sopenharmony_ci 153362306a36Sopenharmony_cistatic void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) 153462306a36Sopenharmony_ci{ 153562306a36Sopenharmony_ci while (++iterator->rmap <= iterator->end_rmap) { 153662306a36Sopenharmony_ci iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); 153762306a36Sopenharmony_ci 153862306a36Sopenharmony_ci if (iterator->rmap->val) 153962306a36Sopenharmony_ci return; 154062306a36Sopenharmony_ci } 154162306a36Sopenharmony_ci 154262306a36Sopenharmony_ci if (++iterator->level > iterator->end_level) { 154362306a36Sopenharmony_ci iterator->rmap = NULL; 154462306a36Sopenharmony_ci return; 154562306a36Sopenharmony_ci } 154662306a36Sopenharmony_ci 154762306a36Sopenharmony_ci rmap_walk_init_level(iterator, iterator->level); 154862306a36Sopenharmony_ci} 154962306a36Sopenharmony_ci 155062306a36Sopenharmony_ci#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ 155162306a36Sopenharmony_ci _start_gfn, _end_gfn, _iter_) \ 155262306a36Sopenharmony_ci for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ 155362306a36Sopenharmony_ci _end_level_, _start_gfn, _end_gfn); \ 155462306a36Sopenharmony_ci slot_rmap_walk_okay(_iter_); \ 155562306a36Sopenharmony_ci slot_rmap_walk_next(_iter_)) 155662306a36Sopenharmony_ci 155762306a36Sopenharmony_citypedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 155862306a36Sopenharmony_ci struct kvm_memory_slot *slot, gfn_t gfn, 155962306a36Sopenharmony_ci int level, pte_t pte); 156062306a36Sopenharmony_ci 156162306a36Sopenharmony_cistatic __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, 156262306a36Sopenharmony_ci struct kvm_gfn_range *range, 156362306a36Sopenharmony_ci rmap_handler_t handler) 156462306a36Sopenharmony_ci{ 156562306a36Sopenharmony_ci struct slot_rmap_walk_iterator iterator; 156662306a36Sopenharmony_ci bool 
ret = false; 156762306a36Sopenharmony_ci 156862306a36Sopenharmony_ci for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 156962306a36Sopenharmony_ci range->start, range->end - 1, &iterator) 157062306a36Sopenharmony_ci ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, 157162306a36Sopenharmony_ci iterator.level, range->arg.pte); 157262306a36Sopenharmony_ci 157362306a36Sopenharmony_ci return ret; 157462306a36Sopenharmony_ci} 157562306a36Sopenharmony_ci 157662306a36Sopenharmony_cibool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 157762306a36Sopenharmony_ci{ 157862306a36Sopenharmony_ci bool flush = false; 157962306a36Sopenharmony_ci 158062306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) 158162306a36Sopenharmony_ci flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); 158262306a36Sopenharmony_ci 158362306a36Sopenharmony_ci if (tdp_mmu_enabled) 158462306a36Sopenharmony_ci flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); 158562306a36Sopenharmony_ci 158662306a36Sopenharmony_ci if (kvm_x86_ops.set_apic_access_page_addr && 158762306a36Sopenharmony_ci range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) 158862306a36Sopenharmony_ci kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); 158962306a36Sopenharmony_ci 159062306a36Sopenharmony_ci return flush; 159162306a36Sopenharmony_ci} 159262306a36Sopenharmony_ci 159362306a36Sopenharmony_cibool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 159462306a36Sopenharmony_ci{ 159562306a36Sopenharmony_ci bool flush = false; 159662306a36Sopenharmony_ci 159762306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) 159862306a36Sopenharmony_ci flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); 159962306a36Sopenharmony_ci 160062306a36Sopenharmony_ci if (tdp_mmu_enabled) 160162306a36Sopenharmony_ci flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); 160262306a36Sopenharmony_ci 160362306a36Sopenharmony_ci return flush; 160462306a36Sopenharmony_ci} 160562306a36Sopenharmony_ci 160662306a36Sopenharmony_cistatic bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 160762306a36Sopenharmony_ci struct kvm_memory_slot *slot, gfn_t gfn, int level, 160862306a36Sopenharmony_ci pte_t unused) 160962306a36Sopenharmony_ci{ 161062306a36Sopenharmony_ci u64 *sptep; 161162306a36Sopenharmony_ci struct rmap_iterator iter; 161262306a36Sopenharmony_ci int young = 0; 161362306a36Sopenharmony_ci 161462306a36Sopenharmony_ci for_each_rmap_spte(rmap_head, &iter, sptep) 161562306a36Sopenharmony_ci young |= mmu_spte_age(sptep); 161662306a36Sopenharmony_ci 161762306a36Sopenharmony_ci return young; 161862306a36Sopenharmony_ci} 161962306a36Sopenharmony_ci 162062306a36Sopenharmony_cistatic bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 162162306a36Sopenharmony_ci struct kvm_memory_slot *slot, gfn_t gfn, 162262306a36Sopenharmony_ci int level, pte_t unused) 162362306a36Sopenharmony_ci{ 162462306a36Sopenharmony_ci u64 *sptep; 162562306a36Sopenharmony_ci struct rmap_iterator iter; 162662306a36Sopenharmony_ci 162762306a36Sopenharmony_ci for_each_rmap_spte(rmap_head, &iter, sptep) 162862306a36Sopenharmony_ci if (is_accessed_spte(*sptep)) 162962306a36Sopenharmony_ci return true; 163062306a36Sopenharmony_ci return false; 163162306a36Sopenharmony_ci} 163262306a36Sopenharmony_ci 163362306a36Sopenharmony_ci#define RMAP_RECYCLE_THRESHOLD 1000 163462306a36Sopenharmony_ci 163562306a36Sopenharmony_cistatic void __rmap_add(struct kvm *kvm, 163662306a36Sopenharmony_ci struct 
kvm_mmu_memory_cache *cache, 163762306a36Sopenharmony_ci const struct kvm_memory_slot *slot, 163862306a36Sopenharmony_ci u64 *spte, gfn_t gfn, unsigned int access) 163962306a36Sopenharmony_ci{ 164062306a36Sopenharmony_ci struct kvm_mmu_page *sp; 164162306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head; 164262306a36Sopenharmony_ci int rmap_count; 164362306a36Sopenharmony_ci 164462306a36Sopenharmony_ci sp = sptep_to_sp(spte); 164562306a36Sopenharmony_ci kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); 164662306a36Sopenharmony_ci kvm_update_page_stats(kvm, sp->role.level, 1); 164762306a36Sopenharmony_ci 164862306a36Sopenharmony_ci rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); 164962306a36Sopenharmony_ci rmap_count = pte_list_add(cache, spte, rmap_head); 165062306a36Sopenharmony_ci 165162306a36Sopenharmony_ci if (rmap_count > kvm->stat.max_mmu_rmap_size) 165262306a36Sopenharmony_ci kvm->stat.max_mmu_rmap_size = rmap_count; 165362306a36Sopenharmony_ci if (rmap_count > RMAP_RECYCLE_THRESHOLD) { 165462306a36Sopenharmony_ci kvm_zap_all_rmap_sptes(kvm, rmap_head); 165562306a36Sopenharmony_ci kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); 165662306a36Sopenharmony_ci } 165762306a36Sopenharmony_ci} 165862306a36Sopenharmony_ci 165962306a36Sopenharmony_cistatic void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, 166062306a36Sopenharmony_ci u64 *spte, gfn_t gfn, unsigned int access) 166162306a36Sopenharmony_ci{ 166262306a36Sopenharmony_ci struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; 166362306a36Sopenharmony_ci 166462306a36Sopenharmony_ci __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); 166562306a36Sopenharmony_ci} 166662306a36Sopenharmony_ci 166762306a36Sopenharmony_cibool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 166862306a36Sopenharmony_ci{ 166962306a36Sopenharmony_ci bool young = false; 167062306a36Sopenharmony_ci 167162306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) 167262306a36Sopenharmony_ci young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); 167362306a36Sopenharmony_ci 167462306a36Sopenharmony_ci if (tdp_mmu_enabled) 167562306a36Sopenharmony_ci young |= kvm_tdp_mmu_age_gfn_range(kvm, range); 167662306a36Sopenharmony_ci 167762306a36Sopenharmony_ci return young; 167862306a36Sopenharmony_ci} 167962306a36Sopenharmony_ci 168062306a36Sopenharmony_cibool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 168162306a36Sopenharmony_ci{ 168262306a36Sopenharmony_ci bool young = false; 168362306a36Sopenharmony_ci 168462306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) 168562306a36Sopenharmony_ci young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); 168662306a36Sopenharmony_ci 168762306a36Sopenharmony_ci if (tdp_mmu_enabled) 168862306a36Sopenharmony_ci young |= kvm_tdp_mmu_test_age_gfn(kvm, range); 168962306a36Sopenharmony_ci 169062306a36Sopenharmony_ci return young; 169162306a36Sopenharmony_ci} 169262306a36Sopenharmony_ci 169362306a36Sopenharmony_cistatic void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) 169462306a36Sopenharmony_ci{ 169562306a36Sopenharmony_ci#ifdef CONFIG_KVM_PROVE_MMU 169662306a36Sopenharmony_ci int i; 169762306a36Sopenharmony_ci 169862306a36Sopenharmony_ci for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { 169962306a36Sopenharmony_ci if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i]))) 170062306a36Sopenharmony_ci pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free", 170162306a36Sopenharmony_ci sp->spt[i], &sp->spt[i], 
170262306a36Sopenharmony_ci kvm_mmu_page_get_gfn(sp, i)); 170362306a36Sopenharmony_ci } 170462306a36Sopenharmony_ci#endif 170562306a36Sopenharmony_ci} 170662306a36Sopenharmony_ci 170762306a36Sopenharmony_ci/* 170862306a36Sopenharmony_ci * This value is the sum of all of the kvm instances' 170962306a36Sopenharmony_ci * kvm->arch.n_used_mmu_pages values. We need a global, 171062306a36Sopenharmony_ci * aggregate version in order to make the slab shrinker 171162306a36Sopenharmony_ci * faster. 171262306a36Sopenharmony_ci */ 171362306a36Sopenharmony_cistatic inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) 171462306a36Sopenharmony_ci{ 171562306a36Sopenharmony_ci kvm->arch.n_used_mmu_pages += nr; 171662306a36Sopenharmony_ci percpu_counter_add(&kvm_total_used_mmu_pages, nr); 171762306a36Sopenharmony_ci} 171862306a36Sopenharmony_ci 171962306a36Sopenharmony_cistatic void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 172062306a36Sopenharmony_ci{ 172162306a36Sopenharmony_ci kvm_mod_used_mmu_pages(kvm, +1); 172262306a36Sopenharmony_ci kvm_account_pgtable_pages((void *)sp->spt, +1); 172362306a36Sopenharmony_ci} 172462306a36Sopenharmony_ci 172562306a36Sopenharmony_cistatic void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 172662306a36Sopenharmony_ci{ 172762306a36Sopenharmony_ci kvm_mod_used_mmu_pages(kvm, -1); 172862306a36Sopenharmony_ci kvm_account_pgtable_pages((void *)sp->spt, -1); 172962306a36Sopenharmony_ci} 173062306a36Sopenharmony_ci 173162306a36Sopenharmony_cistatic void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) 173262306a36Sopenharmony_ci{ 173362306a36Sopenharmony_ci kvm_mmu_check_sptes_at_free(sp); 173462306a36Sopenharmony_ci 173562306a36Sopenharmony_ci hlist_del(&sp->hash_link); 173662306a36Sopenharmony_ci list_del(&sp->link); 173762306a36Sopenharmony_ci free_page((unsigned long)sp->spt); 173862306a36Sopenharmony_ci if (!sp->role.direct) 173962306a36Sopenharmony_ci free_page((unsigned long)sp->shadowed_translation); 174062306a36Sopenharmony_ci kmem_cache_free(mmu_page_header_cache, sp); 174162306a36Sopenharmony_ci} 174262306a36Sopenharmony_ci 174362306a36Sopenharmony_cistatic unsigned kvm_page_table_hashfn(gfn_t gfn) 174462306a36Sopenharmony_ci{ 174562306a36Sopenharmony_ci return hash_64(gfn, KVM_MMU_HASH_SHIFT); 174662306a36Sopenharmony_ci} 174762306a36Sopenharmony_ci 174862306a36Sopenharmony_cistatic void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, 174962306a36Sopenharmony_ci struct kvm_mmu_page *sp, u64 *parent_pte) 175062306a36Sopenharmony_ci{ 175162306a36Sopenharmony_ci if (!parent_pte) 175262306a36Sopenharmony_ci return; 175362306a36Sopenharmony_ci 175462306a36Sopenharmony_ci pte_list_add(cache, parent_pte, &sp->parent_ptes); 175562306a36Sopenharmony_ci} 175662306a36Sopenharmony_ci 175762306a36Sopenharmony_cistatic void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, 175862306a36Sopenharmony_ci u64 *parent_pte) 175962306a36Sopenharmony_ci{ 176062306a36Sopenharmony_ci pte_list_remove(kvm, parent_pte, &sp->parent_ptes); 176162306a36Sopenharmony_ci} 176262306a36Sopenharmony_ci 176362306a36Sopenharmony_cistatic void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, 176462306a36Sopenharmony_ci u64 *parent_pte) 176562306a36Sopenharmony_ci{ 176662306a36Sopenharmony_ci mmu_page_remove_parent_pte(kvm, sp, parent_pte); 176762306a36Sopenharmony_ci mmu_spte_clear_no_track(parent_pte); 176862306a36Sopenharmony_ci} 176962306a36Sopenharmony_ci 177062306a36Sopenharmony_cistatic void mark_unsync(u64 
*spte); 177162306a36Sopenharmony_cistatic void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 177262306a36Sopenharmony_ci{ 177362306a36Sopenharmony_ci u64 *sptep; 177462306a36Sopenharmony_ci struct rmap_iterator iter; 177562306a36Sopenharmony_ci 177662306a36Sopenharmony_ci for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { 177762306a36Sopenharmony_ci mark_unsync(sptep); 177862306a36Sopenharmony_ci } 177962306a36Sopenharmony_ci} 178062306a36Sopenharmony_ci 178162306a36Sopenharmony_cistatic void mark_unsync(u64 *spte) 178262306a36Sopenharmony_ci{ 178362306a36Sopenharmony_ci struct kvm_mmu_page *sp; 178462306a36Sopenharmony_ci 178562306a36Sopenharmony_ci sp = sptep_to_sp(spte); 178662306a36Sopenharmony_ci if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) 178762306a36Sopenharmony_ci return; 178862306a36Sopenharmony_ci if (sp->unsync_children++) 178962306a36Sopenharmony_ci return; 179062306a36Sopenharmony_ci kvm_mmu_mark_parents_unsync(sp); 179162306a36Sopenharmony_ci} 179262306a36Sopenharmony_ci 179362306a36Sopenharmony_ci#define KVM_PAGE_ARRAY_NR 16 179462306a36Sopenharmony_ci 179562306a36Sopenharmony_cistruct kvm_mmu_pages { 179662306a36Sopenharmony_ci struct mmu_page_and_offset { 179762306a36Sopenharmony_ci struct kvm_mmu_page *sp; 179862306a36Sopenharmony_ci unsigned int idx; 179962306a36Sopenharmony_ci } page[KVM_PAGE_ARRAY_NR]; 180062306a36Sopenharmony_ci unsigned int nr; 180162306a36Sopenharmony_ci}; 180262306a36Sopenharmony_ci 180362306a36Sopenharmony_cistatic int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, 180462306a36Sopenharmony_ci int idx) 180562306a36Sopenharmony_ci{ 180662306a36Sopenharmony_ci int i; 180762306a36Sopenharmony_ci 180862306a36Sopenharmony_ci if (sp->unsync) 180962306a36Sopenharmony_ci for (i=0; i < pvec->nr; i++) 181062306a36Sopenharmony_ci if (pvec->page[i].sp == sp) 181162306a36Sopenharmony_ci return 0; 181262306a36Sopenharmony_ci 181362306a36Sopenharmony_ci pvec->page[pvec->nr].sp = sp; 181462306a36Sopenharmony_ci pvec->page[pvec->nr].idx = idx; 181562306a36Sopenharmony_ci pvec->nr++; 181662306a36Sopenharmony_ci return (pvec->nr == KVM_PAGE_ARRAY_NR); 181762306a36Sopenharmony_ci} 181862306a36Sopenharmony_ci 181962306a36Sopenharmony_cistatic inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) 182062306a36Sopenharmony_ci{ 182162306a36Sopenharmony_ci --sp->unsync_children; 182262306a36Sopenharmony_ci WARN_ON_ONCE((int)sp->unsync_children < 0); 182362306a36Sopenharmony_ci __clear_bit(idx, sp->unsync_child_bitmap); 182462306a36Sopenharmony_ci} 182562306a36Sopenharmony_ci 182662306a36Sopenharmony_cistatic int __mmu_unsync_walk(struct kvm_mmu_page *sp, 182762306a36Sopenharmony_ci struct kvm_mmu_pages *pvec) 182862306a36Sopenharmony_ci{ 182962306a36Sopenharmony_ci int i, ret, nr_unsync_leaf = 0; 183062306a36Sopenharmony_ci 183162306a36Sopenharmony_ci for_each_set_bit(i, sp->unsync_child_bitmap, 512) { 183262306a36Sopenharmony_ci struct kvm_mmu_page *child; 183362306a36Sopenharmony_ci u64 ent = sp->spt[i]; 183462306a36Sopenharmony_ci 183562306a36Sopenharmony_ci if (!is_shadow_present_pte(ent) || is_large_pte(ent)) { 183662306a36Sopenharmony_ci clear_unsync_child_bit(sp, i); 183762306a36Sopenharmony_ci continue; 183862306a36Sopenharmony_ci } 183962306a36Sopenharmony_ci 184062306a36Sopenharmony_ci child = spte_to_child_sp(ent); 184162306a36Sopenharmony_ci 184262306a36Sopenharmony_ci if (child->unsync_children) { 184362306a36Sopenharmony_ci if (mmu_pages_add(pvec, child, i)) 184462306a36Sopenharmony_ci return 
-ENOSPC; 184562306a36Sopenharmony_ci 184662306a36Sopenharmony_ci ret = __mmu_unsync_walk(child, pvec); 184762306a36Sopenharmony_ci if (!ret) { 184862306a36Sopenharmony_ci clear_unsync_child_bit(sp, i); 184962306a36Sopenharmony_ci continue; 185062306a36Sopenharmony_ci } else if (ret > 0) { 185162306a36Sopenharmony_ci nr_unsync_leaf += ret; 185262306a36Sopenharmony_ci } else 185362306a36Sopenharmony_ci return ret; 185462306a36Sopenharmony_ci } else if (child->unsync) { 185562306a36Sopenharmony_ci nr_unsync_leaf++; 185662306a36Sopenharmony_ci if (mmu_pages_add(pvec, child, i)) 185762306a36Sopenharmony_ci return -ENOSPC; 185862306a36Sopenharmony_ci } else 185962306a36Sopenharmony_ci clear_unsync_child_bit(sp, i); 186062306a36Sopenharmony_ci } 186162306a36Sopenharmony_ci 186262306a36Sopenharmony_ci return nr_unsync_leaf; 186362306a36Sopenharmony_ci} 186462306a36Sopenharmony_ci 186562306a36Sopenharmony_ci#define INVALID_INDEX (-1) 186662306a36Sopenharmony_ci 186762306a36Sopenharmony_cistatic int mmu_unsync_walk(struct kvm_mmu_page *sp, 186862306a36Sopenharmony_ci struct kvm_mmu_pages *pvec) 186962306a36Sopenharmony_ci{ 187062306a36Sopenharmony_ci pvec->nr = 0; 187162306a36Sopenharmony_ci if (!sp->unsync_children) 187262306a36Sopenharmony_ci return 0; 187362306a36Sopenharmony_ci 187462306a36Sopenharmony_ci mmu_pages_add(pvec, sp, INVALID_INDEX); 187562306a36Sopenharmony_ci return __mmu_unsync_walk(sp, pvec); 187662306a36Sopenharmony_ci} 187762306a36Sopenharmony_ci 187862306a36Sopenharmony_cistatic void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 187962306a36Sopenharmony_ci{ 188062306a36Sopenharmony_ci WARN_ON_ONCE(!sp->unsync); 188162306a36Sopenharmony_ci trace_kvm_mmu_sync_page(sp); 188262306a36Sopenharmony_ci sp->unsync = 0; 188362306a36Sopenharmony_ci --kvm->stat.mmu_unsync; 188462306a36Sopenharmony_ci} 188562306a36Sopenharmony_ci 188662306a36Sopenharmony_cistatic bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 188762306a36Sopenharmony_ci struct list_head *invalid_list); 188862306a36Sopenharmony_cistatic void kvm_mmu_commit_zap_page(struct kvm *kvm, 188962306a36Sopenharmony_ci struct list_head *invalid_list); 189062306a36Sopenharmony_ci 189162306a36Sopenharmony_cistatic bool sp_has_gptes(struct kvm_mmu_page *sp) 189262306a36Sopenharmony_ci{ 189362306a36Sopenharmony_ci if (sp->role.direct) 189462306a36Sopenharmony_ci return false; 189562306a36Sopenharmony_ci 189662306a36Sopenharmony_ci if (sp->role.passthrough) 189762306a36Sopenharmony_ci return false; 189862306a36Sopenharmony_ci 189962306a36Sopenharmony_ci return true; 190062306a36Sopenharmony_ci} 190162306a36Sopenharmony_ci 190262306a36Sopenharmony_ci#define for_each_valid_sp(_kvm, _sp, _list) \ 190362306a36Sopenharmony_ci hlist_for_each_entry(_sp, _list, hash_link) \ 190462306a36Sopenharmony_ci if (is_obsolete_sp((_kvm), (_sp))) { \ 190562306a36Sopenharmony_ci } else 190662306a36Sopenharmony_ci 190762306a36Sopenharmony_ci#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ 190862306a36Sopenharmony_ci for_each_valid_sp(_kvm, _sp, \ 190962306a36Sopenharmony_ci &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ 191062306a36Sopenharmony_ci if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else 191162306a36Sopenharmony_ci 191262306a36Sopenharmony_cistatic bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 191362306a36Sopenharmony_ci{ 191462306a36Sopenharmony_ci union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; 191562306a36Sopenharmony_ci 
191662306a36Sopenharmony_ci /* 191762306a36Sopenharmony_ci * Ignore various flags when verifying that it's safe to sync a shadow 191862306a36Sopenharmony_ci * page using the current MMU context. 191962306a36Sopenharmony_ci * 192062306a36Sopenharmony_ci * - level: not part of the overall MMU role and will never match as the MMU's 192162306a36Sopenharmony_ci * level tracks the root level 192262306a36Sopenharmony_ci * - access: updated based on the new guest PTE 192362306a36Sopenharmony_ci * - quadrant: not part of the overall MMU role (similar to level) 192462306a36Sopenharmony_ci */ 192562306a36Sopenharmony_ci const union kvm_mmu_page_role sync_role_ign = { 192662306a36Sopenharmony_ci .level = 0xf, 192762306a36Sopenharmony_ci .access = 0x7, 192862306a36Sopenharmony_ci .quadrant = 0x3, 192962306a36Sopenharmony_ci .passthrough = 0x1, 193062306a36Sopenharmony_ci }; 193162306a36Sopenharmony_ci 193262306a36Sopenharmony_ci /* 193362306a36Sopenharmony_ci * Direct pages can never be unsync, and KVM should never attempt to 193462306a36Sopenharmony_ci * sync a shadow page for a different MMU context, e.g. if the role 193562306a36Sopenharmony_ci * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the 193662306a36Sopenharmony_ci * reserved bits checks will be wrong, etc... 193762306a36Sopenharmony_ci */ 193862306a36Sopenharmony_ci if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte || 193962306a36Sopenharmony_ci (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) 194062306a36Sopenharmony_ci return false; 194162306a36Sopenharmony_ci 194262306a36Sopenharmony_ci return true; 194362306a36Sopenharmony_ci} 194462306a36Sopenharmony_ci 194562306a36Sopenharmony_cistatic int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) 194662306a36Sopenharmony_ci{ 194762306a36Sopenharmony_ci if (!sp->spt[i]) 194862306a36Sopenharmony_ci return 0; 194962306a36Sopenharmony_ci 195062306a36Sopenharmony_ci return vcpu->arch.mmu->sync_spte(vcpu, sp, i); 195162306a36Sopenharmony_ci} 195262306a36Sopenharmony_ci 195362306a36Sopenharmony_cistatic int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 195462306a36Sopenharmony_ci{ 195562306a36Sopenharmony_ci int flush = 0; 195662306a36Sopenharmony_ci int i; 195762306a36Sopenharmony_ci 195862306a36Sopenharmony_ci if (!kvm_sync_page_check(vcpu, sp)) 195962306a36Sopenharmony_ci return -1; 196062306a36Sopenharmony_ci 196162306a36Sopenharmony_ci for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { 196262306a36Sopenharmony_ci int ret = kvm_sync_spte(vcpu, sp, i); 196362306a36Sopenharmony_ci 196462306a36Sopenharmony_ci if (ret < -1) 196562306a36Sopenharmony_ci return -1; 196662306a36Sopenharmony_ci flush |= ret; 196762306a36Sopenharmony_ci } 196862306a36Sopenharmony_ci 196962306a36Sopenharmony_ci /* 197062306a36Sopenharmony_ci * Note, any flush is purely for KVM's correctness, e.g. when dropping 197162306a36Sopenharmony_ci * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier 197262306a36Sopenharmony_ci * unmap or dirty logging event doesn't fail to flush. The guest is 197362306a36Sopenharmony_ci * responsible for flushing the TLB to ensure any changes in protection 197462306a36Sopenharmony_ci * bits are recognized, i.e. until the guest flushes or page faults on 197562306a36Sopenharmony_ci * a relevant address, KVM is architecturally allowed to let vCPUs use 197662306a36Sopenharmony_ci * cached translations with the old protection bits. 
197762306a36Sopenharmony_ci */ 197862306a36Sopenharmony_ci return flush; 197962306a36Sopenharmony_ci} 198062306a36Sopenharmony_ci 198162306a36Sopenharmony_cistatic int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 198262306a36Sopenharmony_ci struct list_head *invalid_list) 198362306a36Sopenharmony_ci{ 198462306a36Sopenharmony_ci int ret = __kvm_sync_page(vcpu, sp); 198562306a36Sopenharmony_ci 198662306a36Sopenharmony_ci if (ret < 0) 198762306a36Sopenharmony_ci kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 198862306a36Sopenharmony_ci return ret; 198962306a36Sopenharmony_ci} 199062306a36Sopenharmony_ci 199162306a36Sopenharmony_cistatic bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, 199262306a36Sopenharmony_ci struct list_head *invalid_list, 199362306a36Sopenharmony_ci bool remote_flush) 199462306a36Sopenharmony_ci{ 199562306a36Sopenharmony_ci if (!remote_flush && list_empty(invalid_list)) 199662306a36Sopenharmony_ci return false; 199762306a36Sopenharmony_ci 199862306a36Sopenharmony_ci if (!list_empty(invalid_list)) 199962306a36Sopenharmony_ci kvm_mmu_commit_zap_page(kvm, invalid_list); 200062306a36Sopenharmony_ci else 200162306a36Sopenharmony_ci kvm_flush_remote_tlbs(kvm); 200262306a36Sopenharmony_ci return true; 200362306a36Sopenharmony_ci} 200462306a36Sopenharmony_ci 200562306a36Sopenharmony_cistatic bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 200662306a36Sopenharmony_ci{ 200762306a36Sopenharmony_ci if (sp->role.invalid) 200862306a36Sopenharmony_ci return true; 200962306a36Sopenharmony_ci 201062306a36Sopenharmony_ci /* TDP MMU pages do not use the MMU generation. */ 201162306a36Sopenharmony_ci return !is_tdp_mmu_page(sp) && 201262306a36Sopenharmony_ci unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); 201362306a36Sopenharmony_ci} 201462306a36Sopenharmony_ci 201562306a36Sopenharmony_cistruct mmu_page_path { 201662306a36Sopenharmony_ci struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; 201762306a36Sopenharmony_ci unsigned int idx[PT64_ROOT_MAX_LEVEL]; 201862306a36Sopenharmony_ci}; 201962306a36Sopenharmony_ci 202062306a36Sopenharmony_ci#define for_each_sp(pvec, sp, parents, i) \ 202162306a36Sopenharmony_ci for (i = mmu_pages_first(&pvec, &parents); \ 202262306a36Sopenharmony_ci i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ 202362306a36Sopenharmony_ci i = mmu_pages_next(&pvec, &parents, i)) 202462306a36Sopenharmony_ci 202562306a36Sopenharmony_cistatic int mmu_pages_next(struct kvm_mmu_pages *pvec, 202662306a36Sopenharmony_ci struct mmu_page_path *parents, 202762306a36Sopenharmony_ci int i) 202862306a36Sopenharmony_ci{ 202962306a36Sopenharmony_ci int n; 203062306a36Sopenharmony_ci 203162306a36Sopenharmony_ci for (n = i+1; n < pvec->nr; n++) { 203262306a36Sopenharmony_ci struct kvm_mmu_page *sp = pvec->page[n].sp; 203362306a36Sopenharmony_ci unsigned idx = pvec->page[n].idx; 203462306a36Sopenharmony_ci int level = sp->role.level; 203562306a36Sopenharmony_ci 203662306a36Sopenharmony_ci parents->idx[level-1] = idx; 203762306a36Sopenharmony_ci if (level == PG_LEVEL_4K) 203862306a36Sopenharmony_ci break; 203962306a36Sopenharmony_ci 204062306a36Sopenharmony_ci parents->parent[level-2] = sp; 204162306a36Sopenharmony_ci } 204262306a36Sopenharmony_ci 204362306a36Sopenharmony_ci return n; 204462306a36Sopenharmony_ci} 204562306a36Sopenharmony_ci 204662306a36Sopenharmony_cistatic int mmu_pages_first(struct kvm_mmu_pages *pvec, 204762306a36Sopenharmony_ci struct mmu_page_path *parents) 204862306a36Sopenharmony_ci{ 204962306a36Sopenharmony_ci struct 
kvm_mmu_page *sp; 205062306a36Sopenharmony_ci int level; 205162306a36Sopenharmony_ci 205262306a36Sopenharmony_ci if (pvec->nr == 0) 205362306a36Sopenharmony_ci return 0; 205462306a36Sopenharmony_ci 205562306a36Sopenharmony_ci WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX); 205662306a36Sopenharmony_ci 205762306a36Sopenharmony_ci sp = pvec->page[0].sp; 205862306a36Sopenharmony_ci level = sp->role.level; 205962306a36Sopenharmony_ci WARN_ON_ONCE(level == PG_LEVEL_4K); 206062306a36Sopenharmony_ci 206162306a36Sopenharmony_ci parents->parent[level-2] = sp; 206262306a36Sopenharmony_ci 206362306a36Sopenharmony_ci /* Also set up a sentinel. Further entries in pvec are all 206462306a36Sopenharmony_ci * children of sp, so this element is never overwritten. 206562306a36Sopenharmony_ci */ 206662306a36Sopenharmony_ci parents->parent[level-1] = NULL; 206762306a36Sopenharmony_ci return mmu_pages_next(pvec, parents, 0); 206862306a36Sopenharmony_ci} 206962306a36Sopenharmony_ci 207062306a36Sopenharmony_cistatic void mmu_pages_clear_parents(struct mmu_page_path *parents) 207162306a36Sopenharmony_ci{ 207262306a36Sopenharmony_ci struct kvm_mmu_page *sp; 207362306a36Sopenharmony_ci unsigned int level = 0; 207462306a36Sopenharmony_ci 207562306a36Sopenharmony_ci do { 207662306a36Sopenharmony_ci unsigned int idx = parents->idx[level]; 207762306a36Sopenharmony_ci sp = parents->parent[level]; 207862306a36Sopenharmony_ci if (!sp) 207962306a36Sopenharmony_ci return; 208062306a36Sopenharmony_ci 208162306a36Sopenharmony_ci WARN_ON_ONCE(idx == INVALID_INDEX); 208262306a36Sopenharmony_ci clear_unsync_child_bit(sp, idx); 208362306a36Sopenharmony_ci level++; 208462306a36Sopenharmony_ci } while (!sp->unsync_children); 208562306a36Sopenharmony_ci} 208662306a36Sopenharmony_ci 208762306a36Sopenharmony_cistatic int mmu_sync_children(struct kvm_vcpu *vcpu, 208862306a36Sopenharmony_ci struct kvm_mmu_page *parent, bool can_yield) 208962306a36Sopenharmony_ci{ 209062306a36Sopenharmony_ci int i; 209162306a36Sopenharmony_ci struct kvm_mmu_page *sp; 209262306a36Sopenharmony_ci struct mmu_page_path parents; 209362306a36Sopenharmony_ci struct kvm_mmu_pages pages; 209462306a36Sopenharmony_ci LIST_HEAD(invalid_list); 209562306a36Sopenharmony_ci bool flush = false; 209662306a36Sopenharmony_ci 209762306a36Sopenharmony_ci while (mmu_unsync_walk(parent, &pages)) { 209862306a36Sopenharmony_ci bool protected = false; 209962306a36Sopenharmony_ci 210062306a36Sopenharmony_ci for_each_sp(pages, sp, parents, i) 210162306a36Sopenharmony_ci protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn); 210262306a36Sopenharmony_ci 210362306a36Sopenharmony_ci if (protected) { 210462306a36Sopenharmony_ci kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); 210562306a36Sopenharmony_ci flush = false; 210662306a36Sopenharmony_ci } 210762306a36Sopenharmony_ci 210862306a36Sopenharmony_ci for_each_sp(pages, sp, parents, i) { 210962306a36Sopenharmony_ci kvm_unlink_unsync_page(vcpu->kvm, sp); 211062306a36Sopenharmony_ci flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0; 211162306a36Sopenharmony_ci mmu_pages_clear_parents(&parents); 211262306a36Sopenharmony_ci } 211362306a36Sopenharmony_ci if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { 211462306a36Sopenharmony_ci kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 211562306a36Sopenharmony_ci if (!can_yield) { 211662306a36Sopenharmony_ci kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 211762306a36Sopenharmony_ci return -EINTR; 211862306a36Sopenharmony_ci } 211962306a36Sopenharmony_ci 
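			/*
			 * Pending flushes and zaps were handled above, so mmu_lock
			 * can be dropped to reschedule; the unsync walk is restarted
			 * on the next iteration of the outer loop.
			 */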
212062306a36Sopenharmony_ci cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); 212162306a36Sopenharmony_ci flush = false; 212262306a36Sopenharmony_ci } 212362306a36Sopenharmony_ci } 212462306a36Sopenharmony_ci 212562306a36Sopenharmony_ci kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 212662306a36Sopenharmony_ci return 0; 212762306a36Sopenharmony_ci} 212862306a36Sopenharmony_ci 212962306a36Sopenharmony_cistatic void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) 213062306a36Sopenharmony_ci{ 213162306a36Sopenharmony_ci atomic_set(&sp->write_flooding_count, 0); 213262306a36Sopenharmony_ci} 213362306a36Sopenharmony_ci 213462306a36Sopenharmony_cistatic void clear_sp_write_flooding_count(u64 *spte) 213562306a36Sopenharmony_ci{ 213662306a36Sopenharmony_ci __clear_sp_write_flooding_count(sptep_to_sp(spte)); 213762306a36Sopenharmony_ci} 213862306a36Sopenharmony_ci 213962306a36Sopenharmony_ci/* 214062306a36Sopenharmony_ci * The vCPU is required when finding indirect shadow pages; the shadow 214162306a36Sopenharmony_ci * page may already exist and syncing it needs the vCPU pointer in 214262306a36Sopenharmony_ci * order to read guest page tables. Direct shadow pages are never 214362306a36Sopenharmony_ci * unsync, thus @vcpu can be NULL if @role.direct is true. 214462306a36Sopenharmony_ci */ 214562306a36Sopenharmony_cistatic struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, 214662306a36Sopenharmony_ci struct kvm_vcpu *vcpu, 214762306a36Sopenharmony_ci gfn_t gfn, 214862306a36Sopenharmony_ci struct hlist_head *sp_list, 214962306a36Sopenharmony_ci union kvm_mmu_page_role role) 215062306a36Sopenharmony_ci{ 215162306a36Sopenharmony_ci struct kvm_mmu_page *sp; 215262306a36Sopenharmony_ci int ret; 215362306a36Sopenharmony_ci int collisions = 0; 215462306a36Sopenharmony_ci LIST_HEAD(invalid_list); 215562306a36Sopenharmony_ci 215662306a36Sopenharmony_ci for_each_valid_sp(kvm, sp, sp_list) { 215762306a36Sopenharmony_ci if (sp->gfn != gfn) { 215862306a36Sopenharmony_ci collisions++; 215962306a36Sopenharmony_ci continue; 216062306a36Sopenharmony_ci } 216162306a36Sopenharmony_ci 216262306a36Sopenharmony_ci if (sp->role.word != role.word) { 216362306a36Sopenharmony_ci /* 216462306a36Sopenharmony_ci * If the guest is creating an upper-level page, zap 216562306a36Sopenharmony_ci * unsync pages for the same gfn. While it's possible 216662306a36Sopenharmony_ci * the guest is using recursive page tables, in all 216762306a36Sopenharmony_ci * likelihood the guest has stopped using the unsync 216862306a36Sopenharmony_ci * page and is installing a completely unrelated page. 216962306a36Sopenharmony_ci * Unsync pages must not be left as is, because the new 217062306a36Sopenharmony_ci * upper-level page will be write-protected. 217162306a36Sopenharmony_ci */ 217262306a36Sopenharmony_ci if (role.level > PG_LEVEL_4K && sp->unsync) 217362306a36Sopenharmony_ci kvm_mmu_prepare_zap_page(kvm, sp, 217462306a36Sopenharmony_ci &invalid_list); 217562306a36Sopenharmony_ci continue; 217662306a36Sopenharmony_ci } 217762306a36Sopenharmony_ci 217862306a36Sopenharmony_ci /* unsync and write-flooding only apply to indirect SPs. */ 217962306a36Sopenharmony_ci if (sp->role.direct) 218062306a36Sopenharmony_ci goto out; 218162306a36Sopenharmony_ci 218262306a36Sopenharmony_ci if (sp->unsync) { 218362306a36Sopenharmony_ci if (KVM_BUG_ON(!vcpu, kvm)) 218462306a36Sopenharmony_ci break; 218562306a36Sopenharmony_ci 218662306a36Sopenharmony_ci /* 218762306a36Sopenharmony_ci * The page is good, but is stale. 
kvm_sync_page does 218862306a36Sopenharmony_ci * get the latest guest state, but (unlike mmu_unsync_children) 218962306a36Sopenharmony_ci * it doesn't write-protect the page or mark it synchronized! 219062306a36Sopenharmony_ci * This way the validity of the mapping is ensured, but the 219162306a36Sopenharmony_ci * overhead of write protection is not incurred until the 219262306a36Sopenharmony_ci * guest invalidates the TLB mapping. This allows multiple 219362306a36Sopenharmony_ci * SPs for a single gfn to be unsync. 219462306a36Sopenharmony_ci * 219562306a36Sopenharmony_ci * If the sync fails, the page is zapped. If so, break 219662306a36Sopenharmony_ci * in order to rebuild it. 219762306a36Sopenharmony_ci */ 219862306a36Sopenharmony_ci ret = kvm_sync_page(vcpu, sp, &invalid_list); 219962306a36Sopenharmony_ci if (ret < 0) 220062306a36Sopenharmony_ci break; 220162306a36Sopenharmony_ci 220262306a36Sopenharmony_ci WARN_ON_ONCE(!list_empty(&invalid_list)); 220362306a36Sopenharmony_ci if (ret > 0) 220462306a36Sopenharmony_ci kvm_flush_remote_tlbs(kvm); 220562306a36Sopenharmony_ci } 220662306a36Sopenharmony_ci 220762306a36Sopenharmony_ci __clear_sp_write_flooding_count(sp); 220862306a36Sopenharmony_ci 220962306a36Sopenharmony_ci goto out; 221062306a36Sopenharmony_ci } 221162306a36Sopenharmony_ci 221262306a36Sopenharmony_ci sp = NULL; 221362306a36Sopenharmony_ci ++kvm->stat.mmu_cache_miss; 221462306a36Sopenharmony_ci 221562306a36Sopenharmony_ciout: 221662306a36Sopenharmony_ci kvm_mmu_commit_zap_page(kvm, &invalid_list); 221762306a36Sopenharmony_ci 221862306a36Sopenharmony_ci if (collisions > kvm->stat.max_mmu_page_hash_collisions) 221962306a36Sopenharmony_ci kvm->stat.max_mmu_page_hash_collisions = collisions; 222062306a36Sopenharmony_ci return sp; 222162306a36Sopenharmony_ci} 222262306a36Sopenharmony_ci 222362306a36Sopenharmony_ci/* Caches used when allocating a new shadow page. */ 222462306a36Sopenharmony_cistruct shadow_page_caches { 222562306a36Sopenharmony_ci struct kvm_mmu_memory_cache *page_header_cache; 222662306a36Sopenharmony_ci struct kvm_mmu_memory_cache *shadow_page_cache; 222762306a36Sopenharmony_ci struct kvm_mmu_memory_cache *shadowed_info_cache; 222862306a36Sopenharmony_ci}; 222962306a36Sopenharmony_ci 223062306a36Sopenharmony_cistatic struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, 223162306a36Sopenharmony_ci struct shadow_page_caches *caches, 223262306a36Sopenharmony_ci gfn_t gfn, 223362306a36Sopenharmony_ci struct hlist_head *sp_list, 223462306a36Sopenharmony_ci union kvm_mmu_page_role role) 223562306a36Sopenharmony_ci{ 223662306a36Sopenharmony_ci struct kvm_mmu_page *sp; 223762306a36Sopenharmony_ci 223862306a36Sopenharmony_ci sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); 223962306a36Sopenharmony_ci sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); 224062306a36Sopenharmony_ci if (!role.direct) 224162306a36Sopenharmony_ci sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); 224262306a36Sopenharmony_ci 224362306a36Sopenharmony_ci set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 224462306a36Sopenharmony_ci 224562306a36Sopenharmony_ci INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); 224662306a36Sopenharmony_ci 224762306a36Sopenharmony_ci /* 224862306a36Sopenharmony_ci * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() 224962306a36Sopenharmony_ci * depends on valid pages being added to the head of the list. See 225062306a36Sopenharmony_ci * comments in kvm_zap_obsolete_pages(). 
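	 * list_add() below therefore always inserts at the head, which also
	 * means the tail holds the oldest pages; kvm_mmu_zap_oldest_mmu_pages()
	 * relies on that ordering when it walks the list in reverse.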
225162306a36Sopenharmony_ci */ 225262306a36Sopenharmony_ci sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; 225362306a36Sopenharmony_ci list_add(&sp->link, &kvm->arch.active_mmu_pages); 225462306a36Sopenharmony_ci kvm_account_mmu_page(kvm, sp); 225562306a36Sopenharmony_ci 225662306a36Sopenharmony_ci sp->gfn = gfn; 225762306a36Sopenharmony_ci sp->role = role; 225862306a36Sopenharmony_ci hlist_add_head(&sp->hash_link, sp_list); 225962306a36Sopenharmony_ci if (sp_has_gptes(sp)) 226062306a36Sopenharmony_ci account_shadowed(kvm, sp); 226162306a36Sopenharmony_ci 226262306a36Sopenharmony_ci return sp; 226362306a36Sopenharmony_ci} 226462306a36Sopenharmony_ci 226562306a36Sopenharmony_ci/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ 226662306a36Sopenharmony_cistatic struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, 226762306a36Sopenharmony_ci struct kvm_vcpu *vcpu, 226862306a36Sopenharmony_ci struct shadow_page_caches *caches, 226962306a36Sopenharmony_ci gfn_t gfn, 227062306a36Sopenharmony_ci union kvm_mmu_page_role role) 227162306a36Sopenharmony_ci{ 227262306a36Sopenharmony_ci struct hlist_head *sp_list; 227362306a36Sopenharmony_ci struct kvm_mmu_page *sp; 227462306a36Sopenharmony_ci bool created = false; 227562306a36Sopenharmony_ci 227662306a36Sopenharmony_ci sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; 227762306a36Sopenharmony_ci 227862306a36Sopenharmony_ci sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); 227962306a36Sopenharmony_ci if (!sp) { 228062306a36Sopenharmony_ci created = true; 228162306a36Sopenharmony_ci sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); 228262306a36Sopenharmony_ci } 228362306a36Sopenharmony_ci 228462306a36Sopenharmony_ci trace_kvm_mmu_get_page(sp, created); 228562306a36Sopenharmony_ci return sp; 228662306a36Sopenharmony_ci} 228762306a36Sopenharmony_ci 228862306a36Sopenharmony_cistatic struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, 228962306a36Sopenharmony_ci gfn_t gfn, 229062306a36Sopenharmony_ci union kvm_mmu_page_role role) 229162306a36Sopenharmony_ci{ 229262306a36Sopenharmony_ci struct shadow_page_caches caches = { 229362306a36Sopenharmony_ci .page_header_cache = &vcpu->arch.mmu_page_header_cache, 229462306a36Sopenharmony_ci .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, 229562306a36Sopenharmony_ci .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, 229662306a36Sopenharmony_ci }; 229762306a36Sopenharmony_ci 229862306a36Sopenharmony_ci return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); 229962306a36Sopenharmony_ci} 230062306a36Sopenharmony_ci 230162306a36Sopenharmony_cistatic union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, 230262306a36Sopenharmony_ci unsigned int access) 230362306a36Sopenharmony_ci{ 230462306a36Sopenharmony_ci struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); 230562306a36Sopenharmony_ci union kvm_mmu_page_role role; 230662306a36Sopenharmony_ci 230762306a36Sopenharmony_ci role = parent_sp->role; 230862306a36Sopenharmony_ci role.level--; 230962306a36Sopenharmony_ci role.access = access; 231062306a36Sopenharmony_ci role.direct = direct; 231162306a36Sopenharmony_ci role.passthrough = 0; 231262306a36Sopenharmony_ci 231362306a36Sopenharmony_ci /* 231462306a36Sopenharmony_ci * If the guest has 4-byte PTEs then that means it's using 32-bit, 231562306a36Sopenharmony_ci * 2-level, non-PAE paging. KVM shadows such guests with PAE paging 231662306a36Sopenharmony_ci * (i.e. 8-byte PTEs). 
The difference in PTE size means that KVM must 231762306a36Sopenharmony_ci * shadow each guest page table with multiple shadow page tables, which 231862306a36Sopenharmony_ci * requires extra bookkeeping in the role. 231962306a36Sopenharmony_ci * 232062306a36Sopenharmony_ci * Specifically, to shadow the guest's page directory (which covers a 232162306a36Sopenharmony_ci * 4GiB address space), KVM uses 4 PAE page directories, each mapping 232262306a36Sopenharmony_ci * 1GiB of the address space. @role.quadrant encodes which quarter of 232362306a36Sopenharmony_ci * the address space each maps. 232462306a36Sopenharmony_ci * 232562306a36Sopenharmony_ci * To shadow the guest's page tables (which each map a 4MiB region), KVM 232662306a36Sopenharmony_ci * uses 2 PAE page tables, each mapping a 2MiB region. For these, 232762306a36Sopenharmony_ci * @role.quadrant encodes which half of the region they map. 232862306a36Sopenharmony_ci * 232962306a36Sopenharmony_ci * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE 233062306a36Sopenharmony_ci * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow 233162306a36Sopenharmony_ci * PDPTEs; those 4 PAE page directories are pre-allocated and their 233262306a36Sopenharmony_ci * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes 233362306a36Sopenharmony_ci * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume 233462306a36Sopenharmony_ci * bit 21 in the PTE (the child here), KVM propagates that bit to the 233562306a36Sopenharmony_ci * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE 233662306a36Sopenharmony_ci * covers bit 21 (see above), thus the quadrant is calculated from the 233762306a36Sopenharmony_ci * _least_ significant bit of the PDE index. 233862306a36Sopenharmony_ci */ 233962306a36Sopenharmony_ci if (role.has_4_byte_gpte) { 234062306a36Sopenharmony_ci WARN_ON_ONCE(role.level != PG_LEVEL_4K); 234162306a36Sopenharmony_ci role.quadrant = spte_index(sptep) & 1; 234262306a36Sopenharmony_ci } 234362306a36Sopenharmony_ci 234462306a36Sopenharmony_ci return role; 234562306a36Sopenharmony_ci} 234662306a36Sopenharmony_ci 234762306a36Sopenharmony_cistatic struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, 234862306a36Sopenharmony_ci u64 *sptep, gfn_t gfn, 234962306a36Sopenharmony_ci bool direct, unsigned int access) 235062306a36Sopenharmony_ci{ 235162306a36Sopenharmony_ci union kvm_mmu_page_role role; 235262306a36Sopenharmony_ci 235362306a36Sopenharmony_ci if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) 235462306a36Sopenharmony_ci return ERR_PTR(-EEXIST); 235562306a36Sopenharmony_ci 235662306a36Sopenharmony_ci role = kvm_mmu_child_role(sptep, direct, access); 235762306a36Sopenharmony_ci return kvm_mmu_get_shadow_page(vcpu, gfn, role); 235862306a36Sopenharmony_ci} 235962306a36Sopenharmony_ci 236062306a36Sopenharmony_cistatic void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, 236162306a36Sopenharmony_ci struct kvm_vcpu *vcpu, hpa_t root, 236262306a36Sopenharmony_ci u64 addr) 236362306a36Sopenharmony_ci{ 236462306a36Sopenharmony_ci iterator->addr = addr; 236562306a36Sopenharmony_ci iterator->shadow_addr = root; 236662306a36Sopenharmony_ci iterator->level = vcpu->arch.mmu->root_role.level; 236762306a36Sopenharmony_ci 236862306a36Sopenharmony_ci if (iterator->level >= PT64_ROOT_4LEVEL && 236962306a36Sopenharmony_ci vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL && 237062306a36Sopenharmony_ci !vcpu->arch.mmu->root_role.direct) 
237162306a36Sopenharmony_ci iterator->level = PT32E_ROOT_LEVEL; 237262306a36Sopenharmony_ci 237362306a36Sopenharmony_ci if (iterator->level == PT32E_ROOT_LEVEL) { 237462306a36Sopenharmony_ci /* 237562306a36Sopenharmony_ci * prev_root is currently only used for 64-bit hosts. So only 237662306a36Sopenharmony_ci * the active root_hpa is valid here. 237762306a36Sopenharmony_ci */ 237862306a36Sopenharmony_ci BUG_ON(root != vcpu->arch.mmu->root.hpa); 237962306a36Sopenharmony_ci 238062306a36Sopenharmony_ci iterator->shadow_addr 238162306a36Sopenharmony_ci = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; 238262306a36Sopenharmony_ci iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; 238362306a36Sopenharmony_ci --iterator->level; 238462306a36Sopenharmony_ci if (!iterator->shadow_addr) 238562306a36Sopenharmony_ci iterator->level = 0; 238662306a36Sopenharmony_ci } 238762306a36Sopenharmony_ci} 238862306a36Sopenharmony_ci 238962306a36Sopenharmony_cistatic void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, 239062306a36Sopenharmony_ci struct kvm_vcpu *vcpu, u64 addr) 239162306a36Sopenharmony_ci{ 239262306a36Sopenharmony_ci shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa, 239362306a36Sopenharmony_ci addr); 239462306a36Sopenharmony_ci} 239562306a36Sopenharmony_ci 239662306a36Sopenharmony_cistatic bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) 239762306a36Sopenharmony_ci{ 239862306a36Sopenharmony_ci if (iterator->level < PG_LEVEL_4K) 239962306a36Sopenharmony_ci return false; 240062306a36Sopenharmony_ci 240162306a36Sopenharmony_ci iterator->index = SPTE_INDEX(iterator->addr, iterator->level); 240262306a36Sopenharmony_ci iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 240362306a36Sopenharmony_ci return true; 240462306a36Sopenharmony_ci} 240562306a36Sopenharmony_ci 240662306a36Sopenharmony_cistatic void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, 240762306a36Sopenharmony_ci u64 spte) 240862306a36Sopenharmony_ci{ 240962306a36Sopenharmony_ci if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { 241062306a36Sopenharmony_ci iterator->level = 0; 241162306a36Sopenharmony_ci return; 241262306a36Sopenharmony_ci } 241362306a36Sopenharmony_ci 241462306a36Sopenharmony_ci iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; 241562306a36Sopenharmony_ci --iterator->level; 241662306a36Sopenharmony_ci} 241762306a36Sopenharmony_ci 241862306a36Sopenharmony_cistatic void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 241962306a36Sopenharmony_ci{ 242062306a36Sopenharmony_ci __shadow_walk_next(iterator, *iterator->sptep); 242162306a36Sopenharmony_ci} 242262306a36Sopenharmony_ci 242362306a36Sopenharmony_cistatic void __link_shadow_page(struct kvm *kvm, 242462306a36Sopenharmony_ci struct kvm_mmu_memory_cache *cache, u64 *sptep, 242562306a36Sopenharmony_ci struct kvm_mmu_page *sp, bool flush) 242662306a36Sopenharmony_ci{ 242762306a36Sopenharmony_ci u64 spte; 242862306a36Sopenharmony_ci 242962306a36Sopenharmony_ci BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 243062306a36Sopenharmony_ci 243162306a36Sopenharmony_ci /* 243262306a36Sopenharmony_ci * If an SPTE is present already, it must be a leaf and therefore 243362306a36Sopenharmony_ci * a large one. Drop it, and flush the TLB if needed, before 243462306a36Sopenharmony_ci * installing sp. 
243562306a36Sopenharmony_ci */ 243662306a36Sopenharmony_ci if (is_shadow_present_pte(*sptep)) 243762306a36Sopenharmony_ci drop_large_spte(kvm, sptep, flush); 243862306a36Sopenharmony_ci 243962306a36Sopenharmony_ci spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); 244062306a36Sopenharmony_ci 244162306a36Sopenharmony_ci mmu_spte_set(sptep, spte); 244262306a36Sopenharmony_ci 244362306a36Sopenharmony_ci mmu_page_add_parent_pte(cache, sp, sptep); 244462306a36Sopenharmony_ci 244562306a36Sopenharmony_ci /* 244662306a36Sopenharmony_ci * The non-direct sub-pagetable must be updated before linking. For 244762306a36Sopenharmony_ci * L1 sp, the pagetable is updated via kvm_sync_page() in 244862306a36Sopenharmony_ci * kvm_mmu_find_shadow_page() without write-protecting the gfn, 244962306a36Sopenharmony_ci * so sp->unsync can be true or false. For higher level non-direct 245062306a36Sopenharmony_ci * sp, the pagetable is updated/synced via mmu_sync_children() in 245162306a36Sopenharmony_ci * FNAME(fetch)(), so sp->unsync_children can only be false. 245262306a36Sopenharmony_ci * WARN_ON_ONCE() if anything happens unexpectedly. 245362306a36Sopenharmony_ci */ 245462306a36Sopenharmony_ci if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) 245562306a36Sopenharmony_ci mark_unsync(sptep); 245662306a36Sopenharmony_ci} 245762306a36Sopenharmony_ci 245862306a36Sopenharmony_cistatic void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, 245962306a36Sopenharmony_ci struct kvm_mmu_page *sp) 246062306a36Sopenharmony_ci{ 246162306a36Sopenharmony_ci __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); 246262306a36Sopenharmony_ci} 246362306a36Sopenharmony_ci 246462306a36Sopenharmony_cistatic void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 246562306a36Sopenharmony_ci unsigned direct_access) 246662306a36Sopenharmony_ci{ 246762306a36Sopenharmony_ci if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { 246862306a36Sopenharmony_ci struct kvm_mmu_page *child; 246962306a36Sopenharmony_ci 247062306a36Sopenharmony_ci /* 247162306a36Sopenharmony_ci * For the direct sp, if the guest pte's dirty bit 247262306a36Sopenharmony_ci * changed form clean to dirty, it will corrupt the 247362306a36Sopenharmony_ci * sp's access: allow writable in the read-only sp, 247462306a36Sopenharmony_ci * so we should update the spte at this point to get 247562306a36Sopenharmony_ci * a new sp with the correct access. 247662306a36Sopenharmony_ci */ 247762306a36Sopenharmony_ci child = spte_to_child_sp(*sptep); 247862306a36Sopenharmony_ci if (child->role.access == direct_access) 247962306a36Sopenharmony_ci return; 248062306a36Sopenharmony_ci 248162306a36Sopenharmony_ci drop_parent_pte(vcpu->kvm, child, sptep); 248262306a36Sopenharmony_ci kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); 248362306a36Sopenharmony_ci } 248462306a36Sopenharmony_ci} 248562306a36Sopenharmony_ci 248662306a36Sopenharmony_ci/* Returns the number of zapped non-leaf child shadow pages. 
*/ 248762306a36Sopenharmony_cistatic int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, 248862306a36Sopenharmony_ci u64 *spte, struct list_head *invalid_list) 248962306a36Sopenharmony_ci{ 249062306a36Sopenharmony_ci u64 pte; 249162306a36Sopenharmony_ci struct kvm_mmu_page *child; 249262306a36Sopenharmony_ci 249362306a36Sopenharmony_ci pte = *spte; 249462306a36Sopenharmony_ci if (is_shadow_present_pte(pte)) { 249562306a36Sopenharmony_ci if (is_last_spte(pte, sp->role.level)) { 249662306a36Sopenharmony_ci drop_spte(kvm, spte); 249762306a36Sopenharmony_ci } else { 249862306a36Sopenharmony_ci child = spte_to_child_sp(pte); 249962306a36Sopenharmony_ci drop_parent_pte(kvm, child, spte); 250062306a36Sopenharmony_ci 250162306a36Sopenharmony_ci /* 250262306a36Sopenharmony_ci * Recursively zap nested TDP SPs, parentless SPs are 250362306a36Sopenharmony_ci * unlikely to be used again in the near future. This 250462306a36Sopenharmony_ci * avoids retaining a large number of stale nested SPs. 250562306a36Sopenharmony_ci */ 250662306a36Sopenharmony_ci if (tdp_enabled && invalid_list && 250762306a36Sopenharmony_ci child->role.guest_mode && !child->parent_ptes.val) 250862306a36Sopenharmony_ci return kvm_mmu_prepare_zap_page(kvm, child, 250962306a36Sopenharmony_ci invalid_list); 251062306a36Sopenharmony_ci } 251162306a36Sopenharmony_ci } else if (is_mmio_spte(pte)) { 251262306a36Sopenharmony_ci mmu_spte_clear_no_track(spte); 251362306a36Sopenharmony_ci } 251462306a36Sopenharmony_ci return 0; 251562306a36Sopenharmony_ci} 251662306a36Sopenharmony_ci 251762306a36Sopenharmony_cistatic int kvm_mmu_page_unlink_children(struct kvm *kvm, 251862306a36Sopenharmony_ci struct kvm_mmu_page *sp, 251962306a36Sopenharmony_ci struct list_head *invalid_list) 252062306a36Sopenharmony_ci{ 252162306a36Sopenharmony_ci int zapped = 0; 252262306a36Sopenharmony_ci unsigned i; 252362306a36Sopenharmony_ci 252462306a36Sopenharmony_ci for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) 252562306a36Sopenharmony_ci zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); 252662306a36Sopenharmony_ci 252762306a36Sopenharmony_ci return zapped; 252862306a36Sopenharmony_ci} 252962306a36Sopenharmony_ci 253062306a36Sopenharmony_cistatic void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 253162306a36Sopenharmony_ci{ 253262306a36Sopenharmony_ci u64 *sptep; 253362306a36Sopenharmony_ci struct rmap_iterator iter; 253462306a36Sopenharmony_ci 253562306a36Sopenharmony_ci while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) 253662306a36Sopenharmony_ci drop_parent_pte(kvm, sp, sptep); 253762306a36Sopenharmony_ci} 253862306a36Sopenharmony_ci 253962306a36Sopenharmony_cistatic int mmu_zap_unsync_children(struct kvm *kvm, 254062306a36Sopenharmony_ci struct kvm_mmu_page *parent, 254162306a36Sopenharmony_ci struct list_head *invalid_list) 254262306a36Sopenharmony_ci{ 254362306a36Sopenharmony_ci int i, zapped = 0; 254462306a36Sopenharmony_ci struct mmu_page_path parents; 254562306a36Sopenharmony_ci struct kvm_mmu_pages pages; 254662306a36Sopenharmony_ci 254762306a36Sopenharmony_ci if (parent->role.level == PG_LEVEL_4K) 254862306a36Sopenharmony_ci return 0; 254962306a36Sopenharmony_ci 255062306a36Sopenharmony_ci while (mmu_unsync_walk(parent, &pages)) { 255162306a36Sopenharmony_ci struct kvm_mmu_page *sp; 255262306a36Sopenharmony_ci 255362306a36Sopenharmony_ci for_each_sp(pages, sp, parents, i) { 255462306a36Sopenharmony_ci kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 255562306a36Sopenharmony_ci 
mmu_pages_clear_parents(&parents); 255662306a36Sopenharmony_ci zapped++; 255762306a36Sopenharmony_ci } 255862306a36Sopenharmony_ci } 255962306a36Sopenharmony_ci 256062306a36Sopenharmony_ci return zapped; 256162306a36Sopenharmony_ci} 256262306a36Sopenharmony_ci 256362306a36Sopenharmony_cistatic bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, 256462306a36Sopenharmony_ci struct kvm_mmu_page *sp, 256562306a36Sopenharmony_ci struct list_head *invalid_list, 256662306a36Sopenharmony_ci int *nr_zapped) 256762306a36Sopenharmony_ci{ 256862306a36Sopenharmony_ci bool list_unstable, zapped_root = false; 256962306a36Sopenharmony_ci 257062306a36Sopenharmony_ci lockdep_assert_held_write(&kvm->mmu_lock); 257162306a36Sopenharmony_ci trace_kvm_mmu_prepare_zap_page(sp); 257262306a36Sopenharmony_ci ++kvm->stat.mmu_shadow_zapped; 257362306a36Sopenharmony_ci *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); 257462306a36Sopenharmony_ci *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); 257562306a36Sopenharmony_ci kvm_mmu_unlink_parents(kvm, sp); 257662306a36Sopenharmony_ci 257762306a36Sopenharmony_ci /* Zapping children means active_mmu_pages has become unstable. */ 257862306a36Sopenharmony_ci list_unstable = *nr_zapped; 257962306a36Sopenharmony_ci 258062306a36Sopenharmony_ci if (!sp->role.invalid && sp_has_gptes(sp)) 258162306a36Sopenharmony_ci unaccount_shadowed(kvm, sp); 258262306a36Sopenharmony_ci 258362306a36Sopenharmony_ci if (sp->unsync) 258462306a36Sopenharmony_ci kvm_unlink_unsync_page(kvm, sp); 258562306a36Sopenharmony_ci if (!sp->root_count) { 258662306a36Sopenharmony_ci /* Count self */ 258762306a36Sopenharmony_ci (*nr_zapped)++; 258862306a36Sopenharmony_ci 258962306a36Sopenharmony_ci /* 259062306a36Sopenharmony_ci * Already invalid pages (previously active roots) are not on 259162306a36Sopenharmony_ci * the active page list. See list_del() in the "else" case of 259262306a36Sopenharmony_ci * !sp->root_count. 259362306a36Sopenharmony_ci */ 259462306a36Sopenharmony_ci if (sp->role.invalid) 259562306a36Sopenharmony_ci list_add(&sp->link, invalid_list); 259662306a36Sopenharmony_ci else 259762306a36Sopenharmony_ci list_move(&sp->link, invalid_list); 259862306a36Sopenharmony_ci kvm_unaccount_mmu_page(kvm, sp); 259962306a36Sopenharmony_ci } else { 260062306a36Sopenharmony_ci /* 260162306a36Sopenharmony_ci * Remove the active root from the active page list, the root 260262306a36Sopenharmony_ci * will be explicitly freed when the root_count hits zero. 260362306a36Sopenharmony_ci */ 260462306a36Sopenharmony_ci list_del(&sp->link); 260562306a36Sopenharmony_ci 260662306a36Sopenharmony_ci /* 260762306a36Sopenharmony_ci * Obsolete pages cannot be used on any vCPUs, see the comment 260862306a36Sopenharmony_ci * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also 260962306a36Sopenharmony_ci * treats invalid shadow pages as being obsolete. 261062306a36Sopenharmony_ci */ 261162306a36Sopenharmony_ci zapped_root = !is_obsolete_sp(kvm, sp); 261262306a36Sopenharmony_ci } 261362306a36Sopenharmony_ci 261462306a36Sopenharmony_ci if (sp->nx_huge_page_disallowed) 261562306a36Sopenharmony_ci unaccount_nx_huge_page(kvm, sp); 261662306a36Sopenharmony_ci 261762306a36Sopenharmony_ci sp->role.invalid = 1; 261862306a36Sopenharmony_ci 261962306a36Sopenharmony_ci /* 262062306a36Sopenharmony_ci * Make the request to free obsolete roots after marking the root 262162306a36Sopenharmony_ci * invalid, otherwise other vCPUs may not see it as invalid. 
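	 * Each vCPU that services KVM_REQ_MMU_FREE_OBSOLETE_ROOTS drops its
	 * reference to the now-invalid root, so root_count eventually reaches
	 * zero and the root page is actually freed.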
262262306a36Sopenharmony_ci */ 262362306a36Sopenharmony_ci if (zapped_root) 262462306a36Sopenharmony_ci kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); 262562306a36Sopenharmony_ci return list_unstable; 262662306a36Sopenharmony_ci} 262762306a36Sopenharmony_ci 262862306a36Sopenharmony_cistatic bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 262962306a36Sopenharmony_ci struct list_head *invalid_list) 263062306a36Sopenharmony_ci{ 263162306a36Sopenharmony_ci int nr_zapped; 263262306a36Sopenharmony_ci 263362306a36Sopenharmony_ci __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); 263462306a36Sopenharmony_ci return nr_zapped; 263562306a36Sopenharmony_ci} 263662306a36Sopenharmony_ci 263762306a36Sopenharmony_cistatic void kvm_mmu_commit_zap_page(struct kvm *kvm, 263862306a36Sopenharmony_ci struct list_head *invalid_list) 263962306a36Sopenharmony_ci{ 264062306a36Sopenharmony_ci struct kvm_mmu_page *sp, *nsp; 264162306a36Sopenharmony_ci 264262306a36Sopenharmony_ci if (list_empty(invalid_list)) 264362306a36Sopenharmony_ci return; 264462306a36Sopenharmony_ci 264562306a36Sopenharmony_ci /* 264662306a36Sopenharmony_ci * We need to make sure everyone sees our modifications to 264762306a36Sopenharmony_ci * the page tables and see changes to vcpu->mode here. The barrier 264862306a36Sopenharmony_ci * in the kvm_flush_remote_tlbs() achieves this. This pairs 264962306a36Sopenharmony_ci * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. 265062306a36Sopenharmony_ci * 265162306a36Sopenharmony_ci * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit 265262306a36Sopenharmony_ci * guest mode and/or lockless shadow page table walks. 265362306a36Sopenharmony_ci */ 265462306a36Sopenharmony_ci kvm_flush_remote_tlbs(kvm); 265562306a36Sopenharmony_ci 265662306a36Sopenharmony_ci list_for_each_entry_safe(sp, nsp, invalid_list, link) { 265762306a36Sopenharmony_ci WARN_ON_ONCE(!sp->role.invalid || sp->root_count); 265862306a36Sopenharmony_ci kvm_mmu_free_shadow_page(sp); 265962306a36Sopenharmony_ci } 266062306a36Sopenharmony_ci} 266162306a36Sopenharmony_ci 266262306a36Sopenharmony_cistatic unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, 266362306a36Sopenharmony_ci unsigned long nr_to_zap) 266462306a36Sopenharmony_ci{ 266562306a36Sopenharmony_ci unsigned long total_zapped = 0; 266662306a36Sopenharmony_ci struct kvm_mmu_page *sp, *tmp; 266762306a36Sopenharmony_ci LIST_HEAD(invalid_list); 266862306a36Sopenharmony_ci bool unstable; 266962306a36Sopenharmony_ci int nr_zapped; 267062306a36Sopenharmony_ci 267162306a36Sopenharmony_ci if (list_empty(&kvm->arch.active_mmu_pages)) 267262306a36Sopenharmony_ci return 0; 267362306a36Sopenharmony_ci 267462306a36Sopenharmony_cirestart: 267562306a36Sopenharmony_ci list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { 267662306a36Sopenharmony_ci /* 267762306a36Sopenharmony_ci * Don't zap active root pages, the page itself can't be freed 267862306a36Sopenharmony_ci * and zapping it will just force vCPUs to realloc and reload. 
267962306a36Sopenharmony_ci */ 268062306a36Sopenharmony_ci if (sp->root_count) 268162306a36Sopenharmony_ci continue; 268262306a36Sopenharmony_ci 268362306a36Sopenharmony_ci unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, 268462306a36Sopenharmony_ci &nr_zapped); 268562306a36Sopenharmony_ci total_zapped += nr_zapped; 268662306a36Sopenharmony_ci if (total_zapped >= nr_to_zap) 268762306a36Sopenharmony_ci break; 268862306a36Sopenharmony_ci 268962306a36Sopenharmony_ci if (unstable) 269062306a36Sopenharmony_ci goto restart; 269162306a36Sopenharmony_ci } 269262306a36Sopenharmony_ci 269362306a36Sopenharmony_ci kvm_mmu_commit_zap_page(kvm, &invalid_list); 269462306a36Sopenharmony_ci 269562306a36Sopenharmony_ci kvm->stat.mmu_recycled += total_zapped; 269662306a36Sopenharmony_ci return total_zapped; 269762306a36Sopenharmony_ci} 269862306a36Sopenharmony_ci 269962306a36Sopenharmony_cistatic inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) 270062306a36Sopenharmony_ci{ 270162306a36Sopenharmony_ci if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) 270262306a36Sopenharmony_ci return kvm->arch.n_max_mmu_pages - 270362306a36Sopenharmony_ci kvm->arch.n_used_mmu_pages; 270462306a36Sopenharmony_ci 270562306a36Sopenharmony_ci return 0; 270662306a36Sopenharmony_ci} 270762306a36Sopenharmony_ci 270862306a36Sopenharmony_cistatic int make_mmu_pages_available(struct kvm_vcpu *vcpu) 270962306a36Sopenharmony_ci{ 271062306a36Sopenharmony_ci unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); 271162306a36Sopenharmony_ci 271262306a36Sopenharmony_ci if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) 271362306a36Sopenharmony_ci return 0; 271462306a36Sopenharmony_ci 271562306a36Sopenharmony_ci kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); 271662306a36Sopenharmony_ci 271762306a36Sopenharmony_ci /* 271862306a36Sopenharmony_ci * Note, this check is intentionally soft, it only guarantees that one 271962306a36Sopenharmony_ci * page is available, while the caller may end up allocating as many as 272062306a36Sopenharmony_ci * four pages, e.g. for PAE roots or for 5-level paging. Temporarily 272162306a36Sopenharmony_ci * exceeding the (arbitrary by default) limit will not harm the host, 272262306a36Sopenharmony_ci * being too aggressive may unnecessarily kill the guest, and getting an 272362306a36Sopenharmony_ci * exact count is far more trouble than it's worth, especially in the 272462306a36Sopenharmony_ci * page fault paths. 
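	 *
	 * E.g. with KVM_MIN_FREE_MMU_PAGES and KVM_REFILL_PAGES at their
	 * long-standing values of 5 and 25, a vCPU that finds only 3 pages
	 * available zaps up to 22 of the oldest shadow pages and may then
	 * still allocate 4 more for PAE roots; briefly dipping back below
	 * the refill target is exactly the "soft" behavior described above.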
272562306a36Sopenharmony_ci */ 272662306a36Sopenharmony_ci if (!kvm_mmu_available_pages(vcpu->kvm)) 272762306a36Sopenharmony_ci return -ENOSPC; 272862306a36Sopenharmony_ci return 0; 272962306a36Sopenharmony_ci} 273062306a36Sopenharmony_ci 273162306a36Sopenharmony_ci/* 273262306a36Sopenharmony_ci * Changing the number of mmu pages allocated to the vm 273362306a36Sopenharmony_ci * Note: if goal_nr_mmu_pages is too small, you will get dead lock 273462306a36Sopenharmony_ci */ 273562306a36Sopenharmony_civoid kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) 273662306a36Sopenharmony_ci{ 273762306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 273862306a36Sopenharmony_ci 273962306a36Sopenharmony_ci if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 274062306a36Sopenharmony_ci kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - 274162306a36Sopenharmony_ci goal_nr_mmu_pages); 274262306a36Sopenharmony_ci 274362306a36Sopenharmony_ci goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 274462306a36Sopenharmony_ci } 274562306a36Sopenharmony_ci 274662306a36Sopenharmony_ci kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; 274762306a36Sopenharmony_ci 274862306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 274962306a36Sopenharmony_ci} 275062306a36Sopenharmony_ci 275162306a36Sopenharmony_ciint kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 275262306a36Sopenharmony_ci{ 275362306a36Sopenharmony_ci struct kvm_mmu_page *sp; 275462306a36Sopenharmony_ci LIST_HEAD(invalid_list); 275562306a36Sopenharmony_ci int r; 275662306a36Sopenharmony_ci 275762306a36Sopenharmony_ci r = 0; 275862306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 275962306a36Sopenharmony_ci for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { 276062306a36Sopenharmony_ci r = 1; 276162306a36Sopenharmony_ci kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 276262306a36Sopenharmony_ci } 276362306a36Sopenharmony_ci kvm_mmu_commit_zap_page(kvm, &invalid_list); 276462306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 276562306a36Sopenharmony_ci 276662306a36Sopenharmony_ci return r; 276762306a36Sopenharmony_ci} 276862306a36Sopenharmony_ci 276962306a36Sopenharmony_cistatic int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 277062306a36Sopenharmony_ci{ 277162306a36Sopenharmony_ci gpa_t gpa; 277262306a36Sopenharmony_ci int r; 277362306a36Sopenharmony_ci 277462306a36Sopenharmony_ci if (vcpu->arch.mmu->root_role.direct) 277562306a36Sopenharmony_ci return 0; 277662306a36Sopenharmony_ci 277762306a36Sopenharmony_ci gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 277862306a36Sopenharmony_ci 277962306a36Sopenharmony_ci r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 278062306a36Sopenharmony_ci 278162306a36Sopenharmony_ci return r; 278262306a36Sopenharmony_ci} 278362306a36Sopenharmony_ci 278462306a36Sopenharmony_cistatic void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 278562306a36Sopenharmony_ci{ 278662306a36Sopenharmony_ci trace_kvm_mmu_unsync_page(sp); 278762306a36Sopenharmony_ci ++kvm->stat.mmu_unsync; 278862306a36Sopenharmony_ci sp->unsync = 1; 278962306a36Sopenharmony_ci 279062306a36Sopenharmony_ci kvm_mmu_mark_parents_unsync(sp); 279162306a36Sopenharmony_ci} 279262306a36Sopenharmony_ci 279362306a36Sopenharmony_ci/* 279462306a36Sopenharmony_ci * Attempt to unsync any shadow pages that can be reached by the specified gfn, 279562306a36Sopenharmony_ci * KVM is creating a writable mapping for said gfn. 
Returns 0 if all pages 279662306a36Sopenharmony_ci * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must 279762306a36Sopenharmony_ci * be write-protected. 279862306a36Sopenharmony_ci */ 279962306a36Sopenharmony_ciint mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, 280062306a36Sopenharmony_ci gfn_t gfn, bool can_unsync, bool prefetch) 280162306a36Sopenharmony_ci{ 280262306a36Sopenharmony_ci struct kvm_mmu_page *sp; 280362306a36Sopenharmony_ci bool locked = false; 280462306a36Sopenharmony_ci 280562306a36Sopenharmony_ci /* 280662306a36Sopenharmony_ci * Force write-protection if the page is being tracked. Note, the page 280762306a36Sopenharmony_ci * track machinery is used to write-protect upper-level shadow pages, 280862306a36Sopenharmony_ci * i.e. this guards the role.level == 4K assertion below! 280962306a36Sopenharmony_ci */ 281062306a36Sopenharmony_ci if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) 281162306a36Sopenharmony_ci return -EPERM; 281262306a36Sopenharmony_ci 281362306a36Sopenharmony_ci /* 281462306a36Sopenharmony_ci * The page is not write-tracked, mark existing shadow pages unsync 281562306a36Sopenharmony_ci * unless KVM is synchronizing an unsync SP (can_unsync = false). In 281662306a36Sopenharmony_ci * that case, KVM must complete emulation of the guest TLB flush before 281762306a36Sopenharmony_ci * allowing shadow pages to become unsync (writable by the guest). 281862306a36Sopenharmony_ci */ 281962306a36Sopenharmony_ci for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { 282062306a36Sopenharmony_ci if (!can_unsync) 282162306a36Sopenharmony_ci return -EPERM; 282262306a36Sopenharmony_ci 282362306a36Sopenharmony_ci if (sp->unsync) 282462306a36Sopenharmony_ci continue; 282562306a36Sopenharmony_ci 282662306a36Sopenharmony_ci if (prefetch) 282762306a36Sopenharmony_ci return -EEXIST; 282862306a36Sopenharmony_ci 282962306a36Sopenharmony_ci /* 283062306a36Sopenharmony_ci * TDP MMU page faults require an additional spinlock as they 283162306a36Sopenharmony_ci * run with mmu_lock held for read, not write, and the unsync 283262306a36Sopenharmony_ci * logic is not thread safe. Take the spinklock regardless of 283362306a36Sopenharmony_ci * the MMU type to avoid extra conditionals/parameters, there's 283462306a36Sopenharmony_ci * no meaningful penalty if mmu_lock is held for write. 283562306a36Sopenharmony_ci */ 283662306a36Sopenharmony_ci if (!locked) { 283762306a36Sopenharmony_ci locked = true; 283862306a36Sopenharmony_ci spin_lock(&kvm->arch.mmu_unsync_pages_lock); 283962306a36Sopenharmony_ci 284062306a36Sopenharmony_ci /* 284162306a36Sopenharmony_ci * Recheck after taking the spinlock, a different vCPU 284262306a36Sopenharmony_ci * may have since marked the page unsync. A false 284362306a36Sopenharmony_ci * positive on the unprotected check above is not 284462306a36Sopenharmony_ci * possible as clearing sp->unsync _must_ hold mmu_lock 284562306a36Sopenharmony_ci * for write, i.e. unsync cannot transition from 0->1 284662306a36Sopenharmony_ci * while this CPU holds mmu_lock for read (or write). 
284762306a36Sopenharmony_ci */ 284862306a36Sopenharmony_ci if (READ_ONCE(sp->unsync)) 284962306a36Sopenharmony_ci continue; 285062306a36Sopenharmony_ci } 285162306a36Sopenharmony_ci 285262306a36Sopenharmony_ci WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); 285362306a36Sopenharmony_ci kvm_unsync_page(kvm, sp); 285462306a36Sopenharmony_ci } 285562306a36Sopenharmony_ci if (locked) 285662306a36Sopenharmony_ci spin_unlock(&kvm->arch.mmu_unsync_pages_lock); 285762306a36Sopenharmony_ci 285862306a36Sopenharmony_ci /* 285962306a36Sopenharmony_ci * We need to ensure that the marking of unsync pages is visible 286062306a36Sopenharmony_ci * before the SPTE is updated to allow writes because 286162306a36Sopenharmony_ci * kvm_mmu_sync_roots() checks the unsync flags without holding 286262306a36Sopenharmony_ci * the MMU lock and so can race with this. If the SPTE was updated 286362306a36Sopenharmony_ci * before the page had been marked as unsync-ed, something like the 286462306a36Sopenharmony_ci * following could happen: 286562306a36Sopenharmony_ci * 286662306a36Sopenharmony_ci * CPU 1 CPU 2 286762306a36Sopenharmony_ci * --------------------------------------------------------------------- 286862306a36Sopenharmony_ci * 1.2 Host updates SPTE 286962306a36Sopenharmony_ci * to be writable 287062306a36Sopenharmony_ci * 2.1 Guest writes a GPTE for GVA X. 287162306a36Sopenharmony_ci * (GPTE being in the guest page table shadowed 287262306a36Sopenharmony_ci * by the SP from CPU 1.) 287362306a36Sopenharmony_ci * This reads SPTE during the page table walk. 287462306a36Sopenharmony_ci * Since SPTE.W is read as 1, there is no 287562306a36Sopenharmony_ci * fault. 287662306a36Sopenharmony_ci * 287762306a36Sopenharmony_ci * 2.2 Guest issues TLB flush. 287862306a36Sopenharmony_ci * That causes a VM Exit. 287962306a36Sopenharmony_ci * 288062306a36Sopenharmony_ci * 2.3 Walking of unsync pages sees sp->unsync is 288162306a36Sopenharmony_ci * false and skips the page. 288262306a36Sopenharmony_ci * 288362306a36Sopenharmony_ci * 2.4 Guest accesses GVA X. 288462306a36Sopenharmony_ci * Since the mapping in the SP was not updated, 288562306a36Sopenharmony_ci * so the old mapping for GVA X incorrectly 288662306a36Sopenharmony_ci * gets used. 288762306a36Sopenharmony_ci * 1.1 Host marks SP 288862306a36Sopenharmony_ci * as unsync 288962306a36Sopenharmony_ci * (sp->unsync = true) 289062306a36Sopenharmony_ci * 289162306a36Sopenharmony_ci * The write barrier below ensures that 1.1 happens before 1.2 and thus 289262306a36Sopenharmony_ci * the situation in 2.4 does not arise. It pairs with the read barrier 289362306a36Sopenharmony_ci * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3. 
289462306a36Sopenharmony_ci */ 289562306a36Sopenharmony_ci smp_wmb(); 289662306a36Sopenharmony_ci 289762306a36Sopenharmony_ci return 0; 289862306a36Sopenharmony_ci} 289962306a36Sopenharmony_ci 290062306a36Sopenharmony_cistatic int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, 290162306a36Sopenharmony_ci u64 *sptep, unsigned int pte_access, gfn_t gfn, 290262306a36Sopenharmony_ci kvm_pfn_t pfn, struct kvm_page_fault *fault) 290362306a36Sopenharmony_ci{ 290462306a36Sopenharmony_ci struct kvm_mmu_page *sp = sptep_to_sp(sptep); 290562306a36Sopenharmony_ci int level = sp->role.level; 290662306a36Sopenharmony_ci int was_rmapped = 0; 290762306a36Sopenharmony_ci int ret = RET_PF_FIXED; 290862306a36Sopenharmony_ci bool flush = false; 290962306a36Sopenharmony_ci bool wrprot; 291062306a36Sopenharmony_ci u64 spte; 291162306a36Sopenharmony_ci 291262306a36Sopenharmony_ci /* Prefetching always gets a writable pfn. */ 291362306a36Sopenharmony_ci bool host_writable = !fault || fault->map_writable; 291462306a36Sopenharmony_ci bool prefetch = !fault || fault->prefetch; 291562306a36Sopenharmony_ci bool write_fault = fault && fault->write; 291662306a36Sopenharmony_ci 291762306a36Sopenharmony_ci if (unlikely(is_noslot_pfn(pfn))) { 291862306a36Sopenharmony_ci vcpu->stat.pf_mmio_spte_created++; 291962306a36Sopenharmony_ci mark_mmio_spte(vcpu, sptep, gfn, pte_access); 292062306a36Sopenharmony_ci return RET_PF_EMULATE; 292162306a36Sopenharmony_ci } 292262306a36Sopenharmony_ci 292362306a36Sopenharmony_ci if (is_shadow_present_pte(*sptep)) { 292462306a36Sopenharmony_ci /* 292562306a36Sopenharmony_ci * If we overwrite a PTE page pointer with a 2MB PMD, unlink 292662306a36Sopenharmony_ci * the parent of the now unreachable PTE. 292762306a36Sopenharmony_ci */ 292862306a36Sopenharmony_ci if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { 292962306a36Sopenharmony_ci struct kvm_mmu_page *child; 293062306a36Sopenharmony_ci u64 pte = *sptep; 293162306a36Sopenharmony_ci 293262306a36Sopenharmony_ci child = spte_to_child_sp(pte); 293362306a36Sopenharmony_ci drop_parent_pte(vcpu->kvm, child, sptep); 293462306a36Sopenharmony_ci flush = true; 293562306a36Sopenharmony_ci } else if (pfn != spte_to_pfn(*sptep)) { 293662306a36Sopenharmony_ci drop_spte(vcpu->kvm, sptep); 293762306a36Sopenharmony_ci flush = true; 293862306a36Sopenharmony_ci } else 293962306a36Sopenharmony_ci was_rmapped = 1; 294062306a36Sopenharmony_ci } 294162306a36Sopenharmony_ci 294262306a36Sopenharmony_ci wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, 294362306a36Sopenharmony_ci true, host_writable, &spte); 294462306a36Sopenharmony_ci 294562306a36Sopenharmony_ci if (*sptep == spte) { 294662306a36Sopenharmony_ci ret = RET_PF_SPURIOUS; 294762306a36Sopenharmony_ci } else { 294862306a36Sopenharmony_ci flush |= mmu_spte_update(sptep, spte); 294962306a36Sopenharmony_ci trace_kvm_mmu_set_spte(level, gfn, sptep); 295062306a36Sopenharmony_ci } 295162306a36Sopenharmony_ci 295262306a36Sopenharmony_ci if (wrprot) { 295362306a36Sopenharmony_ci if (write_fault) 295462306a36Sopenharmony_ci ret = RET_PF_EMULATE; 295562306a36Sopenharmony_ci } 295662306a36Sopenharmony_ci 295762306a36Sopenharmony_ci if (flush) 295862306a36Sopenharmony_ci kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); 295962306a36Sopenharmony_ci 296062306a36Sopenharmony_ci if (!was_rmapped) { 296162306a36Sopenharmony_ci WARN_ON_ONCE(ret == RET_PF_SPURIOUS); 296262306a36Sopenharmony_ci rmap_add(vcpu, slot, sptep, gfn, pte_access); 296362306a36Sopenharmony_ci } else { 
296462306a36Sopenharmony_ci /* Already rmapped but the pte_access bits may have changed. */ 296562306a36Sopenharmony_ci kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); 296662306a36Sopenharmony_ci } 296762306a36Sopenharmony_ci 296862306a36Sopenharmony_ci return ret; 296962306a36Sopenharmony_ci} 297062306a36Sopenharmony_ci 297162306a36Sopenharmony_cistatic int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 297262306a36Sopenharmony_ci struct kvm_mmu_page *sp, 297362306a36Sopenharmony_ci u64 *start, u64 *end) 297462306a36Sopenharmony_ci{ 297562306a36Sopenharmony_ci struct page *pages[PTE_PREFETCH_NUM]; 297662306a36Sopenharmony_ci struct kvm_memory_slot *slot; 297762306a36Sopenharmony_ci unsigned int access = sp->role.access; 297862306a36Sopenharmony_ci int i, ret; 297962306a36Sopenharmony_ci gfn_t gfn; 298062306a36Sopenharmony_ci 298162306a36Sopenharmony_ci gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); 298262306a36Sopenharmony_ci slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); 298362306a36Sopenharmony_ci if (!slot) 298462306a36Sopenharmony_ci return -1; 298562306a36Sopenharmony_ci 298662306a36Sopenharmony_ci ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); 298762306a36Sopenharmony_ci if (ret <= 0) 298862306a36Sopenharmony_ci return -1; 298962306a36Sopenharmony_ci 299062306a36Sopenharmony_ci for (i = 0; i < ret; i++, gfn++, start++) { 299162306a36Sopenharmony_ci mmu_set_spte(vcpu, slot, start, access, gfn, 299262306a36Sopenharmony_ci page_to_pfn(pages[i]), NULL); 299362306a36Sopenharmony_ci put_page(pages[i]); 299462306a36Sopenharmony_ci } 299562306a36Sopenharmony_ci 299662306a36Sopenharmony_ci return 0; 299762306a36Sopenharmony_ci} 299862306a36Sopenharmony_ci 299962306a36Sopenharmony_cistatic void __direct_pte_prefetch(struct kvm_vcpu *vcpu, 300062306a36Sopenharmony_ci struct kvm_mmu_page *sp, u64 *sptep) 300162306a36Sopenharmony_ci{ 300262306a36Sopenharmony_ci u64 *spte, *start = NULL; 300362306a36Sopenharmony_ci int i; 300462306a36Sopenharmony_ci 300562306a36Sopenharmony_ci WARN_ON_ONCE(!sp->role.direct); 300662306a36Sopenharmony_ci 300762306a36Sopenharmony_ci i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); 300862306a36Sopenharmony_ci spte = sp->spt + i; 300962306a36Sopenharmony_ci 301062306a36Sopenharmony_ci for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 301162306a36Sopenharmony_ci if (is_shadow_present_pte(*spte) || spte == sptep) { 301262306a36Sopenharmony_ci if (!start) 301362306a36Sopenharmony_ci continue; 301462306a36Sopenharmony_ci if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 301562306a36Sopenharmony_ci return; 301662306a36Sopenharmony_ci start = NULL; 301762306a36Sopenharmony_ci } else if (!start) 301862306a36Sopenharmony_ci start = spte; 301962306a36Sopenharmony_ci } 302062306a36Sopenharmony_ci if (start) 302162306a36Sopenharmony_ci direct_pte_prefetch_many(vcpu, sp, start, spte); 302262306a36Sopenharmony_ci} 302362306a36Sopenharmony_ci 302462306a36Sopenharmony_cistatic void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) 302562306a36Sopenharmony_ci{ 302662306a36Sopenharmony_ci struct kvm_mmu_page *sp; 302762306a36Sopenharmony_ci 302862306a36Sopenharmony_ci sp = sptep_to_sp(sptep); 302962306a36Sopenharmony_ci 303062306a36Sopenharmony_ci /* 303162306a36Sopenharmony_ci * Without accessed bits, there's no way to distinguish between 303262306a36Sopenharmony_ci * actually accessed translations and prefetched, so disable pte 303362306a36Sopenharmony_ci * prefetch if accessed bits aren't available. 
303462306a36Sopenharmony_ci */ 303562306a36Sopenharmony_ci if (sp_ad_disabled(sp)) 303662306a36Sopenharmony_ci return; 303762306a36Sopenharmony_ci 303862306a36Sopenharmony_ci if (sp->role.level > PG_LEVEL_4K) 303962306a36Sopenharmony_ci return; 304062306a36Sopenharmony_ci 304162306a36Sopenharmony_ci /* 304262306a36Sopenharmony_ci * If addresses are being invalidated, skip prefetching to avoid 304362306a36Sopenharmony_ci * accidentally prefetching those addresses. 304462306a36Sopenharmony_ci */ 304562306a36Sopenharmony_ci if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) 304662306a36Sopenharmony_ci return; 304762306a36Sopenharmony_ci 304862306a36Sopenharmony_ci __direct_pte_prefetch(vcpu, sp, sptep); 304962306a36Sopenharmony_ci} 305062306a36Sopenharmony_ci 305162306a36Sopenharmony_ci/* 305262306a36Sopenharmony_ci * Lookup the mapping level for @gfn in the current mm. 305362306a36Sopenharmony_ci * 305462306a36Sopenharmony_ci * WARNING! Use of host_pfn_mapping_level() requires the caller and the end 305562306a36Sopenharmony_ci * consumer to be tied into KVM's handlers for MMU notifier events! 305662306a36Sopenharmony_ci * 305762306a36Sopenharmony_ci * There are several ways to safely use this helper: 305862306a36Sopenharmony_ci * 305962306a36Sopenharmony_ci * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before 306062306a36Sopenharmony_ci * consuming it. In this case, mmu_lock doesn't need to be held during the 306162306a36Sopenharmony_ci * lookup, but it does need to be held while checking the MMU notifier. 306262306a36Sopenharmony_ci * 306362306a36Sopenharmony_ci * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation 306462306a36Sopenharmony_ci * event for the hva. This can be done by explicit checking the MMU notifier 306562306a36Sopenharmony_ci * or by ensuring that KVM already has a valid mapping that covers the hva. 306662306a36Sopenharmony_ci * 306762306a36Sopenharmony_ci * - Do not use the result to install new mappings, e.g. use the host mapping 306862306a36Sopenharmony_ci * level only to decide whether or not to zap an entry. In this case, it's 306962306a36Sopenharmony_ci * not required to hold mmu_lock (though it's highly likely the caller will 307062306a36Sopenharmony_ci * want to hold mmu_lock anyways, e.g. to modify SPTEs). 307162306a36Sopenharmony_ci * 307262306a36Sopenharmony_ci * Note! The lookup can still race with modifications to host page tables, but 307362306a36Sopenharmony_ci * the above "rules" ensure KVM will not _consume_ the result of the walk if a 307462306a36Sopenharmony_ci * race with the primary MMU occurs. 
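 *
 * A simplified sketch of the first pattern (the page fault path does the
 * equivalent via fault->mmu_seq and is_page_fault_stale()):
 *
 *	mmu_seq = kvm->mmu_invalidate_seq;
 *	smp_rmb();
 *	level = host_pfn_mapping_level(kvm, gfn, slot);
 *	...
 *	read_lock(&kvm->mmu_lock);
 *	if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva))
 *		goto retry;	<- "level" may be stale, do not consume it
 *	<safe to consume "level">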
307562306a36Sopenharmony_ci */ 307662306a36Sopenharmony_cistatic int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, 307762306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 307862306a36Sopenharmony_ci{ 307962306a36Sopenharmony_ci int level = PG_LEVEL_4K; 308062306a36Sopenharmony_ci unsigned long hva; 308162306a36Sopenharmony_ci unsigned long flags; 308262306a36Sopenharmony_ci pgd_t pgd; 308362306a36Sopenharmony_ci p4d_t p4d; 308462306a36Sopenharmony_ci pud_t pud; 308562306a36Sopenharmony_ci pmd_t pmd; 308662306a36Sopenharmony_ci 308762306a36Sopenharmony_ci /* 308862306a36Sopenharmony_ci * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() 308962306a36Sopenharmony_ci * is not solely for performance, it's also necessary to avoid the 309062306a36Sopenharmony_ci * "writable" check in __gfn_to_hva_many(), which will always fail on 309162306a36Sopenharmony_ci * read-only memslots due to gfn_to_hva() assuming writes. Earlier 309262306a36Sopenharmony_ci * page fault steps have already verified the guest isn't writing a 309362306a36Sopenharmony_ci * read-only memslot. 309462306a36Sopenharmony_ci */ 309562306a36Sopenharmony_ci hva = __gfn_to_hva_memslot(slot, gfn); 309662306a36Sopenharmony_ci 309762306a36Sopenharmony_ci /* 309862306a36Sopenharmony_ci * Disable IRQs to prevent concurrent tear down of host page tables, 309962306a36Sopenharmony_ci * e.g. if the primary MMU promotes a P*D to a huge page and then frees 310062306a36Sopenharmony_ci * the original page table. 310162306a36Sopenharmony_ci */ 310262306a36Sopenharmony_ci local_irq_save(flags); 310362306a36Sopenharmony_ci 310462306a36Sopenharmony_ci /* 310562306a36Sopenharmony_ci * Read each entry once. As above, a non-leaf entry can be promoted to 310662306a36Sopenharmony_ci * a huge page _during_ this walk. Re-reading the entry could send the 310762306a36Sopenharmony_ci * walk into the weeds, e.g. p*d_large() returns false (sees the old 310862306a36Sopenharmony_ci * value) and then p*d_offset() walks into the target huge page instead 310962306a36Sopenharmony_ci * of the old page table (sees the new value).
311062306a36Sopenharmony_ci */ 311162306a36Sopenharmony_ci pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); 311262306a36Sopenharmony_ci if (pgd_none(pgd)) 311362306a36Sopenharmony_ci goto out; 311462306a36Sopenharmony_ci 311562306a36Sopenharmony_ci p4d = READ_ONCE(*p4d_offset(&pgd, hva)); 311662306a36Sopenharmony_ci if (p4d_none(p4d) || !p4d_present(p4d)) 311762306a36Sopenharmony_ci goto out; 311862306a36Sopenharmony_ci 311962306a36Sopenharmony_ci pud = READ_ONCE(*pud_offset(&p4d, hva)); 312062306a36Sopenharmony_ci if (pud_none(pud) || !pud_present(pud)) 312162306a36Sopenharmony_ci goto out; 312262306a36Sopenharmony_ci 312362306a36Sopenharmony_ci if (pud_large(pud)) { 312462306a36Sopenharmony_ci level = PG_LEVEL_1G; 312562306a36Sopenharmony_ci goto out; 312662306a36Sopenharmony_ci } 312762306a36Sopenharmony_ci 312862306a36Sopenharmony_ci pmd = READ_ONCE(*pmd_offset(&pud, hva)); 312962306a36Sopenharmony_ci if (pmd_none(pmd) || !pmd_present(pmd)) 313062306a36Sopenharmony_ci goto out; 313162306a36Sopenharmony_ci 313262306a36Sopenharmony_ci if (pmd_large(pmd)) 313362306a36Sopenharmony_ci level = PG_LEVEL_2M; 313462306a36Sopenharmony_ci 313562306a36Sopenharmony_ciout: 313662306a36Sopenharmony_ci local_irq_restore(flags); 313762306a36Sopenharmony_ci return level; 313862306a36Sopenharmony_ci} 313962306a36Sopenharmony_ci 314062306a36Sopenharmony_ciint kvm_mmu_max_mapping_level(struct kvm *kvm, 314162306a36Sopenharmony_ci const struct kvm_memory_slot *slot, gfn_t gfn, 314262306a36Sopenharmony_ci int max_level) 314362306a36Sopenharmony_ci{ 314462306a36Sopenharmony_ci struct kvm_lpage_info *linfo; 314562306a36Sopenharmony_ci int host_level; 314662306a36Sopenharmony_ci 314762306a36Sopenharmony_ci max_level = min(max_level, max_huge_page_level); 314862306a36Sopenharmony_ci for ( ; max_level > PG_LEVEL_4K; max_level--) { 314962306a36Sopenharmony_ci linfo = lpage_info_slot(gfn, slot, max_level); 315062306a36Sopenharmony_ci if (!linfo->disallow_lpage) 315162306a36Sopenharmony_ci break; 315262306a36Sopenharmony_ci } 315362306a36Sopenharmony_ci 315462306a36Sopenharmony_ci if (max_level == PG_LEVEL_4K) 315562306a36Sopenharmony_ci return PG_LEVEL_4K; 315662306a36Sopenharmony_ci 315762306a36Sopenharmony_ci host_level = host_pfn_mapping_level(kvm, gfn, slot); 315862306a36Sopenharmony_ci return min(host_level, max_level); 315962306a36Sopenharmony_ci} 316062306a36Sopenharmony_ci 316162306a36Sopenharmony_civoid kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 316262306a36Sopenharmony_ci{ 316362306a36Sopenharmony_ci struct kvm_memory_slot *slot = fault->slot; 316462306a36Sopenharmony_ci kvm_pfn_t mask; 316562306a36Sopenharmony_ci 316662306a36Sopenharmony_ci fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; 316762306a36Sopenharmony_ci 316862306a36Sopenharmony_ci if (unlikely(fault->max_level == PG_LEVEL_4K)) 316962306a36Sopenharmony_ci return; 317062306a36Sopenharmony_ci 317162306a36Sopenharmony_ci if (is_error_noslot_pfn(fault->pfn)) 317262306a36Sopenharmony_ci return; 317362306a36Sopenharmony_ci 317462306a36Sopenharmony_ci if (kvm_slot_dirty_track_enabled(slot)) 317562306a36Sopenharmony_ci return; 317662306a36Sopenharmony_ci 317762306a36Sopenharmony_ci /* 317862306a36Sopenharmony_ci * Enforce the iTLB multihit workaround after capturing the requested 317962306a36Sopenharmony_ci * level, which will be used to do precise, accurate accounting. 
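	 * I.e. fault->req_level records the level KVM would have used absent
	 * the workaround; direct_map() later compares it against the level of
	 * each page table it links (fault->req_level >= it.level) to decide
	 * whether that page is a candidate for NX huge page recovery (see the
	 * account_nx_huge_page() call).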
318062306a36Sopenharmony_ci */ 318162306a36Sopenharmony_ci fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, 318262306a36Sopenharmony_ci fault->gfn, fault->max_level); 318362306a36Sopenharmony_ci if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) 318462306a36Sopenharmony_ci return; 318562306a36Sopenharmony_ci 318662306a36Sopenharmony_ci /* 318762306a36Sopenharmony_ci * mmu_invalidate_retry() was successful and mmu_lock is held, so 318862306a36Sopenharmony_ci * the pmd can't be split from under us. 318962306a36Sopenharmony_ci */ 319062306a36Sopenharmony_ci fault->goal_level = fault->req_level; 319162306a36Sopenharmony_ci mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; 319262306a36Sopenharmony_ci VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); 319362306a36Sopenharmony_ci fault->pfn &= ~mask; 319462306a36Sopenharmony_ci} 319562306a36Sopenharmony_ci 319662306a36Sopenharmony_civoid disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) 319762306a36Sopenharmony_ci{ 319862306a36Sopenharmony_ci if (cur_level > PG_LEVEL_4K && 319962306a36Sopenharmony_ci cur_level == fault->goal_level && 320062306a36Sopenharmony_ci is_shadow_present_pte(spte) && 320162306a36Sopenharmony_ci !is_large_pte(spte) && 320262306a36Sopenharmony_ci spte_to_child_sp(spte)->nx_huge_page_disallowed) { 320362306a36Sopenharmony_ci /* 320462306a36Sopenharmony_ci * A small SPTE exists for this pfn, but FNAME(fetch), 320562306a36Sopenharmony_ci * direct_map(), or kvm_tdp_mmu_map() would like to create a 320662306a36Sopenharmony_ci * large PTE instead: just force them to go down another level, 320762306a36Sopenharmony_ci * patching back for them into pfn the next 9 bits of the 320862306a36Sopenharmony_ci * address. 320962306a36Sopenharmony_ci */ 321062306a36Sopenharmony_ci u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - 321162306a36Sopenharmony_ci KVM_PAGES_PER_HPAGE(cur_level - 1); 321262306a36Sopenharmony_ci fault->pfn |= fault->gfn & page_mask; 321362306a36Sopenharmony_ci fault->goal_level--; 321462306a36Sopenharmony_ci } 321562306a36Sopenharmony_ci} 321662306a36Sopenharmony_ci 321762306a36Sopenharmony_cistatic int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 321862306a36Sopenharmony_ci{ 321962306a36Sopenharmony_ci struct kvm_shadow_walk_iterator it; 322062306a36Sopenharmony_ci struct kvm_mmu_page *sp; 322162306a36Sopenharmony_ci int ret; 322262306a36Sopenharmony_ci gfn_t base_gfn = fault->gfn; 322362306a36Sopenharmony_ci 322462306a36Sopenharmony_ci kvm_mmu_hugepage_adjust(vcpu, fault); 322562306a36Sopenharmony_ci 322662306a36Sopenharmony_ci trace_kvm_mmu_spte_requested(fault); 322762306a36Sopenharmony_ci for_each_shadow_entry(vcpu, fault->addr, it) { 322862306a36Sopenharmony_ci /* 322962306a36Sopenharmony_ci * We cannot overwrite existing page tables with an NX 323062306a36Sopenharmony_ci * large page, as the leaf could be executable. 
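		 * Instead, when the walk hits a present, non-huge SPTE whose
		 * child page was forced small by the workaround,
		 * disallowed_hugepage_adjust() below lowers fault->goal_level
		 * by one and folds the next slice of the gfn back into
		 * fault->pfn, so a smaller page is installed.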
323162306a36Sopenharmony_ci */ 323262306a36Sopenharmony_ci if (fault->nx_huge_page_workaround_enabled) 323362306a36Sopenharmony_ci disallowed_hugepage_adjust(fault, *it.sptep, it.level); 323462306a36Sopenharmony_ci 323562306a36Sopenharmony_ci base_gfn = gfn_round_for_level(fault->gfn, it.level); 323662306a36Sopenharmony_ci if (it.level == fault->goal_level) 323762306a36Sopenharmony_ci break; 323862306a36Sopenharmony_ci 323962306a36Sopenharmony_ci sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); 324062306a36Sopenharmony_ci if (sp == ERR_PTR(-EEXIST)) 324162306a36Sopenharmony_ci continue; 324262306a36Sopenharmony_ci 324362306a36Sopenharmony_ci link_shadow_page(vcpu, it.sptep, sp); 324462306a36Sopenharmony_ci if (fault->huge_page_disallowed) 324562306a36Sopenharmony_ci account_nx_huge_page(vcpu->kvm, sp, 324662306a36Sopenharmony_ci fault->req_level >= it.level); 324762306a36Sopenharmony_ci } 324862306a36Sopenharmony_ci 324962306a36Sopenharmony_ci if (WARN_ON_ONCE(it.level != fault->goal_level)) 325062306a36Sopenharmony_ci return -EFAULT; 325162306a36Sopenharmony_ci 325262306a36Sopenharmony_ci ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, 325362306a36Sopenharmony_ci base_gfn, fault->pfn, fault); 325462306a36Sopenharmony_ci if (ret == RET_PF_SPURIOUS) 325562306a36Sopenharmony_ci return ret; 325662306a36Sopenharmony_ci 325762306a36Sopenharmony_ci direct_pte_prefetch(vcpu, it.sptep); 325862306a36Sopenharmony_ci return ret; 325962306a36Sopenharmony_ci} 326062306a36Sopenharmony_ci 326162306a36Sopenharmony_cistatic void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) 326262306a36Sopenharmony_ci{ 326362306a36Sopenharmony_ci unsigned long hva = gfn_to_hva_memslot(slot, gfn); 326462306a36Sopenharmony_ci 326562306a36Sopenharmony_ci send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); 326662306a36Sopenharmony_ci} 326762306a36Sopenharmony_ci 326862306a36Sopenharmony_cistatic int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 326962306a36Sopenharmony_ci{ 327062306a36Sopenharmony_ci if (is_sigpending_pfn(fault->pfn)) { 327162306a36Sopenharmony_ci kvm_handle_signal_exit(vcpu); 327262306a36Sopenharmony_ci return -EINTR; 327362306a36Sopenharmony_ci } 327462306a36Sopenharmony_ci 327562306a36Sopenharmony_ci /* 327662306a36Sopenharmony_ci * Do not cache the mmio info caused by writing the readonly gfn 327762306a36Sopenharmony_ci * into the spte otherwise read access on readonly gfn also can 327862306a36Sopenharmony_ci * caused mmio page fault and treat it as mmio access. 327962306a36Sopenharmony_ci */ 328062306a36Sopenharmony_ci if (fault->pfn == KVM_PFN_ERR_RO_FAULT) 328162306a36Sopenharmony_ci return RET_PF_EMULATE; 328262306a36Sopenharmony_ci 328362306a36Sopenharmony_ci if (fault->pfn == KVM_PFN_ERR_HWPOISON) { 328462306a36Sopenharmony_ci kvm_send_hwpoison_signal(fault->slot, fault->gfn); 328562306a36Sopenharmony_ci return RET_PF_RETRY; 328662306a36Sopenharmony_ci } 328762306a36Sopenharmony_ci 328862306a36Sopenharmony_ci return -EFAULT; 328962306a36Sopenharmony_ci} 329062306a36Sopenharmony_ci 329162306a36Sopenharmony_cistatic int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, 329262306a36Sopenharmony_ci struct kvm_page_fault *fault, 329362306a36Sopenharmony_ci unsigned int access) 329462306a36Sopenharmony_ci{ 329562306a36Sopenharmony_ci gva_t gva = fault->is_tdp ? 
0 : fault->addr; 329662306a36Sopenharmony_ci 329762306a36Sopenharmony_ci vcpu_cache_mmio_info(vcpu, gva, fault->gfn, 329862306a36Sopenharmony_ci access & shadow_mmio_access_mask); 329962306a36Sopenharmony_ci 330062306a36Sopenharmony_ci /* 330162306a36Sopenharmony_ci * If MMIO caching is disabled, emulate immediately without 330262306a36Sopenharmony_ci * touching the shadow page tables as attempting to install an 330362306a36Sopenharmony_ci * MMIO SPTE will just be an expensive nop. 330462306a36Sopenharmony_ci */ 330562306a36Sopenharmony_ci if (unlikely(!enable_mmio_caching)) 330662306a36Sopenharmony_ci return RET_PF_EMULATE; 330762306a36Sopenharmony_ci 330862306a36Sopenharmony_ci /* 330962306a36Sopenharmony_ci * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, 331062306a36Sopenharmony_ci * any guest that generates such gfns is running nested and is being 331162306a36Sopenharmony_ci * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and 331262306a36Sopenharmony_ci * only if L1's MAXPHYADDR is inaccurate with respect to the 331362306a36Sopenharmony_ci * hardware's). 331462306a36Sopenharmony_ci */ 331562306a36Sopenharmony_ci if (unlikely(fault->gfn > kvm_mmu_max_gfn())) 331662306a36Sopenharmony_ci return RET_PF_EMULATE; 331762306a36Sopenharmony_ci 331862306a36Sopenharmony_ci return RET_PF_CONTINUE; 331962306a36Sopenharmony_ci} 332062306a36Sopenharmony_ci 332162306a36Sopenharmony_cistatic bool page_fault_can_be_fast(struct kvm_page_fault *fault) 332262306a36Sopenharmony_ci{ 332362306a36Sopenharmony_ci /* 332462306a36Sopenharmony_ci * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only 332562306a36Sopenharmony_ci * reach the common page fault handler if the SPTE has an invalid MMIO 332662306a36Sopenharmony_ci * generation number. Refreshing the MMIO generation needs to go down 332762306a36Sopenharmony_ci * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag! 332862306a36Sopenharmony_ci */ 332962306a36Sopenharmony_ci if (fault->rsvd) 333062306a36Sopenharmony_ci return false; 333162306a36Sopenharmony_ci 333262306a36Sopenharmony_ci /* 333362306a36Sopenharmony_ci * #PF can be fast if: 333462306a36Sopenharmony_ci * 333562306a36Sopenharmony_ci * 1. The shadow page table entry is not present and A/D bits are 333662306a36Sopenharmony_ci * disabled _by KVM_, which could mean that the fault is potentially 333762306a36Sopenharmony_ci * caused by access tracking (if enabled). If A/D bits are enabled 333862306a36Sopenharmony_ci * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D 333962306a36Sopenharmony_ci * bits for L2 and employ access tracking, but the fast page fault 334062306a36Sopenharmony_ci * mechanism only supports direct MMUs. 334162306a36Sopenharmony_ci * 2. The shadow page table entry is present, the access is a write, 334262306a36Sopenharmony_ci * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e. 334362306a36Sopenharmony_ci * the fault was caused by a write-protection violation. If the 334462306a36Sopenharmony_ci * SPTE is MMU-writable (determined later), the fault can be fixed 334562306a36Sopenharmony_ci * by setting the Writable bit, which can be done out of mmu_lock. 334662306a36Sopenharmony_ci */ 334762306a36Sopenharmony_ci if (!fault->present) 334862306a36Sopenharmony_ci return !kvm_ad_enabled(); 334962306a36Sopenharmony_ci 335062306a36Sopenharmony_ci /* 335162306a36Sopenharmony_ci * Note, instruction fetches and writes are mutually exclusive, ignore 335262306a36Sopenharmony_ci * the "exec" flag. 
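 * In other words, once an SPTE is present only write faults (potential
 * write-protection violations) are candidates for the fast path; a fetch
 * fault on a present SPTE always goes down the slow path.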
335362306a36Sopenharmony_ci */ 335462306a36Sopenharmony_ci return fault->write; 335562306a36Sopenharmony_ci} 335662306a36Sopenharmony_ci 335762306a36Sopenharmony_ci/* 335862306a36Sopenharmony_ci * Returns true if the SPTE was fixed successfully. Otherwise, 335962306a36Sopenharmony_ci * someone else modified the SPTE from its original value. 336062306a36Sopenharmony_ci */ 336162306a36Sopenharmony_cistatic bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, 336262306a36Sopenharmony_ci struct kvm_page_fault *fault, 336362306a36Sopenharmony_ci u64 *sptep, u64 old_spte, u64 new_spte) 336462306a36Sopenharmony_ci{ 336562306a36Sopenharmony_ci /* 336662306a36Sopenharmony_ci * Theoretically we could also set dirty bit (and flush TLB) here in 336762306a36Sopenharmony_ci * order to eliminate unnecessary PML logging. See comments in 336862306a36Sopenharmony_ci * set_spte. But fast_page_fault is very unlikely to happen with PML 336962306a36Sopenharmony_ci * enabled, so we do not do this. This might result in the same GPA 337062306a36Sopenharmony_ci * to be logged in PML buffer again when the write really happens, and 337162306a36Sopenharmony_ci * eventually to be called by mark_page_dirty twice. But it's also no 337262306a36Sopenharmony_ci * harm. This also avoids the TLB flush needed after setting dirty bit 337362306a36Sopenharmony_ci * so non-PML cases won't be impacted. 337462306a36Sopenharmony_ci * 337562306a36Sopenharmony_ci * Compare with set_spte where instead shadow_dirty_mask is set. 337662306a36Sopenharmony_ci */ 337762306a36Sopenharmony_ci if (!try_cmpxchg64(sptep, &old_spte, new_spte)) 337862306a36Sopenharmony_ci return false; 337962306a36Sopenharmony_ci 338062306a36Sopenharmony_ci if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) 338162306a36Sopenharmony_ci mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn); 338262306a36Sopenharmony_ci 338362306a36Sopenharmony_ci return true; 338462306a36Sopenharmony_ci} 338562306a36Sopenharmony_ci 338662306a36Sopenharmony_cistatic bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) 338762306a36Sopenharmony_ci{ 338862306a36Sopenharmony_ci if (fault->exec) 338962306a36Sopenharmony_ci return is_executable_pte(spte); 339062306a36Sopenharmony_ci 339162306a36Sopenharmony_ci if (fault->write) 339262306a36Sopenharmony_ci return is_writable_pte(spte); 339362306a36Sopenharmony_ci 339462306a36Sopenharmony_ci /* Fault was on Read access */ 339562306a36Sopenharmony_ci return spte & PT_PRESENT_MASK; 339662306a36Sopenharmony_ci} 339762306a36Sopenharmony_ci 339862306a36Sopenharmony_ci/* 339962306a36Sopenharmony_ci * Returns the last level spte pointer of the shadow page walk for the given 340062306a36Sopenharmony_ci * gpa, and sets *spte to the spte value. This spte may be non-preset. If no 340162306a36Sopenharmony_ci * walk could be performed, returns NULL and *spte does not contain valid data. 340262306a36Sopenharmony_ci * 340362306a36Sopenharmony_ci * Contract: 340462306a36Sopenharmony_ci * - Must be called between walk_shadow_page_lockless_{begin,end}. 340562306a36Sopenharmony_ci * - The returned sptep must not be used after walk_shadow_page_lockless_end. 
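 * - See fast_page_fault() below, which calls this between
 *   walk_shadow_page_lockless_begin() and _end() as required.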
340662306a36Sopenharmony_ci */ 340762306a36Sopenharmony_cistatic u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) 340862306a36Sopenharmony_ci{ 340962306a36Sopenharmony_ci struct kvm_shadow_walk_iterator iterator; 341062306a36Sopenharmony_ci u64 old_spte; 341162306a36Sopenharmony_ci u64 *sptep = NULL; 341262306a36Sopenharmony_ci 341362306a36Sopenharmony_ci for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { 341462306a36Sopenharmony_ci sptep = iterator.sptep; 341562306a36Sopenharmony_ci *spte = old_spte; 341662306a36Sopenharmony_ci } 341762306a36Sopenharmony_ci 341862306a36Sopenharmony_ci return sptep; 341962306a36Sopenharmony_ci} 342062306a36Sopenharmony_ci 342162306a36Sopenharmony_ci/* 342262306a36Sopenharmony_ci * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. 342362306a36Sopenharmony_ci */ 342462306a36Sopenharmony_cistatic int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 342562306a36Sopenharmony_ci{ 342662306a36Sopenharmony_ci struct kvm_mmu_page *sp; 342762306a36Sopenharmony_ci int ret = RET_PF_INVALID; 342862306a36Sopenharmony_ci u64 spte = 0ull; 342962306a36Sopenharmony_ci u64 *sptep = NULL; 343062306a36Sopenharmony_ci uint retry_count = 0; 343162306a36Sopenharmony_ci 343262306a36Sopenharmony_ci if (!page_fault_can_be_fast(fault)) 343362306a36Sopenharmony_ci return ret; 343462306a36Sopenharmony_ci 343562306a36Sopenharmony_ci walk_shadow_page_lockless_begin(vcpu); 343662306a36Sopenharmony_ci 343762306a36Sopenharmony_ci do { 343862306a36Sopenharmony_ci u64 new_spte; 343962306a36Sopenharmony_ci 344062306a36Sopenharmony_ci if (tdp_mmu_enabled) 344162306a36Sopenharmony_ci sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); 344262306a36Sopenharmony_ci else 344362306a36Sopenharmony_ci sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); 344462306a36Sopenharmony_ci 344562306a36Sopenharmony_ci if (!is_shadow_present_pte(spte)) 344662306a36Sopenharmony_ci break; 344762306a36Sopenharmony_ci 344862306a36Sopenharmony_ci sp = sptep_to_sp(sptep); 344962306a36Sopenharmony_ci if (!is_last_spte(spte, sp->role.level)) 345062306a36Sopenharmony_ci break; 345162306a36Sopenharmony_ci 345262306a36Sopenharmony_ci /* 345362306a36Sopenharmony_ci * Check whether the memory access that caused the fault would 345462306a36Sopenharmony_ci * still cause it if it were to be performed right now. If not, 345562306a36Sopenharmony_ci * then this is a spurious fault caused by TLB lazily flushed, 345662306a36Sopenharmony_ci * or some other CPU has already fixed the PTE after the 345762306a36Sopenharmony_ci * current CPU took the fault. 345862306a36Sopenharmony_ci * 345962306a36Sopenharmony_ci * Need not check the access of upper level table entries since 346062306a36Sopenharmony_ci * they are always ACC_ALL. 346162306a36Sopenharmony_ci */ 346262306a36Sopenharmony_ci if (is_access_allowed(fault, spte)) { 346362306a36Sopenharmony_ci ret = RET_PF_SPURIOUS; 346462306a36Sopenharmony_ci break; 346562306a36Sopenharmony_ci } 346662306a36Sopenharmony_ci 346762306a36Sopenharmony_ci new_spte = spte; 346862306a36Sopenharmony_ci 346962306a36Sopenharmony_ci /* 347062306a36Sopenharmony_ci * KVM only supports fixing page faults outside of MMU lock for 347162306a36Sopenharmony_ci * direct MMUs, nested MMUs are always indirect, and KVM always 347262306a36Sopenharmony_ci * uses A/D bits for non-nested MMUs. Thus, if A/D bits are 347362306a36Sopenharmony_ci * enabled, the SPTE can't be an access-tracked SPTE. 
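 * Hence the !kvm_ad_enabled() check below: when KVM's A/D bits are in
 * use, the access-track test can be skipped entirely.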
347462306a36Sopenharmony_ci */ 347562306a36Sopenharmony_ci if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) 347662306a36Sopenharmony_ci new_spte = restore_acc_track_spte(new_spte); 347762306a36Sopenharmony_ci 347862306a36Sopenharmony_ci /* 347962306a36Sopenharmony_ci * To keep things simple, only SPTEs that are MMU-writable can 348062306a36Sopenharmony_ci * be made fully writable outside of mmu_lock, e.g. only SPTEs 348162306a36Sopenharmony_ci * that were write-protected for dirty-logging or access 348262306a36Sopenharmony_ci * tracking are handled here. Don't bother checking if the 348362306a36Sopenharmony_ci * SPTE is writable to prioritize running with A/D bits enabled. 348462306a36Sopenharmony_ci * The is_access_allowed() check above handles the common case 348562306a36Sopenharmony_ci * of the fault being spurious, and the SPTE is known to be 348662306a36Sopenharmony_ci * shadow-present, i.e. except for access tracking restoration 348762306a36Sopenharmony_ci * making the new SPTE writable, the check is wasteful. 348862306a36Sopenharmony_ci */ 348962306a36Sopenharmony_ci if (fault->write && is_mmu_writable_spte(spte)) { 349062306a36Sopenharmony_ci new_spte |= PT_WRITABLE_MASK; 349162306a36Sopenharmony_ci 349262306a36Sopenharmony_ci /* 349362306a36Sopenharmony_ci * Do not fix write-permission on the large spte when 349462306a36Sopenharmony_ci * dirty logging is enabled. Since we only dirty the 349562306a36Sopenharmony_ci * first page into the dirty-bitmap in 349662306a36Sopenharmony_ci * fast_pf_fix_direct_spte(), other pages are missed 349762306a36Sopenharmony_ci * if its slot has dirty logging enabled. 349862306a36Sopenharmony_ci * 349962306a36Sopenharmony_ci * Instead, we let the slow page fault path create a 350062306a36Sopenharmony_ci * normal spte to fix the access. 350162306a36Sopenharmony_ci */ 350262306a36Sopenharmony_ci if (sp->role.level > PG_LEVEL_4K && 350362306a36Sopenharmony_ci kvm_slot_dirty_track_enabled(fault->slot)) 350462306a36Sopenharmony_ci break; 350562306a36Sopenharmony_ci } 350662306a36Sopenharmony_ci 350762306a36Sopenharmony_ci /* Verify that the fault can be handled in the fast path */ 350862306a36Sopenharmony_ci if (new_spte == spte || 350962306a36Sopenharmony_ci !is_access_allowed(fault, new_spte)) 351062306a36Sopenharmony_ci break; 351162306a36Sopenharmony_ci 351262306a36Sopenharmony_ci /* 351362306a36Sopenharmony_ci * Currently, fast page fault only works for direct mapping 351462306a36Sopenharmony_ci * since the gfn is not stable for indirect shadow page. See 351562306a36Sopenharmony_ci * Documentation/virt/kvm/locking.rst to get more detail. 
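 * The cmpxchg in fast_pf_fix_direct_spte() is what keeps this safe: if
 * another CPU zaps or changes the SPTE first, the exchange fails and the
 * fault is simply retried (or punted to the slow path after a few
 * attempts).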
351662306a36Sopenharmony_ci */ 351762306a36Sopenharmony_ci if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { 351862306a36Sopenharmony_ci ret = RET_PF_FIXED; 351962306a36Sopenharmony_ci break; 352062306a36Sopenharmony_ci } 352162306a36Sopenharmony_ci 352262306a36Sopenharmony_ci if (++retry_count > 4) { 352362306a36Sopenharmony_ci pr_warn_once("Fast #PF retrying more than 4 times.\n"); 352462306a36Sopenharmony_ci break; 352562306a36Sopenharmony_ci } 352662306a36Sopenharmony_ci 352762306a36Sopenharmony_ci } while (true); 352862306a36Sopenharmony_ci 352962306a36Sopenharmony_ci trace_fast_page_fault(vcpu, fault, sptep, spte, ret); 353062306a36Sopenharmony_ci walk_shadow_page_lockless_end(vcpu); 353162306a36Sopenharmony_ci 353262306a36Sopenharmony_ci if (ret != RET_PF_INVALID) 353362306a36Sopenharmony_ci vcpu->stat.pf_fast++; 353462306a36Sopenharmony_ci 353562306a36Sopenharmony_ci return ret; 353662306a36Sopenharmony_ci} 353762306a36Sopenharmony_ci 353862306a36Sopenharmony_cistatic void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, 353962306a36Sopenharmony_ci struct list_head *invalid_list) 354062306a36Sopenharmony_ci{ 354162306a36Sopenharmony_ci struct kvm_mmu_page *sp; 354262306a36Sopenharmony_ci 354362306a36Sopenharmony_ci if (!VALID_PAGE(*root_hpa)) 354462306a36Sopenharmony_ci return; 354562306a36Sopenharmony_ci 354662306a36Sopenharmony_ci sp = root_to_sp(*root_hpa); 354762306a36Sopenharmony_ci if (WARN_ON_ONCE(!sp)) 354862306a36Sopenharmony_ci return; 354962306a36Sopenharmony_ci 355062306a36Sopenharmony_ci if (is_tdp_mmu_page(sp)) 355162306a36Sopenharmony_ci kvm_tdp_mmu_put_root(kvm, sp, false); 355262306a36Sopenharmony_ci else if (!--sp->root_count && sp->role.invalid) 355362306a36Sopenharmony_ci kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 355462306a36Sopenharmony_ci 355562306a36Sopenharmony_ci *root_hpa = INVALID_PAGE; 355662306a36Sopenharmony_ci} 355762306a36Sopenharmony_ci 355862306a36Sopenharmony_ci/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ 355962306a36Sopenharmony_civoid kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, 356062306a36Sopenharmony_ci ulong roots_to_free) 356162306a36Sopenharmony_ci{ 356262306a36Sopenharmony_ci int i; 356362306a36Sopenharmony_ci LIST_HEAD(invalid_list); 356462306a36Sopenharmony_ci bool free_active_root; 356562306a36Sopenharmony_ci 356662306a36Sopenharmony_ci WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL); 356762306a36Sopenharmony_ci 356862306a36Sopenharmony_ci BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); 356962306a36Sopenharmony_ci 357062306a36Sopenharmony_ci /* Before acquiring the MMU lock, see if we need to do any real work. 
*/ 357162306a36Sopenharmony_ci free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT) 357262306a36Sopenharmony_ci && VALID_PAGE(mmu->root.hpa); 357362306a36Sopenharmony_ci 357462306a36Sopenharmony_ci if (!free_active_root) { 357562306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 357662306a36Sopenharmony_ci if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && 357762306a36Sopenharmony_ci VALID_PAGE(mmu->prev_roots[i].hpa)) 357862306a36Sopenharmony_ci break; 357962306a36Sopenharmony_ci 358062306a36Sopenharmony_ci if (i == KVM_MMU_NUM_PREV_ROOTS) 358162306a36Sopenharmony_ci return; 358262306a36Sopenharmony_ci } 358362306a36Sopenharmony_ci 358462306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 358562306a36Sopenharmony_ci 358662306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 358762306a36Sopenharmony_ci if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) 358862306a36Sopenharmony_ci mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, 358962306a36Sopenharmony_ci &invalid_list); 359062306a36Sopenharmony_ci 359162306a36Sopenharmony_ci if (free_active_root) { 359262306a36Sopenharmony_ci if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { 359362306a36Sopenharmony_ci /* Nothing to cleanup for dummy roots. */ 359462306a36Sopenharmony_ci } else if (root_to_sp(mmu->root.hpa)) { 359562306a36Sopenharmony_ci mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); 359662306a36Sopenharmony_ci } else if (mmu->pae_root) { 359762306a36Sopenharmony_ci for (i = 0; i < 4; ++i) { 359862306a36Sopenharmony_ci if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) 359962306a36Sopenharmony_ci continue; 360062306a36Sopenharmony_ci 360162306a36Sopenharmony_ci mmu_free_root_page(kvm, &mmu->pae_root[i], 360262306a36Sopenharmony_ci &invalid_list); 360362306a36Sopenharmony_ci mmu->pae_root[i] = INVALID_PAE_ROOT; 360462306a36Sopenharmony_ci } 360562306a36Sopenharmony_ci } 360662306a36Sopenharmony_ci mmu->root.hpa = INVALID_PAGE; 360762306a36Sopenharmony_ci mmu->root.pgd = 0; 360862306a36Sopenharmony_ci } 360962306a36Sopenharmony_ci 361062306a36Sopenharmony_ci kvm_mmu_commit_zap_page(kvm, &invalid_list); 361162306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 361262306a36Sopenharmony_ci} 361362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_free_roots); 361462306a36Sopenharmony_ci 361562306a36Sopenharmony_civoid kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) 361662306a36Sopenharmony_ci{ 361762306a36Sopenharmony_ci unsigned long roots_to_free = 0; 361862306a36Sopenharmony_ci struct kvm_mmu_page *sp; 361962306a36Sopenharmony_ci hpa_t root_hpa; 362062306a36Sopenharmony_ci int i; 362162306a36Sopenharmony_ci 362262306a36Sopenharmony_ci /* 362362306a36Sopenharmony_ci * This should not be called while L2 is active, L2 can't invalidate 362462306a36Sopenharmony_ci * _only_ its own roots, e.g. INVVPID unconditionally exits. 
362562306a36Sopenharmony_ci */ 362662306a36Sopenharmony_ci WARN_ON_ONCE(mmu->root_role.guest_mode); 362762306a36Sopenharmony_ci 362862306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 362962306a36Sopenharmony_ci root_hpa = mmu->prev_roots[i].hpa; 363062306a36Sopenharmony_ci if (!VALID_PAGE(root_hpa)) 363162306a36Sopenharmony_ci continue; 363262306a36Sopenharmony_ci 363362306a36Sopenharmony_ci sp = root_to_sp(root_hpa); 363462306a36Sopenharmony_ci if (!sp || sp->role.guest_mode) 363562306a36Sopenharmony_ci roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 363662306a36Sopenharmony_ci } 363762306a36Sopenharmony_ci 363862306a36Sopenharmony_ci kvm_mmu_free_roots(kvm, mmu, roots_to_free); 363962306a36Sopenharmony_ci} 364062306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); 364162306a36Sopenharmony_ci 364262306a36Sopenharmony_cistatic hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, 364362306a36Sopenharmony_ci u8 level) 364462306a36Sopenharmony_ci{ 364562306a36Sopenharmony_ci union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; 364662306a36Sopenharmony_ci struct kvm_mmu_page *sp; 364762306a36Sopenharmony_ci 364862306a36Sopenharmony_ci role.level = level; 364962306a36Sopenharmony_ci role.quadrant = quadrant; 365062306a36Sopenharmony_ci 365162306a36Sopenharmony_ci WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); 365262306a36Sopenharmony_ci WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); 365362306a36Sopenharmony_ci 365462306a36Sopenharmony_ci sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); 365562306a36Sopenharmony_ci ++sp->root_count; 365662306a36Sopenharmony_ci 365762306a36Sopenharmony_ci return __pa(sp->spt); 365862306a36Sopenharmony_ci} 365962306a36Sopenharmony_ci 366062306a36Sopenharmony_cistatic int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) 366162306a36Sopenharmony_ci{ 366262306a36Sopenharmony_ci struct kvm_mmu *mmu = vcpu->arch.mmu; 366362306a36Sopenharmony_ci u8 shadow_root_level = mmu->root_role.level; 366462306a36Sopenharmony_ci hpa_t root; 366562306a36Sopenharmony_ci unsigned i; 366662306a36Sopenharmony_ci int r; 366762306a36Sopenharmony_ci 366862306a36Sopenharmony_ci write_lock(&vcpu->kvm->mmu_lock); 366962306a36Sopenharmony_ci r = make_mmu_pages_available(vcpu); 367062306a36Sopenharmony_ci if (r < 0) 367162306a36Sopenharmony_ci goto out_unlock; 367262306a36Sopenharmony_ci 367362306a36Sopenharmony_ci if (tdp_mmu_enabled) { 367462306a36Sopenharmony_ci root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); 367562306a36Sopenharmony_ci mmu->root.hpa = root; 367662306a36Sopenharmony_ci } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { 367762306a36Sopenharmony_ci root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); 367862306a36Sopenharmony_ci mmu->root.hpa = root; 367962306a36Sopenharmony_ci } else if (shadow_root_level == PT32E_ROOT_LEVEL) { 368062306a36Sopenharmony_ci if (WARN_ON_ONCE(!mmu->pae_root)) { 368162306a36Sopenharmony_ci r = -EIO; 368262306a36Sopenharmony_ci goto out_unlock; 368362306a36Sopenharmony_ci } 368462306a36Sopenharmony_ci 368562306a36Sopenharmony_ci for (i = 0; i < 4; ++i) { 368662306a36Sopenharmony_ci WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); 368762306a36Sopenharmony_ci 368862306a36Sopenharmony_ci root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, 368962306a36Sopenharmony_ci PT32_ROOT_LEVEL); 369062306a36Sopenharmony_ci mmu->pae_root[i] = root | PT_PRESENT_MASK | 369162306a36Sopenharmony_ci shadow_me_value; 369262306a36Sopenharmony_ci } 369362306a36Sopenharmony_ci mmu->root.hpa = __pa(mmu->pae_root); 
369462306a36Sopenharmony_ci } else { 369562306a36Sopenharmony_ci WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); 369662306a36Sopenharmony_ci r = -EIO; 369762306a36Sopenharmony_ci goto out_unlock; 369862306a36Sopenharmony_ci } 369962306a36Sopenharmony_ci 370062306a36Sopenharmony_ci /* root.pgd is ignored for direct MMUs. */ 370162306a36Sopenharmony_ci mmu->root.pgd = 0; 370262306a36Sopenharmony_ciout_unlock: 370362306a36Sopenharmony_ci write_unlock(&vcpu->kvm->mmu_lock); 370462306a36Sopenharmony_ci return r; 370562306a36Sopenharmony_ci} 370662306a36Sopenharmony_ci 370762306a36Sopenharmony_cistatic int mmu_first_shadow_root_alloc(struct kvm *kvm) 370862306a36Sopenharmony_ci{ 370962306a36Sopenharmony_ci struct kvm_memslots *slots; 371062306a36Sopenharmony_ci struct kvm_memory_slot *slot; 371162306a36Sopenharmony_ci int r = 0, i, bkt; 371262306a36Sopenharmony_ci 371362306a36Sopenharmony_ci /* 371462306a36Sopenharmony_ci * Check if this is the first shadow root being allocated before 371562306a36Sopenharmony_ci * taking the lock. 371662306a36Sopenharmony_ci */ 371762306a36Sopenharmony_ci if (kvm_shadow_root_allocated(kvm)) 371862306a36Sopenharmony_ci return 0; 371962306a36Sopenharmony_ci 372062306a36Sopenharmony_ci mutex_lock(&kvm->slots_arch_lock); 372162306a36Sopenharmony_ci 372262306a36Sopenharmony_ci /* Recheck, under the lock, whether this is the first shadow root. */ 372362306a36Sopenharmony_ci if (kvm_shadow_root_allocated(kvm)) 372462306a36Sopenharmony_ci goto out_unlock; 372562306a36Sopenharmony_ci 372662306a36Sopenharmony_ci /* 372762306a36Sopenharmony_ci * Check if anything actually needs to be allocated, e.g. all metadata 372862306a36Sopenharmony_ci * will be allocated upfront if TDP is disabled. 372962306a36Sopenharmony_ci */ 373062306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm) && 373162306a36Sopenharmony_ci kvm_page_track_write_tracking_enabled(kvm)) 373262306a36Sopenharmony_ci goto out_success; 373362306a36Sopenharmony_ci 373462306a36Sopenharmony_ci for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 373562306a36Sopenharmony_ci slots = __kvm_memslots(kvm, i); 373662306a36Sopenharmony_ci kvm_for_each_memslot(slot, bkt, slots) { 373762306a36Sopenharmony_ci /* 373862306a36Sopenharmony_ci * Both of these functions are no-ops if the target is 373962306a36Sopenharmony_ci * already allocated, so unconditionally calling both 374062306a36Sopenharmony_ci * is safe. Intentionally do NOT free allocations on 374162306a36Sopenharmony_ci * failure to avoid having to track which allocations 374262306a36Sopenharmony_ci * were made now versus when the memslot was created. 374362306a36Sopenharmony_ci * The metadata is guaranteed to be freed when the slot 374462306a36Sopenharmony_ci * is freed, and will be kept/used if userspace retries 374562306a36Sopenharmony_ci * KVM_RUN instead of killing the VM. 374662306a36Sopenharmony_ci */ 374762306a36Sopenharmony_ci r = memslot_rmap_alloc(slot, slot->npages); 374862306a36Sopenharmony_ci if (r) 374962306a36Sopenharmony_ci goto out_unlock; 375062306a36Sopenharmony_ci r = kvm_page_track_write_tracking_alloc(slot); 375162306a36Sopenharmony_ci if (r) 375262306a36Sopenharmony_ci goto out_unlock; 375362306a36Sopenharmony_ci } 375462306a36Sopenharmony_ci } 375562306a36Sopenharmony_ci 375662306a36Sopenharmony_ci /* 375762306a36Sopenharmony_ci * Ensure that shadow_root_allocated becomes true strictly after 375862306a36Sopenharmony_ci * all the related pointers are set. 
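 * (The smp_store_release() below pairs with the load-acquire in
 * kvm_shadow_root_allocated(), so a reader that sees the flag set also
 * sees the rmap and write-tracking allocations.)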
375962306a36Sopenharmony_ci */ 376062306a36Sopenharmony_ciout_success: 376162306a36Sopenharmony_ci smp_store_release(&kvm->arch.shadow_root_allocated, true); 376262306a36Sopenharmony_ci 376362306a36Sopenharmony_ciout_unlock: 376462306a36Sopenharmony_ci mutex_unlock(&kvm->slots_arch_lock); 376562306a36Sopenharmony_ci return r; 376662306a36Sopenharmony_ci} 376762306a36Sopenharmony_ci 376862306a36Sopenharmony_cistatic int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) 376962306a36Sopenharmony_ci{ 377062306a36Sopenharmony_ci struct kvm_mmu *mmu = vcpu->arch.mmu; 377162306a36Sopenharmony_ci u64 pdptrs[4], pm_mask; 377262306a36Sopenharmony_ci gfn_t root_gfn, root_pgd; 377362306a36Sopenharmony_ci int quadrant, i, r; 377462306a36Sopenharmony_ci hpa_t root; 377562306a36Sopenharmony_ci 377662306a36Sopenharmony_ci root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); 377762306a36Sopenharmony_ci root_gfn = root_pgd >> PAGE_SHIFT; 377862306a36Sopenharmony_ci 377962306a36Sopenharmony_ci if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { 378062306a36Sopenharmony_ci mmu->root.hpa = kvm_mmu_get_dummy_root(); 378162306a36Sopenharmony_ci return 0; 378262306a36Sopenharmony_ci } 378362306a36Sopenharmony_ci 378462306a36Sopenharmony_ci /* 378562306a36Sopenharmony_ci * On SVM, reading PDPTRs might access guest memory, which might fault 378662306a36Sopenharmony_ci * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. 378762306a36Sopenharmony_ci */ 378862306a36Sopenharmony_ci if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { 378962306a36Sopenharmony_ci for (i = 0; i < 4; ++i) { 379062306a36Sopenharmony_ci pdptrs[i] = mmu->get_pdptr(vcpu, i); 379162306a36Sopenharmony_ci if (!(pdptrs[i] & PT_PRESENT_MASK)) 379262306a36Sopenharmony_ci continue; 379362306a36Sopenharmony_ci 379462306a36Sopenharmony_ci if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT)) 379562306a36Sopenharmony_ci pdptrs[i] = 0; 379662306a36Sopenharmony_ci } 379762306a36Sopenharmony_ci } 379862306a36Sopenharmony_ci 379962306a36Sopenharmony_ci r = mmu_first_shadow_root_alloc(vcpu->kvm); 380062306a36Sopenharmony_ci if (r) 380162306a36Sopenharmony_ci return r; 380262306a36Sopenharmony_ci 380362306a36Sopenharmony_ci write_lock(&vcpu->kvm->mmu_lock); 380462306a36Sopenharmony_ci r = make_mmu_pages_available(vcpu); 380562306a36Sopenharmony_ci if (r < 0) 380662306a36Sopenharmony_ci goto out_unlock; 380762306a36Sopenharmony_ci 380862306a36Sopenharmony_ci /* 380962306a36Sopenharmony_ci * Do we shadow a long mode page table? If so we need to 381062306a36Sopenharmony_ci * write-protect the guests page table root. 381162306a36Sopenharmony_ci */ 381262306a36Sopenharmony_ci if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { 381362306a36Sopenharmony_ci root = mmu_alloc_root(vcpu, root_gfn, 0, 381462306a36Sopenharmony_ci mmu->root_role.level); 381562306a36Sopenharmony_ci mmu->root.hpa = root; 381662306a36Sopenharmony_ci goto set_root_pgd; 381762306a36Sopenharmony_ci } 381862306a36Sopenharmony_ci 381962306a36Sopenharmony_ci if (WARN_ON_ONCE(!mmu->pae_root)) { 382062306a36Sopenharmony_ci r = -EIO; 382162306a36Sopenharmony_ci goto out_unlock; 382262306a36Sopenharmony_ci } 382362306a36Sopenharmony_ci 382462306a36Sopenharmony_ci /* 382562306a36Sopenharmony_ci * We shadow a 32 bit page table. This may be a legacy 2-level 382662306a36Sopenharmony_ci * or a PAE 3-level page table. In either case we need to be aware that 382762306a36Sopenharmony_ci * the shadow page table may be a PAE or a long mode page table. 
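 * When the shadow side is long mode, e.g. 64-bit NPT shadowing a 32-bit
 * guest, the four PAE roots are stitched under a single pml4_root (and
 * pml5_root for 5-level NPT) entry below so hardware still sees a root
 * of the expected depth.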
382862306a36Sopenharmony_ci */ 382962306a36Sopenharmony_ci pm_mask = PT_PRESENT_MASK | shadow_me_value; 383062306a36Sopenharmony_ci if (mmu->root_role.level >= PT64_ROOT_4LEVEL) { 383162306a36Sopenharmony_ci pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 383262306a36Sopenharmony_ci 383362306a36Sopenharmony_ci if (WARN_ON_ONCE(!mmu->pml4_root)) { 383462306a36Sopenharmony_ci r = -EIO; 383562306a36Sopenharmony_ci goto out_unlock; 383662306a36Sopenharmony_ci } 383762306a36Sopenharmony_ci mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; 383862306a36Sopenharmony_ci 383962306a36Sopenharmony_ci if (mmu->root_role.level == PT64_ROOT_5LEVEL) { 384062306a36Sopenharmony_ci if (WARN_ON_ONCE(!mmu->pml5_root)) { 384162306a36Sopenharmony_ci r = -EIO; 384262306a36Sopenharmony_ci goto out_unlock; 384362306a36Sopenharmony_ci } 384462306a36Sopenharmony_ci mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; 384562306a36Sopenharmony_ci } 384662306a36Sopenharmony_ci } 384762306a36Sopenharmony_ci 384862306a36Sopenharmony_ci for (i = 0; i < 4; ++i) { 384962306a36Sopenharmony_ci WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); 385062306a36Sopenharmony_ci 385162306a36Sopenharmony_ci if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { 385262306a36Sopenharmony_ci if (!(pdptrs[i] & PT_PRESENT_MASK)) { 385362306a36Sopenharmony_ci mmu->pae_root[i] = INVALID_PAE_ROOT; 385462306a36Sopenharmony_ci continue; 385562306a36Sopenharmony_ci } 385662306a36Sopenharmony_ci root_gfn = pdptrs[i] >> PAGE_SHIFT; 385762306a36Sopenharmony_ci } 385862306a36Sopenharmony_ci 385962306a36Sopenharmony_ci /* 386062306a36Sopenharmony_ci * If shadowing 32-bit non-PAE page tables, each PAE page 386162306a36Sopenharmony_ci * directory maps one quarter of the guest's non-PAE page 386262306a36Sopenharmony_ci * directory. Othwerise each PAE page direct shadows one guest 386362306a36Sopenharmony_ci * PAE page directory so that quadrant should be 0. 386462306a36Sopenharmony_ci */ 386562306a36Sopenharmony_ci quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? 
i : 0; 386662306a36Sopenharmony_ci 386762306a36Sopenharmony_ci root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); 386862306a36Sopenharmony_ci mmu->pae_root[i] = root | pm_mask; 386962306a36Sopenharmony_ci } 387062306a36Sopenharmony_ci 387162306a36Sopenharmony_ci if (mmu->root_role.level == PT64_ROOT_5LEVEL) 387262306a36Sopenharmony_ci mmu->root.hpa = __pa(mmu->pml5_root); 387362306a36Sopenharmony_ci else if (mmu->root_role.level == PT64_ROOT_4LEVEL) 387462306a36Sopenharmony_ci mmu->root.hpa = __pa(mmu->pml4_root); 387562306a36Sopenharmony_ci else 387662306a36Sopenharmony_ci mmu->root.hpa = __pa(mmu->pae_root); 387762306a36Sopenharmony_ci 387862306a36Sopenharmony_ciset_root_pgd: 387962306a36Sopenharmony_ci mmu->root.pgd = root_pgd; 388062306a36Sopenharmony_ciout_unlock: 388162306a36Sopenharmony_ci write_unlock(&vcpu->kvm->mmu_lock); 388262306a36Sopenharmony_ci 388362306a36Sopenharmony_ci return r; 388462306a36Sopenharmony_ci} 388562306a36Sopenharmony_ci 388662306a36Sopenharmony_cistatic int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) 388762306a36Sopenharmony_ci{ 388862306a36Sopenharmony_ci struct kvm_mmu *mmu = vcpu->arch.mmu; 388962306a36Sopenharmony_ci bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL; 389062306a36Sopenharmony_ci u64 *pml5_root = NULL; 389162306a36Sopenharmony_ci u64 *pml4_root = NULL; 389262306a36Sopenharmony_ci u64 *pae_root; 389362306a36Sopenharmony_ci 389462306a36Sopenharmony_ci /* 389562306a36Sopenharmony_ci * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP 389662306a36Sopenharmony_ci * tables are allocated and initialized at root creation as there is no 389762306a36Sopenharmony_ci * equivalent level in the guest's NPT to shadow. Allocate the tables 389862306a36Sopenharmony_ci * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. 389962306a36Sopenharmony_ci */ 390062306a36Sopenharmony_ci if (mmu->root_role.direct || 390162306a36Sopenharmony_ci mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL || 390262306a36Sopenharmony_ci mmu->root_role.level < PT64_ROOT_4LEVEL) 390362306a36Sopenharmony_ci return 0; 390462306a36Sopenharmony_ci 390562306a36Sopenharmony_ci /* 390662306a36Sopenharmony_ci * NPT, the only paging mode that uses this horror, uses a fixed number 390762306a36Sopenharmony_ci * of levels for the shadow page tables, e.g. all MMUs are 4-level or 390862306a36Sopenharmony_ci * all MMus are 5-level. Thus, this can safely require that pml5_root 390962306a36Sopenharmony_ci * is allocated if the other roots are valid and pml5 is needed, as any 391062306a36Sopenharmony_ci * prior MMU would also have required pml5. 391162306a36Sopenharmony_ci */ 391262306a36Sopenharmony_ci if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) 391362306a36Sopenharmony_ci return 0; 391462306a36Sopenharmony_ci 391562306a36Sopenharmony_ci /* 391662306a36Sopenharmony_ci * The special roots should always be allocated in concert. Yell and 391762306a36Sopenharmony_ci * bail if KVM ends up in a state where only one of the roots is valid. 391862306a36Sopenharmony_ci */ 391962306a36Sopenharmony_ci if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || 392062306a36Sopenharmony_ci (need_pml5 && mmu->pml5_root))) 392162306a36Sopenharmony_ci return -EIO; 392262306a36Sopenharmony_ci 392362306a36Sopenharmony_ci /* 392462306a36Sopenharmony_ci * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and 392562306a36Sopenharmony_ci * doesn't need to be decrypted. 
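 * A plain get_zeroed_page(GFP_KERNEL_ACCOUNT) is therefore sufficient
 * here; no GFP_DMA32 allocation or set_memory_decrypted() call is needed.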
392662306a36Sopenharmony_ci */ 392762306a36Sopenharmony_ci pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 392862306a36Sopenharmony_ci if (!pae_root) 392962306a36Sopenharmony_ci return -ENOMEM; 393062306a36Sopenharmony_ci 393162306a36Sopenharmony_ci#ifdef CONFIG_X86_64 393262306a36Sopenharmony_ci pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 393362306a36Sopenharmony_ci if (!pml4_root) 393462306a36Sopenharmony_ci goto err_pml4; 393562306a36Sopenharmony_ci 393662306a36Sopenharmony_ci if (need_pml5) { 393762306a36Sopenharmony_ci pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 393862306a36Sopenharmony_ci if (!pml5_root) 393962306a36Sopenharmony_ci goto err_pml5; 394062306a36Sopenharmony_ci } 394162306a36Sopenharmony_ci#endif 394262306a36Sopenharmony_ci 394362306a36Sopenharmony_ci mmu->pae_root = pae_root; 394462306a36Sopenharmony_ci mmu->pml4_root = pml4_root; 394562306a36Sopenharmony_ci mmu->pml5_root = pml5_root; 394662306a36Sopenharmony_ci 394762306a36Sopenharmony_ci return 0; 394862306a36Sopenharmony_ci 394962306a36Sopenharmony_ci#ifdef CONFIG_X86_64 395062306a36Sopenharmony_cierr_pml5: 395162306a36Sopenharmony_ci free_page((unsigned long)pml4_root); 395262306a36Sopenharmony_cierr_pml4: 395362306a36Sopenharmony_ci free_page((unsigned long)pae_root); 395462306a36Sopenharmony_ci return -ENOMEM; 395562306a36Sopenharmony_ci#endif 395662306a36Sopenharmony_ci} 395762306a36Sopenharmony_ci 395862306a36Sopenharmony_cistatic bool is_unsync_root(hpa_t root) 395962306a36Sopenharmony_ci{ 396062306a36Sopenharmony_ci struct kvm_mmu_page *sp; 396162306a36Sopenharmony_ci 396262306a36Sopenharmony_ci if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) 396362306a36Sopenharmony_ci return false; 396462306a36Sopenharmony_ci 396562306a36Sopenharmony_ci /* 396662306a36Sopenharmony_ci * The read barrier orders the CPU's read of SPTE.W during the page table 396762306a36Sopenharmony_ci * walk before the reads of sp->unsync/sp->unsync_children here. 396862306a36Sopenharmony_ci * 396962306a36Sopenharmony_ci * Even if another CPU was marking the SP as unsync-ed simultaneously, 397062306a36Sopenharmony_ci * any guest page table changes are not guaranteed to be visible anyway 397162306a36Sopenharmony_ci * until this VCPU issues a TLB flush strictly after those changes are 397262306a36Sopenharmony_ci * made. We only need to ensure that the other CPU sets these flags 397362306a36Sopenharmony_ci * before any actual changes to the page tables are made. The comments 397462306a36Sopenharmony_ci * in mmu_try_to_unsync_pages() describe what could go wrong if this 397562306a36Sopenharmony_ci * requirement isn't satisfied. 397662306a36Sopenharmony_ci */ 397762306a36Sopenharmony_ci smp_rmb(); 397862306a36Sopenharmony_ci sp = root_to_sp(root); 397962306a36Sopenharmony_ci 398062306a36Sopenharmony_ci /* 398162306a36Sopenharmony_ci * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the 398262306a36Sopenharmony_ci * PDPTEs for a given PAE root need to be synchronized individually. 
398362306a36Sopenharmony_ci */ 398462306a36Sopenharmony_ci if (WARN_ON_ONCE(!sp)) 398562306a36Sopenharmony_ci return false; 398662306a36Sopenharmony_ci 398762306a36Sopenharmony_ci if (sp->unsync || sp->unsync_children) 398862306a36Sopenharmony_ci return true; 398962306a36Sopenharmony_ci 399062306a36Sopenharmony_ci return false; 399162306a36Sopenharmony_ci} 399262306a36Sopenharmony_ci 399362306a36Sopenharmony_civoid kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 399462306a36Sopenharmony_ci{ 399562306a36Sopenharmony_ci int i; 399662306a36Sopenharmony_ci struct kvm_mmu_page *sp; 399762306a36Sopenharmony_ci 399862306a36Sopenharmony_ci if (vcpu->arch.mmu->root_role.direct) 399962306a36Sopenharmony_ci return; 400062306a36Sopenharmony_ci 400162306a36Sopenharmony_ci if (!VALID_PAGE(vcpu->arch.mmu->root.hpa)) 400262306a36Sopenharmony_ci return; 400362306a36Sopenharmony_ci 400462306a36Sopenharmony_ci vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 400562306a36Sopenharmony_ci 400662306a36Sopenharmony_ci if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { 400762306a36Sopenharmony_ci hpa_t root = vcpu->arch.mmu->root.hpa; 400862306a36Sopenharmony_ci 400962306a36Sopenharmony_ci if (!is_unsync_root(root)) 401062306a36Sopenharmony_ci return; 401162306a36Sopenharmony_ci 401262306a36Sopenharmony_ci sp = root_to_sp(root); 401362306a36Sopenharmony_ci 401462306a36Sopenharmony_ci write_lock(&vcpu->kvm->mmu_lock); 401562306a36Sopenharmony_ci mmu_sync_children(vcpu, sp, true); 401662306a36Sopenharmony_ci write_unlock(&vcpu->kvm->mmu_lock); 401762306a36Sopenharmony_ci return; 401862306a36Sopenharmony_ci } 401962306a36Sopenharmony_ci 402062306a36Sopenharmony_ci write_lock(&vcpu->kvm->mmu_lock); 402162306a36Sopenharmony_ci 402262306a36Sopenharmony_ci for (i = 0; i < 4; ++i) { 402362306a36Sopenharmony_ci hpa_t root = vcpu->arch.mmu->pae_root[i]; 402462306a36Sopenharmony_ci 402562306a36Sopenharmony_ci if (IS_VALID_PAE_ROOT(root)) { 402662306a36Sopenharmony_ci sp = spte_to_child_sp(root); 402762306a36Sopenharmony_ci mmu_sync_children(vcpu, sp, true); 402862306a36Sopenharmony_ci } 402962306a36Sopenharmony_ci } 403062306a36Sopenharmony_ci 403162306a36Sopenharmony_ci write_unlock(&vcpu->kvm->mmu_lock); 403262306a36Sopenharmony_ci} 403362306a36Sopenharmony_ci 403462306a36Sopenharmony_civoid kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) 403562306a36Sopenharmony_ci{ 403662306a36Sopenharmony_ci unsigned long roots_to_free = 0; 403762306a36Sopenharmony_ci int i; 403862306a36Sopenharmony_ci 403962306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 404062306a36Sopenharmony_ci if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) 404162306a36Sopenharmony_ci roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 404262306a36Sopenharmony_ci 404362306a36Sopenharmony_ci /* sync prev_roots by simply freeing them */ 404462306a36Sopenharmony_ci kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free); 404562306a36Sopenharmony_ci} 404662306a36Sopenharmony_ci 404762306a36Sopenharmony_cistatic gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 404862306a36Sopenharmony_ci gpa_t vaddr, u64 access, 404962306a36Sopenharmony_ci struct x86_exception *exception) 405062306a36Sopenharmony_ci{ 405162306a36Sopenharmony_ci if (exception) 405262306a36Sopenharmony_ci exception->error_code = 0; 405362306a36Sopenharmony_ci return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); 405462306a36Sopenharmony_ci} 405562306a36Sopenharmony_ci 405662306a36Sopenharmony_cistatic bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 
addr, bool direct) 405762306a36Sopenharmony_ci{ 405862306a36Sopenharmony_ci /* 405962306a36Sopenharmony_ci * A nested guest cannot use the MMIO cache if it is using nested 406062306a36Sopenharmony_ci * page tables, because cr2 is a nGPA while the cache stores GPAs. 406162306a36Sopenharmony_ci */ 406262306a36Sopenharmony_ci if (mmu_is_nested(vcpu)) 406362306a36Sopenharmony_ci return false; 406462306a36Sopenharmony_ci 406562306a36Sopenharmony_ci if (direct) 406662306a36Sopenharmony_ci return vcpu_match_mmio_gpa(vcpu, addr); 406762306a36Sopenharmony_ci 406862306a36Sopenharmony_ci return vcpu_match_mmio_gva(vcpu, addr); 406962306a36Sopenharmony_ci} 407062306a36Sopenharmony_ci 407162306a36Sopenharmony_ci/* 407262306a36Sopenharmony_ci * Return the level of the lowest level SPTE added to sptes. 407362306a36Sopenharmony_ci * That SPTE may be non-present. 407462306a36Sopenharmony_ci * 407562306a36Sopenharmony_ci * Must be called between walk_shadow_page_lockless_{begin,end}. 407662306a36Sopenharmony_ci */ 407762306a36Sopenharmony_cistatic int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) 407862306a36Sopenharmony_ci{ 407962306a36Sopenharmony_ci struct kvm_shadow_walk_iterator iterator; 408062306a36Sopenharmony_ci int leaf = -1; 408162306a36Sopenharmony_ci u64 spte; 408262306a36Sopenharmony_ci 408362306a36Sopenharmony_ci for (shadow_walk_init(&iterator, vcpu, addr), 408462306a36Sopenharmony_ci *root_level = iterator.level; 408562306a36Sopenharmony_ci shadow_walk_okay(&iterator); 408662306a36Sopenharmony_ci __shadow_walk_next(&iterator, spte)) { 408762306a36Sopenharmony_ci leaf = iterator.level; 408862306a36Sopenharmony_ci spte = mmu_spte_get_lockless(iterator.sptep); 408962306a36Sopenharmony_ci 409062306a36Sopenharmony_ci sptes[leaf] = spte; 409162306a36Sopenharmony_ci } 409262306a36Sopenharmony_ci 409362306a36Sopenharmony_ci return leaf; 409462306a36Sopenharmony_ci} 409562306a36Sopenharmony_ci 409662306a36Sopenharmony_ci/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ 409762306a36Sopenharmony_cistatic bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) 409862306a36Sopenharmony_ci{ 409962306a36Sopenharmony_ci u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; 410062306a36Sopenharmony_ci struct rsvd_bits_validate *rsvd_check; 410162306a36Sopenharmony_ci int root, leaf, level; 410262306a36Sopenharmony_ci bool reserved = false; 410362306a36Sopenharmony_ci 410462306a36Sopenharmony_ci walk_shadow_page_lockless_begin(vcpu); 410562306a36Sopenharmony_ci 410662306a36Sopenharmony_ci if (is_tdp_mmu_active(vcpu)) 410762306a36Sopenharmony_ci leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); 410862306a36Sopenharmony_ci else 410962306a36Sopenharmony_ci leaf = get_walk(vcpu, addr, sptes, &root); 411062306a36Sopenharmony_ci 411162306a36Sopenharmony_ci walk_shadow_page_lockless_end(vcpu); 411262306a36Sopenharmony_ci 411362306a36Sopenharmony_ci if (unlikely(leaf < 0)) { 411462306a36Sopenharmony_ci *sptep = 0ull; 411562306a36Sopenharmony_ci return reserved; 411662306a36Sopenharmony_ci } 411762306a36Sopenharmony_ci 411862306a36Sopenharmony_ci *sptep = sptes[leaf]; 411962306a36Sopenharmony_ci 412062306a36Sopenharmony_ci /* 412162306a36Sopenharmony_ci * Skip reserved bits checks on the terminal leaf if it's not a valid 412262306a36Sopenharmony_ci * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by 412362306a36Sopenharmony_ci * design, always have reserved bits set. 
The purpose of the checks is 412462306a36Sopenharmony_ci * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. 412562306a36Sopenharmony_ci */ 412662306a36Sopenharmony_ci if (!is_shadow_present_pte(sptes[leaf])) 412762306a36Sopenharmony_ci leaf++; 412862306a36Sopenharmony_ci 412962306a36Sopenharmony_ci rsvd_check = &vcpu->arch.mmu->shadow_zero_check; 413062306a36Sopenharmony_ci 413162306a36Sopenharmony_ci for (level = root; level >= leaf; level--) 413262306a36Sopenharmony_ci reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); 413362306a36Sopenharmony_ci 413462306a36Sopenharmony_ci if (reserved) { 413562306a36Sopenharmony_ci pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", 413662306a36Sopenharmony_ci __func__, addr); 413762306a36Sopenharmony_ci for (level = root; level >= leaf; level--) 413862306a36Sopenharmony_ci pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", 413962306a36Sopenharmony_ci sptes[level], level, 414062306a36Sopenharmony_ci get_rsvd_bits(rsvd_check, sptes[level], level)); 414162306a36Sopenharmony_ci } 414262306a36Sopenharmony_ci 414362306a36Sopenharmony_ci return reserved; 414462306a36Sopenharmony_ci} 414562306a36Sopenharmony_ci 414662306a36Sopenharmony_cistatic int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 414762306a36Sopenharmony_ci{ 414862306a36Sopenharmony_ci u64 spte; 414962306a36Sopenharmony_ci bool reserved; 415062306a36Sopenharmony_ci 415162306a36Sopenharmony_ci if (mmio_info_in_cache(vcpu, addr, direct)) 415262306a36Sopenharmony_ci return RET_PF_EMULATE; 415362306a36Sopenharmony_ci 415462306a36Sopenharmony_ci reserved = get_mmio_spte(vcpu, addr, &spte); 415562306a36Sopenharmony_ci if (WARN_ON_ONCE(reserved)) 415662306a36Sopenharmony_ci return -EINVAL; 415762306a36Sopenharmony_ci 415862306a36Sopenharmony_ci if (is_mmio_spte(spte)) { 415962306a36Sopenharmony_ci gfn_t gfn = get_mmio_spte_gfn(spte); 416062306a36Sopenharmony_ci unsigned int access = get_mmio_spte_access(spte); 416162306a36Sopenharmony_ci 416262306a36Sopenharmony_ci if (!check_mmio_spte(vcpu, spte)) 416362306a36Sopenharmony_ci return RET_PF_INVALID; 416462306a36Sopenharmony_ci 416562306a36Sopenharmony_ci if (direct) 416662306a36Sopenharmony_ci addr = 0; 416762306a36Sopenharmony_ci 416862306a36Sopenharmony_ci trace_handle_mmio_page_fault(addr, gfn, access); 416962306a36Sopenharmony_ci vcpu_cache_mmio_info(vcpu, addr, gfn, access); 417062306a36Sopenharmony_ci return RET_PF_EMULATE; 417162306a36Sopenharmony_ci } 417262306a36Sopenharmony_ci 417362306a36Sopenharmony_ci /* 417462306a36Sopenharmony_ci * If the page table is zapped by other cpus, let CPU fault again on 417562306a36Sopenharmony_ci * the address. 417662306a36Sopenharmony_ci */ 417762306a36Sopenharmony_ci return RET_PF_RETRY; 417862306a36Sopenharmony_ci} 417962306a36Sopenharmony_ci 418062306a36Sopenharmony_cistatic bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, 418162306a36Sopenharmony_ci struct kvm_page_fault *fault) 418262306a36Sopenharmony_ci{ 418362306a36Sopenharmony_ci if (unlikely(fault->rsvd)) 418462306a36Sopenharmony_ci return false; 418562306a36Sopenharmony_ci 418662306a36Sopenharmony_ci if (!fault->present || !fault->write) 418762306a36Sopenharmony_ci return false; 418862306a36Sopenharmony_ci 418962306a36Sopenharmony_ci /* 419062306a36Sopenharmony_ci * guest is writing the page which is write tracked which can 419162306a36Sopenharmony_ci * not be fixed by page fault handler. 
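 * Write-tracked gfns, e.g. shadowed guest page tables, must instead be
 * handled by emulation so the tracker observes the write; the caller
 * returns RET_PF_EMULATE when this function returns true.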
419262306a36Sopenharmony_ci */ 419362306a36Sopenharmony_ci if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) 419462306a36Sopenharmony_ci return true; 419562306a36Sopenharmony_ci 419662306a36Sopenharmony_ci return false; 419762306a36Sopenharmony_ci} 419862306a36Sopenharmony_ci 419962306a36Sopenharmony_cistatic void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) 420062306a36Sopenharmony_ci{ 420162306a36Sopenharmony_ci struct kvm_shadow_walk_iterator iterator; 420262306a36Sopenharmony_ci u64 spte; 420362306a36Sopenharmony_ci 420462306a36Sopenharmony_ci walk_shadow_page_lockless_begin(vcpu); 420562306a36Sopenharmony_ci for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) 420662306a36Sopenharmony_ci clear_sp_write_flooding_count(iterator.sptep); 420762306a36Sopenharmony_ci walk_shadow_page_lockless_end(vcpu); 420862306a36Sopenharmony_ci} 420962306a36Sopenharmony_ci 421062306a36Sopenharmony_cistatic u32 alloc_apf_token(struct kvm_vcpu *vcpu) 421162306a36Sopenharmony_ci{ 421262306a36Sopenharmony_ci /* make sure the token value is not 0 */ 421362306a36Sopenharmony_ci u32 id = vcpu->arch.apf.id; 421462306a36Sopenharmony_ci 421562306a36Sopenharmony_ci if (id << 12 == 0) 421662306a36Sopenharmony_ci vcpu->arch.apf.id = 1; 421762306a36Sopenharmony_ci 421862306a36Sopenharmony_ci return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; 421962306a36Sopenharmony_ci} 422062306a36Sopenharmony_ci 422162306a36Sopenharmony_cistatic bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 422262306a36Sopenharmony_ci gfn_t gfn) 422362306a36Sopenharmony_ci{ 422462306a36Sopenharmony_ci struct kvm_arch_async_pf arch; 422562306a36Sopenharmony_ci 422662306a36Sopenharmony_ci arch.token = alloc_apf_token(vcpu); 422762306a36Sopenharmony_ci arch.gfn = gfn; 422862306a36Sopenharmony_ci arch.direct_map = vcpu->arch.mmu->root_role.direct; 422962306a36Sopenharmony_ci arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); 423062306a36Sopenharmony_ci 423162306a36Sopenharmony_ci return kvm_setup_async_pf(vcpu, cr2_or_gpa, 423262306a36Sopenharmony_ci kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); 423362306a36Sopenharmony_ci} 423462306a36Sopenharmony_ci 423562306a36Sopenharmony_civoid kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) 423662306a36Sopenharmony_ci{ 423762306a36Sopenharmony_ci int r; 423862306a36Sopenharmony_ci 423962306a36Sopenharmony_ci if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || 424062306a36Sopenharmony_ci work->wakeup_all) 424162306a36Sopenharmony_ci return; 424262306a36Sopenharmony_ci 424362306a36Sopenharmony_ci r = kvm_mmu_reload(vcpu); 424462306a36Sopenharmony_ci if (unlikely(r)) 424562306a36Sopenharmony_ci return; 424662306a36Sopenharmony_ci 424762306a36Sopenharmony_ci if (!vcpu->arch.mmu->root_role.direct && 424862306a36Sopenharmony_ci work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) 424962306a36Sopenharmony_ci return; 425062306a36Sopenharmony_ci 425162306a36Sopenharmony_ci kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); 425262306a36Sopenharmony_ci} 425362306a36Sopenharmony_ci 425462306a36Sopenharmony_cistatic int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 425562306a36Sopenharmony_ci{ 425662306a36Sopenharmony_ci struct kvm_memory_slot *slot = fault->slot; 425762306a36Sopenharmony_ci bool async; 425862306a36Sopenharmony_ci 425962306a36Sopenharmony_ci /* 426062306a36Sopenharmony_ci * Retry the page fault if the gfn hit a memslot that is being deleted 
426162306a36Sopenharmony_ci * or moved. This ensures any existing SPTEs for the old memslot will 426262306a36Sopenharmony_ci * be zapped before KVM inserts a new MMIO SPTE for the gfn. 426362306a36Sopenharmony_ci */ 426462306a36Sopenharmony_ci if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) 426562306a36Sopenharmony_ci return RET_PF_RETRY; 426662306a36Sopenharmony_ci 426762306a36Sopenharmony_ci if (!kvm_is_visible_memslot(slot)) { 426862306a36Sopenharmony_ci /* Don't expose private memslots to L2. */ 426962306a36Sopenharmony_ci if (is_guest_mode(vcpu)) { 427062306a36Sopenharmony_ci fault->slot = NULL; 427162306a36Sopenharmony_ci fault->pfn = KVM_PFN_NOSLOT; 427262306a36Sopenharmony_ci fault->map_writable = false; 427362306a36Sopenharmony_ci return RET_PF_CONTINUE; 427462306a36Sopenharmony_ci } 427562306a36Sopenharmony_ci /* 427662306a36Sopenharmony_ci * If the APIC access page exists but is disabled, go directly 427762306a36Sopenharmony_ci * to emulation without caching the MMIO access or creating a 427862306a36Sopenharmony_ci * MMIO SPTE. That way the cache doesn't need to be purged 427962306a36Sopenharmony_ci * when the AVIC is re-enabled. 428062306a36Sopenharmony_ci */ 428162306a36Sopenharmony_ci if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && 428262306a36Sopenharmony_ci !kvm_apicv_activated(vcpu->kvm)) 428362306a36Sopenharmony_ci return RET_PF_EMULATE; 428462306a36Sopenharmony_ci } 428562306a36Sopenharmony_ci 428662306a36Sopenharmony_ci async = false; 428762306a36Sopenharmony_ci fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, 428862306a36Sopenharmony_ci fault->write, &fault->map_writable, 428962306a36Sopenharmony_ci &fault->hva); 429062306a36Sopenharmony_ci if (!async) 429162306a36Sopenharmony_ci return RET_PF_CONTINUE; /* *pfn has correct page already */ 429262306a36Sopenharmony_ci 429362306a36Sopenharmony_ci if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { 429462306a36Sopenharmony_ci trace_kvm_try_async_get_page(fault->addr, fault->gfn); 429562306a36Sopenharmony_ci if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { 429662306a36Sopenharmony_ci trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); 429762306a36Sopenharmony_ci kvm_make_request(KVM_REQ_APF_HALT, vcpu); 429862306a36Sopenharmony_ci return RET_PF_RETRY; 429962306a36Sopenharmony_ci } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { 430062306a36Sopenharmony_ci return RET_PF_RETRY; 430162306a36Sopenharmony_ci } 430262306a36Sopenharmony_ci } 430362306a36Sopenharmony_ci 430462306a36Sopenharmony_ci /* 430562306a36Sopenharmony_ci * Allow gup to bail on pending non-fatal signals when it's also allowed 430662306a36Sopenharmony_ci * to wait for IO. Note, gup always bails if it is unable to quickly 430762306a36Sopenharmony_ci * get a page and a fatal signal, i.e. SIGKILL, is pending. 
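 * (That is why the second __gfn_to_pfn_memslot() call below passes
 * interruptible=true and a NULL async pointer: wait for the page, but
 * let a pending non-fatal signal abort the wait.)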
430862306a36Sopenharmony_ci */ 430962306a36Sopenharmony_ci fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, 431062306a36Sopenharmony_ci fault->write, &fault->map_writable, 431162306a36Sopenharmony_ci &fault->hva); 431262306a36Sopenharmony_ci return RET_PF_CONTINUE; 431362306a36Sopenharmony_ci} 431462306a36Sopenharmony_ci 431562306a36Sopenharmony_cistatic int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, 431662306a36Sopenharmony_ci unsigned int access) 431762306a36Sopenharmony_ci{ 431862306a36Sopenharmony_ci int ret; 431962306a36Sopenharmony_ci 432062306a36Sopenharmony_ci fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; 432162306a36Sopenharmony_ci smp_rmb(); 432262306a36Sopenharmony_ci 432362306a36Sopenharmony_ci ret = __kvm_faultin_pfn(vcpu, fault); 432462306a36Sopenharmony_ci if (ret != RET_PF_CONTINUE) 432562306a36Sopenharmony_ci return ret; 432662306a36Sopenharmony_ci 432762306a36Sopenharmony_ci if (unlikely(is_error_pfn(fault->pfn))) 432862306a36Sopenharmony_ci return kvm_handle_error_pfn(vcpu, fault); 432962306a36Sopenharmony_ci 433062306a36Sopenharmony_ci if (unlikely(!fault->slot)) 433162306a36Sopenharmony_ci return kvm_handle_noslot_fault(vcpu, fault, access); 433262306a36Sopenharmony_ci 433362306a36Sopenharmony_ci return RET_PF_CONTINUE; 433462306a36Sopenharmony_ci} 433562306a36Sopenharmony_ci 433662306a36Sopenharmony_ci/* 433762306a36Sopenharmony_ci * Returns true if the page fault is stale and needs to be retried, i.e. if the 433862306a36Sopenharmony_ci * root was invalidated by a memslot update or a relevant mmu_notifier fired. 433962306a36Sopenharmony_ci */ 434062306a36Sopenharmony_cistatic bool is_page_fault_stale(struct kvm_vcpu *vcpu, 434162306a36Sopenharmony_ci struct kvm_page_fault *fault) 434262306a36Sopenharmony_ci{ 434362306a36Sopenharmony_ci struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); 434462306a36Sopenharmony_ci 434562306a36Sopenharmony_ci /* Special roots, e.g. pae_root, are not backed by shadow pages. */ 434662306a36Sopenharmony_ci if (sp && is_obsolete_sp(vcpu->kvm, sp)) 434762306a36Sopenharmony_ci return true; 434862306a36Sopenharmony_ci 434962306a36Sopenharmony_ci /* 435062306a36Sopenharmony_ci * Roots without an associated shadow page are considered invalid if 435162306a36Sopenharmony_ci * there is a pending request to free obsolete roots. The request is 435262306a36Sopenharmony_ci * only a hint that the current root _may_ be obsolete and needs to be 435362306a36Sopenharmony_ci * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a 435462306a36Sopenharmony_ci * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs 435562306a36Sopenharmony_ci * to reload even if no vCPU is actively using the root. 435662306a36Sopenharmony_ci */ 435762306a36Sopenharmony_ci if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) 435862306a36Sopenharmony_ci return true; 435962306a36Sopenharmony_ci 436062306a36Sopenharmony_ci return fault->slot && 436162306a36Sopenharmony_ci mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva); 436262306a36Sopenharmony_ci} 436362306a36Sopenharmony_ci 436462306a36Sopenharmony_cistatic int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 436562306a36Sopenharmony_ci{ 436662306a36Sopenharmony_ci int r; 436762306a36Sopenharmony_ci 436862306a36Sopenharmony_ci /* Dummy roots are used only for shadowing bad guest roots. 
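	 * A direct page fault should therefore never run with a dummy root
	 * installed, hence the WARN_ON_ONCE below.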
*/ 436962306a36Sopenharmony_ci if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) 437062306a36Sopenharmony_ci return RET_PF_RETRY; 437162306a36Sopenharmony_ci 437262306a36Sopenharmony_ci if (page_fault_handle_page_track(vcpu, fault)) 437362306a36Sopenharmony_ci return RET_PF_EMULATE; 437462306a36Sopenharmony_ci 437562306a36Sopenharmony_ci r = fast_page_fault(vcpu, fault); 437662306a36Sopenharmony_ci if (r != RET_PF_INVALID) 437762306a36Sopenharmony_ci return r; 437862306a36Sopenharmony_ci 437962306a36Sopenharmony_ci r = mmu_topup_memory_caches(vcpu, false); 438062306a36Sopenharmony_ci if (r) 438162306a36Sopenharmony_ci return r; 438262306a36Sopenharmony_ci 438362306a36Sopenharmony_ci r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); 438462306a36Sopenharmony_ci if (r != RET_PF_CONTINUE) 438562306a36Sopenharmony_ci return r; 438662306a36Sopenharmony_ci 438762306a36Sopenharmony_ci r = RET_PF_RETRY; 438862306a36Sopenharmony_ci write_lock(&vcpu->kvm->mmu_lock); 438962306a36Sopenharmony_ci 439062306a36Sopenharmony_ci if (is_page_fault_stale(vcpu, fault)) 439162306a36Sopenharmony_ci goto out_unlock; 439262306a36Sopenharmony_ci 439362306a36Sopenharmony_ci r = make_mmu_pages_available(vcpu); 439462306a36Sopenharmony_ci if (r) 439562306a36Sopenharmony_ci goto out_unlock; 439662306a36Sopenharmony_ci 439762306a36Sopenharmony_ci r = direct_map(vcpu, fault); 439862306a36Sopenharmony_ci 439962306a36Sopenharmony_ciout_unlock: 440062306a36Sopenharmony_ci write_unlock(&vcpu->kvm->mmu_lock); 440162306a36Sopenharmony_ci kvm_release_pfn_clean(fault->pfn); 440262306a36Sopenharmony_ci return r; 440362306a36Sopenharmony_ci} 440462306a36Sopenharmony_ci 440562306a36Sopenharmony_cistatic int nonpaging_page_fault(struct kvm_vcpu *vcpu, 440662306a36Sopenharmony_ci struct kvm_page_fault *fault) 440762306a36Sopenharmony_ci{ 440862306a36Sopenharmony_ci /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ 440962306a36Sopenharmony_ci fault->max_level = PG_LEVEL_2M; 441062306a36Sopenharmony_ci return direct_page_fault(vcpu, fault); 441162306a36Sopenharmony_ci} 441262306a36Sopenharmony_ci 441362306a36Sopenharmony_ciint kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 441462306a36Sopenharmony_ci u64 fault_address, char *insn, int insn_len) 441562306a36Sopenharmony_ci{ 441662306a36Sopenharmony_ci int r = 1; 441762306a36Sopenharmony_ci u32 flags = vcpu->arch.apf.host_apf_flags; 441862306a36Sopenharmony_ci 441962306a36Sopenharmony_ci#ifndef CONFIG_X86_64 442062306a36Sopenharmony_ci /* A 64-bit CR2 should be impossible on 32-bit KVM. 
*/ 442162306a36Sopenharmony_ci if (WARN_ON_ONCE(fault_address >> 32)) 442262306a36Sopenharmony_ci return -EFAULT; 442362306a36Sopenharmony_ci#endif 442462306a36Sopenharmony_ci 442562306a36Sopenharmony_ci vcpu->arch.l1tf_flush_l1d = true; 442662306a36Sopenharmony_ci if (!flags) { 442762306a36Sopenharmony_ci trace_kvm_page_fault(vcpu, fault_address, error_code); 442862306a36Sopenharmony_ci 442962306a36Sopenharmony_ci if (kvm_event_needs_reinjection(vcpu)) 443062306a36Sopenharmony_ci kvm_mmu_unprotect_page_virt(vcpu, fault_address); 443162306a36Sopenharmony_ci r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, 443262306a36Sopenharmony_ci insn_len); 443362306a36Sopenharmony_ci } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { 443462306a36Sopenharmony_ci vcpu->arch.apf.host_apf_flags = 0; 443562306a36Sopenharmony_ci local_irq_disable(); 443662306a36Sopenharmony_ci kvm_async_pf_task_wait_schedule(fault_address); 443762306a36Sopenharmony_ci local_irq_enable(); 443862306a36Sopenharmony_ci } else { 443962306a36Sopenharmony_ci WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); 444062306a36Sopenharmony_ci } 444162306a36Sopenharmony_ci 444262306a36Sopenharmony_ci return r; 444362306a36Sopenharmony_ci} 444462306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_handle_page_fault); 444562306a36Sopenharmony_ci 444662306a36Sopenharmony_ci#ifdef CONFIG_X86_64 444762306a36Sopenharmony_cistatic int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, 444862306a36Sopenharmony_ci struct kvm_page_fault *fault) 444962306a36Sopenharmony_ci{ 445062306a36Sopenharmony_ci int r; 445162306a36Sopenharmony_ci 445262306a36Sopenharmony_ci if (page_fault_handle_page_track(vcpu, fault)) 445362306a36Sopenharmony_ci return RET_PF_EMULATE; 445462306a36Sopenharmony_ci 445562306a36Sopenharmony_ci r = fast_page_fault(vcpu, fault); 445662306a36Sopenharmony_ci if (r != RET_PF_INVALID) 445762306a36Sopenharmony_ci return r; 445862306a36Sopenharmony_ci 445962306a36Sopenharmony_ci r = mmu_topup_memory_caches(vcpu, false); 446062306a36Sopenharmony_ci if (r) 446162306a36Sopenharmony_ci return r; 446262306a36Sopenharmony_ci 446362306a36Sopenharmony_ci r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); 446462306a36Sopenharmony_ci if (r != RET_PF_CONTINUE) 446562306a36Sopenharmony_ci return r; 446662306a36Sopenharmony_ci 446762306a36Sopenharmony_ci r = RET_PF_RETRY; 446862306a36Sopenharmony_ci read_lock(&vcpu->kvm->mmu_lock); 446962306a36Sopenharmony_ci 447062306a36Sopenharmony_ci if (is_page_fault_stale(vcpu, fault)) 447162306a36Sopenharmony_ci goto out_unlock; 447262306a36Sopenharmony_ci 447362306a36Sopenharmony_ci r = kvm_tdp_mmu_map(vcpu, fault); 447462306a36Sopenharmony_ci 447562306a36Sopenharmony_ciout_unlock: 447662306a36Sopenharmony_ci read_unlock(&vcpu->kvm->mmu_lock); 447762306a36Sopenharmony_ci kvm_release_pfn_clean(fault->pfn); 447862306a36Sopenharmony_ci return r; 447962306a36Sopenharmony_ci} 448062306a36Sopenharmony_ci#endif 448162306a36Sopenharmony_ci 448262306a36Sopenharmony_ciint kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 448362306a36Sopenharmony_ci{ 448462306a36Sopenharmony_ci /* 448562306a36Sopenharmony_ci * If the guest's MTRRs may be used to compute the "real" memtype, 448662306a36Sopenharmony_ci * restrict the mapping level to ensure KVM uses a consistent memtype 448762306a36Sopenharmony_ci * across the entire mapping. 
If the host MTRRs are ignored by TDP 448862306a36Sopenharmony_ci * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA 448962306a36Sopenharmony_ci * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype 449062306a36Sopenharmony_ci * from the guest's MTRRs so that guest accesses to memory that is 449162306a36Sopenharmony_ci * DMA'd aren't cached against the guest's wishes. 449262306a36Sopenharmony_ci * 449362306a36Sopenharmony_ci * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, 449462306a36Sopenharmony_ci * e.g. KVM will force UC memtype for host MMIO. 449562306a36Sopenharmony_ci */ 449662306a36Sopenharmony_ci if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { 449762306a36Sopenharmony_ci for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { 449862306a36Sopenharmony_ci int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); 449962306a36Sopenharmony_ci gfn_t base = gfn_round_for_level(fault->gfn, 450062306a36Sopenharmony_ci fault->max_level); 450162306a36Sopenharmony_ci 450262306a36Sopenharmony_ci if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) 450362306a36Sopenharmony_ci break; 450462306a36Sopenharmony_ci } 450562306a36Sopenharmony_ci } 450662306a36Sopenharmony_ci 450762306a36Sopenharmony_ci#ifdef CONFIG_X86_64 450862306a36Sopenharmony_ci if (tdp_mmu_enabled) 450962306a36Sopenharmony_ci return kvm_tdp_mmu_page_fault(vcpu, fault); 451062306a36Sopenharmony_ci#endif 451162306a36Sopenharmony_ci 451262306a36Sopenharmony_ci return direct_page_fault(vcpu, fault); 451362306a36Sopenharmony_ci} 451462306a36Sopenharmony_ci 451562306a36Sopenharmony_cistatic void nonpaging_init_context(struct kvm_mmu *context) 451662306a36Sopenharmony_ci{ 451762306a36Sopenharmony_ci context->page_fault = nonpaging_page_fault; 451862306a36Sopenharmony_ci context->gva_to_gpa = nonpaging_gva_to_gpa; 451962306a36Sopenharmony_ci context->sync_spte = NULL; 452062306a36Sopenharmony_ci} 452162306a36Sopenharmony_ci 452262306a36Sopenharmony_cistatic inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, 452362306a36Sopenharmony_ci union kvm_mmu_page_role role) 452462306a36Sopenharmony_ci{ 452562306a36Sopenharmony_ci struct kvm_mmu_page *sp; 452662306a36Sopenharmony_ci 452762306a36Sopenharmony_ci if (!VALID_PAGE(root->hpa)) 452862306a36Sopenharmony_ci return false; 452962306a36Sopenharmony_ci 453062306a36Sopenharmony_ci if (!role.direct && pgd != root->pgd) 453162306a36Sopenharmony_ci return false; 453262306a36Sopenharmony_ci 453362306a36Sopenharmony_ci sp = root_to_sp(root->hpa); 453462306a36Sopenharmony_ci if (WARN_ON_ONCE(!sp)) 453562306a36Sopenharmony_ci return false; 453662306a36Sopenharmony_ci 453762306a36Sopenharmony_ci return role.word == sp->role.word; 453862306a36Sopenharmony_ci} 453962306a36Sopenharmony_ci 454062306a36Sopenharmony_ci/* 454162306a36Sopenharmony_ci * Find out if a previously cached root matching the new pgd/role is available, 454262306a36Sopenharmony_ci * and insert the current root as the MRU in the cache. 454362306a36Sopenharmony_ci * If a matching root is found, it is assigned to kvm_mmu->root and 454462306a36Sopenharmony_ci * true is returned. 454562306a36Sopenharmony_ci * If no match is found, kvm_mmu->root is left invalid, the LRU root is 454662306a36Sopenharmony_ci * evicted to make room for the current root, and false is returned. 
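 * (The net effect is an MRU-ordered cache: the outgoing current root becomes
 * prev_roots[0] and the entry in prev_roots[KVM_MMU_NUM_PREV_ROOTS - 1] is
 * the LRU candidate for eviction.)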
454762306a36Sopenharmony_ci */ 454862306a36Sopenharmony_cistatic bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu, 454962306a36Sopenharmony_ci gpa_t new_pgd, 455062306a36Sopenharmony_ci union kvm_mmu_page_role new_role) 455162306a36Sopenharmony_ci{ 455262306a36Sopenharmony_ci uint i; 455362306a36Sopenharmony_ci 455462306a36Sopenharmony_ci if (is_root_usable(&mmu->root, new_pgd, new_role)) 455562306a36Sopenharmony_ci return true; 455662306a36Sopenharmony_ci 455762306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 455862306a36Sopenharmony_ci /* 455962306a36Sopenharmony_ci * The swaps end up rotating the cache like this: 456062306a36Sopenharmony_ci * C 0 1 2 3 (on entry to the function) 456162306a36Sopenharmony_ci * 0 C 1 2 3 456262306a36Sopenharmony_ci * 1 C 0 2 3 456362306a36Sopenharmony_ci * 2 C 0 1 3 456462306a36Sopenharmony_ci * 3 C 0 1 2 (on exit from the loop) 456562306a36Sopenharmony_ci */ 456662306a36Sopenharmony_ci swap(mmu->root, mmu->prev_roots[i]); 456762306a36Sopenharmony_ci if (is_root_usable(&mmu->root, new_pgd, new_role)) 456862306a36Sopenharmony_ci return true; 456962306a36Sopenharmony_ci } 457062306a36Sopenharmony_ci 457162306a36Sopenharmony_ci kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); 457262306a36Sopenharmony_ci return false; 457362306a36Sopenharmony_ci} 457462306a36Sopenharmony_ci 457562306a36Sopenharmony_ci/* 457662306a36Sopenharmony_ci * Find out if a previously cached root matching the new pgd/role is available. 457762306a36Sopenharmony_ci * On entry, mmu->root is invalid. 457862306a36Sopenharmony_ci * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry 457962306a36Sopenharmony_ci * of the cache becomes invalid, and true is returned. 458062306a36Sopenharmony_ci * If no match is found, kvm_mmu->root is left invalid and false is returned. 458162306a36Sopenharmony_ci */ 458262306a36Sopenharmony_cistatic bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu, 458362306a36Sopenharmony_ci gpa_t new_pgd, 458462306a36Sopenharmony_ci union kvm_mmu_page_role new_role) 458562306a36Sopenharmony_ci{ 458662306a36Sopenharmony_ci uint i; 458762306a36Sopenharmony_ci 458862306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 458962306a36Sopenharmony_ci if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role)) 459062306a36Sopenharmony_ci goto hit; 459162306a36Sopenharmony_ci 459262306a36Sopenharmony_ci return false; 459362306a36Sopenharmony_ci 459462306a36Sopenharmony_cihit: 459562306a36Sopenharmony_ci swap(mmu->root, mmu->prev_roots[i]); 459662306a36Sopenharmony_ci /* Bubble up the remaining roots. */ 459762306a36Sopenharmony_ci for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++) 459862306a36Sopenharmony_ci mmu->prev_roots[i] = mmu->prev_roots[i + 1]; 459962306a36Sopenharmony_ci mmu->prev_roots[i].hpa = INVALID_PAGE; 460062306a36Sopenharmony_ci return true; 460162306a36Sopenharmony_ci} 460262306a36Sopenharmony_ci 460362306a36Sopenharmony_cistatic bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, 460462306a36Sopenharmony_ci gpa_t new_pgd, union kvm_mmu_page_role new_role) 460562306a36Sopenharmony_ci{ 460662306a36Sopenharmony_ci /* 460762306a36Sopenharmony_ci * Limit reuse to 64-bit hosts+VMs without "special" roots in order to 460862306a36Sopenharmony_ci * avoid having to deal with PDPTEs and other complexities. 
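	 * ("Special" roots are roots without a backing shadow page, e.g.
	 * pae_root as noted in is_page_fault_stale(); the !root_to_sp() check
	 * below simply frees such a root rather than trying to reuse it.)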
460962306a36Sopenharmony_ci */ 461062306a36Sopenharmony_ci if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa)) 461162306a36Sopenharmony_ci kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); 461262306a36Sopenharmony_ci 461362306a36Sopenharmony_ci if (VALID_PAGE(mmu->root.hpa)) 461462306a36Sopenharmony_ci return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role); 461562306a36Sopenharmony_ci else 461662306a36Sopenharmony_ci return cached_root_find_without_current(kvm, mmu, new_pgd, new_role); 461762306a36Sopenharmony_ci} 461862306a36Sopenharmony_ci 461962306a36Sopenharmony_civoid kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) 462062306a36Sopenharmony_ci{ 462162306a36Sopenharmony_ci struct kvm_mmu *mmu = vcpu->arch.mmu; 462262306a36Sopenharmony_ci union kvm_mmu_page_role new_role = mmu->root_role; 462362306a36Sopenharmony_ci 462462306a36Sopenharmony_ci /* 462562306a36Sopenharmony_ci * Return immediately if no usable root was found, kvm_mmu_reload() 462662306a36Sopenharmony_ci * will establish a valid root prior to the next VM-Enter. 462762306a36Sopenharmony_ci */ 462862306a36Sopenharmony_ci if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) 462962306a36Sopenharmony_ci return; 463062306a36Sopenharmony_ci 463162306a36Sopenharmony_ci /* 463262306a36Sopenharmony_ci * It's possible that the cached previous root page is obsolete because 463362306a36Sopenharmony_ci * of a change in the MMU generation number. However, changing the 463462306a36Sopenharmony_ci * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, 463562306a36Sopenharmony_ci * which will free the root set here and allocate a new one. 463662306a36Sopenharmony_ci */ 463762306a36Sopenharmony_ci kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); 463862306a36Sopenharmony_ci 463962306a36Sopenharmony_ci if (force_flush_and_sync_on_reuse) { 464062306a36Sopenharmony_ci kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 464162306a36Sopenharmony_ci kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 464262306a36Sopenharmony_ci } 464362306a36Sopenharmony_ci 464462306a36Sopenharmony_ci /* 464562306a36Sopenharmony_ci * The last MMIO access's GVA and GPA are cached in the VCPU. When 464662306a36Sopenharmony_ci * switching to a new CR3, that GVA->GPA mapping may no longer be 464762306a36Sopenharmony_ci * valid. So clear any cached MMIO info even when we don't need to sync 464862306a36Sopenharmony_ci * the shadow page tables. 464962306a36Sopenharmony_ci */ 465062306a36Sopenharmony_ci vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 465162306a36Sopenharmony_ci 465262306a36Sopenharmony_ci /* 465362306a36Sopenharmony_ci * If this is a direct root page, it doesn't have a write flooding 465462306a36Sopenharmony_ci * count. Otherwise, clear the write flooding count. 
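	 * Direct roots never shadow guest page tables, so guest PTE writes
	 * can't flood them; only shadowed (indirect) roots carry the count in
	 * their struct kvm_mmu_page, hence the !direct check below.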
465562306a36Sopenharmony_ci */ 465662306a36Sopenharmony_ci if (!new_role.direct) { 465762306a36Sopenharmony_ci struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); 465862306a36Sopenharmony_ci 465962306a36Sopenharmony_ci if (!WARN_ON_ONCE(!sp)) 466062306a36Sopenharmony_ci __clear_sp_write_flooding_count(sp); 466162306a36Sopenharmony_ci } 466262306a36Sopenharmony_ci} 466362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); 466462306a36Sopenharmony_ci 466562306a36Sopenharmony_cistatic bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 466662306a36Sopenharmony_ci unsigned int access) 466762306a36Sopenharmony_ci{ 466862306a36Sopenharmony_ci if (unlikely(is_mmio_spte(*sptep))) { 466962306a36Sopenharmony_ci if (gfn != get_mmio_spte_gfn(*sptep)) { 467062306a36Sopenharmony_ci mmu_spte_clear_no_track(sptep); 467162306a36Sopenharmony_ci return true; 467262306a36Sopenharmony_ci } 467362306a36Sopenharmony_ci 467462306a36Sopenharmony_ci mark_mmio_spte(vcpu, sptep, gfn, access); 467562306a36Sopenharmony_ci return true; 467662306a36Sopenharmony_ci } 467762306a36Sopenharmony_ci 467862306a36Sopenharmony_ci return false; 467962306a36Sopenharmony_ci} 468062306a36Sopenharmony_ci 468162306a36Sopenharmony_ci#define PTTYPE_EPT 18 /* arbitrary */ 468262306a36Sopenharmony_ci#define PTTYPE PTTYPE_EPT 468362306a36Sopenharmony_ci#include "paging_tmpl.h" 468462306a36Sopenharmony_ci#undef PTTYPE 468562306a36Sopenharmony_ci 468662306a36Sopenharmony_ci#define PTTYPE 64 468762306a36Sopenharmony_ci#include "paging_tmpl.h" 468862306a36Sopenharmony_ci#undef PTTYPE 468962306a36Sopenharmony_ci 469062306a36Sopenharmony_ci#define PTTYPE 32 469162306a36Sopenharmony_ci#include "paging_tmpl.h" 469262306a36Sopenharmony_ci#undef PTTYPE 469362306a36Sopenharmony_ci 469462306a36Sopenharmony_cistatic void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, 469562306a36Sopenharmony_ci u64 pa_bits_rsvd, int level, bool nx, 469662306a36Sopenharmony_ci bool gbpages, bool pse, bool amd) 469762306a36Sopenharmony_ci{ 469862306a36Sopenharmony_ci u64 gbpages_bit_rsvd = 0; 469962306a36Sopenharmony_ci u64 nonleaf_bit8_rsvd = 0; 470062306a36Sopenharmony_ci u64 high_bits_rsvd; 470162306a36Sopenharmony_ci 470262306a36Sopenharmony_ci rsvd_check->bad_mt_xwr = 0; 470362306a36Sopenharmony_ci 470462306a36Sopenharmony_ci if (!gbpages) 470562306a36Sopenharmony_ci gbpages_bit_rsvd = rsvd_bits(7, 7); 470662306a36Sopenharmony_ci 470762306a36Sopenharmony_ci if (level == PT32E_ROOT_LEVEL) 470862306a36Sopenharmony_ci high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); 470962306a36Sopenharmony_ci else 471062306a36Sopenharmony_ci high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); 471162306a36Sopenharmony_ci 471262306a36Sopenharmony_ci /* Note, NX doesn't exist in PDPTEs, this is handled below. */ 471362306a36Sopenharmony_ci if (!nx) 471462306a36Sopenharmony_ci high_bits_rsvd |= rsvd_bits(63, 63); 471562306a36Sopenharmony_ci 471662306a36Sopenharmony_ci /* 471762306a36Sopenharmony_ci * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for 471862306a36Sopenharmony_ci * leaf entries) on AMD CPUs only. 
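	 * On Intel CPUs bit 8 is not reserved in non-leaf entries, which is
	 * why nonleaf_bit8_rsvd is only set when @amd is true.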
471962306a36Sopenharmony_ci */ 472062306a36Sopenharmony_ci if (amd) 472162306a36Sopenharmony_ci nonleaf_bit8_rsvd = rsvd_bits(8, 8); 472262306a36Sopenharmony_ci 472362306a36Sopenharmony_ci switch (level) { 472462306a36Sopenharmony_ci case PT32_ROOT_LEVEL: 472562306a36Sopenharmony_ci /* no rsvd bits for 2 level 4K page table entries */ 472662306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][1] = 0; 472762306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][0] = 0; 472862306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][0] = 472962306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][0]; 473062306a36Sopenharmony_ci 473162306a36Sopenharmony_ci if (!pse) { 473262306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][1] = 0; 473362306a36Sopenharmony_ci break; 473462306a36Sopenharmony_ci } 473562306a36Sopenharmony_ci 473662306a36Sopenharmony_ci if (is_cpuid_PSE36()) 473762306a36Sopenharmony_ci /* 36bits PSE 4MB page */ 473862306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 473962306a36Sopenharmony_ci else 474062306a36Sopenharmony_ci /* 32 bits PSE 4MB page */ 474162306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 474262306a36Sopenharmony_ci break; 474362306a36Sopenharmony_ci case PT32E_ROOT_LEVEL: 474462306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | 474562306a36Sopenharmony_ci high_bits_rsvd | 474662306a36Sopenharmony_ci rsvd_bits(5, 8) | 474762306a36Sopenharmony_ci rsvd_bits(1, 2); /* PDPTE */ 474862306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ 474962306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ 475062306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | 475162306a36Sopenharmony_ci rsvd_bits(13, 20); /* large page */ 475262306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][0] = 475362306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][0]; 475462306a36Sopenharmony_ci break; 475562306a36Sopenharmony_ci case PT64_ROOT_5LEVEL: 475662306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | 475762306a36Sopenharmony_ci nonleaf_bit8_rsvd | 475862306a36Sopenharmony_ci rsvd_bits(7, 7); 475962306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][4] = 476062306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][4]; 476162306a36Sopenharmony_ci fallthrough; 476262306a36Sopenharmony_ci case PT64_ROOT_4LEVEL: 476362306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | 476462306a36Sopenharmony_ci nonleaf_bit8_rsvd | 476562306a36Sopenharmony_ci rsvd_bits(7, 7); 476662306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | 476762306a36Sopenharmony_ci gbpages_bit_rsvd; 476862306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; 476962306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; 477062306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][3] = 477162306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][3]; 477262306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | 477362306a36Sopenharmony_ci gbpages_bit_rsvd | 477462306a36Sopenharmony_ci rsvd_bits(13, 29); 477562306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | 477662306a36Sopenharmony_ci rsvd_bits(13, 20); /* large page */ 477762306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][0] = 477862306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][0]; 477962306a36Sopenharmony_ci break; 478062306a36Sopenharmony_ci } 478162306a36Sopenharmony_ci} 
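
/*
 * A short worked example of the masks built above (assuming the usual
 * rsvd_bits(lo, hi) convention that bits lo..hi inclusive are reserved):
 * for a 64-bit guest using 4-level paging, rsvd_bits_mask[1][1], i.e. the
 * "large page" mask for a 2MB PDE, ends up as high_bits_rsvd | rsvd_bits(13, 20).
 * That matches the hardware format: the page-frame address of a 2MB PDE
 * starts at bit 21, bit 12 holds PAT, so bits 13..20 must be zero, and
 * anything above the guest's usable physical-address width (pa_bits_rsvd)
 * is reserved as well.
 */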
478262306a36Sopenharmony_ci 478362306a36Sopenharmony_cistatic void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, 478462306a36Sopenharmony_ci struct kvm_mmu *context) 478562306a36Sopenharmony_ci{ 478662306a36Sopenharmony_ci __reset_rsvds_bits_mask(&context->guest_rsvd_check, 478762306a36Sopenharmony_ci vcpu->arch.reserved_gpa_bits, 478862306a36Sopenharmony_ci context->cpu_role.base.level, is_efer_nx(context), 478962306a36Sopenharmony_ci guest_can_use(vcpu, X86_FEATURE_GBPAGES), 479062306a36Sopenharmony_ci is_cr4_pse(context), 479162306a36Sopenharmony_ci guest_cpuid_is_amd_or_hygon(vcpu)); 479262306a36Sopenharmony_ci} 479362306a36Sopenharmony_ci 479462306a36Sopenharmony_cistatic void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, 479562306a36Sopenharmony_ci u64 pa_bits_rsvd, bool execonly, 479662306a36Sopenharmony_ci int huge_page_level) 479762306a36Sopenharmony_ci{ 479862306a36Sopenharmony_ci u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); 479962306a36Sopenharmony_ci u64 large_1g_rsvd = 0, large_2m_rsvd = 0; 480062306a36Sopenharmony_ci u64 bad_mt_xwr; 480162306a36Sopenharmony_ci 480262306a36Sopenharmony_ci if (huge_page_level < PG_LEVEL_1G) 480362306a36Sopenharmony_ci large_1g_rsvd = rsvd_bits(7, 7); 480462306a36Sopenharmony_ci if (huge_page_level < PG_LEVEL_2M) 480562306a36Sopenharmony_ci large_2m_rsvd = rsvd_bits(7, 7); 480662306a36Sopenharmony_ci 480762306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); 480862306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); 480962306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; 481062306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; 481162306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; 481262306a36Sopenharmony_ci 481362306a36Sopenharmony_ci /* large page */ 481462306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; 481562306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; 481662306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; 481762306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; 481862306a36Sopenharmony_ci rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; 481962306a36Sopenharmony_ci 482062306a36Sopenharmony_ci bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ 482162306a36Sopenharmony_ci bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ 482262306a36Sopenharmony_ci bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ 482362306a36Sopenharmony_ci bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ 482462306a36Sopenharmony_ci bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ 482562306a36Sopenharmony_ci if (!execonly) { 482662306a36Sopenharmony_ci /* bits 0..2 must not be 100 unless VMX capabilities allow it */ 482762306a36Sopenharmony_ci bad_mt_xwr |= REPEAT_BYTE(1ull << 4); 482862306a36Sopenharmony_ci } 482962306a36Sopenharmony_ci rsvd_check->bad_mt_xwr = bad_mt_xwr; 483062306a36Sopenharmony_ci} 483162306a36Sopenharmony_ci 483262306a36Sopenharmony_cistatic void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, 483362306a36Sopenharmony_ci struct kvm_mmu *context, bool execonly, int huge_page_level) 
483462306a36Sopenharmony_ci{ 483562306a36Sopenharmony_ci __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, 483662306a36Sopenharmony_ci vcpu->arch.reserved_gpa_bits, execonly, 483762306a36Sopenharmony_ci huge_page_level); 483862306a36Sopenharmony_ci} 483962306a36Sopenharmony_ci 484062306a36Sopenharmony_cistatic inline u64 reserved_hpa_bits(void) 484162306a36Sopenharmony_ci{ 484262306a36Sopenharmony_ci return rsvd_bits(shadow_phys_bits, 63); 484362306a36Sopenharmony_ci} 484462306a36Sopenharmony_ci 484562306a36Sopenharmony_ci/* 484662306a36Sopenharmony_ci * the page table on host is the shadow page table for the page 484762306a36Sopenharmony_ci * table in guest or amd nested guest, its mmu features completely 484862306a36Sopenharmony_ci * follow the features in guest. 484962306a36Sopenharmony_ci */ 485062306a36Sopenharmony_cistatic void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 485162306a36Sopenharmony_ci struct kvm_mmu *context) 485262306a36Sopenharmony_ci{ 485362306a36Sopenharmony_ci /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ 485462306a36Sopenharmony_ci bool is_amd = true; 485562306a36Sopenharmony_ci /* KVM doesn't use 2-level page tables for the shadow MMU. */ 485662306a36Sopenharmony_ci bool is_pse = false; 485762306a36Sopenharmony_ci struct rsvd_bits_validate *shadow_zero_check; 485862306a36Sopenharmony_ci int i; 485962306a36Sopenharmony_ci 486062306a36Sopenharmony_ci WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL); 486162306a36Sopenharmony_ci 486262306a36Sopenharmony_ci shadow_zero_check = &context->shadow_zero_check; 486362306a36Sopenharmony_ci __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), 486462306a36Sopenharmony_ci context->root_role.level, 486562306a36Sopenharmony_ci context->root_role.efer_nx, 486662306a36Sopenharmony_ci guest_can_use(vcpu, X86_FEATURE_GBPAGES), 486762306a36Sopenharmony_ci is_pse, is_amd); 486862306a36Sopenharmony_ci 486962306a36Sopenharmony_ci if (!shadow_me_mask) 487062306a36Sopenharmony_ci return; 487162306a36Sopenharmony_ci 487262306a36Sopenharmony_ci for (i = context->root_role.level; --i >= 0;) { 487362306a36Sopenharmony_ci /* 487462306a36Sopenharmony_ci * So far shadow_me_value is a constant during KVM's life 487562306a36Sopenharmony_ci * time. Bits in shadow_me_value are allowed to be set. 487662306a36Sopenharmony_ci * Bits in shadow_me_mask but not in shadow_me_value are 487762306a36Sopenharmony_ci * not allowed to be set. 487862306a36Sopenharmony_ci */ 487962306a36Sopenharmony_ci shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask; 488062306a36Sopenharmony_ci shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask; 488162306a36Sopenharmony_ci shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value; 488262306a36Sopenharmony_ci shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value; 488362306a36Sopenharmony_ci } 488462306a36Sopenharmony_ci 488562306a36Sopenharmony_ci} 488662306a36Sopenharmony_ci 488762306a36Sopenharmony_cistatic inline bool boot_cpu_is_amd(void) 488862306a36Sopenharmony_ci{ 488962306a36Sopenharmony_ci WARN_ON_ONCE(!tdp_enabled); 489062306a36Sopenharmony_ci return shadow_x_mask == 0; 489162306a36Sopenharmony_ci} 489262306a36Sopenharmony_ci 489362306a36Sopenharmony_ci/* 489462306a36Sopenharmony_ci * the direct page table on host, use as much mmu features as 489562306a36Sopenharmony_ci * possible, however, kvm currently does not do execution-protection. 
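 * In practice that means the reserved-bit checks below are derived from the
 * boot CPU's own capabilities (e.g. boot_cpu_has(X86_FEATURE_GBPAGES)) rather
 * than from guest CPUID, since TDP SPTEs are host-format page tables.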
489662306a36Sopenharmony_ci */ 489762306a36Sopenharmony_cistatic void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) 489862306a36Sopenharmony_ci{ 489962306a36Sopenharmony_ci struct rsvd_bits_validate *shadow_zero_check; 490062306a36Sopenharmony_ci int i; 490162306a36Sopenharmony_ci 490262306a36Sopenharmony_ci shadow_zero_check = &context->shadow_zero_check; 490362306a36Sopenharmony_ci 490462306a36Sopenharmony_ci if (boot_cpu_is_amd()) 490562306a36Sopenharmony_ci __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), 490662306a36Sopenharmony_ci context->root_role.level, true, 490762306a36Sopenharmony_ci boot_cpu_has(X86_FEATURE_GBPAGES), 490862306a36Sopenharmony_ci false, true); 490962306a36Sopenharmony_ci else 491062306a36Sopenharmony_ci __reset_rsvds_bits_mask_ept(shadow_zero_check, 491162306a36Sopenharmony_ci reserved_hpa_bits(), false, 491262306a36Sopenharmony_ci max_huge_page_level); 491362306a36Sopenharmony_ci 491462306a36Sopenharmony_ci if (!shadow_me_mask) 491562306a36Sopenharmony_ci return; 491662306a36Sopenharmony_ci 491762306a36Sopenharmony_ci for (i = context->root_role.level; --i >= 0;) { 491862306a36Sopenharmony_ci shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 491962306a36Sopenharmony_ci shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 492062306a36Sopenharmony_ci } 492162306a36Sopenharmony_ci} 492262306a36Sopenharmony_ci 492362306a36Sopenharmony_ci/* 492462306a36Sopenharmony_ci * as the comments in reset_shadow_zero_bits_mask() except it 492562306a36Sopenharmony_ci * is the shadow page table for intel nested guest. 492662306a36Sopenharmony_ci */ 492762306a36Sopenharmony_cistatic void 492862306a36Sopenharmony_cireset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) 492962306a36Sopenharmony_ci{ 493062306a36Sopenharmony_ci __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 493162306a36Sopenharmony_ci reserved_hpa_bits(), execonly, 493262306a36Sopenharmony_ci max_huge_page_level); 493362306a36Sopenharmony_ci} 493462306a36Sopenharmony_ci 493562306a36Sopenharmony_ci#define BYTE_MASK(access) \ 493662306a36Sopenharmony_ci ((1 & (access) ? 2 : 0) | \ 493762306a36Sopenharmony_ci (2 & (access) ? 4 : 0) | \ 493862306a36Sopenharmony_ci (3 & (access) ? 8 : 0) | \ 493962306a36Sopenharmony_ci (4 & (access) ? 16 : 0) | \ 494062306a36Sopenharmony_ci (5 & (access) ? 32 : 0) | \ 494162306a36Sopenharmony_ci (6 & (access) ? 64 : 0) | \ 494262306a36Sopenharmony_ci (7 & (access) ? 
128 : 0)) 494362306a36Sopenharmony_ci 494462306a36Sopenharmony_ci 494562306a36Sopenharmony_cistatic void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) 494662306a36Sopenharmony_ci{ 494762306a36Sopenharmony_ci unsigned byte; 494862306a36Sopenharmony_ci 494962306a36Sopenharmony_ci const u8 x = BYTE_MASK(ACC_EXEC_MASK); 495062306a36Sopenharmony_ci const u8 w = BYTE_MASK(ACC_WRITE_MASK); 495162306a36Sopenharmony_ci const u8 u = BYTE_MASK(ACC_USER_MASK); 495262306a36Sopenharmony_ci 495362306a36Sopenharmony_ci bool cr4_smep = is_cr4_smep(mmu); 495462306a36Sopenharmony_ci bool cr4_smap = is_cr4_smap(mmu); 495562306a36Sopenharmony_ci bool cr0_wp = is_cr0_wp(mmu); 495662306a36Sopenharmony_ci bool efer_nx = is_efer_nx(mmu); 495762306a36Sopenharmony_ci 495862306a36Sopenharmony_ci for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { 495962306a36Sopenharmony_ci unsigned pfec = byte << 1; 496062306a36Sopenharmony_ci 496162306a36Sopenharmony_ci /* 496262306a36Sopenharmony_ci * Each "*f" variable has a 1 bit for each UWX value 496362306a36Sopenharmony_ci * that causes a fault with the given PFEC. 496462306a36Sopenharmony_ci */ 496562306a36Sopenharmony_ci 496662306a36Sopenharmony_ci /* Faults from writes to non-writable pages */ 496762306a36Sopenharmony_ci u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; 496862306a36Sopenharmony_ci /* Faults from user mode accesses to supervisor pages */ 496962306a36Sopenharmony_ci u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; 497062306a36Sopenharmony_ci /* Faults from fetches of non-executable pages*/ 497162306a36Sopenharmony_ci u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; 497262306a36Sopenharmony_ci /* Faults from kernel mode fetches of user pages */ 497362306a36Sopenharmony_ci u8 smepf = 0; 497462306a36Sopenharmony_ci /* Faults from kernel mode accesses of user pages */ 497562306a36Sopenharmony_ci u8 smapf = 0; 497662306a36Sopenharmony_ci 497762306a36Sopenharmony_ci if (!ept) { 497862306a36Sopenharmony_ci /* Faults from kernel mode accesses to user pages */ 497962306a36Sopenharmony_ci u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; 498062306a36Sopenharmony_ci 498162306a36Sopenharmony_ci /* Not really needed: !nx will cause pte.nx to fault */ 498262306a36Sopenharmony_ci if (!efer_nx) 498362306a36Sopenharmony_ci ff = 0; 498462306a36Sopenharmony_ci 498562306a36Sopenharmony_ci /* Allow supervisor writes if !cr0.wp */ 498662306a36Sopenharmony_ci if (!cr0_wp) 498762306a36Sopenharmony_ci wf = (pfec & PFERR_USER_MASK) ? wf : 0; 498862306a36Sopenharmony_ci 498962306a36Sopenharmony_ci /* Disallow supervisor fetches of user code if cr4.smep */ 499062306a36Sopenharmony_ci if (cr4_smep) 499162306a36Sopenharmony_ci smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; 499262306a36Sopenharmony_ci 499362306a36Sopenharmony_ci /* 499462306a36Sopenharmony_ci * SMAP:kernel-mode data accesses from user-mode 499562306a36Sopenharmony_ci * mappings should fault. A fault is considered 499662306a36Sopenharmony_ci * as a SMAP violation if all of the following 499762306a36Sopenharmony_ci * conditions are true: 499862306a36Sopenharmony_ci * - X86_CR4_SMAP is set in CR4 499962306a36Sopenharmony_ci * - A user page is accessed 500062306a36Sopenharmony_ci * - The access is not a fetch 500162306a36Sopenharmony_ci * - The access is supervisor mode 500262306a36Sopenharmony_ci * - If implicit supervisor access or X86_EFLAGS_AC is clear 500362306a36Sopenharmony_ci * 500462306a36Sopenharmony_ci * Here, we cover the first four conditions. 
500562306a36Sopenharmony_ci * The fifth is computed dynamically in permission_fault(); 500662306a36Sopenharmony_ci * PFERR_RSVD_MASK bit will be set in PFEC if the access is 500762306a36Sopenharmony_ci * *not* subject to SMAP restrictions. 500862306a36Sopenharmony_ci */ 500962306a36Sopenharmony_ci if (cr4_smap) 501062306a36Sopenharmony_ci smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; 501162306a36Sopenharmony_ci } 501262306a36Sopenharmony_ci 501362306a36Sopenharmony_ci mmu->permissions[byte] = ff | uf | wf | smepf | smapf; 501462306a36Sopenharmony_ci } 501562306a36Sopenharmony_ci} 501662306a36Sopenharmony_ci 501762306a36Sopenharmony_ci/* 501862306a36Sopenharmony_ci* PKU is an additional mechanism by which the paging controls access to 501962306a36Sopenharmony_ci* user-mode addresses based on the value in the PKRU register. Protection 502062306a36Sopenharmony_ci* key violations are reported through a bit in the page fault error code. 502162306a36Sopenharmony_ci* Unlike other bits of the error code, the PK bit is not known at the 502262306a36Sopenharmony_ci* call site of e.g. gva_to_gpa; it must be computed directly in 502362306a36Sopenharmony_ci* permission_fault based on two bits of PKRU, on some machine state (CR4, 502462306a36Sopenharmony_ci* CR0, EFER, CPL), and on other bits of the error code and the page tables. 502562306a36Sopenharmony_ci* 502662306a36Sopenharmony_ci* In particular the following conditions come from the error code, the 502762306a36Sopenharmony_ci* page tables and the machine state: 502862306a36Sopenharmony_ci* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 502962306a36Sopenharmony_ci* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) 503062306a36Sopenharmony_ci* - PK is always zero if U=0 in the page tables 503162306a36Sopenharmony_ci* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. 503262306a36Sopenharmony_ci* 503362306a36Sopenharmony_ci* The PKRU bitmask caches the result of these four conditions. The error 503462306a36Sopenharmony_ci* code (minus the P bit) and the page table's U bit form an index into the 503562306a36Sopenharmony_ci* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed 503662306a36Sopenharmony_ci* with the two bits of the PKRU register corresponding to the protection key. 503762306a36Sopenharmony_ci* For the first three conditions above the bits will be 00, thus masking 503862306a36Sopenharmony_ci* away both AD and WD. For all reads or if the last condition holds, WD 503962306a36Sopenharmony_ci* only will be masked away. 
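*
* As a worked example: with CR4.PKE=1, a user-mode write to a user page makes
* check_pkey and check_write below both true, so both the AD and WD bits of
* the relevant protection key can block the access. A read of the same page
* sets only check_pkey, so only PKRU.AD is consulted, matching "for all reads
* ... WD only will be masked away" above.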
504062306a36Sopenharmony_ci*/ 504162306a36Sopenharmony_cistatic void update_pkru_bitmask(struct kvm_mmu *mmu) 504262306a36Sopenharmony_ci{ 504362306a36Sopenharmony_ci unsigned bit; 504462306a36Sopenharmony_ci bool wp; 504562306a36Sopenharmony_ci 504662306a36Sopenharmony_ci mmu->pkru_mask = 0; 504762306a36Sopenharmony_ci 504862306a36Sopenharmony_ci if (!is_cr4_pke(mmu)) 504962306a36Sopenharmony_ci return; 505062306a36Sopenharmony_ci 505162306a36Sopenharmony_ci wp = is_cr0_wp(mmu); 505262306a36Sopenharmony_ci 505362306a36Sopenharmony_ci for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { 505462306a36Sopenharmony_ci unsigned pfec, pkey_bits; 505562306a36Sopenharmony_ci bool check_pkey, check_write, ff, uf, wf, pte_user; 505662306a36Sopenharmony_ci 505762306a36Sopenharmony_ci pfec = bit << 1; 505862306a36Sopenharmony_ci ff = pfec & PFERR_FETCH_MASK; 505962306a36Sopenharmony_ci uf = pfec & PFERR_USER_MASK; 506062306a36Sopenharmony_ci wf = pfec & PFERR_WRITE_MASK; 506162306a36Sopenharmony_ci 506262306a36Sopenharmony_ci /* PFEC.RSVD is replaced by ACC_USER_MASK. */ 506362306a36Sopenharmony_ci pte_user = pfec & PFERR_RSVD_MASK; 506462306a36Sopenharmony_ci 506562306a36Sopenharmony_ci /* 506662306a36Sopenharmony_ci * Only need to check the access which is not an 506762306a36Sopenharmony_ci * instruction fetch and is to a user page. 506862306a36Sopenharmony_ci */ 506962306a36Sopenharmony_ci check_pkey = (!ff && pte_user); 507062306a36Sopenharmony_ci /* 507162306a36Sopenharmony_ci * write access is controlled by PKRU if it is a 507262306a36Sopenharmony_ci * user access or CR0.WP = 1. 507362306a36Sopenharmony_ci */ 507462306a36Sopenharmony_ci check_write = check_pkey && wf && (uf || wp); 507562306a36Sopenharmony_ci 507662306a36Sopenharmony_ci /* PKRU.AD stops both read and write access. */ 507762306a36Sopenharmony_ci pkey_bits = !!check_pkey; 507862306a36Sopenharmony_ci /* PKRU.WD stops write access. 
*/ 507962306a36Sopenharmony_ci pkey_bits |= (!!check_write) << 1; 508062306a36Sopenharmony_ci 508162306a36Sopenharmony_ci mmu->pkru_mask |= (pkey_bits & 3) << pfec; 508262306a36Sopenharmony_ci } 508362306a36Sopenharmony_ci} 508462306a36Sopenharmony_ci 508562306a36Sopenharmony_cistatic void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, 508662306a36Sopenharmony_ci struct kvm_mmu *mmu) 508762306a36Sopenharmony_ci{ 508862306a36Sopenharmony_ci if (!is_cr0_pg(mmu)) 508962306a36Sopenharmony_ci return; 509062306a36Sopenharmony_ci 509162306a36Sopenharmony_ci reset_guest_rsvds_bits_mask(vcpu, mmu); 509262306a36Sopenharmony_ci update_permission_bitmask(mmu, false); 509362306a36Sopenharmony_ci update_pkru_bitmask(mmu); 509462306a36Sopenharmony_ci} 509562306a36Sopenharmony_ci 509662306a36Sopenharmony_cistatic void paging64_init_context(struct kvm_mmu *context) 509762306a36Sopenharmony_ci{ 509862306a36Sopenharmony_ci context->page_fault = paging64_page_fault; 509962306a36Sopenharmony_ci context->gva_to_gpa = paging64_gva_to_gpa; 510062306a36Sopenharmony_ci context->sync_spte = paging64_sync_spte; 510162306a36Sopenharmony_ci} 510262306a36Sopenharmony_ci 510362306a36Sopenharmony_cistatic void paging32_init_context(struct kvm_mmu *context) 510462306a36Sopenharmony_ci{ 510562306a36Sopenharmony_ci context->page_fault = paging32_page_fault; 510662306a36Sopenharmony_ci context->gva_to_gpa = paging32_gva_to_gpa; 510762306a36Sopenharmony_ci context->sync_spte = paging32_sync_spte; 510862306a36Sopenharmony_ci} 510962306a36Sopenharmony_ci 511062306a36Sopenharmony_cistatic union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, 511162306a36Sopenharmony_ci const struct kvm_mmu_role_regs *regs) 511262306a36Sopenharmony_ci{ 511362306a36Sopenharmony_ci union kvm_cpu_role role = {0}; 511462306a36Sopenharmony_ci 511562306a36Sopenharmony_ci role.base.access = ACC_ALL; 511662306a36Sopenharmony_ci role.base.smm = is_smm(vcpu); 511762306a36Sopenharmony_ci role.base.guest_mode = is_guest_mode(vcpu); 511862306a36Sopenharmony_ci role.ext.valid = 1; 511962306a36Sopenharmony_ci 512062306a36Sopenharmony_ci if (!____is_cr0_pg(regs)) { 512162306a36Sopenharmony_ci role.base.direct = 1; 512262306a36Sopenharmony_ci return role; 512362306a36Sopenharmony_ci } 512462306a36Sopenharmony_ci 512562306a36Sopenharmony_ci role.base.efer_nx = ____is_efer_nx(regs); 512662306a36Sopenharmony_ci role.base.cr0_wp = ____is_cr0_wp(regs); 512762306a36Sopenharmony_ci role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs); 512862306a36Sopenharmony_ci role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs); 512962306a36Sopenharmony_ci role.base.has_4_byte_gpte = !____is_cr4_pae(regs); 513062306a36Sopenharmony_ci 513162306a36Sopenharmony_ci if (____is_efer_lma(regs)) 513262306a36Sopenharmony_ci role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL 513362306a36Sopenharmony_ci : PT64_ROOT_4LEVEL; 513462306a36Sopenharmony_ci else if (____is_cr4_pae(regs)) 513562306a36Sopenharmony_ci role.base.level = PT32E_ROOT_LEVEL; 513662306a36Sopenharmony_ci else 513762306a36Sopenharmony_ci role.base.level = PT32_ROOT_LEVEL; 513862306a36Sopenharmony_ci 513962306a36Sopenharmony_ci role.ext.cr4_smep = ____is_cr4_smep(regs); 514062306a36Sopenharmony_ci role.ext.cr4_smap = ____is_cr4_smap(regs); 514162306a36Sopenharmony_ci role.ext.cr4_pse = ____is_cr4_pse(regs); 514262306a36Sopenharmony_ci 514362306a36Sopenharmony_ci /* PKEY and LA57 are active iff long mode is active. 
*/ 514462306a36Sopenharmony_ci role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); 514562306a36Sopenharmony_ci role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); 514662306a36Sopenharmony_ci role.ext.efer_lma = ____is_efer_lma(regs); 514762306a36Sopenharmony_ci return role; 514862306a36Sopenharmony_ci} 514962306a36Sopenharmony_ci 515062306a36Sopenharmony_civoid __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, 515162306a36Sopenharmony_ci struct kvm_mmu *mmu) 515262306a36Sopenharmony_ci{ 515362306a36Sopenharmony_ci const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP); 515462306a36Sopenharmony_ci 515562306a36Sopenharmony_ci BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP); 515662306a36Sopenharmony_ci BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS)); 515762306a36Sopenharmony_ci 515862306a36Sopenharmony_ci if (is_cr0_wp(mmu) == cr0_wp) 515962306a36Sopenharmony_ci return; 516062306a36Sopenharmony_ci 516162306a36Sopenharmony_ci mmu->cpu_role.base.cr0_wp = cr0_wp; 516262306a36Sopenharmony_ci reset_guest_paging_metadata(vcpu, mmu); 516362306a36Sopenharmony_ci} 516462306a36Sopenharmony_ci 516562306a36Sopenharmony_cistatic inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) 516662306a36Sopenharmony_ci{ 516762306a36Sopenharmony_ci /* tdp_root_level is architecture forced level, use it if nonzero */ 516862306a36Sopenharmony_ci if (tdp_root_level) 516962306a36Sopenharmony_ci return tdp_root_level; 517062306a36Sopenharmony_ci 517162306a36Sopenharmony_ci /* Use 5-level TDP if and only if it's useful/necessary. */ 517262306a36Sopenharmony_ci if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) 517362306a36Sopenharmony_ci return 4; 517462306a36Sopenharmony_ci 517562306a36Sopenharmony_ci return max_tdp_level; 517662306a36Sopenharmony_ci} 517762306a36Sopenharmony_ci 517862306a36Sopenharmony_cistatic union kvm_mmu_page_role 517962306a36Sopenharmony_cikvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, 518062306a36Sopenharmony_ci union kvm_cpu_role cpu_role) 518162306a36Sopenharmony_ci{ 518262306a36Sopenharmony_ci union kvm_mmu_page_role role = {0}; 518362306a36Sopenharmony_ci 518462306a36Sopenharmony_ci role.access = ACC_ALL; 518562306a36Sopenharmony_ci role.cr0_wp = true; 518662306a36Sopenharmony_ci role.efer_nx = true; 518762306a36Sopenharmony_ci role.smm = cpu_role.base.smm; 518862306a36Sopenharmony_ci role.guest_mode = cpu_role.base.guest_mode; 518962306a36Sopenharmony_ci role.ad_disabled = !kvm_ad_enabled(); 519062306a36Sopenharmony_ci role.level = kvm_mmu_get_tdp_level(vcpu); 519162306a36Sopenharmony_ci role.direct = true; 519262306a36Sopenharmony_ci role.has_4_byte_gpte = false; 519362306a36Sopenharmony_ci 519462306a36Sopenharmony_ci return role; 519562306a36Sopenharmony_ci} 519662306a36Sopenharmony_ci 519762306a36Sopenharmony_cistatic void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, 519862306a36Sopenharmony_ci union kvm_cpu_role cpu_role) 519962306a36Sopenharmony_ci{ 520062306a36Sopenharmony_ci struct kvm_mmu *context = &vcpu->arch.root_mmu; 520162306a36Sopenharmony_ci union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role); 520262306a36Sopenharmony_ci 520362306a36Sopenharmony_ci if (cpu_role.as_u64 == context->cpu_role.as_u64 && 520462306a36Sopenharmony_ci root_role.word == context->root_role.word) 520562306a36Sopenharmony_ci return; 520662306a36Sopenharmony_ci 520762306a36Sopenharmony_ci context->cpu_role.as_u64 = cpu_role.as_u64; 520862306a36Sopenharmony_ci 
context->root_role.word = root_role.word; 520962306a36Sopenharmony_ci context->page_fault = kvm_tdp_page_fault; 521062306a36Sopenharmony_ci context->sync_spte = NULL; 521162306a36Sopenharmony_ci context->get_guest_pgd = get_guest_cr3; 521262306a36Sopenharmony_ci context->get_pdptr = kvm_pdptr_read; 521362306a36Sopenharmony_ci context->inject_page_fault = kvm_inject_page_fault; 521462306a36Sopenharmony_ci 521562306a36Sopenharmony_ci if (!is_cr0_pg(context)) 521662306a36Sopenharmony_ci context->gva_to_gpa = nonpaging_gva_to_gpa; 521762306a36Sopenharmony_ci else if (is_cr4_pae(context)) 521862306a36Sopenharmony_ci context->gva_to_gpa = paging64_gva_to_gpa; 521962306a36Sopenharmony_ci else 522062306a36Sopenharmony_ci context->gva_to_gpa = paging32_gva_to_gpa; 522162306a36Sopenharmony_ci 522262306a36Sopenharmony_ci reset_guest_paging_metadata(vcpu, context); 522362306a36Sopenharmony_ci reset_tdp_shadow_zero_bits_mask(context); 522462306a36Sopenharmony_ci} 522562306a36Sopenharmony_ci 522662306a36Sopenharmony_cistatic void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 522762306a36Sopenharmony_ci union kvm_cpu_role cpu_role, 522862306a36Sopenharmony_ci union kvm_mmu_page_role root_role) 522962306a36Sopenharmony_ci{ 523062306a36Sopenharmony_ci if (cpu_role.as_u64 == context->cpu_role.as_u64 && 523162306a36Sopenharmony_ci root_role.word == context->root_role.word) 523262306a36Sopenharmony_ci return; 523362306a36Sopenharmony_ci 523462306a36Sopenharmony_ci context->cpu_role.as_u64 = cpu_role.as_u64; 523562306a36Sopenharmony_ci context->root_role.word = root_role.word; 523662306a36Sopenharmony_ci 523762306a36Sopenharmony_ci if (!is_cr0_pg(context)) 523862306a36Sopenharmony_ci nonpaging_init_context(context); 523962306a36Sopenharmony_ci else if (is_cr4_pae(context)) 524062306a36Sopenharmony_ci paging64_init_context(context); 524162306a36Sopenharmony_ci else 524262306a36Sopenharmony_ci paging32_init_context(context); 524362306a36Sopenharmony_ci 524462306a36Sopenharmony_ci reset_guest_paging_metadata(vcpu, context); 524562306a36Sopenharmony_ci reset_shadow_zero_bits_mask(vcpu, context); 524662306a36Sopenharmony_ci} 524762306a36Sopenharmony_ci 524862306a36Sopenharmony_cistatic void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, 524962306a36Sopenharmony_ci union kvm_cpu_role cpu_role) 525062306a36Sopenharmony_ci{ 525162306a36Sopenharmony_ci struct kvm_mmu *context = &vcpu->arch.root_mmu; 525262306a36Sopenharmony_ci union kvm_mmu_page_role root_role; 525362306a36Sopenharmony_ci 525462306a36Sopenharmony_ci root_role = cpu_role.base; 525562306a36Sopenharmony_ci 525662306a36Sopenharmony_ci /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */ 525762306a36Sopenharmony_ci root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL); 525862306a36Sopenharmony_ci 525962306a36Sopenharmony_ci /* 526062306a36Sopenharmony_ci * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role. 526162306a36Sopenharmony_ci * KVM uses NX when TDP is disabled to handle a variety of scenarios, 526262306a36Sopenharmony_ci * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and 526362306a36Sopenharmony_ci * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. 526462306a36Sopenharmony_ci * The iTLB multi-hit workaround can be toggled at any time, so assume 526562306a36Sopenharmony_ci * NX can be used by any non-nested shadow MMU to avoid having to reset 526662306a36Sopenharmony_ci * MMU contexts. 
526762306a36Sopenharmony_ci	 */
526862306a36Sopenharmony_ci	root_role.efer_nx = true;
526962306a36Sopenharmony_ci
527062306a36Sopenharmony_ci	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
527162306a36Sopenharmony_ci}
527262306a36Sopenharmony_ci
527362306a36Sopenharmony_civoid kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
527462306a36Sopenharmony_ci			     unsigned long cr4, u64 efer, gpa_t nested_cr3)
527562306a36Sopenharmony_ci{
527662306a36Sopenharmony_ci	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
527762306a36Sopenharmony_ci	struct kvm_mmu_role_regs regs = {
527862306a36Sopenharmony_ci		.cr0 = cr0,
527962306a36Sopenharmony_ci		.cr4 = cr4 & ~X86_CR4_PKE,
528062306a36Sopenharmony_ci		.efer = efer,
528162306a36Sopenharmony_ci	};
528262306a36Sopenharmony_ci	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
528362306a36Sopenharmony_ci	union kvm_mmu_page_role root_role;
528462306a36Sopenharmony_ci
528562306a36Sopenharmony_ci	/* NPT requires CR0.PG=1. */
528662306a36Sopenharmony_ci	WARN_ON_ONCE(cpu_role.base.direct);
528762306a36Sopenharmony_ci
528862306a36Sopenharmony_ci	root_role = cpu_role.base;
528962306a36Sopenharmony_ci	root_role.level = kvm_mmu_get_tdp_level(vcpu);
529062306a36Sopenharmony_ci	if (root_role.level == PT64_ROOT_5LEVEL &&
529162306a36Sopenharmony_ci	    cpu_role.base.level == PT64_ROOT_4LEVEL)
529262306a36Sopenharmony_ci		root_role.passthrough = 1;
529362306a36Sopenharmony_ci
529462306a36Sopenharmony_ci	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
529562306a36Sopenharmony_ci	kvm_mmu_new_pgd(vcpu, nested_cr3);
529662306a36Sopenharmony_ci}
529762306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
529862306a36Sopenharmony_ci
529962306a36Sopenharmony_cistatic union kvm_cpu_role
530062306a36Sopenharmony_cikvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
530162306a36Sopenharmony_ci				   bool execonly, u8 level)
530262306a36Sopenharmony_ci{
530362306a36Sopenharmony_ci	union kvm_cpu_role role = {0};
530462306a36Sopenharmony_ci
530562306a36Sopenharmony_ci	/*
530662306a36Sopenharmony_ci	 * KVM does not support SMM transfer monitors, and consequently does not
530762306a36Sopenharmony_ci	 * support the "entry to SMM" control either. role.base.smm is always 0.
530862306a36Sopenharmony_ci */ 530962306a36Sopenharmony_ci WARN_ON_ONCE(is_smm(vcpu)); 531062306a36Sopenharmony_ci role.base.level = level; 531162306a36Sopenharmony_ci role.base.has_4_byte_gpte = false; 531262306a36Sopenharmony_ci role.base.direct = false; 531362306a36Sopenharmony_ci role.base.ad_disabled = !accessed_dirty; 531462306a36Sopenharmony_ci role.base.guest_mode = true; 531562306a36Sopenharmony_ci role.base.access = ACC_ALL; 531662306a36Sopenharmony_ci 531762306a36Sopenharmony_ci role.ext.word = 0; 531862306a36Sopenharmony_ci role.ext.execonly = execonly; 531962306a36Sopenharmony_ci role.ext.valid = 1; 532062306a36Sopenharmony_ci 532162306a36Sopenharmony_ci return role; 532262306a36Sopenharmony_ci} 532362306a36Sopenharmony_ci 532462306a36Sopenharmony_civoid kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 532562306a36Sopenharmony_ci int huge_page_level, bool accessed_dirty, 532662306a36Sopenharmony_ci gpa_t new_eptp) 532762306a36Sopenharmony_ci{ 532862306a36Sopenharmony_ci struct kvm_mmu *context = &vcpu->arch.guest_mmu; 532962306a36Sopenharmony_ci u8 level = vmx_eptp_page_walk_level(new_eptp); 533062306a36Sopenharmony_ci union kvm_cpu_role new_mode = 533162306a36Sopenharmony_ci kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, 533262306a36Sopenharmony_ci execonly, level); 533362306a36Sopenharmony_ci 533462306a36Sopenharmony_ci if (new_mode.as_u64 != context->cpu_role.as_u64) { 533562306a36Sopenharmony_ci /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ 533662306a36Sopenharmony_ci context->cpu_role.as_u64 = new_mode.as_u64; 533762306a36Sopenharmony_ci context->root_role.word = new_mode.base.word; 533862306a36Sopenharmony_ci 533962306a36Sopenharmony_ci context->page_fault = ept_page_fault; 534062306a36Sopenharmony_ci context->gva_to_gpa = ept_gva_to_gpa; 534162306a36Sopenharmony_ci context->sync_spte = ept_sync_spte; 534262306a36Sopenharmony_ci 534362306a36Sopenharmony_ci update_permission_bitmask(context, true); 534462306a36Sopenharmony_ci context->pkru_mask = 0; 534562306a36Sopenharmony_ci reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); 534662306a36Sopenharmony_ci reset_ept_shadow_zero_bits_mask(context, execonly); 534762306a36Sopenharmony_ci } 534862306a36Sopenharmony_ci 534962306a36Sopenharmony_ci kvm_mmu_new_pgd(vcpu, new_eptp); 535062306a36Sopenharmony_ci} 535162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); 535262306a36Sopenharmony_ci 535362306a36Sopenharmony_cistatic void init_kvm_softmmu(struct kvm_vcpu *vcpu, 535462306a36Sopenharmony_ci union kvm_cpu_role cpu_role) 535562306a36Sopenharmony_ci{ 535662306a36Sopenharmony_ci struct kvm_mmu *context = &vcpu->arch.root_mmu; 535762306a36Sopenharmony_ci 535862306a36Sopenharmony_ci kvm_init_shadow_mmu(vcpu, cpu_role); 535962306a36Sopenharmony_ci 536062306a36Sopenharmony_ci context->get_guest_pgd = get_guest_cr3; 536162306a36Sopenharmony_ci context->get_pdptr = kvm_pdptr_read; 536262306a36Sopenharmony_ci context->inject_page_fault = kvm_inject_page_fault; 536362306a36Sopenharmony_ci} 536462306a36Sopenharmony_ci 536562306a36Sopenharmony_cistatic void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, 536662306a36Sopenharmony_ci union kvm_cpu_role new_mode) 536762306a36Sopenharmony_ci{ 536862306a36Sopenharmony_ci struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; 536962306a36Sopenharmony_ci 537062306a36Sopenharmony_ci if (new_mode.as_u64 == g_context->cpu_role.as_u64) 537162306a36Sopenharmony_ci return; 537262306a36Sopenharmony_ci 537362306a36Sopenharmony_ci 
g_context->cpu_role.as_u64 = new_mode.as_u64;
537462306a36Sopenharmony_ci g_context->get_guest_pgd = get_guest_cr3;
537562306a36Sopenharmony_ci g_context->get_pdptr = kvm_pdptr_read;
537662306a36Sopenharmony_ci g_context->inject_page_fault = kvm_inject_page_fault;
537762306a36Sopenharmony_ci
537862306a36Sopenharmony_ci /*
537962306a36Sopenharmony_ci * L2 page tables are never shadowed, so there is no need to sync
538062306a36Sopenharmony_ci * SPTEs.
538162306a36Sopenharmony_ci */
538262306a36Sopenharmony_ci g_context->sync_spte = NULL;
538362306a36Sopenharmony_ci
538462306a36Sopenharmony_ci /*
538562306a36Sopenharmony_ci * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
538662306a36Sopenharmony_ci * L1's nested page tables (e.g. EPT12). The nested translation
538762306a36Sopenharmony_ci * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
538862306a36Sopenharmony_ci * L2's page tables as the first level of translation and L1's
538962306a36Sopenharmony_ci * nested page tables as the second level of translation. Basically
539062306a36Sopenharmony_ci * the gva_to_gpa functions between mmu and nested_mmu are swapped.
539162306a36Sopenharmony_ci */
539262306a36Sopenharmony_ci if (!is_paging(vcpu))
539362306a36Sopenharmony_ci g_context->gva_to_gpa = nonpaging_gva_to_gpa;
539462306a36Sopenharmony_ci else if (is_long_mode(vcpu))
539562306a36Sopenharmony_ci g_context->gva_to_gpa = paging64_gva_to_gpa;
539662306a36Sopenharmony_ci else if (is_pae(vcpu))
539762306a36Sopenharmony_ci g_context->gva_to_gpa = paging64_gva_to_gpa;
539862306a36Sopenharmony_ci else
539962306a36Sopenharmony_ci g_context->gva_to_gpa = paging32_gva_to_gpa;
540062306a36Sopenharmony_ci
540162306a36Sopenharmony_ci reset_guest_paging_metadata(vcpu, g_context);
540262306a36Sopenharmony_ci}
540362306a36Sopenharmony_ci
540462306a36Sopenharmony_civoid kvm_init_mmu(struct kvm_vcpu *vcpu)
540562306a36Sopenharmony_ci{
540662306a36Sopenharmony_ci struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
540762306a36Sopenharmony_ci union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
540862306a36Sopenharmony_ci
540962306a36Sopenharmony_ci if (mmu_is_nested(vcpu))
541062306a36Sopenharmony_ci init_kvm_nested_mmu(vcpu, cpu_role);
541162306a36Sopenharmony_ci else if (tdp_enabled)
541262306a36Sopenharmony_ci init_kvm_tdp_mmu(vcpu, cpu_role);
541362306a36Sopenharmony_ci else
541462306a36Sopenharmony_ci init_kvm_softmmu(vcpu, cpu_role);
541562306a36Sopenharmony_ci}
541662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_init_mmu);
541762306a36Sopenharmony_ci
541862306a36Sopenharmony_civoid kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
541962306a36Sopenharmony_ci{
542062306a36Sopenharmony_ci /*
542162306a36Sopenharmony_ci * Invalidate all MMU roles to force them to reinitialize as CPUID
542262306a36Sopenharmony_ci * information is factored into reserved bit calculations.
542362306a36Sopenharmony_ci *
542462306a36Sopenharmony_ci * Correctly handling multiple vCPU models (with respect to paging and
542562306a36Sopenharmony_ci * physical address properties) in a single VM would require tracking
542662306a36Sopenharmony_ci * all relevant CPUID information in kvm_mmu_page_role. That is very
542762306a36Sopenharmony_ci * undesirable as it would increase the memory requirements for
542862306a36Sopenharmony_ci * gfn_write_track (see struct kvm_mmu_page_role comments).
For now 542962306a36Sopenharmony_ci * that problem is swept under the rug; KVM's CPUID API is horrific and 543062306a36Sopenharmony_ci * it's all but impossible to solve it without introducing a new API. 543162306a36Sopenharmony_ci */ 543262306a36Sopenharmony_ci vcpu->arch.root_mmu.root_role.word = 0; 543362306a36Sopenharmony_ci vcpu->arch.guest_mmu.root_role.word = 0; 543462306a36Sopenharmony_ci vcpu->arch.nested_mmu.root_role.word = 0; 543562306a36Sopenharmony_ci vcpu->arch.root_mmu.cpu_role.ext.valid = 0; 543662306a36Sopenharmony_ci vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; 543762306a36Sopenharmony_ci vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; 543862306a36Sopenharmony_ci kvm_mmu_reset_context(vcpu); 543962306a36Sopenharmony_ci 544062306a36Sopenharmony_ci /* 544162306a36Sopenharmony_ci * Changing guest CPUID after KVM_RUN is forbidden, see the comment in 544262306a36Sopenharmony_ci * kvm_arch_vcpu_ioctl(). 544362306a36Sopenharmony_ci */ 544462306a36Sopenharmony_ci KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); 544562306a36Sopenharmony_ci} 544662306a36Sopenharmony_ci 544762306a36Sopenharmony_civoid kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 544862306a36Sopenharmony_ci{ 544962306a36Sopenharmony_ci kvm_mmu_unload(vcpu); 545062306a36Sopenharmony_ci kvm_init_mmu(vcpu); 545162306a36Sopenharmony_ci} 545262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 545362306a36Sopenharmony_ci 545462306a36Sopenharmony_ciint kvm_mmu_load(struct kvm_vcpu *vcpu) 545562306a36Sopenharmony_ci{ 545662306a36Sopenharmony_ci int r; 545762306a36Sopenharmony_ci 545862306a36Sopenharmony_ci r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct); 545962306a36Sopenharmony_ci if (r) 546062306a36Sopenharmony_ci goto out; 546162306a36Sopenharmony_ci r = mmu_alloc_special_roots(vcpu); 546262306a36Sopenharmony_ci if (r) 546362306a36Sopenharmony_ci goto out; 546462306a36Sopenharmony_ci if (vcpu->arch.mmu->root_role.direct) 546562306a36Sopenharmony_ci r = mmu_alloc_direct_roots(vcpu); 546662306a36Sopenharmony_ci else 546762306a36Sopenharmony_ci r = mmu_alloc_shadow_roots(vcpu); 546862306a36Sopenharmony_ci if (r) 546962306a36Sopenharmony_ci goto out; 547062306a36Sopenharmony_ci 547162306a36Sopenharmony_ci kvm_mmu_sync_roots(vcpu); 547262306a36Sopenharmony_ci 547362306a36Sopenharmony_ci kvm_mmu_load_pgd(vcpu); 547462306a36Sopenharmony_ci 547562306a36Sopenharmony_ci /* 547662306a36Sopenharmony_ci * Flush any TLB entries for the new root, the provenance of the root 547762306a36Sopenharmony_ci * is unknown. Even if KVM ensures there are no stale TLB entries 547862306a36Sopenharmony_ci * for a freed root, in theory another hypervisor could have left 547962306a36Sopenharmony_ci * stale entries. Flushing on alloc also allows KVM to skip the TLB 548062306a36Sopenharmony_ci * flush when freeing a root (see kvm_tdp_mmu_put_root()). 
548162306a36Sopenharmony_ci */ 548262306a36Sopenharmony_ci static_call(kvm_x86_flush_tlb_current)(vcpu); 548362306a36Sopenharmony_ciout: 548462306a36Sopenharmony_ci return r; 548562306a36Sopenharmony_ci} 548662306a36Sopenharmony_ci 548762306a36Sopenharmony_civoid kvm_mmu_unload(struct kvm_vcpu *vcpu) 548862306a36Sopenharmony_ci{ 548962306a36Sopenharmony_ci struct kvm *kvm = vcpu->kvm; 549062306a36Sopenharmony_ci 549162306a36Sopenharmony_ci kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); 549262306a36Sopenharmony_ci WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); 549362306a36Sopenharmony_ci kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 549462306a36Sopenharmony_ci WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); 549562306a36Sopenharmony_ci vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 549662306a36Sopenharmony_ci} 549762306a36Sopenharmony_ci 549862306a36Sopenharmony_cistatic bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa) 549962306a36Sopenharmony_ci{ 550062306a36Sopenharmony_ci struct kvm_mmu_page *sp; 550162306a36Sopenharmony_ci 550262306a36Sopenharmony_ci if (!VALID_PAGE(root_hpa)) 550362306a36Sopenharmony_ci return false; 550462306a36Sopenharmony_ci 550562306a36Sopenharmony_ci /* 550662306a36Sopenharmony_ci * When freeing obsolete roots, treat roots as obsolete if they don't 550762306a36Sopenharmony_ci * have an associated shadow page, as it's impossible to determine if 550862306a36Sopenharmony_ci * such roots are fresh or stale. This does mean KVM will get false 550962306a36Sopenharmony_ci * positives and free roots that don't strictly need to be freed, but 551062306a36Sopenharmony_ci * such false positives are relatively rare: 551162306a36Sopenharmony_ci * 551262306a36Sopenharmony_ci * (a) only PAE paging and nested NPT have roots without shadow pages 551362306a36Sopenharmony_ci * (or any shadow paging flavor with a dummy root, see note below) 551462306a36Sopenharmony_ci * (b) remote reloads due to a memslot update obsoletes _all_ roots 551562306a36Sopenharmony_ci * (c) KVM doesn't track previous roots for PAE paging, and the guest 551662306a36Sopenharmony_ci * is unlikely to zap an in-use PGD. 551762306a36Sopenharmony_ci * 551862306a36Sopenharmony_ci * Note! Dummy roots are unique in that they are obsoleted by memslot 551962306a36Sopenharmony_ci * _creation_! See also FNAME(fetch). 
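 * (Presumably this is because a dummy root is installed precisely when
 * the root gfn has no backing memslot, so creating a memslot that covers
 * that gfn is what invalidates the root, while deleting one cannot.)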
552062306a36Sopenharmony_ci */
552162306a36Sopenharmony_ci sp = root_to_sp(root_hpa);
552262306a36Sopenharmony_ci return !sp || is_obsolete_sp(kvm, sp);
552362306a36Sopenharmony_ci}
552462306a36Sopenharmony_ci
552562306a36Sopenharmony_cistatic void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
552662306a36Sopenharmony_ci{
552762306a36Sopenharmony_ci unsigned long roots_to_free = 0;
552862306a36Sopenharmony_ci int i;
552962306a36Sopenharmony_ci
553062306a36Sopenharmony_ci if (is_obsolete_root(kvm, mmu->root.hpa))
553162306a36Sopenharmony_ci roots_to_free |= KVM_MMU_ROOT_CURRENT;
553262306a36Sopenharmony_ci
553362306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
553462306a36Sopenharmony_ci if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
553562306a36Sopenharmony_ci roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
553662306a36Sopenharmony_ci }
553762306a36Sopenharmony_ci
553862306a36Sopenharmony_ci if (roots_to_free)
553962306a36Sopenharmony_ci kvm_mmu_free_roots(kvm, mmu, roots_to_free);
554062306a36Sopenharmony_ci}
554162306a36Sopenharmony_ci
554262306a36Sopenharmony_civoid kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
554362306a36Sopenharmony_ci{
554462306a36Sopenharmony_ci __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
554562306a36Sopenharmony_ci __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
554662306a36Sopenharmony_ci}
554762306a36Sopenharmony_ci
554862306a36Sopenharmony_cistatic u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
554962306a36Sopenharmony_ci int *bytes)
555062306a36Sopenharmony_ci{
555162306a36Sopenharmony_ci u64 gentry = 0;
555262306a36Sopenharmony_ci int r;
555362306a36Sopenharmony_ci
555462306a36Sopenharmony_ci /*
555562306a36Sopenharmony_ci * Assume that the pte write is to a page table of the same type
555662306a36Sopenharmony_ci * as the current vcpu paging mode, since we update the sptes only
555762306a36Sopenharmony_ci * when they have the same mode.
555862306a36Sopenharmony_ci */
555962306a36Sopenharmony_ci if (is_pae(vcpu) && *bytes == 4) {
556062306a36Sopenharmony_ci /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
556162306a36Sopenharmony_ci *gpa &= ~(gpa_t)7;
556262306a36Sopenharmony_ci *bytes = 8;
556362306a36Sopenharmony_ci }
556462306a36Sopenharmony_ci
556562306a36Sopenharmony_ci if (*bytes == 4 || *bytes == 8) {
556662306a36Sopenharmony_ci r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
556762306a36Sopenharmony_ci if (r)
556862306a36Sopenharmony_ci gentry = 0;
556962306a36Sopenharmony_ci }
557062306a36Sopenharmony_ci
557162306a36Sopenharmony_ci return gentry;
557262306a36Sopenharmony_ci}
557362306a36Sopenharmony_ci
557462306a36Sopenharmony_ci/*
557562306a36Sopenharmony_ci * If we're seeing too many writes to a page, it may no longer be a page table,
557662306a36Sopenharmony_ci * or we may be forking, in which case it is better to unmap the page.
557762306a36Sopenharmony_ci */
557862306a36Sopenharmony_cistatic bool detect_write_flooding(struct kvm_mmu_page *sp)
557962306a36Sopenharmony_ci{
558062306a36Sopenharmony_ci /*
558162306a36Sopenharmony_ci * Skip write-flooding detection for the sp whose level is 1, because
558262306a36Sopenharmony_ci * it can become unsync, in which case the guest page is not write-protected.
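 * (The flooding count is presumably reset elsewhere, outside this excerpt,
 * when the sp is once again used to service a page fault, so only a
 * sustained burst of guest PTE writes leads the caller to zap the sp.)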
558362306a36Sopenharmony_ci */
558462306a36Sopenharmony_ci if (sp->role.level == PG_LEVEL_4K)
558562306a36Sopenharmony_ci return false;
558662306a36Sopenharmony_ci
558762306a36Sopenharmony_ci atomic_inc(&sp->write_flooding_count);
558862306a36Sopenharmony_ci return atomic_read(&sp->write_flooding_count) >= 3;
558962306a36Sopenharmony_ci}
559062306a36Sopenharmony_ci
559162306a36Sopenharmony_ci/*
559262306a36Sopenharmony_ci * Misaligned accesses are too much trouble to fix up; also, they usually
559362306a36Sopenharmony_ci * indicate a page is not used as a page table.
559462306a36Sopenharmony_ci */
559562306a36Sopenharmony_cistatic bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
559662306a36Sopenharmony_ci int bytes)
559762306a36Sopenharmony_ci{
559862306a36Sopenharmony_ci unsigned offset, pte_size, misaligned;
559962306a36Sopenharmony_ci
560062306a36Sopenharmony_ci offset = offset_in_page(gpa);
560162306a36Sopenharmony_ci pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
560262306a36Sopenharmony_ci
560362306a36Sopenharmony_ci /*
560462306a36Sopenharmony_ci * Sometimes the OS only writes the last byte to update status
560562306a36Sopenharmony_ci * bits; for example, Linux uses the andb instruction in clear_bit().
560662306a36Sopenharmony_ci */
560762306a36Sopenharmony_ci if (!(offset & (pte_size - 1)) && bytes == 1)
560862306a36Sopenharmony_ci return false;
560962306a36Sopenharmony_ci
561062306a36Sopenharmony_ci misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
561162306a36Sopenharmony_ci misaligned |= bytes < 4;
561262306a36Sopenharmony_ci
561362306a36Sopenharmony_ci return misaligned;
561462306a36Sopenharmony_ci}
561562306a36Sopenharmony_ci
561662306a36Sopenharmony_cistatic u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
561762306a36Sopenharmony_ci{
561862306a36Sopenharmony_ci unsigned page_offset, quadrant;
561962306a36Sopenharmony_ci u64 *spte;
562062306a36Sopenharmony_ci int level;
562162306a36Sopenharmony_ci
562262306a36Sopenharmony_ci page_offset = offset_in_page(gpa);
562362306a36Sopenharmony_ci level = sp->role.level;
562462306a36Sopenharmony_ci *nspte = 1;
562562306a36Sopenharmony_ci if (sp->role.has_4_byte_gpte) {
562662306a36Sopenharmony_ci page_offset <<= 1; /* 32->64 */
562762306a36Sopenharmony_ci /*
562862306a36Sopenharmony_ci * A 32-bit pde maps 4MB while the shadow pdes map
562962306a36Sopenharmony_ci * only 2MB. So we need to double the offset again
563062306a36Sopenharmony_ci * and zap two pdes instead of one.
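 *
 * For example (illustrative numbers): a write to the 4-byte gpte
 * at page offset 0xffc is doubled to 0x1ff8 and, for a PT32 root,
 * doubled again to 0x3ff0, i.e. quadrant 3 and the two 8-byte
 * sptes at indices 510 and 511 of the shadow page.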
563162306a36Sopenharmony_ci */ 563262306a36Sopenharmony_ci if (level == PT32_ROOT_LEVEL) { 563362306a36Sopenharmony_ci page_offset &= ~7; /* kill rounding error */ 563462306a36Sopenharmony_ci page_offset <<= 1; 563562306a36Sopenharmony_ci *nspte = 2; 563662306a36Sopenharmony_ci } 563762306a36Sopenharmony_ci quadrant = page_offset >> PAGE_SHIFT; 563862306a36Sopenharmony_ci page_offset &= ~PAGE_MASK; 563962306a36Sopenharmony_ci if (quadrant != sp->role.quadrant) 564062306a36Sopenharmony_ci return NULL; 564162306a36Sopenharmony_ci } 564262306a36Sopenharmony_ci 564362306a36Sopenharmony_ci spte = &sp->spt[page_offset / sizeof(*spte)]; 564462306a36Sopenharmony_ci return spte; 564562306a36Sopenharmony_ci} 564662306a36Sopenharmony_ci 564762306a36Sopenharmony_civoid kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, 564862306a36Sopenharmony_ci int bytes) 564962306a36Sopenharmony_ci{ 565062306a36Sopenharmony_ci gfn_t gfn = gpa >> PAGE_SHIFT; 565162306a36Sopenharmony_ci struct kvm_mmu_page *sp; 565262306a36Sopenharmony_ci LIST_HEAD(invalid_list); 565362306a36Sopenharmony_ci u64 entry, gentry, *spte; 565462306a36Sopenharmony_ci int npte; 565562306a36Sopenharmony_ci bool flush = false; 565662306a36Sopenharmony_ci 565762306a36Sopenharmony_ci /* 565862306a36Sopenharmony_ci * If we don't have indirect shadow pages, it means no page is 565962306a36Sopenharmony_ci * write-protected, so we can exit simply. 566062306a36Sopenharmony_ci */ 566162306a36Sopenharmony_ci if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) 566262306a36Sopenharmony_ci return; 566362306a36Sopenharmony_ci 566462306a36Sopenharmony_ci write_lock(&vcpu->kvm->mmu_lock); 566562306a36Sopenharmony_ci 566662306a36Sopenharmony_ci gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); 566762306a36Sopenharmony_ci 566862306a36Sopenharmony_ci ++vcpu->kvm->stat.mmu_pte_write; 566962306a36Sopenharmony_ci 567062306a36Sopenharmony_ci for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) { 567162306a36Sopenharmony_ci if (detect_write_misaligned(sp, gpa, bytes) || 567262306a36Sopenharmony_ci detect_write_flooding(sp)) { 567362306a36Sopenharmony_ci kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 567462306a36Sopenharmony_ci ++vcpu->kvm->stat.mmu_flooded; 567562306a36Sopenharmony_ci continue; 567662306a36Sopenharmony_ci } 567762306a36Sopenharmony_ci 567862306a36Sopenharmony_ci spte = get_written_sptes(sp, gpa, &npte); 567962306a36Sopenharmony_ci if (!spte) 568062306a36Sopenharmony_ci continue; 568162306a36Sopenharmony_ci 568262306a36Sopenharmony_ci while (npte--) { 568362306a36Sopenharmony_ci entry = *spte; 568462306a36Sopenharmony_ci mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); 568562306a36Sopenharmony_ci if (gentry && sp->role.level != PG_LEVEL_4K) 568662306a36Sopenharmony_ci ++vcpu->kvm->stat.mmu_pde_zapped; 568762306a36Sopenharmony_ci if (is_shadow_present_pte(entry)) 568862306a36Sopenharmony_ci flush = true; 568962306a36Sopenharmony_ci ++spte; 569062306a36Sopenharmony_ci } 569162306a36Sopenharmony_ci } 569262306a36Sopenharmony_ci kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 569362306a36Sopenharmony_ci write_unlock(&vcpu->kvm->mmu_lock); 569462306a36Sopenharmony_ci} 569562306a36Sopenharmony_ci 569662306a36Sopenharmony_ciint noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, 569762306a36Sopenharmony_ci void *insn, int insn_len) 569862306a36Sopenharmony_ci{ 569962306a36Sopenharmony_ci int r, emulation_type = EMULTYPE_PF; 570062306a36Sopenharmony_ci bool direct = 
vcpu->arch.mmu->root_role.direct; 570162306a36Sopenharmony_ci 570262306a36Sopenharmony_ci /* 570362306a36Sopenharmony_ci * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP 570462306a36Sopenharmony_ci * checks when emulating instructions that triggers implicit access. 570562306a36Sopenharmony_ci * WARN if hardware generates a fault with an error code that collides 570662306a36Sopenharmony_ci * with the KVM-defined value. Clear the flag and continue on, i.e. 570762306a36Sopenharmony_ci * don't terminate the VM, as KVM can't possibly be relying on a flag 570862306a36Sopenharmony_ci * that KVM doesn't know about. 570962306a36Sopenharmony_ci */ 571062306a36Sopenharmony_ci if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) 571162306a36Sopenharmony_ci error_code &= ~PFERR_IMPLICIT_ACCESS; 571262306a36Sopenharmony_ci 571362306a36Sopenharmony_ci if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) 571462306a36Sopenharmony_ci return RET_PF_RETRY; 571562306a36Sopenharmony_ci 571662306a36Sopenharmony_ci r = RET_PF_INVALID; 571762306a36Sopenharmony_ci if (unlikely(error_code & PFERR_RSVD_MASK)) { 571862306a36Sopenharmony_ci r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); 571962306a36Sopenharmony_ci if (r == RET_PF_EMULATE) 572062306a36Sopenharmony_ci goto emulate; 572162306a36Sopenharmony_ci } 572262306a36Sopenharmony_ci 572362306a36Sopenharmony_ci if (r == RET_PF_INVALID) { 572462306a36Sopenharmony_ci r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, 572562306a36Sopenharmony_ci lower_32_bits(error_code), false, 572662306a36Sopenharmony_ci &emulation_type); 572762306a36Sopenharmony_ci if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) 572862306a36Sopenharmony_ci return -EIO; 572962306a36Sopenharmony_ci } 573062306a36Sopenharmony_ci 573162306a36Sopenharmony_ci if (r < 0) 573262306a36Sopenharmony_ci return r; 573362306a36Sopenharmony_ci if (r != RET_PF_EMULATE) 573462306a36Sopenharmony_ci return 1; 573562306a36Sopenharmony_ci 573662306a36Sopenharmony_ci /* 573762306a36Sopenharmony_ci * Before emulating the instruction, check if the error code 573862306a36Sopenharmony_ci * was due to a RO violation while translating the guest page. 573962306a36Sopenharmony_ci * This can occur when using nested virtualization with nested 574062306a36Sopenharmony_ci * paging in both guests. If true, we simply unprotect the page 574162306a36Sopenharmony_ci * and resume the guest. 574262306a36Sopenharmony_ci */ 574362306a36Sopenharmony_ci if (vcpu->arch.mmu->root_role.direct && 574462306a36Sopenharmony_ci (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { 574562306a36Sopenharmony_ci kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); 574662306a36Sopenharmony_ci return 1; 574762306a36Sopenharmony_ci } 574862306a36Sopenharmony_ci 574962306a36Sopenharmony_ci /* 575062306a36Sopenharmony_ci * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still 575162306a36Sopenharmony_ci * optimistically try to just unprotect the page and let the processor 575262306a36Sopenharmony_ci * re-execute the instruction that caused the page fault. Do not allow 575362306a36Sopenharmony_ci * retrying MMIO emulation, as it's not only pointless but could also 575462306a36Sopenharmony_ci * cause us to enter an infinite loop because the processor will keep 575562306a36Sopenharmony_ci * faulting on the non-existent MMIO address. 
Retrying an instruction 575662306a36Sopenharmony_ci * from a nested guest is also pointless and dangerous as we are only 575762306a36Sopenharmony_ci * explicitly shadowing L1's page tables, i.e. unprotecting something 575862306a36Sopenharmony_ci * for L1 isn't going to magically fix whatever issue cause L2 to fail. 575962306a36Sopenharmony_ci */ 576062306a36Sopenharmony_ci if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) 576162306a36Sopenharmony_ci emulation_type |= EMULTYPE_ALLOW_RETRY_PF; 576262306a36Sopenharmony_ciemulate: 576362306a36Sopenharmony_ci return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, 576462306a36Sopenharmony_ci insn_len); 576562306a36Sopenharmony_ci} 576662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 576762306a36Sopenharmony_ci 576862306a36Sopenharmony_cistatic void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 576962306a36Sopenharmony_ci u64 addr, hpa_t root_hpa) 577062306a36Sopenharmony_ci{ 577162306a36Sopenharmony_ci struct kvm_shadow_walk_iterator iterator; 577262306a36Sopenharmony_ci 577362306a36Sopenharmony_ci vcpu_clear_mmio_info(vcpu, addr); 577462306a36Sopenharmony_ci 577562306a36Sopenharmony_ci /* 577662306a36Sopenharmony_ci * Walking and synchronizing SPTEs both assume they are operating in 577762306a36Sopenharmony_ci * the context of the current MMU, and would need to be reworked if 577862306a36Sopenharmony_ci * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. 577962306a36Sopenharmony_ci */ 578062306a36Sopenharmony_ci if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) 578162306a36Sopenharmony_ci return; 578262306a36Sopenharmony_ci 578362306a36Sopenharmony_ci if (!VALID_PAGE(root_hpa)) 578462306a36Sopenharmony_ci return; 578562306a36Sopenharmony_ci 578662306a36Sopenharmony_ci write_lock(&vcpu->kvm->mmu_lock); 578762306a36Sopenharmony_ci for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) { 578862306a36Sopenharmony_ci struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep); 578962306a36Sopenharmony_ci 579062306a36Sopenharmony_ci if (sp->unsync) { 579162306a36Sopenharmony_ci int ret = kvm_sync_spte(vcpu, sp, iterator.index); 579262306a36Sopenharmony_ci 579362306a36Sopenharmony_ci if (ret < 0) 579462306a36Sopenharmony_ci mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL); 579562306a36Sopenharmony_ci if (ret) 579662306a36Sopenharmony_ci kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep); 579762306a36Sopenharmony_ci } 579862306a36Sopenharmony_ci 579962306a36Sopenharmony_ci if (!sp->unsync_children) 580062306a36Sopenharmony_ci break; 580162306a36Sopenharmony_ci } 580262306a36Sopenharmony_ci write_unlock(&vcpu->kvm->mmu_lock); 580362306a36Sopenharmony_ci} 580462306a36Sopenharmony_ci 580562306a36Sopenharmony_civoid kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 580662306a36Sopenharmony_ci u64 addr, unsigned long roots) 580762306a36Sopenharmony_ci{ 580862306a36Sopenharmony_ci int i; 580962306a36Sopenharmony_ci 581062306a36Sopenharmony_ci WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL); 581162306a36Sopenharmony_ci 581262306a36Sopenharmony_ci /* It's actually a GPA for vcpu->arch.guest_mmu. */ 581362306a36Sopenharmony_ci if (mmu != &vcpu->arch.guest_mmu) { 581462306a36Sopenharmony_ci /* INVLPG on a non-canonical address is a NOP according to the SDM. 
*/ 581562306a36Sopenharmony_ci if (is_noncanonical_address(addr, vcpu)) 581662306a36Sopenharmony_ci return; 581762306a36Sopenharmony_ci 581862306a36Sopenharmony_ci static_call(kvm_x86_flush_tlb_gva)(vcpu, addr); 581962306a36Sopenharmony_ci } 582062306a36Sopenharmony_ci 582162306a36Sopenharmony_ci if (!mmu->sync_spte) 582262306a36Sopenharmony_ci return; 582362306a36Sopenharmony_ci 582462306a36Sopenharmony_ci if (roots & KVM_MMU_ROOT_CURRENT) 582562306a36Sopenharmony_ci __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa); 582662306a36Sopenharmony_ci 582762306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 582862306a36Sopenharmony_ci if (roots & KVM_MMU_ROOT_PREVIOUS(i)) 582962306a36Sopenharmony_ci __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); 583062306a36Sopenharmony_ci } 583162306a36Sopenharmony_ci} 583262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); 583362306a36Sopenharmony_ci 583462306a36Sopenharmony_civoid kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 583562306a36Sopenharmony_ci{ 583662306a36Sopenharmony_ci /* 583762306a36Sopenharmony_ci * INVLPG is required to invalidate any global mappings for the VA, 583862306a36Sopenharmony_ci * irrespective of PCID. Blindly sync all roots as it would take 583962306a36Sopenharmony_ci * roughly the same amount of work/time to determine whether any of the 584062306a36Sopenharmony_ci * previous roots have a global mapping. 584162306a36Sopenharmony_ci * 584262306a36Sopenharmony_ci * Mappings not reachable via the current or previous cached roots will 584362306a36Sopenharmony_ci * be synced when switching to that new cr3, so nothing needs to be 584462306a36Sopenharmony_ci * done here for them. 584562306a36Sopenharmony_ci */ 584662306a36Sopenharmony_ci kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); 584762306a36Sopenharmony_ci ++vcpu->stat.invlpg; 584862306a36Sopenharmony_ci} 584962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_mmu_invlpg); 585062306a36Sopenharmony_ci 585162306a36Sopenharmony_ci 585262306a36Sopenharmony_civoid kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) 585362306a36Sopenharmony_ci{ 585462306a36Sopenharmony_ci struct kvm_mmu *mmu = vcpu->arch.mmu; 585562306a36Sopenharmony_ci unsigned long roots = 0; 585662306a36Sopenharmony_ci uint i; 585762306a36Sopenharmony_ci 585862306a36Sopenharmony_ci if (pcid == kvm_get_active_pcid(vcpu)) 585962306a36Sopenharmony_ci roots |= KVM_MMU_ROOT_CURRENT; 586062306a36Sopenharmony_ci 586162306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 586262306a36Sopenharmony_ci if (VALID_PAGE(mmu->prev_roots[i].hpa) && 586362306a36Sopenharmony_ci pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) 586462306a36Sopenharmony_ci roots |= KVM_MMU_ROOT_PREVIOUS(i); 586562306a36Sopenharmony_ci } 586662306a36Sopenharmony_ci 586762306a36Sopenharmony_ci if (roots) 586862306a36Sopenharmony_ci kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots); 586962306a36Sopenharmony_ci ++vcpu->stat.invlpg; 587062306a36Sopenharmony_ci 587162306a36Sopenharmony_ci /* 587262306a36Sopenharmony_ci * Mappings not reachable via the current cr3 or the prev_roots will be 587362306a36Sopenharmony_ci * synced when switching to that cr3, so nothing needs to be done here 587462306a36Sopenharmony_ci * for them. 
587562306a36Sopenharmony_ci */ 587662306a36Sopenharmony_ci} 587762306a36Sopenharmony_ci 587862306a36Sopenharmony_civoid kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, 587962306a36Sopenharmony_ci int tdp_max_root_level, int tdp_huge_page_level) 588062306a36Sopenharmony_ci{ 588162306a36Sopenharmony_ci tdp_enabled = enable_tdp; 588262306a36Sopenharmony_ci tdp_root_level = tdp_forced_root_level; 588362306a36Sopenharmony_ci max_tdp_level = tdp_max_root_level; 588462306a36Sopenharmony_ci 588562306a36Sopenharmony_ci#ifdef CONFIG_X86_64 588662306a36Sopenharmony_ci tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; 588762306a36Sopenharmony_ci#endif 588862306a36Sopenharmony_ci /* 588962306a36Sopenharmony_ci * max_huge_page_level reflects KVM's MMU capabilities irrespective 589062306a36Sopenharmony_ci * of kernel support, e.g. KVM may be capable of using 1GB pages when 589162306a36Sopenharmony_ci * the kernel is not. But, KVM never creates a page size greater than 589262306a36Sopenharmony_ci * what is used by the kernel for any given HVA, i.e. the kernel's 589362306a36Sopenharmony_ci * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). 589462306a36Sopenharmony_ci */ 589562306a36Sopenharmony_ci if (tdp_enabled) 589662306a36Sopenharmony_ci max_huge_page_level = tdp_huge_page_level; 589762306a36Sopenharmony_ci else if (boot_cpu_has(X86_FEATURE_GBPAGES)) 589862306a36Sopenharmony_ci max_huge_page_level = PG_LEVEL_1G; 589962306a36Sopenharmony_ci else 590062306a36Sopenharmony_ci max_huge_page_level = PG_LEVEL_2M; 590162306a36Sopenharmony_ci} 590262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(kvm_configure_mmu); 590362306a36Sopenharmony_ci 590462306a36Sopenharmony_ci/* The return value indicates if tlb flush on all vcpus is needed. */ 590562306a36Sopenharmony_citypedef bool (*slot_rmaps_handler) (struct kvm *kvm, 590662306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head, 590762306a36Sopenharmony_ci const struct kvm_memory_slot *slot); 590862306a36Sopenharmony_ci 590962306a36Sopenharmony_cistatic __always_inline bool __walk_slot_rmaps(struct kvm *kvm, 591062306a36Sopenharmony_ci const struct kvm_memory_slot *slot, 591162306a36Sopenharmony_ci slot_rmaps_handler fn, 591262306a36Sopenharmony_ci int start_level, int end_level, 591362306a36Sopenharmony_ci gfn_t start_gfn, gfn_t end_gfn, 591462306a36Sopenharmony_ci bool flush_on_yield, bool flush) 591562306a36Sopenharmony_ci{ 591662306a36Sopenharmony_ci struct slot_rmap_walk_iterator iterator; 591762306a36Sopenharmony_ci 591862306a36Sopenharmony_ci lockdep_assert_held_write(&kvm->mmu_lock); 591962306a36Sopenharmony_ci 592062306a36Sopenharmony_ci for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, 592162306a36Sopenharmony_ci end_gfn, &iterator) { 592262306a36Sopenharmony_ci if (iterator.rmap) 592362306a36Sopenharmony_ci flush |= fn(kvm, iterator.rmap, slot); 592462306a36Sopenharmony_ci 592562306a36Sopenharmony_ci if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 592662306a36Sopenharmony_ci if (flush && flush_on_yield) { 592762306a36Sopenharmony_ci kvm_flush_remote_tlbs_range(kvm, start_gfn, 592862306a36Sopenharmony_ci iterator.gfn - start_gfn + 1); 592962306a36Sopenharmony_ci flush = false; 593062306a36Sopenharmony_ci } 593162306a36Sopenharmony_ci cond_resched_rwlock_write(&kvm->mmu_lock); 593262306a36Sopenharmony_ci } 593362306a36Sopenharmony_ci } 593462306a36Sopenharmony_ci 593562306a36Sopenharmony_ci return flush; 593662306a36Sopenharmony_ci} 593762306a36Sopenharmony_ci 593862306a36Sopenharmony_cistatic 
__always_inline bool walk_slot_rmaps(struct kvm *kvm, 593962306a36Sopenharmony_ci const struct kvm_memory_slot *slot, 594062306a36Sopenharmony_ci slot_rmaps_handler fn, 594162306a36Sopenharmony_ci int start_level, int end_level, 594262306a36Sopenharmony_ci bool flush_on_yield) 594362306a36Sopenharmony_ci{ 594462306a36Sopenharmony_ci return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, 594562306a36Sopenharmony_ci slot->base_gfn, slot->base_gfn + slot->npages - 1, 594662306a36Sopenharmony_ci flush_on_yield, false); 594762306a36Sopenharmony_ci} 594862306a36Sopenharmony_ci 594962306a36Sopenharmony_cistatic __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, 595062306a36Sopenharmony_ci const struct kvm_memory_slot *slot, 595162306a36Sopenharmony_ci slot_rmaps_handler fn, 595262306a36Sopenharmony_ci bool flush_on_yield) 595362306a36Sopenharmony_ci{ 595462306a36Sopenharmony_ci return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); 595562306a36Sopenharmony_ci} 595662306a36Sopenharmony_ci 595762306a36Sopenharmony_cistatic void free_mmu_pages(struct kvm_mmu *mmu) 595862306a36Sopenharmony_ci{ 595962306a36Sopenharmony_ci if (!tdp_enabled && mmu->pae_root) 596062306a36Sopenharmony_ci set_memory_encrypted((unsigned long)mmu->pae_root, 1); 596162306a36Sopenharmony_ci free_page((unsigned long)mmu->pae_root); 596262306a36Sopenharmony_ci free_page((unsigned long)mmu->pml4_root); 596362306a36Sopenharmony_ci free_page((unsigned long)mmu->pml5_root); 596462306a36Sopenharmony_ci} 596562306a36Sopenharmony_ci 596662306a36Sopenharmony_cistatic int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 596762306a36Sopenharmony_ci{ 596862306a36Sopenharmony_ci struct page *page; 596962306a36Sopenharmony_ci int i; 597062306a36Sopenharmony_ci 597162306a36Sopenharmony_ci mmu->root.hpa = INVALID_PAGE; 597262306a36Sopenharmony_ci mmu->root.pgd = 0; 597362306a36Sopenharmony_ci for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 597462306a36Sopenharmony_ci mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 597562306a36Sopenharmony_ci 597662306a36Sopenharmony_ci /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ 597762306a36Sopenharmony_ci if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) 597862306a36Sopenharmony_ci return 0; 597962306a36Sopenharmony_ci 598062306a36Sopenharmony_ci /* 598162306a36Sopenharmony_ci * When using PAE paging, the four PDPTEs are treated as 'root' pages, 598262306a36Sopenharmony_ci * while the PDP table is a per-vCPU construct that's allocated at MMU 598362306a36Sopenharmony_ci * creation. When emulating 32-bit mode, cr3 is only 32 bits even on 598462306a36Sopenharmony_ci * x86_64. Therefore we need to allocate the PDP table in the first 598562306a36Sopenharmony_ci * 4GB of memory, which happens to fit the DMA32 zone. TDP paging 598662306a36Sopenharmony_ci * generally doesn't use PAE paging and can skip allocating the PDP 598762306a36Sopenharmony_ci * table. The main exception, handled here, is SVM's 32-bit NPT. The 598862306a36Sopenharmony_ci * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit 598962306a36Sopenharmony_ci * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). 
599062306a36Sopenharmony_ci */ 599162306a36Sopenharmony_ci if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) 599262306a36Sopenharmony_ci return 0; 599362306a36Sopenharmony_ci 599462306a36Sopenharmony_ci page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); 599562306a36Sopenharmony_ci if (!page) 599662306a36Sopenharmony_ci return -ENOMEM; 599762306a36Sopenharmony_ci 599862306a36Sopenharmony_ci mmu->pae_root = page_address(page); 599962306a36Sopenharmony_ci 600062306a36Sopenharmony_ci /* 600162306a36Sopenharmony_ci * CR3 is only 32 bits when PAE paging is used, thus it's impossible to 600262306a36Sopenharmony_ci * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so 600362306a36Sopenharmony_ci * that KVM's writes and the CPU's reads get along. Note, this is 600462306a36Sopenharmony_ci * only necessary when using shadow paging, as 64-bit NPT can get at 600562306a36Sopenharmony_ci * the C-bit even when shadowing 32-bit NPT, and SME isn't supported 600662306a36Sopenharmony_ci * by 32-bit kernels (when KVM itself uses 32-bit NPT). 600762306a36Sopenharmony_ci */ 600862306a36Sopenharmony_ci if (!tdp_enabled) 600962306a36Sopenharmony_ci set_memory_decrypted((unsigned long)mmu->pae_root, 1); 601062306a36Sopenharmony_ci else 601162306a36Sopenharmony_ci WARN_ON_ONCE(shadow_me_value); 601262306a36Sopenharmony_ci 601362306a36Sopenharmony_ci for (i = 0; i < 4; ++i) 601462306a36Sopenharmony_ci mmu->pae_root[i] = INVALID_PAE_ROOT; 601562306a36Sopenharmony_ci 601662306a36Sopenharmony_ci return 0; 601762306a36Sopenharmony_ci} 601862306a36Sopenharmony_ci 601962306a36Sopenharmony_ciint kvm_mmu_create(struct kvm_vcpu *vcpu) 602062306a36Sopenharmony_ci{ 602162306a36Sopenharmony_ci int ret; 602262306a36Sopenharmony_ci 602362306a36Sopenharmony_ci vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; 602462306a36Sopenharmony_ci vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; 602562306a36Sopenharmony_ci 602662306a36Sopenharmony_ci vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; 602762306a36Sopenharmony_ci vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; 602862306a36Sopenharmony_ci 602962306a36Sopenharmony_ci vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; 603062306a36Sopenharmony_ci 603162306a36Sopenharmony_ci vcpu->arch.mmu = &vcpu->arch.root_mmu; 603262306a36Sopenharmony_ci vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 603362306a36Sopenharmony_ci 603462306a36Sopenharmony_ci ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); 603562306a36Sopenharmony_ci if (ret) 603662306a36Sopenharmony_ci return ret; 603762306a36Sopenharmony_ci 603862306a36Sopenharmony_ci ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); 603962306a36Sopenharmony_ci if (ret) 604062306a36Sopenharmony_ci goto fail_allocate_root; 604162306a36Sopenharmony_ci 604262306a36Sopenharmony_ci return ret; 604362306a36Sopenharmony_ci fail_allocate_root: 604462306a36Sopenharmony_ci free_mmu_pages(&vcpu->arch.guest_mmu); 604562306a36Sopenharmony_ci return ret; 604662306a36Sopenharmony_ci} 604762306a36Sopenharmony_ci 604862306a36Sopenharmony_ci#define BATCH_ZAP_PAGES 10 604962306a36Sopenharmony_cistatic void kvm_zap_obsolete_pages(struct kvm *kvm) 605062306a36Sopenharmony_ci{ 605162306a36Sopenharmony_ci struct kvm_mmu_page *sp, *node; 605262306a36Sopenharmony_ci int nr_zapped, batch = 0; 605362306a36Sopenharmony_ci bool unstable; 605462306a36Sopenharmony_ci 605562306a36Sopenharmony_cirestart: 605662306a36Sopenharmony_ci list_for_each_entry_safe_reverse(sp, node, 
605762306a36Sopenharmony_ci &kvm->arch.active_mmu_pages, link) { 605862306a36Sopenharmony_ci /* 605962306a36Sopenharmony_ci * No obsolete valid page exists before a newly created page 606062306a36Sopenharmony_ci * since active_mmu_pages is a FIFO list. 606162306a36Sopenharmony_ci */ 606262306a36Sopenharmony_ci if (!is_obsolete_sp(kvm, sp)) 606362306a36Sopenharmony_ci break; 606462306a36Sopenharmony_ci 606562306a36Sopenharmony_ci /* 606662306a36Sopenharmony_ci * Invalid pages should never land back on the list of active 606762306a36Sopenharmony_ci * pages. Skip the bogus page, otherwise we'll get stuck in an 606862306a36Sopenharmony_ci * infinite loop if the page gets put back on the list (again). 606962306a36Sopenharmony_ci */ 607062306a36Sopenharmony_ci if (WARN_ON_ONCE(sp->role.invalid)) 607162306a36Sopenharmony_ci continue; 607262306a36Sopenharmony_ci 607362306a36Sopenharmony_ci /* 607462306a36Sopenharmony_ci * No need to flush the TLB since we're only zapping shadow 607562306a36Sopenharmony_ci * pages with an obsolete generation number and all vCPUS have 607662306a36Sopenharmony_ci * loaded a new root, i.e. the shadow pages being zapped cannot 607762306a36Sopenharmony_ci * be in active use by the guest. 607862306a36Sopenharmony_ci */ 607962306a36Sopenharmony_ci if (batch >= BATCH_ZAP_PAGES && 608062306a36Sopenharmony_ci cond_resched_rwlock_write(&kvm->mmu_lock)) { 608162306a36Sopenharmony_ci batch = 0; 608262306a36Sopenharmony_ci goto restart; 608362306a36Sopenharmony_ci } 608462306a36Sopenharmony_ci 608562306a36Sopenharmony_ci unstable = __kvm_mmu_prepare_zap_page(kvm, sp, 608662306a36Sopenharmony_ci &kvm->arch.zapped_obsolete_pages, &nr_zapped); 608762306a36Sopenharmony_ci batch += nr_zapped; 608862306a36Sopenharmony_ci 608962306a36Sopenharmony_ci if (unstable) 609062306a36Sopenharmony_ci goto restart; 609162306a36Sopenharmony_ci } 609262306a36Sopenharmony_ci 609362306a36Sopenharmony_ci /* 609462306a36Sopenharmony_ci * Kick all vCPUs (via remote TLB flush) before freeing the page tables 609562306a36Sopenharmony_ci * to ensure KVM is not in the middle of a lockless shadow page table 609662306a36Sopenharmony_ci * walk, which may reference the pages. The remote TLB flush itself is 609762306a36Sopenharmony_ci * not required and is simply a convenient way to kick vCPUs as needed. 609862306a36Sopenharmony_ci * KVM performs a local TLB flush when allocating a new root (see 609962306a36Sopenharmony_ci * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are 610062306a36Sopenharmony_ci * running with an obsolete MMU. 610162306a36Sopenharmony_ci */ 610262306a36Sopenharmony_ci kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); 610362306a36Sopenharmony_ci} 610462306a36Sopenharmony_ci 610562306a36Sopenharmony_ci/* 610662306a36Sopenharmony_ci * Fast invalidate all shadow pages and use lock-break technique 610762306a36Sopenharmony_ci * to zap obsolete pages. 610862306a36Sopenharmony_ci * 610962306a36Sopenharmony_ci * It's required when memslot is being deleted or VM is being 611062306a36Sopenharmony_ci * destroyed, in these cases, we should ensure that KVM MMU does 611162306a36Sopenharmony_ci * not use any resource of the being-deleted slot or all slots 611262306a36Sopenharmony_ci * after calling the function. 
611362306a36Sopenharmony_ci */
611462306a36Sopenharmony_cistatic void kvm_mmu_zap_all_fast(struct kvm *kvm)
611562306a36Sopenharmony_ci{
611662306a36Sopenharmony_ci lockdep_assert_held(&kvm->slots_lock);
611762306a36Sopenharmony_ci
611862306a36Sopenharmony_ci write_lock(&kvm->mmu_lock);
611962306a36Sopenharmony_ci trace_kvm_mmu_zap_all_fast(kvm);
612062306a36Sopenharmony_ci
612162306a36Sopenharmony_ci /*
612262306a36Sopenharmony_ci * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
612362306a36Sopenharmony_ci * held for the entire duration of zapping obsolete pages, it's
612462306a36Sopenharmony_ci * impossible for there to be multiple invalid generations associated
612562306a36Sopenharmony_ci * with *valid* shadow pages at any given time, i.e. there is exactly
612662306a36Sopenharmony_ci * one valid generation and (at most) one invalid generation.
612762306a36Sopenharmony_ci */
612862306a36Sopenharmony_ci kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
612962306a36Sopenharmony_ci
613062306a36Sopenharmony_ci /*
613162306a36Sopenharmony_ci * In order to ensure all vCPUs drop their soon-to-be invalid roots,
613262306a36Sopenharmony_ci * invalidating TDP MMU roots must be done while holding mmu_lock for
613362306a36Sopenharmony_ci * write and in the same critical section as making the reload request,
613462306a36Sopenharmony_ci * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
613562306a36Sopenharmony_ci */
613662306a36Sopenharmony_ci if (tdp_mmu_enabled)
613762306a36Sopenharmony_ci kvm_tdp_mmu_invalidate_all_roots(kvm);
613862306a36Sopenharmony_ci
613962306a36Sopenharmony_ci /*
614062306a36Sopenharmony_ci * Notify all vcpus to reload their shadow page tables and flush the TLB.
614162306a36Sopenharmony_ci * Then all vcpus will switch to a new shadow page table with the new
614262306a36Sopenharmony_ci * mmu_valid_gen.
614362306a36Sopenharmony_ci *
614462306a36Sopenharmony_ci * Note: we need to do this under the protection of mmu_lock,
614562306a36Sopenharmony_ci * otherwise a vcpu could purge a shadow page but miss the TLB flush.
614662306a36Sopenharmony_ci */
614762306a36Sopenharmony_ci kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
614862306a36Sopenharmony_ci
614962306a36Sopenharmony_ci kvm_zap_obsolete_pages(kvm);
615062306a36Sopenharmony_ci
615162306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock);
615262306a36Sopenharmony_ci
615362306a36Sopenharmony_ci /*
615462306a36Sopenharmony_ci * Zap the invalidated TDP MMU roots; all SPTEs must be dropped before
615562306a36Sopenharmony_ci * returning to the caller, e.g. if the zap is in response to a memslot
615662306a36Sopenharmony_ci * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
615762306a36Sopenharmony_ci * associated with the deleted memslot once the update completes.
615862306a36Sopenharmony_ci * Deferring the zap until the final reference to the root is put would
615962306a36Sopenharmony_ci * lead to use-after-free.
616062306a36Sopenharmony_ci */ 616162306a36Sopenharmony_ci if (tdp_mmu_enabled) 616262306a36Sopenharmony_ci kvm_tdp_mmu_zap_invalidated_roots(kvm); 616362306a36Sopenharmony_ci} 616462306a36Sopenharmony_ci 616562306a36Sopenharmony_cistatic bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) 616662306a36Sopenharmony_ci{ 616762306a36Sopenharmony_ci return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); 616862306a36Sopenharmony_ci} 616962306a36Sopenharmony_ci 617062306a36Sopenharmony_civoid kvm_mmu_init_vm(struct kvm *kvm) 617162306a36Sopenharmony_ci{ 617262306a36Sopenharmony_ci INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 617362306a36Sopenharmony_ci INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); 617462306a36Sopenharmony_ci INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); 617562306a36Sopenharmony_ci spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); 617662306a36Sopenharmony_ci 617762306a36Sopenharmony_ci if (tdp_mmu_enabled) 617862306a36Sopenharmony_ci kvm_mmu_init_tdp_mmu(kvm); 617962306a36Sopenharmony_ci 618062306a36Sopenharmony_ci kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; 618162306a36Sopenharmony_ci kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; 618262306a36Sopenharmony_ci 618362306a36Sopenharmony_ci kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; 618462306a36Sopenharmony_ci 618562306a36Sopenharmony_ci kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; 618662306a36Sopenharmony_ci kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; 618762306a36Sopenharmony_ci} 618862306a36Sopenharmony_ci 618962306a36Sopenharmony_cistatic void mmu_free_vm_memory_caches(struct kvm *kvm) 619062306a36Sopenharmony_ci{ 619162306a36Sopenharmony_ci kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); 619262306a36Sopenharmony_ci kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); 619362306a36Sopenharmony_ci kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); 619462306a36Sopenharmony_ci} 619562306a36Sopenharmony_ci 619662306a36Sopenharmony_civoid kvm_mmu_uninit_vm(struct kvm *kvm) 619762306a36Sopenharmony_ci{ 619862306a36Sopenharmony_ci if (tdp_mmu_enabled) 619962306a36Sopenharmony_ci kvm_mmu_uninit_tdp_mmu(kvm); 620062306a36Sopenharmony_ci 620162306a36Sopenharmony_ci mmu_free_vm_memory_caches(kvm); 620262306a36Sopenharmony_ci} 620362306a36Sopenharmony_ci 620462306a36Sopenharmony_cistatic bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 620562306a36Sopenharmony_ci{ 620662306a36Sopenharmony_ci const struct kvm_memory_slot *memslot; 620762306a36Sopenharmony_ci struct kvm_memslots *slots; 620862306a36Sopenharmony_ci struct kvm_memslot_iter iter; 620962306a36Sopenharmony_ci bool flush = false; 621062306a36Sopenharmony_ci gfn_t start, end; 621162306a36Sopenharmony_ci int i; 621262306a36Sopenharmony_ci 621362306a36Sopenharmony_ci if (!kvm_memslots_have_rmaps(kvm)) 621462306a36Sopenharmony_ci return flush; 621562306a36Sopenharmony_ci 621662306a36Sopenharmony_ci for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 621762306a36Sopenharmony_ci slots = __kvm_memslots(kvm, i); 621862306a36Sopenharmony_ci 621962306a36Sopenharmony_ci kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { 622062306a36Sopenharmony_ci memslot = iter.slot; 622162306a36Sopenharmony_ci start = max(gfn_start, memslot->base_gfn); 622262306a36Sopenharmony_ci end = min(gfn_end, memslot->base_gfn + memslot->npages); 622362306a36Sopenharmony_ci if (WARN_ON_ONCE(start >= end)) 622462306a36Sopenharmony_ci continue; 
622562306a36Sopenharmony_ci 622662306a36Sopenharmony_ci flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, 622762306a36Sopenharmony_ci PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 622862306a36Sopenharmony_ci start, end - 1, true, flush); 622962306a36Sopenharmony_ci } 623062306a36Sopenharmony_ci } 623162306a36Sopenharmony_ci 623262306a36Sopenharmony_ci return flush; 623362306a36Sopenharmony_ci} 623462306a36Sopenharmony_ci 623562306a36Sopenharmony_ci/* 623662306a36Sopenharmony_ci * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end 623762306a36Sopenharmony_ci * (not including it) 623862306a36Sopenharmony_ci */ 623962306a36Sopenharmony_civoid kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 624062306a36Sopenharmony_ci{ 624162306a36Sopenharmony_ci bool flush; 624262306a36Sopenharmony_ci 624362306a36Sopenharmony_ci if (WARN_ON_ONCE(gfn_end <= gfn_start)) 624462306a36Sopenharmony_ci return; 624562306a36Sopenharmony_ci 624662306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 624762306a36Sopenharmony_ci 624862306a36Sopenharmony_ci kvm_mmu_invalidate_begin(kvm, 0, -1ul); 624962306a36Sopenharmony_ci 625062306a36Sopenharmony_ci flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); 625162306a36Sopenharmony_ci 625262306a36Sopenharmony_ci if (tdp_mmu_enabled) 625362306a36Sopenharmony_ci flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush); 625462306a36Sopenharmony_ci 625562306a36Sopenharmony_ci if (flush) 625662306a36Sopenharmony_ci kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); 625762306a36Sopenharmony_ci 625862306a36Sopenharmony_ci kvm_mmu_invalidate_end(kvm, 0, -1ul); 625962306a36Sopenharmony_ci 626062306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 626162306a36Sopenharmony_ci} 626262306a36Sopenharmony_ci 626362306a36Sopenharmony_cistatic bool slot_rmap_write_protect(struct kvm *kvm, 626462306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head, 626562306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 626662306a36Sopenharmony_ci{ 626762306a36Sopenharmony_ci return rmap_write_protect(rmap_head, false); 626862306a36Sopenharmony_ci} 626962306a36Sopenharmony_ci 627062306a36Sopenharmony_civoid kvm_mmu_slot_remove_write_access(struct kvm *kvm, 627162306a36Sopenharmony_ci const struct kvm_memory_slot *memslot, 627262306a36Sopenharmony_ci int start_level) 627362306a36Sopenharmony_ci{ 627462306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) { 627562306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 627662306a36Sopenharmony_ci walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect, 627762306a36Sopenharmony_ci start_level, KVM_MAX_HUGEPAGE_LEVEL, false); 627862306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 627962306a36Sopenharmony_ci } 628062306a36Sopenharmony_ci 628162306a36Sopenharmony_ci if (tdp_mmu_enabled) { 628262306a36Sopenharmony_ci read_lock(&kvm->mmu_lock); 628362306a36Sopenharmony_ci kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); 628462306a36Sopenharmony_ci read_unlock(&kvm->mmu_lock); 628562306a36Sopenharmony_ci } 628662306a36Sopenharmony_ci} 628762306a36Sopenharmony_ci 628862306a36Sopenharmony_cistatic inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) 628962306a36Sopenharmony_ci{ 629062306a36Sopenharmony_ci return kvm_mmu_memory_cache_nr_free_objects(cache) < min; 629162306a36Sopenharmony_ci} 629262306a36Sopenharmony_ci 629362306a36Sopenharmony_cistatic bool need_topup_split_caches_or_resched(struct kvm *kvm) 629462306a36Sopenharmony_ci{ 629562306a36Sopenharmony_ci if (need_resched() || 
rwlock_needbreak(&kvm->mmu_lock)) 629662306a36Sopenharmony_ci return true; 629762306a36Sopenharmony_ci 629862306a36Sopenharmony_ci /* 629962306a36Sopenharmony_ci * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed 630062306a36Sopenharmony_ci * to split a single huge page. Calculating how many are actually needed 630162306a36Sopenharmony_ci * is possible but not worth the complexity. 630262306a36Sopenharmony_ci */ 630362306a36Sopenharmony_ci return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || 630462306a36Sopenharmony_ci need_topup(&kvm->arch.split_page_header_cache, 1) || 630562306a36Sopenharmony_ci need_topup(&kvm->arch.split_shadow_page_cache, 1); 630662306a36Sopenharmony_ci} 630762306a36Sopenharmony_ci 630862306a36Sopenharmony_cistatic int topup_split_caches(struct kvm *kvm) 630962306a36Sopenharmony_ci{ 631062306a36Sopenharmony_ci /* 631162306a36Sopenharmony_ci * Allocating rmap list entries when splitting huge pages for nested 631262306a36Sopenharmony_ci * MMUs is uncommon as KVM needs to use a list if and only if there is 631362306a36Sopenharmony_ci * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be 631462306a36Sopenharmony_ci * aliased by multiple L2 gfns and/or from multiple nested roots with 631562306a36Sopenharmony_ci * different roles. Aliasing gfns when using TDP is atypical for VMMs; 631662306a36Sopenharmony_ci * a few gfns are often aliased during boot, e.g. when remapping BIOS, 631762306a36Sopenharmony_ci * but aliasing rarely occurs post-boot or for many gfns. If there is 631862306a36Sopenharmony_ci * only one rmap entry, rmap->val points directly at that one entry and 631962306a36Sopenharmony_ci * doesn't need to allocate a list. Buffer the cache by the default 632062306a36Sopenharmony_ci * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM 632162306a36Sopenharmony_ci * encounters an aliased gfn or two. 
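 * (For a rough sense of scale, assuming the usual definitions of
 * SPLIT_DESC_CACHE_MIN_NR_OBJECTS = SPTE_ENT_PER_PAGE + 1 = 513 and
 * KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE = 40, neither of which is shown in
 * this excerpt, the capacity below works out to 553 cached descriptors.)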
632262306a36Sopenharmony_ci */ 632362306a36Sopenharmony_ci const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + 632462306a36Sopenharmony_ci KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; 632562306a36Sopenharmony_ci int r; 632662306a36Sopenharmony_ci 632762306a36Sopenharmony_ci lockdep_assert_held(&kvm->slots_lock); 632862306a36Sopenharmony_ci 632962306a36Sopenharmony_ci r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, 633062306a36Sopenharmony_ci SPLIT_DESC_CACHE_MIN_NR_OBJECTS); 633162306a36Sopenharmony_ci if (r) 633262306a36Sopenharmony_ci return r; 633362306a36Sopenharmony_ci 633462306a36Sopenharmony_ci r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); 633562306a36Sopenharmony_ci if (r) 633662306a36Sopenharmony_ci return r; 633762306a36Sopenharmony_ci 633862306a36Sopenharmony_ci return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); 633962306a36Sopenharmony_ci} 634062306a36Sopenharmony_ci 634162306a36Sopenharmony_cistatic struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) 634262306a36Sopenharmony_ci{ 634362306a36Sopenharmony_ci struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); 634462306a36Sopenharmony_ci struct shadow_page_caches caches = {}; 634562306a36Sopenharmony_ci union kvm_mmu_page_role role; 634662306a36Sopenharmony_ci unsigned int access; 634762306a36Sopenharmony_ci gfn_t gfn; 634862306a36Sopenharmony_ci 634962306a36Sopenharmony_ci gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); 635062306a36Sopenharmony_ci access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); 635162306a36Sopenharmony_ci 635262306a36Sopenharmony_ci /* 635362306a36Sopenharmony_ci * Note, huge page splitting always uses direct shadow pages, regardless 635462306a36Sopenharmony_ci * of whether the huge page itself is mapped by a direct or indirect 635562306a36Sopenharmony_ci * shadow page, since the huge page region itself is being directly 635662306a36Sopenharmony_ci * mapped with smaller pages. 635762306a36Sopenharmony_ci */ 635862306a36Sopenharmony_ci role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); 635962306a36Sopenharmony_ci 636062306a36Sopenharmony_ci /* Direct SPs do not require a shadowed_info_cache. */ 636162306a36Sopenharmony_ci caches.page_header_cache = &kvm->arch.split_page_header_cache; 636262306a36Sopenharmony_ci caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; 636362306a36Sopenharmony_ci 636462306a36Sopenharmony_ci /* Safe to pass NULL for vCPU since requesting a direct SP. 
*/
636562306a36Sopenharmony_ci return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
636662306a36Sopenharmony_ci}
636762306a36Sopenharmony_ci
636862306a36Sopenharmony_cistatic void shadow_mmu_split_huge_page(struct kvm *kvm,
636962306a36Sopenharmony_ci const struct kvm_memory_slot *slot,
637062306a36Sopenharmony_ci u64 *huge_sptep)
637162306a36Sopenharmony_ci
637262306a36Sopenharmony_ci{
637362306a36Sopenharmony_ci struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
637462306a36Sopenharmony_ci u64 huge_spte = READ_ONCE(*huge_sptep);
637562306a36Sopenharmony_ci struct kvm_mmu_page *sp;
637662306a36Sopenharmony_ci bool flush = false;
637762306a36Sopenharmony_ci u64 *sptep, spte;
637862306a36Sopenharmony_ci gfn_t gfn;
637962306a36Sopenharmony_ci int index;
638062306a36Sopenharmony_ci
638162306a36Sopenharmony_ci sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
638262306a36Sopenharmony_ci
638362306a36Sopenharmony_ci for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
638462306a36Sopenharmony_ci sptep = &sp->spt[index];
638562306a36Sopenharmony_ci gfn = kvm_mmu_page_get_gfn(sp, index);
638662306a36Sopenharmony_ci
638762306a36Sopenharmony_ci /*
638862306a36Sopenharmony_ci * The SP may already have populated SPTEs, e.g. if this huge
638962306a36Sopenharmony_ci * page is aliased by multiple sptes with the same access
639062306a36Sopenharmony_ci * permissions. These entries are guaranteed to map the same
639162306a36Sopenharmony_ci * gfn-to-pfn translation since the SP is direct, so no need to
639262306a36Sopenharmony_ci * modify them.
639362306a36Sopenharmony_ci *
639462306a36Sopenharmony_ci * However, if a given SPTE points to a lower level page table,
639562306a36Sopenharmony_ci * that lower level page table may only be partially populated.
639662306a36Sopenharmony_ci * Installing such SPTEs would effectively unmap a portion of the
639762306a36Sopenharmony_ci * huge page. Unmapping guest memory always requires a TLB flush
639862306a36Sopenharmony_ci * since a subsequent operation on the unmapped regions would
639962306a36Sopenharmony_ci * fail to detect the need to flush.
640062306a36Sopenharmony_ci */
640162306a36Sopenharmony_ci if (is_shadow_present_pte(*sptep)) {
640262306a36Sopenharmony_ci flush |= !is_last_spte(*sptep, sp->role.level);
640362306a36Sopenharmony_ci continue;
640462306a36Sopenharmony_ci }
640562306a36Sopenharmony_ci
640662306a36Sopenharmony_ci spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
640762306a36Sopenharmony_ci mmu_spte_set(sptep, spte);
640862306a36Sopenharmony_ci __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
640962306a36Sopenharmony_ci }
641062306a36Sopenharmony_ci
641162306a36Sopenharmony_ci __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
641262306a36Sopenharmony_ci}
641362306a36Sopenharmony_ci
641462306a36Sopenharmony_cistatic int shadow_mmu_try_split_huge_page(struct kvm *kvm,
641562306a36Sopenharmony_ci const struct kvm_memory_slot *slot,
641662306a36Sopenharmony_ci u64 *huge_sptep)
641762306a36Sopenharmony_ci{
641862306a36Sopenharmony_ci struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
641962306a36Sopenharmony_ci int level, r = 0;
642062306a36Sopenharmony_ci gfn_t gfn;
642162306a36Sopenharmony_ci u64 spte;
642262306a36Sopenharmony_ci
642362306a36Sopenharmony_ci /* Grab information for the tracepoint before dropping the MMU lock.
*/ 642462306a36Sopenharmony_ci gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); 642562306a36Sopenharmony_ci level = huge_sp->role.level; 642662306a36Sopenharmony_ci spte = *huge_sptep; 642762306a36Sopenharmony_ci 642862306a36Sopenharmony_ci if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { 642962306a36Sopenharmony_ci r = -ENOSPC; 643062306a36Sopenharmony_ci goto out; 643162306a36Sopenharmony_ci } 643262306a36Sopenharmony_ci 643362306a36Sopenharmony_ci if (need_topup_split_caches_or_resched(kvm)) { 643462306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 643562306a36Sopenharmony_ci cond_resched(); 643662306a36Sopenharmony_ci /* 643762306a36Sopenharmony_ci * If the topup succeeds, return -EAGAIN to indicate that the 643862306a36Sopenharmony_ci * rmap iterator should be restarted because the MMU lock was 643962306a36Sopenharmony_ci * dropped. 644062306a36Sopenharmony_ci */ 644162306a36Sopenharmony_ci r = topup_split_caches(kvm) ?: -EAGAIN; 644262306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 644362306a36Sopenharmony_ci goto out; 644462306a36Sopenharmony_ci } 644562306a36Sopenharmony_ci 644662306a36Sopenharmony_ci shadow_mmu_split_huge_page(kvm, slot, huge_sptep); 644762306a36Sopenharmony_ci 644862306a36Sopenharmony_ciout: 644962306a36Sopenharmony_ci trace_kvm_mmu_split_huge_page(gfn, spte, level, r); 645062306a36Sopenharmony_ci return r; 645162306a36Sopenharmony_ci} 645262306a36Sopenharmony_ci 645362306a36Sopenharmony_cistatic bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, 645462306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head, 645562306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 645662306a36Sopenharmony_ci{ 645762306a36Sopenharmony_ci struct rmap_iterator iter; 645862306a36Sopenharmony_ci struct kvm_mmu_page *sp; 645962306a36Sopenharmony_ci u64 *huge_sptep; 646062306a36Sopenharmony_ci int r; 646162306a36Sopenharmony_ci 646262306a36Sopenharmony_cirestart: 646362306a36Sopenharmony_ci for_each_rmap_spte(rmap_head, &iter, huge_sptep) { 646462306a36Sopenharmony_ci sp = sptep_to_sp(huge_sptep); 646562306a36Sopenharmony_ci 646662306a36Sopenharmony_ci /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ 646762306a36Sopenharmony_ci if (WARN_ON_ONCE(!sp->role.guest_mode)) 646862306a36Sopenharmony_ci continue; 646962306a36Sopenharmony_ci 647062306a36Sopenharmony_ci /* The rmaps should never contain non-leaf SPTEs. */ 647162306a36Sopenharmony_ci if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) 647262306a36Sopenharmony_ci continue; 647362306a36Sopenharmony_ci 647462306a36Sopenharmony_ci /* SPs with level >PG_LEVEL_4K should never be unsync. */ 647562306a36Sopenharmony_ci if (WARN_ON_ONCE(sp->unsync)) 647662306a36Sopenharmony_ci continue; 647762306a36Sopenharmony_ci 647862306a36Sopenharmony_ci /* Don't bother splitting huge pages on invalid SPs. */ 647962306a36Sopenharmony_ci if (sp->role.invalid) 648062306a36Sopenharmony_ci continue; 648162306a36Sopenharmony_ci 648262306a36Sopenharmony_ci r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); 648362306a36Sopenharmony_ci 648462306a36Sopenharmony_ci /* 648562306a36Sopenharmony_ci * The split succeeded or needs to be retried because the MMU 648662306a36Sopenharmony_ci * lock was dropped. Either way, restart the iterator to get it 648762306a36Sopenharmony_ci * back into a consistent state.
648862306a36Sopenharmony_ci */ 648962306a36Sopenharmony_ci if (!r || r == -EAGAIN) 649062306a36Sopenharmony_ci goto restart; 649162306a36Sopenharmony_ci 649262306a36Sopenharmony_ci /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ 649362306a36Sopenharmony_ci break; 649462306a36Sopenharmony_ci } 649562306a36Sopenharmony_ci 649662306a36Sopenharmony_ci return false; 649762306a36Sopenharmony_ci} 649862306a36Sopenharmony_ci 649962306a36Sopenharmony_cistatic void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, 650062306a36Sopenharmony_ci const struct kvm_memory_slot *slot, 650162306a36Sopenharmony_ci gfn_t start, gfn_t end, 650262306a36Sopenharmony_ci int target_level) 650362306a36Sopenharmony_ci{ 650462306a36Sopenharmony_ci int level; 650562306a36Sopenharmony_ci 650662306a36Sopenharmony_ci /* 650762306a36Sopenharmony_ci * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working 650862306a36Sopenharmony_ci * down to the target level. This ensures pages are recursively split 650962306a36Sopenharmony_ci * all the way to the target level. There's no need to split pages 651062306a36Sopenharmony_ci * already at the target level. 651162306a36Sopenharmony_ci */ 651262306a36Sopenharmony_ci for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) 651362306a36Sopenharmony_ci __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, 651462306a36Sopenharmony_ci level, level, start, end - 1, true, false); 651562306a36Sopenharmony_ci} 651662306a36Sopenharmony_ci 651762306a36Sopenharmony_ci/* Must be called with the mmu_lock held in write-mode. */ 651862306a36Sopenharmony_civoid kvm_mmu_try_split_huge_pages(struct kvm *kvm, 651962306a36Sopenharmony_ci const struct kvm_memory_slot *memslot, 652062306a36Sopenharmony_ci u64 start, u64 end, 652162306a36Sopenharmony_ci int target_level) 652262306a36Sopenharmony_ci{ 652362306a36Sopenharmony_ci if (!tdp_mmu_enabled) 652462306a36Sopenharmony_ci return; 652562306a36Sopenharmony_ci 652662306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) 652762306a36Sopenharmony_ci kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); 652862306a36Sopenharmony_ci 652962306a36Sopenharmony_ci kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); 653062306a36Sopenharmony_ci 653162306a36Sopenharmony_ci /* 653262306a36Sopenharmony_ci * A TLB flush is unnecessary at this point for the same reasons as in 653362306a36Sopenharmony_ci * kvm_mmu_slot_try_split_huge_pages().
653462306a36Sopenharmony_ci */ 653562306a36Sopenharmony_ci} 653662306a36Sopenharmony_ci 653762306a36Sopenharmony_civoid kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, 653862306a36Sopenharmony_ci const struct kvm_memory_slot *memslot, 653962306a36Sopenharmony_ci int target_level) 654062306a36Sopenharmony_ci{ 654162306a36Sopenharmony_ci u64 start = memslot->base_gfn; 654262306a36Sopenharmony_ci u64 end = start + memslot->npages; 654362306a36Sopenharmony_ci 654462306a36Sopenharmony_ci if (!tdp_mmu_enabled) 654562306a36Sopenharmony_ci return; 654662306a36Sopenharmony_ci 654762306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) { 654862306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 654962306a36Sopenharmony_ci kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); 655062306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 655162306a36Sopenharmony_ci } 655262306a36Sopenharmony_ci 655362306a36Sopenharmony_ci read_lock(&kvm->mmu_lock); 655462306a36Sopenharmony_ci kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); 655562306a36Sopenharmony_ci read_unlock(&kvm->mmu_lock); 655662306a36Sopenharmony_ci 655762306a36Sopenharmony_ci /* 655862306a36Sopenharmony_ci * No TLB flush is necessary here. KVM will flush TLBs after 655962306a36Sopenharmony_ci * write-protecting and/or clearing dirty on the newly split SPTEs to 656062306a36Sopenharmony_ci * ensure that guest writes are reflected in the dirty log before the 656162306a36Sopenharmony_ci * ioctl to enable dirty logging on this memslot completes. Since the 656262306a36Sopenharmony_ci * split SPTEs retain the write and dirty bits of the huge SPTE, it is 656362306a36Sopenharmony_ci * safe for KVM to decide if a TLB flush is necessary based on the split 656462306a36Sopenharmony_ci * SPTEs. 656562306a36Sopenharmony_ci */ 656662306a36Sopenharmony_ci} 656762306a36Sopenharmony_ci 656862306a36Sopenharmony_cistatic bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, 656962306a36Sopenharmony_ci struct kvm_rmap_head *rmap_head, 657062306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 657162306a36Sopenharmony_ci{ 657262306a36Sopenharmony_ci u64 *sptep; 657362306a36Sopenharmony_ci struct rmap_iterator iter; 657462306a36Sopenharmony_ci int need_tlb_flush = 0; 657562306a36Sopenharmony_ci struct kvm_mmu_page *sp; 657662306a36Sopenharmony_ci 657762306a36Sopenharmony_cirestart: 657862306a36Sopenharmony_ci for_each_rmap_spte(rmap_head, &iter, sptep) { 657962306a36Sopenharmony_ci sp = sptep_to_sp(sptep); 658062306a36Sopenharmony_ci 658162306a36Sopenharmony_ci /* 658262306a36Sopenharmony_ci * We cannot do huge page mapping for indirect shadow pages, 658362306a36Sopenharmony_ci * which are found on the last rmap (level = 1) when not using 658462306a36Sopenharmony_ci * tdp; such shadow pages are synced with the page table in 658562306a36Sopenharmony_ci * the guest, and the guest page table is using 4K page size 658662306a36Sopenharmony_ci * mapping if the indirect sp has level = 1. 
658762306a36Sopenharmony_ci */ 658862306a36Sopenharmony_ci if (sp->role.direct && 658962306a36Sopenharmony_ci sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, 659062306a36Sopenharmony_ci PG_LEVEL_NUM)) { 659162306a36Sopenharmony_ci kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); 659262306a36Sopenharmony_ci 659362306a36Sopenharmony_ci if (kvm_available_flush_remote_tlbs_range()) 659462306a36Sopenharmony_ci kvm_flush_remote_tlbs_sptep(kvm, sptep); 659562306a36Sopenharmony_ci else 659662306a36Sopenharmony_ci need_tlb_flush = 1; 659762306a36Sopenharmony_ci 659862306a36Sopenharmony_ci goto restart; 659962306a36Sopenharmony_ci } 660062306a36Sopenharmony_ci } 660162306a36Sopenharmony_ci 660262306a36Sopenharmony_ci return need_tlb_flush; 660362306a36Sopenharmony_ci} 660462306a36Sopenharmony_ci 660562306a36Sopenharmony_cistatic void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, 660662306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 660762306a36Sopenharmony_ci{ 660862306a36Sopenharmony_ci /* 660962306a36Sopenharmony_ci * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap 661062306a36Sopenharmony_ci * pages that are already mapped at the maximum hugepage level. 661162306a36Sopenharmony_ci */ 661262306a36Sopenharmony_ci if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, 661362306a36Sopenharmony_ci PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) 661462306a36Sopenharmony_ci kvm_flush_remote_tlbs_memslot(kvm, slot); 661562306a36Sopenharmony_ci} 661662306a36Sopenharmony_ci 661762306a36Sopenharmony_civoid kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, 661862306a36Sopenharmony_ci const struct kvm_memory_slot *slot) 661962306a36Sopenharmony_ci{ 662062306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) { 662162306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 662262306a36Sopenharmony_ci kvm_rmap_zap_collapsible_sptes(kvm, slot); 662362306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 662462306a36Sopenharmony_ci } 662562306a36Sopenharmony_ci 662662306a36Sopenharmony_ci if (tdp_mmu_enabled) { 662762306a36Sopenharmony_ci read_lock(&kvm->mmu_lock); 662862306a36Sopenharmony_ci kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); 662962306a36Sopenharmony_ci read_unlock(&kvm->mmu_lock); 663062306a36Sopenharmony_ci } 663162306a36Sopenharmony_ci} 663262306a36Sopenharmony_ci 663362306a36Sopenharmony_civoid kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 663462306a36Sopenharmony_ci const struct kvm_memory_slot *memslot) 663562306a36Sopenharmony_ci{ 663662306a36Sopenharmony_ci if (kvm_memslots_have_rmaps(kvm)) { 663762306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 663862306a36Sopenharmony_ci /* 663962306a36Sopenharmony_ci * Clear dirty bits only on 4k SPTEs since the legacy MMU only 664062306a36Sopenharmony_ci * supports dirty logging at a 4k granularity. 664162306a36Sopenharmony_ci */ 664262306a36Sopenharmony_ci walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false); 664362306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 664462306a36Sopenharmony_ci } 664562306a36Sopenharmony_ci 664662306a36Sopenharmony_ci if (tdp_mmu_enabled) { 664762306a36Sopenharmony_ci read_lock(&kvm->mmu_lock); 664862306a36Sopenharmony_ci kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); 664962306a36Sopenharmony_ci read_unlock(&kvm->mmu_lock); 665062306a36Sopenharmony_ci } 665162306a36Sopenharmony_ci 665262306a36Sopenharmony_ci /* 665362306a36Sopenharmony_ci * The caller will flush the TLBs after this function returns.
665462306a36Sopenharmony_ci * 665562306a36Sopenharmony_ci * It's also safe to flush TLBs out of mmu lock here as currently this 665662306a36Sopenharmony_ci * function is only used for dirty logging, in which case flushing TLB 665762306a36Sopenharmony_ci * out of mmu lock also guarantees no dirty pages will be lost in 665862306a36Sopenharmony_ci * dirty_bitmap. 665962306a36Sopenharmony_ci */ 666062306a36Sopenharmony_ci} 666162306a36Sopenharmony_ci 666262306a36Sopenharmony_cistatic void kvm_mmu_zap_all(struct kvm *kvm) 666362306a36Sopenharmony_ci{ 666462306a36Sopenharmony_ci struct kvm_mmu_page *sp, *node; 666562306a36Sopenharmony_ci LIST_HEAD(invalid_list); 666662306a36Sopenharmony_ci int ign; 666762306a36Sopenharmony_ci 666862306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 666962306a36Sopenharmony_cirestart: 667062306a36Sopenharmony_ci list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { 667162306a36Sopenharmony_ci if (WARN_ON_ONCE(sp->role.invalid)) 667262306a36Sopenharmony_ci continue; 667362306a36Sopenharmony_ci if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) 667462306a36Sopenharmony_ci goto restart; 667562306a36Sopenharmony_ci if (cond_resched_rwlock_write(&kvm->mmu_lock)) 667662306a36Sopenharmony_ci goto restart; 667762306a36Sopenharmony_ci } 667862306a36Sopenharmony_ci 667962306a36Sopenharmony_ci kvm_mmu_commit_zap_page(kvm, &invalid_list); 668062306a36Sopenharmony_ci 668162306a36Sopenharmony_ci if (tdp_mmu_enabled) 668262306a36Sopenharmony_ci kvm_tdp_mmu_zap_all(kvm); 668362306a36Sopenharmony_ci 668462306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 668562306a36Sopenharmony_ci} 668662306a36Sopenharmony_ci 668762306a36Sopenharmony_civoid kvm_arch_flush_shadow_all(struct kvm *kvm) 668862306a36Sopenharmony_ci{ 668962306a36Sopenharmony_ci kvm_mmu_zap_all(kvm); 669062306a36Sopenharmony_ci} 669162306a36Sopenharmony_ci 669262306a36Sopenharmony_civoid kvm_arch_flush_shadow_memslot(struct kvm *kvm, 669362306a36Sopenharmony_ci struct kvm_memory_slot *slot) 669462306a36Sopenharmony_ci{ 669562306a36Sopenharmony_ci kvm_mmu_zap_all_fast(kvm); 669662306a36Sopenharmony_ci} 669762306a36Sopenharmony_ci 669862306a36Sopenharmony_civoid kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) 669962306a36Sopenharmony_ci{ 670062306a36Sopenharmony_ci WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); 670162306a36Sopenharmony_ci 670262306a36Sopenharmony_ci gen &= MMIO_SPTE_GEN_MASK; 670362306a36Sopenharmony_ci 670462306a36Sopenharmony_ci /* 670562306a36Sopenharmony_ci * Generation numbers are incremented in multiples of the number of 670662306a36Sopenharmony_ci * address spaces in order to provide unique generations across all 670762306a36Sopenharmony_ci * address spaces. Strip what is effectively the address space 670862306a36Sopenharmony_ci * modifier prior to checking for a wrap of the MMIO generation so 670962306a36Sopenharmony_ci * that a wrap in any address space is detected. 671062306a36Sopenharmony_ci */ 671162306a36Sopenharmony_ci gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); 671262306a36Sopenharmony_ci 671362306a36Sopenharmony_ci /* 671462306a36Sopenharmony_ci * The very rare case: if the MMIO generation number has wrapped, 671562306a36Sopenharmony_ci * zap all shadow pages. 
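	 * Zapping everything forces MMIO SPTEs to be recreated with the new
	 * generation, so an SPTE created before the wrap cannot be mistaken
	 * for a valid one afterwards.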
671662306a36Sopenharmony_ci */ 671762306a36Sopenharmony_ci if (unlikely(gen == 0)) { 671862306a36Sopenharmony_ci kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); 671962306a36Sopenharmony_ci kvm_mmu_zap_all_fast(kvm); 672062306a36Sopenharmony_ci } 672162306a36Sopenharmony_ci} 672262306a36Sopenharmony_ci 672362306a36Sopenharmony_cistatic unsigned long mmu_shrink_scan(struct shrinker *shrink, 672462306a36Sopenharmony_ci struct shrink_control *sc) 672562306a36Sopenharmony_ci{ 672662306a36Sopenharmony_ci struct kvm *kvm; 672762306a36Sopenharmony_ci int nr_to_scan = sc->nr_to_scan; 672862306a36Sopenharmony_ci unsigned long freed = 0; 672962306a36Sopenharmony_ci 673062306a36Sopenharmony_ci mutex_lock(&kvm_lock); 673162306a36Sopenharmony_ci 673262306a36Sopenharmony_ci list_for_each_entry(kvm, &vm_list, vm_list) { 673362306a36Sopenharmony_ci int idx; 673462306a36Sopenharmony_ci LIST_HEAD(invalid_list); 673562306a36Sopenharmony_ci 673662306a36Sopenharmony_ci /* 673762306a36Sopenharmony_ci * Never scan more than sc->nr_to_scan VM instances. 673862306a36Sopenharmony_ci * Will not hit this condition practically since we do not try 673962306a36Sopenharmony_ci * to shrink more than one VM and it is very unlikely to see 674062306a36Sopenharmony_ci * !n_used_mmu_pages so many times. 674162306a36Sopenharmony_ci */ 674262306a36Sopenharmony_ci if (!nr_to_scan--) 674362306a36Sopenharmony_ci break; 674462306a36Sopenharmony_ci /* 674562306a36Sopenharmony_ci * n_used_mmu_pages is accessed without holding kvm->mmu_lock 674662306a36Sopenharmony_ci * here. We may skip a VM instance erroneously, but we do not 674762306a36Sopenharmony_ci * want to shrink a VM that only started to populate its MMU 674862306a36Sopenharmony_ci * anyway. 674962306a36Sopenharmony_ci */ 675062306a36Sopenharmony_ci if (!kvm->arch.n_used_mmu_pages && 675162306a36Sopenharmony_ci !kvm_has_zapped_obsolete_pages(kvm)) 675262306a36Sopenharmony_ci continue; 675362306a36Sopenharmony_ci 675462306a36Sopenharmony_ci idx = srcu_read_lock(&kvm->srcu); 675562306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 675662306a36Sopenharmony_ci 675762306a36Sopenharmony_ci if (kvm_has_zapped_obsolete_pages(kvm)) { 675862306a36Sopenharmony_ci kvm_mmu_commit_zap_page(kvm, 675962306a36Sopenharmony_ci &kvm->arch.zapped_obsolete_pages); 676062306a36Sopenharmony_ci goto unlock; 676162306a36Sopenharmony_ci } 676262306a36Sopenharmony_ci 676362306a36Sopenharmony_ci freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); 676462306a36Sopenharmony_ci 676562306a36Sopenharmony_ciunlock: 676662306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 676762306a36Sopenharmony_ci srcu_read_unlock(&kvm->srcu, idx); 676862306a36Sopenharmony_ci 676962306a36Sopenharmony_ci /* 677062306a36Sopenharmony_ci * unfair on small ones 677162306a36Sopenharmony_ci * per-vm shrinkers cry out 677262306a36Sopenharmony_ci * sadness comes quickly 677362306a36Sopenharmony_ci */ 677462306a36Sopenharmony_ci list_move_tail(&kvm->vm_list, &vm_list); 677562306a36Sopenharmony_ci break; 677662306a36Sopenharmony_ci } 677762306a36Sopenharmony_ci 677862306a36Sopenharmony_ci mutex_unlock(&kvm_lock); 677962306a36Sopenharmony_ci return freed; 678062306a36Sopenharmony_ci} 678162306a36Sopenharmony_ci 678262306a36Sopenharmony_cistatic unsigned long mmu_shrink_count(struct shrinker *shrink, 678362306a36Sopenharmony_ci struct shrink_control *sc) 678462306a36Sopenharmony_ci{ 678562306a36Sopenharmony_ci return percpu_counter_read_positive(&kvm_total_used_mmu_pages); 678662306a36Sopenharmony_ci}
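/*
 * Note the asymmetry between the two callbacks above: mmu_shrink_count()
 * reports the global number of used MMU pages across all VMs
 * (kvm_total_used_mmu_pages), while mmu_shrink_scan() only reclaims from a
 * single VM per invocation and then rotates that VM to the tail of vm_list.
 * The inflated "seeks" value below makes the shrinker core correspondingly
 * reluctant to target KVM's MMU pages in the first place.
 */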
678762306a36Sopenharmony_ci 678862306a36Sopenharmony_cistatic struct shrinker mmu_shrinker = { 678962306a36Sopenharmony_ci .count_objects = mmu_shrink_count, 679062306a36Sopenharmony_ci .scan_objects = mmu_shrink_scan, 679162306a36Sopenharmony_ci .seeks = DEFAULT_SEEKS * 10, 679262306a36Sopenharmony_ci}; 679362306a36Sopenharmony_ci 679462306a36Sopenharmony_cistatic void mmu_destroy_caches(void) 679562306a36Sopenharmony_ci{ 679662306a36Sopenharmony_ci kmem_cache_destroy(pte_list_desc_cache); 679762306a36Sopenharmony_ci kmem_cache_destroy(mmu_page_header_cache); 679862306a36Sopenharmony_ci} 679962306a36Sopenharmony_ci 680062306a36Sopenharmony_cistatic int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) 680162306a36Sopenharmony_ci{ 680262306a36Sopenharmony_ci if (nx_hugepage_mitigation_hard_disabled) 680362306a36Sopenharmony_ci return sysfs_emit(buffer, "never\n"); 680462306a36Sopenharmony_ci 680562306a36Sopenharmony_ci return param_get_bool(buffer, kp); 680662306a36Sopenharmony_ci} 680762306a36Sopenharmony_ci 680862306a36Sopenharmony_cistatic bool get_nx_auto_mode(void) 680962306a36Sopenharmony_ci{ 681062306a36Sopenharmony_ci /* Return true when CPU has the bug, and mitigations are ON */ 681162306a36Sopenharmony_ci return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); 681262306a36Sopenharmony_ci} 681362306a36Sopenharmony_ci 681462306a36Sopenharmony_cistatic void __set_nx_huge_pages(bool val) 681562306a36Sopenharmony_ci{ 681662306a36Sopenharmony_ci nx_huge_pages = itlb_multihit_kvm_mitigation = val; 681762306a36Sopenharmony_ci} 681862306a36Sopenharmony_ci 681962306a36Sopenharmony_cistatic int set_nx_huge_pages(const char *val, const struct kernel_param *kp) 682062306a36Sopenharmony_ci{ 682162306a36Sopenharmony_ci bool old_val = nx_huge_pages; 682262306a36Sopenharmony_ci bool new_val; 682362306a36Sopenharmony_ci 682462306a36Sopenharmony_ci if (nx_hugepage_mitigation_hard_disabled) 682562306a36Sopenharmony_ci return -EPERM; 682662306a36Sopenharmony_ci 682762306a36Sopenharmony_ci /* In "auto" mode deploy workaround only if CPU has the bug. 
*/ 682862306a36Sopenharmony_ci if (sysfs_streq(val, "off")) { 682962306a36Sopenharmony_ci new_val = 0; 683062306a36Sopenharmony_ci } else if (sysfs_streq(val, "force")) { 683162306a36Sopenharmony_ci new_val = 1; 683262306a36Sopenharmony_ci } else if (sysfs_streq(val, "auto")) { 683362306a36Sopenharmony_ci new_val = get_nx_auto_mode(); 683462306a36Sopenharmony_ci } else if (sysfs_streq(val, "never")) { 683562306a36Sopenharmony_ci new_val = 0; 683662306a36Sopenharmony_ci 683762306a36Sopenharmony_ci mutex_lock(&kvm_lock); 683862306a36Sopenharmony_ci if (!list_empty(&vm_list)) { 683962306a36Sopenharmony_ci mutex_unlock(&kvm_lock); 684062306a36Sopenharmony_ci return -EBUSY; 684162306a36Sopenharmony_ci } 684262306a36Sopenharmony_ci nx_hugepage_mitigation_hard_disabled = true; 684362306a36Sopenharmony_ci mutex_unlock(&kvm_lock); 684462306a36Sopenharmony_ci } else if (kstrtobool(val, &new_val) < 0) { 684562306a36Sopenharmony_ci return -EINVAL; 684662306a36Sopenharmony_ci } 684762306a36Sopenharmony_ci 684862306a36Sopenharmony_ci __set_nx_huge_pages(new_val); 684962306a36Sopenharmony_ci 685062306a36Sopenharmony_ci if (new_val != old_val) { 685162306a36Sopenharmony_ci struct kvm *kvm; 685262306a36Sopenharmony_ci 685362306a36Sopenharmony_ci mutex_lock(&kvm_lock); 685462306a36Sopenharmony_ci 685562306a36Sopenharmony_ci list_for_each_entry(kvm, &vm_list, vm_list) { 685662306a36Sopenharmony_ci mutex_lock(&kvm->slots_lock); 685762306a36Sopenharmony_ci kvm_mmu_zap_all_fast(kvm); 685862306a36Sopenharmony_ci mutex_unlock(&kvm->slots_lock); 685962306a36Sopenharmony_ci 686062306a36Sopenharmony_ci wake_up_process(kvm->arch.nx_huge_page_recovery_thread); 686162306a36Sopenharmony_ci } 686262306a36Sopenharmony_ci mutex_unlock(&kvm_lock); 686362306a36Sopenharmony_ci } 686462306a36Sopenharmony_ci 686562306a36Sopenharmony_ci return 0; 686662306a36Sopenharmony_ci} 686762306a36Sopenharmony_ci 686862306a36Sopenharmony_ci/* 686962306a36Sopenharmony_ci * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as 687062306a36Sopenharmony_ci * its default value of -1 is technically undefined behavior for a boolean. 687162306a36Sopenharmony_ci * Forward the module init call to SPTE code so that it too can handle module 687262306a36Sopenharmony_ci * params that need to be resolved/snapshot. 687362306a36Sopenharmony_ci */ 687462306a36Sopenharmony_civoid __init kvm_mmu_x86_module_init(void) 687562306a36Sopenharmony_ci{ 687662306a36Sopenharmony_ci if (nx_huge_pages == -1) 687762306a36Sopenharmony_ci __set_nx_huge_pages(get_nx_auto_mode()); 687862306a36Sopenharmony_ci 687962306a36Sopenharmony_ci /* 688062306a36Sopenharmony_ci * Snapshot userspace's desire to enable the TDP MMU. Whether or not the 688162306a36Sopenharmony_ci * TDP MMU is actually enabled is determined in kvm_configure_mmu() 688262306a36Sopenharmony_ci * when the vendor module is loaded. 688362306a36Sopenharmony_ci */ 688462306a36Sopenharmony_ci tdp_mmu_allowed = tdp_mmu_enabled; 688562306a36Sopenharmony_ci 688662306a36Sopenharmony_ci kvm_mmu_spte_module_init(); 688762306a36Sopenharmony_ci} 688862306a36Sopenharmony_ci 688962306a36Sopenharmony_ci/* 689062306a36Sopenharmony_ci * The bulk of the MMU initialization is deferred until the vendor module is 689162306a36Sopenharmony_ci * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need 689262306a36Sopenharmony_ci * to be reset when a potentially different vendor module is loaded. 
689362306a36Sopenharmony_ci */ 689462306a36Sopenharmony_ciint kvm_mmu_vendor_module_init(void) 689562306a36Sopenharmony_ci{ 689662306a36Sopenharmony_ci int ret = -ENOMEM; 689762306a36Sopenharmony_ci 689862306a36Sopenharmony_ci /* 689962306a36Sopenharmony_ci * MMU roles use union aliasing which is, generally speaking, an 690062306a36Sopenharmony_ci * undefined behavior. However, we supposedly know how compilers behave 690162306a36Sopenharmony_ci * and the current status quo is unlikely to change. Guardians below are 690262306a36Sopenharmony_ci * supposed to let us know if the assumption becomes false. 690362306a36Sopenharmony_ci */ 690462306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32)); 690562306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32)); 690662306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64)); 690762306a36Sopenharmony_ci 690862306a36Sopenharmony_ci kvm_mmu_reset_all_pte_masks(); 690962306a36Sopenharmony_ci 691062306a36Sopenharmony_ci pte_list_desc_cache = kmem_cache_create("pte_list_desc", 691162306a36Sopenharmony_ci sizeof(struct pte_list_desc), 691262306a36Sopenharmony_ci 0, SLAB_ACCOUNT, NULL); 691362306a36Sopenharmony_ci if (!pte_list_desc_cache) 691462306a36Sopenharmony_ci goto out; 691562306a36Sopenharmony_ci 691662306a36Sopenharmony_ci mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 691762306a36Sopenharmony_ci sizeof(struct kvm_mmu_page), 691862306a36Sopenharmony_ci 0, SLAB_ACCOUNT, NULL); 691962306a36Sopenharmony_ci if (!mmu_page_header_cache) 692062306a36Sopenharmony_ci goto out; 692162306a36Sopenharmony_ci 692262306a36Sopenharmony_ci if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) 692362306a36Sopenharmony_ci goto out; 692462306a36Sopenharmony_ci 692562306a36Sopenharmony_ci ret = register_shrinker(&mmu_shrinker, "x86-mmu"); 692662306a36Sopenharmony_ci if (ret) 692762306a36Sopenharmony_ci goto out_shrinker; 692862306a36Sopenharmony_ci 692962306a36Sopenharmony_ci return 0; 693062306a36Sopenharmony_ci 693162306a36Sopenharmony_ciout_shrinker: 693262306a36Sopenharmony_ci percpu_counter_destroy(&kvm_total_used_mmu_pages); 693362306a36Sopenharmony_ciout: 693462306a36Sopenharmony_ci mmu_destroy_caches(); 693562306a36Sopenharmony_ci return ret; 693662306a36Sopenharmony_ci} 693762306a36Sopenharmony_ci 693862306a36Sopenharmony_civoid kvm_mmu_destroy(struct kvm_vcpu *vcpu) 693962306a36Sopenharmony_ci{ 694062306a36Sopenharmony_ci kvm_mmu_unload(vcpu); 694162306a36Sopenharmony_ci free_mmu_pages(&vcpu->arch.root_mmu); 694262306a36Sopenharmony_ci free_mmu_pages(&vcpu->arch.guest_mmu); 694362306a36Sopenharmony_ci mmu_free_memory_caches(vcpu); 694462306a36Sopenharmony_ci} 694562306a36Sopenharmony_ci 694662306a36Sopenharmony_civoid kvm_mmu_vendor_module_exit(void) 694762306a36Sopenharmony_ci{ 694862306a36Sopenharmony_ci mmu_destroy_caches(); 694962306a36Sopenharmony_ci percpu_counter_destroy(&kvm_total_used_mmu_pages); 695062306a36Sopenharmony_ci unregister_shrinker(&mmu_shrinker); 695162306a36Sopenharmony_ci} 695262306a36Sopenharmony_ci 695362306a36Sopenharmony_ci/* 695462306a36Sopenharmony_ci * Calculate the effective recovery period, accounting for '0' meaning "let KVM 695562306a36Sopenharmony_ci * select a halving time of 1 hour". Returns true if recovery is enabled. 
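 * For example, with nx_huge_pages_recovery_ratio=60 and an unset (0) period,
 * the effective period is 60 * 60 * 1000 / 60 = 60000 ms, i.e. one recovery
 * pass per minute, each zapping roughly 1/60th of the outstanding NX-split
 * pages.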
695662306a36Sopenharmony_ci */ 695762306a36Sopenharmony_cistatic bool calc_nx_huge_pages_recovery_period(uint *period) 695862306a36Sopenharmony_ci{ 695962306a36Sopenharmony_ci /* 696062306a36Sopenharmony_ci * Use READ_ONCE to get the params, this may be called outside of the 696162306a36Sopenharmony_ci * param setters, e.g. by the kthread to compute its next timeout. 696262306a36Sopenharmony_ci */ 696362306a36Sopenharmony_ci bool enabled = READ_ONCE(nx_huge_pages); 696462306a36Sopenharmony_ci uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); 696562306a36Sopenharmony_ci 696662306a36Sopenharmony_ci if (!enabled || !ratio) 696762306a36Sopenharmony_ci return false; 696862306a36Sopenharmony_ci 696962306a36Sopenharmony_ci *period = READ_ONCE(nx_huge_pages_recovery_period_ms); 697062306a36Sopenharmony_ci if (!*period) { 697162306a36Sopenharmony_ci /* Make sure the period is not less than one second. */ 697262306a36Sopenharmony_ci ratio = min(ratio, 3600u); 697362306a36Sopenharmony_ci *period = 60 * 60 * 1000 / ratio; 697462306a36Sopenharmony_ci } 697562306a36Sopenharmony_ci return true; 697662306a36Sopenharmony_ci} 697762306a36Sopenharmony_ci 697862306a36Sopenharmony_cistatic int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) 697962306a36Sopenharmony_ci{ 698062306a36Sopenharmony_ci bool was_recovery_enabled, is_recovery_enabled; 698162306a36Sopenharmony_ci uint old_period, new_period; 698262306a36Sopenharmony_ci int err; 698362306a36Sopenharmony_ci 698462306a36Sopenharmony_ci if (nx_hugepage_mitigation_hard_disabled) 698562306a36Sopenharmony_ci return -EPERM; 698662306a36Sopenharmony_ci 698762306a36Sopenharmony_ci was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); 698862306a36Sopenharmony_ci 698962306a36Sopenharmony_ci err = param_set_uint(val, kp); 699062306a36Sopenharmony_ci if (err) 699162306a36Sopenharmony_ci return err; 699262306a36Sopenharmony_ci 699362306a36Sopenharmony_ci is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); 699462306a36Sopenharmony_ci 699562306a36Sopenharmony_ci if (is_recovery_enabled && 699662306a36Sopenharmony_ci (!was_recovery_enabled || old_period > new_period)) { 699762306a36Sopenharmony_ci struct kvm *kvm; 699862306a36Sopenharmony_ci 699962306a36Sopenharmony_ci mutex_lock(&kvm_lock); 700062306a36Sopenharmony_ci 700162306a36Sopenharmony_ci list_for_each_entry(kvm, &vm_list, vm_list) 700262306a36Sopenharmony_ci wake_up_process(kvm->arch.nx_huge_page_recovery_thread); 700362306a36Sopenharmony_ci 700462306a36Sopenharmony_ci mutex_unlock(&kvm_lock); 700562306a36Sopenharmony_ci } 700662306a36Sopenharmony_ci 700762306a36Sopenharmony_ci return err; 700862306a36Sopenharmony_ci} 700962306a36Sopenharmony_ci 701062306a36Sopenharmony_cistatic void kvm_recover_nx_huge_pages(struct kvm *kvm) 701162306a36Sopenharmony_ci{ 701262306a36Sopenharmony_ci unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; 701362306a36Sopenharmony_ci struct kvm_memory_slot *slot; 701462306a36Sopenharmony_ci int rcu_idx; 701562306a36Sopenharmony_ci struct kvm_mmu_page *sp; 701662306a36Sopenharmony_ci unsigned int ratio; 701762306a36Sopenharmony_ci LIST_HEAD(invalid_list); 701862306a36Sopenharmony_ci bool flush = false; 701962306a36Sopenharmony_ci ulong to_zap; 702062306a36Sopenharmony_ci 702162306a36Sopenharmony_ci rcu_idx = srcu_read_lock(&kvm->srcu); 702262306a36Sopenharmony_ci write_lock(&kvm->mmu_lock); 702362306a36Sopenharmony_ci 702462306a36Sopenharmony_ci /* 702562306a36Sopenharmony_ci * Zapping TDP MMU shadow 
pages, including the remote TLB flush, must 702662306a36Sopenharmony_ci * be done under RCU protection, because the pages are freed via RCU 702762306a36Sopenharmony_ci * callback. 702862306a36Sopenharmony_ci */ 702962306a36Sopenharmony_ci rcu_read_lock(); 703062306a36Sopenharmony_ci 703162306a36Sopenharmony_ci ratio = READ_ONCE(nx_huge_pages_recovery_ratio); 703262306a36Sopenharmony_ci to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; 703362306a36Sopenharmony_ci for ( ; to_zap; --to_zap) { 703462306a36Sopenharmony_ci if (list_empty(&kvm->arch.possible_nx_huge_pages)) 703562306a36Sopenharmony_ci break; 703662306a36Sopenharmony_ci 703762306a36Sopenharmony_ci /* 703862306a36Sopenharmony_ci * We use a separate list instead of just using active_mmu_pages 703962306a36Sopenharmony_ci * because the number of shadow pages that can be replaced with an 704062306a36Sopenharmony_ci * NX huge page is expected to be relatively small compared to 704162306a36Sopenharmony_ci * the total number of shadow pages. And because the TDP MMU 704262306a36Sopenharmony_ci * doesn't use active_mmu_pages. 704362306a36Sopenharmony_ci */ 704462306a36Sopenharmony_ci sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, 704562306a36Sopenharmony_ci struct kvm_mmu_page, 704662306a36Sopenharmony_ci possible_nx_huge_page_link); 704762306a36Sopenharmony_ci WARN_ON_ONCE(!sp->nx_huge_page_disallowed); 704862306a36Sopenharmony_ci WARN_ON_ONCE(!sp->role.direct); 704962306a36Sopenharmony_ci 705062306a36Sopenharmony_ci /* 705162306a36Sopenharmony_ci * Unaccount and do not attempt to recover any NX Huge Pages 705262306a36Sopenharmony_ci * that are being dirty tracked, as they would just be faulted 705362306a36Sopenharmony_ci * back in as 4KiB pages. The NX Huge Pages in this slot will be 705462306a36Sopenharmony_ci * recovered, along with all the other huge pages in the slot, 705562306a36Sopenharmony_ci * when dirty logging is disabled. 705662306a36Sopenharmony_ci * 705762306a36Sopenharmony_ci * Since gfn_to_memslot() is relatively expensive, it helps to 705862306a36Sopenharmony_ci * skip it if the test cannot possibly return true. On the 705962306a36Sopenharmony_ci * other hand, if any memslot has logging enabled, chances are 706062306a36Sopenharmony_ci * good that all of them do, in which case unaccount_nx_huge_page() 706162306a36Sopenharmony_ci * is much cheaper than zapping the page. 706262306a36Sopenharmony_ci * 706362306a36Sopenharmony_ci * If a memslot update is in progress, reading an incorrect value 706462306a36Sopenharmony_ci * of kvm->nr_memslots_dirty_logging is not a problem: if it is 706562306a36Sopenharmony_ci * becoming zero, gfn_to_memslot() will be done unnecessarily; if 706662306a36Sopenharmony_ci * it is becoming nonzero, the page will be zapped unnecessarily. 706762306a36Sopenharmony_ci * Either way, this only affects efficiency in racy situations, 706862306a36Sopenharmony_ci * and not correctness.
706962306a36Sopenharmony_ci */ 707062306a36Sopenharmony_ci slot = NULL; 707162306a36Sopenharmony_ci if (atomic_read(&kvm->nr_memslots_dirty_logging)) { 707262306a36Sopenharmony_ci struct kvm_memslots *slots; 707362306a36Sopenharmony_ci 707462306a36Sopenharmony_ci slots = kvm_memslots_for_spte_role(kvm, sp->role); 707562306a36Sopenharmony_ci slot = __gfn_to_memslot(slots, sp->gfn); 707662306a36Sopenharmony_ci WARN_ON_ONCE(!slot); 707762306a36Sopenharmony_ci } 707862306a36Sopenharmony_ci 707962306a36Sopenharmony_ci if (slot && kvm_slot_dirty_track_enabled(slot)) 708062306a36Sopenharmony_ci unaccount_nx_huge_page(kvm, sp); 708162306a36Sopenharmony_ci else if (is_tdp_mmu_page(sp)) 708262306a36Sopenharmony_ci flush |= kvm_tdp_mmu_zap_sp(kvm, sp); 708362306a36Sopenharmony_ci else 708462306a36Sopenharmony_ci kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 708562306a36Sopenharmony_ci WARN_ON_ONCE(sp->nx_huge_page_disallowed); 708662306a36Sopenharmony_ci 708762306a36Sopenharmony_ci if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 708862306a36Sopenharmony_ci kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); 708962306a36Sopenharmony_ci rcu_read_unlock(); 709062306a36Sopenharmony_ci 709162306a36Sopenharmony_ci cond_resched_rwlock_write(&kvm->mmu_lock); 709262306a36Sopenharmony_ci flush = false; 709362306a36Sopenharmony_ci 709462306a36Sopenharmony_ci rcu_read_lock(); 709562306a36Sopenharmony_ci } 709662306a36Sopenharmony_ci } 709762306a36Sopenharmony_ci kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); 709862306a36Sopenharmony_ci 709962306a36Sopenharmony_ci rcu_read_unlock(); 710062306a36Sopenharmony_ci 710162306a36Sopenharmony_ci write_unlock(&kvm->mmu_lock); 710262306a36Sopenharmony_ci srcu_read_unlock(&kvm->srcu, rcu_idx); 710362306a36Sopenharmony_ci} 710462306a36Sopenharmony_ci 710562306a36Sopenharmony_cistatic long get_nx_huge_page_recovery_timeout(u64 start_time) 710662306a36Sopenharmony_ci{ 710762306a36Sopenharmony_ci bool enabled; 710862306a36Sopenharmony_ci uint period; 710962306a36Sopenharmony_ci 711062306a36Sopenharmony_ci enabled = calc_nx_huge_pages_recovery_period(&period); 711162306a36Sopenharmony_ci 711262306a36Sopenharmony_ci return enabled ? 
start_time + msecs_to_jiffies(period) - get_jiffies_64() 711362306a36Sopenharmony_ci : MAX_SCHEDULE_TIMEOUT; 711462306a36Sopenharmony_ci} 711562306a36Sopenharmony_ci 711662306a36Sopenharmony_cistatic int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) 711762306a36Sopenharmony_ci{ 711862306a36Sopenharmony_ci u64 start_time; 711962306a36Sopenharmony_ci long remaining_time; 712062306a36Sopenharmony_ci 712162306a36Sopenharmony_ci while (true) { 712262306a36Sopenharmony_ci start_time = get_jiffies_64(); 712362306a36Sopenharmony_ci remaining_time = get_nx_huge_page_recovery_timeout(start_time); 712462306a36Sopenharmony_ci 712562306a36Sopenharmony_ci set_current_state(TASK_INTERRUPTIBLE); 712662306a36Sopenharmony_ci while (!kthread_should_stop() && remaining_time > 0) { 712762306a36Sopenharmony_ci schedule_timeout(remaining_time); 712862306a36Sopenharmony_ci remaining_time = get_nx_huge_page_recovery_timeout(start_time); 712962306a36Sopenharmony_ci set_current_state(TASK_INTERRUPTIBLE); 713062306a36Sopenharmony_ci } 713162306a36Sopenharmony_ci 713262306a36Sopenharmony_ci set_current_state(TASK_RUNNING); 713362306a36Sopenharmony_ci 713462306a36Sopenharmony_ci if (kthread_should_stop()) 713562306a36Sopenharmony_ci return 0; 713662306a36Sopenharmony_ci 713762306a36Sopenharmony_ci kvm_recover_nx_huge_pages(kvm); 713862306a36Sopenharmony_ci } 713962306a36Sopenharmony_ci} 714062306a36Sopenharmony_ci 714162306a36Sopenharmony_ciint kvm_mmu_post_init_vm(struct kvm *kvm) 714262306a36Sopenharmony_ci{ 714362306a36Sopenharmony_ci int err; 714462306a36Sopenharmony_ci 714562306a36Sopenharmony_ci if (nx_hugepage_mitigation_hard_disabled) 714662306a36Sopenharmony_ci return 0; 714762306a36Sopenharmony_ci 714862306a36Sopenharmony_ci err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, 714962306a36Sopenharmony_ci "kvm-nx-lpage-recovery", 715062306a36Sopenharmony_ci &kvm->arch.nx_huge_page_recovery_thread); 715162306a36Sopenharmony_ci if (!err) 715262306a36Sopenharmony_ci kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); 715362306a36Sopenharmony_ci 715462306a36Sopenharmony_ci return err; 715562306a36Sopenharmony_ci} 715662306a36Sopenharmony_ci 715762306a36Sopenharmony_civoid kvm_mmu_pre_destroy_vm(struct kvm *kvm) 715862306a36Sopenharmony_ci{ 715962306a36Sopenharmony_ci if (kvm->arch.nx_huge_page_recovery_thread) 716062306a36Sopenharmony_ci kthread_stop(kvm->arch.nx_huge_page_recovery_thread); 716162306a36Sopenharmony_ci}
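/*
 * Illustrative, compiled-out sketch (not part of KVM's control flow): shows
 * how the recovery params resolve into the worker's behaviour by reusing the
 * same helpers as kvm_recover_nx_huge_pages() and
 * get_nx_huge_page_recovery_timeout() above. The function name and the
 * pr_info() are purely for this example.
 */
#if 0
static void example_nx_recovery_cadence(struct kvm *kvm)
{
	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
	uint period_ms, ratio;
	ulong to_zap;

	/* Recovery disabled (nx_huge_pages off or ratio == 0): worker sleeps. */
	if (!calc_nx_huge_pages_recovery_period(&period_ms))
		return;

	/* Same computation the recovery worker performs on each wakeup. */
	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;

	pr_info("would zap up to %lu NX-split pages every %u ms\n",
		to_zap, period_ms);
}
#endif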