/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
	#define CMPXCHG "cmpxchgq"
	#else
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
	#define CMPXCHG "cmpxchgl"
#elif PTTYPE == PTTYPE_EPT
	#define pt_element_t u64
	#define guest_walker guest_walkerEPT
	#define FNAME(name) ept_##name
	#define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#define PT_GUEST_DIRTY_SHIFT 9
	#define PT_GUEST_ACCESSED_SHIFT 8
	#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
	#ifdef CONFIG_X86_64
	#define CMPXCHG "cmpxchgq"
	#endif
	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
	#error Invalid PTTYPE value
#endif

#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	bool pte_writable[PT_MAX_FULL_LEVELS];
	unsigned int pt_access[PT_MAX_FULL_LEVELS];
	unsigned int pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
					     unsigned gpte)
{
	unsigned mask;

	/* dirty bit is not supported, so no need to track it */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return;

	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	mask = (unsigned)~ACC_WRITE_MASK;
	/* Allow write access to dirty gptes */
	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
		PT_WRITABLE_MASK;
	*access &= mask;
}

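/*
 * A guest pte is "present" if its P bit is set.  EPT entries have no P bit;
 * an EPT gpte is considered present when any of the R/W/X permission bits
 * (2:0) is set.
 */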
static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
	return pte & PT_PRESENT_MASK;
#else
	return pte & 7;
#endif
}

static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
{
#if PTTYPE != PTTYPE_EPT
	return false;
#else
	return __is_bad_mt_xwr(rsvd_check, gpte);
#endif
}

static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
{
	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
	       FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
}

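/*
 * Atomically update a guest pte in host user memory, used to set the
 * accessed/dirty bits.  Returns 0 if orig_pte was replaced by new_pte,
 * a positive value if the pte changed under us (so the walk must be
 * retried), or -EFAULT if the user page could not be accessed.
 */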
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       pt_element_t __user *ptep_user, unsigned index,
			       pt_element_t orig_pte, pt_element_t new_pte)
{
	int r = -EFAULT;

	if (!user_access_begin(ptep_user, sizeof(pt_element_t)))
		return -EFAULT;

#ifdef CMPXCHG
	asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n"
		     "mov $0, %[r]\n"
		     "setnz %b[r]\n"
		     "2:"
		     _ASM_EXTABLE_UA(1b, 2b)
		     : [ptr] "+m" (*ptep_user),
		       [old] "+a" (orig_pte),
		       [r] "+q" (r)
		     : [new] "r" (new_pte)
		     : "memory");
#else
	asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n"
		     "movl $0, %[r]\n"
		     "jz 2f\n"
		     "incl %[r]\n"
		     "2:"
		     _ASM_EXTABLE_UA(1b, 2b)
		     : [ptr] "+m" (*ptep_user),
		       [old] "+A" (orig_pte),
		       [r] "+rm" (r)
		     : [new_lo] "b" ((u32)new_pte),
		       [new_hi] "c" ((u32)(new_pte >> 32))
		     : "memory");
#endif

	user_access_end();
	return r;
}

static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *spte,
				  u64 gpte)
{
	if (!FNAME(is_present_gpte)(gpte))
		goto no_present;

	/* if accessed bit is not supported prefetch non accessed gpte */
	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
	    !(gpte & PT_GUEST_ACCESSED_MASK))
		goto no_present;

	if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}

/*
 * For PTTYPE_EPT, a page table can be executable but not readable
 * on supported processors. Therefore, set_spte does not automatically
 * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
 * to signify readability since it isn't used in the EPT case.
 */
static inline unsigned FNAME(gpte_access)(u64 gpte)
{
	unsigned access;
#if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
	access ^= (gpte >> PT64_NX_SHIFT);
#endif

	return access;
}

static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     gpa_t addr, int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	/* dirty/accessed bits are not supported, so no need to update them */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return 0;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_GUEST_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault &&
		    !(pte & PT_GUEST_DIRTY_MASK)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
			if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
				return -EINVAL;
#endif
			pte |= PT_GUEST_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		/*
		 * If the slot is read-only, simply do not process the accessed
		 * and dirty bits.  This is the correct thing to do if the slot
		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
		 * are only supported if the accessed and dirty bits are already
		 * set in the ROM (so that MMIO writes are never needed).
		 *
		 * Note that NPT does not allow this at all and faults, since
		 * it always wants nested page table entries for the guest
		 * page tables to be writable.  And EPT works but will simply
		 * overwrite the read-only memory to set the accessed and dirty
		 * bits.
		 */
		if (unlikely(!walker->pte_writable[level - 1]))
			continue;

		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
		if (ret)
			return ret;

		kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}

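/*
 * Protection keys are only defined for 64-bit guest ptes; 32-bit and EPT
 * gptes carry no protection-key bits, so this returns 0 for them.
 */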
static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned pkeys = 0;
#if PTTYPE == 64
	pte_t pte = {.pte = gpte};

	pkeys = pte_flags_pkey(pte_flags(pte));
#endif
	return pkeys;
}

/*
 * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gpa_t addr, u32 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	u64 pt_access, pte_access;
	unsigned index, accessed_dirty, pte_pkey;
	unsigned nested_access;
	gpa_t pte_gpa;
	bool have_ad;
	int offset;
	u64 walk_nx_mask = 0;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->root_level;
	pte           = mmu->get_guest_pgd(vcpu);
	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);

#if PTTYPE == 64
	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!FNAME(is_present_gpte)(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;
	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));

	/*
	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
	 * by the MOV to CR instruction are treated as reads and do not cause the
	 * processor to set the dirty flag in any EPT paging-structure entry.
	 */
	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;

	pte_access = ~0;
	++walker->level;

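	/*
	 * Walk from the root down to the leaf gpte, recording at each level
	 * the table gfn, the gpa and host user pointer of the gpte, and the
	 * access bits accumulated so far.
	 */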
	do {
		unsigned long host_addr;

		pt_access = pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);
		table_gfn = gpte_to_gfn(pte);
		offset    = index * sizeof(pt_element_t);
		pte_gpa   = gfn_to_gpa(table_gfn) + offset;

		BUG_ON(walker->level < 1);
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
					      nested_access,
					      &walker->fault);

		/*
		 * FIXME: This can happen if emulation (for an INS/OUTS
		 * instruction) triggers a nested page fault.  The exit
		 * qualification / exit info field will incorrectly have
		 * "guest page access" as the nested page fault's cause,
		 * instead of "guest page structure access".  To fix this,
		 * the x86_exception struct should be augmented with enough
		 * information to fix the exit_qualification or exit_info_1
		 * fields.
		 */
		if (unlikely(real_gpa == UNMAPPED_GVA))
			return 0;

		host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
					    &walker->pte_writable[walker->level - 1]);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__get_user(pte, ptep_user)))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		/*
		 * Inverting the NX bit lets us AND it like the other
		 * permission bits.
		 */
		pte_access = pt_access & (pte ^ walk_nx_mask);

		if (unlikely(!FNAME(is_present_gpte)(pte)))
			goto error;

		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		walker->ptes[walker->level - 1] = pte;

		/* Convert to ACC_*_MASK flags for struct guest_walker.  */
		walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
	} while (!is_last_gpte(mmu, walker->level, pte));

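	/*
	 * The walk has reached the leaf gpte: derive the protection key,
	 * check the accumulated permissions against the requested access,
	 * and translate the leaf gpte (plus the in-page offset) into the
	 * final gfn.
	 */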
	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;

	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
	if (unlikely(errcode))
		goto error;

	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

	if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
	if (real_gpa == UNMAPPED_GVA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty.
		 * For modes without A/D bit support, accessed_dirty is
		 * always clear.
		 */
		accessed_dirty &= pte >>
			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
							addr, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, walker->pte_access,
		 walker->pt_access[walker->level - 1]);
	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (mmu->nx || mmu->mmu_role.ext.cr4_smep))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;

#if PTTYPE == PTTYPE_EPT
	/*
	 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
	 * misconfiguration needs to be injected.  The detection is
	 * done by is_rsvd_bits_set() above.
	 *
	 * We set up the value of exit_qualification to inject:
	 * [2:0] - Derive from the access bits. The exit_qualification might be
	 *         out of date if it is serving an EPT misconfiguration.
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [7:8] - Derived from [7:8] of real exit_qualification
	 *
	 * The other bits are set to 0.
	 */
	if (!(errcode & PFERR_RSVD_MASK)) {
		vcpu->arch.exit_qualification &= 0x180;
		if (write_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
		if (user_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
		if (fetch_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
		vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
	}
#endif
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
					access);
}

#if PTTYPE != PTTYPE_EPT
static int FNAME(walk_addr_nested)(struct guest_walker *walker,
				   struct kvm_vcpu *vcpu, gva_t addr,
				   u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
					addr, access);
}
#endif

static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
	unsigned pte_access;
	gfn_t gfn;
	kvm_pfn_t pfn;

	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
		return false;

	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);

	gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
			no_dirty_log && (pte_access & ACC_WRITE_MASK));
	if (is_error_pfn(pfn))
		return false;

	/*
	 * we call mmu_set_spte() with host_writable = true because
	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
	 */
	mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
		     true, true);

	kvm_release_pfn_clean(pfn);
	return true;
}

static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte = *(const pt_element_t *)pte;

	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
}

static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PG_LEVEL_4K) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

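/*
 * Prefetch the gptes in the PTE_PREFETCH_NUM-sized window around the one
 * that just faulted and install sptes for them.  Only 4K shadow pages are
 * handled here (direct pages go through __direct_pte_prefetch()),
 * already-present sptes are skipped, and the loop stops at the first gpte
 * that cannot be prefetched.
 */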
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = sptep_to_sp(sptep);

	if (sp->role.level > PG_LEVEL_4K)
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
			break;
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation and return 1 to indicate this case.
 */
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
			struct guest_walker *gw, u32 error_code,
			int max_level, kvm_pfn_t pfn, bool map_writable,
			bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write_fault = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu_page *sp = NULL;
	struct kvm_shadow_walk_iterator it;
	unsigned int direct_access, access;
	int top_level, level, req_level, ret;
	gfn_t base_gfn = gw->gfn;

	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu->root_level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		goto out_gpte_changed;

	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);
		drop_large_spte(vcpu, it.sptep);

		sp = NULL;
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			access = gw->pt_access[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
					      false, access);
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp)
			link_shadow_page(vcpu, it.sptep, sp);
	}

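	/*
	 * All indirect levels above the guest's own mapping level are now
	 * shadowed; pick the final (possibly huge) mapping level and fill
	 * in the remaining levels with direct sptes.
	 */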
	level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(addr, gw->level, pfn);

	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
		clear_sp_write_flooding_count(it.sptep);

		/*
		 * We cannot overwrite existing page tables with an NX
		 * large page, as the leaf could be executable.
		 */
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level,
						   &pfn, &level);

		base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
		if (it.level == level)
			break;

		validate_direct_spte(vcpu, it.sptep, direct_access);

		drop_large_spte(vcpu, it.sptep);

		if (!is_shadow_present_pte(*it.sptep)) {
			sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
					      it.level - 1, true, direct_access);
			link_shadow_page(vcpu, it.sptep, sp);
			if (huge_page_disallowed && req_level >= it.level)
				account_huge_nx_page(vcpu->kvm, sp);
		}
	}

	ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
			   it.level, base_gfn, pfn, prefault, map_writable);
	if (ret == RET_PF_SPURIOUS)
		return ret;

	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
	++vcpu->stat.pf_fixed;
	return ret;

out_gpte_changed:
	return RET_PF_RETRY;
}

/*
 * Check whether the mapped gfn can write its own page table under the
 * current mapping.
 *
 * This is a helper for FNAME(page_fault).  When the guest uses a large
 * page to map a writable gfn that is itself used as a page table, KVM
 * must be forced to map it with a small page: the new shadow page that
 * is created when KVM shadows that page table would otherwise prevent
 * the large mapping anyway.  Doing this early avoids unnecessary #PFs
 * and emulation.
 *
 * @write_fault_to_shadow_pgtable will return true if the faulting gfn is
 * currently used as its own page table.
 *
 * Note: the PDPT page table is not checked for PAE-32 bit guests.  That
 * is fine because the PDPT is always shadowed, which means a large page
 * can never be used to map the gfn holding the PDPT.
 */
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
			      struct guest_walker *walker, bool user_fault,
			      bool *write_fault_to_shadow_pgtable)
{
	int level;
	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
	bool self_changed = false;

	if (!(walker->pte_access & ACC_WRITE_MASK ||
	      (!is_write_protection(vcpu) && !user_fault)))
		return false;

	for (level = walker->level; level <= walker->max_level; level++) {
		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];

		self_changed |= !(gfn & mask);
		*write_fault_to_shadow_pgtable |= !gfn;
	}

	return self_changed;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *           - there is no shadow pte for the guest pte
 *           - write access through a shadow pte marked read only so that we can set
 *             the dirty bit
 *           - write access to a shadow pte marked read only so we can update the page
 *             dirty bitmap, when userspace requests it
 *           - mmio access; in this case we will never install a present shadow pte
 *           - normal guest page fault due to the guest pte marked not present, not
 *             writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
			     bool prefault)
{
	bool write_fault = error_code & PFERR_WRITE_MASK;
	bool user_fault = error_code & PFERR_USER_MASK;
	struct guest_walker walker;
	int r;
	kvm_pfn_t pfn;
	unsigned long mmu_seq;
	bool map_writable, is_self_change_mapping;
	int max_level;

	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

	/*
	 * If PFEC.RSVD is set, this is a shadow page fault.
	 * The bit needs to be cleared before walking guest page tables.
	 */
	error_code &= ~PFERR_RSVD_MASK;

	/*
	 * Look up the guest pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		if (!prefault)
			kvm_inject_emulated_page_fault(vcpu, &walker.fault);

		return RET_PF_RETRY;
	}

	if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
		shadow_page_table_clear_flood(vcpu, addr);
		return RET_PF_EMULATE;
	}

	r = mmu_topup_memory_caches(vcpu, true);
	if (r)
		return r;

	vcpu->arch.write_fault_to_shadow_pgtable = false;

	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);

	if (is_self_change_mapping)
		max_level = PG_LEVEL_4K;
	else
		max_level = walker.level;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
			 &map_writable))
		return RET_PF_RETRY;

	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
		return r;

	/*
	 * Do not change pte_access if the pfn is a mmio page, otherwise
	 * we will cache the incorrect access into mmio spte.
	 */
	if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
	    !is_write_protection(vcpu) && !user_fault &&
	    !is_noslot_pfn(pfn)) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page,
		 * so that the kernel can write to it when cr0.wp=0,
		 * then we should prevent the kernel from executing it
		 * if SMEP is enabled.
		 */
		if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}

	r = RET_PF_RETRY;
	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
		goto out_unlock;

	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
	r = make_mmu_pages_available(vcpu);
	if (r)
		goto out_unlock;
	r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
			 map_writable, prefault);
	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return r;
}

static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON(sp->role.level != PG_LEVEL_4K);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

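/*
 * INVLPG handler for shadow paging.  If the last-level spte for @gva belongs
 * to an unsync shadow page, zap it and immediately re-read the guest pte so
 * the mapping can be refreshed without waiting for a later sync.
 */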
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	u64 old_spte;
	int level;
	u64 *sptep;

	vcpu_clear_mmio_info(vcpu, gva);

	/*
	 * There is no need to check the return value here; if topping up the
	 * caches fails, rmap_can_add() lets us skip the pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu, true);

	if (!VALID_PAGE(root_hpa)) {
		WARN_ON(1);
		return;
	}

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = sptep_to_sp(sptep);
		old_spte = *sptep;
		if (is_last_spte(old_spte, level)) {
			pt_element_t gpte;
			gpa_t pte_gpa;

			if (!sp->unsync)
				break;

			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

			mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
			if (is_shadow_present_pte(old_spte))
				kvm_flush_remote_tlbs_with_address(vcpu->kvm,
					sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));

			if (!rmap_can_add(vcpu))
				break;

			if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
						       sizeof(pt_element_t)))
				break;

			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
		}

		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);
}

/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, addr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= addr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

#if PTTYPE != PTTYPE_EPT
/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
				      u32 access,
				      struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

#ifndef CONFIG_X86_64
	/* A 64-bit GVA should be impossible on 32-bit KVM. */
	WARN_ON_ONCE(vaddr >> 32);
#endif

	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}
#endif

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Note:
 *   We should flush all tlbs if the spte is dropped even though the guest is
 *   responsible for it.  If we don't, kvm_mmu_notifier_invalidate_page and
 *   kvm_mmu_notifier_invalidate_range_start will see that the mapped page is
 *   no longer used by the guest and skip the tlb flush, which would let the
 *   guest keep accessing the freed pages.
 *   We increase kvm->tlbs_dirty to delay the tlb flush in this case.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int i, nr_present = 0;
	bool host_writable;
	gpa_t first_pte_gpa;
	int set_spte_ret = 0;

	/* direct kvm_mmu_page can not be unsync. */
	BUG_ON(sp->role.direct);

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!sp->spt[i])
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
					       sizeof(pt_element_t)))
			return 0;

		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
			/*
			 * Update spte before increasing tlbs_dirty to make
			 * sure no tlb flush is lost after spte is zapped; see
			 * the comments in kvm_flush_remote_tlbs().
			 */
			smp_wmb();
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
		pte_access &= FNAME(gpte_access)(gpte);
		FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);

		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
		      &nr_present))
			continue;

		if (gfn != sp->gfns[i]) {
			drop_spte(vcpu->kvm, &sp->spt[i]);
			/*
			 * The same as above where we are doing
			 * prefetch_invalid_gpte().
			 */
			smp_wmb();
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		nr_present++;

		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

		set_spte_ret |= set_spte(vcpu, &sp->spt[i],
					 pte_access, PG_LEVEL_4K,
					 gfn, spte_to_pfn(sp->spt[i]),
					 true, false, host_writable);
	}

	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
		kvm_flush_remote_tlbs(vcpu->kvm);

	return nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG
#undef PT_GUEST_ACCESSED_MASK
#undef PT_GUEST_DIRTY_MASK
#undef PT_GUEST_DIRTY_SHIFT
#undef PT_GUEST_ACCESSED_SHIFT
#undef PT_HAVE_ACCESSED_DIRTY