// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
#endif

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
	return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!is_tdp_mmu_enabled())
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (kvm_mmu_put_root(kvm, root))
		kvm_tdp_mmu_free_root(kvm, root);
}

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_mmu_get_root(kvm, root);
	return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	tdp_mmu_put_root(kvm, root);
	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
				      typeof(*_root), link);		\
	     tdp_mmu_next_root_valid(_kvm, _root);			\
	     _root = tdp_mmu_next_root(_kvm, _root))

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	if (!kvm->arch.tdp_mmu_enabled)
		return false;
	if (WARN_ON(!VALID_PAGE(hpa)))
		return false;

	sp = to_shadow_page(hpa);
	if (WARN_ON(!sp))
		return false;

	return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			spin_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);

	return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);
	if (!root)
		return INVALID_PAGE;

	return __pa(root->spt);
}
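
/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * that installs the TDP MMU root for a vCPU.  In KVM proper the root is
 * installed by the common root-allocation path in mmu.c; the helper name
 * below is assumed purely for illustration.
 *
 *	static int example_load_tdp_root(struct kvm_vcpu *vcpu)
 *	{
 *		hpa_t root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
 *
 *		if (!VALID_PAGE(root))
 *			return -ENOMEM;
 *
 *		// The returned root already holds a reference (root_count).
 *		vcpu->arch.mmu->root_hpa = root;
 *		return 0;
 *	}
 */
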
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(slot, gfn);
	}
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
	u64 *pt;
	struct kvm_mmu_page *sp;
	u64 old_child_spte;
	int i;

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE, it is
		 * unexpected. Log the change, though it should not impact the
		 * guest since both the former and current SPTEs are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present)) {
		pt = spte_to_child_pt(old_spte, level);
		sp = sptep_to_sp(pt);

		list_del(&sp->link);

		if (sp->lpage_disallowed)
			unaccount_huge_nx_page(kvm, sp);

		for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
			old_child_spte = READ_ONCE(*(pt + i));
			WRITE_ONCE(*(pt + i), 0);
			handle_changed_spte(kvm, as_id,
				gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
				old_child_spte, 0, level - 1);
		}

		kvm_flush_remote_tlbs_with_address(kvm, gfn,
						   KVM_PAGES_PER_HPAGE(level));

		free_page((unsigned long)pt);
		kmem_cache_free(mmu_page_header_cache, sp);
	}
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	u64 *root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	WRITE_ONCE(*iter->sptep, new_spte);

	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			      iter->level);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)
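
/*
 * Illustrative sketch (not part of the original file): counting the present
 * leaf SPTEs a root maps in a GFN range using the iterator macros defined
 * above.  The helper is hypothetical and assumes the MMU lock is held.
 *
 *	static int example_count_leaf_sptes(struct kvm_mmu_page *root,
 *					    gfn_t start, gfn_t end)
 *	{
 *		struct tdp_iter iter;
 *		int count = 0;
 *
 *		// The _leaf_ variant already skips non-present and
 *		// non-last-level SPTEs.
 *		tdp_root_for_each_leaf_pte(iter, root, start, end)
 *			count++;
 *
 *		return count;
 *	}
 */
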
/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_lock(&kvm->mmu_lock);

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
			       iter->root_level, iter->min_level,
			       iter->next_last_level_gfn);

		return true;
	}

	return false;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * Note, in some use cases a flush may be required by prior actions; ensure
 * the pending flush is performed prior to yielding.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
				 bool can_yield)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = RET_PF_FIXED;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn))) {
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
		trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
	} else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else
		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte)))
		ret = RET_PF_EMULATE;

	trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
					KVM_PAGES_PER_HPAGE(iter.level));

			/*
			 * The iter must explicitly re-read the SPTE here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
			child_pt = sp->spt;
			clear_page(child_pt);
			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			trace_kvm_mmu_get_page(sp, true);
			if (huge_page_disallowed && req_level >= iter.level)
				account_huge_nx_page(vcpu->kvm, sp);

			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
		}
	}

	if (WARN_ON(iter.level != level))
		return RET_PF_RETRY;

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);

	return ret;
}
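
/*
 * Illustrative sketch (not part of the original file): the shape of a page
 * fault path that ends in kvm_tdp_mmu_map().  In KVM proper this lives in
 * mmu.c's direct fault handler; the PFN-resolution helper named here is an
 * assumption for illustration only.
 *
 *	static int example_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
 *					  u32 error_code, bool prefault)
 *	{
 *		kvm_pfn_t pfn;
 *		int map_writable;
 *
 *		// Resolve gpa -> pfn (and writability) from the memslot;
 *		// example_resolve_pfn() is hypothetical.
 *		if (example_resolve_pfn(vcpu, gpa, &pfn, &map_writable))
 *			return RET_PF_RETRY;
 *
 *		// Called with vcpu->kvm->mmu_lock held.
 *		return kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable,
 *				       KVM_MAX_HUGEPAGE_LEVEL, pfn, prefault);
 *	}
 */
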
static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
		unsigned long end, unsigned long data,
		int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
			       struct kvm_mmu_page *root, gfn_t start,
			       gfn_t end, unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}
	}

	return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}

/*
 * Mark all SPTEs mapping GFNs in the range [start, end) as unaccessed and
 * return non-zero if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it
			 * doesn't get lost when the SPTE is marked for access
			 * tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
		}
		new_spte &= ~shadow_dirty_mask;

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;
	}

	return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}

static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long unused2)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
					    test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Any TLB flush that is needed is performed here, so the handler always
 * returns zero.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	WARN_ON(pte_huge(*ptep));

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		tdp_mmu_set_spte(kvm, &iter, 0);

		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
					    (unsigned long)host_ptep,
					    set_tdp_spte);
}
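
/*
 * Illustrative sketch (not part of the original file): how the hva-range
 * entry points above are typically driven from the arch MMU-notifier hooks.
 * The surrounding context is an assumption for illustration; only the
 * kvm_tdp_mmu_*() calls are from this file.
 *
 *	// Invoked with kvm->mmu_lock held from the unmap notifier path:
 *	if (kvm->arch.tdp_mmu_enabled) {
 *		if (kvm_tdp_mmu_zap_hva_range(kvm, start, end))
 *			kvm_flush_remote_tlbs(kvm);
 *	}
 *
 *	// Aging follows the same pattern, but no TLB flush is required:
 *	young = kvm_tdp_mmu_age_hva_range(kvm, start, end);
 */
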
/*
 * Remove write access from all the SPTEs mapping GFNs [start, end). Only
 * SPTEs at or above min_level are write-protected; e.g. with a min_level
 * above PG_LEVEL_4K, SPTEs that map 4k pages are left writable.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);

		mask &= ~(1UL << (iter.gfn - gfn));
	}
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		new_spte = iter.old_spte | shadow_dirty_mask;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	return spte_set;
}
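
/*
 * Illustrative sketch (not part of the original file): how a dirty-log
 * harvesting path might drive kvm_tdp_mmu_clear_dirty_pt_masked().  Bit i of
 * the mask corresponds to gfn + i; the 64-GFN chunking and the source of the
 * bitmap word are assumptions for illustration.
 *
 *	// With kvm->mmu_lock held, for each 64-GFN chunk of the memslot that
 *	// has dirty bits to re-arm:
 *	unsigned long mask = dirty_bitmap_word;	// hypothetical source
 *	gfn_t base_gfn = slot->base_gfn + word_index * BITS_PER_LONG;
 *
 *	if (mask)
 *		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, base_gfn, mask,
 *						  false);
 */
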
/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	kvm_pfn_t pfn;
	bool spte_set = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
			spte_set = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    (!PageCompound(pfn_to_page(pfn)) &&
		     !kvm_is_zone_device_pfn(pfn)))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		spte_set = true;
	}

	if (spte_set)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
					   slot->base_gfn + slot->npages);
	}
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit (SPTE_MMU_WRITEABLE) to ensure future writes continue to
 * be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit (SPTE_MMU_WRITEABLE) to ensure future writes continue to
 * be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf - 1] = iter.old_spte;
	}

	return leaf;
}
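
/*
 * Illustrative sketch (not part of the original file): a caller of
 * kvm_tdp_mmu_get_walk() collecting the SPTEs along a GPA's walk, e.g. to
 * inspect a possible MMIO SPTE.  The surrounding caller is hypothetical.
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL];
 *	int root_level, leaf;
 *
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, &root_level);
 *	if (leaf > 0) {
 *		// sptes[leaf - 1] is the lowest-level SPTE reached; it may
 *		// still be non-present if the walk terminated early.
 *		u64 lowest = sptes[leaf - 1];
 *	}
 */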