// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
 */

#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/page-flags.h>
#include <linux/kvm_host.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include "kvm_compat.h"

/*
 * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels
 * for which pages need to be cached.
 */
#if defined(__PAGETABLE_PMD_FOLDED)
#define KVM_MMU_CACHE_MIN_PAGES 1
#else
#define KVM_MMU_CACHE_MIN_PAGES 2
#endif

static inline int kvm_pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_HUGE; }

static inline pte_t kvm_pte_mksmall(pte_t pte)
{
	pte_val(pte) &= ~_PAGE_HUGE;
	return pte;
}

static inline void kvm_set_pte(pte_t *ptep, pte_t val)
{
	WRITE_ONCE(*ptep, val);
}

/* Invalidate guest TLB entries for the page(s) containing @gpa. */
static int kvm_tlb_flush_gpa(struct kvm_vcpu *vcpu, unsigned long gpa)
{
	preempt_disable();
	gpa &= (PAGE_MASK << 1);
	invtlb(INVTLB_GID_ADDR, kvm_read_csr_gstat() & KVM_GSTAT_GID, gpa);
	preempt_enable();
	return 0;
}

static inline pmd_t kvm_pmd_mkhuge(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkhuge(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkhuge(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

static inline pmd_t kvm_pmd_mkclean(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkclean(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkclean(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

static inline pmd_t kvm_pmd_mkold(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkold(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkold(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

/**
 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
 *
 * Allocate a blank KVM GPA page directory (PGD) for representing guest
 * physical to host physical page mappings.
 *
 * Returns:	Pointer to new KVM GPA page directory.
 *		NULL on allocation failure.
 */
pgd_t *kvm_pgd_alloc(void)
{
	pgd_t *ret;
	struct page *page;

	page = alloc_pages(GFP_KERNEL, 0);
	if (!page)
		return NULL;
	ret = (pgd_t *) page_address(page);
	if (ret)
		pgd_init((unsigned long)ret);

	return ret;
}

/**
 * kvm_walk_pgd() - Walk page table with optional allocation.
 * @pgd:	Page directory pointer.
 * @cache:	MMU page cache to allocate new page tables from, or NULL.
 * @addr:	Address to index page table using.
 *
 * Walk the page tables pointed to by @pgd to find the PTE corresponding to the
 * address @addr. If page tables don't exist for @addr, they will be created
 * from the MMU cache if @cache is not NULL.
 *
 * Returns:	Pointer to pte_t corresponding to @addr.
 *		NULL if a page table doesn't exist for @addr and !@cache.
 *		NULL if a page table allocation failed.
 */
static pte_t *kvm_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
			   unsigned long addr)
{
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd += pgd_index(addr);
	if (pgd_none(*pgd)) {
		/* Not used yet */
		BUG();
		return NULL;
	}
	p4d = p4d_offset(pgd, addr);
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		pmd_t *new_pmd;

		if (!cache)
			return NULL;
		new_pmd = kvm_mmu_memory_cache_alloc(cache);
		pmd_init((unsigned long)new_pmd,
			 (unsigned long)invalid_pte_table);
		pud_populate(NULL, pud, new_pmd);
	}
	pmd = pmd_offset(pud, addr);
#ifdef CONFIG_HUGETLB_PAGE
	if (pmd_huge(*pmd))
		return (pte_t *)pmd;
#endif
	if (pmd_none(*pmd)) {
		pte_t *new_pte;

		if (!cache)
			return NULL;
		new_pte = kvm_mmu_memory_cache_alloc(cache);
		clear_page(new_pte);
		pmd_populate_kernel(NULL, pmd, new_pte);
	}
	return pte_offset_kernel(pmd, addr);
}

/* Caller must hold kvm->mm_lock */
static pte_t *kvm_pte_for_gpa(struct kvm *kvm,
			      struct kvm_mmu_memory_cache *cache,
			      unsigned long addr)
{
	return kvm_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr);
}

/*
 * kvm_flush_gpa_{pte,pmd,pud,pgd,pt}.
 * Flush a range of guest physical address space from the VM's GPA page tables.
 */

static bool kvm_flush_gpa_pte(pte_t *pte, unsigned long start_gpa,
			      unsigned long end_gpa, unsigned long *data)
{
	int i_min = pte_index(start_gpa);
	int i_max = pte_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
	int i;

	for (i = i_min; i <= i_max; ++i) {
		if (!pte_present(pte[i]))
			continue;

		set_pte(pte + i, __pte(0));
		if (data)
			*data = *data + 1;
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa,
			      unsigned long end_gpa, unsigned long *data)
{
	pte_t *pte;
	unsigned long end = ~0ul;
	int i_min = pmd_index(start_gpa);
	int i_max = pmd_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i]) && pmd_present(pmd[i])) {
			pmd_clear(pmd + i);
			if (data)
				*data += PTRS_PER_PMD;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pte(pte, start_gpa, end, data)) {
			pmd_clear(pmd + i);
			pte_free_kernel(NULL, pte);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pud(pud_t *pud, unsigned long start_gpa,
			      unsigned long end_gpa, unsigned long *data)
{
	pmd_t *pmd;
	unsigned long end = ~0ul;
	int i_min = pud_index(start_gpa);
	int i_max = pud_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pmd(pmd, start_gpa, end, data)) {
			pud_clear(pud + i);
			pmd_free(NULL, pmd);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa,
			      unsigned long end_gpa, unsigned long *data)
{
	p4d_t *p4d;
	pud_t *pud;
	unsigned long end = ~0ul;
	int i_min = pgd_index(start_gpa);
	int i_max = pgd_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pud(pud, start_gpa, end, data)) {
			pgd_clear(pgd + i);
			pud_free(NULL, pud);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

/**
 * kvm_flush_gpa_pt() - Flush a range of guest physical addresses.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 * @data:	Optional pointer to a count of flushed entries, incremented as
 *		PTEs are cleared, or NULL.
 *
 * Flushes a range of GPA mappings from the GPA page tables.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether it is safe to remove the top level page directory
 *		because all lower levels have been removed.
 */
static bool kvm_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, void *data)
{
	return kvm_flush_gpa_pgd(kvm->arch.gpa_mm.pgd,
				 start_gfn << PAGE_SHIFT,
				 end_gfn << PAGE_SHIFT, (unsigned long *)data);
}

/*
 * kvm_mkclean_gpa_pt.
 * Mark a range of guest physical address space clean (writes fault) in the
 * VM's GPA page table to allow dirty page tracking.
 */

static int kvm_mkclean_pte(pte_t *pte, unsigned long start, unsigned long end)
{
	int ret = 0;
	int i_min = pte_index(start);
	int i_max = pte_index(end);
	int i;
	pte_t val;

	for (i = i_min; i <= i_max; ++i) {
		val = pte[i];
		if (pte_present(val) && pte_dirty(val)) {
			set_pte(pte + i, pte_mkclean(val));
			ret = 1;
		}
	}
	return ret;
}

static int kvm_mkclean_pmd(pmd_t *pmd, unsigned long start, unsigned long end)
{
	int ret = 0;
	pte_t *pte;
	unsigned long cur_end = ~0ul;
	int i_min = pmd_index(start);
	int i_max = pmd_index(end);
	int i;
	pmd_t old, new;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i])) {
			old = pmd[i];
			new = kvm_pmd_mkclean(old);
			if (pmd_val(new) == pmd_val(old))
				continue;
			set_pmd(pmd + i, new);
			ret = 1;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pte(pte, start, cur_end);
	}

	return ret;
}

static int kvm_mkclean_pud(pud_t *pud, unsigned long start, unsigned long end)
{
	int ret = 0;
	pmd_t *pmd;
	unsigned long cur_end = ~0ul;
	int i_min = pud_index(start);
	int i_max = pud_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pmd(pmd, start, cur_end);
	}
	return ret;
}

static int kvm_mkclean_pgd(pgd_t *pgd, unsigned long start, unsigned long end)
{
	int ret = 0;
	p4d_t *p4d;
	pud_t *pud;
	unsigned long cur_end = ~0ul;
	int i_min = pgd_index(start);
	int i_max = pgd_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pud(pud, start, cur_end);
	}
	return ret;
}

/**
 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 *
 * Make a range of GPA mappings clean so that guest writes will fault and
 * trigger dirty page logging.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether any GPA mappings were modified, which would require
 *		derived mappings (GVA page tables & TLB entries) to be
 *		invalidated.
 */
static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
{
	return kvm_mkclean_pgd(kvm->arch.gpa_mm.pgd, start_gfn << PAGE_SHIFT,
			       end_gfn << PAGE_SHIFT);
}

/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walk the bits set in @mask and write protect the associated PTEs. The
 * caller must acquire @kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset, unsigned long mask)
{
	gfn_t base_gfn = slot->base_gfn + gfn_offset;
	gfn_t start = base_gfn + __ffs(mask);
	gfn_t end = base_gfn + __fls(mask);

	kvm_mkclean_gpa_pt(kvm, start, end);
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	int needs_flush;

	kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n",
		  __func__, kvm, mem->slot, mem->guest_phys_addr,
		  mem->memory_size, mem->userspace_addr);

	/*
	 * If dirty page logging is enabled, write protect all pages in the
	 * slot ready for dirty logging.
	 *
	 * There is no need to do this in any of the following cases:
	 * CREATE:	No dirty mappings will already exist.
	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot().
	 */
	if (change == KVM_MR_FLAGS_ONLY &&
	    (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
	     new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
		spin_lock(&kvm->mmu_lock);
		/* Write protect GPA page table entries */
		needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
					new->base_gfn + new->npages - 1);
		/* Let implementation do the rest */
		if (needs_flush)
			kvm_flush_remote_tlbs(kvm);
		spin_unlock(&kvm->mmu_lock);
	}
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	/* Flush whole GPA */
	kvm_flush_gpa_pt(kvm, 0, ~0UL, NULL);

	/* Flush vpid for each VCPU individually */
	kvm_flush_remote_tlbs(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	unsigned long npages;

	/*
	 * The slot has been made invalid (ready for moving or deletion), so we
	 * need to ensure that it can no longer be accessed by any guest VCPUs.
	 */

	npages = 0;
	spin_lock(&kvm->mmu_lock);
	/* Flush slot from GPA */
	kvm_flush_gpa_pt(kvm, slot->base_gfn,
			 slot->base_gfn + slot->npages - 1, &npages);
	/* Let implementation do the rest */
	if (npages)
		kvm_flush_remote_tlbs(kvm);
	spin_unlock(&kvm->mmu_lock);
}

void _kvm_destroy_mm(struct kvm *kvm)
{
	/* It should always be safe to remove after flushing the whole range */
	WARN_ON(!kvm_flush_gpa_pt(kvm, 0, ~0UL, NULL));
	pgd_free(NULL, kvm->arch.gpa_mm.pgd);
	kvm->arch.gpa_mm.pgd = NULL;
}

/*
 * Mark a range of guest physical address space old (all accesses fault) in the
 * VM's GPA page table to allow detection of commonly used pages.
 */

static int kvm_mkold_pte(pte_t *pte, unsigned long start,
			 unsigned long end)
{
	int ret = 0;
	int i_min = pte_index(start);
	int i_max = pte_index(end);
	int i;
	pte_t old, new;

	for (i = i_min; i <= i_max; ++i) {
		if (!pte_present(pte[i]))
			continue;

		old = pte[i];
		new = pte_mkold(old);
		if (pte_val(new) == pte_val(old))
			continue;
		set_pte(pte + i, new);
		ret = 1;
	}

	return ret;
}

static int kvm_mkold_pmd(pmd_t *pmd, unsigned long start, unsigned long end)
{
	int ret = 0;
	pte_t *pte;
	unsigned long cur_end = ~0ul;
	int i_min = pmd_index(start);
	int i_max = pmd_index(end);
	int i;
	pmd_t old, new;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i])) {
			old = pmd[i];
			new = kvm_pmd_mkold(old);
			if (pmd_val(new) == pmd_val(old))
				continue;
			set_pmd(pmd + i, new);
			ret = 1;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pte(pte, start, cur_end);
	}

	return ret;
}

static int kvm_mkold_pud(pud_t *pud, unsigned long start, unsigned long end)
{
	int ret = 0;
	pmd_t *pmd;
	unsigned long cur_end = ~0ul;
	int i_min = pud_index(start);
	int i_max = pud_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pmd(pmd, start, cur_end);
	}

	return ret;
}

static int kvm_mkold_pgd(pgd_t *pgd, unsigned long start, unsigned long end)
{
	int ret = 0;
	p4d_t *p4d;
	pud_t *pud;
	unsigned long cur_end = ~0ul;
	int i_min = pgd_index(start);
	int i_max = pgd_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pud(pud, start, cur_end);
	}

	return ret;
}

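/*
 * handle_hva_to_gpa() - Apply a handler to the GFN ranges backed by a range
 * of host virtual addresses.
 *
 * Iterate over every memslot that intersects the HVA range [@start, @end),
 * convert the overlapping part to a guest frame range and invoke @handler on
 * it, passing @data through. The handler return values are OR'd together.
 */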
static int handle_hva_to_gpa(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     int (*handler)(struct kvm *kvm, gfn_t gfn,
					    gpa_t gfn_end,
					    struct kvm_memory_slot *memslot,
					    void *data),
			     void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int ret = 0;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
		ret |= handler(kvm, gfn, gfn_end, memslot, data);
	}

	return ret;
}

static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				 struct kvm_memory_slot *memslot, void *data)
{
	unsigned long npages;

	npages = 0;
	kvm_flush_gpa_pt(kvm, gfn, gfn_end - 1, &npages);
	*(unsigned long *)data = *(unsigned long *)data + npages;

	return npages > 0;
}

int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable)
{
	unsigned long npages;

	npages = 0;
	return handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &npages);
}

static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				struct kvm_memory_slot *memslot, void *data)
{
	gpa_t gpa = gfn << PAGE_SHIFT;
	pte_t hva_pte = *(pte_t *)data;
	pte_t *gpa_pte = kvm_pte_for_gpa(kvm, NULL, gpa);
	pte_t old_pte;

	if (!gpa_pte)
		return 0;

	/* Mapping may need adjusting depending on memslot flags */
	old_pte = *gpa_pte;
	if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
		hva_pte = pte_mkclean(hva_pte);
	else if (memslot->flags & KVM_MEM_READONLY)
		hva_pte = pte_wrprotect(hva_pte);

	set_pte(gpa_pte, hva_pte);

	/* Replacing an absent or old page doesn't need flushes */
	if (!pte_present(old_pte) || !pte_young(old_pte))
		return 0;

	/* Pages swapped, aged, moved, or cleaned require flushes */
	return !pte_present(hva_pte) ||
	       !pte_young(hva_pte) ||
	       pte_pfn(old_pte) != pte_pfn(hva_pte) ||
	       (pte_dirty(old_pte) && !pte_dirty(hva_pte));
}

int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	int ret;

	ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
	if (ret)
		/* Flush vpid for each VCPU individually */
		kvm_flush_remote_tlbs(kvm);
	return 0;
}

static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
			       struct kvm_memory_slot *memslot, void *data)
{
	return kvm_mkold_pgd(kvm->arch.gpa_mm.pgd, gfn << PAGE_SHIFT,
			     gfn_end << PAGE_SHIFT);
}

static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				    struct kvm_memory_slot *memslot, void *data)
{
	gpa_t gpa = gfn << PAGE_SHIFT;
	pte_t *gpa_pte = kvm_pte_for_gpa(kvm, NULL, gpa);

	if (!gpa_pte)
		return 0;
	return pte_young(*gpa_pte);
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
}

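/*
 * kvm_get_pud() returns the PUD entry covering @addr in the GPA page tables;
 * kvm_get_pmd() additionally allocates a missing PMD table from @cache. Both
 * are used by the huge page mapping path (kvm_set_pmd_huge()) below.
 */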
static pud_t *kvm_get_pud(struct kvm *kvm,
			  struct kvm_mmu_memory_cache *cache, phys_addr_t addr)
{
	pgd_t *pgd;

	pgd = kvm->arch.gpa_mm.pgd + pgd_index(addr);
	if (pgd_none(*pgd)) {
		/* Not used yet */
		BUG();
		return NULL;
	}

	return pud_offset(p4d_offset(pgd, addr), addr);
}

static pmd_t *kvm_get_pmd(struct kvm *kvm,
			  struct kvm_mmu_memory_cache *cache, phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = kvm_get_pud(kvm, cache, addr);
	if (!pud || pud_huge(*pud))
		return NULL;

	if (pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = kvm_mmu_memory_cache_alloc(cache);
		pmd_init((unsigned long)pmd,
			 (unsigned long)invalid_pte_table);
		pud_populate(NULL, pud, pmd);
	}

	return pmd_offset(pud, addr);
}

/* Install a PMD-sized (huge page) mapping for @addr in the GPA page tables. */
static int kvm_set_pmd_huge(struct kvm_vcpu *vcpu, struct kvm_mmu_memory_cache
			    *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

retry:
	pmd = kvm_get_pmd(vcpu->kvm, cache, addr);
	VM_BUG_ON(!pmd);

	old_pmd = *pmd;
	/*
	 * Multiple vcpus faulting on the same PMD entry can lead to them
	 * sequentially updating the PMD with the same value. Following the
	 * break-before-make (pmd_clear() followed by tlb_flush()) process can
	 * hinder forward progress due to refaults generated on missing
	 * translations.
	 *
	 * Skip updating the page table if the entry is unchanged.
	 */
	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
		return 0;

	if (pmd_present(old_pmd)) {
		/*
		 * If we already have a PTE level mapping for this block,
		 * we must unmap it to avoid inconsistent TLB state and
		 * leaking the table page. We could end up in this situation
		 * if the memory slot was marked for dirty logging and was
		 * reverted, leaving PTE level mappings for the pages accessed
		 * during the period. So, unmap the PTE level mapping for this
		 * block and retry, as we could have released the upper level
		 * table in the process.
		 *
		 * Normal THP split/merge follows mmu_notifier callbacks and
		 * gets handled accordingly.
		 */
		if (!pmd_huge(old_pmd)) {
			++vcpu->stat.huge_merge_exits;
			kvm_flush_gpa_pt(vcpu->kvm,
					 (addr & PMD_MASK) >> PAGE_SHIFT,
					 ((addr & PMD_MASK) + PMD_SIZE - 1) >> PAGE_SHIFT, NULL);
			goto retry;
		}
		/*
		 * Mapping in huge pages should only happen through a
		 * fault. If a page is merged into a transparent huge
		 * page, the individual subpages of that huge page
		 * should be unmapped through MMU notifiers before we
		 * get here.
		 *
		 * Merging of CompoundPages is not supported; they
		 * should be split first, unmapped, merged, and mapped
		 * back in on demand.
		 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
#endif
		pmd_clear(pmd);
	}

	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
	set_pmd(pmd, *new_pmd);
	return 0;
}

/*
 * Adjust the pfn and gpa to the start of the huge page when the faulting
 * address is backed by a transparent huge page.
 */
static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, unsigned long *gpap)
{
	kvm_pfn_t pfn = *pfnp;
	gfn_t gfn = *gpap >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);

	/*
	 * PageTransCompound() returns true for THP and hugetlbfs pages, so
	 * make sure the adjustment is done only for THP pages.
	 */
	if ((!PageHuge(page)) && PageTransCompound(page) &&
	    (atomic_read(&page->_mapcount) < 0)) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page. However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page. We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*gpap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}

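/*
 * Check whether a faulting HVA may be mapped with a PMD-sized block: dirty
 * logging must not be active on the slot for writes, the HVA and GPA must
 * share the same offset within a PMD-sized block, and the whole block must
 * lie inside the memslot.
 */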
static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
					unsigned long hva, bool write)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	unsigned long map_size;
	size_t size;

	map_size = PMD_SIZE;
	/* Disable dirty logging on HugePages */
	if ((memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) && write)
		return false;

	size = memslot->npages * PAGE_SIZE;
	gpa_start = memslot->base_gfn << PAGE_SHIFT;
	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and GPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *    +---+--------------------+--------------------+-----+
	 *    |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *    +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/**
 * kvm_map_page_fast() - Fast path GPA fault handler.
 * @vcpu:	VCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Perform fast path GPA fault handling, doing all that can be done without
 * calling into KVM. This handles marking old pages young (for idle page
 * tracking), and dirtying of clean pages (for dirty page logging).
 *
 * Returns:	0 on success, in which case we can update derived mappings and
 *		resume guest execution.
 *		-EFAULT on failure due to absent GPA mapping or write to
 *		read-only page, in which case KVM must be consulted.
 */
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa,
			     bool write)
{
	struct kvm *kvm = vcpu->kvm;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	pte_t *ptep;
	kvm_pfn_t pfn = 0;	/* silence bogus GCC warning */
	bool pfn_valid = false;
	int ret = 0;
	struct kvm_memory_slot *slot;

	spin_lock(&kvm->mmu_lock);

	/* Fast path - just check GPA page table for an existing entry */
	ptep = kvm_pte_for_gpa(kvm, NULL, gpa);
	if (!ptep || !pte_present(*ptep)) {
		ret = -EFAULT;
		goto out;
	}

	/* Track access to pages marked old */
	if (!pte_young(*ptep)) {
		set_pte(ptep, pte_mkyoung(*ptep));
		pfn = pte_pfn(*ptep);
		pfn_valid = true;
		/* call kvm_set_pfn_accessed() after unlock */
	}
	if (write && !pte_dirty(*ptep)) {
		if (!pte_write(*ptep)) {
			ret = -EFAULT;
			goto out;
		}

		if (kvm_pte_huge(*ptep)) {
			/*
			 * Do not set write permission when dirty logging is
			 * enabled for HugePages
			 */
			slot = gfn_to_memslot(kvm, gfn);
			if (slot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EFAULT;
				goto out;
			}
		}

		/* Track dirtying of writeable pages */
		set_pte(ptep, pte_mkdirty(*ptep));
		pfn = pte_pfn(*ptep);
		if (pmd_huge(*((pmd_t *)ptep))) {
			int i;
			gfn_t base_gfn = (gpa & PMD_MASK) >> PAGE_SHIFT;

			for (i = 0; i < PTRS_PER_PTE; i++)
				mark_page_dirty(kvm, base_gfn + i);
		} else
			mark_page_dirty(kvm, gfn);
		kvm_set_pfn_dirty(pfn);
	}

out:
	spin_unlock(&kvm->mmu_lock);
	if (pfn_valid)
		kvm_set_pfn_accessed(pfn);
	return ret;
}

/*
 * Split a huge page mapping into PAGE_SIZE PTEs and return the PTE for @gfn.
 */
static pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, pte_t *ptep, gfn_t gfn,
			     struct vm_area_struct *vma, unsigned long hva)
{
	int i;
	pte_t val, *child;
	struct kvm_mmu_memory_cache *memcache;

	memcache = &vcpu->arch.mmu_page_cache;
	child = kvm_mmu_memory_cache_alloc(memcache);
	val = kvm_pte_mksmall(*ptep);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		kvm_set_pte(child + i, val);
		pte_val(val) += PAGE_SIZE;
	}

	/* The later kvm_tlb_flush_gpa() will flush the hugepage TLB entry */
	pte_val(val) = (unsigned long)child;
	kvm_set_pte(ptep, val);
	return child + (gfn & (PTRS_PER_PTE - 1));
}

/**
 * kvm_map_page() - Map a guest physical page.
 * @vcpu:	VCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Handle GPA faults by creating a new GPA mapping (or updating an existing
 * one).
 *
 * This takes care of marking pages young or dirty (idle/dirty page tracking),
 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 * caller.
 *
 * Returns:	0 on success, in which case the GPA page tables have been
 *		updated and the caller can update derived mappings and resume
 *		guest execution.
 *		-EFAULT if there is no memory region at @gpa or a write was
 *		attempted to a read-only memory region. This is usually
 *		handled as an MMIO access.
 */
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
			bool write)
{
	bool writeable;
	bool force_pte = false;
	int i, srcu_idx, err = 0, retry_no = 0;
	unsigned long hva;
	unsigned long mmu_seq;
	unsigned long prot_bits;
	unsigned long vma_pagesize;
	pte_t *ptep;
	kvm_pfn_t pfn;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

	/* Try the fast path to handle old / clean pages */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	err = kvm_map_page_fast(vcpu, gpa, write);
	if (!err)
		goto out;

	memslot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
	if (kvm_is_error_hva(hva) || (write && !writeable))
		goto out;

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	mmap_read_lock(current->mm);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		err = -EFAULT;
		goto out;
	}

	vma_pagesize = vma_kernel_pagesize(vma);
	if ((vma_pagesize == PMD_SIZE) &&
	    !fault_supports_huge_mapping(memslot, hva, write)) {
		force_pte = true;
		vma_pagesize = PAGE_SIZE;
		++vcpu->stat.huge_dec_exits;
	}

	/* PMD is not folded, adjust gfn to new boundary */
	if (vma_pagesize == PMD_SIZE)
		gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;

	mmap_read_unlock(current->mm);

	/* We need a minimum of cached pages ready for page table creation */
	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
	if (err)
		goto out;

retry:
	/*
	 * Used to check for invalidations in progress, of the pfn that is
	 * returned by gfn_to_pfn_prot() below.
	 */
	mmu_seq = kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
	 * risk the page we get a reference to getting unmapped before we have a
	 * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
	 *
	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
	 * of the pte_unmap_unlock() after the PTE is zapped, and the
	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
	 * mmu_notifier_seq is incremented.
	 */
	smp_rmb();

	/* Slow path - ask KVM core whether we can access this GPA */
	pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
	if (is_error_noslot_pfn(pfn)) {
		err = -EFAULT;
		goto out;
	}

	spin_lock(&kvm->mmu_lock);
	/* Check if an invalidation has taken place since we got pfn */
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		/*
		 * This can happen when mappings are changed asynchronously, but
		 * also synchronously if a COW is triggered by
		 * gfn_to_pfn_prot().
		 */
		spin_unlock(&kvm->mmu_lock);
		kvm_set_pfn_accessed(pfn);
		kvm_release_pfn_clean(pfn);
		if (retry_no > 100) {
			retry_no = 0;
			schedule();
		}
		retry_no++;
		goto retry;
	}

	if (vma_pagesize == PAGE_SIZE && !force_pte) {
		/*
		 * Only PMD_SIZE transparent hugepages (THP) are
		 * currently supported. This code will need to be
		 * updated to support other THP sizes.
		 *
		 * Make sure the host VA and the guest IPA are sufficiently
		 * aligned and that the block is contained within the memslot.
		 */
		++vcpu->stat.huge_thp_exits;
		if (fault_supports_huge_mapping(memslot, hva, write) &&
		    transparent_hugepage_adjust(&pfn, &gpa)) {
			++vcpu->stat.huge_adjust_exits;
			vma_pagesize = PMD_SIZE;
		}
	}

	/* Set up the prot bits */
	prot_bits = _PAGE_PRESENT | __READABLE;
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		prot_bits |= _CACHE_SUC;
	else
		prot_bits |= _CACHE_CC;

	if (writeable) {
		prot_bits |= _PAGE_WRITE;
		if (write) {
			prot_bits |= __WRITEABLE;
			mark_page_dirty(kvm, gfn);
			kvm_set_pfn_dirty(pfn);
		}
	}

	if (vma_pagesize == PMD_SIZE) {
		pmd_t new_pmd = pfn_pmd(pfn, __pgprot(prot_bits));

		new_pmd = pmd_mkhuge(new_pmd);
		if (writeable && write) {
			gfn_t base_gfn = (gpa & PMD_MASK) >> PAGE_SHIFT;

			for (i = 0; i < PTRS_PER_PTE; i++)
				mark_page_dirty(kvm, base_gfn + i);
		}

		++vcpu->stat.huge_set_exits;
		kvm_set_pmd_huge(vcpu, memcache, gpa, &new_pmd);
	} else {
		pte_t new_pte = pfn_pte(pfn, __pgprot(prot_bits));

		if (writeable && write)
			mark_page_dirty(kvm, gfn);

		/* Ensure page tables are allocated */
		ptep = kvm_pte_for_gpa(kvm, memcache, gpa);
		if (ptep && kvm_pte_huge(*ptep) && write)
			ptep = kvm_split_huge(vcpu, ptep, gfn, vma, hva);

		set_pte(ptep, new_pte);
		err = 0;
	}

	spin_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	kvm_set_pfn_accessed(pfn);
out:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return err;
}

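/*
 * Handle a guest page fault on guest physical address @badv: establish or
 * update the GPA mapping, then invalidate any stale guest TLB entry covering
 * it so the new mapping is picked up on re-execution.
 */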
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv,
			bool write)
{
	int ret;

	ret = kvm_map_page(vcpu, badv, write);
	if (ret)
		return ret;

	/* Invalidate this entry in the TLB */
	return kvm_tlb_flush_gpa(vcpu, badv);
}

/**
 * kvm_flush_tlb_all() - Flush all root TLB entries for guests.
 *
 * Invalidate all entries including GVA-->GPA and GPA-->HPA mappings.
 */
void kvm_flush_tlb_all(void)
{
	unsigned long flags;

	local_irq_save(flags);
	invtlb_all(INVTLB_ALLGID, 0, 0);
	local_irq_restore(flags);
}