// SPDX-License-Identifier: GPL-2.0
/*
 *  KVM guest address space mapping code
 *
 *    Copyright IBM Corp. 2007, 2016, 2018
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 *		 David Hildenbrand <david@redhat.com>
 *		 Janosch Frank <frankja@linux.vnet.ibm.com>
 */

#include <linux/kernel.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/pgtable.h>

#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>

#define GMAP_SHADOW_FAKE_TABLE 1ULL

/**
 * gmap_alloc - allocate and initialize a guest address space
 * @limit: maximum address of the gmap address space
 *
 * Returns a guest address space structure.
 */
static struct gmap *gmap_alloc(unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < _REGION3_SIZE) {
		limit = _REGION3_SIZE - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < _REGION2_SIZE) {
		limit = _REGION2_SIZE - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < _REGION1_SIZE) {
		limit = _REGION1_SIZE - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_LIST_HEAD(&gmap->children);
	INIT_LIST_HEAD(&gmap->pt_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	spin_lock_init(&gmap->shadow_lock);
	refcount_set(&gmap->ref_count, 1);
	page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}

/**
 * gmap_create - create a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
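 *
 * Example (illustrative sketch, not taken from a real caller; the kvm
 * pointer, its ->arch.gmap field and the chosen limit are assumptions
 * made only for this example):
 *
 *	struct gmap *g;
 *
 *	g = gmap_create(current->mm, (1UL << 40) - 1);
 *	if (!g)
 *		return -ENOMEM;
 *	kvm->arch.gmap = g;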
96 */ 97struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) 98{ 99 struct gmap *gmap; 100 unsigned long gmap_asce; 101 102 gmap = gmap_alloc(limit); 103 if (!gmap) 104 return NULL; 105 gmap->mm = mm; 106 spin_lock(&mm->context.lock); 107 list_add_rcu(&gmap->list, &mm->context.gmap_list); 108 if (list_is_singular(&mm->context.gmap_list)) 109 gmap_asce = gmap->asce; 110 else 111 gmap_asce = -1UL; 112 WRITE_ONCE(mm->context.gmap_asce, gmap_asce); 113 spin_unlock(&mm->context.lock); 114 return gmap; 115} 116EXPORT_SYMBOL_GPL(gmap_create); 117 118static void gmap_flush_tlb(struct gmap *gmap) 119{ 120 if (MACHINE_HAS_IDTE) 121 __tlb_flush_idte(gmap->asce); 122 else 123 __tlb_flush_global(); 124} 125 126static void gmap_radix_tree_free(struct radix_tree_root *root) 127{ 128 struct radix_tree_iter iter; 129 unsigned long indices[16]; 130 unsigned long index; 131 void __rcu **slot; 132 int i, nr; 133 134 /* A radix tree is freed by deleting all of its entries */ 135 index = 0; 136 do { 137 nr = 0; 138 radix_tree_for_each_slot(slot, root, &iter, index) { 139 indices[nr] = iter.index; 140 if (++nr == 16) 141 break; 142 } 143 for (i = 0; i < nr; i++) { 144 index = indices[i]; 145 radix_tree_delete(root, index); 146 } 147 } while (nr > 0); 148} 149 150static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) 151{ 152 struct gmap_rmap *rmap, *rnext, *head; 153 struct radix_tree_iter iter; 154 unsigned long indices[16]; 155 unsigned long index; 156 void __rcu **slot; 157 int i, nr; 158 159 /* A radix tree is freed by deleting all of its entries */ 160 index = 0; 161 do { 162 nr = 0; 163 radix_tree_for_each_slot(slot, root, &iter, index) { 164 indices[nr] = iter.index; 165 if (++nr == 16) 166 break; 167 } 168 for (i = 0; i < nr; i++) { 169 index = indices[i]; 170 head = radix_tree_delete(root, index); 171 gmap_for_each_rmap_safe(rmap, rnext, head) 172 kfree(rmap); 173 } 174 } while (nr > 0); 175} 176 177/** 178 * gmap_free - free a guest address space 179 * @gmap: pointer to the guest address space structure 180 * 181 * No locks required. There are no references to this gmap anymore. 182 */ 183static void gmap_free(struct gmap *gmap) 184{ 185 struct page *page, *next; 186 187 /* Flush tlb of all gmaps (if not already done for shadows) */ 188 if (!(gmap_is_shadow(gmap) && gmap->removed)) 189 gmap_flush_tlb(gmap); 190 /* Free all segment & region tables. */ 191 list_for_each_entry_safe(page, next, &gmap->crst_list, lru) 192 __free_pages(page, CRST_ALLOC_ORDER); 193 gmap_radix_tree_free(&gmap->guest_to_host); 194 gmap_radix_tree_free(&gmap->host_to_guest); 195 196 /* Free additional data for a shadow gmap */ 197 if (gmap_is_shadow(gmap)) { 198 /* Free all page tables. */ 199 list_for_each_entry_safe(page, next, &gmap->pt_list, lru) 200 page_table_free_pgste(page); 201 gmap_rmap_radix_tree_free(&gmap->host_to_rmap); 202 /* Release reference to the parent */ 203 gmap_put(gmap->parent); 204 } 205 206 kfree(gmap); 207} 208 209/** 210 * gmap_get - increase reference counter for guest address space 211 * @gmap: pointer to the guest address space structure 212 * 213 * Returns the gmap pointer 214 */ 215struct gmap *gmap_get(struct gmap *gmap) 216{ 217 refcount_inc(&gmap->ref_count); 218 return gmap; 219} 220EXPORT_SYMBOL_GPL(gmap_get); 221 222/** 223 * gmap_put - decrease reference counter for guest address space 224 * @gmap: pointer to the guest address space structure 225 * 226 * If the reference counter reaches zero the guest address space is freed. 
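 *
 * Example (illustrative sketch, not taken from this file): a user that
 * keeps the gmap beyond the lifetime of its creator takes an extra
 * reference with gmap_get() and drops it again when done:
 *
 *	g = gmap_get(gmap);
 *	...
 *	gmap_put(g);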
227 */ 228void gmap_put(struct gmap *gmap) 229{ 230 if (refcount_dec_and_test(&gmap->ref_count)) 231 gmap_free(gmap); 232} 233EXPORT_SYMBOL_GPL(gmap_put); 234 235/** 236 * gmap_remove - remove a guest address space but do not free it yet 237 * @gmap: pointer to the guest address space structure 238 */ 239void gmap_remove(struct gmap *gmap) 240{ 241 struct gmap *sg, *next; 242 unsigned long gmap_asce; 243 244 /* Remove all shadow gmaps linked to this gmap */ 245 if (!list_empty(&gmap->children)) { 246 spin_lock(&gmap->shadow_lock); 247 list_for_each_entry_safe(sg, next, &gmap->children, list) { 248 list_del(&sg->list); 249 gmap_put(sg); 250 } 251 spin_unlock(&gmap->shadow_lock); 252 } 253 /* Remove gmap from the pre-mm list */ 254 spin_lock(&gmap->mm->context.lock); 255 list_del_rcu(&gmap->list); 256 if (list_empty(&gmap->mm->context.gmap_list)) 257 gmap_asce = 0; 258 else if (list_is_singular(&gmap->mm->context.gmap_list)) 259 gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, 260 struct gmap, list)->asce; 261 else 262 gmap_asce = -1UL; 263 WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); 264 spin_unlock(&gmap->mm->context.lock); 265 synchronize_rcu(); 266 /* Put reference */ 267 gmap_put(gmap); 268} 269EXPORT_SYMBOL_GPL(gmap_remove); 270 271/** 272 * gmap_enable - switch primary space to the guest address space 273 * @gmap: pointer to the guest address space structure 274 */ 275void gmap_enable(struct gmap *gmap) 276{ 277 S390_lowcore.gmap = (unsigned long) gmap; 278} 279EXPORT_SYMBOL_GPL(gmap_enable); 280 281/** 282 * gmap_disable - switch back to the standard primary address space 283 * @gmap: pointer to the guest address space structure 284 */ 285void gmap_disable(struct gmap *gmap) 286{ 287 S390_lowcore.gmap = 0UL; 288} 289EXPORT_SYMBOL_GPL(gmap_disable); 290 291/** 292 * gmap_get_enabled - get a pointer to the currently enabled gmap 293 * 294 * Returns a pointer to the currently enabled gmap. 0 if none is enabled. 
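 *
 * Example (illustrative sketch; the vcpu->arch.gmap field is an
 * assumption made for this example):
 *
 *	gmap_enable(vcpu->arch.gmap);
 *	...enter SIE and run the guest...
 *	WARN_ON(gmap_get_enabled() != vcpu->arch.gmap);
 *	gmap_disable(vcpu->arch.gmap);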
295 */ 296struct gmap *gmap_get_enabled(void) 297{ 298 return (struct gmap *) S390_lowcore.gmap; 299} 300EXPORT_SYMBOL_GPL(gmap_get_enabled); 301 302/* 303 * gmap_alloc_table is assumed to be called with mmap_lock held 304 */ 305static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, 306 unsigned long init, unsigned long gaddr) 307{ 308 struct page *page; 309 unsigned long *new; 310 311 /* since we dont free the gmap table until gmap_free we can unlock */ 312 page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); 313 if (!page) 314 return -ENOMEM; 315 new = (unsigned long *) page_to_phys(page); 316 crst_table_init(new, init); 317 spin_lock(&gmap->guest_table_lock); 318 if (*table & _REGION_ENTRY_INVALID) { 319 list_add(&page->lru, &gmap->crst_list); 320 *table = (unsigned long) new | _REGION_ENTRY_LENGTH | 321 (*table & _REGION_ENTRY_TYPE_MASK); 322 page->index = gaddr; 323 page = NULL; 324 } 325 spin_unlock(&gmap->guest_table_lock); 326 if (page) 327 __free_pages(page, CRST_ALLOC_ORDER); 328 return 0; 329} 330 331/** 332 * __gmap_segment_gaddr - find virtual address from segment pointer 333 * @entry: pointer to a segment table entry in the guest address space 334 * 335 * Returns the virtual address in the guest address space for the segment 336 */ 337static unsigned long __gmap_segment_gaddr(unsigned long *entry) 338{ 339 struct page *page; 340 unsigned long offset, mask; 341 342 offset = (unsigned long) entry / sizeof(unsigned long); 343 offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; 344 mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); 345 page = virt_to_page((void *)((unsigned long) entry & mask)); 346 return page->index + offset; 347} 348 349/** 350 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address 351 * @gmap: pointer to the guest address space structure 352 * @vmaddr: address in the host process address space 353 * 354 * Returns 1 if a TLB flush is required 355 */ 356static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) 357{ 358 unsigned long *entry; 359 int flush = 0; 360 361 BUG_ON(gmap_is_shadow(gmap)); 362 spin_lock(&gmap->guest_table_lock); 363 entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); 364 if (entry) { 365 flush = (*entry != _SEGMENT_ENTRY_EMPTY); 366 *entry = _SEGMENT_ENTRY_EMPTY; 367 } 368 spin_unlock(&gmap->guest_table_lock); 369 return flush; 370} 371 372/** 373 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address 374 * @gmap: pointer to the guest address space structure 375 * @gaddr: address in the guest address space 376 * 377 * Returns 1 if a TLB flush is required 378 */ 379static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) 380{ 381 unsigned long vmaddr; 382 383 vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, 384 gaddr >> PMD_SHIFT); 385 return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; 386} 387 388/** 389 * gmap_unmap_segment - unmap segment from the guest address space 390 * @gmap: pointer to the guest address space structure 391 * @to: address in the guest address space 392 * @len: length of the memory area to unmap 393 * 394 * Returns 0 if the unmap succeeded, -EINVAL if not. 
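 *
 * Example (illustrative sketch): @to and @len must both be aligned to the
 * 1 MB segment size (PMD_SIZE), otherwise -EINVAL is returned:
 *
 *	rc = gmap_unmap_segment(gmap, 0x80000000UL, 0x100000UL);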
395 */ 396int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) 397{ 398 unsigned long off; 399 int flush; 400 401 BUG_ON(gmap_is_shadow(gmap)); 402 if ((to | len) & (PMD_SIZE - 1)) 403 return -EINVAL; 404 if (len == 0 || to + len < to) 405 return -EINVAL; 406 407 flush = 0; 408 mmap_write_lock(gmap->mm); 409 for (off = 0; off < len; off += PMD_SIZE) 410 flush |= __gmap_unmap_by_gaddr(gmap, to + off); 411 mmap_write_unlock(gmap->mm); 412 if (flush) 413 gmap_flush_tlb(gmap); 414 return 0; 415} 416EXPORT_SYMBOL_GPL(gmap_unmap_segment); 417 418/** 419 * gmap_map_segment - map a segment to the guest address space 420 * @gmap: pointer to the guest address space structure 421 * @from: source address in the parent address space 422 * @to: target address in the guest address space 423 * @len: length of the memory area to map 424 * 425 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. 426 */ 427int gmap_map_segment(struct gmap *gmap, unsigned long from, 428 unsigned long to, unsigned long len) 429{ 430 unsigned long off; 431 int flush; 432 433 BUG_ON(gmap_is_shadow(gmap)); 434 if ((from | to | len) & (PMD_SIZE - 1)) 435 return -EINVAL; 436 if (len == 0 || from + len < from || to + len < to || 437 from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) 438 return -EINVAL; 439 440 flush = 0; 441 mmap_write_lock(gmap->mm); 442 for (off = 0; off < len; off += PMD_SIZE) { 443 /* Remove old translation */ 444 flush |= __gmap_unmap_by_gaddr(gmap, to + off); 445 /* Store new translation */ 446 if (radix_tree_insert(&gmap->guest_to_host, 447 (to + off) >> PMD_SHIFT, 448 (void *) from + off)) 449 break; 450 } 451 mmap_write_unlock(gmap->mm); 452 if (flush) 453 gmap_flush_tlb(gmap); 454 if (off >= len) 455 return 0; 456 gmap_unmap_segment(gmap, to, len); 457 return -ENOMEM; 458} 459EXPORT_SYMBOL_GPL(gmap_map_segment); 460 461/** 462 * __gmap_translate - translate a guest address to a user space address 463 * @gmap: pointer to guest mapping meta data structure 464 * @gaddr: guest address 465 * 466 * Returns user space address which corresponds to the guest address or 467 * -EFAULT if no such mapping exists. 468 * This function does not establish potentially missing page table entries. 469 * The mmap_lock of the mm that belongs to the address space must be held 470 * when this function gets called. 471 * 472 * Note: Can also be called for shadow gmaps. 473 */ 474unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) 475{ 476 unsigned long vmaddr; 477 478 vmaddr = (unsigned long) 479 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); 480 /* Note: guest_to_host is empty for a shadow gmap */ 481 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; 482} 483EXPORT_SYMBOL_GPL(__gmap_translate); 484 485/** 486 * gmap_translate - translate a guest address to a user space address 487 * @gmap: pointer to guest mapping meta data structure 488 * @gaddr: guest address 489 * 490 * Returns user space address which corresponds to the guest address or 491 * -EFAULT if no such mapping exists. 492 * This function does not establish potentially missing page table entries. 
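 *
 * Example (illustrative sketch, mirroring how __gmap_translate is used
 * elsewhere in this file): the returned value doubles as an error code
 * and is checked with IS_ERR_VALUE():
 *
 *	vmaddr = gmap_translate(gmap, gaddr);
 *	if (IS_ERR_VALUE(vmaddr))
 *		return -EFAULT;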
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	mmap_read_lock(gmap->mm);
	rc = __gmap_translate(gmap, gaddr);
	mmap_read_unlock(gmap->mm);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
void gmap_unlink(struct mm_struct *mm, unsigned long *table,
		 unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
	rcu_read_unlock();
}

static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
			   unsigned long gaddr);

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_lock of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	u64 unprot;
	int rc;

	BUG_ON(gmap_is_shadow(gmap));
	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & _REGION1_MASK))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & _REGION2_MASK))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & _REGION3_MASK))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	p4d = p4d_offset(pgd, vmaddr);
	VM_BUG_ON(p4d_none(*p4d));
	pud = pud_offset(p4d, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	/* large puds cannot yet be handled */
	if (pud_large(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* Are we allowed to use huge pages? */
	if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
		return -EFAULT;
	/* Link gmap segment table entry location to page table.
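	 * Note: the radix tree node for host_to_guest is preallocated with
	 * radix_tree_preload() before the pmd lock and the guest_table_lock
	 * are taken, because radix_tree_insert() below runs under those
	 * spinlocks and therefore must not sleep.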
*/ 597 rc = radix_tree_preload(GFP_KERNEL); 598 if (rc) 599 return rc; 600 ptl = pmd_lock(mm, pmd); 601 spin_lock(&gmap->guest_table_lock); 602 if (*table == _SEGMENT_ENTRY_EMPTY) { 603 rc = radix_tree_insert(&gmap->host_to_guest, 604 vmaddr >> PMD_SHIFT, table); 605 if (!rc) { 606 if (pmd_large(*pmd)) { 607 *table = (pmd_val(*pmd) & 608 _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) 609 | _SEGMENT_ENTRY_GMAP_UC; 610 } else 611 *table = pmd_val(*pmd) & 612 _SEGMENT_ENTRY_HARDWARE_BITS; 613 } 614 } else if (*table & _SEGMENT_ENTRY_PROTECT && 615 !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { 616 unprot = (u64)*table; 617 unprot &= ~_SEGMENT_ENTRY_PROTECT; 618 unprot |= _SEGMENT_ENTRY_GMAP_UC; 619 gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); 620 } 621 spin_unlock(&gmap->guest_table_lock); 622 spin_unlock(ptl); 623 radix_tree_preload_end(); 624 return rc; 625} 626 627/** 628 * gmap_fault - resolve a fault on a guest address 629 * @gmap: pointer to guest mapping meta data structure 630 * @gaddr: guest address 631 * @fault_flags: flags to pass down to handle_mm_fault() 632 * 633 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT 634 * if the vm address is already mapped to a different guest segment. 635 */ 636int gmap_fault(struct gmap *gmap, unsigned long gaddr, 637 unsigned int fault_flags) 638{ 639 unsigned long vmaddr; 640 int rc; 641 bool unlocked; 642 643 mmap_read_lock(gmap->mm); 644 645retry: 646 unlocked = false; 647 vmaddr = __gmap_translate(gmap, gaddr); 648 if (IS_ERR_VALUE(vmaddr)) { 649 rc = vmaddr; 650 goto out_up; 651 } 652 if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, 653 &unlocked)) { 654 rc = -EFAULT; 655 goto out_up; 656 } 657 /* 658 * In the case that fixup_user_fault unlocked the mmap_lock during 659 * faultin redo __gmap_translate to not race with a map/unmap_segment. 660 */ 661 if (unlocked) 662 goto retry; 663 664 rc = __gmap_link(gmap, gaddr, vmaddr); 665out_up: 666 mmap_read_unlock(gmap->mm); 667 return rc; 668} 669EXPORT_SYMBOL_GPL(gmap_fault); 670 671/* 672 * this function is assumed to be called with mmap_lock held 673 */ 674void __gmap_zap(struct gmap *gmap, unsigned long gaddr) 675{ 676 unsigned long vmaddr; 677 spinlock_t *ptl; 678 pte_t *ptep; 679 680 /* Find the vm address for the guest address */ 681 vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, 682 gaddr >> PMD_SHIFT); 683 if (vmaddr) { 684 vmaddr |= gaddr & ~PMD_MASK; 685 /* Get pointer to the page table entry */ 686 ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); 687 if (likely(ptep)) { 688 ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); 689 pte_unmap_unlock(ptep, ptl); 690 } 691 } 692} 693EXPORT_SYMBOL_GPL(__gmap_zap); 694 695void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) 696{ 697 unsigned long gaddr, vmaddr, size; 698 struct vm_area_struct *vma; 699 700 mmap_read_lock(gmap->mm); 701 for (gaddr = from; gaddr < to; 702 gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { 703 /* Find the vm address for the guest address */ 704 vmaddr = (unsigned long) 705 radix_tree_lookup(&gmap->guest_to_host, 706 gaddr >> PMD_SHIFT); 707 if (!vmaddr) 708 continue; 709 vmaddr |= gaddr & ~PMD_MASK; 710 /* Find vma in the parent mm */ 711 vma = find_vma(gmap->mm, vmaddr); 712 if (!vma) 713 continue; 714 /* 715 * We do not discard pages that are backed by 716 * hugetlbfs, so we don't have to refault them. 
717 */ 718 if (is_vm_hugetlb_page(vma)) 719 continue; 720 size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); 721 zap_page_range(vma, vmaddr, size); 722 } 723 mmap_read_unlock(gmap->mm); 724} 725EXPORT_SYMBOL_GPL(gmap_discard); 726 727static LIST_HEAD(gmap_notifier_list); 728static DEFINE_SPINLOCK(gmap_notifier_lock); 729 730/** 731 * gmap_register_pte_notifier - register a pte invalidation callback 732 * @nb: pointer to the gmap notifier block 733 */ 734void gmap_register_pte_notifier(struct gmap_notifier *nb) 735{ 736 spin_lock(&gmap_notifier_lock); 737 list_add_rcu(&nb->list, &gmap_notifier_list); 738 spin_unlock(&gmap_notifier_lock); 739} 740EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); 741 742/** 743 * gmap_unregister_pte_notifier - remove a pte invalidation callback 744 * @nb: pointer to the gmap notifier block 745 */ 746void gmap_unregister_pte_notifier(struct gmap_notifier *nb) 747{ 748 spin_lock(&gmap_notifier_lock); 749 list_del_rcu(&nb->list); 750 spin_unlock(&gmap_notifier_lock); 751 synchronize_rcu(); 752} 753EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); 754 755/** 756 * gmap_call_notifier - call all registered invalidation callbacks 757 * @gmap: pointer to guest mapping meta data structure 758 * @start: start virtual address in the guest address space 759 * @end: end virtual address in the guest address space 760 */ 761static void gmap_call_notifier(struct gmap *gmap, unsigned long start, 762 unsigned long end) 763{ 764 struct gmap_notifier *nb; 765 766 list_for_each_entry(nb, &gmap_notifier_list, list) 767 nb->notifier_call(gmap, start, end); 768} 769 770/** 771 * gmap_table_walk - walk the gmap page tables 772 * @gmap: pointer to guest mapping meta data structure 773 * @gaddr: virtual address in the guest address space 774 * @level: page table level to stop at 775 * 776 * Returns a table entry pointer for the given guest address and @level 777 * @level=0 : returns a pointer to a page table table entry (or NULL) 778 * @level=1 : returns a pointer to a segment table entry (or NULL) 779 * @level=2 : returns a pointer to a region-3 table entry (or NULL) 780 * @level=3 : returns a pointer to a region-2 table entry (or NULL) 781 * @level=4 : returns a pointer to a region-1 table entry (or NULL) 782 * 783 * Returns NULL if the gmap page tables could not be walked to the 784 * requested level. 785 * 786 * Note: Can also be called for shadow gmaps. 
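 *
 * Example (illustrative, mirroring callers in this file): level 1 yields
 * the segment table entry that is used as a pmd, level 4 the region-1
 * table entry:
 *
 *	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
 *	r1e = gmap_table_walk(sg, saddr, 4);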
787 */ 788static inline unsigned long *gmap_table_walk(struct gmap *gmap, 789 unsigned long gaddr, int level) 790{ 791 const int asce_type = gmap->asce & _ASCE_TYPE_MASK; 792 unsigned long *table = gmap->table; 793 794 if (gmap_is_shadow(gmap) && gmap->removed) 795 return NULL; 796 797 if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) 798 return NULL; 799 800 if (asce_type != _ASCE_TYPE_REGION1 && 801 gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) 802 return NULL; 803 804 switch (asce_type) { 805 case _ASCE_TYPE_REGION1: 806 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; 807 if (level == 4) 808 break; 809 if (*table & _REGION_ENTRY_INVALID) 810 return NULL; 811 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 812 fallthrough; 813 case _ASCE_TYPE_REGION2: 814 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; 815 if (level == 3) 816 break; 817 if (*table & _REGION_ENTRY_INVALID) 818 return NULL; 819 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 820 fallthrough; 821 case _ASCE_TYPE_REGION3: 822 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; 823 if (level == 2) 824 break; 825 if (*table & _REGION_ENTRY_INVALID) 826 return NULL; 827 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 828 fallthrough; 829 case _ASCE_TYPE_SEGMENT: 830 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; 831 if (level == 1) 832 break; 833 if (*table & _REGION_ENTRY_INVALID) 834 return NULL; 835 table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); 836 table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; 837 } 838 return table; 839} 840 841/** 842 * gmap_pte_op_walk - walk the gmap page table, get the page table lock 843 * and return the pte pointer 844 * @gmap: pointer to guest mapping meta data structure 845 * @gaddr: virtual address in the guest address space 846 * @ptl: pointer to the spinlock pointer 847 * 848 * Returns a pointer to the locked pte for a guest address, or NULL 849 */ 850static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, 851 spinlock_t **ptl) 852{ 853 unsigned long *table; 854 855 BUG_ON(gmap_is_shadow(gmap)); 856 /* Walk the gmap page table, lock and get pte pointer */ 857 table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ 858 if (!table || *table & _SEGMENT_ENTRY_INVALID) 859 return NULL; 860 return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); 861} 862 863/** 864 * gmap_pte_op_fixup - force a page in and connect the gmap page table 865 * @gmap: pointer to guest mapping meta data structure 866 * @gaddr: virtual address in the guest address space 867 * @vmaddr: address in the host process address space 868 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 869 * 870 * Returns 0 if the caller can retry __gmap_translate (might fail again), 871 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing 872 * up or connecting the gmap page table. 873 */ 874static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, 875 unsigned long vmaddr, int prot) 876{ 877 struct mm_struct *mm = gmap->mm; 878 unsigned int fault_flags; 879 bool unlocked = false; 880 881 BUG_ON(gmap_is_shadow(gmap)); 882 fault_flags = (prot == PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; 883 if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) 884 return -EFAULT; 885 if (unlocked) 886 /* lost mmap_lock, caller has to retry __gmap_translate */ 887 return 0; 888 /* Connect the page tables */ 889 return __gmap_link(gmap, gaddr, vmaddr); 890} 891 892/** 893 * gmap_pte_op_end - release the page table lock 894 * @ptl: pointer to the spinlock pointer 895 */ 896static void gmap_pte_op_end(spinlock_t *ptl) 897{ 898 if (ptl) 899 spin_unlock(ptl); 900} 901 902/** 903 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock 904 * and return the pmd pointer 905 * @gmap: pointer to guest mapping meta data structure 906 * @gaddr: virtual address in the guest address space 907 * 908 * Returns a pointer to the pmd for a guest address, or NULL 909 */ 910static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) 911{ 912 pmd_t *pmdp; 913 914 BUG_ON(gmap_is_shadow(gmap)); 915 pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); 916 if (!pmdp) 917 return NULL; 918 919 /* without huge pages, there is no need to take the table lock */ 920 if (!gmap->mm->context.allow_gmap_hpage_1m) 921 return pmd_none(*pmdp) ? NULL : pmdp; 922 923 spin_lock(&gmap->guest_table_lock); 924 if (pmd_none(*pmdp)) { 925 spin_unlock(&gmap->guest_table_lock); 926 return NULL; 927 } 928 929 /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ 930 if (!pmd_large(*pmdp)) 931 spin_unlock(&gmap->guest_table_lock); 932 return pmdp; 933} 934 935/** 936 * gmap_pmd_op_end - release the guest_table_lock if needed 937 * @gmap: pointer to the guest mapping meta data structure 938 * @pmdp: pointer to the pmd 939 */ 940static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) 941{ 942 if (pmd_large(*pmdp)) 943 spin_unlock(&gmap->guest_table_lock); 944} 945 946/* 947 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits 948 * @pmdp: pointer to the pmd to be protected 949 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 950 * @bits: notification bits to set 951 * 952 * Returns: 953 * 0 if successfully protected 954 * -EAGAIN if a fixup is needed 955 * -EINVAL if unsupported notifier bits have been specified 956 * 957 * Expected to be called with sg->mm->mmap_lock in read and 958 * guest_table_lock held. 
959 */ 960static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, 961 pmd_t *pmdp, int prot, unsigned long bits) 962{ 963 int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; 964 int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; 965 pmd_t new = *pmdp; 966 967 /* Fixup needed */ 968 if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) 969 return -EAGAIN; 970 971 if (prot == PROT_NONE && !pmd_i) { 972 pmd_val(new) |= _SEGMENT_ENTRY_INVALID; 973 gmap_pmdp_xchg(gmap, pmdp, new, gaddr); 974 } 975 976 if (prot == PROT_READ && !pmd_p) { 977 pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; 978 pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; 979 gmap_pmdp_xchg(gmap, pmdp, new, gaddr); 980 } 981 982 if (bits & GMAP_NOTIFY_MPROT) 983 pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; 984 985 /* Shadow GMAP protection needs split PMDs */ 986 if (bits & GMAP_NOTIFY_SHADOW) 987 return -EINVAL; 988 989 return 0; 990} 991 992/* 993 * gmap_protect_pte - remove access rights to memory and set pgste bits 994 * @gmap: pointer to guest mapping meta data structure 995 * @gaddr: virtual address in the guest address space 996 * @pmdp: pointer to the pmd associated with the pte 997 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 998 * @bits: notification bits to set 999 * 1000 * Returns 0 if successfully protected, -ENOMEM if out of memory and 1001 * -EAGAIN if a fixup is needed. 1002 * 1003 * Expected to be called with sg->mm->mmap_lock in read 1004 */ 1005static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, 1006 pmd_t *pmdp, int prot, unsigned long bits) 1007{ 1008 int rc; 1009 pte_t *ptep; 1010 spinlock_t *ptl = NULL; 1011 unsigned long pbits = 0; 1012 1013 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) 1014 return -EAGAIN; 1015 1016 ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); 1017 if (!ptep) 1018 return -ENOMEM; 1019 1020 pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; 1021 pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; 1022 /* Protect and unlock. */ 1023 rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); 1024 gmap_pte_op_end(ptl); 1025 return rc; 1026} 1027 1028/* 1029 * gmap_protect_range - remove access rights to memory and set pgste bits 1030 * @gmap: pointer to guest mapping meta data structure 1031 * @gaddr: virtual address in the guest address space 1032 * @len: size of area 1033 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 1034 * @bits: pgste notification bits to set 1035 * 1036 * Returns 0 if successfully protected, -ENOMEM if out of memory and 1037 * -EFAULT if gaddr is invalid (or mapping for shadows is missing). 1038 * 1039 * Called with sg->mm->mmap_lock in read. 1040 */ 1041static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, 1042 unsigned long len, int prot, unsigned long bits) 1043{ 1044 unsigned long vmaddr, dist; 1045 pmd_t *pmdp; 1046 int rc; 1047 1048 BUG_ON(gmap_is_shadow(gmap)); 1049 while (len) { 1050 rc = -EAGAIN; 1051 pmdp = gmap_pmd_op_walk(gmap, gaddr); 1052 if (pmdp) { 1053 if (!pmd_large(*pmdp)) { 1054 rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, 1055 bits); 1056 if (!rc) { 1057 len -= PAGE_SIZE; 1058 gaddr += PAGE_SIZE; 1059 } 1060 } else { 1061 rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, 1062 bits); 1063 if (!rc) { 1064 dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); 1065 len = len < dist ? 
0 : len - dist; 1066 gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; 1067 } 1068 } 1069 gmap_pmd_op_end(gmap, pmdp); 1070 } 1071 if (rc) { 1072 if (rc == -EINVAL) 1073 return rc; 1074 1075 /* -EAGAIN, fixup of userspace mm and gmap */ 1076 vmaddr = __gmap_translate(gmap, gaddr); 1077 if (IS_ERR_VALUE(vmaddr)) 1078 return vmaddr; 1079 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); 1080 if (rc) 1081 return rc; 1082 } 1083 } 1084 return 0; 1085} 1086 1087/** 1088 * gmap_mprotect_notify - change access rights for a range of ptes and 1089 * call the notifier if any pte changes again 1090 * @gmap: pointer to guest mapping meta data structure 1091 * @gaddr: virtual address in the guest address space 1092 * @len: size of area 1093 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 1094 * 1095 * Returns 0 if for each page in the given range a gmap mapping exists, 1096 * the new access rights could be set and the notifier could be armed. 1097 * If the gmap mapping is missing for one or more pages -EFAULT is 1098 * returned. If no memory could be allocated -ENOMEM is returned. 1099 * This function establishes missing page table entries. 1100 */ 1101int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, 1102 unsigned long len, int prot) 1103{ 1104 int rc; 1105 1106 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) 1107 return -EINVAL; 1108 if (!MACHINE_HAS_ESOP && prot == PROT_READ) 1109 return -EINVAL; 1110 mmap_read_lock(gmap->mm); 1111 rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); 1112 mmap_read_unlock(gmap->mm); 1113 return rc; 1114} 1115EXPORT_SYMBOL_GPL(gmap_mprotect_notify); 1116 1117/** 1118 * gmap_read_table - get an unsigned long value from a guest page table using 1119 * absolute addressing, without marking the page referenced. 1120 * @gmap: pointer to guest mapping meta data structure 1121 * @gaddr: virtual address in the guest address space 1122 * @val: pointer to the unsigned long value to return 1123 * 1124 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT 1125 * if reading using the virtual address failed. -EINVAL if called on a gmap 1126 * shadow. 1127 * 1128 * Called with gmap->mm->mmap_lock in read. 1129 */ 1130int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) 1131{ 1132 unsigned long address, vmaddr; 1133 spinlock_t *ptl; 1134 pte_t *ptep, pte; 1135 int rc; 1136 1137 if (gmap_is_shadow(gmap)) 1138 return -EINVAL; 1139 1140 while (1) { 1141 rc = -EAGAIN; 1142 ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); 1143 if (ptep) { 1144 pte = *ptep; 1145 if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { 1146 address = pte_val(pte) & PAGE_MASK; 1147 address += gaddr & ~PAGE_MASK; 1148 *val = *(unsigned long *) address; 1149 pte_val(*ptep) |= _PAGE_YOUNG; 1150 /* Do *NOT* clear the _PAGE_INVALID bit! 
*/ 1151 rc = 0; 1152 } 1153 gmap_pte_op_end(ptl); 1154 } 1155 if (!rc) 1156 break; 1157 vmaddr = __gmap_translate(gmap, gaddr); 1158 if (IS_ERR_VALUE(vmaddr)) { 1159 rc = vmaddr; 1160 break; 1161 } 1162 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); 1163 if (rc) 1164 break; 1165 } 1166 return rc; 1167} 1168EXPORT_SYMBOL_GPL(gmap_read_table); 1169 1170/** 1171 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree 1172 * @sg: pointer to the shadow guest address space structure 1173 * @vmaddr: vm address associated with the rmap 1174 * @rmap: pointer to the rmap structure 1175 * 1176 * Called with the sg->guest_table_lock 1177 */ 1178static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, 1179 struct gmap_rmap *rmap) 1180{ 1181 void __rcu **slot; 1182 1183 BUG_ON(!gmap_is_shadow(sg)); 1184 slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); 1185 if (slot) { 1186 rmap->next = radix_tree_deref_slot_protected(slot, 1187 &sg->guest_table_lock); 1188 radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); 1189 } else { 1190 rmap->next = NULL; 1191 radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, 1192 rmap); 1193 } 1194} 1195 1196/** 1197 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap 1198 * @sg: pointer to the shadow guest address space structure 1199 * @raddr: rmap address in the shadow gmap 1200 * @paddr: address in the parent guest address space 1201 * @len: length of the memory area to protect 1202 * 1203 * Returns 0 if successfully protected and the rmap was created, -ENOMEM 1204 * if out of memory and -EFAULT if paddr is invalid. 1205 */ 1206static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, 1207 unsigned long paddr, unsigned long len) 1208{ 1209 struct gmap *parent; 1210 struct gmap_rmap *rmap; 1211 unsigned long vmaddr; 1212 spinlock_t *ptl; 1213 pte_t *ptep; 1214 int rc; 1215 1216 BUG_ON(!gmap_is_shadow(sg)); 1217 parent = sg->parent; 1218 while (len) { 1219 vmaddr = __gmap_translate(parent, paddr); 1220 if (IS_ERR_VALUE(vmaddr)) 1221 return vmaddr; 1222 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); 1223 if (!rmap) 1224 return -ENOMEM; 1225 rmap->raddr = raddr; 1226 rc = radix_tree_preload(GFP_KERNEL); 1227 if (rc) { 1228 kfree(rmap); 1229 return rc; 1230 } 1231 rc = -EAGAIN; 1232 ptep = gmap_pte_op_walk(parent, paddr, &ptl); 1233 if (ptep) { 1234 spin_lock(&sg->guest_table_lock); 1235 rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, 1236 PGSTE_VSIE_BIT); 1237 if (!rc) 1238 gmap_insert_rmap(sg, vmaddr, rmap); 1239 spin_unlock(&sg->guest_table_lock); 1240 gmap_pte_op_end(ptl); 1241 } 1242 radix_tree_preload_end(); 1243 if (rc) { 1244 kfree(rmap); 1245 rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); 1246 if (rc) 1247 return rc; 1248 continue; 1249 } 1250 paddr += PAGE_SIZE; 1251 len -= PAGE_SIZE; 1252 } 1253 return 0; 1254} 1255 1256#define _SHADOW_RMAP_MASK 0x7 1257#define _SHADOW_RMAP_REGION1 0x5 1258#define _SHADOW_RMAP_REGION2 0x4 1259#define _SHADOW_RMAP_REGION3 0x3 1260#define _SHADOW_RMAP_SEGMENT 0x2 1261#define _SHADOW_RMAP_PGTABLE 0x1 1262 1263/** 1264 * gmap_idte_one - invalidate a single region or segment table entry 1265 * @asce: region or segment table *origin* + table-type bits 1266 * @vaddr: virtual address to identify the table entry to flush 1267 * 1268 * The invalid bit of a single region or segment table entry is set 1269 * and the associated TLB entries depending on the entry are flushed. 
1270 * The table-type of the @asce identifies the portion of the @vaddr 1271 * that is used as the invalidation index. 1272 */ 1273static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) 1274{ 1275 asm volatile( 1276 " .insn rrf,0xb98e0000,%0,%1,0,0" 1277 : : "a" (asce), "a" (vaddr) : "cc", "memory"); 1278} 1279 1280/** 1281 * gmap_unshadow_page - remove a page from a shadow page table 1282 * @sg: pointer to the shadow guest address space structure 1283 * @raddr: rmap address in the shadow guest address space 1284 * 1285 * Called with the sg->guest_table_lock 1286 */ 1287static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) 1288{ 1289 unsigned long *table; 1290 1291 BUG_ON(!gmap_is_shadow(sg)); 1292 table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ 1293 if (!table || *table & _PAGE_INVALID) 1294 return; 1295 gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); 1296 ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); 1297} 1298 1299/** 1300 * __gmap_unshadow_pgt - remove all entries from a shadow page table 1301 * @sg: pointer to the shadow guest address space structure 1302 * @raddr: rmap address in the shadow guest address space 1303 * @pgt: pointer to the start of a shadow page table 1304 * 1305 * Called with the sg->guest_table_lock 1306 */ 1307static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, 1308 unsigned long *pgt) 1309{ 1310 int i; 1311 1312 BUG_ON(!gmap_is_shadow(sg)); 1313 for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) 1314 pgt[i] = _PAGE_INVALID; 1315} 1316 1317/** 1318 * gmap_unshadow_pgt - remove a shadow page table from a segment entry 1319 * @sg: pointer to the shadow guest address space structure 1320 * @raddr: address in the shadow guest address space 1321 * 1322 * Called with the sg->guest_table_lock 1323 */ 1324static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) 1325{ 1326 unsigned long sto, *ste, *pgt; 1327 struct page *page; 1328 1329 BUG_ON(!gmap_is_shadow(sg)); 1330 ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ 1331 if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) 1332 return; 1333 gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); 1334 sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); 1335 gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); 1336 pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); 1337 *ste = _SEGMENT_ENTRY_EMPTY; 1338 __gmap_unshadow_pgt(sg, raddr, pgt); 1339 /* Free page table */ 1340 page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); 1341 list_del(&page->lru); 1342 page_table_free_pgste(page); 1343} 1344 1345/** 1346 * __gmap_unshadow_sgt - remove all entries from a shadow segment table 1347 * @sg: pointer to the shadow guest address space structure 1348 * @raddr: rmap address in the shadow guest address space 1349 * @sgt: pointer to the start of a shadow segment table 1350 * 1351 * Called with the sg->guest_table_lock 1352 */ 1353static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, 1354 unsigned long *sgt) 1355{ 1356 unsigned long *pgt; 1357 struct page *page; 1358 int i; 1359 1360 BUG_ON(!gmap_is_shadow(sg)); 1361 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { 1362 if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) 1363 continue; 1364 pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); 1365 sgt[i] = _SEGMENT_ENTRY_EMPTY; 1366 __gmap_unshadow_pgt(sg, raddr, pgt); 1367 /* Free page table */ 1368 page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); 1369 list_del(&page->lru); 1370 
page_table_free_pgste(page); 1371 } 1372} 1373 1374/** 1375 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry 1376 * @sg: pointer to the shadow guest address space structure 1377 * @raddr: rmap address in the shadow guest address space 1378 * 1379 * Called with the shadow->guest_table_lock 1380 */ 1381static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) 1382{ 1383 unsigned long r3o, *r3e, *sgt; 1384 struct page *page; 1385 1386 BUG_ON(!gmap_is_shadow(sg)); 1387 r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ 1388 if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) 1389 return; 1390 gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); 1391 r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); 1392 gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); 1393 sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); 1394 *r3e = _REGION3_ENTRY_EMPTY; 1395 __gmap_unshadow_sgt(sg, raddr, sgt); 1396 /* Free segment table */ 1397 page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); 1398 list_del(&page->lru); 1399 __free_pages(page, CRST_ALLOC_ORDER); 1400} 1401 1402/** 1403 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table 1404 * @sg: pointer to the shadow guest address space structure 1405 * @raddr: address in the shadow guest address space 1406 * @r3t: pointer to the start of a shadow region-3 table 1407 * 1408 * Called with the sg->guest_table_lock 1409 */ 1410static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, 1411 unsigned long *r3t) 1412{ 1413 unsigned long *sgt; 1414 struct page *page; 1415 int i; 1416 1417 BUG_ON(!gmap_is_shadow(sg)); 1418 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { 1419 if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) 1420 continue; 1421 sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); 1422 r3t[i] = _REGION3_ENTRY_EMPTY; 1423 __gmap_unshadow_sgt(sg, raddr, sgt); 1424 /* Free segment table */ 1425 page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); 1426 list_del(&page->lru); 1427 __free_pages(page, CRST_ALLOC_ORDER); 1428 } 1429} 1430 1431/** 1432 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry 1433 * @sg: pointer to the shadow guest address space structure 1434 * @raddr: rmap address in the shadow guest address space 1435 * 1436 * Called with the sg->guest_table_lock 1437 */ 1438static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) 1439{ 1440 unsigned long r2o, *r2e, *r3t; 1441 struct page *page; 1442 1443 BUG_ON(!gmap_is_shadow(sg)); 1444 r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ 1445 if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) 1446 return; 1447 gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); 1448 r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); 1449 gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); 1450 r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); 1451 *r2e = _REGION2_ENTRY_EMPTY; 1452 __gmap_unshadow_r3t(sg, raddr, r3t); 1453 /* Free region 3 table */ 1454 page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); 1455 list_del(&page->lru); 1456 __free_pages(page, CRST_ALLOC_ORDER); 1457} 1458 1459/** 1460 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table 1461 * @sg: pointer to the shadow guest address space structure 1462 * @raddr: rmap address in the shadow guest address space 1463 * @r2t: pointer to the start of a shadow region-2 table 1464 * 1465 * Called with the sg->guest_table_lock 1466 */ 1467static void __gmap_unshadow_r2t(struct gmap *sg, unsigned 
long raddr, 1468 unsigned long *r2t) 1469{ 1470 unsigned long *r3t; 1471 struct page *page; 1472 int i; 1473 1474 BUG_ON(!gmap_is_shadow(sg)); 1475 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { 1476 if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) 1477 continue; 1478 r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); 1479 r2t[i] = _REGION2_ENTRY_EMPTY; 1480 __gmap_unshadow_r3t(sg, raddr, r3t); 1481 /* Free region 3 table */ 1482 page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); 1483 list_del(&page->lru); 1484 __free_pages(page, CRST_ALLOC_ORDER); 1485 } 1486} 1487 1488/** 1489 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry 1490 * @sg: pointer to the shadow guest address space structure 1491 * @raddr: rmap address in the shadow guest address space 1492 * 1493 * Called with the sg->guest_table_lock 1494 */ 1495static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) 1496{ 1497 unsigned long r1o, *r1e, *r2t; 1498 struct page *page; 1499 1500 BUG_ON(!gmap_is_shadow(sg)); 1501 r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ 1502 if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) 1503 return; 1504 gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); 1505 r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); 1506 gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); 1507 r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); 1508 *r1e = _REGION1_ENTRY_EMPTY; 1509 __gmap_unshadow_r2t(sg, raddr, r2t); 1510 /* Free region 2 table */ 1511 page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); 1512 list_del(&page->lru); 1513 __free_pages(page, CRST_ALLOC_ORDER); 1514} 1515 1516/** 1517 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table 1518 * @sg: pointer to the shadow guest address space structure 1519 * @raddr: rmap address in the shadow guest address space 1520 * @r1t: pointer to the start of a shadow region-1 table 1521 * 1522 * Called with the shadow->guest_table_lock 1523 */ 1524static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, 1525 unsigned long *r1t) 1526{ 1527 unsigned long asce, *r2t; 1528 struct page *page; 1529 int i; 1530 1531 BUG_ON(!gmap_is_shadow(sg)); 1532 asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; 1533 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { 1534 if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) 1535 continue; 1536 r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); 1537 __gmap_unshadow_r2t(sg, raddr, r2t); 1538 /* Clear entry and flush translation r1t -> r2t */ 1539 gmap_idte_one(asce, raddr); 1540 r1t[i] = _REGION1_ENTRY_EMPTY; 1541 /* Free region 2 table */ 1542 page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); 1543 list_del(&page->lru); 1544 __free_pages(page, CRST_ALLOC_ORDER); 1545 } 1546} 1547 1548/** 1549 * gmap_unshadow - remove a shadow page table completely 1550 * @sg: pointer to the shadow guest address space structure 1551 * 1552 * Called with sg->guest_table_lock 1553 */ 1554static void gmap_unshadow(struct gmap *sg) 1555{ 1556 unsigned long *table; 1557 1558 BUG_ON(!gmap_is_shadow(sg)); 1559 if (sg->removed) 1560 return; 1561 sg->removed = 1; 1562 gmap_call_notifier(sg, 0, -1UL); 1563 gmap_flush_tlb(sg); 1564 table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); 1565 switch (sg->asce & _ASCE_TYPE_MASK) { 1566 case _ASCE_TYPE_REGION1: 1567 __gmap_unshadow_r1t(sg, 0, table); 1568 break; 1569 case _ASCE_TYPE_REGION2: 1570 __gmap_unshadow_r2t(sg, 0, table); 1571 break; 1572 case _ASCE_TYPE_REGION3: 1573 __gmap_unshadow_r3t(sg, 0, table); 1574 break; 1575 
case _ASCE_TYPE_SEGMENT: 1576 __gmap_unshadow_sgt(sg, 0, table); 1577 break; 1578 } 1579} 1580 1581/** 1582 * gmap_find_shadow - find a specific asce in the list of shadow tables 1583 * @parent: pointer to the parent gmap 1584 * @asce: ASCE for which the shadow table is created 1585 * @edat_level: edat level to be used for the shadow translation 1586 * 1587 * Returns the pointer to a gmap if a shadow table with the given asce is 1588 * already available, ERR_PTR(-EAGAIN) if another one is just being created, 1589 * otherwise NULL 1590 */ 1591static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, 1592 int edat_level) 1593{ 1594 struct gmap *sg; 1595 1596 list_for_each_entry(sg, &parent->children, list) { 1597 if (sg->orig_asce != asce || sg->edat_level != edat_level || 1598 sg->removed) 1599 continue; 1600 if (!sg->initialized) 1601 return ERR_PTR(-EAGAIN); 1602 refcount_inc(&sg->ref_count); 1603 return sg; 1604 } 1605 return NULL; 1606} 1607 1608/** 1609 * gmap_shadow_valid - check if a shadow guest address space matches the 1610 * given properties and is still valid 1611 * @sg: pointer to the shadow guest address space structure 1612 * @asce: ASCE for which the shadow table is requested 1613 * @edat_level: edat level to be used for the shadow translation 1614 * 1615 * Returns 1 if the gmap shadow is still valid and matches the given 1616 * properties, the caller can continue using it. Returns 0 otherwise, the 1617 * caller has to request a new shadow gmap in this case. 1618 * 1619 */ 1620int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) 1621{ 1622 if (sg->removed) 1623 return 0; 1624 return sg->orig_asce == asce && sg->edat_level == edat_level; 1625} 1626EXPORT_SYMBOL_GPL(gmap_shadow_valid); 1627 1628/** 1629 * gmap_shadow - create/find a shadow guest address space 1630 * @parent: pointer to the parent gmap 1631 * @asce: ASCE for which the shadow table is created 1632 * @edat_level: edat level to be used for the shadow translation 1633 * 1634 * The pages of the top level page table referred by the asce parameter 1635 * will be set to read-only and marked in the PGSTEs of the kvm process. 1636 * The shadow table will be removed automatically on any change to the 1637 * PTE mapping for the source table. 1638 * 1639 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 1640 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 1641 * parent gmap table could not be protected. 
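 *
 * Example (illustrative sketch; the vcpu->arch.gmap field is an assumption
 * made for this example, the real callers live outside of this file):
 *
 *	sg = gmap_shadow(vcpu->arch.gmap, asce, edat_level);
 *	if (IS_ERR(sg))
 *		return PTR_ERR(sg);
 *
 * The reference returned here is dropped again with gmap_put(sg) once the
 * shadow is no longer needed.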
1642 */ 1643struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, 1644 int edat_level) 1645{ 1646 struct gmap *sg, *new; 1647 unsigned long limit; 1648 int rc; 1649 1650 BUG_ON(parent->mm->context.allow_gmap_hpage_1m); 1651 BUG_ON(gmap_is_shadow(parent)); 1652 spin_lock(&parent->shadow_lock); 1653 sg = gmap_find_shadow(parent, asce, edat_level); 1654 spin_unlock(&parent->shadow_lock); 1655 if (sg) 1656 return sg; 1657 /* Create a new shadow gmap */ 1658 limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); 1659 if (asce & _ASCE_REAL_SPACE) 1660 limit = -1UL; 1661 new = gmap_alloc(limit); 1662 if (!new) 1663 return ERR_PTR(-ENOMEM); 1664 new->mm = parent->mm; 1665 new->parent = gmap_get(parent); 1666 new->orig_asce = asce; 1667 new->edat_level = edat_level; 1668 new->initialized = false; 1669 spin_lock(&parent->shadow_lock); 1670 /* Recheck if another CPU created the same shadow */ 1671 sg = gmap_find_shadow(parent, asce, edat_level); 1672 if (sg) { 1673 spin_unlock(&parent->shadow_lock); 1674 gmap_free(new); 1675 return sg; 1676 } 1677 if (asce & _ASCE_REAL_SPACE) { 1678 /* only allow one real-space gmap shadow */ 1679 list_for_each_entry(sg, &parent->children, list) { 1680 if (sg->orig_asce & _ASCE_REAL_SPACE) { 1681 spin_lock(&sg->guest_table_lock); 1682 gmap_unshadow(sg); 1683 spin_unlock(&sg->guest_table_lock); 1684 list_del(&sg->list); 1685 gmap_put(sg); 1686 break; 1687 } 1688 } 1689 } 1690 refcount_set(&new->ref_count, 2); 1691 list_add(&new->list, &parent->children); 1692 if (asce & _ASCE_REAL_SPACE) { 1693 /* nothing to protect, return right away */ 1694 new->initialized = true; 1695 spin_unlock(&parent->shadow_lock); 1696 return new; 1697 } 1698 spin_unlock(&parent->shadow_lock); 1699 /* protect after insertion, so it will get properly invalidated */ 1700 mmap_read_lock(parent->mm); 1701 rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, 1702 ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, 1703 PROT_READ, GMAP_NOTIFY_SHADOW); 1704 mmap_read_unlock(parent->mm); 1705 spin_lock(&parent->shadow_lock); 1706 new->initialized = true; 1707 if (rc) { 1708 list_del(&new->list); 1709 gmap_free(new); 1710 new = ERR_PTR(rc); 1711 } 1712 spin_unlock(&parent->shadow_lock); 1713 return new; 1714} 1715EXPORT_SYMBOL_GPL(gmap_shadow); 1716 1717/** 1718 * gmap_shadow_r2t - create an empty shadow region 2 table 1719 * @sg: pointer to the shadow guest address space structure 1720 * @saddr: faulting address in the shadow gmap 1721 * @r2t: parent gmap address of the region 2 table to get shadowed 1722 * @fake: r2t references contiguous guest memory block, not a r2t 1723 * 1724 * The r2t parameter specifies the address of the source table. The 1725 * four pages of the source table are made read-only in the parent gmap 1726 * address space. A write to the source table area @r2t will automatically 1727 * remove the shadow r2 table and all of its decendents. 1728 * 1729 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 1730 * shadow table structure is incomplete, -ENOMEM if out of memory and 1731 * -EFAULT if an address in the parent gmap could not be resolved. 1732 * 1733 * Called with sg->mm->mmap_lock in read. 
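 *
 * Example (illustrative sketch of a caller, not taken from this file):
 * -EAGAIN means the shadow table structure was incomplete or changed
 * concurrently, so the operation is simply retried:
 *
 *	rc = gmap_shadow_r2t(sg, saddr, r2t_origin, fake);
 *	if (rc == -EAGAIN)
 *		goto retry;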
1734 */ 1735int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, 1736 int fake) 1737{ 1738 unsigned long raddr, origin, offset, len; 1739 unsigned long *s_r2t, *table; 1740 struct page *page; 1741 int rc; 1742 1743 BUG_ON(!gmap_is_shadow(sg)); 1744 /* Allocate a shadow region second table */ 1745 page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); 1746 if (!page) 1747 return -ENOMEM; 1748 page->index = r2t & _REGION_ENTRY_ORIGIN; 1749 if (fake) 1750 page->index |= GMAP_SHADOW_FAKE_TABLE; 1751 s_r2t = (unsigned long *) page_to_phys(page); 1752 /* Install shadow region second table */ 1753 spin_lock(&sg->guest_table_lock); 1754 table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ 1755 if (!table) { 1756 rc = -EAGAIN; /* Race with unshadow */ 1757 goto out_free; 1758 } 1759 if (!(*table & _REGION_ENTRY_INVALID)) { 1760 rc = 0; /* Already established */ 1761 goto out_free; 1762 } else if (*table & _REGION_ENTRY_ORIGIN) { 1763 rc = -EAGAIN; /* Race with shadow */ 1764 goto out_free; 1765 } 1766 crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); 1767 /* mark as invalid as long as the parent table is not protected */ 1768 *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | 1769 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; 1770 if (sg->edat_level >= 1) 1771 *table |= (r2t & _REGION_ENTRY_PROTECT); 1772 list_add(&page->lru, &sg->crst_list); 1773 if (fake) { 1774 /* nothing to protect for fake tables */ 1775 *table &= ~_REGION_ENTRY_INVALID; 1776 spin_unlock(&sg->guest_table_lock); 1777 return 0; 1778 } 1779 spin_unlock(&sg->guest_table_lock); 1780 /* Make r2t read-only in parent gmap page table */ 1781 raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; 1782 origin = r2t & _REGION_ENTRY_ORIGIN; 1783 offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; 1784 len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; 1785 rc = gmap_protect_rmap(sg, raddr, origin + offset, len); 1786 spin_lock(&sg->guest_table_lock); 1787 if (!rc) { 1788 table = gmap_table_walk(sg, saddr, 4); 1789 if (!table || (*table & _REGION_ENTRY_ORIGIN) != 1790 (unsigned long) s_r2t) 1791 rc = -EAGAIN; /* Race with unshadow */ 1792 else 1793 *table &= ~_REGION_ENTRY_INVALID; 1794 } else { 1795 gmap_unshadow_r2t(sg, raddr); 1796 } 1797 spin_unlock(&sg->guest_table_lock); 1798 return rc; 1799out_free: 1800 spin_unlock(&sg->guest_table_lock); 1801 __free_pages(page, CRST_ALLOC_ORDER); 1802 return rc; 1803} 1804EXPORT_SYMBOL_GPL(gmap_shadow_r2t); 1805 1806/** 1807 * gmap_shadow_r3t - create a shadow region 3 table 1808 * @sg: pointer to the shadow guest address space structure 1809 * @saddr: faulting address in the shadow gmap 1810 * @r3t: parent gmap address of the region 3 table to get shadowed 1811 * @fake: r3t references contiguous guest memory block, not a r3t 1812 * 1813 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 1814 * shadow table structure is incomplete, -ENOMEM if out of memory and 1815 * -EFAULT if an address in the parent gmap could not be resolved. 1816 * 1817 * Called with sg->mm->mmap_lock in read. 
1818 */ 1819int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, 1820 int fake) 1821{ 1822 unsigned long raddr, origin, offset, len; 1823 unsigned long *s_r3t, *table; 1824 struct page *page; 1825 int rc; 1826 1827 BUG_ON(!gmap_is_shadow(sg)); 1828 /* Allocate a shadow region third table */ 1829 page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); 1830 if (!page) 1831 return -ENOMEM; 1832 page->index = r3t & _REGION_ENTRY_ORIGIN; 1833 if (fake) 1834 page->index |= GMAP_SHADOW_FAKE_TABLE; 1835 s_r3t = (unsigned long *) page_to_phys(page); 1836 /* Install shadow region third table */ 1837 spin_lock(&sg->guest_table_lock); 1838 table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ 1839 if (!table) { 1840 rc = -EAGAIN; /* Race with unshadow */ 1841 goto out_free; 1842 } 1843 if (!(*table & _REGION_ENTRY_INVALID)) { 1844 rc = 0; /* Already established */ 1845 goto out_free; 1846 } else if (*table & _REGION_ENTRY_ORIGIN) { 1847 rc = -EAGAIN; /* Race with shadow */ 1848 goto out_free; 1849 } 1850 crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); 1851 /* mark as invalid as long as the parent table is not protected */ 1852 *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | 1853 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; 1854 if (sg->edat_level >= 1) 1855 *table |= (r3t & _REGION_ENTRY_PROTECT); 1856 list_add(&page->lru, &sg->crst_list); 1857 if (fake) { 1858 /* nothing to protect for fake tables */ 1859 *table &= ~_REGION_ENTRY_INVALID; 1860 spin_unlock(&sg->guest_table_lock); 1861 return 0; 1862 } 1863 spin_unlock(&sg->guest_table_lock); 1864 /* Make r3t read-only in parent gmap page table */ 1865 raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; 1866 origin = r3t & _REGION_ENTRY_ORIGIN; 1867 offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; 1868 len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; 1869 rc = gmap_protect_rmap(sg, raddr, origin + offset, len); 1870 spin_lock(&sg->guest_table_lock); 1871 if (!rc) { 1872 table = gmap_table_walk(sg, saddr, 3); 1873 if (!table || (*table & _REGION_ENTRY_ORIGIN) != 1874 (unsigned long) s_r3t) 1875 rc = -EAGAIN; /* Race with unshadow */ 1876 else 1877 *table &= ~_REGION_ENTRY_INVALID; 1878 } else { 1879 gmap_unshadow_r3t(sg, raddr); 1880 } 1881 spin_unlock(&sg->guest_table_lock); 1882 return rc; 1883out_free: 1884 spin_unlock(&sg->guest_table_lock); 1885 __free_pages(page, CRST_ALLOC_ORDER); 1886 return rc; 1887} 1888EXPORT_SYMBOL_GPL(gmap_shadow_r3t); 1889 1890/** 1891 * gmap_shadow_sgt - create a shadow segment table 1892 * @sg: pointer to the shadow guest address space structure 1893 * @saddr: faulting address in the shadow gmap 1894 * @sgt: parent gmap address of the segment table to get shadowed 1895 * @fake: sgt references contiguous guest memory block, not a sgt 1896 * 1897 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the 1898 * shadow table structure is incomplete, -ENOMEM if out of memory and 1899 * -EFAULT if an address in the parent gmap could not be resolved. 1900 * 1901 * Called with sg->mm->mmap_lock in read.
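 *
 * Like for gmap_shadow_r2t(), the source table is made read-only in the
 * parent gmap address space; a write to the source table area @sgt will
 * remove the shadow segment table and all of its descendants.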
1902 */ 1903int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, 1904 int fake) 1905{ 1906 unsigned long raddr, origin, offset, len; 1907 unsigned long *s_sgt, *table; 1908 struct page *page; 1909 int rc; 1910 1911 BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); 1912 /* Allocate a shadow segment table */ 1913 page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); 1914 if (!page) 1915 return -ENOMEM; 1916 page->index = sgt & _REGION_ENTRY_ORIGIN; 1917 if (fake) 1918 page->index |= GMAP_SHADOW_FAKE_TABLE; 1919 s_sgt = (unsigned long *) page_to_phys(page); 1920 /* Install shadow segment table */ 1921 spin_lock(&sg->guest_table_lock); 1922 table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ 1923 if (!table) { 1924 rc = -EAGAIN; /* Race with unshadow */ 1925 goto out_free; 1926 } 1927 if (!(*table & _REGION_ENTRY_INVALID)) { 1928 rc = 0; /* Already established */ 1929 goto out_free; 1930 } else if (*table & _REGION_ENTRY_ORIGIN) { 1931 rc = -EAGAIN; /* Race with shadow */ 1932 goto out_free; 1933 } 1934 crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); 1935 /* mark as invalid as long as the parent table is not protected */ 1936 *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | 1937 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; 1938 if (sg->edat_level >= 1) 1939 *table |= sgt & _REGION_ENTRY_PROTECT; 1940 list_add(&page->lru, &sg->crst_list); 1941 if (fake) { 1942 /* nothing to protect for fake tables */ 1943 *table &= ~_REGION_ENTRY_INVALID; 1944 spin_unlock(&sg->guest_table_lock); 1945 return 0; 1946 } 1947 spin_unlock(&sg->guest_table_lock); 1948 /* Make sgt read-only in parent gmap page table */ 1949 raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; 1950 origin = sgt & _REGION_ENTRY_ORIGIN; 1951 offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; 1952 len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; 1953 rc = gmap_protect_rmap(sg, raddr, origin + offset, len); 1954 spin_lock(&sg->guest_table_lock); 1955 if (!rc) { 1956 table = gmap_table_walk(sg, saddr, 2); 1957 if (!table || (*table & _REGION_ENTRY_ORIGIN) != 1958 (unsigned long) s_sgt) 1959 rc = -EAGAIN; /* Race with unshadow */ 1960 else 1961 *table &= ~_REGION_ENTRY_INVALID; 1962 } else { 1963 gmap_unshadow_sgt(sg, raddr); 1964 } 1965 spin_unlock(&sg->guest_table_lock); 1966 return rc; 1967out_free: 1968 spin_unlock(&sg->guest_table_lock); 1969 __free_pages(page, CRST_ALLOC_ORDER); 1970 return rc; 1971} 1972EXPORT_SYMBOL_GPL(gmap_shadow_sgt); 1973 1974/** 1975 * gmap_shadow_pgt_lookup - find a shadow page table 1976 * @sg: pointer to the shadow guest address space structure 1977 * @saddr: the address in the shadow guest address space 1978 * @pgt: parent gmap address of the page table to get shadowed 1979 * @dat_protection: if the pgtable is marked as protected by dat 1980 * @fake: pgt references contiguous guest memory block, not a pgtable 1981 * 1982 * Returns 0 if the shadow page table was found and -EAGAIN if the page 1983 * table was not found. 1984 * 1985 * Called with sg->mm->mmap_lock in read.
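 *
 * On success, @pgt contains the parent gmap address of the shadowed page
 * table (without the fake marker), @dat_protection reflects the protection
 * bit of the shadow segment table entry and @fake tells whether the entry
 * references a contiguous guest memory block instead of a page table.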
1986 */ 1987int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, 1988 unsigned long *pgt, int *dat_protection, 1989 int *fake) 1990{ 1991 unsigned long *table; 1992 struct page *page; 1993 int rc; 1994 1995 BUG_ON(!gmap_is_shadow(sg)); 1996 spin_lock(&sg->guest_table_lock); 1997 table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ 1998 if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { 1999 /* Shadow page tables are full pages (pte+pgste) */ 2000 page = pfn_to_page(*table >> PAGE_SHIFT); 2001 *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; 2002 *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); 2003 *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); 2004 rc = 0; 2005 } else { 2006 rc = -EAGAIN; 2007 } 2008 spin_unlock(&sg->guest_table_lock); 2009 return rc; 2010 2011} 2012EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); 2013 2014/** 2015 * gmap_shadow_pgt - instantiate a shadow page table 2016 * @sg: pointer to the shadow guest address space structure 2017 * @saddr: faulting address in the shadow gmap 2018 * @pgt: parent gmap address of the page table to get shadowed 2019 * @fake: pgt references contiguous guest memory block, not a pgtable 2020 * 2021 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 2022 * shadow table structure is incomplete, -ENOMEM if out of memory and 2023 * -EFAULT if an address in the parent gmap could not be resolved. 2024 * 2025 * Called with sg->mm->mmap_lock in read. 2026 */ 2027int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, 2028 int fake) 2029{ 2030 unsigned long raddr, origin; 2031 unsigned long *s_pgt, *table; 2032 struct page *page; 2033 int rc; 2034 2035 BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); 2036 /* Allocate a shadow page table */ 2037 page = page_table_alloc_pgste(sg->mm); 2038 if (!page) 2039 return -ENOMEM; 2040 page->index = pgt & _SEGMENT_ENTRY_ORIGIN; 2041 if (fake) 2042 page->index |= GMAP_SHADOW_FAKE_TABLE; 2043 s_pgt = (unsigned long *) page_to_phys(page); 2044 /* Install shadow page table */ 2045 spin_lock(&sg->guest_table_lock); 2046 table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ 2047 if (!table) { 2048 rc = -EAGAIN; /* Race with unshadow */ 2049 goto out_free; 2050 } 2051 if (!(*table & _SEGMENT_ENTRY_INVALID)) { 2052 rc = 0; /* Already established */ 2053 goto out_free; 2054 } else if (*table & _SEGMENT_ENTRY_ORIGIN) { 2055 rc = -EAGAIN; /* Race with shadow */ 2056 goto out_free; 2057 } 2058 /* mark as invalid as long as the parent table is not protected */ 2059 *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | 2060 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; 2061 list_add(&page->lru, &sg->pt_list); 2062 if (fake) { 2063 /* nothing to protect for fake tables */ 2064 *table &= ~_SEGMENT_ENTRY_INVALID; 2065 spin_unlock(&sg->guest_table_lock); 2066 return 0; 2067 } 2068 spin_unlock(&sg->guest_table_lock); 2069 /* Make pgt read-only in parent gmap page table (not the pgste) */ 2070 raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; 2071 origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; 2072 rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); 2073 spin_lock(&sg->guest_table_lock); 2074 if (!rc) { 2075 table = gmap_table_walk(sg, saddr, 1); 2076 if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != 2077 (unsigned long) s_pgt) 2078 rc = -EAGAIN; /* Race with unshadow */ 2079 else 2080 *table &= ~_SEGMENT_ENTRY_INVALID; 2081 } else { 2082 gmap_unshadow_pgt(sg, raddr); 2083 } 2084 spin_unlock(&sg->guest_table_lock); 2085
return rc; 2086out_free: 2087 spin_unlock(&sg->guest_table_lock); 2088 page_table_free_pgste(page); 2089 return rc; 2090 2091} 2092EXPORT_SYMBOL_GPL(gmap_shadow_pgt); 2093 2094/** 2095 * gmap_shadow_page - create a shadow page mapping 2096 * @sg: pointer to the shadow guest address space structure 2097 * @saddr: faulting address in the shadow gmap 2098 * @pte: pte in parent gmap address space to get shadowed 2099 * 2100 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 2101 * shadow table structure is incomplete, -ENOMEM if out of memory and 2102 * -EFAULT if an address in the parent gmap could not be resolved. 2103 * 2104 * Called with sg->mm->mmap_lock in read. 2105 */ 2106int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) 2107{ 2108 struct gmap *parent; 2109 struct gmap_rmap *rmap; 2110 unsigned long vmaddr, paddr; 2111 spinlock_t *ptl; 2112 pte_t *sptep, *tptep; 2113 int prot; 2114 int rc; 2115 2116 BUG_ON(!gmap_is_shadow(sg)); 2117 parent = sg->parent; 2118 prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; 2119 2120 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); 2121 if (!rmap) 2122 return -ENOMEM; 2123 rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; 2124 2125 while (1) { 2126 paddr = pte_val(pte) & PAGE_MASK; 2127 vmaddr = __gmap_translate(parent, paddr); 2128 if (IS_ERR_VALUE(vmaddr)) { 2129 rc = vmaddr; 2130 break; 2131 } 2132 rc = radix_tree_preload(GFP_KERNEL); 2133 if (rc) 2134 break; 2135 rc = -EAGAIN; 2136 sptep = gmap_pte_op_walk(parent, paddr, &ptl); 2137 if (sptep) { 2138 spin_lock(&sg->guest_table_lock); 2139 /* Get page table pointer */ 2140 tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); 2141 if (!tptep) { 2142 spin_unlock(&sg->guest_table_lock); 2143 gmap_pte_op_end(ptl); 2144 radix_tree_preload_end(); 2145 break; 2146 } 2147 rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); 2148 if (rc > 0) { 2149 /* Success and a new mapping */ 2150 gmap_insert_rmap(sg, vmaddr, rmap); 2151 rmap = NULL; 2152 rc = 0; 2153 } 2154 gmap_pte_op_end(ptl); 2155 spin_unlock(&sg->guest_table_lock); 2156 } 2157 radix_tree_preload_end(); 2158 if (!rc) 2159 break; 2160 rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); 2161 if (rc) 2162 break; 2163 } 2164 kfree(rmap); 2165 return rc; 2166} 2167EXPORT_SYMBOL_GPL(gmap_shadow_page); 2168 2169/** 2170 * gmap_shadow_notify - handle notifications for shadow gmap 2171 * 2172 * Called with sg->parent->shadow_lock. 
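 *
 * @sg: pointer to the shadow guest address space structure
 * @vmaddr: affected host virtual address in the parent address space
 * @gaddr: affected guest address in the parent gmap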
2173 */ 2174static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, 2175 unsigned long gaddr) 2176{ 2177 struct gmap_rmap *rmap, *rnext, *head; 2178 unsigned long start, end, bits, raddr; 2179 2180 BUG_ON(!gmap_is_shadow(sg)); 2181 2182 spin_lock(&sg->guest_table_lock); 2183 if (sg->removed) { 2184 spin_unlock(&sg->guest_table_lock); 2185 return; 2186 } 2187 /* Check for top level table */ 2188 start = sg->orig_asce & _ASCE_ORIGIN; 2189 end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; 2190 if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && 2191 gaddr < end) { 2192 /* The complete shadow table has to go */ 2193 gmap_unshadow(sg); 2194 spin_unlock(&sg->guest_table_lock); 2195 list_del(&sg->list); 2196 gmap_put(sg); 2197 return; 2198 } 2199 /* Remove the page table tree for one specific entry */ 2200 head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); 2201 gmap_for_each_rmap_safe(rmap, rnext, head) { 2202 bits = rmap->raddr & _SHADOW_RMAP_MASK; 2203 raddr = rmap->raddr ^ bits; 2204 switch (bits) { 2205 case _SHADOW_RMAP_REGION1: 2206 gmap_unshadow_r2t(sg, raddr); 2207 break; 2208 case _SHADOW_RMAP_REGION2: 2209 gmap_unshadow_r3t(sg, raddr); 2210 break; 2211 case _SHADOW_RMAP_REGION3: 2212 gmap_unshadow_sgt(sg, raddr); 2213 break; 2214 case _SHADOW_RMAP_SEGMENT: 2215 gmap_unshadow_pgt(sg, raddr); 2216 break; 2217 case _SHADOW_RMAP_PGTABLE: 2218 gmap_unshadow_page(sg, raddr); 2219 break; 2220 } 2221 kfree(rmap); 2222 } 2223 spin_unlock(&sg->guest_table_lock); 2224} 2225 2226/** 2227 * ptep_notify - call all invalidation callbacks for a specific pte. 2228 * @mm: pointer to the process mm_struct 2229 * @vmaddr: virtual address in the process address space 2230 * @pte: pointer to the page table entry 2231 * @bits: bits from the pgste that caused the notify call 2232 * 2233 * This function is assumed to be called with the page table lock held 2234 * for the pte to notify.
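 *
 * For PGSTE_VSIE_BIT notifications the shadows (children) of each affected
 * gmap are updated via gmap_shadow_notify(); for PGSTE_IN_BIT notifications
 * the registered gmap notifiers are called.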
2235 */ 2236void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, 2237 pte_t *pte, unsigned long bits) 2238{ 2239 unsigned long offset, gaddr = 0; 2240 unsigned long *table; 2241 struct gmap *gmap, *sg, *next; 2242 2243 offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); 2244 offset = offset * (PAGE_SIZE / sizeof(pte_t)); 2245 rcu_read_lock(); 2246 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2247 spin_lock(&gmap->guest_table_lock); 2248 table = radix_tree_lookup(&gmap->host_to_guest, 2249 vmaddr >> PMD_SHIFT); 2250 if (table) 2251 gaddr = __gmap_segment_gaddr(table) + offset; 2252 spin_unlock(&gmap->guest_table_lock); 2253 if (!table) 2254 continue; 2255 2256 if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { 2257 spin_lock(&gmap->shadow_lock); 2258 list_for_each_entry_safe(sg, next, 2259 &gmap->children, list) 2260 gmap_shadow_notify(sg, vmaddr, gaddr); 2261 spin_unlock(&gmap->shadow_lock); 2262 } 2263 if (bits & PGSTE_IN_BIT) 2264 gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); 2265 } 2266 rcu_read_unlock(); 2267} 2268EXPORT_SYMBOL_GPL(ptep_notify); 2269 2270static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, 2271 unsigned long gaddr) 2272{ 2273 pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; 2274 gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); 2275} 2276 2277/** 2278 * gmap_pmdp_xchg - exchange a gmap pmd with another 2279 * @gmap: pointer to the guest address space structure 2280 * @pmdp: pointer to the pmd entry 2281 * @new: replacement entry 2282 * @gaddr: the affected guest address 2283 * 2284 * This function is assumed to be called with the guest_table_lock 2285 * held. 2286 */ 2287static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, 2288 unsigned long gaddr) 2289{ 2290 gaddr &= HPAGE_MASK; 2291 pmdp_notify_gmap(gmap, pmdp, gaddr); 2292 pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; 2293 if (MACHINE_HAS_TLB_GUEST) 2294 __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, 2295 IDTE_GLOBAL); 2296 else if (MACHINE_HAS_IDTE) 2297 __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); 2298 else 2299 __pmdp_csp(pmdp); 2300 *pmdp = new; 2301} 2302 2303static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, 2304 int purge) 2305{ 2306 pmd_t *pmdp; 2307 struct gmap *gmap; 2308 unsigned long gaddr; 2309 2310 rcu_read_lock(); 2311 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2312 spin_lock(&gmap->guest_table_lock); 2313 pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, 2314 vmaddr >> PMD_SHIFT); 2315 if (pmdp) { 2316 gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); 2317 pmdp_notify_gmap(gmap, pmdp, gaddr); 2318 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2319 _SEGMENT_ENTRY_GMAP_UC)); 2320 if (purge) 2321 __pmdp_csp(pmdp); 2322 pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; 2323 } 2324 spin_unlock(&gmap->guest_table_lock); 2325 } 2326 rcu_read_unlock(); 2327} 2328 2329/** 2330 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without 2331 * flushing 2332 * @mm: pointer to the process mm_struct 2333 * @vmaddr: virtual address in the process address space 2334 */ 2335void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) 2336{ 2337 gmap_pmdp_clear(mm, vmaddr, 0); 2338} 2339EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); 2340 2341/** 2342 * gmap_pmdp_csp - csp all affected guest pmd entries 2343 * @mm: pointer to the process mm_struct 2344 * @vmaddr: virtual address in the process address space 2345 */ 2346void gmap_pmdp_csp(struct 
mm_struct *mm, unsigned long vmaddr) 2347{ 2348 gmap_pmdp_clear(mm, vmaddr, 1); 2349} 2350EXPORT_SYMBOL_GPL(gmap_pmdp_csp); 2351 2352/** 2353 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry 2354 * @mm: pointer to the process mm_struct 2355 * @vmaddr: virtual address in the process address space 2356 */ 2357void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) 2358{ 2359 unsigned long *entry, gaddr; 2360 struct gmap *gmap; 2361 pmd_t *pmdp; 2362 2363 rcu_read_lock(); 2364 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2365 spin_lock(&gmap->guest_table_lock); 2366 entry = radix_tree_delete(&gmap->host_to_guest, 2367 vmaddr >> PMD_SHIFT); 2368 if (entry) { 2369 pmdp = (pmd_t *)entry; 2370 gaddr = __gmap_segment_gaddr(entry); 2371 pmdp_notify_gmap(gmap, pmdp, gaddr); 2372 WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2373 _SEGMENT_ENTRY_GMAP_UC)); 2374 if (MACHINE_HAS_TLB_GUEST) 2375 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, 2376 gmap->asce, IDTE_LOCAL); 2377 else if (MACHINE_HAS_IDTE) 2378 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); 2379 *entry = _SEGMENT_ENTRY_EMPTY; 2380 } 2381 spin_unlock(&gmap->guest_table_lock); 2382 } 2383 rcu_read_unlock(); 2384} 2385EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); 2386 2387/** 2388 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry 2389 * @mm: pointer to the process mm_struct 2390 * @vmaddr: virtual address in the process address space 2391 */ 2392void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) 2393{ 2394 unsigned long *entry, gaddr; 2395 struct gmap *gmap; 2396 pmd_t *pmdp; 2397 2398 rcu_read_lock(); 2399 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2400 spin_lock(&gmap->guest_table_lock); 2401 entry = radix_tree_delete(&gmap->host_to_guest, 2402 vmaddr >> PMD_SHIFT); 2403 if (entry) { 2404 pmdp = (pmd_t *)entry; 2405 gaddr = __gmap_segment_gaddr(entry); 2406 pmdp_notify_gmap(gmap, pmdp, gaddr); 2407 WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2408 _SEGMENT_ENTRY_GMAP_UC)); 2409 if (MACHINE_HAS_TLB_GUEST) 2410 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, 2411 gmap->asce, IDTE_GLOBAL); 2412 else if (MACHINE_HAS_IDTE) 2413 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); 2414 else 2415 __pmdp_csp(pmdp); 2416 *entry = _SEGMENT_ENTRY_EMPTY; 2417 } 2418 spin_unlock(&gmap->guest_table_lock); 2419 } 2420 rcu_read_unlock(); 2421} 2422EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); 2423 2424/** 2425 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status 2426 * @gmap: pointer to guest address space 2427 * @pmdp: pointer to the pmd to be tested 2428 * @gaddr: virtual address in the guest address space 2429 * 2430 * This function is assumed to be called with the guest_table_lock 2431 * held. 
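 *
 * Returns true if the segment was dirty; in that case the dirty (UC)
 * indication is cleared and the segment is write-protected again.
 * Returns false for invalid or clean segments.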
2432 */ 2433static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, 2434 unsigned long gaddr) 2435{ 2436 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) 2437 return false; 2438 2439 /* Already protected memory, which did not change is clean */ 2440 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && 2441 !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) 2442 return false; 2443 2444 /* Clear UC indication and reset protection */ 2445 pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; 2446 gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); 2447 return true; 2448} 2449 2450/** 2451 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment 2452 * @gmap: pointer to guest address space 2453 * @bitmap: dirty bitmap for this pmd 2454 * @gaddr: virtual address in the guest address space 2455 * @vmaddr: virtual address in the host address space 2456 * 2457 * This function is assumed to be called with the guest_table_lock 2458 * held. 2459 */ 2460void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], 2461 unsigned long gaddr, unsigned long vmaddr) 2462{ 2463 int i; 2464 pmd_t *pmdp; 2465 pte_t *ptep; 2466 spinlock_t *ptl; 2467 2468 pmdp = gmap_pmd_op_walk(gmap, gaddr); 2469 if (!pmdp) 2470 return; 2471 2472 if (pmd_large(*pmdp)) { 2473 if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) 2474 bitmap_fill(bitmap, _PAGE_ENTRIES); 2475 } else { 2476 for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { 2477 ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); 2478 if (!ptep) 2479 continue; 2480 if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) 2481 set_bit(i, bitmap); 2482 spin_unlock(ptl); 2483 } 2484 } 2485 gmap_pmd_op_end(gmap, pmdp); 2486} 2487EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); 2488 2489#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2490static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, 2491 unsigned long end, struct mm_walk *walk) 2492{ 2493 struct vm_area_struct *vma = walk->vma; 2494 2495 split_huge_pmd(vma, pmd, addr); 2496 return 0; 2497} 2498 2499static const struct mm_walk_ops thp_split_walk_ops = { 2500 .pmd_entry = thp_split_walk_pmd_entry, 2501}; 2502 2503static inline void thp_split_mm(struct mm_struct *mm) 2504{ 2505 struct vm_area_struct *vma; 2506 2507 for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { 2508 vma->vm_flags &= ~VM_HUGEPAGE; 2509 vma->vm_flags |= VM_NOHUGEPAGE; 2510 walk_page_vma(vma, &thp_split_walk_ops, NULL); 2511 } 2512 mm->def_flags |= VM_NOHUGEPAGE; 2513} 2514#else 2515static inline void thp_split_mm(struct mm_struct *mm) 2516{ 2517} 2518#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2519 2520/* 2521 * Remove all empty zero pages from the mapping for lazy refaulting 2522 * - This must be called after mm->context.has_pgste is set, to avoid 2523 * future creation of zero pages 2524 * - This must be called after THP was enabled 2525 */ 2526static int __zap_zero_pages(pmd_t *pmd, unsigned long start, 2527 unsigned long end, struct mm_walk *walk) 2528{ 2529 unsigned long addr; 2530 2531 for (addr = start; addr != end; addr += PAGE_SIZE) { 2532 pte_t *ptep; 2533 spinlock_t *ptl; 2534 2535 ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 2536 if (is_zero_pfn(pte_pfn(*ptep))) 2537 ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); 2538 pte_unmap_unlock(ptep, ptl); 2539 } 2540 return 0; 2541} 2542 2543static const struct mm_walk_ops zap_zero_walk_ops = { 2544 .pmd_entry = __zap_zero_pages, 2545}; 2546 2547/* 2548 * switch on pgstes for its userspace process (for kvm) 2549 */ 2550int s390_enable_sie(void) 
2551{ 2552 struct mm_struct *mm = current->mm; 2553 2554 /* Do we have pgstes? if yes, we are done */ 2555 if (mm_has_pgste(mm)) 2556 return 0; 2557 /* Fail if the page tables are 2K */ 2558 if (!mm_alloc_pgste(mm)) 2559 return -EINVAL; 2560 mmap_write_lock(mm); 2561 mm->context.has_pgste = 1; 2562 /* split thp mappings and disable thp for future mappings */ 2563 thp_split_mm(mm); 2564 walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); 2565 mmap_write_unlock(mm); 2566 return 0; 2567} 2568EXPORT_SYMBOL_GPL(s390_enable_sie); 2569 2570int gmap_mark_unmergeable(void) 2571{ 2572 struct mm_struct *mm = current->mm; 2573 struct vm_area_struct *vma; 2574 int ret; 2575 2576 for (vma = mm->mmap; vma; vma = vma->vm_next) { 2577 ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, 2578 MADV_UNMERGEABLE, &vma->vm_flags); 2579 if (ret) 2580 return ret; 2581 } 2582 mm->def_flags &= ~VM_MERGEABLE; 2583 return 0; 2584} 2585EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); 2586 2587/* 2588 * Enable storage key handling from now on and initialize the storage 2589 * keys with the default key. 2590 */ 2591static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, 2592 unsigned long next, struct mm_walk *walk) 2593{ 2594 /* Clear storage key */ 2595 ptep_zap_key(walk->mm, addr, pte); 2596 return 0; 2597} 2598 2599/* 2600 * Give a chance to schedule after setting a key to 256 pages. 2601 * We only hold the mm lock, which is a rwsem and the kvm srcu. 2602 * Both can sleep. 2603 */ 2604static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, 2605 unsigned long next, struct mm_walk *walk) 2606{ 2607 cond_resched(); 2608 return 0; 2609} 2610 2611static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, 2612 unsigned long hmask, unsigned long next, 2613 struct mm_walk *walk) 2614{ 2615 pmd_t *pmd = (pmd_t *)pte; 2616 unsigned long start, end; 2617 struct page *page = pmd_page(*pmd); 2618 2619 /* 2620 * The write check makes sure we do not set a key on shared 2621 * memory. This is needed as the walker does not differentiate 2622 * between actual guest memory and the process executable or 2623 * shared libraries. 2624 */ 2625 if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || 2626 !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) 2627 return 0; 2628 2629 start = pmd_val(*pmd) & HPAGE_MASK; 2630 end = start + HPAGE_SIZE - 1; 2631 __storage_key_init_range(start, end); 2632 set_bit(PG_arch_1, &page->flags); 2633 cond_resched(); 2634 return 0; 2635} 2636 2637static const struct mm_walk_ops enable_skey_walk_ops = { 2638 .hugetlb_entry = __s390_enable_skey_hugetlb, 2639 .pte_entry = __s390_enable_skey_pte, 2640 .pmd_entry = __s390_enable_skey_pmd, 2641}; 2642 2643int s390_enable_skey(void) 2644{ 2645 struct mm_struct *mm = current->mm; 2646 int rc = 0; 2647 2648 mmap_write_lock(mm); 2649 if (mm_uses_skeys(mm)) 2650 goto out_up; 2651 2652 mm->context.uses_skeys = 1; 2653 rc = gmap_mark_unmergeable(); 2654 if (rc) { 2655 mm->context.uses_skeys = 0; 2656 goto out_up; 2657 } 2658 walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); 2659 2660out_up: 2661 mmap_write_unlock(mm); 2662 return rc; 2663} 2664EXPORT_SYMBOL_GPL(s390_enable_skey); 2665 2666/* 2667 * Reset CMMA state, make all pages stable again. 
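 * The page table walk calls ptep_zap_unused() with the reset argument set,
 * which resets the CMMA usage state kept in the PGSTE of every mapped page
 * back to stable.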
2668 */ 2669static int __s390_reset_cmma(pte_t *pte, unsigned long addr, 2670 unsigned long next, struct mm_walk *walk) 2671{ 2672 ptep_zap_unused(walk->mm, addr, pte, 1); 2673 return 0; 2674} 2675 2676static const struct mm_walk_ops reset_cmma_walk_ops = { 2677 .pte_entry = __s390_reset_cmma, 2678}; 2679 2680void s390_reset_cmma(struct mm_struct *mm) 2681{ 2682 mmap_write_lock(mm); 2683 walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); 2684 mmap_write_unlock(mm); 2685} 2686EXPORT_SYMBOL_GPL(s390_reset_cmma); 2687 2688/* 2689 * make inaccessible pages accessible again 2690 */ 2691static int __s390_reset_acc(pte_t *ptep, unsigned long addr, 2692 unsigned long next, struct mm_walk *walk) 2693{ 2694 pte_t pte = READ_ONCE(*ptep); 2695 2696 if (pte_present(pte)) 2697 WARN_ON_ONCE(uv_destroy_page(pte_val(pte) & PAGE_MASK)); 2698 return 0; 2699} 2700 2701static const struct mm_walk_ops reset_acc_walk_ops = { 2702 .pte_entry = __s390_reset_acc, 2703}; 2704 2705#include <linux/sched/mm.h> 2706void s390_reset_acc(struct mm_struct *mm) 2707{ 2708 if (!mm_is_protected(mm)) 2709 return; 2710 /* 2711 * we might be called during 2712 * reset: we walk the pages and clear 2713 * close of all kvm file descriptors: we walk the pages and clear 2714 * exit of process on fd closure: vma already gone, do nothing 2715 */ 2716 if (!mmget_not_zero(mm)) 2717 return; 2718 mmap_read_lock(mm); 2719 walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); 2720 mmap_read_unlock(mm); 2721 mmput(mm); 2722} 2723EXPORT_SYMBOL_GPL(s390_reset_acc); 2724 2725/** 2726 * s390_unlist_old_asce - Remove the topmost level of page tables from the 2727 * list of page tables of the gmap. 2728 * @gmap: the gmap whose table is to be removed 2729 * 2730 * On s390x, KVM keeps a list of all pages containing the page tables of the 2731 * gmap (the CRST list). This list is used at tear down time to free all 2732 * pages that are now not needed anymore. 2733 * 2734 * This function removes the topmost page of the tree (the one pointed to by 2735 * the ASCE) from the CRST list. 2736 * 2737 * This means that it will not be freed when the VM is torn down, and needs 2738 * to be handled separately by the caller, unless a leak is actually 2739 * intended. Notice that this function will only remove the page from the 2740 * list, the page will still be used as a top level page table (and ASCE). 2741 */ 2742void s390_unlist_old_asce(struct gmap *gmap) 2743{ 2744 struct page *old; 2745 2746 old = virt_to_page(gmap->table); 2747 spin_lock(&gmap->guest_table_lock); 2748 list_del(&old->lru); 2749 /* 2750 * Sometimes the topmost page might need to be "removed" multiple 2751 * times, for example if the VM is rebooted into secure mode several 2752 * times concurrently, or if s390_replace_asce fails after calling 2753 * s390_remove_old_asce and is attempted again later. In that case 2754 * the old asce has been removed from the list, and therefore it 2755 * will not be freed when the VM terminates, but the ASCE is still 2756 * in use and still pointed to. 2757 * A subsequent call to replace_asce will follow the pointer and try 2758 * to remove the same page from the list again. 2759 * Therefore it's necessary that the page of the ASCE has valid 2760 * pointers, so list_del can work (and do nothing) without 2761 * dereferencing stale or invalid pointers. 
2762 */ 2763 INIT_LIST_HEAD(&old->lru); 2764 spin_unlock(&gmap->guest_table_lock); 2765} 2766EXPORT_SYMBOL_GPL(s390_unlist_old_asce); 2767 2768/** 2769 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy 2770 * @gmap: the gmap whose ASCE needs to be replaced 2771 * 2772 * If the allocation of the new top level page table fails, the ASCE is not 2773 * replaced. 2774 * In any case, the old ASCE is always removed from the gmap CRST list. 2775 * Therefore the caller has to make sure to save a pointer to it 2776 * beforehand, unless a leak is actually intended. 2777 */ 2778int s390_replace_asce(struct gmap *gmap) 2779{ 2780 unsigned long asce; 2781 struct page *page; 2782 void *table; 2783 2784 s390_unlist_old_asce(gmap); 2785 2786 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); 2787 if (!page) 2788 return -ENOMEM; 2789 page->index = 0; 2790 table = page_to_virt(page); 2791 memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); 2792 2793 /* 2794 * The caller has to deal with the old ASCE, but here we make sure 2795 * the new one is properly added to the CRST list, so that 2796 * it will be freed when the VM is torn down. 2797 */ 2798 spin_lock(&gmap->guest_table_lock); 2799 list_add(&page->lru, &gmap->crst_list); 2800 spin_unlock(&gmap->guest_table_lock); 2801 2802 /* Set new table origin while preserving existing ASCE control bits */ 2803 asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); 2804 WRITE_ONCE(gmap->asce, asce); 2805 WRITE_ONCE(gmap->mm->context.gmap_asce, asce); 2806 WRITE_ONCE(gmap->table, table); 2807 2808 return 0; 2809} 2810EXPORT_SYMBOL_GPL(s390_replace_asce); 2811