18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Copyright 2007-2008 Paul Mackerras, IBM Corp. 48c2ecf20Sopenharmony_ci */ 58c2ecf20Sopenharmony_ci 68c2ecf20Sopenharmony_ci#include <linux/errno.h> 78c2ecf20Sopenharmony_ci#include <linux/kernel.h> 88c2ecf20Sopenharmony_ci#include <linux/gfp.h> 98c2ecf20Sopenharmony_ci#include <linux/types.h> 108c2ecf20Sopenharmony_ci#include <linux/pagewalk.h> 118c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 128c2ecf20Sopenharmony_ci#include <linux/syscalls.h> 138c2ecf20Sopenharmony_ci 148c2ecf20Sopenharmony_ci#include <linux/pgtable.h> 158c2ecf20Sopenharmony_ci#include <linux/uaccess.h> 168c2ecf20Sopenharmony_ci 178c2ecf20Sopenharmony_ci/* 188c2ecf20Sopenharmony_ci * Free all pages allocated for subpage protection maps and pointers. 198c2ecf20Sopenharmony_ci * Also makes sure that the subpage_prot_table structure is 208c2ecf20Sopenharmony_ci * reinitialized for the next user. 218c2ecf20Sopenharmony_ci */ 228c2ecf20Sopenharmony_civoid subpage_prot_free(struct mm_struct *mm) 238c2ecf20Sopenharmony_ci{ 248c2ecf20Sopenharmony_ci struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); 258c2ecf20Sopenharmony_ci unsigned long i, j, addr; 268c2ecf20Sopenharmony_ci u32 **p; 278c2ecf20Sopenharmony_ci 288c2ecf20Sopenharmony_ci if (!spt) 298c2ecf20Sopenharmony_ci return; 308c2ecf20Sopenharmony_ci 318c2ecf20Sopenharmony_ci for (i = 0; i < 4; ++i) { 328c2ecf20Sopenharmony_ci if (spt->low_prot[i]) { 338c2ecf20Sopenharmony_ci free_page((unsigned long)spt->low_prot[i]); 348c2ecf20Sopenharmony_ci spt->low_prot[i] = NULL; 358c2ecf20Sopenharmony_ci } 368c2ecf20Sopenharmony_ci } 378c2ecf20Sopenharmony_ci addr = 0; 388c2ecf20Sopenharmony_ci for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) { 398c2ecf20Sopenharmony_ci p = spt->protptrs[i]; 408c2ecf20Sopenharmony_ci if (!p) 418c2ecf20Sopenharmony_ci continue; 428c2ecf20Sopenharmony_ci spt->protptrs[i] = NULL; 438c2ecf20Sopenharmony_ci for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr; 448c2ecf20Sopenharmony_ci ++j, addr += PAGE_SIZE) 458c2ecf20Sopenharmony_ci if (p[j]) 468c2ecf20Sopenharmony_ci free_page((unsigned long)p[j]); 478c2ecf20Sopenharmony_ci free_page((unsigned long)p); 488c2ecf20Sopenharmony_ci } 498c2ecf20Sopenharmony_ci spt->maxaddr = 0; 508c2ecf20Sopenharmony_ci kfree(spt); 518c2ecf20Sopenharmony_ci} 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_cistatic void hpte_flush_range(struct mm_struct *mm, unsigned long addr, 548c2ecf20Sopenharmony_ci int npages) 558c2ecf20Sopenharmony_ci{ 568c2ecf20Sopenharmony_ci pgd_t *pgd; 578c2ecf20Sopenharmony_ci p4d_t *p4d; 588c2ecf20Sopenharmony_ci pud_t *pud; 598c2ecf20Sopenharmony_ci pmd_t *pmd; 608c2ecf20Sopenharmony_ci pte_t *pte; 618c2ecf20Sopenharmony_ci spinlock_t *ptl; 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci pgd = pgd_offset(mm, addr); 648c2ecf20Sopenharmony_ci p4d = p4d_offset(pgd, addr); 658c2ecf20Sopenharmony_ci if (p4d_none(*p4d)) 668c2ecf20Sopenharmony_ci return; 678c2ecf20Sopenharmony_ci pud = pud_offset(p4d, addr); 688c2ecf20Sopenharmony_ci if (pud_none(*pud)) 698c2ecf20Sopenharmony_ci return; 708c2ecf20Sopenharmony_ci pmd = pmd_offset(pud, addr); 718c2ecf20Sopenharmony_ci if (pmd_none(*pmd)) 728c2ecf20Sopenharmony_ci return; 738c2ecf20Sopenharmony_ci pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 748c2ecf20Sopenharmony_ci arch_enter_lazy_mmu_mode(); 758c2ecf20Sopenharmony_ci for (; npages > 0; --npages) { 768c2ecf20Sopenharmony_ci pte_update(mm, addr, pte, 0, 0, 0); 778c2ecf20Sopenharmony_ci addr += PAGE_SIZE; 788c2ecf20Sopenharmony_ci ++pte; 798c2ecf20Sopenharmony_ci } 808c2ecf20Sopenharmony_ci arch_leave_lazy_mmu_mode(); 818c2ecf20Sopenharmony_ci pte_unmap_unlock(pte - 1, ptl); 828c2ecf20Sopenharmony_ci} 838c2ecf20Sopenharmony_ci 848c2ecf20Sopenharmony_ci/* 858c2ecf20Sopenharmony_ci * Clear the subpage protection map for an address range, allowing 868c2ecf20Sopenharmony_ci * all accesses that are allowed by the pte permissions. 878c2ecf20Sopenharmony_ci */ 888c2ecf20Sopenharmony_cistatic void subpage_prot_clear(unsigned long addr, unsigned long len) 898c2ecf20Sopenharmony_ci{ 908c2ecf20Sopenharmony_ci struct mm_struct *mm = current->mm; 918c2ecf20Sopenharmony_ci struct subpage_prot_table *spt; 928c2ecf20Sopenharmony_ci u32 **spm, *spp; 938c2ecf20Sopenharmony_ci unsigned long i; 948c2ecf20Sopenharmony_ci size_t nw; 958c2ecf20Sopenharmony_ci unsigned long next, limit; 968c2ecf20Sopenharmony_ci 978c2ecf20Sopenharmony_ci mmap_write_lock(mm); 988c2ecf20Sopenharmony_ci 998c2ecf20Sopenharmony_ci spt = mm_ctx_subpage_prot(&mm->context); 1008c2ecf20Sopenharmony_ci if (!spt) 1018c2ecf20Sopenharmony_ci goto err_out; 1028c2ecf20Sopenharmony_ci 1038c2ecf20Sopenharmony_ci limit = addr + len; 1048c2ecf20Sopenharmony_ci if (limit > spt->maxaddr) 1058c2ecf20Sopenharmony_ci limit = spt->maxaddr; 1068c2ecf20Sopenharmony_ci for (; addr < limit; addr = next) { 1078c2ecf20Sopenharmony_ci next = pmd_addr_end(addr, limit); 1088c2ecf20Sopenharmony_ci if (addr < 0x100000000UL) { 1098c2ecf20Sopenharmony_ci spm = spt->low_prot; 1108c2ecf20Sopenharmony_ci } else { 1118c2ecf20Sopenharmony_ci spm = spt->protptrs[addr >> SBP_L3_SHIFT]; 1128c2ecf20Sopenharmony_ci if (!spm) 1138c2ecf20Sopenharmony_ci continue; 1148c2ecf20Sopenharmony_ci } 1158c2ecf20Sopenharmony_ci spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; 1168c2ecf20Sopenharmony_ci if (!spp) 1178c2ecf20Sopenharmony_ci continue; 1188c2ecf20Sopenharmony_ci spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 1218c2ecf20Sopenharmony_ci nw = PTRS_PER_PTE - i; 1228c2ecf20Sopenharmony_ci if (addr + (nw << PAGE_SHIFT) > next) 1238c2ecf20Sopenharmony_ci nw = (next - addr) >> PAGE_SHIFT; 1248c2ecf20Sopenharmony_ci 1258c2ecf20Sopenharmony_ci memset(spp, 0, nw * sizeof(u32)); 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci /* now flush any existing HPTEs for the range */ 1288c2ecf20Sopenharmony_ci hpte_flush_range(mm, addr, nw); 1298c2ecf20Sopenharmony_ci } 1308c2ecf20Sopenharmony_ci 1318c2ecf20Sopenharmony_cierr_out: 1328c2ecf20Sopenharmony_ci mmap_write_unlock(mm); 1338c2ecf20Sopenharmony_ci} 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1368c2ecf20Sopenharmony_cistatic int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, 1378c2ecf20Sopenharmony_ci unsigned long end, struct mm_walk *walk) 1388c2ecf20Sopenharmony_ci{ 1398c2ecf20Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 1408c2ecf20Sopenharmony_ci split_huge_pmd(vma, pmd, addr); 1418c2ecf20Sopenharmony_ci return 0; 1428c2ecf20Sopenharmony_ci} 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_cistatic const struct mm_walk_ops subpage_walk_ops = { 1458c2ecf20Sopenharmony_ci .pmd_entry = subpage_walk_pmd_entry, 1468c2ecf20Sopenharmony_ci}; 1478c2ecf20Sopenharmony_ci 1488c2ecf20Sopenharmony_cistatic void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, 1498c2ecf20Sopenharmony_ci unsigned long len) 1508c2ecf20Sopenharmony_ci{ 1518c2ecf20Sopenharmony_ci struct vm_area_struct *vma; 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci /* 1548c2ecf20Sopenharmony_ci * We don't try too hard, we just mark all the vma in that range 1558c2ecf20Sopenharmony_ci * VM_NOHUGEPAGE and split them. 1568c2ecf20Sopenharmony_ci */ 1578c2ecf20Sopenharmony_ci vma = find_vma(mm, addr); 1588c2ecf20Sopenharmony_ci /* 1598c2ecf20Sopenharmony_ci * If the range is in unmapped range, just return 1608c2ecf20Sopenharmony_ci */ 1618c2ecf20Sopenharmony_ci if (vma && ((addr + len) <= vma->vm_start)) 1628c2ecf20Sopenharmony_ci return; 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_ci while (vma) { 1658c2ecf20Sopenharmony_ci if (vma->vm_start >= (addr + len)) 1668c2ecf20Sopenharmony_ci break; 1678c2ecf20Sopenharmony_ci vma->vm_flags |= VM_NOHUGEPAGE; 1688c2ecf20Sopenharmony_ci walk_page_vma(vma, &subpage_walk_ops, NULL); 1698c2ecf20Sopenharmony_ci vma = vma->vm_next; 1708c2ecf20Sopenharmony_ci } 1718c2ecf20Sopenharmony_ci} 1728c2ecf20Sopenharmony_ci#else 1738c2ecf20Sopenharmony_cistatic void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, 1748c2ecf20Sopenharmony_ci unsigned long len) 1758c2ecf20Sopenharmony_ci{ 1768c2ecf20Sopenharmony_ci return; 1778c2ecf20Sopenharmony_ci} 1788c2ecf20Sopenharmony_ci#endif 1798c2ecf20Sopenharmony_ci 1808c2ecf20Sopenharmony_ci/* 1818c2ecf20Sopenharmony_ci * Copy in a subpage protection map for an address range. 1828c2ecf20Sopenharmony_ci * The map has 2 bits per 4k subpage, so 32 bits per 64k page. 1838c2ecf20Sopenharmony_ci * Each 2-bit field is 0 to allow any access, 1 to prevent writes, 1848c2ecf20Sopenharmony_ci * 2 or 3 to prevent all accesses. 1858c2ecf20Sopenharmony_ci * Note that the normal page protections also apply; the subpage 1868c2ecf20Sopenharmony_ci * protection mechanism is an additional constraint, so putting 0 1878c2ecf20Sopenharmony_ci * in a 2-bit field won't allow writes to a page that is otherwise 1888c2ecf20Sopenharmony_ci * write-protected. 1898c2ecf20Sopenharmony_ci */ 1908c2ecf20Sopenharmony_ciSYSCALL_DEFINE3(subpage_prot, unsigned long, addr, 1918c2ecf20Sopenharmony_ci unsigned long, len, u32 __user *, map) 1928c2ecf20Sopenharmony_ci{ 1938c2ecf20Sopenharmony_ci struct mm_struct *mm = current->mm; 1948c2ecf20Sopenharmony_ci struct subpage_prot_table *spt; 1958c2ecf20Sopenharmony_ci u32 **spm, *spp; 1968c2ecf20Sopenharmony_ci unsigned long i; 1978c2ecf20Sopenharmony_ci size_t nw; 1988c2ecf20Sopenharmony_ci unsigned long next, limit; 1998c2ecf20Sopenharmony_ci int err; 2008c2ecf20Sopenharmony_ci 2018c2ecf20Sopenharmony_ci if (radix_enabled()) 2028c2ecf20Sopenharmony_ci return -ENOENT; 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci /* Check parameters */ 2058c2ecf20Sopenharmony_ci if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || 2068c2ecf20Sopenharmony_ci addr >= mm->task_size || len >= mm->task_size || 2078c2ecf20Sopenharmony_ci addr + len > mm->task_size) 2088c2ecf20Sopenharmony_ci return -EINVAL; 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci if (is_hugepage_only_range(mm, addr, len)) 2118c2ecf20Sopenharmony_ci return -EINVAL; 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci if (!map) { 2148c2ecf20Sopenharmony_ci /* Clear out the protection map for the address range */ 2158c2ecf20Sopenharmony_ci subpage_prot_clear(addr, len); 2168c2ecf20Sopenharmony_ci return 0; 2178c2ecf20Sopenharmony_ci } 2188c2ecf20Sopenharmony_ci 2198c2ecf20Sopenharmony_ci if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) 2208c2ecf20Sopenharmony_ci return -EFAULT; 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci mmap_write_lock(mm); 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_ci spt = mm_ctx_subpage_prot(&mm->context); 2258c2ecf20Sopenharmony_ci if (!spt) { 2268c2ecf20Sopenharmony_ci /* 2278c2ecf20Sopenharmony_ci * Allocate subpage prot table if not already done. 2288c2ecf20Sopenharmony_ci * Do this with mmap_lock held 2298c2ecf20Sopenharmony_ci */ 2308c2ecf20Sopenharmony_ci spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); 2318c2ecf20Sopenharmony_ci if (!spt) { 2328c2ecf20Sopenharmony_ci err = -ENOMEM; 2338c2ecf20Sopenharmony_ci goto out; 2348c2ecf20Sopenharmony_ci } 2358c2ecf20Sopenharmony_ci mm->context.hash_context->spt = spt; 2368c2ecf20Sopenharmony_ci } 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_ci subpage_mark_vma_nohuge(mm, addr, len); 2398c2ecf20Sopenharmony_ci for (limit = addr + len; addr < limit; addr = next) { 2408c2ecf20Sopenharmony_ci next = pmd_addr_end(addr, limit); 2418c2ecf20Sopenharmony_ci err = -ENOMEM; 2428c2ecf20Sopenharmony_ci if (addr < 0x100000000UL) { 2438c2ecf20Sopenharmony_ci spm = spt->low_prot; 2448c2ecf20Sopenharmony_ci } else { 2458c2ecf20Sopenharmony_ci spm = spt->protptrs[addr >> SBP_L3_SHIFT]; 2468c2ecf20Sopenharmony_ci if (!spm) { 2478c2ecf20Sopenharmony_ci spm = (u32 **)get_zeroed_page(GFP_KERNEL); 2488c2ecf20Sopenharmony_ci if (!spm) 2498c2ecf20Sopenharmony_ci goto out; 2508c2ecf20Sopenharmony_ci spt->protptrs[addr >> SBP_L3_SHIFT] = spm; 2518c2ecf20Sopenharmony_ci } 2528c2ecf20Sopenharmony_ci } 2538c2ecf20Sopenharmony_ci spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1); 2548c2ecf20Sopenharmony_ci spp = *spm; 2558c2ecf20Sopenharmony_ci if (!spp) { 2568c2ecf20Sopenharmony_ci spp = (u32 *)get_zeroed_page(GFP_KERNEL); 2578c2ecf20Sopenharmony_ci if (!spp) 2588c2ecf20Sopenharmony_ci goto out; 2598c2ecf20Sopenharmony_ci *spm = spp; 2608c2ecf20Sopenharmony_ci } 2618c2ecf20Sopenharmony_ci spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); 2628c2ecf20Sopenharmony_ci 2638c2ecf20Sopenharmony_ci local_irq_disable(); 2648c2ecf20Sopenharmony_ci demote_segment_4k(mm, addr); 2658c2ecf20Sopenharmony_ci local_irq_enable(); 2668c2ecf20Sopenharmony_ci 2678c2ecf20Sopenharmony_ci i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 2688c2ecf20Sopenharmony_ci nw = PTRS_PER_PTE - i; 2698c2ecf20Sopenharmony_ci if (addr + (nw << PAGE_SHIFT) > next) 2708c2ecf20Sopenharmony_ci nw = (next - addr) >> PAGE_SHIFT; 2718c2ecf20Sopenharmony_ci 2728c2ecf20Sopenharmony_ci mmap_write_unlock(mm); 2738c2ecf20Sopenharmony_ci if (__copy_from_user(spp, map, nw * sizeof(u32))) 2748c2ecf20Sopenharmony_ci return -EFAULT; 2758c2ecf20Sopenharmony_ci map += nw; 2768c2ecf20Sopenharmony_ci mmap_write_lock(mm); 2778c2ecf20Sopenharmony_ci 2788c2ecf20Sopenharmony_ci /* now flush any existing HPTEs for the range */ 2798c2ecf20Sopenharmony_ci hpte_flush_range(mm, addr, nw); 2808c2ecf20Sopenharmony_ci } 2818c2ecf20Sopenharmony_ci if (limit > spt->maxaddr) 2828c2ecf20Sopenharmony_ci spt->maxaddr = limit; 2838c2ecf20Sopenharmony_ci err = 0; 2848c2ecf20Sopenharmony_ci out: 2858c2ecf20Sopenharmony_ci mmap_write_unlock(mm); 2868c2ecf20Sopenharmony_ci return err; 2878c2ecf20Sopenharmony_ci} 288