// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2007-2008 Paul Mackerras, IBM Corp.
 */
58c2ecf20Sopenharmony_ci
68c2ecf20Sopenharmony_ci#include <linux/errno.h>
78c2ecf20Sopenharmony_ci#include <linux/kernel.h>
88c2ecf20Sopenharmony_ci#include <linux/gfp.h>
98c2ecf20Sopenharmony_ci#include <linux/types.h>
108c2ecf20Sopenharmony_ci#include <linux/pagewalk.h>
118c2ecf20Sopenharmony_ci#include <linux/hugetlb.h>
128c2ecf20Sopenharmony_ci#include <linux/syscalls.h>
138c2ecf20Sopenharmony_ci
148c2ecf20Sopenharmony_ci#include <linux/pgtable.h>
158c2ecf20Sopenharmony_ci#include <linux/uaccess.h>
168c2ecf20Sopenharmony_ci
178c2ecf20Sopenharmony_ci/*
188c2ecf20Sopenharmony_ci * Free all pages allocated for subpage protection maps and pointers.
198c2ecf20Sopenharmony_ci * Also makes sure that the subpage_prot_table structure is
208c2ecf20Sopenharmony_ci * reinitialized for the next user.
218c2ecf20Sopenharmony_ci */
228c2ecf20Sopenharmony_civoid subpage_prot_free(struct mm_struct *mm)
238c2ecf20Sopenharmony_ci{
248c2ecf20Sopenharmony_ci	struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
258c2ecf20Sopenharmony_ci	unsigned long i, j, addr;
268c2ecf20Sopenharmony_ci	u32 **p;
278c2ecf20Sopenharmony_ci
288c2ecf20Sopenharmony_ci	if (!spt)
298c2ecf20Sopenharmony_ci		return;
308c2ecf20Sopenharmony_ci
318c2ecf20Sopenharmony_ci	for (i = 0; i < 4; ++i) {
328c2ecf20Sopenharmony_ci		if (spt->low_prot[i]) {
338c2ecf20Sopenharmony_ci			free_page((unsigned long)spt->low_prot[i]);
348c2ecf20Sopenharmony_ci			spt->low_prot[i] = NULL;
358c2ecf20Sopenharmony_ci		}
368c2ecf20Sopenharmony_ci	}
378c2ecf20Sopenharmony_ci	addr = 0;
388c2ecf20Sopenharmony_ci	for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
398c2ecf20Sopenharmony_ci		p = spt->protptrs[i];
408c2ecf20Sopenharmony_ci		if (!p)
418c2ecf20Sopenharmony_ci			continue;
428c2ecf20Sopenharmony_ci		spt->protptrs[i] = NULL;
438c2ecf20Sopenharmony_ci		for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
448c2ecf20Sopenharmony_ci		     ++j, addr += PAGE_SIZE)
458c2ecf20Sopenharmony_ci			if (p[j])
468c2ecf20Sopenharmony_ci				free_page((unsigned long)p[j]);
478c2ecf20Sopenharmony_ci		free_page((unsigned long)p);
488c2ecf20Sopenharmony_ci	}
498c2ecf20Sopenharmony_ci	spt->maxaddr = 0;
508c2ecf20Sopenharmony_ci	kfree(spt);
518c2ecf20Sopenharmony_ci}
528c2ecf20Sopenharmony_ci
538c2ecf20Sopenharmony_cistatic void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
548c2ecf20Sopenharmony_ci			     int npages)
558c2ecf20Sopenharmony_ci{
568c2ecf20Sopenharmony_ci	pgd_t *pgd;
578c2ecf20Sopenharmony_ci	p4d_t *p4d;
588c2ecf20Sopenharmony_ci	pud_t *pud;
598c2ecf20Sopenharmony_ci	pmd_t *pmd;
608c2ecf20Sopenharmony_ci	pte_t *pte;
618c2ecf20Sopenharmony_ci	spinlock_t *ptl;
628c2ecf20Sopenharmony_ci
638c2ecf20Sopenharmony_ci	pgd = pgd_offset(mm, addr);
648c2ecf20Sopenharmony_ci	p4d = p4d_offset(pgd, addr);
658c2ecf20Sopenharmony_ci	if (p4d_none(*p4d))
668c2ecf20Sopenharmony_ci		return;
678c2ecf20Sopenharmony_ci	pud = pud_offset(p4d, addr);
688c2ecf20Sopenharmony_ci	if (pud_none(*pud))
698c2ecf20Sopenharmony_ci		return;
708c2ecf20Sopenharmony_ci	pmd = pmd_offset(pud, addr);
718c2ecf20Sopenharmony_ci	if (pmd_none(*pmd))
728c2ecf20Sopenharmony_ci		return;
738c2ecf20Sopenharmony_ci	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
748c2ecf20Sopenharmony_ci	arch_enter_lazy_mmu_mode();
758c2ecf20Sopenharmony_ci	for (; npages > 0; --npages) {
768c2ecf20Sopenharmony_ci		pte_update(mm, addr, pte, 0, 0, 0);
778c2ecf20Sopenharmony_ci		addr += PAGE_SIZE;
788c2ecf20Sopenharmony_ci		++pte;
798c2ecf20Sopenharmony_ci	}
808c2ecf20Sopenharmony_ci	arch_leave_lazy_mmu_mode();
818c2ecf20Sopenharmony_ci	pte_unmap_unlock(pte - 1, ptl);
828c2ecf20Sopenharmony_ci}
838c2ecf20Sopenharmony_ci
848c2ecf20Sopenharmony_ci/*
858c2ecf20Sopenharmony_ci * Clear the subpage protection map for an address range, allowing
868c2ecf20Sopenharmony_ci * all accesses that are allowed by the pte permissions.
878c2ecf20Sopenharmony_ci */
888c2ecf20Sopenharmony_cistatic void subpage_prot_clear(unsigned long addr, unsigned long len)
898c2ecf20Sopenharmony_ci{
908c2ecf20Sopenharmony_ci	struct mm_struct *mm = current->mm;
918c2ecf20Sopenharmony_ci	struct subpage_prot_table *spt;
928c2ecf20Sopenharmony_ci	u32 **spm, *spp;
938c2ecf20Sopenharmony_ci	unsigned long i;
948c2ecf20Sopenharmony_ci	size_t nw;
958c2ecf20Sopenharmony_ci	unsigned long next, limit;
968c2ecf20Sopenharmony_ci
978c2ecf20Sopenharmony_ci	mmap_write_lock(mm);
988c2ecf20Sopenharmony_ci
998c2ecf20Sopenharmony_ci	spt = mm_ctx_subpage_prot(&mm->context);
1008c2ecf20Sopenharmony_ci	if (!spt)
1018c2ecf20Sopenharmony_ci		goto err_out;
1028c2ecf20Sopenharmony_ci
1038c2ecf20Sopenharmony_ci	limit = addr + len;
1048c2ecf20Sopenharmony_ci	if (limit > spt->maxaddr)
1058c2ecf20Sopenharmony_ci		limit = spt->maxaddr;
1068c2ecf20Sopenharmony_ci	for (; addr < limit; addr = next) {
1078c2ecf20Sopenharmony_ci		next = pmd_addr_end(addr, limit);
1088c2ecf20Sopenharmony_ci		if (addr < 0x100000000UL) {
1098c2ecf20Sopenharmony_ci			spm = spt->low_prot;
1108c2ecf20Sopenharmony_ci		} else {
1118c2ecf20Sopenharmony_ci			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
1128c2ecf20Sopenharmony_ci			if (!spm)
1138c2ecf20Sopenharmony_ci				continue;
1148c2ecf20Sopenharmony_ci		}
1158c2ecf20Sopenharmony_ci		spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
1168c2ecf20Sopenharmony_ci		if (!spp)
1178c2ecf20Sopenharmony_ci			continue;
1188c2ecf20Sopenharmony_ci		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
1198c2ecf20Sopenharmony_ci
1208c2ecf20Sopenharmony_ci		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
1218c2ecf20Sopenharmony_ci		nw = PTRS_PER_PTE - i;
1228c2ecf20Sopenharmony_ci		if (addr + (nw << PAGE_SHIFT) > next)
1238c2ecf20Sopenharmony_ci			nw = (next - addr) >> PAGE_SHIFT;
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_ci		memset(spp, 0, nw * sizeof(u32));
1268c2ecf20Sopenharmony_ci
1278c2ecf20Sopenharmony_ci		/* now flush any existing HPTEs for the range */
1288c2ecf20Sopenharmony_ci		hpte_flush_range(mm, addr, nw);
1298c2ecf20Sopenharmony_ci	}
1308c2ecf20Sopenharmony_ci
1318c2ecf20Sopenharmony_cierr_out:
1328c2ecf20Sopenharmony_ci	mmap_write_unlock(mm);
1338c2ecf20Sopenharmony_ci}
1348c2ecf20Sopenharmony_ci
1358c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1368c2ecf20Sopenharmony_cistatic int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
1378c2ecf20Sopenharmony_ci				  unsigned long end, struct mm_walk *walk)
1388c2ecf20Sopenharmony_ci{
1398c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
1408c2ecf20Sopenharmony_ci	split_huge_pmd(vma, pmd, addr);
1418c2ecf20Sopenharmony_ci	return 0;
1428c2ecf20Sopenharmony_ci}
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_cistatic const struct mm_walk_ops subpage_walk_ops = {
1458c2ecf20Sopenharmony_ci	.pmd_entry	= subpage_walk_pmd_entry,
1468c2ecf20Sopenharmony_ci};
1478c2ecf20Sopenharmony_ci
1488c2ecf20Sopenharmony_cistatic void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
1498c2ecf20Sopenharmony_ci				    unsigned long len)
1508c2ecf20Sopenharmony_ci{
1518c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
1528c2ecf20Sopenharmony_ci
1538c2ecf20Sopenharmony_ci	/*
1548c2ecf20Sopenharmony_ci	 * We don't try too hard, we just mark all the vma in that range
1558c2ecf20Sopenharmony_ci	 * VM_NOHUGEPAGE and split them.
1568c2ecf20Sopenharmony_ci	 */
1578c2ecf20Sopenharmony_ci	vma = find_vma(mm, addr);
1588c2ecf20Sopenharmony_ci	/*
1598c2ecf20Sopenharmony_ci	 * If the range is in unmapped range, just return
1608c2ecf20Sopenharmony_ci	 */
1618c2ecf20Sopenharmony_ci	if (vma && ((addr + len) <= vma->vm_start))
1628c2ecf20Sopenharmony_ci		return;
1638c2ecf20Sopenharmony_ci
1648c2ecf20Sopenharmony_ci	while (vma) {
1658c2ecf20Sopenharmony_ci		if (vma->vm_start >= (addr + len))
1668c2ecf20Sopenharmony_ci			break;
1678c2ecf20Sopenharmony_ci		vma->vm_flags |= VM_NOHUGEPAGE;
1688c2ecf20Sopenharmony_ci		walk_page_vma(vma, &subpage_walk_ops, NULL);
1698c2ecf20Sopenharmony_ci		vma = vma->vm_next;
1708c2ecf20Sopenharmony_ci	}
1718c2ecf20Sopenharmony_ci}
1728c2ecf20Sopenharmony_ci#else
/* Built without CONFIG_TRANSPARENT_HUGEPAGE: nothing to mark or split. */
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
				    unsigned long len)
{
}
1788c2ecf20Sopenharmony_ci#endif
1798c2ecf20Sopenharmony_ci
1808c2ecf20Sopenharmony_ci/*
1818c2ecf20Sopenharmony_ci * Copy in a subpage protection map for an address range.
1828c2ecf20Sopenharmony_ci * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
1838c2ecf20Sopenharmony_ci * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
1848c2ecf20Sopenharmony_ci * 2 or 3 to prevent all accesses.
1858c2ecf20Sopenharmony_ci * Note that the normal page protections also apply; the subpage
1868c2ecf20Sopenharmony_ci * protection mechanism is an additional constraint, so putting 0
1878c2ecf20Sopenharmony_ci * in a 2-bit field won't allow writes to a page that is otherwise
1888c2ecf20Sopenharmony_ci * write-protected.
1898c2ecf20Sopenharmony_ci */
1908c2ecf20Sopenharmony_ciSYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
1918c2ecf20Sopenharmony_ci		unsigned long, len, u32 __user *, map)
1928c2ecf20Sopenharmony_ci{
1938c2ecf20Sopenharmony_ci	struct mm_struct *mm = current->mm;
1948c2ecf20Sopenharmony_ci	struct subpage_prot_table *spt;
1958c2ecf20Sopenharmony_ci	u32 **spm, *spp;
1968c2ecf20Sopenharmony_ci	unsigned long i;
1978c2ecf20Sopenharmony_ci	size_t nw;
1988c2ecf20Sopenharmony_ci	unsigned long next, limit;
1998c2ecf20Sopenharmony_ci	int err;
2008c2ecf20Sopenharmony_ci
2018c2ecf20Sopenharmony_ci	if (radix_enabled())
2028c2ecf20Sopenharmony_ci		return -ENOENT;
2038c2ecf20Sopenharmony_ci
2048c2ecf20Sopenharmony_ci	/* Check parameters */
2058c2ecf20Sopenharmony_ci	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
2068c2ecf20Sopenharmony_ci	    addr >= mm->task_size || len >= mm->task_size ||
2078c2ecf20Sopenharmony_ci	    addr + len > mm->task_size)
2088c2ecf20Sopenharmony_ci		return -EINVAL;
2098c2ecf20Sopenharmony_ci
2108c2ecf20Sopenharmony_ci	if (is_hugepage_only_range(mm, addr, len))
2118c2ecf20Sopenharmony_ci		return -EINVAL;
2128c2ecf20Sopenharmony_ci
2138c2ecf20Sopenharmony_ci	if (!map) {
2148c2ecf20Sopenharmony_ci		/* Clear out the protection map for the address range */
2158c2ecf20Sopenharmony_ci		subpage_prot_clear(addr, len);
2168c2ecf20Sopenharmony_ci		return 0;
2178c2ecf20Sopenharmony_ci	}
2188c2ecf20Sopenharmony_ci
2198c2ecf20Sopenharmony_ci	if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
2208c2ecf20Sopenharmony_ci		return -EFAULT;
2218c2ecf20Sopenharmony_ci
2228c2ecf20Sopenharmony_ci	mmap_write_lock(mm);
2238c2ecf20Sopenharmony_ci
2248c2ecf20Sopenharmony_ci	spt = mm_ctx_subpage_prot(&mm->context);
2258c2ecf20Sopenharmony_ci	if (!spt) {
2268c2ecf20Sopenharmony_ci		/*
2278c2ecf20Sopenharmony_ci		 * Allocate subpage prot table if not already done.
2288c2ecf20Sopenharmony_ci		 * Do this with mmap_lock held
2298c2ecf20Sopenharmony_ci		 */
2308c2ecf20Sopenharmony_ci		spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
2318c2ecf20Sopenharmony_ci		if (!spt) {
2328c2ecf20Sopenharmony_ci			err = -ENOMEM;
2338c2ecf20Sopenharmony_ci			goto out;
2348c2ecf20Sopenharmony_ci		}
2358c2ecf20Sopenharmony_ci		mm->context.hash_context->spt = spt;
2368c2ecf20Sopenharmony_ci	}
2378c2ecf20Sopenharmony_ci
2388c2ecf20Sopenharmony_ci	subpage_mark_vma_nohuge(mm, addr, len);
2398c2ecf20Sopenharmony_ci	for (limit = addr + len; addr < limit; addr = next) {
2408c2ecf20Sopenharmony_ci		next = pmd_addr_end(addr, limit);
2418c2ecf20Sopenharmony_ci		err = -ENOMEM;
2428c2ecf20Sopenharmony_ci		if (addr < 0x100000000UL) {
2438c2ecf20Sopenharmony_ci			spm = spt->low_prot;
2448c2ecf20Sopenharmony_ci		} else {
2458c2ecf20Sopenharmony_ci			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
2468c2ecf20Sopenharmony_ci			if (!spm) {
2478c2ecf20Sopenharmony_ci				spm = (u32 **)get_zeroed_page(GFP_KERNEL);
2488c2ecf20Sopenharmony_ci				if (!spm)
2498c2ecf20Sopenharmony_ci					goto out;
2508c2ecf20Sopenharmony_ci				spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
2518c2ecf20Sopenharmony_ci			}
2528c2ecf20Sopenharmony_ci		}
2538c2ecf20Sopenharmony_ci		spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
2548c2ecf20Sopenharmony_ci		spp = *spm;
2558c2ecf20Sopenharmony_ci		if (!spp) {
2568c2ecf20Sopenharmony_ci			spp = (u32 *)get_zeroed_page(GFP_KERNEL);
2578c2ecf20Sopenharmony_ci			if (!spp)
2588c2ecf20Sopenharmony_ci				goto out;
2598c2ecf20Sopenharmony_ci			*spm = spp;
2608c2ecf20Sopenharmony_ci		}
2618c2ecf20Sopenharmony_ci		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
2628c2ecf20Sopenharmony_ci
2638c2ecf20Sopenharmony_ci		local_irq_disable();
2648c2ecf20Sopenharmony_ci		demote_segment_4k(mm, addr);
2658c2ecf20Sopenharmony_ci		local_irq_enable();
2668c2ecf20Sopenharmony_ci
2678c2ecf20Sopenharmony_ci		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
2688c2ecf20Sopenharmony_ci		nw = PTRS_PER_PTE - i;
2698c2ecf20Sopenharmony_ci		if (addr + (nw << PAGE_SHIFT) > next)
2708c2ecf20Sopenharmony_ci			nw = (next - addr) >> PAGE_SHIFT;
2718c2ecf20Sopenharmony_ci
2728c2ecf20Sopenharmony_ci		mmap_write_unlock(mm);
2738c2ecf20Sopenharmony_ci		if (__copy_from_user(spp, map, nw * sizeof(u32)))
2748c2ecf20Sopenharmony_ci			return -EFAULT;
2758c2ecf20Sopenharmony_ci		map += nw;
2768c2ecf20Sopenharmony_ci		mmap_write_lock(mm);
2778c2ecf20Sopenharmony_ci
2788c2ecf20Sopenharmony_ci		/* now flush any existing HPTEs for the range */
2798c2ecf20Sopenharmony_ci		hpte_flush_range(mm, addr, nw);
2808c2ecf20Sopenharmony_ci	}
2818c2ecf20Sopenharmony_ci	if (limit > spt->maxaddr)
2828c2ecf20Sopenharmony_ci		spt->maxaddr = limit;
2838c2ecf20Sopenharmony_ci	err = 0;
2848c2ecf20Sopenharmony_ci out:
2858c2ecf20Sopenharmony_ci	mmap_write_unlock(mm);
2868c2ecf20Sopenharmony_ci	return err;
2878c2ecf20Sopenharmony_ci}
288