162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright 2007-2008 Paul Mackerras, IBM Corp.
462306a36Sopenharmony_ci */
562306a36Sopenharmony_ci
662306a36Sopenharmony_ci#include <linux/errno.h>
762306a36Sopenharmony_ci#include <linux/kernel.h>
862306a36Sopenharmony_ci#include <linux/gfp.h>
962306a36Sopenharmony_ci#include <linux/types.h>
1062306a36Sopenharmony_ci#include <linux/pagewalk.h>
1162306a36Sopenharmony_ci#include <linux/hugetlb.h>
1262306a36Sopenharmony_ci#include <linux/syscalls.h>
1362306a36Sopenharmony_ci
1462306a36Sopenharmony_ci#include <linux/pgtable.h>
1562306a36Sopenharmony_ci#include <linux/uaccess.h>
1662306a36Sopenharmony_ci
1762306a36Sopenharmony_ci/*
1862306a36Sopenharmony_ci * Free all pages allocated for subpage protection maps and pointers.
1962306a36Sopenharmony_ci * Also makes sure that the subpage_prot_table structure is
2062306a36Sopenharmony_ci * reinitialized for the next user.
2162306a36Sopenharmony_ci */
2262306a36Sopenharmony_civoid subpage_prot_free(struct mm_struct *mm)
2362306a36Sopenharmony_ci{
2462306a36Sopenharmony_ci	struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
2562306a36Sopenharmony_ci	unsigned long i, j, addr;
2662306a36Sopenharmony_ci	u32 **p;
2762306a36Sopenharmony_ci
2862306a36Sopenharmony_ci	if (!spt)
2962306a36Sopenharmony_ci		return;
3062306a36Sopenharmony_ci
3162306a36Sopenharmony_ci	for (i = 0; i < 4; ++i) {
3262306a36Sopenharmony_ci		if (spt->low_prot[i]) {
3362306a36Sopenharmony_ci			free_page((unsigned long)spt->low_prot[i]);
3462306a36Sopenharmony_ci			spt->low_prot[i] = NULL;
3562306a36Sopenharmony_ci		}
3662306a36Sopenharmony_ci	}
3762306a36Sopenharmony_ci	addr = 0;
3862306a36Sopenharmony_ci	for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
3962306a36Sopenharmony_ci		p = spt->protptrs[i];
4062306a36Sopenharmony_ci		if (!p)
4162306a36Sopenharmony_ci			continue;
4262306a36Sopenharmony_ci		spt->protptrs[i] = NULL;
4362306a36Sopenharmony_ci		for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
4462306a36Sopenharmony_ci		     ++j, addr += PAGE_SIZE)
4562306a36Sopenharmony_ci			if (p[j])
4662306a36Sopenharmony_ci				free_page((unsigned long)p[j]);
4762306a36Sopenharmony_ci		free_page((unsigned long)p);
4862306a36Sopenharmony_ci	}
4962306a36Sopenharmony_ci	spt->maxaddr = 0;
5062306a36Sopenharmony_ci	kfree(spt);
5162306a36Sopenharmony_ci}
5262306a36Sopenharmony_ci
5362306a36Sopenharmony_cistatic void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
5462306a36Sopenharmony_ci			     int npages)
5562306a36Sopenharmony_ci{
5662306a36Sopenharmony_ci	pgd_t *pgd;
5762306a36Sopenharmony_ci	p4d_t *p4d;
5862306a36Sopenharmony_ci	pud_t *pud;
5962306a36Sopenharmony_ci	pmd_t *pmd;
6062306a36Sopenharmony_ci	pte_t *pte;
6162306a36Sopenharmony_ci	spinlock_t *ptl;
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci	pgd = pgd_offset(mm, addr);
6462306a36Sopenharmony_ci	p4d = p4d_offset(pgd, addr);
6562306a36Sopenharmony_ci	if (p4d_none(*p4d))
6662306a36Sopenharmony_ci		return;
6762306a36Sopenharmony_ci	pud = pud_offset(p4d, addr);
6862306a36Sopenharmony_ci	if (pud_none(*pud))
6962306a36Sopenharmony_ci		return;
7062306a36Sopenharmony_ci	pmd = pmd_offset(pud, addr);
7162306a36Sopenharmony_ci	if (pmd_none(*pmd))
7262306a36Sopenharmony_ci		return;
7362306a36Sopenharmony_ci	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
7462306a36Sopenharmony_ci	if (!pte)
7562306a36Sopenharmony_ci		return;
7662306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
7762306a36Sopenharmony_ci	for (; npages > 0; --npages) {
7862306a36Sopenharmony_ci		pte_update(mm, addr, pte, 0, 0, 0);
7962306a36Sopenharmony_ci		addr += PAGE_SIZE;
8062306a36Sopenharmony_ci		++pte;
8162306a36Sopenharmony_ci	}
8262306a36Sopenharmony_ci	arch_leave_lazy_mmu_mode();
8362306a36Sopenharmony_ci	pte_unmap_unlock(pte - 1, ptl);
8462306a36Sopenharmony_ci}
8562306a36Sopenharmony_ci
8662306a36Sopenharmony_ci/*
8762306a36Sopenharmony_ci * Clear the subpage protection map for an address range, allowing
8862306a36Sopenharmony_ci * all accesses that are allowed by the pte permissions.
8962306a36Sopenharmony_ci */
9062306a36Sopenharmony_cistatic void subpage_prot_clear(unsigned long addr, unsigned long len)
9162306a36Sopenharmony_ci{
9262306a36Sopenharmony_ci	struct mm_struct *mm = current->mm;
9362306a36Sopenharmony_ci	struct subpage_prot_table *spt;
9462306a36Sopenharmony_ci	u32 **spm, *spp;
9562306a36Sopenharmony_ci	unsigned long i;
9662306a36Sopenharmony_ci	size_t nw;
9762306a36Sopenharmony_ci	unsigned long next, limit;
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci	mmap_write_lock(mm);
10062306a36Sopenharmony_ci
10162306a36Sopenharmony_ci	spt = mm_ctx_subpage_prot(&mm->context);
10262306a36Sopenharmony_ci	if (!spt)
10362306a36Sopenharmony_ci		goto err_out;
10462306a36Sopenharmony_ci
10562306a36Sopenharmony_ci	limit = addr + len;
10662306a36Sopenharmony_ci	if (limit > spt->maxaddr)
10762306a36Sopenharmony_ci		limit = spt->maxaddr;
10862306a36Sopenharmony_ci	for (; addr < limit; addr = next) {
10962306a36Sopenharmony_ci		next = pmd_addr_end(addr, limit);
11062306a36Sopenharmony_ci		if (addr < 0x100000000UL) {
11162306a36Sopenharmony_ci			spm = spt->low_prot;
11262306a36Sopenharmony_ci		} else {
11362306a36Sopenharmony_ci			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
11462306a36Sopenharmony_ci			if (!spm)
11562306a36Sopenharmony_ci				continue;
11662306a36Sopenharmony_ci		}
11762306a36Sopenharmony_ci		spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
11862306a36Sopenharmony_ci		if (!spp)
11962306a36Sopenharmony_ci			continue;
12062306a36Sopenharmony_ci		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
12162306a36Sopenharmony_ci
12262306a36Sopenharmony_ci		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
12362306a36Sopenharmony_ci		nw = PTRS_PER_PTE - i;
12462306a36Sopenharmony_ci		if (addr + (nw << PAGE_SHIFT) > next)
12562306a36Sopenharmony_ci			nw = (next - addr) >> PAGE_SHIFT;
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci		memset(spp, 0, nw * sizeof(u32));
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci		/* now flush any existing HPTEs for the range */
13062306a36Sopenharmony_ci		hpte_flush_range(mm, addr, nw);
13162306a36Sopenharmony_ci	}
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_cierr_out:
13462306a36Sopenharmony_ci	mmap_write_unlock(mm);
13562306a36Sopenharmony_ci}
13662306a36Sopenharmony_ci
13762306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
13862306a36Sopenharmony_cistatic int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
13962306a36Sopenharmony_ci				  unsigned long end, struct mm_walk *walk)
14062306a36Sopenharmony_ci{
14162306a36Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
14262306a36Sopenharmony_ci	split_huge_pmd(vma, pmd, addr);
14362306a36Sopenharmony_ci	return 0;
14462306a36Sopenharmony_ci}
14562306a36Sopenharmony_ci
/*
 * Page-walk operations used by subpage_mark_vma_nohuge(): the only hook
 * installed is the pmd-level one, which splits huge pmds.
 * NOTE(review): PGWALK_WRLOCK_VERIFY presumably makes the walker check that
 * the vma is already write-locked - confirm against <linux/pagewalk.h>.
 */
static const struct mm_walk_ops subpage_walk_ops = {
	.pmd_entry	= subpage_walk_pmd_entry,
	.walk_lock	= PGWALK_WRLOCK_VERIFY,
};
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_cistatic void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
15262306a36Sopenharmony_ci				    unsigned long len)
15362306a36Sopenharmony_ci{
15462306a36Sopenharmony_ci	struct vm_area_struct *vma;
15562306a36Sopenharmony_ci	VMA_ITERATOR(vmi, mm, addr);
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_ci	/*
15862306a36Sopenharmony_ci	 * We don't try too hard, we just mark all the vma in that range
15962306a36Sopenharmony_ci	 * VM_NOHUGEPAGE and split them.
16062306a36Sopenharmony_ci	 */
16162306a36Sopenharmony_ci	for_each_vma_range(vmi, vma, addr + len) {
16262306a36Sopenharmony_ci		vm_flags_set(vma, VM_NOHUGEPAGE);
16362306a36Sopenharmony_ci		walk_page_vma(vma, &subpage_walk_ops, NULL);
16462306a36Sopenharmony_ci	}
16562306a36Sopenharmony_ci}
16662306a36Sopenharmony_ci#else
/* Without CONFIG_TRANSPARENT_HUGEPAGE there is nothing to mark or split. */
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
				    unsigned long len)
{
}
17262306a36Sopenharmony_ci#endif
17362306a36Sopenharmony_ci
/*
 * Copy in a subpage protection map for an address range.
 * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
 * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
 * 2 or 3 to prevent all accesses.
 * Note that the normal page protections also apply; the subpage
 * protection mechanism is an additional constraint, so putting 0
 * in a 2-bit field won't allow writes to a page that is otherwise
 * write-protected.
 *
 * @addr: start of the range; must be page aligned
 * @len:  length of the range in bytes; must be a multiple of the page size
 * @map:  user pointer to one u32 per page of the range, or NULL to clear
 *        the protection map for the range instead
 *
 * Returns 0 on success, or:
 *   -ENOENT  if radix_enabled() is true
 *   -EINVAL  if @addr/@len are misaligned or fall outside mm->task_size,
 *            or the range is rejected by is_hugepage_only_range()
 *   -EFAULT  if @map fails access_ok() or a copy from it faults
 *   -ENOMEM  if the table or one of its pages cannot be allocated
 *
 * On -ENOMEM/-EFAULT the chunks handled before the failure stay written
 * to the table, and spt->maxaddr is not raised.
 */
SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
		unsigned long, len, u32 __user *, map)
{
	struct mm_struct *mm = current->mm;
	struct subpage_prot_table *spt;
	u32 **spm, *spp;
	unsigned long i;
	size_t nw;
	unsigned long next, limit;
	int err;

	if (radix_enabled())
		return -ENOENT;

	/* Check parameters */
	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
	    addr >= mm->task_size || len >= mm->task_size ||
	    addr + len > mm->task_size)
		return -EINVAL;

	if (is_hugepage_only_range(mm, addr, len))
		return -EINVAL;

	if (!map) {
		/* Clear out the protection map for the address range */
		subpage_prot_clear(addr, len);
		return 0;
	}

	/* One u32 of map per page in the range. */
	if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
		return -EFAULT;

	mmap_write_lock(mm);

	spt = mm_ctx_subpage_prot(&mm->context);
	if (!spt) {
		/*
		 * Allocate subpage prot table if not already done.
		 * Do this with mmap_lock held
		 */
		spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
		if (!spt) {
			err = -ENOMEM;
			goto out;
		}
		mm->context.hash_context->spt = spt;
	}

	subpage_mark_vma_nohuge(mm, addr, len);
	/* Process the range one pmd-sized chunk at a time. */
	for (limit = addr + len; addr < limit; addr = next) {
		next = pmd_addr_end(addr, limit);
		/* Preset so the allocation-failure gotos below report -ENOMEM. */
		err = -ENOMEM;
		if (addr < 0x100000000UL) {
			/* Below 4GB the pointer array is embedded in spt. */
			spm = spt->low_prot;
		} else {
			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
			if (!spm) {
				spm = (u32 **)get_zeroed_page(GFP_KERNEL);
				if (!spm)
					goto out;
				spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
			}
		}
		spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
		spp = *spm;
		if (!spp) {
			spp = (u32 *)get_zeroed_page(GFP_KERNEL);
			if (!spp)
				goto out;
			*spm = spp;
		}
		/* Point at the word for the page at addr. */
		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);

		local_irq_disable();
		demote_segment_4k(mm, addr);
		local_irq_enable();

		/*
		 * Number of words (pages) to handle this round: up to the
		 * end of the current PTE page, but no further than next.
		 */
		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
		nw = PTRS_PER_PTE - i;
		if (addr + (nw << PAGE_SHIFT) > next)
			nw = (next - addr) >> PAGE_SHIFT;

		/*
		 * mmap_lock is dropped around the user copy and retaken
		 * afterwards.  A failed copy returns directly because the
		 * lock is already released at that point.
		 */
		mmap_write_unlock(mm);
		if (__copy_from_user(spp, map, nw * sizeof(u32)))
			return -EFAULT;
		map += nw;
		mmap_write_lock(mm);

		/* now flush any existing HPTEs for the range */
		hpte_flush_range(mm, addr, nw);
	}
	/* Only a fully processed range extends the covered address limit. */
	if (limit > spt->maxaddr)
		spt->maxaddr = limit;
	err = 0;
 out:
	mmap_write_unlock(mm);
	return err;
}
282