162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci *  Page table allocation functions
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci *    Copyright IBM Corp. 2016
662306a36Sopenharmony_ci *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
762306a36Sopenharmony_ci */
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci#include <linux/sysctl.h>
1062306a36Sopenharmony_ci#include <linux/slab.h>
1162306a36Sopenharmony_ci#include <linux/mm.h>
1262306a36Sopenharmony_ci#include <asm/mmu_context.h>
1362306a36Sopenharmony_ci#include <asm/pgalloc.h>
1462306a36Sopenharmony_ci#include <asm/gmap.h>
1562306a36Sopenharmony_ci#include <asm/tlb.h>
1662306a36Sopenharmony_ci#include <asm/tlbflush.h>
1762306a36Sopenharmony_ci
1862306a36Sopenharmony_ci#ifdef CONFIG_PGSTE
1962306a36Sopenharmony_ci
/*
 * Value behind the "allocate_pgste" sysctl entry. As a file-scope object
 * it is implicitly zero-initialized, so no explicit initializer is needed.
 */
int page_table_allocate_pgste;
EXPORT_SYMBOL(page_table_allocate_pgste);
2262306a36Sopenharmony_ci
2362306a36Sopenharmony_cistatic struct ctl_table page_table_sysctl[] = {
2462306a36Sopenharmony_ci	{
2562306a36Sopenharmony_ci		.procname	= "allocate_pgste",
2662306a36Sopenharmony_ci		.data		= &page_table_allocate_pgste,
2762306a36Sopenharmony_ci		.maxlen		= sizeof(int),
2862306a36Sopenharmony_ci		.mode		= S_IRUGO | S_IWUSR,
2962306a36Sopenharmony_ci		.proc_handler	= proc_dointvec_minmax,
3062306a36Sopenharmony_ci		.extra1		= SYSCTL_ZERO,
3162306a36Sopenharmony_ci		.extra2		= SYSCTL_ONE,
3262306a36Sopenharmony_ci	},
3362306a36Sopenharmony_ci	{ }
3462306a36Sopenharmony_ci};
3562306a36Sopenharmony_ci
3662306a36Sopenharmony_cistatic int __init page_table_register_sysctl(void)
3762306a36Sopenharmony_ci{
3862306a36Sopenharmony_ci	return register_sysctl("vm", page_table_sysctl) ? 0 : -ENOMEM;
3962306a36Sopenharmony_ci}
4062306a36Sopenharmony_ci__initcall(page_table_register_sysctl);
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_ci#endif /* CONFIG_PGSTE */
4362306a36Sopenharmony_ci
4462306a36Sopenharmony_ciunsigned long *crst_table_alloc(struct mm_struct *mm)
4562306a36Sopenharmony_ci{
4662306a36Sopenharmony_ci	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
4762306a36Sopenharmony_ci
4862306a36Sopenharmony_ci	if (!ptdesc)
4962306a36Sopenharmony_ci		return NULL;
5062306a36Sopenharmony_ci	arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER);
5162306a36Sopenharmony_ci	return (unsigned long *) ptdesc_to_virt(ptdesc);
5262306a36Sopenharmony_ci}
5362306a36Sopenharmony_ci
/*
 * Free a region/segment (CRST) table obtained from crst_table_alloc().
 *
 * @mm is currently unused. A NULL @table is accepted and ignored: the
 * error path of crst_table_upgrade() may call this without having
 * allocated anything, and converting NULL to a ptdesc would end up
 * freeing an unrelated page.
 */
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	if (!table)
		return;
	pagetable_free(virt_to_ptdesc(table));
}
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_cistatic void __crst_table_upgrade(void *arg)
6062306a36Sopenharmony_ci{
6162306a36Sopenharmony_ci	struct mm_struct *mm = arg;
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci	/* change all active ASCEs to avoid the creation of new TLBs */
6462306a36Sopenharmony_ci	if (current->active_mm == mm) {
6562306a36Sopenharmony_ci		S390_lowcore.user_asce = mm->context.asce;
6662306a36Sopenharmony_ci		__ctl_load(S390_lowcore.user_asce, 7, 7);
6762306a36Sopenharmony_ci	}
6862306a36Sopenharmony_ci	__tlb_flush_local();
6962306a36Sopenharmony_ci}
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_ciint crst_table_upgrade(struct mm_struct *mm, unsigned long end)
7262306a36Sopenharmony_ci{
7362306a36Sopenharmony_ci	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
7462306a36Sopenharmony_ci	unsigned long asce_limit = mm->context.asce_limit;
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
7762306a36Sopenharmony_ci	VM_BUG_ON(asce_limit < _REGION2_SIZE);
7862306a36Sopenharmony_ci
7962306a36Sopenharmony_ci	if (end <= asce_limit)
8062306a36Sopenharmony_ci		return 0;
8162306a36Sopenharmony_ci
8262306a36Sopenharmony_ci	if (asce_limit == _REGION2_SIZE) {
8362306a36Sopenharmony_ci		p4d = crst_table_alloc(mm);
8462306a36Sopenharmony_ci		if (unlikely(!p4d))
8562306a36Sopenharmony_ci			goto err_p4d;
8662306a36Sopenharmony_ci		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
8762306a36Sopenharmony_ci	}
8862306a36Sopenharmony_ci	if (end > _REGION1_SIZE) {
8962306a36Sopenharmony_ci		pgd = crst_table_alloc(mm);
9062306a36Sopenharmony_ci		if (unlikely(!pgd))
9162306a36Sopenharmony_ci			goto err_pgd;
9262306a36Sopenharmony_ci		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
9362306a36Sopenharmony_ci	}
9462306a36Sopenharmony_ci
9562306a36Sopenharmony_ci	spin_lock_bh(&mm->page_table_lock);
9662306a36Sopenharmony_ci
9762306a36Sopenharmony_ci	/*
9862306a36Sopenharmony_ci	 * This routine gets called with mmap_lock lock held and there is
9962306a36Sopenharmony_ci	 * no reason to optimize for the case of otherwise. However, if
10062306a36Sopenharmony_ci	 * that would ever change, the below check will let us know.
10162306a36Sopenharmony_ci	 */
10262306a36Sopenharmony_ci	VM_BUG_ON(asce_limit != mm->context.asce_limit);
10362306a36Sopenharmony_ci
10462306a36Sopenharmony_ci	if (p4d) {
10562306a36Sopenharmony_ci		__pgd = (unsigned long *) mm->pgd;
10662306a36Sopenharmony_ci		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
10762306a36Sopenharmony_ci		mm->pgd = (pgd_t *) p4d;
10862306a36Sopenharmony_ci		mm->context.asce_limit = _REGION1_SIZE;
10962306a36Sopenharmony_ci		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
11062306a36Sopenharmony_ci			_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
11162306a36Sopenharmony_ci		mm_inc_nr_puds(mm);
11262306a36Sopenharmony_ci	}
11362306a36Sopenharmony_ci	if (pgd) {
11462306a36Sopenharmony_ci		__pgd = (unsigned long *) mm->pgd;
11562306a36Sopenharmony_ci		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
11662306a36Sopenharmony_ci		mm->pgd = (pgd_t *) pgd;
11762306a36Sopenharmony_ci		mm->context.asce_limit = TASK_SIZE_MAX;
11862306a36Sopenharmony_ci		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
11962306a36Sopenharmony_ci			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
12062306a36Sopenharmony_ci	}
12162306a36Sopenharmony_ci
12262306a36Sopenharmony_ci	spin_unlock_bh(&mm->page_table_lock);
12362306a36Sopenharmony_ci
12462306a36Sopenharmony_ci	on_each_cpu(__crst_table_upgrade, mm, 0);
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci	return 0;
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_cierr_pgd:
12962306a36Sopenharmony_ci	crst_table_free(mm, p4d);
13062306a36Sopenharmony_cierr_p4d:
13162306a36Sopenharmony_ci	return -ENOMEM;
13262306a36Sopenharmony_ci}
13362306a36Sopenharmony_ci
13462306a36Sopenharmony_cistatic inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
13562306a36Sopenharmony_ci{
13662306a36Sopenharmony_ci	return atomic_fetch_xor(bits, v) ^ bits;
13762306a36Sopenharmony_ci}
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci#ifdef CONFIG_PGSTE
14062306a36Sopenharmony_ci
14162306a36Sopenharmony_cistruct page *page_table_alloc_pgste(struct mm_struct *mm)
14262306a36Sopenharmony_ci{
14362306a36Sopenharmony_ci	struct ptdesc *ptdesc;
14462306a36Sopenharmony_ci	u64 *table;
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
14762306a36Sopenharmony_ci	if (ptdesc) {
14862306a36Sopenharmony_ci		table = (u64 *)ptdesc_to_virt(ptdesc);
14962306a36Sopenharmony_ci		arch_set_page_dat(virt_to_page(table), 0);
15062306a36Sopenharmony_ci		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
15162306a36Sopenharmony_ci		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
15262306a36Sopenharmony_ci	}
15362306a36Sopenharmony_ci	return ptdesc_page(ptdesc);
15462306a36Sopenharmony_ci}
15562306a36Sopenharmony_ci
/* Release a page table page that was handed out by page_table_alloc_pgste(). */
void page_table_free_pgste(struct page *page)
{
	struct ptdesc *ptdesc = page_ptdesc(page);

	pagetable_free(ptdesc);
}
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci#endif /* CONFIG_PGSTE */
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci/*
16462306a36Sopenharmony_ci * A 2KB-pgtable is either upper or lower half of a normal page.
16562306a36Sopenharmony_ci * The second half of the page may be unused or used as another
16662306a36Sopenharmony_ci * 2KB-pgtable.
16762306a36Sopenharmony_ci *
16862306a36Sopenharmony_ci * Whenever possible the parent page for a new 2KB-pgtable is picked
16962306a36Sopenharmony_ci * from the list of partially allocated pages mm_context_t::pgtable_list.
17062306a36Sopenharmony_ci * In case the list is empty a new parent page is allocated and added to
17162306a36Sopenharmony_ci * the list.
17262306a36Sopenharmony_ci *
17362306a36Sopenharmony_ci * When a parent page gets fully allocated it contains 2KB-pgtables in both
17462306a36Sopenharmony_ci * upper and lower halves and is removed from mm_context_t::pgtable_list.
17562306a36Sopenharmony_ci *
 * When a 2KB-pgtable is freed from a fully allocated parent page, that
 * page turns partially allocated and is added to mm_context_t::pgtable_list.
17862306a36Sopenharmony_ci *
17962306a36Sopenharmony_ci * If 2KB-pgtable is freed from the partially allocated parent page that
18062306a36Sopenharmony_ci * page turns unused and gets removed from mm_context_t::pgtable_list.
18162306a36Sopenharmony_ci * Furthermore, the unused parent page is released.
18262306a36Sopenharmony_ci *
18362306a36Sopenharmony_ci * As follows from the above, no unallocated or fully allocated parent
18462306a36Sopenharmony_ci * pages are contained in mm_context_t::pgtable_list.
18562306a36Sopenharmony_ci *
18662306a36Sopenharmony_ci * The upper byte (bits 24-31) of the parent page _refcount is used
18762306a36Sopenharmony_ci * for tracking contained 2KB-pgtables and has the following format:
18862306a36Sopenharmony_ci *
18962306a36Sopenharmony_ci *   PP  AA
19062306a36Sopenharmony_ci * 01234567    upper byte (bits 24-31) of struct page::_refcount
19162306a36Sopenharmony_ci *   ||  ||
19262306a36Sopenharmony_ci *   ||  |+--- upper 2KB-pgtable is allocated
19362306a36Sopenharmony_ci *   ||  +---- lower 2KB-pgtable is allocated
19462306a36Sopenharmony_ci *   |+------- upper 2KB-pgtable is pending for removal
19562306a36Sopenharmony_ci *   +-------- lower 2KB-pgtable is pending for removal
19662306a36Sopenharmony_ci *
19762306a36Sopenharmony_ci * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
19862306a36Sopenharmony_ci * using _refcount is possible).
19962306a36Sopenharmony_ci *
20062306a36Sopenharmony_ci * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
20162306a36Sopenharmony_ci * The parent page is either:
20262306a36Sopenharmony_ci *   - added to mm_context_t::pgtable_list in case the second half of the
20362306a36Sopenharmony_ci *     parent page is still unallocated;
 *   - removed from mm_context_t::pgtable_list in case both halves of the
20562306a36Sopenharmony_ci *     parent page are allocated;
20662306a36Sopenharmony_ci * These operations are protected with mm_context_t::lock.
20762306a36Sopenharmony_ci *
20862306a36Sopenharmony_ci * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
20962306a36Sopenharmony_ci * and the corresponding PP bit is set to 1 in a single atomic operation.
21062306a36Sopenharmony_ci * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
21162306a36Sopenharmony_ci * exclusive and may never be both set to 1!
21262306a36Sopenharmony_ci * The parent page is either:
21362306a36Sopenharmony_ci *   - added to mm_context_t::pgtable_list in case the second half of the
21462306a36Sopenharmony_ci *     parent page is still allocated;
21562306a36Sopenharmony_ci *   - removed from mm_context_t::pgtable_list in case the second half of
21662306a36Sopenharmony_ci *     the parent page is unallocated;
21762306a36Sopenharmony_ci * These operations are protected with mm_context_t::lock.
21862306a36Sopenharmony_ci *
21962306a36Sopenharmony_ci * It is important to understand that mm_context_t::lock only protects
22062306a36Sopenharmony_ci * mm_context_t::pgtable_list and AA bits, but not the parent page itself
22162306a36Sopenharmony_ci * and PP bits.
22262306a36Sopenharmony_ci *
22362306a36Sopenharmony_ci * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
22462306a36Sopenharmony_ci * while both AA bits and the second PP bit are already unset. Then the
22562306a36Sopenharmony_ci * parent page does not contain any 2KB-pgtable fragment anymore, and it has
22662306a36Sopenharmony_ci * also been removed from mm_context_t::pgtable_list. It is safe to release
22762306a36Sopenharmony_ci * the page therefore.
22862306a36Sopenharmony_ci *
22962306a36Sopenharmony_ci * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
23062306a36Sopenharmony_ci * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
23162306a36Sopenharmony_ci * while the PP bits are never used, nor such a page is added to or removed
23262306a36Sopenharmony_ci * from mm_context_t::pgtable_list.
23362306a36Sopenharmony_ci *
23462306a36Sopenharmony_ci * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
23562306a36Sopenharmony_ci * and prevents both 2K fragments from being reused. pte_free_defer() has to
23662306a36Sopenharmony_ci * guarantee that its pgtable cannot be reused before the RCU grace period
23762306a36Sopenharmony_ci * has elapsed (which page_table_free_rcu() does not actually guarantee).
23862306a36Sopenharmony_ci * But for simplicity, because page->rcu_head overlays page->lru, and because
23962306a36Sopenharmony_ci * the RCU callback might not be called before the mm_context_t has been freed,
24062306a36Sopenharmony_ci * pte_free_defer() in this implementation prevents both fragments from being
24162306a36Sopenharmony_ci * reused, and delays making the call to RCU until both fragments are freed.
24262306a36Sopenharmony_ci */
24362306a36Sopenharmony_ciunsigned long *page_table_alloc(struct mm_struct *mm)
24462306a36Sopenharmony_ci{
24562306a36Sopenharmony_ci	unsigned long *table;
24662306a36Sopenharmony_ci	struct ptdesc *ptdesc;
24762306a36Sopenharmony_ci	unsigned int mask, bit;
24862306a36Sopenharmony_ci
24962306a36Sopenharmony_ci	/* Try to get a fragment of a 4K page as a 2K page table */
25062306a36Sopenharmony_ci	if (!mm_alloc_pgste(mm)) {
25162306a36Sopenharmony_ci		table = NULL;
25262306a36Sopenharmony_ci		spin_lock_bh(&mm->context.lock);
25362306a36Sopenharmony_ci		if (!list_empty(&mm->context.pgtable_list)) {
25462306a36Sopenharmony_ci			ptdesc = list_first_entry(&mm->context.pgtable_list,
25562306a36Sopenharmony_ci						struct ptdesc, pt_list);
25662306a36Sopenharmony_ci			mask = atomic_read(&ptdesc->_refcount) >> 24;
25762306a36Sopenharmony_ci			/*
25862306a36Sopenharmony_ci			 * The pending removal bits must also be checked.
25962306a36Sopenharmony_ci			 * Failure to do so might lead to an impossible
26062306a36Sopenharmony_ci			 * value of (i.e 0x13 or 0x23) written to _refcount.
26162306a36Sopenharmony_ci			 * Such values violate the assumption that pending and
26262306a36Sopenharmony_ci			 * allocation bits are mutually exclusive, and the rest
26362306a36Sopenharmony_ci			 * of the code unrails as result. That could lead to
26462306a36Sopenharmony_ci			 * a whole bunch of races and corruptions.
26562306a36Sopenharmony_ci			 */
26662306a36Sopenharmony_ci			mask = (mask | (mask >> 4)) & 0x03U;
26762306a36Sopenharmony_ci			if (mask != 0x03U) {
26862306a36Sopenharmony_ci				table = (unsigned long *) ptdesc_to_virt(ptdesc);
26962306a36Sopenharmony_ci				bit = mask & 1;		/* =1 -> second 2K */
27062306a36Sopenharmony_ci				if (bit)
27162306a36Sopenharmony_ci					table += PTRS_PER_PTE;
27262306a36Sopenharmony_ci				atomic_xor_bits(&ptdesc->_refcount,
27362306a36Sopenharmony_ci							0x01U << (bit + 24));
27462306a36Sopenharmony_ci				list_del_init(&ptdesc->pt_list);
27562306a36Sopenharmony_ci			}
27662306a36Sopenharmony_ci		}
27762306a36Sopenharmony_ci		spin_unlock_bh(&mm->context.lock);
27862306a36Sopenharmony_ci		if (table)
27962306a36Sopenharmony_ci			return table;
28062306a36Sopenharmony_ci	}
28162306a36Sopenharmony_ci	/* Allocate a fresh page */
28262306a36Sopenharmony_ci	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
28362306a36Sopenharmony_ci	if (!ptdesc)
28462306a36Sopenharmony_ci		return NULL;
28562306a36Sopenharmony_ci	if (!pagetable_pte_ctor(ptdesc)) {
28662306a36Sopenharmony_ci		pagetable_free(ptdesc);
28762306a36Sopenharmony_ci		return NULL;
28862306a36Sopenharmony_ci	}
28962306a36Sopenharmony_ci	arch_set_page_dat(ptdesc_page(ptdesc), 0);
29062306a36Sopenharmony_ci	/* Initialize page table */
29162306a36Sopenharmony_ci	table = (unsigned long *) ptdesc_to_virt(ptdesc);
29262306a36Sopenharmony_ci	if (mm_alloc_pgste(mm)) {
29362306a36Sopenharmony_ci		/* Return 4K page table with PGSTEs */
29462306a36Sopenharmony_ci		INIT_LIST_HEAD(&ptdesc->pt_list);
29562306a36Sopenharmony_ci		atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
29662306a36Sopenharmony_ci		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
29762306a36Sopenharmony_ci		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
29862306a36Sopenharmony_ci	} else {
29962306a36Sopenharmony_ci		/* Return the first 2K fragment of the page */
30062306a36Sopenharmony_ci		atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
30162306a36Sopenharmony_ci		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
30262306a36Sopenharmony_ci		spin_lock_bh(&mm->context.lock);
30362306a36Sopenharmony_ci		list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
30462306a36Sopenharmony_ci		spin_unlock_bh(&mm->context.lock);
30562306a36Sopenharmony_ci	}
30662306a36Sopenharmony_ci	return table;
30762306a36Sopenharmony_ci}
30862306a36Sopenharmony_ci
30962306a36Sopenharmony_cistatic void page_table_release_check(struct page *page, void *table,
31062306a36Sopenharmony_ci				     unsigned int half, unsigned int mask)
31162306a36Sopenharmony_ci{
31262306a36Sopenharmony_ci	char msg[128];
31362306a36Sopenharmony_ci
31462306a36Sopenharmony_ci	if (!IS_ENABLED(CONFIG_DEBUG_VM))
31562306a36Sopenharmony_ci		return;
31662306a36Sopenharmony_ci	if (!mask && list_empty(&page->lru))
31762306a36Sopenharmony_ci		return;
31862306a36Sopenharmony_ci	snprintf(msg, sizeof(msg),
31962306a36Sopenharmony_ci		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
32062306a36Sopenharmony_ci		 table, half, mask);
32162306a36Sopenharmony_ci	dump_page(page, msg);
32262306a36Sopenharmony_ci}
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_cistatic void pte_free_now(struct rcu_head *head)
32562306a36Sopenharmony_ci{
32662306a36Sopenharmony_ci	struct ptdesc *ptdesc;
32762306a36Sopenharmony_ci
32862306a36Sopenharmony_ci	ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
32962306a36Sopenharmony_ci	pagetable_pte_dtor(ptdesc);
33062306a36Sopenharmony_ci	pagetable_free(ptdesc);
33162306a36Sopenharmony_ci}
33262306a36Sopenharmony_ci
33362306a36Sopenharmony_civoid page_table_free(struct mm_struct *mm, unsigned long *table)
33462306a36Sopenharmony_ci{
33562306a36Sopenharmony_ci	unsigned int mask, bit, half;
33662306a36Sopenharmony_ci	struct ptdesc *ptdesc = virt_to_ptdesc(table);
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci	if (!mm_alloc_pgste(mm)) {
33962306a36Sopenharmony_ci		/* Free 2K page table fragment of a 4K page */
34062306a36Sopenharmony_ci		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
34162306a36Sopenharmony_ci		spin_lock_bh(&mm->context.lock);
34262306a36Sopenharmony_ci		/*
34362306a36Sopenharmony_ci		 * Mark the page for delayed release. The actual release
34462306a36Sopenharmony_ci		 * will happen outside of the critical section from this
34562306a36Sopenharmony_ci		 * function or from __tlb_remove_table()
34662306a36Sopenharmony_ci		 */
34762306a36Sopenharmony_ci		mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
34862306a36Sopenharmony_ci		mask >>= 24;
34962306a36Sopenharmony_ci		if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
35062306a36Sopenharmony_ci			/*
35162306a36Sopenharmony_ci			 * Other half is allocated, and neither half has had
35262306a36Sopenharmony_ci			 * its free deferred: add page to head of list, to make
35362306a36Sopenharmony_ci			 * this freed half available for immediate reuse.
35462306a36Sopenharmony_ci			 */
35562306a36Sopenharmony_ci			list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
35662306a36Sopenharmony_ci		} else {
35762306a36Sopenharmony_ci			/* If page is on list, now remove it. */
35862306a36Sopenharmony_ci			list_del_init(&ptdesc->pt_list);
35962306a36Sopenharmony_ci		}
36062306a36Sopenharmony_ci		spin_unlock_bh(&mm->context.lock);
36162306a36Sopenharmony_ci		mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
36262306a36Sopenharmony_ci		mask >>= 24;
36362306a36Sopenharmony_ci		if (mask != 0x00U)
36462306a36Sopenharmony_ci			return;
36562306a36Sopenharmony_ci		half = 0x01U << bit;
36662306a36Sopenharmony_ci	} else {
36762306a36Sopenharmony_ci		half = 0x03U;
36862306a36Sopenharmony_ci		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
36962306a36Sopenharmony_ci		mask >>= 24;
37062306a36Sopenharmony_ci	}
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
37362306a36Sopenharmony_ci	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
37462306a36Sopenharmony_ci		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
37562306a36Sopenharmony_ci	else
37662306a36Sopenharmony_ci		pte_free_now(&ptdesc->pt_rcu_head);
37762306a36Sopenharmony_ci}
37862306a36Sopenharmony_ci
37962306a36Sopenharmony_civoid page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
38062306a36Sopenharmony_ci			 unsigned long vmaddr)
38162306a36Sopenharmony_ci{
38262306a36Sopenharmony_ci	struct mm_struct *mm;
38362306a36Sopenharmony_ci	unsigned int bit, mask;
38462306a36Sopenharmony_ci	struct ptdesc *ptdesc = virt_to_ptdesc(table);
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	mm = tlb->mm;
38762306a36Sopenharmony_ci	if (mm_alloc_pgste(mm)) {
38862306a36Sopenharmony_ci		gmap_unlink(mm, table, vmaddr);
38962306a36Sopenharmony_ci		table = (unsigned long *) ((unsigned long)table | 0x03U);
39062306a36Sopenharmony_ci		tlb_remove_ptdesc(tlb, table);
39162306a36Sopenharmony_ci		return;
39262306a36Sopenharmony_ci	}
39362306a36Sopenharmony_ci	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
39462306a36Sopenharmony_ci	spin_lock_bh(&mm->context.lock);
39562306a36Sopenharmony_ci	/*
39662306a36Sopenharmony_ci	 * Mark the page for delayed release. The actual release will happen
39762306a36Sopenharmony_ci	 * outside of the critical section from __tlb_remove_table() or from
39862306a36Sopenharmony_ci	 * page_table_free()
39962306a36Sopenharmony_ci	 */
40062306a36Sopenharmony_ci	mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
40162306a36Sopenharmony_ci	mask >>= 24;
40262306a36Sopenharmony_ci	if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
40362306a36Sopenharmony_ci		/*
40462306a36Sopenharmony_ci		 * Other half is allocated, and neither half has had
40562306a36Sopenharmony_ci		 * its free deferred: add page to end of list, to make
40662306a36Sopenharmony_ci		 * this freed half available for reuse once its pending
40762306a36Sopenharmony_ci		 * bit has been cleared by __tlb_remove_table().
40862306a36Sopenharmony_ci		 */
40962306a36Sopenharmony_ci		list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
41062306a36Sopenharmony_ci	} else {
41162306a36Sopenharmony_ci		/* If page is on list, now remove it. */
41262306a36Sopenharmony_ci		list_del_init(&ptdesc->pt_list);
41362306a36Sopenharmony_ci	}
41462306a36Sopenharmony_ci	spin_unlock_bh(&mm->context.lock);
41562306a36Sopenharmony_ci	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
41662306a36Sopenharmony_ci	tlb_remove_table(tlb, table);
41762306a36Sopenharmony_ci}
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_civoid __tlb_remove_table(void *_table)
42062306a36Sopenharmony_ci{
42162306a36Sopenharmony_ci	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
42262306a36Sopenharmony_ci	void *table = (void *)((unsigned long) _table ^ mask);
42362306a36Sopenharmony_ci	struct ptdesc *ptdesc = virt_to_ptdesc(table);
42462306a36Sopenharmony_ci
42562306a36Sopenharmony_ci	switch (half) {
42662306a36Sopenharmony_ci	case 0x00U:	/* pmd, pud, or p4d */
42762306a36Sopenharmony_ci		pagetable_free(ptdesc);
42862306a36Sopenharmony_ci		return;
42962306a36Sopenharmony_ci	case 0x01U:	/* lower 2K of a 4K page table */
43062306a36Sopenharmony_ci	case 0x02U:	/* higher 2K of a 4K page table */
43162306a36Sopenharmony_ci		mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
43262306a36Sopenharmony_ci		mask >>= 24;
43362306a36Sopenharmony_ci		if (mask != 0x00U)
43462306a36Sopenharmony_ci			return;
43562306a36Sopenharmony_ci		break;
43662306a36Sopenharmony_ci	case 0x03U:	/* 4K page table with pgstes */
43762306a36Sopenharmony_ci		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
43862306a36Sopenharmony_ci		mask >>= 24;
43962306a36Sopenharmony_ci		break;
44062306a36Sopenharmony_ci	}
44162306a36Sopenharmony_ci
44262306a36Sopenharmony_ci	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
44362306a36Sopenharmony_ci	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
44462306a36Sopenharmony_ci		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
44562306a36Sopenharmony_ci	else
44662306a36Sopenharmony_ci		pte_free_now(&ptdesc->pt_rcu_head);
44762306a36Sopenharmony_ci}
44862306a36Sopenharmony_ci
44962306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
45062306a36Sopenharmony_civoid pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
45162306a36Sopenharmony_ci{
45262306a36Sopenharmony_ci	struct page *page;
45362306a36Sopenharmony_ci
45462306a36Sopenharmony_ci	page = virt_to_page(pgtable);
45562306a36Sopenharmony_ci	SetPageActive(page);
45662306a36Sopenharmony_ci	page_table_free(mm, (unsigned long *)pgtable);
45762306a36Sopenharmony_ci	/*
45862306a36Sopenharmony_ci	 * page_table_free() does not do the pgste gmap_unlink() which
45962306a36Sopenharmony_ci	 * page_table_free_rcu() does: warn us if pgste ever reaches here.
46062306a36Sopenharmony_ci	 */
46162306a36Sopenharmony_ci	WARN_ON_ONCE(mm_has_pgste(mm));
46262306a36Sopenharmony_ci}
46362306a36Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
46462306a36Sopenharmony_ci
46562306a36Sopenharmony_ci/*
46662306a36Sopenharmony_ci * Base infrastructure required to generate basic asces, region, segment,
46762306a36Sopenharmony_ci * and page tables that do not make use of enhanced features like EDAT1.
46862306a36Sopenharmony_ci */
46962306a36Sopenharmony_ci
47062306a36Sopenharmony_cistatic struct kmem_cache *base_pgt_cache;
47162306a36Sopenharmony_ci
47262306a36Sopenharmony_cistatic unsigned long *base_pgt_alloc(void)
47362306a36Sopenharmony_ci{
47462306a36Sopenharmony_ci	unsigned long *table;
47562306a36Sopenharmony_ci
47662306a36Sopenharmony_ci	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
47762306a36Sopenharmony_ci	if (table)
47862306a36Sopenharmony_ci		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
47962306a36Sopenharmony_ci	return table;
48062306a36Sopenharmony_ci}
48162306a36Sopenharmony_ci
48262306a36Sopenharmony_cistatic void base_pgt_free(unsigned long *table)
48362306a36Sopenharmony_ci{
48462306a36Sopenharmony_ci	kmem_cache_free(base_pgt_cache, table);
48562306a36Sopenharmony_ci}
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_cistatic unsigned long *base_crst_alloc(unsigned long val)
48862306a36Sopenharmony_ci{
48962306a36Sopenharmony_ci	unsigned long *table;
49062306a36Sopenharmony_ci	struct ptdesc *ptdesc;
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci	ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER);
49362306a36Sopenharmony_ci	if (!ptdesc)
49462306a36Sopenharmony_ci		return NULL;
49562306a36Sopenharmony_ci	table = ptdesc_address(ptdesc);
49662306a36Sopenharmony_ci
49762306a36Sopenharmony_ci	crst_table_init(table, val);
49862306a36Sopenharmony_ci	return table;
49962306a36Sopenharmony_ci}
50062306a36Sopenharmony_ci
/*
 * Free a region/segment (CRST) table obtained from base_crst_alloc().
 *
 * A NULL @table is accepted and ignored, matching the usual convention
 * for free routines: converting NULL to a ptdesc and passing it to
 * pagetable_free() would release an unrelated page.
 */
static void base_crst_free(unsigned long *table)
{
	if (!table)
		return;
	pagetable_free(virt_to_ptdesc(table));
}
50562306a36Sopenharmony_ci
50662306a36Sopenharmony_ci#define BASE_ADDR_END_FUNC(NAME, SIZE)					\
50762306a36Sopenharmony_cistatic inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
50862306a36Sopenharmony_ci						   unsigned long end)	\
50962306a36Sopenharmony_ci{									\
51062306a36Sopenharmony_ci	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
51162306a36Sopenharmony_ci									\
51262306a36Sopenharmony_ci	return (next - 1) < (end - 1) ? next : end;			\
51362306a36Sopenharmony_ci}
51462306a36Sopenharmony_ci
51562306a36Sopenharmony_ciBASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
51662306a36Sopenharmony_ciBASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
51762306a36Sopenharmony_ciBASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
51862306a36Sopenharmony_ciBASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
51962306a36Sopenharmony_ciBASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
52062306a36Sopenharmony_ci
52162306a36Sopenharmony_cistatic inline unsigned long base_lra(unsigned long address)
52262306a36Sopenharmony_ci{
52362306a36Sopenharmony_ci	unsigned long real;
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ci	asm volatile(
52662306a36Sopenharmony_ci		"	lra	%0,0(%1)\n"
52762306a36Sopenharmony_ci		: "=d" (real) : "a" (address) : "cc");
52862306a36Sopenharmony_ci	return real;
52962306a36Sopenharmony_ci}
53062306a36Sopenharmony_ci
53162306a36Sopenharmony_cistatic int base_page_walk(unsigned long *origin, unsigned long addr,
53262306a36Sopenharmony_ci			  unsigned long end, int alloc)
53362306a36Sopenharmony_ci{
53462306a36Sopenharmony_ci	unsigned long *pte, next;
53562306a36Sopenharmony_ci
53662306a36Sopenharmony_ci	if (!alloc)
53762306a36Sopenharmony_ci		return 0;
53862306a36Sopenharmony_ci	pte = origin;
53962306a36Sopenharmony_ci	pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
54062306a36Sopenharmony_ci	do {
54162306a36Sopenharmony_ci		next = base_page_addr_end(addr, end);
54262306a36Sopenharmony_ci		*pte = base_lra(addr);
54362306a36Sopenharmony_ci	} while (pte++, addr = next, addr < end);
54462306a36Sopenharmony_ci	return 0;
54562306a36Sopenharmony_ci}
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_cistatic int base_segment_walk(unsigned long *origin, unsigned long addr,
54862306a36Sopenharmony_ci			     unsigned long end, int alloc)
54962306a36Sopenharmony_ci{
55062306a36Sopenharmony_ci	unsigned long *ste, next, *table;
55162306a36Sopenharmony_ci	int rc;
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci	ste = origin;
55462306a36Sopenharmony_ci	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
55562306a36Sopenharmony_ci	do {
55662306a36Sopenharmony_ci		next = base_segment_addr_end(addr, end);
55762306a36Sopenharmony_ci		if (*ste & _SEGMENT_ENTRY_INVALID) {
55862306a36Sopenharmony_ci			if (!alloc)
55962306a36Sopenharmony_ci				continue;
56062306a36Sopenharmony_ci			table = base_pgt_alloc();
56162306a36Sopenharmony_ci			if (!table)
56262306a36Sopenharmony_ci				return -ENOMEM;
56362306a36Sopenharmony_ci			*ste = __pa(table) | _SEGMENT_ENTRY;
56462306a36Sopenharmony_ci		}
56562306a36Sopenharmony_ci		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
56662306a36Sopenharmony_ci		rc = base_page_walk(table, addr, next, alloc);
56762306a36Sopenharmony_ci		if (rc)
56862306a36Sopenharmony_ci			return rc;
56962306a36Sopenharmony_ci		if (!alloc)
57062306a36Sopenharmony_ci			base_pgt_free(table);
57162306a36Sopenharmony_ci		cond_resched();
57262306a36Sopenharmony_ci	} while (ste++, addr = next, addr < end);
57362306a36Sopenharmony_ci	return 0;
57462306a36Sopenharmony_ci}
57562306a36Sopenharmony_ci
57662306a36Sopenharmony_cistatic int base_region3_walk(unsigned long *origin, unsigned long addr,
57762306a36Sopenharmony_ci			     unsigned long end, int alloc)
57862306a36Sopenharmony_ci{
57962306a36Sopenharmony_ci	unsigned long *rtte, next, *table;
58062306a36Sopenharmony_ci	int rc;
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ci	rtte = origin;
58362306a36Sopenharmony_ci	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
58462306a36Sopenharmony_ci	do {
58562306a36Sopenharmony_ci		next = base_region3_addr_end(addr, end);
58662306a36Sopenharmony_ci		if (*rtte & _REGION_ENTRY_INVALID) {
58762306a36Sopenharmony_ci			if (!alloc)
58862306a36Sopenharmony_ci				continue;
58962306a36Sopenharmony_ci			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
59062306a36Sopenharmony_ci			if (!table)
59162306a36Sopenharmony_ci				return -ENOMEM;
59262306a36Sopenharmony_ci			*rtte = __pa(table) | _REGION3_ENTRY;
59362306a36Sopenharmony_ci		}
59462306a36Sopenharmony_ci		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
59562306a36Sopenharmony_ci		rc = base_segment_walk(table, addr, next, alloc);
59662306a36Sopenharmony_ci		if (rc)
59762306a36Sopenharmony_ci			return rc;
59862306a36Sopenharmony_ci		if (!alloc)
59962306a36Sopenharmony_ci			base_crst_free(table);
60062306a36Sopenharmony_ci	} while (rtte++, addr = next, addr < end);
60162306a36Sopenharmony_ci	return 0;
60262306a36Sopenharmony_ci}
60362306a36Sopenharmony_ci
60462306a36Sopenharmony_cistatic int base_region2_walk(unsigned long *origin, unsigned long addr,
60562306a36Sopenharmony_ci			     unsigned long end, int alloc)
60662306a36Sopenharmony_ci{
60762306a36Sopenharmony_ci	unsigned long *rste, next, *table;
60862306a36Sopenharmony_ci	int rc;
60962306a36Sopenharmony_ci
61062306a36Sopenharmony_ci	rste = origin;
61162306a36Sopenharmony_ci	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
61262306a36Sopenharmony_ci	do {
61362306a36Sopenharmony_ci		next = base_region2_addr_end(addr, end);
61462306a36Sopenharmony_ci		if (*rste & _REGION_ENTRY_INVALID) {
61562306a36Sopenharmony_ci			if (!alloc)
61662306a36Sopenharmony_ci				continue;
61762306a36Sopenharmony_ci			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
61862306a36Sopenharmony_ci			if (!table)
61962306a36Sopenharmony_ci				return -ENOMEM;
62062306a36Sopenharmony_ci			*rste = __pa(table) | _REGION2_ENTRY;
62162306a36Sopenharmony_ci		}
62262306a36Sopenharmony_ci		table = __va(*rste & _REGION_ENTRY_ORIGIN);
62362306a36Sopenharmony_ci		rc = base_region3_walk(table, addr, next, alloc);
62462306a36Sopenharmony_ci		if (rc)
62562306a36Sopenharmony_ci			return rc;
62662306a36Sopenharmony_ci		if (!alloc)
62762306a36Sopenharmony_ci			base_crst_free(table);
62862306a36Sopenharmony_ci	} while (rste++, addr = next, addr < end);
62962306a36Sopenharmony_ci	return 0;
63062306a36Sopenharmony_ci}
63162306a36Sopenharmony_ci
63262306a36Sopenharmony_cistatic int base_region1_walk(unsigned long *origin, unsigned long addr,
63362306a36Sopenharmony_ci			     unsigned long end, int alloc)
63462306a36Sopenharmony_ci{
63562306a36Sopenharmony_ci	unsigned long *rfte, next, *table;
63662306a36Sopenharmony_ci	int rc;
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci	rfte = origin;
63962306a36Sopenharmony_ci	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
64062306a36Sopenharmony_ci	do {
64162306a36Sopenharmony_ci		next = base_region1_addr_end(addr, end);
64262306a36Sopenharmony_ci		if (*rfte & _REGION_ENTRY_INVALID) {
64362306a36Sopenharmony_ci			if (!alloc)
64462306a36Sopenharmony_ci				continue;
64562306a36Sopenharmony_ci			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
64662306a36Sopenharmony_ci			if (!table)
64762306a36Sopenharmony_ci				return -ENOMEM;
64862306a36Sopenharmony_ci			*rfte = __pa(table) | _REGION1_ENTRY;
64962306a36Sopenharmony_ci		}
65062306a36Sopenharmony_ci		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
65162306a36Sopenharmony_ci		rc = base_region2_walk(table, addr, next, alloc);
65262306a36Sopenharmony_ci		if (rc)
65362306a36Sopenharmony_ci			return rc;
65462306a36Sopenharmony_ci		if (!alloc)
65562306a36Sopenharmony_ci			base_crst_free(table);
65662306a36Sopenharmony_ci	} while (rfte++, addr = next, addr < end);
65762306a36Sopenharmony_ci	return 0;
65862306a36Sopenharmony_ci}
65962306a36Sopenharmony_ci
66062306a36Sopenharmony_ci/**
66162306a36Sopenharmony_ci * base_asce_free - free asce and tables returned from base_asce_alloc()
66262306a36Sopenharmony_ci * @asce: asce to be freed
66362306a36Sopenharmony_ci *
66462306a36Sopenharmony_ci * Frees all region, segment, and page tables that were allocated with a
66562306a36Sopenharmony_ci * corresponding base_asce_alloc() call.
66662306a36Sopenharmony_ci */
66762306a36Sopenharmony_civoid base_asce_free(unsigned long asce)
66862306a36Sopenharmony_ci{
66962306a36Sopenharmony_ci	unsigned long *table = __va(asce & _ASCE_ORIGIN);
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci	if (!asce)
67262306a36Sopenharmony_ci		return;
67362306a36Sopenharmony_ci	switch (asce & _ASCE_TYPE_MASK) {
67462306a36Sopenharmony_ci	case _ASCE_TYPE_SEGMENT:
67562306a36Sopenharmony_ci		base_segment_walk(table, 0, _REGION3_SIZE, 0);
67662306a36Sopenharmony_ci		break;
67762306a36Sopenharmony_ci	case _ASCE_TYPE_REGION3:
67862306a36Sopenharmony_ci		base_region3_walk(table, 0, _REGION2_SIZE, 0);
67962306a36Sopenharmony_ci		break;
68062306a36Sopenharmony_ci	case _ASCE_TYPE_REGION2:
68162306a36Sopenharmony_ci		base_region2_walk(table, 0, _REGION1_SIZE, 0);
68262306a36Sopenharmony_ci		break;
68362306a36Sopenharmony_ci	case _ASCE_TYPE_REGION1:
68462306a36Sopenharmony_ci		base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
68562306a36Sopenharmony_ci		break;
68662306a36Sopenharmony_ci	}
68762306a36Sopenharmony_ci	base_crst_free(table);
68862306a36Sopenharmony_ci}
68962306a36Sopenharmony_ci
69062306a36Sopenharmony_cistatic int base_pgt_cache_init(void)
69162306a36Sopenharmony_ci{
69262306a36Sopenharmony_ci	static DEFINE_MUTEX(base_pgt_cache_mutex);
69362306a36Sopenharmony_ci	unsigned long sz = _PAGE_TABLE_SIZE;
69462306a36Sopenharmony_ci
69562306a36Sopenharmony_ci	if (base_pgt_cache)
69662306a36Sopenharmony_ci		return 0;
69762306a36Sopenharmony_ci	mutex_lock(&base_pgt_cache_mutex);
69862306a36Sopenharmony_ci	if (!base_pgt_cache)
69962306a36Sopenharmony_ci		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
70062306a36Sopenharmony_ci	mutex_unlock(&base_pgt_cache_mutex);
70162306a36Sopenharmony_ci	return base_pgt_cache ? 0 : -ENOMEM;
70262306a36Sopenharmony_ci}
70362306a36Sopenharmony_ci
70462306a36Sopenharmony_ci/**
70562306a36Sopenharmony_ci * base_asce_alloc - create kernel mapping without enhanced DAT features
70662306a36Sopenharmony_ci * @addr: virtual start address of kernel mapping
70762306a36Sopenharmony_ci * @num_pages: number of consecutive pages
70862306a36Sopenharmony_ci *
70962306a36Sopenharmony_ci * Generate an asce, including all required region, segment and page tables,
71062306a36Sopenharmony_ci * that can be used to access the virtual kernel mapping. The difference is
71162306a36Sopenharmony_ci * that the returned asce does not make use of any enhanced DAT features like
71262306a36Sopenharmony_ci * e.g. large pages. This is required for some I/O functions that pass an
71362306a36Sopenharmony_ci * asce, like e.g. some service call requests.
71462306a36Sopenharmony_ci *
71562306a36Sopenharmony_ci * Note: the returned asce may NEVER be attached to any cpu. It may only be
71662306a36Sopenharmony_ci *	 used for I/O requests. tlb entries that might result because the
71762306a36Sopenharmony_ci *	 asce was attached to a cpu won't be cleared.
71862306a36Sopenharmony_ci */
71962306a36Sopenharmony_ciunsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
72062306a36Sopenharmony_ci{
72162306a36Sopenharmony_ci	unsigned long asce, *table, end;
72262306a36Sopenharmony_ci	int rc;
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci	if (base_pgt_cache_init())
72562306a36Sopenharmony_ci		return 0;
72662306a36Sopenharmony_ci	end = addr + num_pages * PAGE_SIZE;
72762306a36Sopenharmony_ci	if (end <= _REGION3_SIZE) {
72862306a36Sopenharmony_ci		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
72962306a36Sopenharmony_ci		if (!table)
73062306a36Sopenharmony_ci			return 0;
73162306a36Sopenharmony_ci		rc = base_segment_walk(table, addr, end, 1);
73262306a36Sopenharmony_ci		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
73362306a36Sopenharmony_ci	} else if (end <= _REGION2_SIZE) {
73462306a36Sopenharmony_ci		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
73562306a36Sopenharmony_ci		if (!table)
73662306a36Sopenharmony_ci			return 0;
73762306a36Sopenharmony_ci		rc = base_region3_walk(table, addr, end, 1);
73862306a36Sopenharmony_ci		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
73962306a36Sopenharmony_ci	} else if (end <= _REGION1_SIZE) {
74062306a36Sopenharmony_ci		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
74162306a36Sopenharmony_ci		if (!table)
74262306a36Sopenharmony_ci			return 0;
74362306a36Sopenharmony_ci		rc = base_region2_walk(table, addr, end, 1);
74462306a36Sopenharmony_ci		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
74562306a36Sopenharmony_ci	} else {
74662306a36Sopenharmony_ci		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
74762306a36Sopenharmony_ci		if (!table)
74862306a36Sopenharmony_ci			return 0;
74962306a36Sopenharmony_ci		rc = base_region1_walk(table, addr, end, 1);
75062306a36Sopenharmony_ci		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
75162306a36Sopenharmony_ci	}
75262306a36Sopenharmony_ci	if (rc) {
75362306a36Sopenharmony_ci		base_asce_free(asce);
75462306a36Sopenharmony_ci		asce = 0;
75562306a36Sopenharmony_ci	}
75662306a36Sopenharmony_ci	return asce;
75762306a36Sopenharmony_ci}
758