162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */ 262306a36Sopenharmony_ci#ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H 362306a36Sopenharmony_ci#define _ASM_POWERPC_BOOK3S_64_HASH_64K_H 462306a36Sopenharmony_ci 562306a36Sopenharmony_ci#define H_PTE_INDEX_SIZE 8 // size: 8B << 8 = 2KB, maps 2^8 x 64KB = 16MB 662306a36Sopenharmony_ci#define H_PMD_INDEX_SIZE 10 // size: 8B << 10 = 8KB, maps 2^10 x 16MB = 16GB 762306a36Sopenharmony_ci#define H_PUD_INDEX_SIZE 10 // size: 8B << 10 = 8KB, maps 2^10 x 16GB = 16TB 862306a36Sopenharmony_ci#define H_PGD_INDEX_SIZE 8 // size: 8B << 8 = 2KB, maps 2^8 x 16TB = 4PB 962306a36Sopenharmony_ci 1062306a36Sopenharmony_ci/* 1162306a36Sopenharmony_ci * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS 1262306a36Sopenharmony_ci * if we increase SECTIONS_WIDTH we will not store node details in page->flags and 1362306a36Sopenharmony_ci * page_to_nid does a page->section->node lookup 1462306a36Sopenharmony_ci * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce 1562306a36Sopenharmony_ci * memory requirements with large number of sections. 1662306a36Sopenharmony_ci * 51 bits is the max physical real address on POWER9 1762306a36Sopenharmony_ci */ 1862306a36Sopenharmony_ci#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) 1962306a36Sopenharmony_ci#define H_MAX_PHYSMEM_BITS 51 2062306a36Sopenharmony_ci#else 2162306a36Sopenharmony_ci#define H_MAX_PHYSMEM_BITS 46 2262306a36Sopenharmony_ci#endif 2362306a36Sopenharmony_ci 2462306a36Sopenharmony_ci/* 2562306a36Sopenharmony_ci * Each context is 512TB size. SLB miss for first context/default context 2662306a36Sopenharmony_ci * is handled in the hotpath. 2762306a36Sopenharmony_ci */ 2862306a36Sopenharmony_ci#define MAX_EA_BITS_PER_CONTEXT 49 2962306a36Sopenharmony_ci#define REGION_SHIFT MAX_EA_BITS_PER_CONTEXT 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci/* 3262306a36Sopenharmony_ci * We use one context for each MAP area. 3362306a36Sopenharmony_ci */ 3462306a36Sopenharmony_ci#define H_KERN_MAP_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_ci/* 3762306a36Sopenharmony_ci * Define the address range of the kernel non-linear virtual area 3862306a36Sopenharmony_ci * 2PB 3962306a36Sopenharmony_ci */ 4062306a36Sopenharmony_ci#define H_KERN_VIRT_START ASM_CONST(0xc008000000000000) 4162306a36Sopenharmony_ci 4262306a36Sopenharmony_ci/* 4362306a36Sopenharmony_ci * 64k aligned address free up few of the lower bits of RPN for us 4462306a36Sopenharmony_ci * We steal that here. For more deatils look at pte_pfn/pfn_pte() 4562306a36Sopenharmony_ci */ 4662306a36Sopenharmony_ci#define H_PAGE_COMBO _RPAGE_RPN0 /* this is a combo 4k page */ 4762306a36Sopenharmony_ci#define H_PAGE_4K_PFN _RPAGE_RPN1 /* PFN is for a single 4k page */ 4862306a36Sopenharmony_ci#define H_PAGE_BUSY _RPAGE_RSV1 /* software: PTE & hash are busy */ 4962306a36Sopenharmony_ci#define H_PAGE_HASHPTE _RPAGE_RPN43 /* PTE has associated HPTE */ 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_ci/* memory key bits. */ 5262306a36Sopenharmony_ci#define H_PTE_PKEY_BIT4 _RPAGE_PKEY_BIT4 5362306a36Sopenharmony_ci#define H_PTE_PKEY_BIT3 _RPAGE_PKEY_BIT3 5462306a36Sopenharmony_ci#define H_PTE_PKEY_BIT2 _RPAGE_PKEY_BIT2 5562306a36Sopenharmony_ci#define H_PTE_PKEY_BIT1 _RPAGE_PKEY_BIT1 5662306a36Sopenharmony_ci#define H_PTE_PKEY_BIT0 _RPAGE_PKEY_BIT0 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_ci/* 5962306a36Sopenharmony_ci * We need to differentiate between explicit huge page and THP huge 6062306a36Sopenharmony_ci * page, since THP huge page also need to track real subpage details 6162306a36Sopenharmony_ci */ 6262306a36Sopenharmony_ci#define H_PAGE_THP_HUGE H_PAGE_4K_PFN 6362306a36Sopenharmony_ci 6462306a36Sopenharmony_ci/* PTE flags to conserve for HPTE identification */ 6562306a36Sopenharmony_ci#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | H_PAGE_COMBO) 6662306a36Sopenharmony_ci/* 6762306a36Sopenharmony_ci * We use a 2K PTE page fragment and another 2K for storing 6862306a36Sopenharmony_ci * real_pte_t hash index 6962306a36Sopenharmony_ci * 8 bytes per each pte entry and another 8 bytes for storing 7062306a36Sopenharmony_ci * slot details. 7162306a36Sopenharmony_ci */ 7262306a36Sopenharmony_ci#define H_PTE_FRAG_SIZE_SHIFT (H_PTE_INDEX_SIZE + 3 + 1) 7362306a36Sopenharmony_ci#define H_PTE_FRAG_NR (PAGE_SIZE >> H_PTE_FRAG_SIZE_SHIFT) 7462306a36Sopenharmony_ci 7562306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) 7662306a36Sopenharmony_ci#define H_PMD_FRAG_SIZE_SHIFT (H_PMD_INDEX_SIZE + 3 + 1) 7762306a36Sopenharmony_ci#else 7862306a36Sopenharmony_ci#define H_PMD_FRAG_SIZE_SHIFT (H_PMD_INDEX_SIZE + 3) 7962306a36Sopenharmony_ci#endif 8062306a36Sopenharmony_ci#define H_PMD_FRAG_NR (PAGE_SIZE >> H_PMD_FRAG_SIZE_SHIFT) 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_ci#ifndef __ASSEMBLY__ 8362306a36Sopenharmony_ci#include <asm/errno.h> 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_ci/* 8662306a36Sopenharmony_ci * With 64K pages on hash table, we have a special PTE format that 8762306a36Sopenharmony_ci * uses a second "half" of the page table to encode sub-page information 8862306a36Sopenharmony_ci * in order to deal with 64K made of 4K HW pages. Thus we override the 8962306a36Sopenharmony_ci * generic accessors and iterators here 9062306a36Sopenharmony_ci */ 9162306a36Sopenharmony_ci#define __real_pte __real_pte 9262306a36Sopenharmony_cistatic inline real_pte_t __real_pte(pte_t pte, pte_t *ptep, int offset) 9362306a36Sopenharmony_ci{ 9462306a36Sopenharmony_ci real_pte_t rpte; 9562306a36Sopenharmony_ci unsigned long *hidxp; 9662306a36Sopenharmony_ci 9762306a36Sopenharmony_ci rpte.pte = pte; 9862306a36Sopenharmony_ci 9962306a36Sopenharmony_ci /* 10062306a36Sopenharmony_ci * Ensure that we do not read the hidx before we read the PTE. Because 10162306a36Sopenharmony_ci * the writer side is expected to finish writing the hidx first followed 10262306a36Sopenharmony_ci * by the PTE, by using smp_wmb(). pte_set_hash_slot() ensures that. 10362306a36Sopenharmony_ci */ 10462306a36Sopenharmony_ci smp_rmb(); 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci hidxp = (unsigned long *)(ptep + offset); 10762306a36Sopenharmony_ci rpte.hidx = *hidxp; 10862306a36Sopenharmony_ci return rpte; 10962306a36Sopenharmony_ci} 11062306a36Sopenharmony_ci 11162306a36Sopenharmony_ci/* 11262306a36Sopenharmony_ci * shift the hidx representation by one-modulo-0xf; i.e hidx 0 is respresented 11362306a36Sopenharmony_ci * as 1, 1 as 2,... , and 0xf as 0. This convention lets us represent a 11462306a36Sopenharmony_ci * invalid hidx 0xf with a 0x0 bit value. PTEs are anyway zero'd when 11562306a36Sopenharmony_ci * allocated. We dont have to zero them gain; thus save on the initialization. 11662306a36Sopenharmony_ci */ 11762306a36Sopenharmony_ci#define HIDX_UNSHIFT_BY_ONE(x) ((x + 0xfUL) & 0xfUL) /* shift backward by one */ 11862306a36Sopenharmony_ci#define HIDX_SHIFT_BY_ONE(x) ((x + 0x1UL) & 0xfUL) /* shift forward by one */ 11962306a36Sopenharmony_ci#define HIDX_BITS(x, index) (x << (index << 2)) 12062306a36Sopenharmony_ci#define BITS_TO_HIDX(x, index) ((x >> (index << 2)) & 0xfUL) 12162306a36Sopenharmony_ci#define INVALID_RPTE_HIDX 0x0UL 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_cistatic inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index) 12462306a36Sopenharmony_ci{ 12562306a36Sopenharmony_ci return HIDX_UNSHIFT_BY_ONE(BITS_TO_HIDX(rpte.hidx, index)); 12662306a36Sopenharmony_ci} 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci/* 12962306a36Sopenharmony_ci * Commit the hidx and return PTE bits that needs to be modified. The caller is 13062306a36Sopenharmony_ci * expected to modify the PTE bits accordingly and commit the PTE to memory. 13162306a36Sopenharmony_ci */ 13262306a36Sopenharmony_cistatic inline unsigned long pte_set_hidx(pte_t *ptep, real_pte_t rpte, 13362306a36Sopenharmony_ci unsigned int subpg_index, 13462306a36Sopenharmony_ci unsigned long hidx, int offset) 13562306a36Sopenharmony_ci{ 13662306a36Sopenharmony_ci unsigned long *hidxp = (unsigned long *)(ptep + offset); 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci rpte.hidx &= ~HIDX_BITS(0xfUL, subpg_index); 13962306a36Sopenharmony_ci *hidxp = rpte.hidx | HIDX_BITS(HIDX_SHIFT_BY_ONE(hidx), subpg_index); 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci /* 14262306a36Sopenharmony_ci * Anyone reading PTE must ensure hidx bits are read after reading the 14362306a36Sopenharmony_ci * PTE by using the read-side barrier smp_rmb(). __real_pte() can be 14462306a36Sopenharmony_ci * used for that. 14562306a36Sopenharmony_ci */ 14662306a36Sopenharmony_ci smp_wmb(); 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_ci /* No PTE bits to be modified, return 0x0UL */ 14962306a36Sopenharmony_ci return 0x0UL; 15062306a36Sopenharmony_ci} 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci#define __rpte_to_pte(r) ((r).pte) 15362306a36Sopenharmony_ciextern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index); 15462306a36Sopenharmony_ci/* 15562306a36Sopenharmony_ci * Trick: we set __end to va + 64k, which happens works for 15662306a36Sopenharmony_ci * a 16M page as well as we want only one iteration 15762306a36Sopenharmony_ci */ 15862306a36Sopenharmony_ci#define pte_iterate_hashed_subpages(rpte, psize, vpn, index, shift) \ 15962306a36Sopenharmony_ci do { \ 16062306a36Sopenharmony_ci unsigned long __end = vpn + (1UL << (PAGE_SHIFT - VPN_SHIFT)); \ 16162306a36Sopenharmony_ci unsigned __split = (psize == MMU_PAGE_4K || \ 16262306a36Sopenharmony_ci psize == MMU_PAGE_64K_AP); \ 16362306a36Sopenharmony_ci shift = mmu_psize_defs[psize].shift; \ 16462306a36Sopenharmony_ci for (index = 0; vpn < __end; index++, \ 16562306a36Sopenharmony_ci vpn += (1L << (shift - VPN_SHIFT))) { \ 16662306a36Sopenharmony_ci if (!__split || __rpte_sub_valid(rpte, index)) 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_ci#define pte_iterate_hashed_end() } } while(0) 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci#define pte_pagesize_index(mm, addr, pte) \ 17162306a36Sopenharmony_ci (((pte) & H_PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ciextern int remap_pfn_range(struct vm_area_struct *, unsigned long addr, 17462306a36Sopenharmony_ci unsigned long pfn, unsigned long size, pgprot_t); 17562306a36Sopenharmony_cistatic inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, 17662306a36Sopenharmony_ci unsigned long pfn, pgprot_t prot) 17762306a36Sopenharmony_ci{ 17862306a36Sopenharmony_ci if (pfn > (PTE_RPN_MASK >> PAGE_SHIFT)) { 17962306a36Sopenharmony_ci WARN(1, "remap_4k_pfn called with wrong pfn value\n"); 18062306a36Sopenharmony_ci return -EINVAL; 18162306a36Sopenharmony_ci } 18262306a36Sopenharmony_ci return remap_pfn_range(vma, addr, pfn, PAGE_SIZE, 18362306a36Sopenharmony_ci __pgprot(pgprot_val(prot) | H_PAGE_4K_PFN)); 18462306a36Sopenharmony_ci} 18562306a36Sopenharmony_ci 18662306a36Sopenharmony_ci#define H_PTE_TABLE_SIZE PTE_FRAG_SIZE 18762306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined (CONFIG_HUGETLB_PAGE) 18862306a36Sopenharmony_ci#define H_PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + \ 18962306a36Sopenharmony_ci (sizeof(unsigned long) << PMD_INDEX_SIZE)) 19062306a36Sopenharmony_ci#else 19162306a36Sopenharmony_ci#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) 19262306a36Sopenharmony_ci#endif 19362306a36Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 19462306a36Sopenharmony_ci#define H_PUD_TABLE_SIZE ((sizeof(pud_t) << PUD_INDEX_SIZE) + \ 19562306a36Sopenharmony_ci (sizeof(unsigned long) << PUD_INDEX_SIZE)) 19662306a36Sopenharmony_ci#else 19762306a36Sopenharmony_ci#define H_PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) 19862306a36Sopenharmony_ci#endif 19962306a36Sopenharmony_ci#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) 20062306a36Sopenharmony_ci 20162306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 20262306a36Sopenharmony_cistatic inline char *get_hpte_slot_array(pmd_t *pmdp) 20362306a36Sopenharmony_ci{ 20462306a36Sopenharmony_ci /* 20562306a36Sopenharmony_ci * The hpte hindex is stored in the pgtable whose address is in the 20662306a36Sopenharmony_ci * second half of the PMD 20762306a36Sopenharmony_ci * 20862306a36Sopenharmony_ci * Order this load with the test for pmd_trans_huge in the caller 20962306a36Sopenharmony_ci */ 21062306a36Sopenharmony_ci smp_rmb(); 21162306a36Sopenharmony_ci return *(char **)(pmdp + PTRS_PER_PMD); 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci} 21562306a36Sopenharmony_ci/* 21662306a36Sopenharmony_ci * The linux hugepage PMD now include the pmd entries followed by the address 21762306a36Sopenharmony_ci * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits. 21862306a36Sopenharmony_ci * [ 000 | 1 bit secondary | 3 bit hidx | 1 bit valid]. We use one byte per 21962306a36Sopenharmony_ci * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and 22062306a36Sopenharmony_ci * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t. 22162306a36Sopenharmony_ci * 22262306a36Sopenharmony_ci * The top three bits are intentionally left as zero. This memory location 22362306a36Sopenharmony_ci * are also used as normal page PTE pointers. So if we have any pointers 22462306a36Sopenharmony_ci * left around while we collapse a hugepage, we need to make sure 22562306a36Sopenharmony_ci * _PAGE_PRESENT bit of that is zero when we look at them 22662306a36Sopenharmony_ci */ 22762306a36Sopenharmony_cistatic inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index) 22862306a36Sopenharmony_ci{ 22962306a36Sopenharmony_ci return hpte_slot_array[index] & 0x1; 23062306a36Sopenharmony_ci} 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_cistatic inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array, 23362306a36Sopenharmony_ci int index) 23462306a36Sopenharmony_ci{ 23562306a36Sopenharmony_ci return hpte_slot_array[index] >> 1; 23662306a36Sopenharmony_ci} 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_cistatic inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, 23962306a36Sopenharmony_ci unsigned int index, unsigned int hidx) 24062306a36Sopenharmony_ci{ 24162306a36Sopenharmony_ci hpte_slot_array[index] = (hidx << 1) | 0x1; 24262306a36Sopenharmony_ci} 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ci/* 24562306a36Sopenharmony_ci * 24662306a36Sopenharmony_ci * For core kernel code by design pmd_trans_huge is never run on any hugetlbfs 24762306a36Sopenharmony_ci * page. The hugetlbfs page table walking and mangling paths are totally 24862306a36Sopenharmony_ci * separated form the core VM paths and they're differentiated by 24962306a36Sopenharmony_ci * VM_HUGETLB being set on vm_flags well before any pmd_trans_huge could run. 25062306a36Sopenharmony_ci * 25162306a36Sopenharmony_ci * pmd_trans_huge() is defined as false at build time if 25262306a36Sopenharmony_ci * CONFIG_TRANSPARENT_HUGEPAGE=n to optimize away code blocks at build 25362306a36Sopenharmony_ci * time in such case. 25462306a36Sopenharmony_ci * 25562306a36Sopenharmony_ci * For ppc64 we need to differntiate from explicit hugepages from THP, because 25662306a36Sopenharmony_ci * for THP we also track the subpage details at the pmd level. We don't do 25762306a36Sopenharmony_ci * that for explicit huge pages. 25862306a36Sopenharmony_ci * 25962306a36Sopenharmony_ci */ 26062306a36Sopenharmony_cistatic inline int hash__pmd_trans_huge(pmd_t pmd) 26162306a36Sopenharmony_ci{ 26262306a36Sopenharmony_ci return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP)) == 26362306a36Sopenharmony_ci (_PAGE_PTE | H_PAGE_THP_HUGE)); 26462306a36Sopenharmony_ci} 26562306a36Sopenharmony_ci 26662306a36Sopenharmony_cistatic inline pmd_t hash__pmd_mkhuge(pmd_t pmd) 26762306a36Sopenharmony_ci{ 26862306a36Sopenharmony_ci return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE)); 26962306a36Sopenharmony_ci} 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ciextern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, 27262306a36Sopenharmony_ci unsigned long addr, pmd_t *pmdp, 27362306a36Sopenharmony_ci unsigned long clr, unsigned long set); 27462306a36Sopenharmony_ciextern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, 27562306a36Sopenharmony_ci unsigned long address, pmd_t *pmdp); 27662306a36Sopenharmony_ciextern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 27762306a36Sopenharmony_ci pgtable_t pgtable); 27862306a36Sopenharmony_ciextern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); 27962306a36Sopenharmony_ciextern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, 28062306a36Sopenharmony_ci unsigned long addr, pmd_t *pmdp); 28162306a36Sopenharmony_ciextern int hash__has_transparent_hugepage(void); 28262306a36Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 28362306a36Sopenharmony_ci 28462306a36Sopenharmony_cistatic inline pmd_t hash__pmd_mkdevmap(pmd_t pmd) 28562306a36Sopenharmony_ci{ 28662306a36Sopenharmony_ci return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP)); 28762306a36Sopenharmony_ci} 28862306a36Sopenharmony_ci 28962306a36Sopenharmony_ci#endif /* __ASSEMBLY__ */ 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_ci#endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */ 292