/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_3LEVEL_H
#define _ASM_X86_PGTABLE_3LEVEL_H

#include <asm/atomic64_32.h>

/*
 * Intel Physical Address Extension (PAE) Mode - three-level page
 * tables on PPro+ CPUs.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */

#define pte_ERROR(e) \
        pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \
               __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
#define pmd_ERROR(e) \
        pr_err("%s:%d: bad pmd %p(%016Lx)\n", \
               __FILE__, __LINE__, &(e), pmd_val(e))
#define pgd_ERROR(e) \
        pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
               __FILE__, __LINE__, &(e), pgd_val(e))

/* Rules for using set_pte: the pte being assigned *must* be
 * either not present or in a state where the hardware will
 * not attempt to update the pte. In places where this is
 * not possible, use pte_get_and_clear to obtain the old pte
 * value and then use set_pte to update it. -ben
 */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
        ptep->pte_high = pte.pte_high;
        smp_wmb();
        ptep->pte_low = pte.pte_low;
}

#define pmd_read_atomic pmd_read_atomic
/*
 * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
 * a "*pmdp" dereference done by GCC. Problem is, in certain places
 * where pte_offset_map_lock() is called, concurrent page faults are
 * allowed if the mmap_lock is held for reading. An example is mincore
 * vs page faults vs MADV_DONTNEED. On the page fault side
 * pmd_populate() rightfully does a set_64bit(), but if we're reading the
 * pmd_t with a "*pmdp" on the mincore side, an SMP race can happen
 * because GCC will not read the 64-bit value of the pmd atomically.
 *
 * To fix this, all places running pte_offset_map_lock() while holding the
 * mmap_lock in read mode shall read the pmdp pointer using this
 * function to know if the pmd is null or not, and in turn to know if
 * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
 * operations.
 *
 * Without THP if the mmap_lock is held for reading, the pmd can only
 * transition from null to not null while pmd_read_atomic() runs. So
 * we can always return atomic pmd values with this function.
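 *
 * (The writer side, e.g. pmd_populate(), goes through set_64bit(), which on
 * 32-bit x86 is implemented with cmpxchg8b and therefore always publishes
 * both halves of the pmd at once; the reader side here only pairs the two
 * 32-bit loads with an smp_rmb().)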
 *
 * With THP if the mmap_lock is held for reading, the pmd can become
 * trans_huge or none or point to a pte (and in turn become "stable")
 * at any time under pmd_read_atomic(). We could read it truly
 * atomically here with an atomic64_read() for the THP enabled case (and
 * it would be a whole lot simpler), but to avoid using cmpxchg8b we
 * only return an atomic pmdval if the low part of the pmdval is later
 * found to be stable (i.e. pointing to a pte). We are also returning a
 * 'none' (zero) pmdval if the low part of the pmd is zero.
 *
 * In some cases the high and low part of the pmdval returned may not be
 * consistent if THP is enabled (the low part may point to previously
 * mapped hugepage, while the high part may point to a more recently
 * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
 * needs the low part of the pmd to be read atomically to decide if the
 * pmd is unstable or not, with the only exception when the low part
 * of the pmd is zero, in which case we return a 'none' pmd.
 */
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
        pmdval_t ret;
        u32 *tmp = (u32 *)pmdp;

        ret = (pmdval_t) (*tmp);
        if (ret) {
                /*
                 * If the low part is null, we must not read the high part
                 * or we can end up with a partial pmd.
                 */
                smp_rmb();
                ret |= ((pmdval_t)*(tmp + 1)) << 32;
        }

        return (pmd_t) { ret };
}

static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
}

static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
}

static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
        pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
#endif
        set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
}

/*
 * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
 * entry, so clear the bottom half first and enforce ordering with a compiler
 * barrier.
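 *
 * (On x86, smp_wmb() expands to a plain compiler barrier, which is
 * sufficient here because x86 does not reorder stores against other stores.)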
 */
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
                                    pte_t *ptep)
{
        ptep->pte_low = 0;
        smp_wmb();
        ptep->pte_high = 0;
}

static inline void native_pmd_clear(pmd_t *pmd)
{
        u32 *tmp = (u32 *)pmd;
        *tmp = 0;
        smp_wmb();
        *(tmp + 1) = 0;
}

static inline void native_pud_clear(pud_t *pudp)
{
}

static inline void pud_clear(pud_t *pudp)
{
        set_pud(pudp, __pud(0));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         *
         * Currently all places where pud_clear() is called are either
         * followed by flush_tlb_mm() or don't need a TLB flush (x86_64
         * code or pud_clear_bad()), so no TLB flush is needed here.
         */
}

#ifdef CONFIG_SMP
static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
        pte_t res;

        res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0);

        return res;
}
#else
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif

union split_pmd {
        struct {
                u32 pmd_low;
                u32 pmd_high;
        };
        pmd_t pmd;
};

#ifdef CONFIG_SMP
static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
{
        union split_pmd res, *orig = (union split_pmd *)pmdp;

        /* xchg acts as a barrier before setting of the high bits */
        res.pmd_low = xchg(&orig->pmd_low, 0);
        res.pmd_high = orig->pmd_high;
        orig->pmd_high = 0;

        return res.pmd;
}
#else
#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
#endif

#ifndef pmdp_establish
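/*
 * pmdp_establish() installs a new pmd value and returns the previous one.
 * When the new pmd has the present bit clear, the two 32-bit halves can be
 * written one at a time without racing against concurrent hardware updates;
 * otherwise the update is done with a cmpxchg64 loop so the full 64-bit
 * value is replaced atomically.
 */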
#define pmdp_establish pmdp_establish
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        pmd_t old;

        /*
         * If pmd has present bit cleared we can get away without expensive
         * cmpxchg64: we can update pmdp half-by-half without racing with
         * anybody.
         */
        if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
                union split_pmd old, new, *ptr;

                ptr = (union split_pmd *)pmdp;

                new.pmd = pmd;

                /* xchg acts as a barrier before setting of the high bits */
                old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
                old.pmd_high = ptr->pmd_high;
                ptr->pmd_high = new.pmd_high;
                return old.pmd;
        }

        do {
                old = *pmdp;
        } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);

        return old;
}
#endif

#ifdef CONFIG_SMP
union split_pud {
        struct {
                u32 pud_low;
                u32 pud_high;
        };
        pud_t pud;
};

static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
{
        union split_pud res, *orig = (union split_pud *)pudp;

#ifdef CONFIG_PAGE_TABLE_ISOLATION
        pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0));
#endif

        /* xchg acts as a barrier before setting of the high bits */
        res.pud_low = xchg(&orig->pud_low, 0);
        res.pud_high = orig->pud_high;
        orig->pud_high = 0;

        return res.pud;
}
#else
#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif

/* Encode and de-code a swap entry */
#define SWP_TYPE_BITS           5

#define SWP_OFFSET_FIRST_BIT    (_PAGE_BIT_PROTNONE + 1)

/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT        (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)

#define MAX_SWAPFILES_CHECK()   BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
#define __swp_type(x)           (((x).val) & 0x1f)
#define __swp_offset(x)         ((x).val >> 5)
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})

/*
 * Normally, __swp_entry() converts from arch-independent swp_entry_t to
 * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
 * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
 * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
 * __swp_entry_to_pte() through the following helper macro based on 64bit
 * __swp_entry().
 */
#define __swp_pteval_entry(type, offset) ((pteval_t) { \
        (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
        | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })

#define __swp_entry_to_pte(x)   ((pte_t){ .pte = \
                __swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
/*
 * Analogously, __pte_to_swp_entry() doesn't just extract the arch-dependent
 * swp_entry_t, but also has to convert it from 64bit to the 32bit
 * intermediate representation, using the following macros based on 64bit
 * __swp_type() and __swp_offset().
 */
#define __pteval_swp_type(x)    ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
#define __pteval_swp_offset(x)  ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))

#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
                                             __pteval_swp_offset(pte)))

#include <asm/pgtable-invert.h>

#endif /* _ASM_X86_PGTABLE_3LEVEL_H */