// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * PowerPC version
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Derived from "arch/i386/mm/fault.c"
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Modified by Cort Dougan and Paul Mackerras.
 *
 * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
#include <linux/extable.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/perf_event.h>
#include <linux/ratelimit.h>
#include <linux/context_tracking.h>
#include <linux/hugetlb.h>
#include <linux/uaccess.h>
#include <linux/kfence.h>
#include <linux/pkeys.h>

#include <asm/firmware.h>
#include <asm/interrupt.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/siginfo.h>
#include <asm/debug.h>
#include <asm/kup.h>
#include <asm/inst.h>


/*
 * do_page_fault error handling helpers
 */

static int
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
{
	/*
	 * If we are in kernel mode, bail out with a SEGV, this will
	 * be caught by the assembly which will restore the non-volatile
	 * registers before calling bad_page_fault()
	 */
	if (!user_mode(regs))
		return SIGSEGV;

	_exception(SIGSEGV, regs, si_code, address);

	return 0;
}

static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
{
	return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
}
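
/*
 * Callers hold mmap_lock for read; it is dropped here before the signal
 * is delivered via __bad_area_nosemaphore().
 */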
static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	mmap_read_unlock(mm);

	return __bad_area_nosemaphore(regs, address, si_code);
}

static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
				    struct vm_area_struct *vma)
{
	struct mm_struct *mm = current->mm;
	int pkey;

	/*
	 * We don't try to fetch the pkey from page table because reading
	 * page table without locking doesn't guarantee stable pte value.
	 * Hence the pkey value that we return to userspace can be different
	 * from the pkey that actually caused the access error.
	 *
	 * It does *not* guarantee that the VMA we find here
	 * was the one that we faulted on.
	 *
	 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
	 * 2. T1   : set AMR to deny access to pkey=4, touches page
	 * 3. T1   : faults...
	 * 4. T2   : mprotect_key(foo, PAGE_SIZE, pkey=5);
	 * 5. T1   : enters fault handler, takes mmap_lock, etc...
	 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
	 *	     faulted on a pte with its pkey=4.
	 */
	pkey = vma_pkey(vma);

	mmap_read_unlock(mm);

	/*
	 * If we are in kernel mode, bail out with a SEGV, this will
	 * be caught by the assembly which will restore the non-volatile
	 * registers before calling bad_page_fault()
	 */
	if (!user_mode(regs))
		return SIGSEGV;

	_exception_pkey(regs, address, pkey);

	return 0;
}

static noinline int bad_access(struct pt_regs *regs, unsigned long address)
{
	return __bad_area(regs, address, SEGV_ACCERR);
}

static int do_sigbus(struct pt_regs *regs, unsigned long address,
		     vm_fault_t fault)
{
	if (!user_mode(regs))
		return SIGBUS;

	current->thread.trap_nr = BUS_ADRERR;
#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		unsigned int lsb = 0; /* shutup gcc */

		pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			current->comm, current->pid, address);

		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
		if (fault & VM_FAULT_HWPOISON)
			lsb = PAGE_SHIFT;

		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
		return 0;
	}

#endif
	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
	return 0;
}

static int mm_fault_error(struct pt_regs *regs, unsigned long addr,
			  vm_fault_t fault)
{
	/*
	 * Kernel page fault interrupted by SIGKILL. We have no reason to
	 * continue processing.
	 */
	if (fatal_signal_pending(current) && !user_mode(regs))
		return SIGKILL;

	/* Out of memory */
	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, or some other thing happened to us that
		 * made us unable to handle the page fault gracefully.
		 */
		if (!user_mode(regs))
			return SIGSEGV;
		pagefault_out_of_memory();
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
			     VM_FAULT_HWPOISON_LARGE))
			return do_sigbus(regs, addr, fault);
		else if (fault & VM_FAULT_SIGSEGV)
			return bad_area_nosemaphore(regs, addr);
		else
			BUG();
	}
	return 0;
}

/* Is this a bad kernel fault ? */
static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
			     unsigned long address, bool is_write)
{
	int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;

	if (is_exec) {
		pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n",
				    address >= TASK_SIZE ? "exec-protected" : "user",
				    address,
				    from_kuid(&init_user_ns, current_uid()));

		// Kernel exec fault is always bad
		return true;
	}

	// Kernel fault on kernel address is bad
	if (address >= TASK_SIZE)
		return true;

	// Read/write fault blocked by KUAP is bad, it can never succeed.
	if (bad_kuap_fault(regs, address, is_write)) {
		pr_crit_ratelimited("Kernel attempted to %s user page (%lx) - exploit attempt? (uid: %d)\n",
				    is_write ? "write" : "read", address,
				    from_kuid(&init_user_ns, current_uid()));

		// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
		if (!search_exception_tables(regs->nip))
			return true;

		// Read/write fault in a valid region (the exception table search passed
		// above), but blocked by KUAP is bad, it can never succeed.
		return WARN(true, "Bug: %s fault blocked by KUAP!", is_write ? "Write" : "Read");
	}

	// What's left? Kernel fault on user and allowed by KUAP in the faulting context.
	return false;
}

static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
			      struct vm_area_struct *vma)
{
	/*
	 * Make sure to check the VMA so that we do not perform
	 * faults just to hit a pkey fault as soon as we fill in a
	 * page. Only called for current mm, hence foreign == 0
	 */
	if (!arch_vma_access_permitted(vma, is_write, is_exec, 0))
		return true;

	return false;
}

static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma)
{
	/*
	 * Allow execution from readable areas if the MMU does not
	 * provide separate controls over reading and executing.
	 *
	 * Note: That code used to not be enabled for 4xx/BookE.
	 * It is now as I/D cache coherency for these is done at
	 * set_pte_at() time and I see no reason why the test
	 * below wouldn't be valid on those processors. This -may-
	 * break programs compiled with a really old ABI though.
	 */
	if (is_exec) {
		return !(vma->vm_flags & VM_EXEC) &&
			(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
			 !(vma->vm_flags & (VM_READ | VM_WRITE)));
	}

	if (is_write) {
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return true;
		return false;
	}

	/*
	 * VM_READ, VM_WRITE and VM_EXEC all imply read permissions, as
	 * defined in protection_map[]. Read faults can only be caused by
	 * a PROT_NONE mapping, or with a PROT_EXEC-only mapping on Radix.
	 */
	if (unlikely(!vma_is_accessible(vma)))
		return true;

	if (unlikely(radix_enabled() && ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)))
		return true;

	/*
	 * We should ideally do the vma pkey access check here. But in the
	 * fault path, handle_mm_fault() also does the same check. To avoid
	 * these multiple checks, we skip it here and handle access error due
	 * to pkeys later.
	 */
	return false;
}
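
/*
 * Cooperative Memory Overcommit (CMO) accounting for shared-memory LPARs:
 * record major faults as page-ins in the lppaca, which the hypervisor uses
 * for its CMO paging statistics. PAGE_FACTOR scales from the kernel page
 * size to the 4K units that page_ins is kept in.
 */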
#ifdef CONFIG_PPC_SMLPAR
static inline void cmo_account_page_fault(void)
{
	if (firmware_has_feature(FW_FEATURE_CMO)) {
		u32 page_ins;

		preempt_disable();
		page_ins = be32_to_cpu(get_lppaca()->page_ins);
		page_ins += 1 << PAGE_FACTOR;
		get_lppaca()->page_ins = cpu_to_be32(page_ins);
		preempt_enable();
	}
}
#else
static inline void cmo_account_page_fault(void) { }
#endif /* CONFIG_PPC_SMLPAR */

static void sanity_check_fault(bool is_write, bool is_user,
			       unsigned long error_code, unsigned long address)
{
	/*
	 * Userspace trying to access kernel address, we get PROTFAULT for that.
	 */
	if (is_user && address >= TASK_SIZE) {
		if ((long)address == -1)
			return;

		pr_crit_ratelimited("%s[%d]: User access of kernel address (%lx) - exploit attempt? (uid: %d)\n",
				    current->comm, current->pid, address,
				    from_kuid(&init_user_ns, current_uid()));
		return;
	}

	if (!IS_ENABLED(CONFIG_PPC_BOOK3S))
		return;

	/*
	 * For hash translation mode, we should never get a
	 * PROTFAULT. Any update to pte to reduce access will result in us
	 * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
	 * fault instead of DSISR_PROTFAULT.
	 *
	 * A pte update to relax the access will not result in a hash page table
	 * entry invalidate and hence can result in DSISR_PROTFAULT.
	 * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
	 * the special !is_write in the below conditional.
	 *
	 * For platforms that don't support a coherent icache but do support a
	 * per-page noexec bit, we set things up so that the D/I cache sync is
	 * done via a fault. But that is handled by the low level hash fault
	 * code (hash_page_do_lazy_icache()) and we should not reach here in
	 * such a case.
	 *
	 * For wrong accesses that can result in a PROTFAULT, the vma->vm_flags
	 * checks above should handle those, and hence we fall through to the
	 * bad_area handling correctly.
	 *
	 * For embedded CPUs with per-page exec support but no coherent icache,
	 * we do get a PROTFAULT and handle the D/I cache sync in set_pte_at()
	 * while taking the noexec/prot fault. Hence this WARN_ON is conditional
	 * on the server MMU.
	 *
	 * For radix, we can get a prot fault in the autonuma case, because the
	 * radix page table will have the pages marked no-access for user.
	 */
	if (radix_enabled() || is_write)
		return;

	WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
}

/*
 * Define the correct "is_write" bit in error_code based
 * on the processor family
 */
#if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
#define page_fault_is_write(__err)	((__err) & ESR_DST)
#else
#define page_fault_is_write(__err)	((__err) & DSISR_ISSTORE)
#endif

#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
#define page_fault_is_bad(__err)	(0)
#elif defined(CONFIG_PPC_8xx)
#define page_fault_is_bad(__err)	((__err) & DSISR_NOEXEC_OR_G)
#elif defined(CONFIG_PPC64)
static int page_fault_is_bad(unsigned long err)
{
	unsigned long flag = DSISR_BAD_FAULT_64S;

	/*
	 * PAPR+ v2.11 § 14.15.3.4.1 (unreleased)
	 * If byte 0, bit 3 of pi-attribute-specifier-type in
	 * ibm,pi-features property is defined, ignore the DSI error
	 * which is caused by the paste instruction on the
	 * suspended NX window.
	 */
	if (mmu_has_feature(MMU_FTR_NX_DSI))
		flag &= ~DSISR_BAD_COPYPASTE;

	return err & flag;
}
#else
#define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_32S)
#endif

/*
 * For 600- and 800-family processors, the error_code parameter is DSISR
 * for a data fault, SRR1 for an instruction fault.
 * For 400-family processors the error_code parameter is ESR for a data fault,
 * 0 for an instruction fault.
 * For 64-bit processors, the error_code parameter is DSISR for a data access
 * fault, SRR1 & 0x08000000 for an instruction access fault.
 *
 * The return value is 0 if the fault was handled, or the signal
 * number if this is a kernel fault that can't be handled here.
 */
static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
			    unsigned long error_code)
{
	struct vm_area_struct * vma;
	struct mm_struct *mm = current->mm;
	unsigned int flags = FAULT_FLAG_DEFAULT;
	int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;
	int is_user = user_mode(regs);
	int is_write = page_fault_is_write(error_code);
	vm_fault_t fault, major = 0;
	bool kprobe_fault = kprobe_page_fault(regs, 11);

	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
		return 0;

	if (unlikely(page_fault_is_bad(error_code))) {
		if (is_user) {
			_exception(SIGBUS, regs, BUS_OBJERR, address);
			return 0;
		}
		return SIGBUS;
	}

	/* Additional sanity check(s) */
	sanity_check_fault(is_write, is_user, error_code, address);

	/*
	 * The kernel should never take an execute fault nor should it
	 * take a page fault to a kernel address or a page fault to a user
	 * address outside of dedicated places
	 */
	if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) {
		if (kfence_handle_page_fault(address, is_write, regs))
			return 0;

		return SIGSEGV;
	}

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in a region with pagefaults disabled then we must not take the fault
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		if (is_user)
			printk_ratelimited(KERN_ERR "Page fault in user mode"
					   " with faulthandler_disabled()=%d"
					   " mm=%p\n",
					   faulthandler_disabled(), mm);
		return bad_area_nosemaphore(regs, address);
	}

	interrupt_cond_local_irq_enable(regs);

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	/*
	 * We want to do this outside mmap_lock, because reading code around nip
	 * can result in fault, which will cause a deadlock when called with
	 * mmap_lock held
	 */
	if (is_user)
		flags |= FAULT_FLAG_USER;
	if (is_write)
		flags |= FAULT_FLAG_WRITE;
	if (is_exec)
		flags |= FAULT_FLAG_INSTRUCTION;

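	/*
	 * For user faults, try to handle the fault under the per-VMA lock
	 * first, and only fall back to the full mmap_lock path (lock_mmap:)
	 * if the VMA cannot be locked or the access checks fail.
	 */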
	if (!(flags & FAULT_FLAG_USER))
		goto lock_mmap;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;

	if (unlikely(access_pkey_error(is_write, is_exec,
				       (error_code & DSISR_KEYFAULT), vma))) {
		vma_end_read(vma);
		goto lock_mmap;
	}

	if (unlikely(access_error(is_write, is_exec, vma))) {
		vma_end_read(vma);
		goto lock_mmap;
	}

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);

	if (fault_signal_pending(fault, regs))
		return user_mode(regs) ? 0 : SIGBUS;

lock_mmap:

	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_lock
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table. lock_mm_and_find_vma() handles that logic.
	 */
retry:
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (unlikely(!vma))
		return bad_area_nosemaphore(regs, address);

	if (unlikely(access_pkey_error(is_write, is_exec,
				       (error_code & DSISR_KEYFAULT), vma)))
		return bad_access_pkey(regs, address, vma);

	if (unlikely(access_error(is_write, is_exec, vma)))
		return bad_access(regs, address);

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(vma, address, flags, regs);

	major |= fault & VM_FAULT_MAJOR;

	if (fault_signal_pending(fault, regs))
		return user_mode(regs) ? 0 : SIGBUS;

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		goto out;

	/*
	 * Handle the retry right now, the mmap_lock has been released in that
	 * case.
	 */
	if (unlikely(fault & VM_FAULT_RETRY)) {
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	mmap_read_unlock(current->mm);

done:
	if (unlikely(fault & VM_FAULT_ERROR))
		return mm_fault_error(regs, address, fault);

out:
	/*
	 * Major/minor page fault accounting.
	 */
	if (major)
		cmo_account_page_fault();

	return 0;
}
NOKPROBE_SYMBOL(___do_page_fault);

static __always_inline void __do_page_fault(struct pt_regs *regs)
{
	long err;

	err = ___do_page_fault(regs, regs->dar, regs->dsisr);
	if (unlikely(err))
		bad_page_fault(regs, err);
}

DEFINE_INTERRUPT_HANDLER(do_page_fault)
{
	__do_page_fault(regs);
}

#ifdef CONFIG_PPC_BOOK3S_64
/* Same as do_page_fault but interrupt entry has already run in do_hash_fault */
void hash__do_page_fault(struct pt_regs *regs)
{
	__do_page_fault(regs);
}
NOKPROBE_SYMBOL(hash__do_page_fault);
#endif

/*
 * bad_page_fault is called when we have a bad access from the kernel.
 * It is called from the DSI and ISI handlers in head.S and from some
 * of the procedures in traps.c.
 */
static void __bad_page_fault(struct pt_regs *regs, int sig)
{
	int is_write = page_fault_is_write(regs->dsisr);
	const char *msg;

	/* kernel has accessed a bad area */

	if (regs->dar < PAGE_SIZE)
		msg = "Kernel NULL pointer dereference";
	else
		msg = "Unable to handle kernel data access";

	switch (TRAP(regs)) {
	case INTERRUPT_DATA_STORAGE:
	case INTERRUPT_H_DATA_STORAGE:
		pr_alert("BUG: %s on %s at 0x%08lx\n", msg,
			 is_write ? "write" : "read", regs->dar);
		break;
	case INTERRUPT_DATA_SEGMENT:
		pr_alert("BUG: %s at 0x%08lx\n", msg, regs->dar);
		break;
	case INTERRUPT_INST_STORAGE:
	case INTERRUPT_INST_SEGMENT:
		pr_alert("BUG: Unable to handle kernel instruction fetch%s",
			 regs->nip < PAGE_SIZE ? " (NULL pointer?)\n" : "\n");
		break;
	case INTERRUPT_ALIGNMENT:
		pr_alert("BUG: Unable to handle kernel unaligned access at 0x%08lx\n",
			 regs->dar);
		break;
	default:
		pr_alert("BUG: Unable to handle unknown paging fault at 0x%08lx\n",
			 regs->dar);
		break;
	}
	printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
		regs->nip);

	if (task_stack_end_corrupted(current))
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	die("Kernel access of bad area", regs, sig);
}

void bad_page_fault(struct pt_regs *regs, int sig)
{
	const struct exception_table_entry *entry;

	/* Are we prepared to handle this fault? */
	entry = search_exception_tables(instruction_pointer(regs));
	if (entry)
		instruction_pointer_set(regs, extable_fixup(entry));
	else
		__bad_page_fault(regs, sig);
}

#ifdef CONFIG_PPC_BOOK3S_64
DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv)
{
	bad_page_fault(regs, SIGSEGV);
}

/*
 * In radix, segment interrupts indicate the EA is not addressable by the
 * page table geometry, so they are always sent here.
 *
 * In hash, this is called if do_slb_fault returns an error. Typically it is
 * because the EA was outside the region allowed by software.
 */
DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt)
{
	int err = regs->result;

	if (err == -EFAULT) {
		if (user_mode(regs))
			_exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
		else
			bad_page_fault(regs, SIGSEGV);
	} else if (err == -EINVAL) {
		unrecoverable_exception(regs);
	} else {
		BUG();
	}
}
#endif