// SPDX-License-Identifier: GPL-2.0
/*
 *  S390 version
 *    Copyright IBM Corp. 1999
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/extable.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include <asm/uv.h>
#include "../kernel/entry.h"

#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL

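/*
 * Arch-private pseudo fault-status values: do_exception() returns them in
 * addition to the generic VM_FAULT_* codes, and do_fault_error() maps them
 * to the appropriate action. They never leave this file.
 */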
#define VM_FAULT_BADCONTEXT	((__force vm_fault_t) 0x010000)
#define VM_FAULT_BADMAP		((__force vm_fault_t) 0x020000)
#define VM_FAULT_BADACCESS	((__force vm_fault_t) 0x040000)
#define VM_FAULT_SIGNAL		((__force vm_fault_t) 0x080000)
#define VM_FAULT_PFAULT		((__force vm_fault_t) 0x100000)

enum fault_type {
	KERNEL_FAULT,
	USER_FAULT,
	VDSO_FAULT,
	GMAP_FAULT,
};

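/*
 * If the machine can report whether a faulting access was a fetch or a
 * store (facility 75, presumably the access-exception fetch/store
 * indication facility), fault_init() remembers the mask of the relevant
 * TEID bits so that do_exception() can treat an indicated store as a
 * write fault.
 */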
static unsigned long store_indication __read_mostly;

static int __init fault_init(void)
{
	if (test_facility(75))
		store_indication = 0xc00;
	return 0;
}
early_initcall(fault_init);

/*
 * Find out which address space caused the exception.
 */
static enum fault_type get_fault_type(struct pt_regs *regs)
{
	unsigned long trans_exc_code;

	trans_exc_code = regs->int_parm_long & 3;
	if (likely(trans_exc_code == 0)) {
		/* primary space exception */
		if (IS_ENABLED(CONFIG_PGSTE) &&
		    test_pt_regs_flag(regs, PIF_GUEST_FAULT))
			return GMAP_FAULT;
		if (current->thread.mm_segment == USER_DS)
			return USER_FAULT;
		return KERNEL_FAULT;
	}
	if (trans_exc_code == 2) {
		/* secondary space exception */
		if (current->thread.mm_segment & 1) {
			if (current->thread.mm_segment == USER_DS_SACF)
				return USER_FAULT;
			return KERNEL_FAULT;
		}
		return VDSO_FAULT;
	}
	if (trans_exc_code == 1) {
		/* access register mode, not used in the kernel */
		return USER_FAULT;
	}
	/* home space exception -> access via kernel ASCE */
	return KERNEL_FAULT;
}

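/*
 * bad_address() probes whether a page-table entry can be read from kernel
 * context without faulting; dump_pagetable() uses it while walking the
 * region/segment/page tables selected by the ASCE, printing one entry per
 * level and stopping at an invalid or large entry.
 */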
static int bad_address(void *p)
{
	unsigned long dummy;

	return get_kernel_nofault(dummy, (unsigned long *)p);
}

static void dump_pagetable(unsigned long asce, unsigned long address)
{
	unsigned long *table = __va(asce & _ASCE_ORIGIN);

	pr_alert("AS:%016lx ", asce);
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_REGION1:
		table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R1:%016lx ", *table);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_REGION2:
		table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R2:%016lx ", *table);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_REGION3:
		table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R3:%016lx ", *table);
		if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_SEGMENT:
		table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("S:%016lx ", *table);
		if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
			goto out;
		table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
	}
	table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
	if (bad_address(table))
		goto bad;
	pr_cont("P:%016lx ", *table);
out:
	pr_cont("\n");
	return;
bad:
	pr_cont("BAD\n");
}

static void dump_fault_info(struct pt_regs *regs)
{
	unsigned long asce;

	pr_alert("Failing address: %016lx TEID: %016lx\n",
		 regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
	pr_alert("Fault in ");
	switch (regs->int_parm_long & 3) {
	case 3:
		pr_cont("home space ");
		break;
	case 2:
		pr_cont("secondary space ");
		break;
	case 1:
		pr_cont("access register ");
		break;
	case 0:
		pr_cont("primary space ");
		break;
	}
	pr_cont("mode while using ");
	switch (get_fault_type(regs)) {
	case USER_FAULT:
		asce = S390_lowcore.user_asce;
		pr_cont("user ");
		break;
	case VDSO_FAULT:
		asce = S390_lowcore.vdso_asce;
		pr_cont("vdso ");
		break;
	case GMAP_FAULT:
		asce = ((struct gmap *) S390_lowcore.gmap)->asce;
		pr_cont("gmap ");
		break;
	case KERNEL_FAULT:
		asce = S390_lowcore.kernel_asce;
		pr_cont("kernel ");
		break;
	default:
		unreachable();
	}
	pr_cont("ASCE.\n");
	dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
}

int show_unhandled_signals = 1;

void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
	if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
		return;
	if (!unhandled_signal(current, signr))
		return;
	if (!printk_ratelimit())
		return;
	printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
	       regs->int_code & 0xffff, regs->int_code >> 17);
	print_vma_addr(KERN_CONT "in ", regs->psw.addr);
	printk(KERN_CONT "\n");
	if (is_mm_fault)
		dump_fault_info(regs);
	show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
	report_user_fault(regs, SIGSEGV, 1);
	force_sig_fault(SIGSEGV, si_code,
			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

const struct exception_table_entry *s390_search_extables(unsigned long addr)
{
	const struct exception_table_entry *fixup;

	fixup = search_extable(__start_dma_ex_table,
			       __stop_dma_ex_table - __start_dma_ex_table,
			       addr);
	if (!fixup)
		fixup = search_exception_tables(addr);
	return fixup;
}

static noinline void do_no_context(struct pt_regs *regs)
{
	const struct exception_table_entry *fixup;

	/* Are we prepared to handle this kernel fault?  */
	fixup = s390_search_extables(regs->psw.addr);
	if (fixup && ex_handle(fixup, regs))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	if (get_fault_type(regs) == KERNEL_FAULT)
		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
		       " in virtual kernel address space\n");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request"
		       " in virtual user address space\n");
	dump_fault_info(regs);
	die(regs, "Oops");
	do_exit(SIGKILL);
}

static noinline void do_low_address(struct pt_regs *regs)
{
	/*
	 * Low-address protection hit in kernel mode means
	 * NULL pointer write access in kernel mode.
	 */
	if (regs->psw.mask & PSW_MASK_PSTATE) {
		/* Low-address protection hit in user mode 'cannot happen'. */
		die(regs, "Low-address protection");
		do_exit(SIGKILL);
	}

	do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	force_sig_fault(SIGBUS, BUS_ADRERR,
			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

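/*
 * If an instruction-fetch (VM_EXEC) fault turns out to be a bad access at
 * the current PSW address, check whether the faulting "instruction" is
 * 0x0a77 or 0x0aad, i.e. svc 119 (sigreturn) or svc 173 (rt_sigreturn).
 * If so, turn the fault into the corresponding system call, so that signal
 * return still works even when the signal trampoline is not executable.
 */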
static noinline int signal_return(struct pt_regs *regs)
{
	u16 instruction;
	int rc;

	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
	if (rc)
		return rc;
	if (instruction == 0x0a77) {
		set_pt_regs_flag(regs, PIF_SYSCALL);
		regs->int_code = 0x00040077;
		return 0;
	} else if (instruction == 0x0aad) {
		set_pt_regs_flag(regs, PIF_SYSCALL);
		regs->int_code = 0x000400ad;
		return 0;
	}
	return -EACCES;
}

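/*
 * Map the fault status returned by do_exception() to an action:
 * BADMAP/BADACCESS become SIGSEGV in user mode (or a kernel exception
 * fixup / oops via do_no_context()), BADCONTEXT and PFAULT always go to
 * do_no_context(), SIGNAL only matters in kernel mode, and the generic
 * VM_FAULT_* error bits are translated to OOM/SIGSEGV/SIGBUS handling.
 */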
static noinline void do_fault_error(struct pt_regs *regs, int access,
					vm_fault_t fault)
{
	int si_code;

	switch (fault) {
	case VM_FAULT_BADACCESS:
		if (access == VM_EXEC && signal_return(regs) == 0)
			break;
		fallthrough;
	case VM_FAULT_BADMAP:
		/* Bad memory access. Check if it is kernel or user space. */
		if (user_mode(regs)) {
			/* User mode accesses just cause a SIGSEGV */
			si_code = (fault == VM_FAULT_BADMAP) ?
				SEGV_MAPERR : SEGV_ACCERR;
			do_sigsegv(regs, si_code);
			break;
		}
		fallthrough;
	case VM_FAULT_BADCONTEXT:
	case VM_FAULT_PFAULT:
		do_no_context(regs);
		break;
	case VM_FAULT_SIGNAL:
		if (!user_mode(regs))
			do_no_context(regs);
		break;
	default: /* fault & VM_FAULT_ERROR */
		if (fault & VM_FAULT_OOM) {
			if (!user_mode(regs))
				do_no_context(regs);
			else
				pagefault_out_of_memory();
		} else if (fault & VM_FAULT_SIGSEGV) {
			/* Kernel mode? Handle exceptions or die */
			if (!user_mode(regs))
				do_no_context(regs);
			else
				do_sigsegv(regs, SEGV_MAPERR);
		} else if (fault & VM_FAULT_SIGBUS) {
			/* Kernel mode? Handle exceptions or die */
			if (!user_mode(regs))
				do_no_context(regs);
			else
				do_sigbus(regs);
		} else
			BUG();
		break;
	}
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
	struct gmap *gmap;
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum fault_type type;
	unsigned long trans_exc_code;
	unsigned long address;
	unsigned int flags;
	vm_fault_t fault;

	tsk = current;
	/*
	 * The instruction that caused the program check has
	 * been nullified. Don't signal single step via SIGTRAP.
	 */
	clear_pt_regs_flag(regs, PIF_PER_TRAP);

	if (kprobe_page_fault(regs, 14))
		return 0;

	mm = tsk->mm;
	trans_exc_code = regs->int_parm_long;

	/*
	 * Verify that the fault happened in user space, that
	 * we are not in an interrupt and that there is a
	 * user context.
	 */
	fault = VM_FAULT_BADCONTEXT;
	type = get_fault_type(regs);
	switch (type) {
	case KERNEL_FAULT:
		goto out;
	case VDSO_FAULT:
		fault = VM_FAULT_BADMAP;
		goto out;
	case USER_FAULT:
	case GMAP_FAULT:
		if (faulthandler_disabled() || !mm)
			goto out;
		break;
	}

	address = trans_exc_code & __FAIL_ADDR_MASK;
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
	flags = FAULT_FLAG_DEFAULT;
	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;
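	/*
	 * If the TEID store-indication bits (mask set up in fault_init())
	 * report that the faulting access was a store, treat it as a write
	 * fault so that the VMA permission check and handle_mm_fault() see
	 * the correct access type.
	 */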
	if ((trans_exc_code & store_indication) == 0x400)
		access = VM_WRITE;
	if (access == VM_WRITE)
		flags |= FAULT_FLAG_WRITE;
	mmap_read_lock(mm);

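	/*
	 * For a KVM guest fault, translate the guest address through the
	 * gmap to the corresponding address in the host user address space
	 * and remember the fault details for KVM. If pfault handshaking is
	 * enabled, attempt the fault without waiting, so the guest can be
	 * told to reschedule instead of blocking the vcpu.
	 */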
	gmap = NULL;
	if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
		gmap = (struct gmap *) S390_lowcore.gmap;
		current->thread.gmap_addr = address;
		current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
		current->thread.gmap_int_code = regs->int_code & 0xffff;
		address = __gmap_translate(gmap, address);
		if (address == -EFAULT) {
			fault = VM_FAULT_BADMAP;
			goto out_up;
		}
		if (gmap->pfault_enabled)
			flags |= FAULT_FLAG_RETRY_NOWAIT;
	}

retry:
	fault = VM_FAULT_BADMAP;
	vma = find_vma(mm, address);
	if (!vma)
		goto out_up;

	if (unlikely(vma->vm_start > address)) {
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out_up;
		if (expand_stack(vma, address))
			goto out_up;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it.
	 */
	fault = VM_FAULT_BADACCESS;
	if (unlikely(!(vma->vm_flags & access)))
		goto out_up;

	if (is_vm_hugetlb_page(vma))
		address &= HPAGE_MASK;
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(vma, address, flags, regs);
	if (fault_signal_pending(fault, regs)) {
		fault = VM_FAULT_SIGNAL;
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			goto out_up;
		goto out;
	}
	if (unlikely(fault & VM_FAULT_ERROR))
		goto out_up;

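	/*
	 * handle_mm_fault() may ask for a retry. For a guest fault with
	 * FAULT_FLAG_RETRY_NOWAIT set, do not retry here: report a pseudo
	 * page fault (VM_FAULT_PFAULT) so KVM can let the guest schedule
	 * another task while the page is made available. Otherwise retake
	 * mmap_lock and retry with FAULT_FLAG_TRIED set.
	 */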
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_RETRY) {
			if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
			    (flags & FAULT_FLAG_RETRY_NOWAIT)) {
				/* FAULT_FLAG_RETRY_NOWAIT has been set,
				 * mmap_lock has not been released */
				current->thread.gmap_pfault = 1;
				fault = VM_FAULT_PFAULT;
				goto out_up;
			}
			flags &= ~FAULT_FLAG_RETRY_NOWAIT;
			flags |= FAULT_FLAG_TRIED;
			mmap_read_lock(mm);
			goto retry;
		}
	}
	if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
		address =  __gmap_link(gmap, current->thread.gmap_addr,
				       address);
		if (address == -EFAULT) {
			fault = VM_FAULT_BADMAP;
			goto out_up;
		}
		if (address == -ENOMEM) {
			fault = VM_FAULT_OOM;
			goto out_up;
		}
	}
	fault = 0;
out_up:
	mmap_read_unlock(mm);
out:
	return fault;
}

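/*
 * Protection exception handler. Besides ordinary write-protection faults
 * this covers low-address protection (no usable TEID, handled by
 * do_low_address()) and, on machines with instruction-execution
 * protection, execute-protection faults: for those the PSW page itself is
 * the failing address and the fault is reported as a bad VM_EXEC access.
 */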
void do_protection_exception(struct pt_regs *regs)
{
	unsigned long trans_exc_code;
	int access;
	vm_fault_t fault;

	trans_exc_code = regs->int_parm_long;
	/*
	 * Protection exceptions are suppressing, so decrement the psw
	 * address. The exceptions to this rule are aborted transactions;
	 * for these the PSW already points to the correct location.
	 */
	if (!(regs->int_code & 0x200))
		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
	/*
	 * Check for low-address protection.  This needs to be treated
	 * as a special case because the translation exception code
	 * field is not guaranteed to contain valid data in this case.
	 */
	if (unlikely(!(trans_exc_code & 4))) {
		do_low_address(regs);
		return;
	}
	if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
		regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
					(regs->psw.addr & PAGE_MASK);
		access = VM_EXEC;
		fault = VM_FAULT_BADACCESS;
	} else {
		access = VM_WRITE;
		fault = do_exception(regs, access);
	}
	if (unlikely(fault))
		do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);

void do_dat_exception(struct pt_regs *regs)
{
	int access;
	vm_fault_t fault;

	access = VM_ACCESS_FLAGS;
	fault = do_exception(regs, access);
	if (unlikely(fault))
		do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page fault routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}

__setup("nopfault", nopfault);

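/*
 * Parameter block for DIAG 0x258 (pfault handshaking with the hypervisor).
 * Function code 0 enables pfault handling, function code 1 disables it.
 * refgaddr points at the lowcore LPP field, which apparently holds the pid
 * of the running task, so the pid comes back as the token in the pfault
 * initial/completion interrupts (see pfault_interrupt()).
 */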
struct pfault_refbk {
	u16 refdiagc;
	u16 reffcode;
	u16 refdwlen;
	u16 refversn;
	u64 refgaddr;
	u64 refselmk;
	u64 refcmpmk;
	u64 reserved;
} __attribute__ ((packed, aligned(8)));

static struct pfault_refbk pfault_init_refbk = {
	.refdiagc = 0x258,
	.reffcode = 0,
	.refdwlen = 5,
	.refversn = 2,
	.refgaddr = __LC_LPP,
	.refselmk = 1ULL << 48,
	.refcmpmk = 1ULL << 48,
	.reserved = __PF_RES_FIELD
};

int pfault_init(void)
{
	int rc;

	if (pfault_disable)
		return -1;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%1,%0,0x258\n"
		"0:	j	2f\n"
		"1:	la	%0,8\n"
		"2:\n"
		EX_TABLE(0b,1b)
		: "=d" (rc)
		: "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
	return rc;
}

static struct pfault_refbk pfault_fini_refbk = {
	.refdiagc = 0x258,
	.reffcode = 1,
	.refdwlen = 5,
	.refversn = 2,
};

void pfault_fini(void)
{
	if (pfault_disable)
		return;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%0,0,0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b,0b)
		: : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

#define PF_COMPLETE	0x0080

/*
 * The mechanism of our pfault code: if Linux is running as a guest, a user
 * space process runs, and that process accesses a page the host has paged
 * out, we get a pfault (pseudo page fault) interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt we set the state of the current task to
 * uninterruptible and also set the need_resched flag. Both happen within
 * interrupt context(!). When we later return to user space we notice the
 * need_resched flag and call schedule(). It's not very obvious how this
 * works...
 *
 * Of course there is additional fun with the completion interrupt (the host
 * signals that a page of a process has been paged in and the process can
 * continue to run). This interrupt can arrive on any cpu and, since we have
 * virtual cpus, may actually appear before the interrupt that signals that a
 * page is missing.
 */
static void pfault_interrupt(struct ext_code ext_code,
			     unsigned int param32, unsigned long param64)
{
	struct task_struct *tsk;
	__u16 subcode;
	pid_t pid;

	/*
	 * Get the external interruption subcode & pfault initial/completion
	 * signal bit. VM stores this in the 'cpu address' field associated
	 * with the external interrupt.
	 */
	subcode = ext_code.subcode;
	if ((subcode & 0xff00) != __SUBCODE_MASK)
		return;
	inc_irq_stat(IRQEXT_PFL);
	/* Get the token (= pid of the affected task). */
	pid = param64 & LPP_PID_MASK;
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	if (tsk)
		get_task_struct(tsk);
	rcu_read_unlock();
	if (!tsk)
		return;
	spin_lock(&pfault_lock);
	if (subcode & PF_COMPLETE) {
		/* signal bit is set -> a page has been swapped in by VM */
		if (tsk->thread.pfault_wait == 1) {
			/* Initial interrupt was faster than the completion
			 * interrupt. pfault_wait is valid. Set pfault_wait
			 * back to zero and wake up the process. This can
			 * safely be done because the task is still sleeping
			 * and can't produce new pfaults. */
			tsk->thread.pfault_wait = 0;
			list_del(&tsk->thread.list);
			wake_up_process(tsk);
			put_task_struct(tsk);
		} else {
			/* Completion interrupt was faster than initial
			 * interrupt. Set pfault_wait to -1 so the initial
			 * interrupt doesn't put the task to sleep.
			 * If the task is not running, ignore the completion
			 * interrupt since it must be a leftover of a PFAULT
			 * CANCEL operation which didn't remove all pending
			 * completion interrupts. */
			if (tsk->state == TASK_RUNNING)
				tsk->thread.pfault_wait = -1;
		}
	} else {
		/* signal bit not set -> a real page is missing. */
		if (WARN_ON_ONCE(tsk != current))
			goto out;
		if (tsk->thread.pfault_wait == 1) {
			/* Already on the list with a reference: put to sleep */
			goto block;
		} else if (tsk->thread.pfault_wait == -1) {
			/* Completion interrupt was faster than the initial
			 * interrupt (pfault_wait == -1). Set pfault_wait
			 * back to zero and exit. */
			tsk->thread.pfault_wait = 0;
		} else {
			/* Initial interrupt arrived before completion
			 * interrupt. Let the task sleep.
			 * An extra task reference is needed since a different
			 * cpu may set the task state to TASK_RUNNING again
			 * before the scheduler is reached. */
			get_task_struct(tsk);
			tsk->thread.pfault_wait = 1;
			list_add(&tsk->thread.list, &pfault_list);
block:
			/* Since this must be a userspace fault, there
			 * is no kernel task state to trample. Rely on the
			 * return to userspace schedule() to block. */
			__set_current_state(TASK_UNINTERRUPTIBLE);
			set_tsk_need_resched(tsk);
			set_preempt_need_resched();
		}
	}
out:
	spin_unlock(&pfault_lock);
	put_task_struct(tsk);
}

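/*
 * CPU hot-unplug callback: wake up every task that is still waiting for a
 * pfault completion, presumably because a completion interrupt that was
 * pending on the outgoing cpu would otherwise be lost, and drop the extra
 * task references taken in pfault_interrupt().
 */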
static int pfault_cpu_dead(unsigned int cpu)
{
	struct thread_struct *thread, *next;
	struct task_struct *tsk;

	spin_lock_irq(&pfault_lock);
	list_for_each_entry_safe(thread, next, &pfault_list, list) {
		thread->pfault_wait = 0;
		list_del(&thread->list);
		tsk = container_of(thread, struct task_struct, thread);
		wake_up_process(tsk);
		put_task_struct(tsk);
	}
	spin_unlock_irq(&pfault_lock);
	return 0;
}

static int __init pfault_irq_init(void)
{
	int rc;

	rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
	if (rc)
		goto out_extint;
	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
	if (rc)
		goto out_pfault;
	irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
	cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
				  NULL, pfault_cpu_dead);
	return 0;

out_pfault:
	unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
	pfault_disable = 1;
	return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */

#if IS_ENABLED(CONFIG_PGSTE)
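/*
 * Handlers for the protected-virtualization (Ultravisor) storage
 * exceptions: a secure-storage-access fault (PGM 0x3d, see the panic
 * message below) means non-secure code touched a secure page, which then
 * has to be made accessible; a non-secure-storage-access fault means a
 * page of a secure guest first has to be converted to secure; a secure
 * storage violation indicates an inconsistent secure mapping.
 */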
void do_secure_storage_access(struct pt_regs *regs)
{
	unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	struct page *page;
	int rc;

	/*
	 * Bit 61 tells us whether the address is valid. If it is not, we
	 * have a major problem and should either stop the kernel or send a
	 * SIGSEGV to the process. Unfortunately bit 61 is not reliable
	 * without the misc UV feature, so we need to check for that as well.
	 */
	if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
	    !test_bit_inv(61, &regs->int_parm_long)) {
		/*
		 * When this happens, userspace did something that it
		 * was not supposed to do, e.g. branching into secure
		 * memory. Trigger a segmentation fault.
		 */
		if (user_mode(regs)) {
			send_sig(SIGSEGV, current, 0);
			return;
		}

		/*
		 * The kernel should never run into this case and we
		 * have no way out of this situation.
		 */
		panic("Unexpected PGM 0x3d with TEID bit 61=0");
	}

	switch (get_fault_type(regs)) {
	case USER_FAULT:
		mm = current->mm;
		mmap_read_lock(mm);
		vma = find_vma(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
			break;
		}
		page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
		if (IS_ERR_OR_NULL(page)) {
			mmap_read_unlock(mm);
			break;
		}
		if (arch_make_page_accessible(page))
			send_sig(SIGSEGV, current, 0);
		put_page(page);
		mmap_read_unlock(mm);
		break;
	case KERNEL_FAULT:
		page = phys_to_page(addr);
		if (unlikely(!try_get_page(page)))
			break;
		rc = arch_make_page_accessible(page);
		put_page(page);
		if (rc)
			BUG();
		break;
	case VDSO_FAULT:
	case GMAP_FAULT:
	default:
		do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
		WARN_ON_ONCE(1);
	}
}
NOKPROBE_SYMBOL(do_secure_storage_access);

void do_non_secure_storage_access(struct pt_regs *regs)
{
	unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
	struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;

	if (get_fault_type(regs) != GMAP_FAULT) {
		do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
		WARN_ON_ONCE(1);
		return;
	}

	if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
		send_sig(SIGSEGV, current, 0);
}
NOKPROBE_SYMBOL(do_non_secure_storage_access);

void do_secure_storage_violation(struct pt_regs *regs)
{
	/*
	 * Either KVM messed up the secure guest mapping or the same
	 * page is mapped into multiple secure guests.
	 *
	 * This exception is only triggered when a guest 2 is running
	 * and can therefore never occur in kernel context.
	 */
	printk_ratelimited(KERN_WARNING
			   "Secure storage violation in task: %s, pid %d\n",
			   current->comm, current->pid);
	send_sig(SIGSEGV, current, 0);
}

#else
void do_secure_storage_access(struct pt_regs *regs)
{
	default_trap_handler(regs);
}

void do_non_secure_storage_access(struct pt_regs *regs)
{
	default_trap_handler(regs);
}

void do_secure_storage_violation(struct pt_regs *regs)
{
	default_trap_handler(regs);
}
#endif
921