162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright IBM Corp. 1999, 2023 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci 662306a36Sopenharmony_ci#include <linux/cpuhotplug.h> 762306a36Sopenharmony_ci#include <linux/sched/task.h> 862306a36Sopenharmony_ci#include <linux/errno.h> 962306a36Sopenharmony_ci#include <linux/init.h> 1062306a36Sopenharmony_ci#include <linux/irq.h> 1162306a36Sopenharmony_ci#include <asm/asm-extable.h> 1262306a36Sopenharmony_ci#include <asm/pfault.h> 1362306a36Sopenharmony_ci#include <asm/diag.h> 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_ci#define __SUBCODE_MASK 0x0600 1662306a36Sopenharmony_ci#define __PF_RES_FIELD 0x8000000000000000UL 1762306a36Sopenharmony_ci 1862306a36Sopenharmony_ci/* 1962306a36Sopenharmony_ci * 'pfault' pseudo page faults routines. 2062306a36Sopenharmony_ci */ 2162306a36Sopenharmony_cistatic int pfault_disable; 2262306a36Sopenharmony_ci 2362306a36Sopenharmony_cistatic int __init nopfault(char *str) 2462306a36Sopenharmony_ci{ 2562306a36Sopenharmony_ci pfault_disable = 1; 2662306a36Sopenharmony_ci return 1; 2762306a36Sopenharmony_ci} 2862306a36Sopenharmony_ciearly_param("nopfault", nopfault); 2962306a36Sopenharmony_ci 3062306a36Sopenharmony_cistruct pfault_refbk { 3162306a36Sopenharmony_ci u16 refdiagc; 3262306a36Sopenharmony_ci u16 reffcode; 3362306a36Sopenharmony_ci u16 refdwlen; 3462306a36Sopenharmony_ci u16 refversn; 3562306a36Sopenharmony_ci u64 refgaddr; 3662306a36Sopenharmony_ci u64 refselmk; 3762306a36Sopenharmony_ci u64 refcmpmk; 3862306a36Sopenharmony_ci u64 reserved; 3962306a36Sopenharmony_ci}; 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_cistatic struct pfault_refbk pfault_init_refbk = { 4262306a36Sopenharmony_ci .refdiagc = 0x258, 4362306a36Sopenharmony_ci .reffcode = 0, 4462306a36Sopenharmony_ci .refdwlen = 5, 4562306a36Sopenharmony_ci .refversn = 2, 4662306a36Sopenharmony_ci .refgaddr = __LC_LPP, 4762306a36Sopenharmony_ci .refselmk = 1UL << 48, 4862306a36Sopenharmony_ci .refcmpmk = 1UL << 48, 4962306a36Sopenharmony_ci .reserved = __PF_RES_FIELD 5062306a36Sopenharmony_ci}; 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_ciint __pfault_init(void) 5362306a36Sopenharmony_ci{ 5462306a36Sopenharmony_ci int rc = -EOPNOTSUPP; 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci if (pfault_disable) 5762306a36Sopenharmony_ci return rc; 5862306a36Sopenharmony_ci diag_stat_inc(DIAG_STAT_X258); 5962306a36Sopenharmony_ci asm volatile( 6062306a36Sopenharmony_ci " diag %[refbk],%[rc],0x258\n" 6162306a36Sopenharmony_ci "0: nopr %%r7\n" 6262306a36Sopenharmony_ci EX_TABLE(0b, 0b) 6362306a36Sopenharmony_ci : [rc] "+d" (rc) 6462306a36Sopenharmony_ci : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) 6562306a36Sopenharmony_ci : "cc"); 6662306a36Sopenharmony_ci return rc; 6762306a36Sopenharmony_ci} 6862306a36Sopenharmony_ci 6962306a36Sopenharmony_cistatic struct pfault_refbk pfault_fini_refbk = { 7062306a36Sopenharmony_ci .refdiagc = 0x258, 7162306a36Sopenharmony_ci .reffcode = 1, 7262306a36Sopenharmony_ci .refdwlen = 5, 7362306a36Sopenharmony_ci .refversn = 2, 7462306a36Sopenharmony_ci}; 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_civoid __pfault_fini(void) 7762306a36Sopenharmony_ci{ 7862306a36Sopenharmony_ci if (pfault_disable) 7962306a36Sopenharmony_ci return; 8062306a36Sopenharmony_ci diag_stat_inc(DIAG_STAT_X258); 8162306a36Sopenharmony_ci asm volatile( 8262306a36Sopenharmony_ci " diag %[refbk],0,0x258\n" 8362306a36Sopenharmony_ci "0: nopr %%r7\n" 8462306a36Sopenharmony_ci EX_TABLE(0b, 0b) 8562306a36Sopenharmony_ci : 8662306a36Sopenharmony_ci : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) 8762306a36Sopenharmony_ci : "cc"); 8862306a36Sopenharmony_ci} 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_cistatic DEFINE_SPINLOCK(pfault_lock); 9162306a36Sopenharmony_cistatic LIST_HEAD(pfault_list); 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci#define PF_COMPLETE 0x0080 9462306a36Sopenharmony_ci 9562306a36Sopenharmony_ci/* 9662306a36Sopenharmony_ci * The mechanism of our pfault code: if Linux is running as guest, runs a user 9762306a36Sopenharmony_ci * space process and the user space process accesses a page that the host has 9862306a36Sopenharmony_ci * paged out we get a pfault interrupt. 9962306a36Sopenharmony_ci * 10062306a36Sopenharmony_ci * This allows us, within the guest, to schedule a different process. Without 10162306a36Sopenharmony_ci * this mechanism the host would have to suspend the whole virtual cpu until 10262306a36Sopenharmony_ci * the page has been paged in. 10362306a36Sopenharmony_ci * 10462306a36Sopenharmony_ci * So when we get such an interrupt then we set the state of the current task 10562306a36Sopenharmony_ci * to uninterruptible and also set the need_resched flag. Both happens within 10662306a36Sopenharmony_ci * interrupt context(!). If we later on want to return to user space we 10762306a36Sopenharmony_ci * recognize the need_resched flag and then call schedule(). It's not very 10862306a36Sopenharmony_ci * obvious how this works... 10962306a36Sopenharmony_ci * 11062306a36Sopenharmony_ci * Of course we have a lot of additional fun with the completion interrupt (-> 11162306a36Sopenharmony_ci * host signals that a page of a process has been paged in and the process can 11262306a36Sopenharmony_ci * continue to run). This interrupt can arrive on any cpu and, since we have 11362306a36Sopenharmony_ci * virtual cpus, actually appear before the interrupt that signals that a page 11462306a36Sopenharmony_ci * is missing. 11562306a36Sopenharmony_ci */ 11662306a36Sopenharmony_cistatic void pfault_interrupt(struct ext_code ext_code, 11762306a36Sopenharmony_ci unsigned int param32, unsigned long param64) 11862306a36Sopenharmony_ci{ 11962306a36Sopenharmony_ci struct task_struct *tsk; 12062306a36Sopenharmony_ci __u16 subcode; 12162306a36Sopenharmony_ci pid_t pid; 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_ci /* 12462306a36Sopenharmony_ci * Get the external interruption subcode & pfault initial/completion 12562306a36Sopenharmony_ci * signal bit. VM stores this in the 'cpu address' field associated 12662306a36Sopenharmony_ci * with the external interrupt. 12762306a36Sopenharmony_ci */ 12862306a36Sopenharmony_ci subcode = ext_code.subcode; 12962306a36Sopenharmony_ci if ((subcode & 0xff00) != __SUBCODE_MASK) 13062306a36Sopenharmony_ci return; 13162306a36Sopenharmony_ci inc_irq_stat(IRQEXT_PFL); 13262306a36Sopenharmony_ci /* Get the token (= pid of the affected task). */ 13362306a36Sopenharmony_ci pid = param64 & LPP_PID_MASK; 13462306a36Sopenharmony_ci rcu_read_lock(); 13562306a36Sopenharmony_ci tsk = find_task_by_pid_ns(pid, &init_pid_ns); 13662306a36Sopenharmony_ci if (tsk) 13762306a36Sopenharmony_ci get_task_struct(tsk); 13862306a36Sopenharmony_ci rcu_read_unlock(); 13962306a36Sopenharmony_ci if (!tsk) 14062306a36Sopenharmony_ci return; 14162306a36Sopenharmony_ci spin_lock(&pfault_lock); 14262306a36Sopenharmony_ci if (subcode & PF_COMPLETE) { 14362306a36Sopenharmony_ci /* signal bit is set -> a page has been swapped in by VM */ 14462306a36Sopenharmony_ci if (tsk->thread.pfault_wait == 1) { 14562306a36Sopenharmony_ci /* 14662306a36Sopenharmony_ci * Initial interrupt was faster than the completion 14762306a36Sopenharmony_ci * interrupt. pfault_wait is valid. Set pfault_wait 14862306a36Sopenharmony_ci * back to zero and wake up the process. This can 14962306a36Sopenharmony_ci * safely be done because the task is still sleeping 15062306a36Sopenharmony_ci * and can't produce new pfaults. 15162306a36Sopenharmony_ci */ 15262306a36Sopenharmony_ci tsk->thread.pfault_wait = 0; 15362306a36Sopenharmony_ci list_del(&tsk->thread.list); 15462306a36Sopenharmony_ci wake_up_process(tsk); 15562306a36Sopenharmony_ci put_task_struct(tsk); 15662306a36Sopenharmony_ci } else { 15762306a36Sopenharmony_ci /* 15862306a36Sopenharmony_ci * Completion interrupt was faster than initial 15962306a36Sopenharmony_ci * interrupt. Set pfault_wait to -1 so the initial 16062306a36Sopenharmony_ci * interrupt doesn't put the task to sleep. 16162306a36Sopenharmony_ci * If the task is not running, ignore the completion 16262306a36Sopenharmony_ci * interrupt since it must be a leftover of a PFAULT 16362306a36Sopenharmony_ci * CANCEL operation which didn't remove all pending 16462306a36Sopenharmony_ci * completion interrupts. 16562306a36Sopenharmony_ci */ 16662306a36Sopenharmony_ci if (task_is_running(tsk)) 16762306a36Sopenharmony_ci tsk->thread.pfault_wait = -1; 16862306a36Sopenharmony_ci } 16962306a36Sopenharmony_ci } else { 17062306a36Sopenharmony_ci /* signal bit not set -> a real page is missing. */ 17162306a36Sopenharmony_ci if (WARN_ON_ONCE(tsk != current)) 17262306a36Sopenharmony_ci goto out; 17362306a36Sopenharmony_ci if (tsk->thread.pfault_wait == 1) { 17462306a36Sopenharmony_ci /* Already on the list with a reference: put to sleep */ 17562306a36Sopenharmony_ci goto block; 17662306a36Sopenharmony_ci } else if (tsk->thread.pfault_wait == -1) { 17762306a36Sopenharmony_ci /* 17862306a36Sopenharmony_ci * Completion interrupt was faster than the initial 17962306a36Sopenharmony_ci * interrupt (pfault_wait == -1). Set pfault_wait 18062306a36Sopenharmony_ci * back to zero and exit. 18162306a36Sopenharmony_ci */ 18262306a36Sopenharmony_ci tsk->thread.pfault_wait = 0; 18362306a36Sopenharmony_ci } else { 18462306a36Sopenharmony_ci /* 18562306a36Sopenharmony_ci * Initial interrupt arrived before completion 18662306a36Sopenharmony_ci * interrupt. Let the task sleep. 18762306a36Sopenharmony_ci * An extra task reference is needed since a different 18862306a36Sopenharmony_ci * cpu may set the task state to TASK_RUNNING again 18962306a36Sopenharmony_ci * before the scheduler is reached. 19062306a36Sopenharmony_ci */ 19162306a36Sopenharmony_ci get_task_struct(tsk); 19262306a36Sopenharmony_ci tsk->thread.pfault_wait = 1; 19362306a36Sopenharmony_ci list_add(&tsk->thread.list, &pfault_list); 19462306a36Sopenharmony_ciblock: 19562306a36Sopenharmony_ci /* 19662306a36Sopenharmony_ci * Since this must be a userspace fault, there 19762306a36Sopenharmony_ci * is no kernel task state to trample. Rely on the 19862306a36Sopenharmony_ci * return to userspace schedule() to block. 19962306a36Sopenharmony_ci */ 20062306a36Sopenharmony_ci __set_current_state(TASK_UNINTERRUPTIBLE); 20162306a36Sopenharmony_ci set_tsk_need_resched(tsk); 20262306a36Sopenharmony_ci set_preempt_need_resched(); 20362306a36Sopenharmony_ci } 20462306a36Sopenharmony_ci } 20562306a36Sopenharmony_ciout: 20662306a36Sopenharmony_ci spin_unlock(&pfault_lock); 20762306a36Sopenharmony_ci put_task_struct(tsk); 20862306a36Sopenharmony_ci} 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_cistatic int pfault_cpu_dead(unsigned int cpu) 21162306a36Sopenharmony_ci{ 21262306a36Sopenharmony_ci struct thread_struct *thread, *next; 21362306a36Sopenharmony_ci struct task_struct *tsk; 21462306a36Sopenharmony_ci 21562306a36Sopenharmony_ci spin_lock_irq(&pfault_lock); 21662306a36Sopenharmony_ci list_for_each_entry_safe(thread, next, &pfault_list, list) { 21762306a36Sopenharmony_ci thread->pfault_wait = 0; 21862306a36Sopenharmony_ci list_del(&thread->list); 21962306a36Sopenharmony_ci tsk = container_of(thread, struct task_struct, thread); 22062306a36Sopenharmony_ci wake_up_process(tsk); 22162306a36Sopenharmony_ci put_task_struct(tsk); 22262306a36Sopenharmony_ci } 22362306a36Sopenharmony_ci spin_unlock_irq(&pfault_lock); 22462306a36Sopenharmony_ci return 0; 22562306a36Sopenharmony_ci} 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_cistatic int __init pfault_irq_init(void) 22862306a36Sopenharmony_ci{ 22962306a36Sopenharmony_ci int rc; 23062306a36Sopenharmony_ci 23162306a36Sopenharmony_ci rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); 23262306a36Sopenharmony_ci if (rc) 23362306a36Sopenharmony_ci goto out_extint; 23462306a36Sopenharmony_ci rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; 23562306a36Sopenharmony_ci if (rc) 23662306a36Sopenharmony_ci goto out_pfault; 23762306a36Sopenharmony_ci irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); 23862306a36Sopenharmony_ci cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", 23962306a36Sopenharmony_ci NULL, pfault_cpu_dead); 24062306a36Sopenharmony_ci return 0; 24162306a36Sopenharmony_ci 24262306a36Sopenharmony_ciout_pfault: 24362306a36Sopenharmony_ci unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); 24462306a36Sopenharmony_ciout_extint: 24562306a36Sopenharmony_ci pfault_disable = 1; 24662306a36Sopenharmony_ci return rc; 24762306a36Sopenharmony_ci} 24862306a36Sopenharmony_ciearly_initcall(pfault_irq_init); 249