// SPDX-License-Identifier: GPL-2.0-only
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/perf_event.h>

#include "mm_internal.h"

#ifdef CONFIG_PARAVIRT
# define STATIC_NOPV
#else
# define STATIC_NOPV			static
# define __flush_tlb_local		native_flush_tlb_local
# define __flush_tlb_global		native_flush_tlb_global
# define __flush_tlb_one_user(addr)	native_flush_tlb_one_user(addr)
# define __flush_tlb_multi(msk, info)	native_flush_tlb_multi(msk, info)
#endif

/*
 * TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 * These mean you can really definitely utterly forget about
 * writing to user space from interrupts. (It's not allowed anyway.)
 *
 * Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 * More scalable flush, from Andi Kleen
 *
 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

/*
 * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
 * stored in cpu_tlb_state.last_user_mm_spec.
 */
#define LAST_USER_MM_IBPB	0x1UL
#define LAST_USER_MM_L1D_FLUSH	0x2UL
#define LAST_USER_MM_SPEC_MASK	(LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)

/* Bits to set when tlbstate and flush are (re)initialized */
#define LAST_USER_MM_INIT	LAST_USER_MM_IBPB

/*
 * The x86 feature is called PCID (Process Context IDentifier). It is similar
 * to what is traditionally called ASID on the RISC processors.
 *
 * We don't use the traditional ASID implementation, where each process/mm gets
 * its own ASID and flush/restart when we run out of ASID space.
 *
 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
 * that came by on this CPU, allowing cheaper switch_mm between processes on
 * this CPU.
 *
 * We end up with different spaces for different things. To avoid confusion we
 * use different names for each of them:
 *
 * ASID  - [0, TLB_NR_DYN_ASIDS-1]
 *         the canonical identifier for an mm
 *
 * kPCID - [1, TLB_NR_DYN_ASIDS]
 *         the value we write into the PCID part of CR3; corresponds to the
 *         ASID+1, because PCID 0 is special.
 *
 * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
 *         for KPTI each mm has two address spaces and thus needs two
 *         PCID values, but we can still do with a single ASID denomination
 *         for each mm. Corresponds to kPCID + 2048.
 *
 */
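
/*
 * A concrete example of the naming above: ASID 3 is written to the
 * hardware as kPCID 4 (ASID + 1, because PCID 0 is reserved) and, with
 * KPTI, the matching user address space uses uPCID 2052 (kPCID + 2048,
 * i.e. kPCID with the X86_CR3_PTI_PCID_USER_BIT set).
 */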

/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS		12

/*
 * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
 * user/kernel switches
 */
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS	1
#else
# define PTI_CONSUMED_PCID_BITS	0
#endif

#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)

/*
 * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
 * for them being zero-based.  Another -1 is because PCID 0 is reserved for
 * use by non-PCID-aware users.
 */
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
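
/*
 * For example: with PAGE_TABLE_ISOLATION enabled, CR3_AVAIL_PCID_BITS is
 * 12 - 1 = 11, so MAX_ASID_AVAILABLE is (1 << 11) - 2 = 2046; without it,
 * (1 << 12) - 2 = 4094.
 */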

/*
 * Given @asid, compute kPCID
 */
static inline u16 kern_pcid(u16 asid)
{
	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);

#ifdef CONFIG_PAGE_TABLE_ISOLATION
	/*
	 * Make sure that the dynamic ASID space does not conflict with the
	 * bit we are using to switch between user and kernel ASIDs.
	 */
	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));

	/*
	 * The ASID being passed in here should have respected the
	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
	 */
	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
#endif
	/*
	 * The dynamically-assigned ASIDs that get passed in are small
	 * (<TLB_NR_DYN_ASIDS).  They never have the high switch bit set,
	 * so do not bother to clear it.
	 *
	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
	 * PCID bits.  This serves two purposes.  It prevents a nasty
	 * situation in which PCID-unaware code saves CR3, loads some other
	 * value (with PCID == 0), and then restores CR3, thus corrupting
	 * the TLB for ASID 0 if the saved ASID was nonzero.  It also means
	 * that any bugs involving loading a PCID-enabled CR3 with
	 * CR4.PCIDE off will trigger deterministically.
	 */
	return asid + 1;
}

/*
 * Given @asid, compute uPCID
 */
static inline u16 user_pcid(u16 asid)
{
	u16 ret = kern_pcid(asid);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
	return ret;
}

static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
{
	unsigned long cr3 = __sme_pa(pgd) | lam;

	if (static_cpu_has(X86_FEATURE_PCID)) {
		VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
		cr3 |= kern_pcid(asid);
	} else {
		VM_WARN_ON_ONCE(asid != 0);
	}

	return cr3;
}

static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
					      unsigned long lam)
{
	/*
	 * Use boot_cpu_has() instead of this_cpu_has() as this function
	 * might be called during early boot.  This should work even after
	 * boot because all CPUs have the same capabilities:
	 */
	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
	return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
}
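
/*
 * Note: CR3_NOFLUSH is bit 63 of CR3.  With CR4.PCIDE set, a CR3 write
 * with this bit set tells the CPU not to invalidate the TLB entries
 * tagged with the new PCID, which is what makes the "noflush" variant
 * above cheap.
 */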

/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
static void clear_asid_other(void)
{
	u16 asid;

	/*
	 * This is only expected to be set if we have disabled
	 * kernel _PAGE_GLOBAL pages.
	 */
	if (!static_cpu_has(X86_FEATURE_PTI)) {
		WARN_ON_ONCE(1);
		return;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		/* Do not need to flush the current asid */
		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
			continue;
		/*
		 * Make sure the next time we go to switch to
		 * this asid, we do a flush:
		 */
		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
	}
	this_cpu_write(cpu_tlbstate.invalidate_other, false);
}

atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);


static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{
	u16 asid;

	if (!static_cpu_has(X86_FEATURE_PCID)) {
		*new_asid = 0;
		*need_flush = true;
		return;
	}

	if (this_cpu_read(cpu_tlbstate.invalidate_other))
		clear_asid_other();

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
		    next->context.ctx_id)
			continue;

		*new_asid = asid;
		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
			       next_tlb_gen);
		return;
	}

	/*
	 * We don't currently own an ASID slot on this CPU.
	 * Allocate a slot.
	 */
	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}
	*need_flush = true;
}
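
/*
 * Note: when no slot above matches, the slot is handed out round-robin
 * from cpu_tlbstate.next_asid; whatever mm previously owned that slot is
 * implicitly evicted, and *need_flush ensures the TLB is flushed before
 * the recycled ASID is used for the new mm.
 */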

/*
 * Given an ASID, flush the corresponding user ASID.  We can delay this
 * until the next time we switch to it.
 *
 * See SWITCH_TO_USER_CR3.
 */
static inline void invalidate_user_asid(u16 asid)
{
	/* There is no user ASID if address space separation is off */
	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
		return;

	/*
	 * We only have a single ASID if PCID is off and the CR3
	 * write will have flushed it.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_PCID))
		return;

	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	__set_bit(kern_pcid(asid),
		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}

static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
			    bool need_flush)
{
	unsigned long new_mm_cr3;

	if (need_flush) {
		invalidate_user_asid(new_asid);
		new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
	} else {
		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
	}

	/*
	 * Caution: many callers of this function expect
	 * that load_cr3() is serializing and orders TLB
	 * fills with respect to the mm_cpumask writes.
	 */
	write_cr3(new_mm_cr3);
}

void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

	/* Warn if we're not lazy. */
	WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));

	switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

/*
 * Invoked from return to user/guest by a task that opted-in to L1D
 * flushing but ended up running on an SMT enabled core due to wrong
 * affinity settings or CPU hotplug.  This is part of the paranoid L1D flush
 * contract which this task requested.
 */
static void l1d_flush_force_sigbus(struct callback_head *ch)
{
	force_sig(SIGBUS);
}

static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
			       struct task_struct *next)
{
	/* Flush L1D if the outgoing task requests it */
	if (prev_mm & LAST_USER_MM_L1D_FLUSH)
		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);

	/* Check whether the incoming task opted in for L1D flush */
	if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH)))
		return;

	/*
	 * Validate that it is not running on an SMT sibling as this would
	 * make the exercise pointless because the siblings share L1D.  If
	 * it runs on an SMT sibling, notify it with SIGBUS on return to
	 * user/guest
	 */
	if (this_cpu_read(cpu_info.smt_active)) {
		clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH);
		next->l1d_flush_kill.func = l1d_flush_force_sigbus;
		task_work_add(next, &next->l1d_flush_kill, TWA_RESUME);
	}
}

static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
	unsigned long next_tif = read_task_thread_flags(next);
	unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;

	/*
	 * Ensure that the bit shift above works as expected and the two flags
	 * end up in bit 0 and 1.
	 */
	BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1);

	return (unsigned long)next->mm | spec_bits;
}
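
/*
 * For example, for a task whose TIF_SPEC_IB flag is set and whose
 * TIF_SPEC_L1D_FLUSH flag is clear, this returns the mm pointer with
 * bit 0 (LAST_USER_MM_IBPB) set.  Stashing the flags in the low bits is
 * safe because mm_struct allocations are sufficiently aligned that bits
 * 0 and 1 are never set in the pointer itself.
 */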

static void cond_mitigation(struct task_struct *next)
{
	unsigned long prev_mm, next_mm;

	if (!next || !next->mm)
		return;

	next_mm = mm_mangle_tif_spec_bits(next);
	prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);

	/*
	 * Avoid user/user BTB poisoning by flushing the branch predictor
	 * when switching between processes.  This stops one process from
	 * doing Spectre-v2 attacks on another.
	 *
	 * Both the conditional and the always IBPB mode use the mm
	 * pointer to avoid the IBPB when switching between tasks of the
	 * same process.  Using the mm pointer instead of mm->context.ctx_id
	 * opens a hypothetical hole vs. mm_struct reuse, which is more or
	 * less impossible to control by an attacker.  Aside from that it
	 * would only affect the first schedule so the theoretically
	 * exposed data is not really interesting.
	 */
	if (static_branch_likely(&switch_mm_cond_ibpb)) {
		/*
		 * This is a bit more complex than the always mode because
		 * it has to handle two cases:
		 *
		 * 1) Switch from a user space task (potential attacker)
		 *    which has TIF_SPEC_IB set to a user space task
		 *    (potential victim) which has TIF_SPEC_IB not set.
		 *
		 * 2) Switch from a user space task (potential attacker)
		 *    which has TIF_SPEC_IB not set to a user space task
		 *    (potential victim) which has TIF_SPEC_IB set.
		 *
		 * This could be done by unconditionally issuing IBPB when
		 * a task which has TIF_SPEC_IB set is either scheduled in
		 * or out.  Though that results in two flushes when:
		 *
		 * - the same user space task is scheduled out and later
		 *   scheduled in again and only a kernel thread ran in
		 *   between.
		 *
		 * - a user space task belonging to the same process is
		 *   scheduled in after a kernel thread ran in between
		 *
		 * - a user space task belonging to the same process is
		 *   scheduled in immediately.
		 *
		 * Optimize this with reasonably small overhead for the
		 * above cases.  Mangle the TIF_SPEC_IB bit into the mm
		 * pointer of the incoming task which is stored in
		 * cpu_tlbstate.last_user_mm_spec for comparison.
		 *
		 * Issue IBPB only if the mm's are different and one or
		 * both have the IBPB bit set.
		 */
		if (next_mm != prev_mm &&
		    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
			indirect_branch_prediction_barrier();
	}

	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
		/*
		 * Only flush when switching to a user space task with a
		 * different context than the user space task which ran
		 * last on this CPU.
		 */
		if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
					(unsigned long)next->mm)
			indirect_branch_prediction_barrier();
	}

	if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) {
		/*
		 * Flush L1D when the outgoing task requested it and/or
		 * check whether the incoming task requested L1D flushing
		 * and ended up on an SMT sibling.
		 */
		if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH))
			l1d_flush_evaluate(prev_mm, next_mm, next);
	}

	this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}

#ifdef CONFIG_PERF_EVENTS
static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
	if (static_branch_unlikely(&rdpmc_always_available_key) ||
	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
	     atomic_read(&mm->context.perf_rdpmc_allowed))) {
		/*
		 * Clear the existing dirty counters to
		 * prevent the leak for an RDPMC task.
		 */
		perf_clear_dirty_counters();
		cr4_set_bits_irqsoff(X86_CR4_PCE);
	} else
		cr4_clear_bits_irqsoff(X86_CR4_PCE);
}
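
/*
 * X86_CR4_PCE ("performance-monitoring counter enable") controls whether
 * RDPMC is usable outside ring 0, so the bit is toggled per-mm depending
 * on whether the mm's tasks are allowed to read the counters directly
 * from user space.
 */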

void cr4_update_pce(void *ignored)
{
	cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
}

#else
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif

void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	unsigned long new_lam = mm_lam_cr3_mask(next);
	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;
	bool need_flush;
	u16 new_asid;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func() to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
						   tlbstate_lam_cr3_mask()))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 * Architecturally, the CPU could prefetch something
		 * back into an incorrect ASID slot and leave it there
		 * to cause trouble down the road.  It's better than
		 * nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif
	if (was_lazy)
		this_cpu_write(cpu_tlbstate_shared.is_lazy, false);

	/*
	 * The membarrier system call requires a full memory barrier and
	 * core serialization before returning to user-space, after
	 * storing to rq->curr, when changing mm.  This is because
	 * membarrier() sends IPIs to all CPUs that are in the target mm
	 * to make them issue memory barriers.  However, if another CPU
	 * switches to/from the target mm concurrently with
	 * membarrier(), it can cause that CPU not to receive an IPI
	 * when it really should issue a memory barrier.  Writing to CR3
	 * provides that full memory barrier and core serializing
	 * instruction.
	 */
	if (real_prev == next) {
		/* Not actually switching mm's */
		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			   next->context.ctx_id);

		/*
		 * If this races with another thread that enables lam, 'new_lam'
		 * might not match tlbstate_lam_cr3_mask().
		 */

		/*
		 * Even in lazy TLB mode, the CPU should stay set in the
		 * mm_cpumask.  The TLB shootdown code can figure out from
		 * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * If the CPU is not in lazy TLB mode, we are just switching
		 * from one thread in a process to another thread in the same
		 * process.  No TLB flush required.
		 */
		if (!was_lazy)
			return;

		/*
		 * Read the tlb_gen to check whether a flush is needed.
		 * If the TLB is up to date, just use it.
		 * The barrier synchronizes with the tlb_gen increment in
		 * the TLB shootdown code.
		 */
		smp_mb();
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
				next_tlb_gen)
			return;

		/*
		 * TLB contents went out of date while we were in lazy
		 * mode.  Fall through to the TLB switching code below.
		 */
		new_asid = prev_asid;
		need_flush = true;
	} else {
		/*
		 * Apply process to process speculation vulnerability
		 * mitigations if applicable.
		 */
		cond_mitigation(tsk);

		/*
		 * Stop remote flushes for the previous mm.
		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
		 * but the bitmap manipulation can cause cache line contention.
		 */
		if (real_prev != &init_mm) {
			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
						mm_cpumask(real_prev)));
			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
		}

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		if (next != &init_mm)
			cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		/* Let nmi_uaccess_okay() know that we're changing CR3. */
		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
		barrier();
	}

	set_tlbstate_lam_mode(next);
	if (need_flush) {
		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
		load_new_mm_cr3(next->pgd, new_asid, new_lam, true);

		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
	} else {
		/* The new ASID is already up to date. */
		load_new_mm_cr3(next->pgd, new_asid, new_lam, false);

		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
	}

	/* Make sure we write CR3 before loaded_mm. */
	barrier();

	this_cpu_write(cpu_tlbstate.loaded_mm, next);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

	if (next != real_prev) {
		cr4_update_pce_mm(next);
		switch_ldt(real_prev, next);
	}
}

/*
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		return;

	this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
}

/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear; CPU hotplug can do this).
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
	int i;
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
	unsigned long cr3 = __read_cr3();

	/* Assert that CR3 already references the right mm. */
	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

	/* LAM expected to be disabled */
	WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
	WARN_ON(mm_lam_cr3_mask(mm));

	/*
	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
	 * doesn't work like other CR4 bits because it can only be set from
	 * long mode.)
	 */
	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
		!(cr4_read_shadow() & X86_CR4_PCIDE));

	/* Disable LAM, force ASID 0 and force a TLB flush. */
	write_cr3(build_cr3(mm->pgd, 0, 0));

	/* Reinitialize tlbstate. */
	this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
	this_cpu_write(cpu_tlbstate.next_asid, 1);
	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
	set_tlbstate_lam_mode(mm);

	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

/*
 * flush_tlb_func()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func(void *info)
{
	/*
	 * We have three different tlb_gen values in here.  They are:
	 *
	 * - mm_tlb_gen:     the latest generation.
	 * - local_tlb_gen:  the generation that this CPU has already caught
	 *                   up to.
	 * - f->new_tlb_gen: the generation that the requester of the flush
	 *                   wants us to catch up to.
	 */
	const struct flush_tlb_info *f = info;
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
	bool local = smp_processor_id() == f->initiating_cpu;
	unsigned long nr_invalidate = 0;
	u64 mm_tlb_gen;

	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

	if (!local) {
		inc_irq_stat(irq_tlb_count);
		count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);

		/* Can only happen on remote CPUs */
		if (f->mm && f->mm != loaded_mm)
			return;
	}

	if (unlikely(loaded_mm == &init_mm))
		return;

	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
		   loaded_mm->context.ctx_id);

	if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
		/*
		 * We're in lazy mode.  We need to at least flush our
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
		 *
		 * This should be rare, with native_flush_tlb_multi() skipping
		 * IPIs to lazy TLB mode CPUs.
		 */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
		return;
	}

	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
		     f->new_tlb_gen <= local_tlb_gen)) {
		/*
		 * The TLB is already up to date with respect to f->new_tlb_gen.
		 * While the core might be still behind mm_tlb_gen, checking
		 * mm_tlb_gen unnecessarily would have negative caching effects
		 * so avoid it.
		 */
		return;
	}

	/*
	 * Defer mm_tlb_gen reading as long as possible to avoid cache
	 * contention.
	 */
	mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);

	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		 * There's nothing to do: we're already up to date.  This can
		 * happen if two concurrent flushes happen -- the first flush to
		 * be handled can catch us all the way up, leaving no work for
		 * the second flush.
		 */
		goto done;
	}

	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

	/*
	 * If we get to this point, we know that our TLB is out of date.
	 * This does not strictly imply that we need to flush (it's
	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
	 * going to need to flush in the very near future, so we might
	 * as well get it over with.
	 *
	 * The only question is whether to do a full or partial flush.
	 *
	 * We do a partial flush if requested and two extra conditions
	 * are met:
	 *
	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
	 *    we've always done all needed flushes to catch up to
	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
	 *    us up to date for tlb_gen 3 is the partial flush we're
	 *    processing.
	 *
	 *    As an example of why this check is needed, suppose that there
	 *    are two concurrent flushes.  The first is a full flush that
	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
	 *    processed on this CPU in reverse order, we'll see
	 *    local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
	 *    If we were to use __flush_tlb_one_user() and set local_tlb_gen to
	 *    3, we'd break the invariant: we'd update local_tlb_gen above
	 *    1 without the full flush that's needed for tlb_gen 2.
	 *
	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
	 *    Partial TLB flushes are not all that much cheaper than full TLB
	 *    flushes, so it seems unlikely that it would be a performance win
	 *    to do a partial flush if that won't bring our TLB fully up to
	 *    date.  By doing a full flush instead, we can increase
	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
	 *    avoid another flush in the very near future.
	 */
	if (f->end != TLB_FLUSH_ALL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen) {
		/* Partial flush */
		unsigned long addr = f->start;

		/* Partial flush cannot have invalid generations */
		VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);

		/* Partial flush must have valid mm */
		VM_WARN_ON(f->mm == NULL);

		nr_invalidate = (f->end - f->start) >> f->stride_shift;

		while (addr < f->end) {
			flush_tlb_one_user(addr);
			addr += 1UL << f->stride_shift;
		}
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
	} else {
		/* Full flush. */
		nr_invalidate = TLB_FLUSH_ALL;

		flush_tlb_local();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
	}

	/* Both paths above update our state to mm_tlb_gen. */
	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);

	/* Tracing is done in a unified manner to reduce the code size */
done:
	trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN :
			(f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
					  TLB_LOCAL_MM_SHOOTDOWN,
			nr_invalidate);
}

static bool tlb_is_not_lazy(int cpu, void *data)
{
	return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
}

DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);

STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
					const struct flush_tlb_info *info)
{
	/*
	 * Do accounting and tracing.  Note that there are (and have always been)
	 * cases in which a remote TLB flush will be traced, but eventually
	 * would not happen.
	 */
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	/*
	 * If no page tables were freed, we can skip sending IPIs to
	 * CPUs in lazy TLB mode.  They will flush their TLBs themselves
	 * at the next context switch.
	 *
	 * However, if page tables are getting freed, we need to send the
	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
	 * up on the new contents of what used to be page tables, while
	 * doing a speculative memory access.
	 */
	if (info->freed_tables)
		on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
	else
		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
				(void *)info, 1, cpumask);
}

void flush_tlb_multi(const struct cpumask *cpumask,
		      const struct flush_tlb_info *info)
{
	__flush_tlb_multi(cpumask, info);
}

/*
 * See Documentation/arch/x86/tlb.rst for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);

#ifdef CONFIG_DEBUG_VM
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif

static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
			unsigned long start, unsigned long end,
			unsigned int stride_shift, bool freed_tables,
			u64 new_tlb_gen)
{
	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);

#ifdef CONFIG_DEBUG_VM
	/*
	 * Ensure that the following code is non-reentrant and flush_tlb_info
	 * is not overwritten.  This means no TLB flushing is initiated by
	 * interrupt handlers and machine-check exception handlers.
	 */
	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif

	info->start		= start;
	info->end		= end;
	info->mm		= mm;
	info->stride_shift	= stride_shift;
	info->freed_tables	= freed_tables;
	info->new_tlb_gen	= new_tlb_gen;
	info->initiating_cpu	= smp_processor_id();

	return info;
}

static void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
	/* Complete reentrancy prevention checks */
	barrier();
	this_cpu_dec(flush_tlb_info_idx);
#endif
}

void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned int stride_shift,
				bool freed_tables)
{
	struct flush_tlb_info *info;
	u64 new_tlb_gen;
	int cpu;

	cpu = get_cpu();

	/* Should we flush just the requested range? */
	if ((end == TLB_FLUSH_ALL) ||
	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
		start = 0;
		end = TLB_FLUSH_ALL;
	}
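
	/*
	 * For example, with stride_shift == PAGE_SHIFT a 2MB range is 512
	 * invalidations, which exceeds the default ceiling of 33 and is
	 * converted to a full flush above, while a single 2MB huge page
	 * flushed with stride_shift == PMD_SHIFT stays a single targeted
	 * invalidation.
	 */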

void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned int stride_shift,
				bool freed_tables)
{
	struct flush_tlb_info *info;
	u64 new_tlb_gen;
	int cpu;

	cpu = get_cpu();

	/* Should we flush just the requested range? */
	if ((end == TLB_FLUSH_ALL) ||
	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
		start = 0;
		end = TLB_FLUSH_ALL;
	}

	/* This is also a barrier that synchronizes with switch_mm(). */
	new_tlb_gen = inc_mm_tlb_gen(mm);

	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
				  new_tlb_gen);

	/*
	 * flush_tlb_multi() is not optimized for the common case in which only
	 * a local TLB flush is needed. Optimize this use-case by calling
	 * flush_tlb_func() directly on the local CPU.
	 */
	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
		flush_tlb_multi(mm_cpumask(mm), info);
	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		lockdep_assert_irqs_enabled();
		local_irq_disable();
		flush_tlb_func(info);
		local_irq_enable();
	}

	put_flush_tlb_info();
	put_cpu();
	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}

static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
}

void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* Flush the range one page at a time with 'invlpg'. */
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		flush_tlb_one_kernel(addr);
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	/* Balance as for a user space task's flush; be a bit conservative. */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
	} else {
		struct flush_tlb_info *info;

		preempt_disable();
		info = get_flush_tlb_info(NULL, start, end, 0, false,
					  TLB_GENERATION_INVALID);

		on_each_cpu(do_kernel_range_flush, info, 1);

		put_flush_tlb_info();
		preempt_enable();
	}
}
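
/*
 * Example of the kernel-range path above (illustrative; the vmalloc
 * teardown code is one such caller): after unmapping N kernel pages
 * starting at 'addr', a caller issues
 *
 *	flush_tlb_kernel_range(addr, addr + N * PAGE_SIZE);
 *
 * which flushes one page at a time on every CPU while N is at or below
 * tlb_single_page_flush_ceiling, and falls back to a full TLB flush on
 * every CPU for larger ranges.
 */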

/*
 * This can be used from process context to figure out what the value of
 * CR3 is without needing to do a (slow) __read_cr3().
 *
 * It's intended to be used for code like KVM that sneakily changes CR3
 * and needs to restore it.  It needs to be used very carefully.
 */
unsigned long __get_current_cr3_fast(void)
{
	unsigned long cr3 =
		build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
			  this_cpu_read(cpu_tlbstate.loaded_mm_asid),
			  tlbstate_lam_cr3_mask());

	/* For now, be very restrictive about when this can be called. */
	VM_WARN_ON(in_nmi() || preemptible());

	VM_BUG_ON(cr3 != __read_cr3());
	return cr3;
}
EXPORT_SYMBOL_GPL(__get_current_cr3_fast);

/*
 * Flush one page in the kernel mapping
 */
void flush_tlb_one_kernel(unsigned long addr)
{
	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);

	/*
	 * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
	 * paravirt equivalent.  Even with PCID, this is sufficient: we only
	 * use PCID if we also use global PTEs for the kernel mapping, and
	 * INVLPG flushes global translations across all address spaces.
	 *
	 * If PTI is on, then the kernel is mapped with non-global PTEs, and
	 * __flush_tlb_one_user() will flush the given address for the current
	 * kernel address space and for its usermode counterpart, but it does
	 * not flush it for other address spaces.
	 */
	flush_tlb_one_user(addr);

	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	/*
	 * See above.  We need to propagate the flush to all other address
	 * spaces.  In principle, we only need to propagate it to kernelmode
	 * address spaces, but the extra bookkeeping we would need is not
	 * worth it.
	 */
	this_cpu_write(cpu_tlbstate.invalidate_other, true);
}
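
/*
 * The flag set above is consumed on this CPU's next ASID switch (see
 * choose_new_asid() and clear_asid_other() earlier in this file): every
 * other per-ASID context is marked stale, so its translations are flushed
 * before that ASID is used again.
 */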

/*
 * Flush one page in the user mapping
 */
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
{
	u32 loaded_mm_asid;
	bool cpu_pcide;

	/* Flush 'addr' from the kernel PCID: */
	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");

	/* If PTI is off there is no user PCID and nothing to flush. */
	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	cpu_pcide      = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE;

	/*
	 * invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0.  Check
	 * 'cpu_pcide' to ensure that *this* CPU will not trigger those
	 * #GP's even if called before CR4.PCIDE has been initialized.
	 */
	if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide)
		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
	else
		invalidate_user_asid(loaded_mm_asid);
}

void flush_tlb_one_user(unsigned long addr)
{
	__flush_tlb_one_user(addr);
}

/*
 * Flush everything
 */
STATIC_NOPV void native_flush_tlb_global(void)
{
	unsigned long flags;

	if (static_cpu_has(X86_FEATURE_INVPCID)) {
		/*
		 * Using INVPCID is considerably faster than a pair of writes
		 * to CR4 sandwiched inside an IRQ flag save/restore.
		 *
		 * Note, this works with CR4.PCIDE=0 or 1.
		 */
		invpcid_flush_all();
		return;
	}

	/*
	 * Read-modify-write to CR4 - protect it from preemption and
	 * from interrupts. (Use the raw variant because this code can
	 * be called from deep inside debugging code.)
	 */
	raw_local_irq_save(flags);

	__native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));

	raw_local_irq_restore(flags);
}
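
/*
 * Note on the fallback above: without INVPCID, __native_tlb_flush_global()
 * is expected to implement "flush everything" with a pair of CR4 writes
 * that toggle CR4.PGE, which architecturally invalidates all TLB entries,
 * global ones included.  That is why the sequence has to run with IRQs
 * off and without being preempted.
 */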

/*
 * Flush the entire current user mapping
 */
STATIC_NOPV void native_flush_tlb_local(void)
{
	/*
	 * Preemption or interrupts must be disabled to protect the access
	 * to the per CPU variable and to prevent being preempted between
	 * read_cr3() and write_cr3().
	 */
	WARN_ON_ONCE(preemptible());

	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));

	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
	native_write_cr3(__native_read_cr3());
}

void flush_tlb_local(void)
{
	__flush_tlb_local();
}

/*
 * Flush everything
 */
void __flush_tlb_all(void)
{
	/*
	 * This is to catch users who run with preemption enabled and the PGE
	 * feature, and who would therefore not trigger the preemptibility
	 * warning in native_flush_tlb_local().
	 */
	VM_WARN_ON_ONCE(preemptible());

	if (cpu_feature_enabled(X86_FEATURE_PGE)) {
		__flush_tlb_global();
	} else {
		/*
		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
		 */
		flush_tlb_local();
	}
}
EXPORT_SYMBOL_GPL(__flush_tlb_all);

void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	struct flush_tlb_info *info;

	int cpu = get_cpu();

	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
				  TLB_GENERATION_INVALID);
	/*
	 * flush_tlb_multi() is not optimized for the common case in which only
	 * a local TLB flush is needed. Optimize this use-case by calling
	 * flush_tlb_func() directly on the local CPU.
	 */
	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
		flush_tlb_multi(&batch->cpumask, info);
	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		lockdep_assert_irqs_enabled();
		local_irq_disable();
		flush_tlb_func(info);
		local_irq_enable();
	}

	cpumask_clear(&batch->cpumask);

	put_flush_tlb_info();
	put_cpu();
}
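
/*
 * Typical use of the batched interface above (illustrative): the reclaim
 * path unmaps many pages, records the CPUs that may hold stale entries
 * via arch_tlbbatch_add_pending(), and only then issues a single
 *
 *	arch_tlbbatch_flush(&batch);
 *
 * so that one round of IPIs covers the whole batch instead of one IPI
 * per unmapped page.
 */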

/*
 * Blindly accessing user memory from NMI context can be dangerous
 * if we're in the middle of switching the current user task or
 * switching the loaded mm.  It can also be dangerous if we
 * interrupted some kernel code that was temporarily using a
 * different mm.
 */
bool nmi_uaccess_okay(void)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	struct mm_struct *current_mm = current->mm;

	VM_WARN_ON_ONCE(!loaded_mm);

	/*
	 * The condition we want to check is
	 * current_mm->pgd == __va(read_cr3_pa()).  This may be slow, though,
	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
	 * is supposed to be reasonably fast.
	 *
	 * Instead, we check the almost equivalent but somewhat conservative
	 * condition below, and we rely on the fact that switch_mm_irqs_off()
	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
	 */
	if (loaded_mm != current_mm)
		return false;

	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));

	return true;
}

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};
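
/*
 * The ceiling can be inspected and tuned at run time through debugfs,
 * e.g. (assuming debugfs is mounted at /sys/kernel/debug):
 *
 *	cat /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *	echo 50 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 */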

static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);