162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * kvm asynchronous fault support 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright 2010 Red Hat, Inc. 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * Author: 862306a36Sopenharmony_ci * Gleb Natapov <gleb@redhat.com> 962306a36Sopenharmony_ci */ 1062306a36Sopenharmony_ci 1162306a36Sopenharmony_ci#include <linux/kvm_host.h> 1262306a36Sopenharmony_ci#include <linux/slab.h> 1362306a36Sopenharmony_ci#include <linux/module.h> 1462306a36Sopenharmony_ci#include <linux/mmu_context.h> 1562306a36Sopenharmony_ci#include <linux/sched/mm.h> 1662306a36Sopenharmony_ci 1762306a36Sopenharmony_ci#include "async_pf.h" 1862306a36Sopenharmony_ci#include <trace/events/kvm.h> 1962306a36Sopenharmony_ci 2062306a36Sopenharmony_cistatic struct kmem_cache *async_pf_cache; 2162306a36Sopenharmony_ci 2262306a36Sopenharmony_ciint kvm_async_pf_init(void) 2362306a36Sopenharmony_ci{ 2462306a36Sopenharmony_ci async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); 2562306a36Sopenharmony_ci 2662306a36Sopenharmony_ci if (!async_pf_cache) 2762306a36Sopenharmony_ci return -ENOMEM; 2862306a36Sopenharmony_ci 2962306a36Sopenharmony_ci return 0; 3062306a36Sopenharmony_ci} 3162306a36Sopenharmony_ci 3262306a36Sopenharmony_civoid kvm_async_pf_deinit(void) 3362306a36Sopenharmony_ci{ 3462306a36Sopenharmony_ci kmem_cache_destroy(async_pf_cache); 3562306a36Sopenharmony_ci async_pf_cache = NULL; 3662306a36Sopenharmony_ci} 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_civoid kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) 3962306a36Sopenharmony_ci{ 4062306a36Sopenharmony_ci INIT_LIST_HEAD(&vcpu->async_pf.done); 4162306a36Sopenharmony_ci INIT_LIST_HEAD(&vcpu->async_pf.queue); 4262306a36Sopenharmony_ci spin_lock_init(&vcpu->async_pf.lock); 4362306a36Sopenharmony_ci} 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_cistatic void async_pf_execute(struct work_struct *work) 4662306a36Sopenharmony_ci{ 4762306a36Sopenharmony_ci struct kvm_async_pf *apf = 4862306a36Sopenharmony_ci container_of(work, struct kvm_async_pf, work); 4962306a36Sopenharmony_ci struct mm_struct *mm = apf->mm; 5062306a36Sopenharmony_ci struct kvm_vcpu *vcpu = apf->vcpu; 5162306a36Sopenharmony_ci unsigned long addr = apf->addr; 5262306a36Sopenharmony_ci gpa_t cr2_or_gpa = apf->cr2_or_gpa; 5362306a36Sopenharmony_ci int locked = 1; 5462306a36Sopenharmony_ci bool first; 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci might_sleep(); 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_ci /* 5962306a36Sopenharmony_ci * This work is run asynchronously to the task which owns 6062306a36Sopenharmony_ci * mm and might be done in another context, so we must 6162306a36Sopenharmony_ci * access remotely. 6262306a36Sopenharmony_ci */ 6362306a36Sopenharmony_ci mmap_read_lock(mm); 6462306a36Sopenharmony_ci get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked); 6562306a36Sopenharmony_ci if (locked) 6662306a36Sopenharmony_ci mmap_read_unlock(mm); 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) 6962306a36Sopenharmony_ci kvm_arch_async_page_present(vcpu, apf); 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_ci spin_lock(&vcpu->async_pf.lock); 7262306a36Sopenharmony_ci first = list_empty(&vcpu->async_pf.done); 7362306a36Sopenharmony_ci list_add_tail(&apf->link, &vcpu->async_pf.done); 7462306a36Sopenharmony_ci apf->vcpu = NULL; 7562306a36Sopenharmony_ci spin_unlock(&vcpu->async_pf.lock); 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first) 7862306a36Sopenharmony_ci kvm_arch_async_page_present_queued(vcpu); 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_ci /* 8162306a36Sopenharmony_ci * apf may be freed by kvm_check_async_pf_completion() after 8262306a36Sopenharmony_ci * this point 8362306a36Sopenharmony_ci */ 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_ci trace_kvm_async_pf_completed(addr, cr2_or_gpa); 8662306a36Sopenharmony_ci 8762306a36Sopenharmony_ci __kvm_vcpu_wake_up(vcpu); 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci mmput(mm); 9062306a36Sopenharmony_ci} 9162306a36Sopenharmony_ci 9262306a36Sopenharmony_cistatic void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work) 9362306a36Sopenharmony_ci{ 9462306a36Sopenharmony_ci /* 9562306a36Sopenharmony_ci * The async #PF is "done", but KVM must wait for the work item itself, 9662306a36Sopenharmony_ci * i.e. async_pf_execute(), to run to completion. If KVM is a module, 9762306a36Sopenharmony_ci * KVM must ensure *no* code owned by the KVM (the module) can be run 9862306a36Sopenharmony_ci * after the last call to module_put(). Note, flushing the work item 9962306a36Sopenharmony_ci * is always required when the item is taken off the completion queue. 10062306a36Sopenharmony_ci * E.g. even if the vCPU handles the item in the "normal" path, the VM 10162306a36Sopenharmony_ci * could be terminated before async_pf_execute() completes. 10262306a36Sopenharmony_ci * 10362306a36Sopenharmony_ci * Wake all events skip the queue and go straight done, i.e. don't 10462306a36Sopenharmony_ci * need to be flushed (but sanity check that the work wasn't queued). 10562306a36Sopenharmony_ci */ 10662306a36Sopenharmony_ci if (work->wakeup_all) 10762306a36Sopenharmony_ci WARN_ON_ONCE(work->work.func); 10862306a36Sopenharmony_ci else 10962306a36Sopenharmony_ci flush_work(&work->work); 11062306a36Sopenharmony_ci kmem_cache_free(async_pf_cache, work); 11162306a36Sopenharmony_ci} 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_civoid kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) 11462306a36Sopenharmony_ci{ 11562306a36Sopenharmony_ci spin_lock(&vcpu->async_pf.lock); 11662306a36Sopenharmony_ci 11762306a36Sopenharmony_ci /* cancel outstanding work queue item */ 11862306a36Sopenharmony_ci while (!list_empty(&vcpu->async_pf.queue)) { 11962306a36Sopenharmony_ci struct kvm_async_pf *work = 12062306a36Sopenharmony_ci list_first_entry(&vcpu->async_pf.queue, 12162306a36Sopenharmony_ci typeof(*work), queue); 12262306a36Sopenharmony_ci list_del(&work->queue); 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_ci /* 12562306a36Sopenharmony_ci * We know it's present in vcpu->async_pf.done, do 12662306a36Sopenharmony_ci * nothing here. 12762306a36Sopenharmony_ci */ 12862306a36Sopenharmony_ci if (!work->vcpu) 12962306a36Sopenharmony_ci continue; 13062306a36Sopenharmony_ci 13162306a36Sopenharmony_ci spin_unlock(&vcpu->async_pf.lock); 13262306a36Sopenharmony_ci#ifdef CONFIG_KVM_ASYNC_PF_SYNC 13362306a36Sopenharmony_ci flush_work(&work->work); 13462306a36Sopenharmony_ci#else 13562306a36Sopenharmony_ci if (cancel_work_sync(&work->work)) { 13662306a36Sopenharmony_ci mmput(work->mm); 13762306a36Sopenharmony_ci kmem_cache_free(async_pf_cache, work); 13862306a36Sopenharmony_ci } 13962306a36Sopenharmony_ci#endif 14062306a36Sopenharmony_ci spin_lock(&vcpu->async_pf.lock); 14162306a36Sopenharmony_ci } 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci while (!list_empty(&vcpu->async_pf.done)) { 14462306a36Sopenharmony_ci struct kvm_async_pf *work = 14562306a36Sopenharmony_ci list_first_entry(&vcpu->async_pf.done, 14662306a36Sopenharmony_ci typeof(*work), link); 14762306a36Sopenharmony_ci list_del(&work->link); 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci spin_unlock(&vcpu->async_pf.lock); 15062306a36Sopenharmony_ci kvm_flush_and_free_async_pf_work(work); 15162306a36Sopenharmony_ci spin_lock(&vcpu->async_pf.lock); 15262306a36Sopenharmony_ci } 15362306a36Sopenharmony_ci spin_unlock(&vcpu->async_pf.lock); 15462306a36Sopenharmony_ci 15562306a36Sopenharmony_ci vcpu->async_pf.queued = 0; 15662306a36Sopenharmony_ci} 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_civoid kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) 15962306a36Sopenharmony_ci{ 16062306a36Sopenharmony_ci struct kvm_async_pf *work; 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_ci while (!list_empty_careful(&vcpu->async_pf.done) && 16362306a36Sopenharmony_ci kvm_arch_can_dequeue_async_page_present(vcpu)) { 16462306a36Sopenharmony_ci spin_lock(&vcpu->async_pf.lock); 16562306a36Sopenharmony_ci work = list_first_entry(&vcpu->async_pf.done, typeof(*work), 16662306a36Sopenharmony_ci link); 16762306a36Sopenharmony_ci list_del(&work->link); 16862306a36Sopenharmony_ci spin_unlock(&vcpu->async_pf.lock); 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci kvm_arch_async_page_ready(vcpu, work); 17162306a36Sopenharmony_ci if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) 17262306a36Sopenharmony_ci kvm_arch_async_page_present(vcpu, work); 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci list_del(&work->queue); 17562306a36Sopenharmony_ci vcpu->async_pf.queued--; 17662306a36Sopenharmony_ci kvm_flush_and_free_async_pf_work(work); 17762306a36Sopenharmony_ci } 17862306a36Sopenharmony_ci} 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_ci/* 18162306a36Sopenharmony_ci * Try to schedule a job to handle page fault asynchronously. Returns 'true' on 18262306a36Sopenharmony_ci * success, 'false' on failure (page fault has to be handled synchronously). 18362306a36Sopenharmony_ci */ 18462306a36Sopenharmony_cibool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 18562306a36Sopenharmony_ci unsigned long hva, struct kvm_arch_async_pf *arch) 18662306a36Sopenharmony_ci{ 18762306a36Sopenharmony_ci struct kvm_async_pf *work; 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) 19062306a36Sopenharmony_ci return false; 19162306a36Sopenharmony_ci 19262306a36Sopenharmony_ci /* Arch specific code should not do async PF in this case */ 19362306a36Sopenharmony_ci if (unlikely(kvm_is_error_hva(hva))) 19462306a36Sopenharmony_ci return false; 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci /* 19762306a36Sopenharmony_ci * do alloc nowait since if we are going to sleep anyway we 19862306a36Sopenharmony_ci * may as well sleep faulting in page 19962306a36Sopenharmony_ci */ 20062306a36Sopenharmony_ci work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN); 20162306a36Sopenharmony_ci if (!work) 20262306a36Sopenharmony_ci return false; 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci work->wakeup_all = false; 20562306a36Sopenharmony_ci work->vcpu = vcpu; 20662306a36Sopenharmony_ci work->cr2_or_gpa = cr2_or_gpa; 20762306a36Sopenharmony_ci work->addr = hva; 20862306a36Sopenharmony_ci work->arch = *arch; 20962306a36Sopenharmony_ci work->mm = current->mm; 21062306a36Sopenharmony_ci mmget(work->mm); 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci INIT_WORK(&work->work, async_pf_execute); 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci list_add_tail(&work->queue, &vcpu->async_pf.queue); 21562306a36Sopenharmony_ci vcpu->async_pf.queued++; 21662306a36Sopenharmony_ci work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work); 21762306a36Sopenharmony_ci 21862306a36Sopenharmony_ci schedule_work(&work->work); 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci return true; 22162306a36Sopenharmony_ci} 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ciint kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) 22462306a36Sopenharmony_ci{ 22562306a36Sopenharmony_ci struct kvm_async_pf *work; 22662306a36Sopenharmony_ci bool first; 22762306a36Sopenharmony_ci 22862306a36Sopenharmony_ci if (!list_empty_careful(&vcpu->async_pf.done)) 22962306a36Sopenharmony_ci return 0; 23062306a36Sopenharmony_ci 23162306a36Sopenharmony_ci work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); 23262306a36Sopenharmony_ci if (!work) 23362306a36Sopenharmony_ci return -ENOMEM; 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ci work->wakeup_all = true; 23662306a36Sopenharmony_ci INIT_LIST_HEAD(&work->queue); /* for list_del to work */ 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci spin_lock(&vcpu->async_pf.lock); 23962306a36Sopenharmony_ci first = list_empty(&vcpu->async_pf.done); 24062306a36Sopenharmony_ci list_add_tail(&work->link, &vcpu->async_pf.done); 24162306a36Sopenharmony_ci spin_unlock(&vcpu->async_pf.lock); 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_ci if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first) 24462306a36Sopenharmony_ci kvm_arch_async_page_present_queued(vcpu); 24562306a36Sopenharmony_ci 24662306a36Sopenharmony_ci vcpu->async_pf.queued++; 24762306a36Sopenharmony_ci return 0; 24862306a36Sopenharmony_ci} 249