// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm asynchronous fault support
 *
 * Copyright 2010 Red Hat, Inc.
 *
 * Author:
 *      Gleb Natapov <gleb@redhat.com>
 */
108c2ecf20Sopenharmony_ci
118c2ecf20Sopenharmony_ci#include <linux/kvm_host.h>
128c2ecf20Sopenharmony_ci#include <linux/slab.h>
138c2ecf20Sopenharmony_ci#include <linux/module.h>
148c2ecf20Sopenharmony_ci#include <linux/mmu_context.h>
158c2ecf20Sopenharmony_ci#include <linux/sched/mm.h>
168c2ecf20Sopenharmony_ci
178c2ecf20Sopenharmony_ci#include "async_pf.h"
188c2ecf20Sopenharmony_ci#include <trace/events/kvm.h>
198c2ecf20Sopenharmony_ci
208c2ecf20Sopenharmony_cistatic struct kmem_cache *async_pf_cache;
218c2ecf20Sopenharmony_ci
228c2ecf20Sopenharmony_ciint kvm_async_pf_init(void)
238c2ecf20Sopenharmony_ci{
248c2ecf20Sopenharmony_ci	async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_ci	if (!async_pf_cache)
278c2ecf20Sopenharmony_ci		return -ENOMEM;
288c2ecf20Sopenharmony_ci
298c2ecf20Sopenharmony_ci	return 0;
308c2ecf20Sopenharmony_ci}
318c2ecf20Sopenharmony_ci
328c2ecf20Sopenharmony_civoid kvm_async_pf_deinit(void)
338c2ecf20Sopenharmony_ci{
348c2ecf20Sopenharmony_ci	kmem_cache_destroy(async_pf_cache);
358c2ecf20Sopenharmony_ci	async_pf_cache = NULL;
368c2ecf20Sopenharmony_ci}
378c2ecf20Sopenharmony_ci
388c2ecf20Sopenharmony_civoid kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
398c2ecf20Sopenharmony_ci{
408c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&vcpu->async_pf.done);
418c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&vcpu->async_pf.queue);
428c2ecf20Sopenharmony_ci	spin_lock_init(&vcpu->async_pf.lock);
438c2ecf20Sopenharmony_ci}
448c2ecf20Sopenharmony_ci
458c2ecf20Sopenharmony_cistatic void async_pf_execute(struct work_struct *work)
468c2ecf20Sopenharmony_ci{
478c2ecf20Sopenharmony_ci	struct kvm_async_pf *apf =
488c2ecf20Sopenharmony_ci		container_of(work, struct kvm_async_pf, work);
498c2ecf20Sopenharmony_ci	struct mm_struct *mm = apf->mm;
508c2ecf20Sopenharmony_ci	struct kvm_vcpu *vcpu = apf->vcpu;
518c2ecf20Sopenharmony_ci	unsigned long addr = apf->addr;
528c2ecf20Sopenharmony_ci	gpa_t cr2_or_gpa = apf->cr2_or_gpa;
538c2ecf20Sopenharmony_ci	int locked = 1;
548c2ecf20Sopenharmony_ci	bool first;
558c2ecf20Sopenharmony_ci
568c2ecf20Sopenharmony_ci	might_sleep();
578c2ecf20Sopenharmony_ci
588c2ecf20Sopenharmony_ci	/*
598c2ecf20Sopenharmony_ci	 * This work is run asynchronously to the task which owns
608c2ecf20Sopenharmony_ci	 * mm and might be done in another context, so we must
618c2ecf20Sopenharmony_ci	 * access remotely.
628c2ecf20Sopenharmony_ci	 */
638c2ecf20Sopenharmony_ci	mmap_read_lock(mm);
648c2ecf20Sopenharmony_ci	get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, NULL,
658c2ecf20Sopenharmony_ci			&locked);
668c2ecf20Sopenharmony_ci	if (locked)
678c2ecf20Sopenharmony_ci		mmap_read_unlock(mm);
688c2ecf20Sopenharmony_ci
698c2ecf20Sopenharmony_ci	if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
708c2ecf20Sopenharmony_ci		kvm_arch_async_page_present(vcpu, apf);
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_ci	spin_lock(&vcpu->async_pf.lock);
738c2ecf20Sopenharmony_ci	first = list_empty(&vcpu->async_pf.done);
748c2ecf20Sopenharmony_ci	list_add_tail(&apf->link, &vcpu->async_pf.done);
758c2ecf20Sopenharmony_ci	apf->vcpu = NULL;
768c2ecf20Sopenharmony_ci	spin_unlock(&vcpu->async_pf.lock);
778c2ecf20Sopenharmony_ci
788c2ecf20Sopenharmony_ci	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
798c2ecf20Sopenharmony_ci		kvm_arch_async_page_present_queued(vcpu);
808c2ecf20Sopenharmony_ci
818c2ecf20Sopenharmony_ci	/*
828c2ecf20Sopenharmony_ci	 * apf may be freed by kvm_check_async_pf_completion() after
838c2ecf20Sopenharmony_ci	 * this point
848c2ecf20Sopenharmony_ci	 */
858c2ecf20Sopenharmony_ci
868c2ecf20Sopenharmony_ci	trace_kvm_async_pf_completed(addr, cr2_or_gpa);
878c2ecf20Sopenharmony_ci
888c2ecf20Sopenharmony_ci	rcuwait_wake_up(&vcpu->wait);
898c2ecf20Sopenharmony_ci
908c2ecf20Sopenharmony_ci	mmput(mm);
918c2ecf20Sopenharmony_ci	kvm_put_kvm(vcpu->kvm);
928c2ecf20Sopenharmony_ci}
938c2ecf20Sopenharmony_ci
948c2ecf20Sopenharmony_civoid kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
958c2ecf20Sopenharmony_ci{
968c2ecf20Sopenharmony_ci	spin_lock(&vcpu->async_pf.lock);
978c2ecf20Sopenharmony_ci
988c2ecf20Sopenharmony_ci	/* cancel outstanding work queue item */
998c2ecf20Sopenharmony_ci	while (!list_empty(&vcpu->async_pf.queue)) {
1008c2ecf20Sopenharmony_ci		struct kvm_async_pf *work =
1018c2ecf20Sopenharmony_ci			list_first_entry(&vcpu->async_pf.queue,
1028c2ecf20Sopenharmony_ci					 typeof(*work), queue);
1038c2ecf20Sopenharmony_ci		list_del(&work->queue);
1048c2ecf20Sopenharmony_ci
1058c2ecf20Sopenharmony_ci		/*
1068c2ecf20Sopenharmony_ci		 * We know it's present in vcpu->async_pf.done, do
1078c2ecf20Sopenharmony_ci		 * nothing here.
1088c2ecf20Sopenharmony_ci		 */
1098c2ecf20Sopenharmony_ci		if (!work->vcpu)
1108c2ecf20Sopenharmony_ci			continue;
1118c2ecf20Sopenharmony_ci
1128c2ecf20Sopenharmony_ci		spin_unlock(&vcpu->async_pf.lock);
1138c2ecf20Sopenharmony_ci#ifdef CONFIG_KVM_ASYNC_PF_SYNC
1148c2ecf20Sopenharmony_ci		flush_work(&work->work);
1158c2ecf20Sopenharmony_ci#else
1168c2ecf20Sopenharmony_ci		if (cancel_work_sync(&work->work)) {
1178c2ecf20Sopenharmony_ci			mmput(work->mm);
1188c2ecf20Sopenharmony_ci			kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
1198c2ecf20Sopenharmony_ci			kmem_cache_free(async_pf_cache, work);
1208c2ecf20Sopenharmony_ci		}
1218c2ecf20Sopenharmony_ci#endif
1228c2ecf20Sopenharmony_ci		spin_lock(&vcpu->async_pf.lock);
1238c2ecf20Sopenharmony_ci	}
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_ci	while (!list_empty(&vcpu->async_pf.done)) {
1268c2ecf20Sopenharmony_ci		struct kvm_async_pf *work =
1278c2ecf20Sopenharmony_ci			list_first_entry(&vcpu->async_pf.done,
1288c2ecf20Sopenharmony_ci					 typeof(*work), link);
1298c2ecf20Sopenharmony_ci		list_del(&work->link);
1308c2ecf20Sopenharmony_ci		kmem_cache_free(async_pf_cache, work);
1318c2ecf20Sopenharmony_ci	}
1328c2ecf20Sopenharmony_ci	spin_unlock(&vcpu->async_pf.lock);
1338c2ecf20Sopenharmony_ci
1348c2ecf20Sopenharmony_ci	vcpu->async_pf.queued = 0;
1358c2ecf20Sopenharmony_ci}
1368c2ecf20Sopenharmony_ci
1378c2ecf20Sopenharmony_civoid kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
1388c2ecf20Sopenharmony_ci{
1398c2ecf20Sopenharmony_ci	struct kvm_async_pf *work;
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_ci	while (!list_empty_careful(&vcpu->async_pf.done) &&
1428c2ecf20Sopenharmony_ci	      kvm_arch_can_dequeue_async_page_present(vcpu)) {
1438c2ecf20Sopenharmony_ci		spin_lock(&vcpu->async_pf.lock);
1448c2ecf20Sopenharmony_ci		work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
1458c2ecf20Sopenharmony_ci					      link);
1468c2ecf20Sopenharmony_ci		list_del(&work->link);
1478c2ecf20Sopenharmony_ci		spin_unlock(&vcpu->async_pf.lock);
1488c2ecf20Sopenharmony_ci
1498c2ecf20Sopenharmony_ci		kvm_arch_async_page_ready(vcpu, work);
1508c2ecf20Sopenharmony_ci		if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
1518c2ecf20Sopenharmony_ci			kvm_arch_async_page_present(vcpu, work);
1528c2ecf20Sopenharmony_ci
1538c2ecf20Sopenharmony_ci		list_del(&work->queue);
1548c2ecf20Sopenharmony_ci		vcpu->async_pf.queued--;
1558c2ecf20Sopenharmony_ci		kmem_cache_free(async_pf_cache, work);
1568c2ecf20Sopenharmony_ci	}
1578c2ecf20Sopenharmony_ci}
1588c2ecf20Sopenharmony_ci
1598c2ecf20Sopenharmony_ci/*
1608c2ecf20Sopenharmony_ci * Try to schedule a job to handle page fault asynchronously. Returns 'true' on
1618c2ecf20Sopenharmony_ci * success, 'false' on failure (page fault has to be handled synchronously).
1628c2ecf20Sopenharmony_ci */
1638c2ecf20Sopenharmony_cibool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
1648c2ecf20Sopenharmony_ci			unsigned long hva, struct kvm_arch_async_pf *arch)
1658c2ecf20Sopenharmony_ci{
1668c2ecf20Sopenharmony_ci	struct kvm_async_pf *work;
1678c2ecf20Sopenharmony_ci
1688c2ecf20Sopenharmony_ci	if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
1698c2ecf20Sopenharmony_ci		return false;
1708c2ecf20Sopenharmony_ci
1718c2ecf20Sopenharmony_ci	/* Arch specific code should not do async PF in this case */
1728c2ecf20Sopenharmony_ci	if (unlikely(kvm_is_error_hva(hva)))
1738c2ecf20Sopenharmony_ci		return false;
1748c2ecf20Sopenharmony_ci
1758c2ecf20Sopenharmony_ci	/*
1768c2ecf20Sopenharmony_ci	 * do alloc nowait since if we are going to sleep anyway we
1778c2ecf20Sopenharmony_ci	 * may as well sleep faulting in page
1788c2ecf20Sopenharmony_ci	 */
1798c2ecf20Sopenharmony_ci	work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN);
1808c2ecf20Sopenharmony_ci	if (!work)
1818c2ecf20Sopenharmony_ci		return false;
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci	work->wakeup_all = false;
1848c2ecf20Sopenharmony_ci	work->vcpu = vcpu;
1858c2ecf20Sopenharmony_ci	work->cr2_or_gpa = cr2_or_gpa;
1868c2ecf20Sopenharmony_ci	work->addr = hva;
1878c2ecf20Sopenharmony_ci	work->arch = *arch;
1888c2ecf20Sopenharmony_ci	work->mm = current->mm;
1898c2ecf20Sopenharmony_ci	mmget(work->mm);
1908c2ecf20Sopenharmony_ci	kvm_get_kvm(work->vcpu->kvm);
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_ci	INIT_WORK(&work->work, async_pf_execute);
1938c2ecf20Sopenharmony_ci
1948c2ecf20Sopenharmony_ci	list_add_tail(&work->queue, &vcpu->async_pf.queue);
1958c2ecf20Sopenharmony_ci	vcpu->async_pf.queued++;
1968c2ecf20Sopenharmony_ci	work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work);
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_ci	schedule_work(&work->work);
1998c2ecf20Sopenharmony_ci
2008c2ecf20Sopenharmony_ci	return true;
2018c2ecf20Sopenharmony_ci}
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ciint kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
2048c2ecf20Sopenharmony_ci{
2058c2ecf20Sopenharmony_ci	struct kvm_async_pf *work;
2068c2ecf20Sopenharmony_ci	bool first;
2078c2ecf20Sopenharmony_ci
2088c2ecf20Sopenharmony_ci	if (!list_empty_careful(&vcpu->async_pf.done))
2098c2ecf20Sopenharmony_ci		return 0;
2108c2ecf20Sopenharmony_ci
2118c2ecf20Sopenharmony_ci	work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
2128c2ecf20Sopenharmony_ci	if (!work)
2138c2ecf20Sopenharmony_ci		return -ENOMEM;
2148c2ecf20Sopenharmony_ci
2158c2ecf20Sopenharmony_ci	work->wakeup_all = true;
2168c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&work->queue); /* for list_del to work */
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ci	spin_lock(&vcpu->async_pf.lock);
2198c2ecf20Sopenharmony_ci	first = list_empty(&vcpu->async_pf.done);
2208c2ecf20Sopenharmony_ci	list_add_tail(&work->link, &vcpu->async_pf.done);
2218c2ecf20Sopenharmony_ci	spin_unlock(&vcpu->async_pf.lock);
2228c2ecf20Sopenharmony_ci
2238c2ecf20Sopenharmony_ci	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
2248c2ecf20Sopenharmony_ci		kvm_arch_async_page_present_queued(vcpu);
2258c2ecf20Sopenharmony_ci
2268c2ecf20Sopenharmony_ci	vcpu->async_pf.queued++;
2278c2ecf20Sopenharmony_ci	return 0;
2288c2ecf20Sopenharmony_ci}
229