xref: /kernel/linux/linux-6.6/virt/kvm/kvm_main.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * Authors:
12 *   Avi Kivity   <avi@qumranet.com>
13 *   Yaniv Kamay  <yaniv@qumranet.com>
14 */
15
16#include <kvm/iodev.h>
17
18#include <linux/kvm_host.h>
19#include <linux/kvm.h>
20#include <linux/module.h>
21#include <linux/errno.h>
22#include <linux/percpu.h>
23#include <linux/mm.h>
24#include <linux/miscdevice.h>
25#include <linux/vmalloc.h>
26#include <linux/reboot.h>
27#include <linux/debugfs.h>
28#include <linux/highmem.h>
29#include <linux/file.h>
30#include <linux/syscore_ops.h>
31#include <linux/cpu.h>
32#include <linux/sched/signal.h>
33#include <linux/sched/mm.h>
34#include <linux/sched/stat.h>
35#include <linux/cpumask.h>
36#include <linux/smp.h>
37#include <linux/anon_inodes.h>
38#include <linux/profile.h>
39#include <linux/kvm_para.h>
40#include <linux/pagemap.h>
41#include <linux/mman.h>
42#include <linux/swap.h>
43#include <linux/bitops.h>
44#include <linux/spinlock.h>
45#include <linux/compat.h>
46#include <linux/srcu.h>
47#include <linux/hugetlb.h>
48#include <linux/slab.h>
49#include <linux/sort.h>
50#include <linux/bsearch.h>
51#include <linux/io.h>
52#include <linux/lockdep.h>
53#include <linux/kthread.h>
54#include <linux/suspend.h>
55
56#include <asm/processor.h>
57#include <asm/ioctl.h>
58#include <linux/uaccess.h>
59
60#include "coalesced_mmio.h"
61#include "async_pf.h"
62#include "kvm_mm.h"
63#include "vfio.h"
64
65#include <trace/events/ipi.h>
66
67#define CREATE_TRACE_POINTS
68#include <trace/events/kvm.h>
69
70#include <linux/kvm_dirty_ring.h>
71
72
73/* Worst case buffer size needed for holding an integer. */
74#define ITOA_MAX_LEN 12
75
76MODULE_AUTHOR("Qumranet");
77MODULE_LICENSE("GPL");
78
79/* Architectures should define their poll value according to the halt latency */
80unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
81module_param(halt_poll_ns, uint, 0644);
82EXPORT_SYMBOL_GPL(halt_poll_ns);
83
84/* Default doubles per-vcpu halt_poll_ns. */
85unsigned int halt_poll_ns_grow = 2;
86module_param(halt_poll_ns_grow, uint, 0644);
87EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
88
89/* The start value to grow halt_poll_ns from */
90unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
91module_param(halt_poll_ns_grow_start, uint, 0644);
92EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
93
94/* Default resets per-vcpu halt_poll_ns. */
95unsigned int halt_poll_ns_shrink;
96module_param(halt_poll_ns_shrink, uint, 0644);
97EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
98
99/*
100 * Ordering of locks:
101 *
102 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
103 */
104
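/*
 * Illustrative sketch, not part of the original file: a hypothetical helper
 * showing the documented lock ordering above, i.e. kvm->lock is taken before
 * kvm->slots_lock, which is taken before kvm->irq_lock.
 */
#if 0	/* example only */
static void example_nested_locking(struct kvm *kvm)
{
	mutex_lock(&kvm->lock);
	mutex_lock(&kvm->slots_lock);
	mutex_lock(&kvm->irq_lock);
	/* ... touch state guarded by all three locks ... */
	mutex_unlock(&kvm->irq_lock);
	mutex_unlock(&kvm->slots_lock);
	mutex_unlock(&kvm->lock);
}
#endif
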
105DEFINE_MUTEX(kvm_lock);
106LIST_HEAD(vm_list);
107
108static struct kmem_cache *kvm_vcpu_cache;
109
110static __read_mostly struct preempt_ops kvm_preempt_ops;
111static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
112
113struct dentry *kvm_debugfs_dir;
114EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
115
116static const struct file_operations stat_fops_per_vm;
117
118static struct file_operations kvm_chardev_ops;
119
120static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
121			   unsigned long arg);
122#ifdef CONFIG_KVM_COMPAT
123static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
124				  unsigned long arg);
125#define KVM_COMPAT(c)	.compat_ioctl	= (c)
126#else
127/*
128 * For architectures that don't implement a compat infrastructure,
129 * adopt a double line of defense:
130 * - Prevent a compat task from opening /dev/kvm
131 * - If the open has been done by a 64bit task, and the KVM fd
132 *   passed to a compat task, let the ioctls fail.
133 */
134static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
135				unsigned long arg) { return -EINVAL; }
136
137static int kvm_no_compat_open(struct inode *inode, struct file *file)
138{
139	return is_compat_task() ? -ENODEV : 0;
140}
141#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
142			.open		= kvm_no_compat_open
143#endif
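
/*
 * Illustrative sketch, not part of the original file: KVM_COMPAT() is meant
 * to be dropped into a file_operations initializer, so the fallbacks above
 * are wired in automatically when CONFIG_KVM_COMPAT is not set.  The fops
 * name below is made up for the example.
 */
#if 0	/* example only */
static struct file_operations example_fops = {
	.unlocked_ioctl	= kvm_vcpu_ioctl,
	.llseek		= noop_llseek,
	KVM_COMPAT(kvm_vcpu_compat_ioctl),
};
#endif
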
144static int hardware_enable_all(void);
145static void hardware_disable_all(void);
146
147static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
148
149#define KVM_EVENT_CREATE_VM 0
150#define KVM_EVENT_DESTROY_VM 1
151static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
152static unsigned long long kvm_createvm_count;
153static unsigned long long kvm_active_vms;
154
155static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
156
157__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
158{
159}
160
161bool kvm_is_zone_device_page(struct page *page)
162{
163	/*
164	 * The metadata used by is_zone_device_page() to determine whether or
165	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
166	 * the page has been pinned, e.g. by get_user_pages().  WARN if the
167	 * page_count() is zero to help detect bad usage of this helper.
168	 */
169	if (WARN_ON_ONCE(!page_count(page)))
170		return false;
171
172	return is_zone_device_page(page);
173}
174
175/*
176 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
177 * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
178 * is likely incomplete; it has been compiled purely through people wanting to
179 * back guests with a certain type of memory and encountering issues.
180 */
181struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
182{
183	struct page *page;
184
185	if (!pfn_valid(pfn))
186		return NULL;
187
188	page = pfn_to_page(pfn);
189	if (!PageReserved(page))
190		return page;
191
192	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
193	if (is_zero_pfn(pfn))
194		return page;
195
196	/*
197	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
198	 * perspective they are "normal" pages, albeit with slightly different
199	 * usage rules.
200	 */
201	if (kvm_is_zone_device_page(page))
202		return page;
203
204	return NULL;
205}
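
/*
 * Illustrative sketch, not part of the original file: callers typically use
 * the helper above to skip pfns that aren't backed by a refcounted page
 * before touching page state, e.g. when marking a page dirty.
 */
#if 0	/* example only */
static void example_mark_pfn_dirty(kvm_pfn_t pfn)
{
	struct page *page = kvm_pfn_to_refcounted_page(pfn);

	if (!page)
		return;		/* Not a refcounted page; nothing to mark. */

	SetPageDirty(page);
}
#endif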
206
207/*
208 * Switches to the specified vcpu, until a matching vcpu_put()
209 */
210void vcpu_load(struct kvm_vcpu *vcpu)
211{
212	int cpu = get_cpu();
213
214	__this_cpu_write(kvm_running_vcpu, vcpu);
215	preempt_notifier_register(&vcpu->preempt_notifier);
216	kvm_arch_vcpu_load(vcpu, cpu);
217	put_cpu();
218}
219EXPORT_SYMBOL_GPL(vcpu_load);
220
221void vcpu_put(struct kvm_vcpu *vcpu)
222{
223	preempt_disable();
224	kvm_arch_vcpu_put(vcpu);
225	preempt_notifier_unregister(&vcpu->preempt_notifier);
226	__this_cpu_write(kvm_running_vcpu, NULL);
227	preempt_enable();
228}
229EXPORT_SYMBOL_GPL(vcpu_put);
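
/*
 * Illustrative sketch, not part of the original file: vcpu_load() and
 * vcpu_put() bracket operations that need the vCPU loaded on the current
 * physical CPU, e.g. the body of a vCPU ioctl handler.
 */
#if 0	/* example only */
static long example_vcpu_op(struct kvm_vcpu *vcpu)
{
	long r;

	vcpu_load(vcpu);
	r = 0;			/* ... operate on the loaded vCPU ... */
	vcpu_put(vcpu);

	return r;
}
#endif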
230
231/* TODO: merge with kvm_arch_vcpu_should_kick */
232static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
233{
234	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
235
236	/*
237	 * We need to wait for the VCPU to reenable interrupts and get out of
238	 * READING_SHADOW_PAGE_TABLES mode.
239	 */
240	if (req & KVM_REQUEST_WAIT)
241		return mode != OUTSIDE_GUEST_MODE;
242
243	/*
244	 * Need to kick a running VCPU, but otherwise there is nothing to do.
245	 */
246	return mode == IN_GUEST_MODE;
247}
248
249static void ack_kick(void *_completed)
250{
251}
252
253static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
254{
255	if (cpumask_empty(cpus))
256		return false;
257
258	smp_call_function_many(cpus, ack_kick, NULL, wait);
259	return true;
260}
261
262static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
263				  struct cpumask *tmp, int current_cpu)
264{
265	int cpu;
266
267	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
268		__kvm_make_request(req, vcpu);
269
270	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
271		return;
272
273	/*
274	 * Note, the vCPU could get migrated to a different pCPU at any point
275	 * after kvm_request_needs_ipi(), which could result in sending an IPI
276	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
277	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
278	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
279	 * after this point is also OK, as the requirement is only that KVM wait
280	 * for vCPUs that were reading SPTEs _before_ any changes were
281	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
282	 */
283	if (kvm_request_needs_ipi(vcpu, req)) {
284		cpu = READ_ONCE(vcpu->cpu);
285		if (cpu != -1 && cpu != current_cpu)
286			__cpumask_set_cpu(cpu, tmp);
287	}
288}
289
290bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
291				 unsigned long *vcpu_bitmap)
292{
293	struct kvm_vcpu *vcpu;
294	struct cpumask *cpus;
295	int i, me;
296	bool called;
297
298	me = get_cpu();
299
300	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
301	cpumask_clear(cpus);
302
303	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
304		vcpu = kvm_get_vcpu(kvm, i);
305		if (!vcpu)
306			continue;
307		kvm_make_vcpu_request(vcpu, req, cpus, me);
308	}
309
310	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
311	put_cpu();
312
313	return called;
314}
315
316bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
317				      struct kvm_vcpu *except)
318{
319	struct kvm_vcpu *vcpu;
320	struct cpumask *cpus;
321	unsigned long i;
322	bool called;
323	int me;
324
325	me = get_cpu();
326
327	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
328	cpumask_clear(cpus);
329
330	kvm_for_each_vcpu(i, vcpu, kvm) {
331		if (vcpu == except)
332			continue;
333		kvm_make_vcpu_request(vcpu, req, cpus, me);
334	}
335
336	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
337	put_cpu();
338
339	return called;
340}
341
342bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
343{
344	return kvm_make_all_cpus_request_except(kvm, req, NULL);
345}
346EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
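
/*
 * Illustrative sketch, not part of the original file: a request raised via
 * kvm_make_all_cpus_request() is consumed on the vCPU side, typically in the
 * arch vcpu_run loop, with kvm_check_request().  The flush helper named below
 * is hypothetical / arch-specific.
 */
#if 0	/* example only */
static void example_service_requests(struct kvm_vcpu *vcpu)
{
	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
		example_arch_flush_tlb(vcpu);	/* hypothetical arch hook */
}
#endif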
347
348void kvm_flush_remote_tlbs(struct kvm *kvm)
349{
350	++kvm->stat.generic.remote_tlb_flush_requests;
351
352	/*
353	 * We want to publish modifications to the page tables before reading
354	 * mode. Pairs with a memory barrier in arch-specific code.
355	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
356	 * and smp_mb in walk_shadow_page_lockless_begin/end.
357	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
358	 *
359	 * There is already an smp_mb__after_atomic() before
360	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
361	 * barrier here.
362	 */
363	if (!kvm_arch_flush_remote_tlbs(kvm)
364	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
365		++kvm->stat.generic.remote_tlb_flush;
366}
367EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
368
369void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
370{
371	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
372		return;
373
374	/*
375	 * Fall back to flushing the entire TLB if the architecture's range-based
376	 * TLB invalidation is unsupported or can't be performed for whatever
377	 * reason.
378	 */
379	kvm_flush_remote_tlbs(kvm);
380}
381
382void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
383				   const struct kvm_memory_slot *memslot)
384{
385	/*
386	 * All current use cases for flushing the TLBs for a specific memslot
387	 * are related to dirty logging, and many do the TLB flush out of
388	 * mmu_lock. The interaction between the various operations on a memslot
389	 * must be serialized by slots_lock to ensure the TLB flush from one
390	 * operation is observed by any other operation on the same memslot.
391	 */
392	lockdep_assert_held(&kvm->slots_lock);
393	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
394}
395
396static void kvm_flush_shadow_all(struct kvm *kvm)
397{
398	kvm_arch_flush_shadow_all(kvm);
399	kvm_arch_guest_memory_reclaimed(kvm);
400}
401
402#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
403static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
404					       gfp_t gfp_flags)
405{
406	gfp_flags |= mc->gfp_zero;
407
408	if (mc->kmem_cache)
409		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
410	else
411		return (void *)__get_free_page(gfp_flags);
412}
413
414int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
415{
416	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
417	void *obj;
418
419	if (mc->nobjs >= min)
420		return 0;
421
422	if (unlikely(!mc->objects)) {
423		if (WARN_ON_ONCE(!capacity))
424			return -EIO;
425
426		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
427		if (!mc->objects)
428			return -ENOMEM;
429
430		mc->capacity = capacity;
431	}
432
433	/* It is illegal to request a different capacity across topups. */
434	if (WARN_ON_ONCE(mc->capacity != capacity))
435		return -EIO;
436
437	while (mc->nobjs < mc->capacity) {
438		obj = mmu_memory_cache_alloc_obj(mc, gfp);
439		if (!obj)
440			return mc->nobjs >= min ? 0 : -ENOMEM;
441		mc->objects[mc->nobjs++] = obj;
442	}
443	return 0;
444}
445
446int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
447{
448	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
449}
450
451int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
452{
453	return mc->nobjs;
454}
455
456void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
457{
458	while (mc->nobjs) {
459		if (mc->kmem_cache)
460			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
461		else
462			free_page((unsigned long)mc->objects[--mc->nobjs]);
463	}
464
465	kvfree(mc->objects);
466
467	mc->objects = NULL;
468	mc->capacity = 0;
469}
470
471void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
472{
473	void *p;
474
475	if (WARN_ON(!mc->nobjs))
476		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
477	else
478		p = mc->objects[--mc->nobjs];
479	BUG_ON(!p);
480	return p;
481}
482#endif
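
/*
 * Illustrative sketch, not part of the original file: the memory caches above
 * are topped up in a sleepable context and then drained while holding
 * mmu_lock, where allocation must not sleep.  The minimum object count used
 * here is made up for the example.
 */
#if 0	/* example only */
static void *example_alloc_under_mmu_lock(struct kvm *kvm,
					  struct kvm_mmu_memory_cache *mc)
{
	void *obj;

	/* Sleepable context: make sure enough objects are pre-allocated. */
	if (kvm_mmu_topup_memory_cache(mc, 4))
		return NULL;

	KVM_MMU_LOCK(kvm);
	/* Atomic context: cannot fail as long as the cache was topped up. */
	obj = kvm_mmu_memory_cache_alloc(mc);
	KVM_MMU_UNLOCK(kvm);

	return obj;
}
#endif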
483
484static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
485{
486	mutex_init(&vcpu->mutex);
487	vcpu->cpu = -1;
488	vcpu->kvm = kvm;
489	vcpu->vcpu_id = id;
490	vcpu->pid = NULL;
491#ifndef __KVM_HAVE_ARCH_WQP
492	rcuwait_init(&vcpu->wait);
493#endif
494	kvm_async_pf_vcpu_init(vcpu);
495
496	kvm_vcpu_set_in_spin_loop(vcpu, false);
497	kvm_vcpu_set_dy_eligible(vcpu, false);
498	vcpu->preempted = false;
499	vcpu->ready = false;
500	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
501	vcpu->last_used_slot = NULL;
502
503	/* Fill the stats id string for the vcpu */
504	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
505		 task_pid_nr(current), id);
506}
507
508static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
509{
510	kvm_arch_vcpu_destroy(vcpu);
511	kvm_dirty_ring_free(&vcpu->dirty_ring);
512
513	/*
514	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
515	 * the vcpu->pid pointer, and at destruction time all file descriptors
516	 * are already gone.
517	 */
518	put_pid(rcu_dereference_protected(vcpu->pid, 1));
519
520	free_page((unsigned long)vcpu->run);
521	kmem_cache_free(kvm_vcpu_cache, vcpu);
522}
523
524void kvm_destroy_vcpus(struct kvm *kvm)
525{
526	unsigned long i;
527	struct kvm_vcpu *vcpu;
528
529	kvm_for_each_vcpu(i, vcpu, kvm) {
530		kvm_vcpu_destroy(vcpu);
531		xa_erase(&kvm->vcpu_array, i);
532	}
533
534	atomic_set(&kvm->online_vcpus, 0);
535}
536EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
537
538#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
539static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
540{
541	return container_of(mn, struct kvm, mmu_notifier);
542}
543
544typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
545
546typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
547			     unsigned long end);
548
549typedef void (*on_unlock_fn_t)(struct kvm *kvm);
550
551struct kvm_hva_range {
552	unsigned long start;
553	unsigned long end;
554	union kvm_mmu_notifier_arg arg;
555	hva_handler_t handler;
556	on_lock_fn_t on_lock;
557	on_unlock_fn_t on_unlock;
558	bool flush_on_ret;
559	bool may_block;
560};
561
562/*
563 * Use a dedicated stub instead of NULL to indicate that there is no callback
564 * function/handler.  The compiler technically can't guarantee that a real
565 * function will have a non-zero address, and so it will generate code to
566 * check for !NULL, whereas comparing against a stub will be elided at compile
567 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
568 */
569static void kvm_null_fn(void)
570{
571
572}
573#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
574
575static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
576
577/* Iterate over each memslot intersecting [start, last] (inclusive) range */
578#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
579	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
580	     node;							     \
581	     node = interval_tree_iter_next(node, start, last))	     \
582
583static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
584						  const struct kvm_hva_range *range)
585{
586	bool ret = false, locked = false;
587	struct kvm_gfn_range gfn_range;
588	struct kvm_memory_slot *slot;
589	struct kvm_memslots *slots;
590	int i, idx;
591
592	if (WARN_ON_ONCE(range->end <= range->start))
593		return 0;
594
595	/* A null handler is allowed if and only if on_lock() is provided. */
596	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
597			 IS_KVM_NULL_FN(range->handler)))
598		return 0;
599
600	idx = srcu_read_lock(&kvm->srcu);
601
602	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
603		struct interval_tree_node *node;
604
605		slots = __kvm_memslots(kvm, i);
606		kvm_for_each_memslot_in_hva_range(node, slots,
607						  range->start, range->end - 1) {
608			unsigned long hva_start, hva_end;
609
610			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
611			hva_start = max(range->start, slot->userspace_addr);
612			hva_end = min(range->end, slot->userspace_addr +
613						  (slot->npages << PAGE_SHIFT));
614
615			/*
616			 * To optimize for the likely case where the address
617			 * range is covered by zero or one memslots, don't
618			 * bother making these conditional (to avoid writes on
619			 * the second or later invocation of the handler).
620			 */
621			gfn_range.arg = range->arg;
622			gfn_range.may_block = range->may_block;
623
624			/*
625			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
626			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
627			 */
628			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
629			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
630			gfn_range.slot = slot;
631
632			if (!locked) {
633				locked = true;
634				KVM_MMU_LOCK(kvm);
635				if (!IS_KVM_NULL_FN(range->on_lock))
636					range->on_lock(kvm, range->start, range->end);
637				if (IS_KVM_NULL_FN(range->handler))
638					break;
639			}
640			ret |= range->handler(kvm, &gfn_range);
641		}
642	}
643
644	if (range->flush_on_ret && ret)
645		kvm_flush_remote_tlbs(kvm);
646
647	if (locked) {
648		KVM_MMU_UNLOCK(kvm);
649		if (!IS_KVM_NULL_FN(range->on_unlock))
650			range->on_unlock(kvm);
651	}
652
653	srcu_read_unlock(&kvm->srcu, idx);
654
655	/* The notifiers are averse to booleans. :-( */
656	return (int)ret;
657}
658
659static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
660						unsigned long start,
661						unsigned long end,
662						union kvm_mmu_notifier_arg arg,
663						hva_handler_t handler)
664{
665	struct kvm *kvm = mmu_notifier_to_kvm(mn);
666	const struct kvm_hva_range range = {
667		.start		= start,
668		.end		= end,
669		.arg		= arg,
670		.handler	= handler,
671		.on_lock	= (void *)kvm_null_fn,
672		.on_unlock	= (void *)kvm_null_fn,
673		.flush_on_ret	= true,
674		.may_block	= false,
675	};
676
677	return __kvm_handle_hva_range(kvm, &range);
678}
679
680static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
681							 unsigned long start,
682							 unsigned long end,
683							 hva_handler_t handler)
684{
685	struct kvm *kvm = mmu_notifier_to_kvm(mn);
686	const struct kvm_hva_range range = {
687		.start		= start,
688		.end		= end,
689		.handler	= handler,
690		.on_lock	= (void *)kvm_null_fn,
691		.on_unlock	= (void *)kvm_null_fn,
692		.flush_on_ret	= false,
693		.may_block	= false,
694	};
695
696	return __kvm_handle_hva_range(kvm, &range);
697}
698
699static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
700{
701	/*
702	 * Skipping invalid memslots is correct if and only if change_pte() is
703	 * surrounded by invalidate_range_{start,end}(), which is currently
704	 * guaranteed by the primary MMU.  If that ever changes, KVM needs to
705	 * unmap the memslot instead of skipping the memslot to ensure that KVM
706	 * doesn't hold references to the old PFN.
707	 */
708	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
709
710	if (range->slot->flags & KVM_MEMSLOT_INVALID)
711		return false;
712
713	return kvm_set_spte_gfn(kvm, range);
714}
715
716static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
717					struct mm_struct *mm,
718					unsigned long address,
719					pte_t pte)
720{
721	struct kvm *kvm = mmu_notifier_to_kvm(mn);
722	const union kvm_mmu_notifier_arg arg = { .pte = pte };
723
724	trace_kvm_set_spte_hva(address);
725
726	/*
727	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
728	 * If mmu_invalidate_in_progress is zero, then no in-progress
729	 * invalidations, including this one, found a relevant memslot at
730	 * start(); rechecking memslots here is unnecessary.  Note, a false
731	 * positive (count elevated by a different invalidation) is sub-optimal
732	 * but functionally ok.
733	 */
734	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
735	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
736		return;
737
738	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
739}
740
741void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
742			      unsigned long end)
743{
744	/*
745	 * The count increase must become visible at unlock time as no
746	 * spte can be established without taking the mmu_lock and
747	 * count is also read inside the mmu_lock critical section.
748	 */
749	kvm->mmu_invalidate_in_progress++;
750	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
751		kvm->mmu_invalidate_range_start = start;
752		kvm->mmu_invalidate_range_end = end;
753	} else {
754		/*
755		 * Fully tracking multiple concurrent ranges has diminishing
756		 * returns. Keep things simple and just find the minimal range
757		 * which includes the current and new ranges. As there won't be
758		 * enough information to subtract a range after its invalidate
759		 * completes, any ranges invalidated concurrently will
760		 * accumulate and persist until all outstanding invalidates
761		 * complete.
762		 */
763		kvm->mmu_invalidate_range_start =
764			min(kvm->mmu_invalidate_range_start, start);
765		kvm->mmu_invalidate_range_end =
766			max(kvm->mmu_invalidate_range_end, end);
767	}
768}
769
770static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
771					const struct mmu_notifier_range *range)
772{
773	struct kvm *kvm = mmu_notifier_to_kvm(mn);
774	const struct kvm_hva_range hva_range = {
775		.start		= range->start,
776		.end		= range->end,
777		.handler	= kvm_unmap_gfn_range,
778		.on_lock	= kvm_mmu_invalidate_begin,
779		.on_unlock	= kvm_arch_guest_memory_reclaimed,
780		.flush_on_ret	= true,
781		.may_block	= mmu_notifier_range_blockable(range),
782	};
783
784	trace_kvm_unmap_hva_range(range->start, range->end);
785
786	/*
787	 * Prevent memslot modification between range_start() and range_end()
788	 * so that conditionally locking provides the same result in both
789	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
790	 * adjustments will be imbalanced.
791	 *
792	 * Pairs with the decrement in range_end().
793	 */
794	spin_lock(&kvm->mn_invalidate_lock);
795	kvm->mn_active_invalidate_count++;
796	spin_unlock(&kvm->mn_invalidate_lock);
797
798	/*
799	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
800	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
801	 * each cache's lock.  There are relatively few caches in existence at
802	 * any given time, and the caches themselves can check for hva overlap,
803	 * i.e. don't need to rely on memslot overlap checks for performance.
804	 * Because this runs without holding mmu_lock, the pfn caches must use
805	 * mn_active_invalidate_count (see above) instead of
806	 * mmu_invalidate_in_progress.
807	 */
808	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
809					  hva_range.may_block);
810
811	__kvm_handle_hva_range(kvm, &hva_range);
812
813	return 0;
814}
815
816void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
817			    unsigned long end)
818{
819	/*
820	 * This sequence increase will notify the kvm page fault that
821	 * the page that is going to be mapped in the spte could have
822	 * been freed.
823	 */
824	kvm->mmu_invalidate_seq++;
825	smp_wmb();
826	/*
827	 * The above sequence increase must be visible before the
828	 * below count decrease, which is ensured by the smp_wmb above
829	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
830	 */
831	kvm->mmu_invalidate_in_progress--;
832}
833
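/*
 * Illustrative sketch, not part of the original file: consumers such as page
 * fault handlers snapshot mmu_invalidate_seq before resolving a pfn and then,
 * under mmu_lock, use mmu_invalidate_retry() (kvm_host.h) to detect a racing
 * invalidation and retry the fault.
 */
#if 0	/* example only */
static bool example_fault_raced(struct kvm *kvm, unsigned long mmu_seq)
{
	bool retry;

	KVM_MMU_LOCK(kvm);
	retry = mmu_invalidate_retry(kvm, mmu_seq);
	KVM_MMU_UNLOCK(kvm);

	return retry;
}
#endif
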
834static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
835					const struct mmu_notifier_range *range)
836{
837	struct kvm *kvm = mmu_notifier_to_kvm(mn);
838	const struct kvm_hva_range hva_range = {
839		.start		= range->start,
840		.end		= range->end,
841		.handler	= (void *)kvm_null_fn,
842		.on_lock	= kvm_mmu_invalidate_end,
843		.on_unlock	= (void *)kvm_null_fn,
844		.flush_on_ret	= false,
845		.may_block	= mmu_notifier_range_blockable(range),
846	};
847	bool wake;
848
849	__kvm_handle_hva_range(kvm, &hva_range);
850
851	/* Pairs with the increment in range_start(). */
852	spin_lock(&kvm->mn_invalidate_lock);
853	wake = (--kvm->mn_active_invalidate_count == 0);
854	spin_unlock(&kvm->mn_invalidate_lock);
855
856	/*
857	 * There can only be one waiter, since the wait happens under
858	 * slots_lock.
859	 */
860	if (wake)
861		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
862
863	BUG_ON(kvm->mmu_invalidate_in_progress < 0);
864}
865
866static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
867					      struct mm_struct *mm,
868					      unsigned long start,
869					      unsigned long end)
870{
871	trace_kvm_age_hva(start, end);
872
873	return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
874				    kvm_age_gfn);
875}
876
877static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
878					struct mm_struct *mm,
879					unsigned long start,
880					unsigned long end)
881{
882	trace_kvm_age_hva(start, end);
883
884	/*
885	 * Even though we do not flush TLB, this will still adversely
886	 * affect performance on pre-Haswell Intel EPT, where there is
887	 * no EPT Access Bit to clear so that we have to tear down EPT
888	 * tables instead. If we find this unacceptable, we can always
889	 * add a parameter to kvm_age_hva so that it effectively doesn't
890	 * do anything on clear_young.
891	 *
892	 * Also note that currently we never issue secondary TLB flushes
893	 * from clear_young, leaving this job up to the regular system
894	 * cadence. If we find this inaccurate, we might come up with a
895	 * more sophisticated heuristic later.
896	 */
897	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
898}
899
900static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
901				       struct mm_struct *mm,
902				       unsigned long address)
903{
904	trace_kvm_test_age_hva(address);
905
906	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
907					     kvm_test_age_gfn);
908}
909
910static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
911				     struct mm_struct *mm)
912{
913	struct kvm *kvm = mmu_notifier_to_kvm(mn);
914	int idx;
915
916	idx = srcu_read_lock(&kvm->srcu);
917	kvm_flush_shadow_all(kvm);
918	srcu_read_unlock(&kvm->srcu, idx);
919}
920
921static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
922	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
923	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
924	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
925	.clear_young		= kvm_mmu_notifier_clear_young,
926	.test_young		= kvm_mmu_notifier_test_young,
927	.change_pte		= kvm_mmu_notifier_change_pte,
928	.release		= kvm_mmu_notifier_release,
929};
930
931static int kvm_init_mmu_notifier(struct kvm *kvm)
932{
933	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
934	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
935}
936
937#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
938
939static int kvm_init_mmu_notifier(struct kvm *kvm)
940{
941	return 0;
942}
943
944#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
945
946#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
947static int kvm_pm_notifier_call(struct notifier_block *bl,
948				unsigned long state,
949				void *unused)
950{
951	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
952
953	return kvm_arch_pm_notifier(kvm, state);
954}
955
956static void kvm_init_pm_notifier(struct kvm *kvm)
957{
958	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
959	/* Suspend KVM before we suspend ftrace, RCU, etc. */
960	kvm->pm_notifier.priority = INT_MAX;
961	register_pm_notifier(&kvm->pm_notifier);
962}
963
964static void kvm_destroy_pm_notifier(struct kvm *kvm)
965{
966	unregister_pm_notifier(&kvm->pm_notifier);
967}
968#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
969static void kvm_init_pm_notifier(struct kvm *kvm)
970{
971}
972
973static void kvm_destroy_pm_notifier(struct kvm *kvm)
974{
975}
976#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
977
978static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
979{
980	if (!memslot->dirty_bitmap)
981		return;
982
983	kvfree(memslot->dirty_bitmap);
984	memslot->dirty_bitmap = NULL;
985}
986
987/* This does not remove the slot from struct kvm_memslots data structures */
988static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
989{
990	kvm_destroy_dirty_bitmap(slot);
991
992	kvm_arch_free_memslot(kvm, slot);
993
994	kfree(slot);
995}
996
997static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
998{
999	struct hlist_node *idnode;
1000	struct kvm_memory_slot *memslot;
1001	int bkt;
1002
1003	/*
1004	 * The same memslot objects live in both active and inactive sets;
1005	 * arbitrarily free using index '1' so that the second invocation of this
1006	 * function isn't operating over a structure with dangling pointers
1007	 * (even though this function isn't actually touching them).
1008	 */
1009	if (!slots->node_idx)
1010		return;
1011
1012	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1013		kvm_free_memslot(kvm, memslot);
1014}
1015
1016static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1017{
1018	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1019	case KVM_STATS_TYPE_INSTANT:
1020		return 0444;
1021	case KVM_STATS_TYPE_CUMULATIVE:
1022	case KVM_STATS_TYPE_PEAK:
1023	default:
1024		return 0644;
1025	}
1026}
1027
1028
1029static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1030{
1031	int i;
1032	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1033				      kvm_vcpu_stats_header.num_desc;
1034
1035	if (IS_ERR(kvm->debugfs_dentry))
1036		return;
1037
1038	debugfs_remove_recursive(kvm->debugfs_dentry);
1039
1040	if (kvm->debugfs_stat_data) {
1041		for (i = 0; i < kvm_debugfs_num_entries; i++)
1042			kfree(kvm->debugfs_stat_data[i]);
1043		kfree(kvm->debugfs_stat_data);
1044	}
1045}
1046
1047static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1048{
1049	static DEFINE_MUTEX(kvm_debugfs_lock);
1050	struct dentry *dent;
1051	char dir_name[ITOA_MAX_LEN * 2];
1052	struct kvm_stat_data *stat_data;
1053	const struct _kvm_stats_desc *pdesc;
1054	int i, ret = -ENOMEM;
1055	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1056				      kvm_vcpu_stats_header.num_desc;
1057
1058	if (!debugfs_initialized())
1059		return 0;
1060
1061	snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
1062	mutex_lock(&kvm_debugfs_lock);
1063	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
1064	if (dent) {
1065		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1066		dput(dent);
1067		mutex_unlock(&kvm_debugfs_lock);
1068		return 0;
1069	}
1070	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
1071	mutex_unlock(&kvm_debugfs_lock);
1072	if (IS_ERR(dent))
1073		return 0;
1074
1075	kvm->debugfs_dentry = dent;
1076	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1077					 sizeof(*kvm->debugfs_stat_data),
1078					 GFP_KERNEL_ACCOUNT);
1079	if (!kvm->debugfs_stat_data)
1080		goto out_err;
1081
1082	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1083		pdesc = &kvm_vm_stats_desc[i];
1084		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1085		if (!stat_data)
1086			goto out_err;
1087
1088		stat_data->kvm = kvm;
1089		stat_data->desc = pdesc;
1090		stat_data->kind = KVM_STAT_VM;
1091		kvm->debugfs_stat_data[i] = stat_data;
1092		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1093				    kvm->debugfs_dentry, stat_data,
1094				    &stat_fops_per_vm);
1095	}
1096
1097	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1098		pdesc = &kvm_vcpu_stats_desc[i];
1099		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1100		if (!stat_data)
1101			goto out_err;
1102
1103		stat_data->kvm = kvm;
1104		stat_data->desc = pdesc;
1105		stat_data->kind = KVM_STAT_VCPU;
1106		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1107		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1108				    kvm->debugfs_dentry, stat_data,
1109				    &stat_fops_per_vm);
1110	}
1111
1112	ret = kvm_arch_create_vm_debugfs(kvm);
1113	if (ret)
1114		goto out_err;
1115
1116	return 0;
1117out_err:
1118	kvm_destroy_vm_debugfs(kvm);
1119	return ret;
1120}
1121
1122/*
1123 * Called after the VM is otherwise initialized, but just before adding it to
1124 * the vm_list.
1125 */
1126int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1127{
1128	return 0;
1129}
1130
1131/*
1132 * Called just after removing the VM from the vm_list, but before doing any
1133 * other destruction.
1134 */
1135void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1136{
1137}
1138
1139/*
1140 * Called after the per-VM debugfs directory is created.  kvm->debugfs_dentry
1141 * should already be set up at that point, so arch-specific debugfs entries
1142 * can be created under it.  Cleanup is handled automatically and recursively
1143 * by kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
1144 */
1145int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1146{
1147	return 0;
1148}
1149
1150static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
1151{
1152	struct kvm *kvm = kvm_arch_alloc_vm();
1153	struct kvm_memslots *slots;
1154	int r = -ENOMEM;
1155	int i, j;
1156
1157	if (!kvm)
1158		return ERR_PTR(-ENOMEM);
1159
1160	/* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
1161	__module_get(kvm_chardev_ops.owner);
1162
1163	KVM_MMU_LOCK_INIT(kvm);
1164	mmgrab(current->mm);
1165	kvm->mm = current->mm;
1166	kvm_eventfd_init(kvm);
1167	mutex_init(&kvm->lock);
1168	mutex_init(&kvm->irq_lock);
1169	mutex_init(&kvm->slots_lock);
1170	mutex_init(&kvm->slots_arch_lock);
1171	spin_lock_init(&kvm->mn_invalidate_lock);
1172	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1173	xa_init(&kvm->vcpu_array);
1174
1175	INIT_LIST_HEAD(&kvm->gpc_list);
1176	spin_lock_init(&kvm->gpc_lock);
1177
1178	INIT_LIST_HEAD(&kvm->devices);
1179	kvm->max_vcpus = KVM_MAX_VCPUS;
1180
1181	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1182
1183	/*
1184	 * Force subsequent debugfs file creations to fail if the VM directory
1185	 * is not created (by kvm_create_vm_debugfs()).
1186	 */
1187	kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1188
1189	snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1190		 task_pid_nr(current));
1191
1192	if (init_srcu_struct(&kvm->srcu))
1193		goto out_err_no_srcu;
1194	if (init_srcu_struct(&kvm->irq_srcu))
1195		goto out_err_no_irq_srcu;
1196
1197	refcount_set(&kvm->users_count, 1);
1198	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1199		for (j = 0; j < 2; j++) {
1200			slots = &kvm->__memslots[i][j];
1201
1202			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1203			slots->hva_tree = RB_ROOT_CACHED;
1204			slots->gfn_tree = RB_ROOT;
1205			hash_init(slots->id_hash);
1206			slots->node_idx = j;
1207
1208			/* Generations must be different for each address space. */
1209			slots->generation = i;
1210		}
1211
1212		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1213	}
1214
1215	for (i = 0; i < KVM_NR_BUSES; i++) {
1216		rcu_assign_pointer(kvm->buses[i],
1217			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1218		if (!kvm->buses[i])
1219			goto out_err_no_arch_destroy_vm;
1220	}
1221
1222	r = kvm_arch_init_vm(kvm, type);
1223	if (r)
1224		goto out_err_no_arch_destroy_vm;
1225
1226	r = hardware_enable_all();
1227	if (r)
1228		goto out_err_no_disable;
1229
1230#ifdef CONFIG_HAVE_KVM_IRQFD
1231	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1232#endif
1233
1234	r = kvm_init_mmu_notifier(kvm);
1235	if (r)
1236		goto out_err_no_mmu_notifier;
1237
1238	r = kvm_coalesced_mmio_init(kvm);
1239	if (r < 0)
1240		goto out_no_coalesced_mmio;
1241
1242	r = kvm_create_vm_debugfs(kvm, fdname);
1243	if (r)
1244		goto out_err_no_debugfs;
1245
1246	r = kvm_arch_post_init_vm(kvm);
1247	if (r)
1248		goto out_err;
1249
1250	mutex_lock(&kvm_lock);
1251	list_add(&kvm->vm_list, &vm_list);
1252	mutex_unlock(&kvm_lock);
1253
1254	preempt_notifier_inc();
1255	kvm_init_pm_notifier(kvm);
1256
1257	return kvm;
1258
1259out_err:
1260	kvm_destroy_vm_debugfs(kvm);
1261out_err_no_debugfs:
1262	kvm_coalesced_mmio_free(kvm);
1263out_no_coalesced_mmio:
1264#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1265	if (kvm->mmu_notifier.ops)
1266		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1267#endif
1268out_err_no_mmu_notifier:
1269	hardware_disable_all();
1270out_err_no_disable:
1271	kvm_arch_destroy_vm(kvm);
1272out_err_no_arch_destroy_vm:
1273	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1274	for (i = 0; i < KVM_NR_BUSES; i++)
1275		kfree(kvm_get_bus(kvm, i));
1276	cleanup_srcu_struct(&kvm->irq_srcu);
1277out_err_no_irq_srcu:
1278	cleanup_srcu_struct(&kvm->srcu);
1279out_err_no_srcu:
1280	kvm_arch_free_vm(kvm);
1281	mmdrop(current->mm);
1282	module_put(kvm_chardev_ops.owner);
1283	return ERR_PTR(r);
1284}
1285
1286static void kvm_destroy_devices(struct kvm *kvm)
1287{
1288	struct kvm_device *dev, *tmp;
1289
1290	/*
1291	 * We do not need to take the kvm->lock here, because nobody else
1292	 * has a reference to the struct kvm at this point and therefore
1293	 * cannot access the devices list anyhow.
1294	 */
1295	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1296		list_del(&dev->vm_node);
1297		dev->ops->destroy(dev);
1298	}
1299}
1300
1301static void kvm_destroy_vm(struct kvm *kvm)
1302{
1303	int i;
1304	struct mm_struct *mm = kvm->mm;
1305
1306	kvm_destroy_pm_notifier(kvm);
1307	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1308	kvm_destroy_vm_debugfs(kvm);
1309	kvm_arch_sync_events(kvm);
1310	mutex_lock(&kvm_lock);
1311	list_del(&kvm->vm_list);
1312	mutex_unlock(&kvm_lock);
1313	kvm_arch_pre_destroy_vm(kvm);
1314
1315	kvm_free_irq_routing(kvm);
1316	for (i = 0; i < KVM_NR_BUSES; i++) {
1317		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1318
1319		if (bus)
1320			kvm_io_bus_destroy(bus);
1321		kvm->buses[i] = NULL;
1322	}
1323	kvm_coalesced_mmio_free(kvm);
1324#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1325	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1326	/*
1327	 * At this point, pending calls to invalidate_range_start()
1328	 * have completed but no more MMU notifiers will run, so
1329	 * mn_active_invalidate_count may remain unbalanced.
1330	 * No threads can be waiting in kvm_swap_active_memslots() as the
1331	 * last reference on KVM has been dropped, but freeing
1332	 * memslots would deadlock without this manual intervention.
1333	 */
1334	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1335	kvm->mn_active_invalidate_count = 0;
1336#else
1337	kvm_flush_shadow_all(kvm);
1338#endif
1339	kvm_arch_destroy_vm(kvm);
1340	kvm_destroy_devices(kvm);
1341	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1342		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1343		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1344	}
1345	cleanup_srcu_struct(&kvm->irq_srcu);
1346	cleanup_srcu_struct(&kvm->srcu);
1347	kvm_arch_free_vm(kvm);
1348	preempt_notifier_dec();
1349	hardware_disable_all();
1350	mmdrop(mm);
1351	module_put(kvm_chardev_ops.owner);
1352}
1353
1354void kvm_get_kvm(struct kvm *kvm)
1355{
1356	refcount_inc(&kvm->users_count);
1357}
1358EXPORT_SYMBOL_GPL(kvm_get_kvm);
1359
1360/*
1361 * Make sure the vm is not being destroyed; this is the safe version of
1362 * kvm_get_kvm().  Returns true if kvm was referenced successfully, false otherwise.
1363 */
1364bool kvm_get_kvm_safe(struct kvm *kvm)
1365{
1366	return refcount_inc_not_zero(&kvm->users_count);
1367}
1368EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1369
1370void kvm_put_kvm(struct kvm *kvm)
1371{
1372	if (refcount_dec_and_test(&kvm->users_count))
1373		kvm_destroy_vm(kvm);
1374}
1375EXPORT_SYMBOL_GPL(kvm_put_kvm);
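
/*
 * Illustrative sketch, not part of the original file: code that only holds a
 * weak pointer to a VM (e.g. found on a global list) takes a reference with
 * kvm_get_kvm_safe() and bails out if the VM is already being destroyed.
 */
#if 0	/* example only */
static void example_use_vm(struct kvm *kvm)
{
	if (!kvm_get_kvm_safe(kvm))
		return;		/* users_count already hit zero. */

	/* ... safely use @kvm ... */

	kvm_put_kvm(kvm);
}
#endif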
1376
1377/*
1378 * Used to put a reference that was taken on behalf of an object associated
1379 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1380 * of the new file descriptor fails and the reference cannot be transferred to
1381 * its final owner.  In such cases, the caller is still actively using @kvm and
1382 * will fail miserably if the refcount unexpectedly hits zero.
1383 */
1384void kvm_put_kvm_no_destroy(struct kvm *kvm)
1385{
1386	WARN_ON(refcount_dec_and_test(&kvm->users_count));
1387}
1388EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1389
1390static int kvm_vm_release(struct inode *inode, struct file *filp)
1391{
1392	struct kvm *kvm = filp->private_data;
1393
1394	kvm_irqfd_release(kvm);
1395
1396	kvm_put_kvm(kvm);
1397	return 0;
1398}
1399
1400/*
1401 * Allocation size is twice as large as the actual dirty bitmap size.
1402 * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
1403 */
1404static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1405{
1406	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1407
1408	memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1409	if (!memslot->dirty_bitmap)
1410		return -ENOMEM;
1411
1412	return 0;
1413}
1414
1415static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1416{
1417	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1418	int node_idx_inactive = active->node_idx ^ 1;
1419
1420	return &kvm->__memslots[as_id][node_idx_inactive];
1421}
1422
1423/*
1424 * Helper to get the address space ID when one of the memslot pointers may be NULL.
1425 * This also serves as a sanity check that at least one of the pointers is non-NULL,
1426 * and that their address space IDs don't diverge.
1427 */
1428static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1429				  struct kvm_memory_slot *b)
1430{
1431	if (WARN_ON_ONCE(!a && !b))
1432		return 0;
1433
1434	if (!a)
1435		return b->as_id;
1436	if (!b)
1437		return a->as_id;
1438
1439	WARN_ON_ONCE(a->as_id != b->as_id);
1440	return a->as_id;
1441}
1442
1443static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1444				struct kvm_memory_slot *slot)
1445{
1446	struct rb_root *gfn_tree = &slots->gfn_tree;
1447	struct rb_node **node, *parent;
1448	int idx = slots->node_idx;
1449
1450	parent = NULL;
1451	for (node = &gfn_tree->rb_node; *node; ) {
1452		struct kvm_memory_slot *tmp;
1453
1454		tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1455		parent = *node;
1456		if (slot->base_gfn < tmp->base_gfn)
1457			node = &(*node)->rb_left;
1458		else if (slot->base_gfn > tmp->base_gfn)
1459			node = &(*node)->rb_right;
1460		else
1461			BUG();
1462	}
1463
1464	rb_link_node(&slot->gfn_node[idx], parent, node);
1465	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1466}
1467
1468static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1469			       struct kvm_memory_slot *slot)
1470{
1471	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1472}
1473
1474static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1475				 struct kvm_memory_slot *old,
1476				 struct kvm_memory_slot *new)
1477{
1478	int idx = slots->node_idx;
1479
1480	WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1481
1482	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1483			&slots->gfn_tree);
1484}
1485
1486/*
1487 * Replace @old with @new in the inactive memslots.
1488 *
1489 * With NULL @old this simply adds @new.
1490 * With NULL @new this simply removes @old.
1491 *
1492 * If @new is non-NULL its hva_node[slots_idx] range has to be set
1493 * appropriately.
1494 */
1495static void kvm_replace_memslot(struct kvm *kvm,
1496				struct kvm_memory_slot *old,
1497				struct kvm_memory_slot *new)
1498{
1499	int as_id = kvm_memslots_get_as_id(old, new);
1500	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1501	int idx = slots->node_idx;
1502
1503	if (old) {
1504		hash_del(&old->id_node[idx]);
1505		interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1506
1507		if ((long)old == atomic_long_read(&slots->last_used_slot))
1508			atomic_long_set(&slots->last_used_slot, (long)new);
1509
1510		if (!new) {
1511			kvm_erase_gfn_node(slots, old);
1512			return;
1513		}
1514	}
1515
1516	/*
1517	 * Initialize @new's hva range.  Do this even when replacing an @old
1518	 * slot; kvm_copy_memslot() deliberately does not touch node data.
1519	 */
1520	new->hva_node[idx].start = new->userspace_addr;
1521	new->hva_node[idx].last = new->userspace_addr +
1522				  (new->npages << PAGE_SHIFT) - 1;
1523
1524	/*
1525	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(), so
1526	 * the hva_node needs to be swapped via remove+insert even though the hva can't
1527	 * change when replacing an existing slot.
1528	 */
1529	hash_add(slots->id_hash, &new->id_node[idx], new->id);
1530	interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1531
1532	/*
1533	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
1534	 * switch the node in the gfn tree instead of removing the old and
1535	 * inserting the new as two separate operations. Replacement is a
1536	 * single O(1) operation versus two O(log(n)) operations for
1537	 * remove+insert.
1538	 */
1539	if (old && old->base_gfn == new->base_gfn) {
1540		kvm_replace_gfn_node(slots, old, new);
1541	} else {
1542		if (old)
1543			kvm_erase_gfn_node(slots, old);
1544		kvm_insert_gfn_node(slots, new);
1545	}
1546}
1547
1548static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1549{
1550	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1551
1552#ifdef __KVM_HAVE_READONLY_MEM
1553	valid_flags |= KVM_MEM_READONLY;
1554#endif
1555
1556	if (mem->flags & ~valid_flags)
1557		return -EINVAL;
1558
1559	return 0;
1560}
1561
1562static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1563{
1564	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1565
1566	/* Grab the generation from the active memslots. */
1567	u64 gen = __kvm_memslots(kvm, as_id)->generation;
1568
1569	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1570	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1571
1572	/*
1573	 * Do not store the new memslots while there are invalidations in
1574	 * progress, otherwise the locking in invalidate_range_start and
1575	 * invalidate_range_end will be unbalanced.
1576	 */
1577	spin_lock(&kvm->mn_invalidate_lock);
1578	prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1579	while (kvm->mn_active_invalidate_count) {
1580		set_current_state(TASK_UNINTERRUPTIBLE);
1581		spin_unlock(&kvm->mn_invalidate_lock);
1582		schedule();
1583		spin_lock(&kvm->mn_invalidate_lock);
1584	}
1585	finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1586	rcu_assign_pointer(kvm->memslots[as_id], slots);
1587	spin_unlock(&kvm->mn_invalidate_lock);
1588
1589	/*
1590	 * Acquired in kvm_set_memslot().  Must be released before the
1591	 * synchronize_srcu_expedited() call below in order to avoid a deadlock
1592	 * with another thread acquiring slots_arch_lock in an SRCU critical section.
1593	 */
1594	mutex_unlock(&kvm->slots_arch_lock);
1595
1596	synchronize_srcu_expedited(&kvm->srcu);
1597
1598	/*
1599	 * Increment the new memslot generation a second time, dropping the
1600	 * update in-progress flag and incrementing the generation based on
1601	 * the number of address spaces.  This provides a unique and easily
1602	 * identifiable generation number while the memslots are in flux.
1603	 */
1604	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1605
1606	/*
1607	 * Generations must be unique even across address spaces.  We do not need
1608	 * a global counter for that, instead the generation space is evenly split
1609	 * across address spaces.  For example, with two address spaces, address
1610	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1611	 * use generations 1, 3, 5, ...
1612	 */
1613	gen += KVM_ADDRESS_SPACE_NUM;
1614
1615	kvm_arch_memslots_updated(kvm, gen);
1616
1617	slots->generation = gen;
1618}
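
/*
 * Illustrative sketch, not part of the original file: the generation update
 * above works out so that, with two address spaces, address space 0 observes
 * generations 0, 2, 4, ... and address space 1 observes 1, 3, 5, ..., i.e.
 * generations stay unique across address spaces without a global counter.
 */
#if 0	/* example only */
static u64 example_next_generation(u64 gen)
{
	/* Drop the in-progress flag, then step by the number of spaces. */
	gen &= ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
	return gen + KVM_ADDRESS_SPACE_NUM;
}
#endif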
1619
1620static int kvm_prepare_memory_region(struct kvm *kvm,
1621				     const struct kvm_memory_slot *old,
1622				     struct kvm_memory_slot *new,
1623				     enum kvm_mr_change change)
1624{
1625	int r;
1626
1627	/*
1628	 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1629	 * will be freed on "commit".  If logging is enabled in both old and
1630	 * new, reuse the existing bitmap.  If logging is enabled only in the
1631	 * new and KVM isn't using a ring buffer, allocate and initialize a
1632	 * new bitmap.
1633	 */
1634	if (change != KVM_MR_DELETE) {
1635		if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1636			new->dirty_bitmap = NULL;
1637		else if (old && old->dirty_bitmap)
1638			new->dirty_bitmap = old->dirty_bitmap;
1639		else if (kvm_use_dirty_bitmap(kvm)) {
1640			r = kvm_alloc_dirty_bitmap(new);
1641			if (r)
1642				return r;
1643
1644			if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1645				bitmap_set(new->dirty_bitmap, 0, new->npages);
1646		}
1647	}
1648
1649	r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1650
1651	/* Free the bitmap on failure if it was allocated above. */
1652	if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
1653		kvm_destroy_dirty_bitmap(new);
1654
1655	return r;
1656}
1657
1658static void kvm_commit_memory_region(struct kvm *kvm,
1659				     struct kvm_memory_slot *old,
1660				     const struct kvm_memory_slot *new,
1661				     enum kvm_mr_change change)
1662{
1663	int old_flags = old ? old->flags : 0;
1664	int new_flags = new ? new->flags : 0;
1665	/*
1666	 * Update the total number of memslot pages before calling the arch
1667	 * hook so that architectures can consume the result directly.
1668	 */
1669	if (change == KVM_MR_DELETE)
1670		kvm->nr_memslot_pages -= old->npages;
1671	else if (change == KVM_MR_CREATE)
1672		kvm->nr_memslot_pages += new->npages;
1673
1674	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1675		int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1676		atomic_set(&kvm->nr_memslots_dirty_logging,
1677			   atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1678	}
1679
1680	kvm_arch_commit_memory_region(kvm, old, new, change);
1681
1682	switch (change) {
1683	case KVM_MR_CREATE:
1684		/* Nothing more to do. */
1685		break;
1686	case KVM_MR_DELETE:
1687		/* Free the old memslot and all its metadata. */
1688		kvm_free_memslot(kvm, old);
1689		break;
1690	case KVM_MR_MOVE:
1691	case KVM_MR_FLAGS_ONLY:
1692		/*
1693		 * Free the dirty bitmap as needed; the check below encompasses
1694		 * both the flags and whether a ring buffer is being used.
1695		 */
1696		if (old->dirty_bitmap && !new->dirty_bitmap)
1697			kvm_destroy_dirty_bitmap(old);
1698
1699		/*
1700		 * The final quirk.  Free the detached, old slot, but only its
1701		 * memory, not any metadata.  Metadata, including arch specific
1702		 * data, may be reused by @new.
1703		 */
1704		kfree(old);
1705		break;
1706	default:
1707		BUG();
1708	}
1709}
1710
1711/*
1712 * Activate @new, which must be installed in the inactive slots by the caller,
1713 * by swapping the active slots and then propagating @new to @old once @old is
1714 * unreachable and can be safely modified.
1715 *
1716 * With NULL @old this simply adds @new to @active (while swapping the sets).
1717 * With NULL @new this simply removes @old from @active and frees it
1718 * (while also swapping the sets).
1719 */
1720static void kvm_activate_memslot(struct kvm *kvm,
1721				 struct kvm_memory_slot *old,
1722				 struct kvm_memory_slot *new)
1723{
1724	int as_id = kvm_memslots_get_as_id(old, new);
1725
1726	kvm_swap_active_memslots(kvm, as_id);
1727
1728	/* Propagate the new memslot to the now inactive memslots. */
1729	kvm_replace_memslot(kvm, old, new);
1730}
1731
1732static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1733			     const struct kvm_memory_slot *src)
1734{
1735	dest->base_gfn = src->base_gfn;
1736	dest->npages = src->npages;
1737	dest->dirty_bitmap = src->dirty_bitmap;
1738	dest->arch = src->arch;
1739	dest->userspace_addr = src->userspace_addr;
1740	dest->flags = src->flags;
1741	dest->id = src->id;
1742	dest->as_id = src->as_id;
1743}
1744
1745static void kvm_invalidate_memslot(struct kvm *kvm,
1746				   struct kvm_memory_slot *old,
1747				   struct kvm_memory_slot *invalid_slot)
1748{
1749	/*
1750	 * Mark the current slot INVALID.  As with all memslot modifications,
1751	 * this must be done on an unreachable slot to avoid modifying the
1752	 * current slot in the active tree.
1753	 */
1754	kvm_copy_memslot(invalid_slot, old);
1755	invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1756	kvm_replace_memslot(kvm, old, invalid_slot);
1757
1758	/*
1759	 * Activate the slot that is now marked INVALID, but don't propagate
1760	 * the slot to the now inactive slots. The slot is either going to be
1761	 * deleted or recreated as a new slot.
1762	 */
1763	kvm_swap_active_memslots(kvm, old->as_id);
1764
1765	/*
1766	 * From this point no new shadow pages pointing to a deleted, or moved,
1767	 * memslot will be created.  Validation of sp->gfn happens in:
1768	 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1769	 *	- kvm_is_visible_gfn (mmu_check_root)
1770	 */
1771	kvm_arch_flush_shadow_memslot(kvm, old);
1772	kvm_arch_guest_memory_reclaimed(kvm);
1773
1774	/* Was released by kvm_swap_active_memslots(), reacquire. */
1775	mutex_lock(&kvm->slots_arch_lock);
1776
1777	/*
1778	 * Copy the arch-specific field of the newly-installed slot back to the
1779	 * old slot as the arch data could have changed between releasing
1780	 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1781	 * above.  Writers are required to retrieve memslots *after* acquiring
1782	 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1783	 */
1784	old->arch = invalid_slot->arch;
1785}
1786
1787static void kvm_create_memslot(struct kvm *kvm,
1788			       struct kvm_memory_slot *new)
1789{
1790	/* Add the new memslot to the inactive set and activate. */
1791	kvm_replace_memslot(kvm, NULL, new);
1792	kvm_activate_memslot(kvm, NULL, new);
1793}
1794
1795static void kvm_delete_memslot(struct kvm *kvm,
1796			       struct kvm_memory_slot *old,
1797			       struct kvm_memory_slot *invalid_slot)
1798{
1799	/*
1800	 * Remove the old memslot (in the inactive memslots) by passing NULL as
1801	 * the "new" slot, and do the same for the invalid version in the active slots.
1802	 */
1803	kvm_replace_memslot(kvm, old, NULL);
1804	kvm_activate_memslot(kvm, invalid_slot, NULL);
1805}
1806
1807static void kvm_move_memslot(struct kvm *kvm,
1808			     struct kvm_memory_slot *old,
1809			     struct kvm_memory_slot *new,
1810			     struct kvm_memory_slot *invalid_slot)
1811{
1812	/*
1813	 * Replace the old memslot in the inactive slots, and then swap slots
1814	 * and replace the current INVALID with the new as well.
1815	 */
1816	kvm_replace_memslot(kvm, old, new);
1817	kvm_activate_memslot(kvm, invalid_slot, new);
1818}
1819
1820static void kvm_update_flags_memslot(struct kvm *kvm,
1821				     struct kvm_memory_slot *old,
1822				     struct kvm_memory_slot *new)
1823{
1824	/*
1825	 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1826	 * an intermediate step. Instead, the old memslot is simply replaced
1827	 * with a new, updated copy in both memslot sets.
1828	 */
1829	kvm_replace_memslot(kvm, old, new);
1830	kvm_activate_memslot(kvm, old, new);
1831}
1832
1833static int kvm_set_memslot(struct kvm *kvm,
1834			   struct kvm_memory_slot *old,
1835			   struct kvm_memory_slot *new,
1836			   enum kvm_mr_change change)
1837{
1838	struct kvm_memory_slot *invalid_slot;
1839	int r;
1840
1841	/*
1842	 * Released in kvm_swap_active_memslots().
1843	 *
1844	 * Must be held from before the current memslots are copied until after
1845	 * the new memslots are installed with rcu_assign_pointer, then
1846	 * released before the synchronize srcu in kvm_swap_active_memslots().
1847	 *
1848	 * When modifying memslots outside of the slots_lock, the lock must be
1849	 * held from before reading the pointer to the current memslots until
1850	 * after all changes to those memslots are complete.
1851	 *
1852	 * These rules ensure that installing new memslots does not lose
1853	 * changes made to the previous memslots.
1854	 */
1855	mutex_lock(&kvm->slots_arch_lock);
1856
1857	/*
1858	 * Invalidate the old slot if it's being deleted or moved.  This is
1859	 * done prior to actually deleting/moving the memslot to allow vCPUs to
1860	 * continue running by ensuring there are no mappings or shadow pages
1861	 * for the memslot when it is deleted/moved.  Without pre-invalidation
1862	 * (and without a lock), a window would exist between effecting the
1863	 * delete/move and committing the changes in arch code where KVM or a
1864	 * guest could access a non-existent memslot.
1865	 *
1866	 * Modifications are done on a temporary, unreachable slot.  The old
1867	 * slot needs to be preserved in case a later step fails and the
1868	 * invalidation needs to be reverted.
1869	 */
1870	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1871		invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1872		if (!invalid_slot) {
1873			mutex_unlock(&kvm->slots_arch_lock);
1874			return -ENOMEM;
1875		}
1876		kvm_invalidate_memslot(kvm, old, invalid_slot);
1877	}
1878
1879	r = kvm_prepare_memory_region(kvm, old, new, change);
1880	if (r) {
1881		/*
1882		 * For DELETE/MOVE, revert the above INVALID change.  No
1883		 * modifications required since the original slot was preserved
1884		 * in the inactive slots.  Changing the active memslots also
1885		 * releases slots_arch_lock.
1886		 */
1887		if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1888			kvm_activate_memslot(kvm, invalid_slot, old);
1889			kfree(invalid_slot);
1890		} else {
1891			mutex_unlock(&kvm->slots_arch_lock);
1892		}
1893		return r;
1894	}
1895
1896	/*
1897	 * For DELETE and MOVE, the temporary slot (@invalid_slot) is now active
1898	 * as the INVALID version of the old slot, and it holds a copy of the
1899	 * old slot's data.
1900	 * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
1901	 * old slot is detached but otherwise preserved.
1902	 */
1903	if (change == KVM_MR_CREATE)
1904		kvm_create_memslot(kvm, new);
1905	else if (change == KVM_MR_DELETE)
1906		kvm_delete_memslot(kvm, old, invalid_slot);
1907	else if (change == KVM_MR_MOVE)
1908		kvm_move_memslot(kvm, old, new, invalid_slot);
1909	else if (change == KVM_MR_FLAGS_ONLY)
1910		kvm_update_flags_memslot(kvm, old, new);
1911	else
1912		BUG();
1913
1914	/* Free the temporary INVALID slot used for DELETE and MOVE. */
1915	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1916		kfree(invalid_slot);
1917
1918	/*
1919	 * No need to refresh new->arch, changes after dropping slots_arch_lock
1920	 * will directly hit the final, active memslot.  Architectures are
1921	 * responsible for knowing that new->arch may be stale.
1922	 */
1923	kvm_commit_memory_region(kvm, old, new, change);
1924
1925	return 0;
1926}
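
/*
 * Illustrative sketch only (not taken from any caller): the two-phase flow
 * kvm_set_memslot() implements for KVM_MR_MOVE, assuming @old is the live
 * slot and @new is a freshly allocated copy of it at the new base_gfn:
 *
 *	kvm_invalidate_memslot(kvm, old, invalid_slot);    // INVALID copy goes live
 *	kvm_prepare_memory_region(kvm, old, new, KVM_MR_MOVE);
 *	kvm_move_memslot(kvm, old, new, invalid_slot);     // @new goes live
 *	kvm_commit_memory_region(kvm, old, new, KVM_MR_MOVE);
 *
 * On a kvm_prepare_memory_region() failure, the error path above instead
 * re-activates @old and frees the INVALID copy.
 */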
1927
1928static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1929				      gfn_t start, gfn_t end)
1930{
1931	struct kvm_memslot_iter iter;
1932
1933	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1934		if (iter.slot->id != id)
1935			return true;
1936	}
1937
1938	return false;
1939}
1940
1941/*
1942 * Allocate some memory and give it an address in the guest physical address
1943 * space.
1944 *
1945 * Discontiguous memory is allowed, mostly for framebuffers.
1946 *
1947 * Must be called holding kvm->slots_lock for write.
1948 */
1949int __kvm_set_memory_region(struct kvm *kvm,
1950			    const struct kvm_userspace_memory_region *mem)
1951{
1952	struct kvm_memory_slot *old, *new;
1953	struct kvm_memslots *slots;
1954	enum kvm_mr_change change;
1955	unsigned long npages;
1956	gfn_t base_gfn;
1957	int as_id, id;
1958	int r;
1959
1960	r = check_memory_region_flags(mem);
1961	if (r)
1962		return r;
1963
1964	as_id = mem->slot >> 16;
1965	id = (u16)mem->slot;
1966
1967	/* General sanity checks */
1968	if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1969	    (mem->memory_size != (unsigned long)mem->memory_size))
1970		return -EINVAL;
1971	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1972		return -EINVAL;
1973	/* We can read the guest memory with __xxx_user() later on. */
1974	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1975	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1976	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1977			mem->memory_size))
1978		return -EINVAL;
1979	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1980		return -EINVAL;
1981	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1982		return -EINVAL;
1983	if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
1984		return -EINVAL;
1985
1986	slots = __kvm_memslots(kvm, as_id);
1987
1988	/*
1989	 * Note, the old memslot (and the pointer itself!) may be invalidated
1990	 * and/or destroyed by kvm_set_memslot().
1991	 */
1992	old = id_to_memslot(slots, id);
1993
1994	if (!mem->memory_size) {
1995		if (!old || !old->npages)
1996			return -EINVAL;
1997
1998		if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
1999			return -EIO;
2000
2001		return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
2002	}
2003
2004	base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2005	npages = (mem->memory_size >> PAGE_SHIFT);
2006
2007	if (!old || !old->npages) {
2008		change = KVM_MR_CREATE;
2009
2010		/*
2011		 * To simplify KVM internals, the total number of pages across
2012		 * all memslots must fit in an unsigned long.
2013		 */
2014		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2015			return -EINVAL;
2016	} else { /* Modify an existing slot. */
2017		if ((mem->userspace_addr != old->userspace_addr) ||
2018		    (npages != old->npages) ||
2019		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2020			return -EINVAL;
2021
2022		if (base_gfn != old->base_gfn)
2023			change = KVM_MR_MOVE;
2024		else if (mem->flags != old->flags)
2025			change = KVM_MR_FLAGS_ONLY;
2026		else /* Nothing to change. */
2027			return 0;
2028	}
2029
2030	if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2031	    kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2032		return -EEXIST;
2033
2034	/* Allocate a slot that will persist in the memslots. */
2035	new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2036	if (!new)
2037		return -ENOMEM;
2038
2039	new->as_id = as_id;
2040	new->id = id;
2041	new->base_gfn = base_gfn;
2042	new->npages = npages;
2043	new->flags = mem->flags;
2044	new->userspace_addr = mem->userspace_addr;
2045
2046	r = kvm_set_memslot(kvm, old, new, change);
2047	if (r)
2048		kfree(new);
2049	return r;
2050}
2051EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2052
2053int kvm_set_memory_region(struct kvm *kvm,
2054			  const struct kvm_userspace_memory_region *mem)
2055{
2056	int r;
2057
2058	mutex_lock(&kvm->slots_lock);
2059	r = __kvm_set_memory_region(kvm, mem);
2060	mutex_unlock(&kvm->slots_lock);
2061	return r;
2062}
2063EXPORT_SYMBOL_GPL(kvm_set_memory_region);
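
/*
 * Illustrative sketch of a hypothetical in-kernel caller (the values are made
 * up): the slot field packs the address space id in the upper 16 bits and the
 * slot id in the lower 16 bits, matching the decoding in
 * __kvm_set_memory_region() (as_id = mem->slot >> 16, id = (u16)mem->slot).
 *
 *	struct kvm_userspace_memory_region mem = {
 *		.slot		 = (0 << 16) | 3,
 *		.flags		 = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0x100000,
 *		.memory_size	 = 0x100000,
 *		.userspace_addr	 = hva,	 // page-aligned, untagged user address
 *	};
 *
 *	int r = kvm_set_memory_region(kvm, &mem);
 */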
2064
2065static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2066					  struct kvm_userspace_memory_region *mem)
2067{
2068	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2069		return -EINVAL;
2070
2071	return kvm_set_memory_region(kvm, mem);
2072}
2073
2074#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2075/**
2076 * kvm_get_dirty_log - get a snapshot of dirty pages
2077 * @kvm:	pointer to kvm instance
2078 * @log:	slot id and address to which we copy the log
2079 * @is_dirty:	set to '1' if any dirty pages were found
2080 * @memslot:	set to the associated memslot, always valid on success
2081 */
2082int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2083		      int *is_dirty, struct kvm_memory_slot **memslot)
2084{
2085	struct kvm_memslots *slots;
2086	int i, as_id, id;
2087	unsigned long n;
2088	unsigned long any = 0;
2089
2090	/* Dirty ring tracking may be exclusive to dirty log tracking */
2091	if (!kvm_use_dirty_bitmap(kvm))
2092		return -ENXIO;
2093
2094	*memslot = NULL;
2095	*is_dirty = 0;
2096
2097	as_id = log->slot >> 16;
2098	id = (u16)log->slot;
2099	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2100		return -EINVAL;
2101
2102	slots = __kvm_memslots(kvm, as_id);
2103	*memslot = id_to_memslot(slots, id);
2104	if (!(*memslot) || !(*memslot)->dirty_bitmap)
2105		return -ENOENT;
2106
2107	kvm_arch_sync_dirty_log(kvm, *memslot);
2108
2109	n = kvm_dirty_bitmap_bytes(*memslot);
2110
2111	for (i = 0; !any && i < n/sizeof(long); ++i)
2112		any = (*memslot)->dirty_bitmap[i];
2113
2114	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2115		return -EFAULT;
2116
2117	if (any)
2118		*is_dirty = 1;
2119	return 0;
2120}
2121EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
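
/*
 * Illustrative sketch of a hypothetical architecture ioctl handler using the
 * helper above; whether a TLB flush is needed afterwards is arch specific.
 *
 *	struct kvm_memory_slot *memslot;
 *	int is_dirty = 0;
 *
 *	int r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot);
 *	if (!r && is_dirty)
 *		kvm_flush_remote_tlbs_memslot(kvm, memslot);
 */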
2122
2123#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2124/**
2125 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2126 *	and reenable dirty page tracking for the corresponding pages.
2127 * @kvm:	pointer to kvm instance
2128 * @log:	slot id and address to which we copy the log
2129 *
2130 * Keep in mind that VCPU threads can write to the bitmap concurrently;
2131 * to avoid losing track of dirty pages we keep the following order:
2132 * following order:
2133 *
2134 *    1. Take a snapshot of the bit and clear it if needed.
2135 *    2. Write protect the corresponding page.
2136 *    3. Copy the snapshot to userspace.
2137 *    4. Upon return caller flushes TLB's if needed.
2138 *
2139 * Between 2 and 4, the guest may write to the page using the remaining TLB
2140 * entry.  This is not a problem because the page is reported dirty using
2141 * the snapshot taken before and step 4 ensures that writes done after
2142 * exiting to userspace will be logged for the next call.
2143 *
2144 */
2145static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2146{
2147	struct kvm_memslots *slots;
2148	struct kvm_memory_slot *memslot;
2149	int i, as_id, id;
2150	unsigned long n;
2151	unsigned long *dirty_bitmap;
2152	unsigned long *dirty_bitmap_buffer;
2153	bool flush;
2154
2155	/* Dirty ring tracking may be exclusive to dirty log tracking */
2156	if (!kvm_use_dirty_bitmap(kvm))
2157		return -ENXIO;
2158
2159	as_id = log->slot >> 16;
2160	id = (u16)log->slot;
2161	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2162		return -EINVAL;
2163
2164	slots = __kvm_memslots(kvm, as_id);
2165	memslot = id_to_memslot(slots, id);
2166	if (!memslot || !memslot->dirty_bitmap)
2167		return -ENOENT;
2168
2169	dirty_bitmap = memslot->dirty_bitmap;
2170
2171	kvm_arch_sync_dirty_log(kvm, memslot);
2172
2173	n = kvm_dirty_bitmap_bytes(memslot);
2174	flush = false;
2175	if (kvm->manual_dirty_log_protect) {
2176		/*
2177		 * Unlike kvm_get_dirty_log, the local "flush" is left false
2178		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
2179		 * is some code duplication between this function and
2180		 * kvm_get_dirty_log, but hopefully all architectures will
2181		 * transition to kvm_get_dirty_log_protect so that
2182		 * kvm_get_dirty_log can be eliminated.
2183		 */
2184		dirty_bitmap_buffer = dirty_bitmap;
2185	} else {
2186		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2187		memset(dirty_bitmap_buffer, 0, n);
2188
2189		KVM_MMU_LOCK(kvm);
2190		for (i = 0; i < n / sizeof(long); i++) {
2191			unsigned long mask;
2192			gfn_t offset;
2193
2194			if (!dirty_bitmap[i])
2195				continue;
2196
2197			flush = true;
2198			mask = xchg(&dirty_bitmap[i], 0);
2199			dirty_bitmap_buffer[i] = mask;
2200
2201			offset = i * BITS_PER_LONG;
2202			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2203								offset, mask);
2204		}
2205		KVM_MMU_UNLOCK(kvm);
2206	}
2207
2208	if (flush)
2209		kvm_flush_remote_tlbs_memslot(kvm, memslot);
2210
2211	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2212		return -EFAULT;
2213	return 0;
2214}
2215
2216
2217/**
2218 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2219 * @kvm: kvm instance
2220 * @log: slot id and address to which we copy the log
2221 *
2222 * Steps 1-4 below provide a general overview of dirty page logging. See
2223 * kvm_get_dirty_log_protect() function description for additional details.
2224 *
2225 * We call kvm_get_dirty_log_protect() to handle steps 1-3 and to flush the
2226 * TLB (step 4) when write protection was re-enabled for any pages.  Regardless
2227 * of the outcome, the KVM logging API does not preclude a subsequent dirty log
2228 * read by userspace; flushing the TLB ensures that writes will be marked dirty
2229 * for the next log read.
2230 *
2231 *   1. Take a snapshot of the bit and clear it if needed.
2232 *   2. Write protect the corresponding page.
2233 *   3. Copy the snapshot to the userspace.
2234 *   4. Flush TLB's if needed.
2235 */
2236static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2237				      struct kvm_dirty_log *log)
2238{
2239	int r;
2240
2241	mutex_lock(&kvm->slots_lock);
2242
2243	r = kvm_get_dirty_log_protect(kvm, log);
2244
2245	mutex_unlock(&kvm->slots_lock);
2246	return r;
2247}
2248
2249/**
2250 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2251 *	and reenable dirty page tracking for the corresponding pages.
2252 * @kvm:	pointer to kvm instance
2253 * @log:	slot id and address from which to fetch the bitmap of dirty pages
2254 */
2255static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2256				       struct kvm_clear_dirty_log *log)
2257{
2258	struct kvm_memslots *slots;
2259	struct kvm_memory_slot *memslot;
2260	int as_id, id;
2261	gfn_t offset;
2262	unsigned long i, n;
2263	unsigned long *dirty_bitmap;
2264	unsigned long *dirty_bitmap_buffer;
2265	bool flush;
2266
2267	/* Dirty ring tracking may be exclusive to dirty log tracking */
2268	if (!kvm_use_dirty_bitmap(kvm))
2269		return -ENXIO;
2270
2271	as_id = log->slot >> 16;
2272	id = (u16)log->slot;
2273	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2274		return -EINVAL;
2275
2276	if (log->first_page & 63)
2277		return -EINVAL;
2278
2279	slots = __kvm_memslots(kvm, as_id);
2280	memslot = id_to_memslot(slots, id);
2281	if (!memslot || !memslot->dirty_bitmap)
2282		return -ENOENT;
2283
2284	dirty_bitmap = memslot->dirty_bitmap;
2285
2286	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2287
2288	if (log->first_page > memslot->npages ||
2289	    log->num_pages > memslot->npages - log->first_page ||
2290	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2291	    return -EINVAL;
2292
2293	kvm_arch_sync_dirty_log(kvm, memslot);
2294
2295	flush = false;
2296	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2297	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2298		return -EFAULT;
2299
2300	KVM_MMU_LOCK(kvm);
2301	for (offset = log->first_page, i = offset / BITS_PER_LONG,
2302		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2303	     i++, offset += BITS_PER_LONG) {
2304		unsigned long mask = *dirty_bitmap_buffer++;
2305		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2306		if (!mask)
2307			continue;
2308
2309		mask &= atomic_long_fetch_andnot(mask, p);
2310
2311		/*
2312		 * mask contains the bits that really have been cleared.  This
2313		 * never includes any bits beyond the length of the memslot (if
2314		 * the length is not aligned to 64 pages), therefore it is not
2315		 * a problem if userspace sets them in log->dirty_bitmap.
2316		 */
2317		if (mask) {
2318			flush = true;
2319			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2320								offset, mask);
2321		}
2322	}
2323	KVM_MMU_UNLOCK(kvm);
2324
2325	if (flush)
2326		kvm_flush_remote_tlbs_memslot(kvm, memslot);
2327
2328	return 0;
2329}
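
/*
 * Worked example of the range checks above (illustrative, for a 200-page
 * memslot): { first_page = 64, num_pages = 128 } is accepted, and so is
 * { first_page = 128, num_pages = 72 } because it runs to the end of the
 * slot.  { first_page = 32, ... } is rejected (first_page not 64-aligned),
 * and { first_page = 0, num_pages = 100 } is rejected because num_pages is
 * neither a multiple of 64 nor large enough to reach the end of the slot.
 */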
2330
2331static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2332					struct kvm_clear_dirty_log *log)
2333{
2334	int r;
2335
2336	mutex_lock(&kvm->slots_lock);
2337
2338	r = kvm_clear_dirty_log_protect(kvm, log);
2339
2340	mutex_unlock(&kvm->slots_lock);
2341	return r;
2342}
2343#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2344
2345struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2346{
2347	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2348}
2349EXPORT_SYMBOL_GPL(gfn_to_memslot);
2350
2351struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2352{
2353	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2354	u64 gen = slots->generation;
2355	struct kvm_memory_slot *slot;
2356
2357	/*
2358	 * This also protects against using a memslot from a different address space,
2359	 * since different address spaces have different generation numbers.
2360	 */
2361	if (unlikely(gen != vcpu->last_used_slot_gen)) {
2362		vcpu->last_used_slot = NULL;
2363		vcpu->last_used_slot_gen = gen;
2364	}
2365
2366	slot = try_get_memslot(vcpu->last_used_slot, gfn);
2367	if (slot)
2368		return slot;
2369
2370	/*
2371	 * Fall back to searching all memslots. We purposely use
2372	 * search_memslots() instead of __gfn_to_memslot() to avoid
2373	 * thrashing the VM-wide last_used_slot in kvm_memslots.
2374	 */
2375	slot = search_memslots(slots, gfn, false);
2376	if (slot) {
2377		vcpu->last_used_slot = slot;
2378		return slot;
2379	}
2380
2381	return NULL;
2382}
2383
2384bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2385{
2386	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2387
2388	return kvm_is_visible_memslot(memslot);
2389}
2390EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2391
2392bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2393{
2394	struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2395
2396	return kvm_is_visible_memslot(memslot);
2397}
2398EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2399
2400unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2401{
2402	struct vm_area_struct *vma;
2403	unsigned long addr, size;
2404
2405	size = PAGE_SIZE;
2406
2407	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2408	if (kvm_is_error_hva(addr))
2409		return PAGE_SIZE;
2410
2411	mmap_read_lock(current->mm);
2412	vma = find_vma(current->mm, addr);
2413	if (!vma)
2414		goto out;
2415
2416	size = vma_kernel_pagesize(vma);
2417
2418out:
2419	mmap_read_unlock(current->mm);
2420
2421	return size;
2422}
2423
2424static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2425{
2426	return slot->flags & KVM_MEM_READONLY;
2427}
2428
2429static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2430				       gfn_t *nr_pages, bool write)
2431{
2432	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2433		return KVM_HVA_ERR_BAD;
2434
2435	if (memslot_is_readonly(slot) && write)
2436		return KVM_HVA_ERR_RO_BAD;
2437
2438	if (nr_pages)
2439		*nr_pages = slot->npages - (gfn - slot->base_gfn);
2440
2441	return __gfn_to_hva_memslot(slot, gfn);
2442}
2443
2444static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2445				     gfn_t *nr_pages)
2446{
2447	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2448}
2449
2450unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2451					gfn_t gfn)
2452{
2453	return gfn_to_hva_many(slot, gfn, NULL);
2454}
2455EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2456
2457unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2458{
2459	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2460}
2461EXPORT_SYMBOL_GPL(gfn_to_hva);
2462
2463unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2464{
2465	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2466}
2467EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2468
2469/*
2470 * Return the hva of a @gfn and the R/W attribute if possible.
2471 *
2472 * @slot: the kvm_memory_slot which contains @gfn
2473 * @gfn: the gfn to be translated
2474 * @writable: used to return the read/write attribute of the @slot if the hva
2475 * is valid and @writable is not NULL
2476 */
2477unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2478				      gfn_t gfn, bool *writable)
2479{
2480	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2481
2482	if (!kvm_is_error_hva(hva) && writable)
2483		*writable = !memslot_is_readonly(slot);
2484
2485	return hva;
2486}
2487
2488unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2489{
2490	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2491
2492	return gfn_to_hva_memslot_prot(slot, gfn, writable);
2493}
2494
2495unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2496{
2497	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2498
2499	return gfn_to_hva_memslot_prot(slot, gfn, writable);
2500}
2501
2502static inline int check_user_page_hwpoison(unsigned long addr)
2503{
2504	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2505
2506	rc = get_user_pages(addr, 1, flags, NULL);
2507	return rc == -EHWPOISON;
2508}
2509
2510/*
2511 * The fast path to get the writable pfn, which will be stored in @pfn.
2512 * True indicates success, otherwise false is returned.  It's also the
2513 * only part that can run in atomic context.
2514 */
2515static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2516			    bool *writable, kvm_pfn_t *pfn)
2517{
2518	struct page *page[1];
2519
2520	/*
2521	 * Fast pin a writable pfn only if it is a write fault request
2522	 * or the caller allows mapping a writable pfn for a read fault
2523	 * request.
2524	 */
2525	if (!(write_fault || writable))
2526		return false;
2527
2528	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2529		*pfn = page_to_pfn(page[0]);
2530
2531		if (writable)
2532			*writable = true;
2533		return true;
2534	}
2535
2536	return false;
2537}
2538
2539/*
2540 * The slow path to get the pfn of the specified host virtual address,
2541 * 1 indicates success, -errno is returned if error is detected.
2542 */
2543static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2544			   bool interruptible, bool *writable, kvm_pfn_t *pfn)
2545{
2546	/*
2547	 * When a VCPU accesses a page that is not mapped into the secondary
2548	 * MMU, we look up the page using GUP to map it, so the guest VCPU can
2549	 * make progress. We always want to honor NUMA hinting faults in that
2550	 * case, because GUP usage corresponds to memory accesses from the VCPU.
2551	 * Otherwise, we'd not trigger NUMA hinting faults once a page is
2552	 * mapped into the secondary MMU and gets accessed by a VCPU.
2553	 *
2554	 * Note that get_user_page_fast_only() and FOLL_WRITE for now
2555	 * implicitly honor NUMA hinting faults and don't need this flag.
2556	 */
2557	unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
2558	struct page *page;
2559	int npages;
2560
2561	might_sleep();
2562
2563	if (writable)
2564		*writable = write_fault;
2565
2566	if (write_fault)
2567		flags |= FOLL_WRITE;
2568	if (async)
2569		flags |= FOLL_NOWAIT;
2570	if (interruptible)
2571		flags |= FOLL_INTERRUPTIBLE;
2572
2573	npages = get_user_pages_unlocked(addr, 1, &page, flags);
2574	if (npages != 1)
2575		return npages;
2576
2577	/* map read fault as writable if possible */
2578	if (unlikely(!write_fault) && writable) {
2579		struct page *wpage;
2580
2581		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2582			*writable = true;
2583			put_page(page);
2584			page = wpage;
2585		}
2586	}
2587	*pfn = page_to_pfn(page);
2588	return npages;
2589}
2590
2591static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2592{
2593	if (unlikely(!(vma->vm_flags & VM_READ)))
2594		return false;
2595
2596	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2597		return false;
2598
2599	return true;
2600}
2601
2602static int kvm_try_get_pfn(kvm_pfn_t pfn)
2603{
2604	struct page *page = kvm_pfn_to_refcounted_page(pfn);
2605
2606	if (!page)
2607		return 1;
2608
2609	return get_page_unless_zero(page);
2610}
2611
2612static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2613			       unsigned long addr, bool write_fault,
2614			       bool *writable, kvm_pfn_t *p_pfn)
2615{
2616	kvm_pfn_t pfn;
2617	pte_t *ptep;
2618	pte_t pte;
2619	spinlock_t *ptl;
2620	int r;
2621
2622	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2623	if (r) {
2624		/*
2625		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2626		 * not call the fault handler, so do it here.
2627		 */
2628		bool unlocked = false;
2629		r = fixup_user_fault(current->mm, addr,
2630				     (write_fault ? FAULT_FLAG_WRITE : 0),
2631				     &unlocked);
2632		if (unlocked)
2633			return -EAGAIN;
2634		if (r)
2635			return r;
2636
2637		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2638		if (r)
2639			return r;
2640	}
2641
2642	pte = ptep_get(ptep);
2643
2644	if (write_fault && !pte_write(pte)) {
2645		pfn = KVM_PFN_ERR_RO_FAULT;
2646		goto out;
2647	}
2648
2649	if (writable)
2650		*writable = pte_write(pte);
2651	pfn = pte_pfn(pte);
2652
2653	/*
2654	 * Get a reference here because callers of *hva_to_pfn* and
2655	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2656	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
2657	 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2658	 * simply do nothing for reserved pfns.
2659	 *
2660	 * Whoever called remap_pfn_range is also going to call e.g.
2661	 * unmap_mapping_range before the underlying pages are freed,
2662	 * causing a call to our MMU notifier.
2663	 *
2664	 * Certain IO or PFNMAP mappings can be backed with valid
2665	 * struct pages, but be allocated without refcounting e.g.,
2666	 * tail pages of non-compound higher order allocations, which
2667	 * would then underflow the refcount when the caller does the
2668	 * required put_page. Don't allow those pages here.
2669	 */
2670	if (!kvm_try_get_pfn(pfn))
2671		r = -EFAULT;
2672
2673out:
2674	pte_unmap_unlock(ptep, ptl);
2675	*p_pfn = pfn;
2676
2677	return r;
2678}
2679
2680/*
2681 * Pin guest page in memory and return its pfn.
2682 * @addr: host virtual address which maps memory to the guest
2683 * @atomic: whether this function is called in atomic context (must not sleep)
2684 * @interruptible: whether the process can be interrupted by non-fatal signals
2685 * @async: if non-NULL, don't wait for I/O when the host page is not in
2686 *         memory; *@async is set if the fault could be handled asynchronously
2687 * @write_fault: whether we should get a writable host page
2688 * @writable: whether mapping a writable host page is allowed for !@write_fault
2689 *
2690 * The function will map a writable host page for these two cases:
2691 * 1): @write_fault = true
2692 * 2): @write_fault = false && @writable, @writable will tell the caller
2693 *     whether the mapping is writable.
2694 */
2695kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2696		     bool *async, bool write_fault, bool *writable)
2697{
2698	struct vm_area_struct *vma;
2699	kvm_pfn_t pfn;
2700	int npages, r;
2701
2702	/* we can do it either atomically or asynchronously, not both */
2703	BUG_ON(atomic && async);
2704
2705	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2706		return pfn;
2707
2708	if (atomic)
2709		return KVM_PFN_ERR_FAULT;
2710
2711	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2712				 writable, &pfn);
2713	if (npages == 1)
2714		return pfn;
2715	if (npages == -EINTR)
2716		return KVM_PFN_ERR_SIGPENDING;
2717
2718	mmap_read_lock(current->mm);
2719	if (npages == -EHWPOISON ||
2720	      (!async && check_user_page_hwpoison(addr))) {
2721		pfn = KVM_PFN_ERR_HWPOISON;
2722		goto exit;
2723	}
2724
2725retry:
2726	vma = vma_lookup(current->mm, addr);
2727
2728	if (vma == NULL)
2729		pfn = KVM_PFN_ERR_FAULT;
2730	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2731		r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
2732		if (r == -EAGAIN)
2733			goto retry;
2734		if (r < 0)
2735			pfn = KVM_PFN_ERR_FAULT;
2736	} else {
2737		if (async && vma_is_valid(vma, write_fault))
2738			*async = true;
2739		pfn = KVM_PFN_ERR_FAULT;
2740	}
2741exit:
2742	mmap_read_unlock(current->mm);
2743	return pfn;
2744}
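
/*
 * Illustrative sketch of the @write_fault/@writable contract (hypothetical
 * caller): fault a page in for read, but opportunistically learn whether the
 * resulting mapping is writable.
 *
 *	bool writable = false;
 *	kvm_pfn_t pfn;
 *
 *	pfn = hva_to_pfn(addr, false, false, NULL, false, &writable);
 *	if (!is_error_noslot_pfn(pfn) && writable)
 *		... the host mapping may be written ...
 */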
2745
2746kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2747			       bool atomic, bool interruptible, bool *async,
2748			       bool write_fault, bool *writable, hva_t *hva)
2749{
2750	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2751
2752	if (hva)
2753		*hva = addr;
2754
2755	if (addr == KVM_HVA_ERR_RO_BAD) {
2756		if (writable)
2757			*writable = false;
2758		return KVM_PFN_ERR_RO_FAULT;
2759	}
2760
2761	if (kvm_is_error_hva(addr)) {
2762		if (writable)
2763			*writable = false;
2764		return KVM_PFN_NOSLOT;
2765	}
2766
2767	/* Do not map writable pfn in the readonly memslot. */
2768	if (writable && memslot_is_readonly(slot)) {
2769		*writable = false;
2770		writable = NULL;
2771	}
2772
2773	return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
2774			  writable);
2775}
2776EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2777
2778kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2779		      bool *writable)
2780{
2781	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
2782				    NULL, write_fault, writable, NULL);
2783}
2784EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2785
2786kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2787{
2788	return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
2789				    NULL, NULL);
2790}
2791EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2792
2793kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2794{
2795	return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
2796				    NULL, NULL);
2797}
2798EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2799
2800kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2801{
2802	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2803}
2804EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2805
2806kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2807{
2808	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2809}
2810EXPORT_SYMBOL_GPL(gfn_to_pfn);
2811
2812kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2813{
2814	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2815}
2816EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2817
2818int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2819			    struct page **pages, int nr_pages)
2820{
2821	unsigned long addr;
2822	gfn_t entry = 0;
2823
2824	addr = gfn_to_hva_many(slot, gfn, &entry);
2825	if (kvm_is_error_hva(addr))
2826		return -1;
2827
2828	if (entry < nr_pages)
2829		return 0;
2830
2831	return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2832}
2833EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2834
2835/*
2836 * Do not use this helper unless you are absolutely certain the gfn _must_ be
2837 * backed by 'struct page'.  A valid example is if the backing memslot is
2838 * controlled by KVM.  Note, if the returned page is valid, its refcount has
2839 * been elevated by gfn_to_pfn().
2840 */
2841struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2842{
2843	struct page *page;
2844	kvm_pfn_t pfn;
2845
2846	pfn = gfn_to_pfn(kvm, gfn);
2847
2848	if (is_error_noslot_pfn(pfn))
2849		return KVM_ERR_PTR_BAD_PAGE;
2850
2851	page = kvm_pfn_to_refcounted_page(pfn);
2852	if (!page)
2853		return KVM_ERR_PTR_BAD_PAGE;
2854
2855	return page;
2856}
2857EXPORT_SYMBOL_GPL(gfn_to_page);
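
/*
 * Illustrative usage sketch (hypothetical caller): a page obtained via
 * gfn_to_page() must be released with kvm_release_page_clean() or
 * kvm_release_page_dirty() to drop the reference taken by gfn_to_pfn().
 *
 *	struct page *page = gfn_to_page(kvm, gfn);
 *
 *	if (!is_error_page(page)) {
 *		... access the page contents ...
 *		kvm_release_page_dirty(page);
 *	}
 */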
2858
2859void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2860{
2861	if (dirty)
2862		kvm_release_pfn_dirty(pfn);
2863	else
2864		kvm_release_pfn_clean(pfn);
2865}
2866
2867int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2868{
2869	kvm_pfn_t pfn;
2870	void *hva = NULL;
2871	struct page *page = KVM_UNMAPPED_PAGE;
2872
2873	if (!map)
2874		return -EINVAL;
2875
2876	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2877	if (is_error_noslot_pfn(pfn))
2878		return -EINVAL;
2879
2880	if (pfn_valid(pfn)) {
2881		page = pfn_to_page(pfn);
2882		hva = kmap(page);
2883#ifdef CONFIG_HAS_IOMEM
2884	} else {
2885		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2886#endif
2887	}
2888
2889	if (!hva)
2890		return -EFAULT;
2891
2892	map->page = page;
2893	map->hva = hva;
2894	map->pfn = pfn;
2895	map->gfn = gfn;
2896
2897	return 0;
2898}
2899EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2900
2901void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2902{
2903	if (!map)
2904		return;
2905
2906	if (!map->hva)
2907		return;
2908
2909	if (map->page != KVM_UNMAPPED_PAGE)
2910		kunmap(map->page);
2911#ifdef CONFIG_HAS_IOMEM
2912	else
2913		memunmap(map->hva);
2914#endif
2915
2916	if (dirty)
2917		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
2918
2919	kvm_release_pfn(map->pfn, dirty);
2920
2921	map->hva = NULL;
2922	map->page = NULL;
2923}
2924EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
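
/*
 * Illustrative sketch (hypothetical caller): kvm_vcpu_map() and
 * kvm_vcpu_unmap() are meant to be used as a pair, with @dirty set when the
 * mapping was written through map.hva.
 *
 *	struct kvm_host_map map;
 *
 *	if (!kvm_vcpu_map(vcpu, gfn, &map)) {
 *		memset(map.hva, 0, PAGE_SIZE);
 *		kvm_vcpu_unmap(vcpu, &map, true);
 *	}
 */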
2925
2926static bool kvm_is_ad_tracked_page(struct page *page)
2927{
2928	/*
2929	 * Per page-flags.h, pages tagged PG_reserved "should in general not be
2930	 * touched (e.g. set dirty) except by its owner".
2931	 */
2932	return !PageReserved(page);
2933}
2934
2935static void kvm_set_page_dirty(struct page *page)
2936{
2937	if (kvm_is_ad_tracked_page(page))
2938		SetPageDirty(page);
2939}
2940
2941static void kvm_set_page_accessed(struct page *page)
2942{
2943	if (kvm_is_ad_tracked_page(page))
2944		mark_page_accessed(page);
2945}
2946
2947void kvm_release_page_clean(struct page *page)
2948{
2949	WARN_ON(is_error_page(page));
2950
2951	kvm_set_page_accessed(page);
2952	put_page(page);
2953}
2954EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2955
2956void kvm_release_pfn_clean(kvm_pfn_t pfn)
2957{
2958	struct page *page;
2959
2960	if (is_error_noslot_pfn(pfn))
2961		return;
2962
2963	page = kvm_pfn_to_refcounted_page(pfn);
2964	if (!page)
2965		return;
2966
2967	kvm_release_page_clean(page);
2968}
2969EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2970
2971void kvm_release_page_dirty(struct page *page)
2972{
2973	WARN_ON(is_error_page(page));
2974
2975	kvm_set_page_dirty(page);
2976	kvm_release_page_clean(page);
2977}
2978EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2979
2980void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2981{
2982	struct page *page;
2983
2984	if (is_error_noslot_pfn(pfn))
2985		return;
2986
2987	page = kvm_pfn_to_refcounted_page(pfn);
2988	if (!page)
2989		return;
2990
2991	kvm_release_page_dirty(page);
2992}
2993EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2994
2995/*
2996 * Note, checking for an error/noslot pfn is the caller's responsibility when
2997 * directly marking a page dirty/accessed.  Unlike the "release" helpers, the
2998 * "set" helpers are not to be used when the pfn might point at garbage.
2999 */
3000void kvm_set_pfn_dirty(kvm_pfn_t pfn)
3001{
3002	if (WARN_ON(is_error_noslot_pfn(pfn)))
3003		return;
3004
3005	if (pfn_valid(pfn))
3006		kvm_set_page_dirty(pfn_to_page(pfn));
3007}
3008EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3009
3010void kvm_set_pfn_accessed(kvm_pfn_t pfn)
3011{
3012	if (WARN_ON(is_error_noslot_pfn(pfn)))
3013		return;
3014
3015	if (pfn_valid(pfn))
3016		kvm_set_page_accessed(pfn_to_page(pfn));
3017}
3018EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3019
3020static int next_segment(unsigned long len, int offset)
3021{
3022	if (len > PAGE_SIZE - offset)
3023		return PAGE_SIZE - offset;
3024	else
3025		return len;
3026}
3027
3028static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3029				 void *data, int offset, int len)
3030{
3031	int r;
3032	unsigned long addr;
3033
3034	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3035	if (kvm_is_error_hva(addr))
3036		return -EFAULT;
3037	r = __copy_from_user(data, (void __user *)addr + offset, len);
3038	if (r)
3039		return -EFAULT;
3040	return 0;
3041}
3042
3043int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3044			int len)
3045{
3046	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3047
3048	return __kvm_read_guest_page(slot, gfn, data, offset, len);
3049}
3050EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3051
3052int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3053			     int offset, int len)
3054{
3055	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3056
3057	return __kvm_read_guest_page(slot, gfn, data, offset, len);
3058}
3059EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3060
3061int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3062{
3063	gfn_t gfn = gpa >> PAGE_SHIFT;
3064	int seg;
3065	int offset = offset_in_page(gpa);
3066	int ret;
3067
3068	while ((seg = next_segment(len, offset)) != 0) {
3069		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3070		if (ret < 0)
3071			return ret;
3072		offset = 0;
3073		len -= seg;
3074		data += seg;
3075		++gfn;
3076	}
3077	return 0;
3078}
3079EXPORT_SYMBOL_GPL(kvm_read_guest);
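
/*
 * Worked example of the next_segment() splitting above (illustrative, with
 * 4 KiB pages): a 6000-byte read at gpa 0x1f00 is handled as three calls to
 * kvm_read_guest_page() of 256 bytes (PAGE_SIZE - 0xf00), 4096 bytes, and
 * 1648 bytes, with offset 0 after the first chunk.
 */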
3080
3081int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3082{
3083	gfn_t gfn = gpa >> PAGE_SHIFT;
3084	int seg;
3085	int offset = offset_in_page(gpa);
3086	int ret;
3087
3088	while ((seg = next_segment(len, offset)) != 0) {
3089		ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3090		if (ret < 0)
3091			return ret;
3092		offset = 0;
3093		len -= seg;
3094		data += seg;
3095		++gfn;
3096	}
3097	return 0;
3098}
3099EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3100
3101static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3102			           void *data, int offset, unsigned long len)
3103{
3104	int r;
3105	unsigned long addr;
3106
3107	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3108	if (kvm_is_error_hva(addr))
3109		return -EFAULT;
3110	pagefault_disable();
3111	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3112	pagefault_enable();
3113	if (r)
3114		return -EFAULT;
3115	return 0;
3116}
3117
3118int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3119			       void *data, unsigned long len)
3120{
3121	gfn_t gfn = gpa >> PAGE_SHIFT;
3122	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3123	int offset = offset_in_page(gpa);
3124
3125	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3126}
3127EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3128
3129static int __kvm_write_guest_page(struct kvm *kvm,
3130				  struct kvm_memory_slot *memslot, gfn_t gfn,
3131			          const void *data, int offset, int len)
3132{
3133	int r;
3134	unsigned long addr;
3135
3136	addr = gfn_to_hva_memslot(memslot, gfn);
3137	if (kvm_is_error_hva(addr))
3138		return -EFAULT;
3139	r = __copy_to_user((void __user *)addr + offset, data, len);
3140	if (r)
3141		return -EFAULT;
3142	mark_page_dirty_in_slot(kvm, memslot, gfn);
3143	return 0;
3144}
3145
3146int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3147			 const void *data, int offset, int len)
3148{
3149	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3150
3151	return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3152}
3153EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3154
3155int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3156			      const void *data, int offset, int len)
3157{
3158	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3159
3160	return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3161}
3162EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3163
3164int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3165		    unsigned long len)
3166{
3167	gfn_t gfn = gpa >> PAGE_SHIFT;
3168	int seg;
3169	int offset = offset_in_page(gpa);
3170	int ret;
3171
3172	while ((seg = next_segment(len, offset)) != 0) {
3173		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3174		if (ret < 0)
3175			return ret;
3176		offset = 0;
3177		len -= seg;
3178		data += seg;
3179		++gfn;
3180	}
3181	return 0;
3182}
3183EXPORT_SYMBOL_GPL(kvm_write_guest);
3184
3185int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3186		         unsigned long len)
3187{
3188	gfn_t gfn = gpa >> PAGE_SHIFT;
3189	int seg;
3190	int offset = offset_in_page(gpa);
3191	int ret;
3192
3193	while ((seg = next_segment(len, offset)) != 0) {
3194		ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3195		if (ret < 0)
3196			return ret;
3197		offset = 0;
3198		len -= seg;
3199		data += seg;
3200		++gfn;
3201	}
3202	return 0;
3203}
3204EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3205
3206static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3207				       struct gfn_to_hva_cache *ghc,
3208				       gpa_t gpa, unsigned long len)
3209{
3210	int offset = offset_in_page(gpa);
3211	gfn_t start_gfn = gpa >> PAGE_SHIFT;
3212	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3213	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3214	gfn_t nr_pages_avail;
3215
3216	/* Update ghc->generation before performing any error checks. */
3217	ghc->generation = slots->generation;
3218
3219	if (start_gfn > end_gfn) {
3220		ghc->hva = KVM_HVA_ERR_BAD;
3221		return -EINVAL;
3222	}
3223
3224	/*
3225	 * If the requested region crosses two or more memslots, we still
3226	 * verify that the entire region is valid here.
3227	 */
3228	for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3229		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3230		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3231					   &nr_pages_avail);
3232		if (kvm_is_error_hva(ghc->hva))
3233			return -EFAULT;
3234	}
3235
3236	/* Use the slow path for cross page reads and writes. */
3237	if (nr_pages_needed == 1)
3238		ghc->hva += offset;
3239	else
3240		ghc->memslot = NULL;
3241
3242	ghc->gpa = gpa;
3243	ghc->len = len;
3244	return 0;
3245}
3246
3247int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3248			      gpa_t gpa, unsigned long len)
3249{
3250	struct kvm_memslots *slots = kvm_memslots(kvm);
3251	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3252}
3253EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
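
/*
 * Illustrative usage sketch (hypothetical caller): initialize the cache once
 * for a guest structure, then write it repeatedly without re-walking the
 * memslots.  The cache is revalidated automatically when the memslot
 * generation changes.
 *
 *	struct gfn_to_hva_cache ghc;
 *	u64 val = 0;
 *
 *	if (!kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
 */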
3254
3255int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3256				  void *data, unsigned int offset,
3257				  unsigned long len)
3258{
3259	struct kvm_memslots *slots = kvm_memslots(kvm);
3260	int r;
3261	gpa_t gpa = ghc->gpa + offset;
3262
3263	if (WARN_ON_ONCE(len + offset > ghc->len))
3264		return -EINVAL;
3265
3266	if (slots->generation != ghc->generation) {
3267		if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3268			return -EFAULT;
3269	}
3270
3271	if (kvm_is_error_hva(ghc->hva))
3272		return -EFAULT;
3273
3274	if (unlikely(!ghc->memslot))
3275		return kvm_write_guest(kvm, gpa, data, len);
3276
3277	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3278	if (r)
3279		return -EFAULT;
3280	mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3281
3282	return 0;
3283}
3284EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3285
3286int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3287			   void *data, unsigned long len)
3288{
3289	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3290}
3291EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3292
3293int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3294				 void *data, unsigned int offset,
3295				 unsigned long len)
3296{
3297	struct kvm_memslots *slots = kvm_memslots(kvm);
3298	int r;
3299	gpa_t gpa = ghc->gpa + offset;
3300
3301	if (WARN_ON_ONCE(len + offset > ghc->len))
3302		return -EINVAL;
3303
3304	if (slots->generation != ghc->generation) {
3305		if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3306			return -EFAULT;
3307	}
3308
3309	if (kvm_is_error_hva(ghc->hva))
3310		return -EFAULT;
3311
3312	if (unlikely(!ghc->memslot))
3313		return kvm_read_guest(kvm, gpa, data, len);
3314
3315	r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3316	if (r)
3317		return -EFAULT;
3318
3319	return 0;
3320}
3321EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3322
3323int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3324			  void *data, unsigned long len)
3325{
3326	return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3327}
3328EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3329
3330int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3331{
3332	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3333	gfn_t gfn = gpa >> PAGE_SHIFT;
3334	int seg;
3335	int offset = offset_in_page(gpa);
3336	int ret;
3337
3338	while ((seg = next_segment(len, offset)) != 0) {
3339		ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3340		if (ret < 0)
3341			return ret;
3342		offset = 0;
3343		len -= seg;
3344		++gfn;
3345	}
3346	return 0;
3347}
3348EXPORT_SYMBOL_GPL(kvm_clear_guest);
3349
3350void mark_page_dirty_in_slot(struct kvm *kvm,
3351			     const struct kvm_memory_slot *memslot,
3352		 	     gfn_t gfn)
3353{
3354	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3355
3356#ifdef CONFIG_HAVE_KVM_DIRTY_RING
3357	if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3358		return;
3359
3360	WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3361#endif
3362
3363	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3364		unsigned long rel_gfn = gfn - memslot->base_gfn;
3365		u32 slot = (memslot->as_id << 16) | memslot->id;
3366
3367		if (kvm->dirty_ring_size && vcpu)
3368			kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3369		else if (memslot->dirty_bitmap)
3370			set_bit_le(rel_gfn, memslot->dirty_bitmap);
3371	}
3372}
3373EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3374
3375void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3376{
3377	struct kvm_memory_slot *memslot;
3378
3379	memslot = gfn_to_memslot(kvm, gfn);
3380	mark_page_dirty_in_slot(kvm, memslot, gfn);
3381}
3382EXPORT_SYMBOL_GPL(mark_page_dirty);
3383
3384void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3385{
3386	struct kvm_memory_slot *memslot;
3387
3388	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3389	mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3390}
3391EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3392
3393void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3394{
3395	if (!vcpu->sigset_active)
3396		return;
3397
3398	/*
3399	 * This does a lockless modification of ->real_blocked, which is fine
3400	 * because only current can change ->real_blocked and all readers of
3401	 * ->real_blocked don't care as long as ->real_blocked is always a subset
3402	 * of ->blocked.
3403	 */
3404	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3405}
3406
3407void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3408{
3409	if (!vcpu->sigset_active)
3410		return;
3411
3412	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3413	sigemptyset(&current->real_blocked);
3414}
3415
3416static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3417{
3418	unsigned int old, val, grow, grow_start;
3419
3420	old = val = vcpu->halt_poll_ns;
3421	grow_start = READ_ONCE(halt_poll_ns_grow_start);
3422	grow = READ_ONCE(halt_poll_ns_grow);
3423	if (!grow)
3424		goto out;
3425
3426	val *= grow;
3427	if (val < grow_start)
3428		val = grow_start;
3429
3430	vcpu->halt_poll_ns = val;
3431out:
3432	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3433}
3434
3435static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3436{
3437	unsigned int old, val, shrink, grow_start;
3438
3439	old = val = vcpu->halt_poll_ns;
3440	shrink = READ_ONCE(halt_poll_ns_shrink);
3441	grow_start = READ_ONCE(halt_poll_ns_grow_start);
3442	if (shrink == 0)
3443		val = 0;
3444	else
3445		val /= shrink;
3446
3447	if (val < grow_start)
3448		val = 0;
3449
3450	vcpu->halt_poll_ns = val;
3451	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3452}
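
/*
 * Worked example with the default module parameters (illustrative):
 * halt_poll_ns_grow = 2 and halt_poll_ns_grow_start = 10000 make a vCPU's
 * poll window grow 0 -> 10000ns -> 20000ns -> 40000ns -> ..., capped by the
 * max halt poll time in kvm_vcpu_halt().  With halt_poll_ns_shrink = 0
 * (the default), a shrink resets the window straight back to 0.
 */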
3453
3454static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3455{
3456	int ret = -EINTR;
3457	int idx = srcu_read_lock(&vcpu->kvm->srcu);
3458
3459	if (kvm_arch_vcpu_runnable(vcpu))
3460		goto out;
3461	if (kvm_cpu_has_pending_timer(vcpu))
3462		goto out;
3463	if (signal_pending(current))
3464		goto out;
3465	if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3466		goto out;
3467
3468	ret = 0;
3469out:
3470	srcu_read_unlock(&vcpu->kvm->srcu, idx);
3471	return ret;
3472}
3473
3474/*
3475 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3476 * pending.  This is mostly used when halting a vCPU, but may also be used
3477 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3478 */
3479bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3480{
3481	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3482	bool waited = false;
3483
3484	vcpu->stat.generic.blocking = 1;
3485
3486	preempt_disable();
3487	kvm_arch_vcpu_blocking(vcpu);
3488	prepare_to_rcuwait(wait);
3489	preempt_enable();
3490
3491	for (;;) {
3492		set_current_state(TASK_INTERRUPTIBLE);
3493
3494		if (kvm_vcpu_check_block(vcpu) < 0)
3495			break;
3496
3497		waited = true;
3498		schedule();
3499	}
3500
3501	preempt_disable();
3502	finish_rcuwait(wait);
3503	kvm_arch_vcpu_unblocking(vcpu);
3504	preempt_enable();
3505
3506	vcpu->stat.generic.blocking = 0;
3507
3508	return waited;
3509}
3510
3511static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3512					  ktime_t end, bool success)
3513{
3514	struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3515	u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3516
3517	++vcpu->stat.generic.halt_attempted_poll;
3518
3519	if (success) {
3520		++vcpu->stat.generic.halt_successful_poll;
3521
3522		if (!vcpu_valid_wakeup(vcpu))
3523			++vcpu->stat.generic.halt_poll_invalid;
3524
3525		stats->halt_poll_success_ns += poll_ns;
3526		KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3527	} else {
3528		stats->halt_poll_fail_ns += poll_ns;
3529		KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3530	}
3531}
3532
3533static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3534{
3535	struct kvm *kvm = vcpu->kvm;
3536
3537	if (kvm->override_halt_poll_ns) {
3538		/*
3539		 * Ensure kvm->max_halt_poll_ns is not read before
3540		 * kvm->override_halt_poll_ns.
3541		 *
3542		 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3543		 */
3544		smp_rmb();
3545		return READ_ONCE(kvm->max_halt_poll_ns);
3546	}
3547
3548	return READ_ONCE(halt_poll_ns);
3549}
3550
3551/*
3552 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
3553 * polling is enabled, busy wait for a short time before blocking to avoid the
3554 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3555 * is halted.
3556 */
3557void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3558{
3559	unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3560	bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3561	ktime_t start, cur, poll_end;
3562	bool waited = false;
3563	bool do_halt_poll;
3564	u64 halt_ns;
3565
3566	if (vcpu->halt_poll_ns > max_halt_poll_ns)
3567		vcpu->halt_poll_ns = max_halt_poll_ns;
3568
3569	do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3570
3571	start = cur = poll_end = ktime_get();
3572	if (do_halt_poll) {
3573		ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3574
3575		do {
3576			if (kvm_vcpu_check_block(vcpu) < 0)
3577				goto out;
3578			cpu_relax();
3579			poll_end = cur = ktime_get();
3580		} while (kvm_vcpu_can_poll(cur, stop));
3581	}
3582
3583	waited = kvm_vcpu_block(vcpu);
3584
3585	cur = ktime_get();
3586	if (waited) {
3587		vcpu->stat.generic.halt_wait_ns +=
3588			ktime_to_ns(cur) - ktime_to_ns(poll_end);
3589		KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3590				ktime_to_ns(cur) - ktime_to_ns(poll_end));
3591	}
3592out:
3593	/* The total time the vCPU was "halted", including polling time. */
3594	halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3595
3596	/*
3597	 * Note, halt-polling is considered successful so long as the vCPU was
3598	 * never actually scheduled out, i.e. even if the wake event arrived
3599	 * after the halt-polling loop itself, but before the full wait.
3600	 */
3601	if (do_halt_poll)
3602		update_halt_poll_stats(vcpu, start, poll_end, !waited);
3603
3604	if (halt_poll_allowed) {
3605		/* Recompute the max halt poll time in case it changed. */
3606		max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3607
3608		if (!vcpu_valid_wakeup(vcpu)) {
3609			shrink_halt_poll_ns(vcpu);
3610		} else if (max_halt_poll_ns) {
3611			if (halt_ns <= vcpu->halt_poll_ns)
3612				;
3613			/* we had a long block, shrink polling */
3614			else if (vcpu->halt_poll_ns &&
3615				 halt_ns > max_halt_poll_ns)
3616				shrink_halt_poll_ns(vcpu);
3617			/* we had a short halt and our poll time is too small */
3618			else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3619				 halt_ns < max_halt_poll_ns)
3620				grow_halt_poll_ns(vcpu);
3621		} else {
3622			vcpu->halt_poll_ns = 0;
3623		}
3624	}
3625
3626	trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
3627}
3628EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3629
3630bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3631{
3632	if (__kvm_vcpu_wake_up(vcpu)) {
3633		WRITE_ONCE(vcpu->ready, true);
3634		++vcpu->stat.generic.halt_wakeup;
3635		return true;
3636	}
3637
3638	return false;
3639}
3640EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3641
3642#ifndef CONFIG_S390
3643/*
3644 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3645 */
3646void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3647{
3648	int me, cpu;
3649
3650	if (kvm_vcpu_wake_up(vcpu))
3651		return;
3652
3653	me = get_cpu();
3654	/*
3655	 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3656	 * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
3657	 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3658	 * within the vCPU thread itself.
3659	 */
3660	if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3661		if (vcpu->mode == IN_GUEST_MODE)
3662			WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3663		goto out;
3664	}
3665
3666	/*
3667	 * Note, the vCPU could get migrated to a different pCPU at any point
3668	 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3669	 * IPI to the previous pCPU.  But, that's ok because the purpose of the
3670	 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3671	 * vCPU also requires it to leave IN_GUEST_MODE.
3672	 */
3673	if (kvm_arch_vcpu_should_kick(vcpu)) {
3674		cpu = READ_ONCE(vcpu->cpu);
3675		if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3676			smp_send_reschedule(cpu);
3677	}
3678out:
3679	put_cpu();
3680}
3681EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3682#endif /* !CONFIG_S390 */
3683
3684int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3685{
3686	struct pid *pid;
3687	struct task_struct *task = NULL;
3688	int ret = 0;
3689
3690	rcu_read_lock();
3691	pid = rcu_dereference(target->pid);
3692	if (pid)
3693		task = get_pid_task(pid, PIDTYPE_PID);
3694	rcu_read_unlock();
3695	if (!task)
3696		return ret;
3697	ret = yield_to(task, 1);
3698	put_task_struct(task);
3699
3700	return ret;
3701}
3702EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3703
3704/*
3705 * Helper that checks whether a VCPU is eligible for directed yield.
3706 * The most eligible candidate to yield to is decided by the following heuristics:
3707 *
3708 *  (a) A VCPU which has not done a PL-exit or had cpu relax intercepted
3709 *  recently (a preempted lock holder), indicated by @in_spin_loop.  It is
3710 *  set at the beginning and cleared at the end of the interception/PLE handler.
3711 *
3712 *  (b) A VCPU which has done a PL-exit/cpu relax intercept but did not get a
3713 *  chance last time (it has mostly become eligible now since we have probably
3714 *  yielded to the lock holder in the last iteration).  This is done by toggling
3715 *  @dy_eligible each time a VCPU is checked for eligibility.
3716 *
3717 *  Yielding to a recently PL-exited/cpu relax intercepted VCPU before yielding
3718 *  to a preempted lock holder could result in wrong VCPU selection and CPU
3719 *  burning.  Giving priority to a potential lock holder increases lock
3720 *  progress.
3721 *
3722 *  Since the algorithm is based on heuristics, accessing another VCPU's data
3723 *  without locking does no harm.  It may result in trying to yield to the
3724 *  same VCPU, failing, and continuing with the next VCPU, and so on.
3725 */
3726static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3727{
3728#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3729	bool eligible;
3730
3731	eligible = !vcpu->spin_loop.in_spin_loop ||
3732		    vcpu->spin_loop.dy_eligible;
3733
3734	if (vcpu->spin_loop.in_spin_loop)
3735		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3736
3737	return eligible;
3738#else
3739	return true;
3740#endif
3741}
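
/*
 * Illustrative sequence for the eligibility check above, for a vCPU with
 * in_spin_loop == true (hypothetical trace, not code from this file):
 *
 *	check #1: dy_eligible == false -> not eligible, toggled to true
 *	check #2: dy_eligible == true  -> eligible,     toggled to false
 *	check #3: dy_eligible == false -> not eligible, toggled to true
 *
 * i.e. a vCPU that is itself spinning is offered a directed yield at most
 * every other time it is considered, while a vCPU that is not spinning
 * (in_spin_loop == false) is always considered eligible.
 */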
3742
3743/*
3744 * Unlike kvm_arch_vcpu_runnable, this function is called outside
3745 * a vcpu_load/vcpu_put pair.  However, for most architectures
3746 * kvm_arch_vcpu_runnable does not require vcpu_load.
3747 */
3748bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3749{
3750	return kvm_arch_vcpu_runnable(vcpu);
3751}
3752
3753static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3754{
3755	if (kvm_arch_dy_runnable(vcpu))
3756		return true;
3757
3758#ifdef CONFIG_KVM_ASYNC_PF
3759	if (!list_empty_careful(&vcpu->async_pf.done))
3760		return true;
3761#endif
3762
3763	return false;
3764}
3765
3766bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3767{
3768	return false;
3769}
3770
3771void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3772{
3773	struct kvm *kvm = me->kvm;
3774	struct kvm_vcpu *vcpu;
3775	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3776	unsigned long i;
3777	int yielded = 0;
3778	int try = 3;
3779	int pass;
3780
3781	kvm_vcpu_set_in_spin_loop(me, true);
3782	/*
3783	 * We boost the priority of a VCPU that is runnable but not
3784	 * currently running, because it got preempted by something
3785	 * else and called schedule in __vcpu_run.  Hopefully that
3786	 * VCPU is holding the lock that we need and will release it.
3787	 * We approximate round-robin by starting at the last boosted VCPU.
3788	 */
3789	for (pass = 0; pass < 2 && !yielded && try; pass++) {
3790		kvm_for_each_vcpu(i, vcpu, kvm) {
3791			if (!pass && i <= last_boosted_vcpu) {
3792				i = last_boosted_vcpu;
3793				continue;
3794			} else if (pass && i > last_boosted_vcpu)
3795				break;
3796			if (!READ_ONCE(vcpu->ready))
3797				continue;
3798			if (vcpu == me)
3799				continue;
3800			if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3801				continue;
3802			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3803			    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3804			    !kvm_arch_vcpu_in_kernel(vcpu))
3805				continue;
3806			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3807				continue;
3808
3809			yielded = kvm_vcpu_yield_to(vcpu);
3810			if (yielded > 0) {
3811				kvm->last_boosted_vcpu = i;
3812				break;
3813			} else if (yielded < 0) {
3814				try--;
3815				if (!try)
3816					break;
3817			}
3818		}
3819	}
3820	kvm_vcpu_set_in_spin_loop(me, false);
3821
3822	/* Ensure vcpu is not eligible during next spinloop */
3823	kvm_vcpu_set_dy_eligible(me, false);
3824}
3825EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3826
3827static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3828{
3829#ifdef CONFIG_HAVE_KVM_DIRTY_RING
3830	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3831	    (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3832	     kvm->dirty_ring_size / PAGE_SIZE);
3833#else
3834	return false;
3835#endif
3836}
3837
3838static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3839{
3840	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3841	struct page *page;
3842
3843	if (vmf->pgoff == 0)
3844		page = virt_to_page(vcpu->run);
3845#ifdef CONFIG_X86
3846	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3847		page = virt_to_page(vcpu->arch.pio_data);
3848#endif
3849#ifdef CONFIG_KVM_MMIO
3850	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3851		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3852#endif
3853	else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3854		page = kvm_dirty_ring_get_page(
3855		    &vcpu->dirty_ring,
3856		    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3857	else
3858		return kvm_arch_vcpu_fault(vcpu, vmf);
3859	get_page(page);
3860	vmf->page = page;
3861	return 0;
3862}
3863
3864static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3865	.fault = kvm_vcpu_fault,
3866};
3867
3868static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3869{
3870	struct kvm_vcpu *vcpu = file->private_data;
3871	unsigned long pages = vma_pages(vma);
3872
3873	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3874	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3875	    ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3876		return -EINVAL;
3877
3878	vma->vm_ops = &kvm_vcpu_vm_ops;
3879	return 0;
3880}
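
/*
 * A minimal userspace sketch of mapping the state exposed through
 * kvm_vcpu_mmap() (illustrative only; kvm_fd, vcpu_fd, ring_size and
 * page_size are hypothetical variables and error handling is omitted).
 * Note that dirty-ring pages must be mapped shared, writable and
 * non-executable, as enforced above:
 *
 *	long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *	void *ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *			  vcpu_fd, KVM_DIRTY_LOG_PAGE_OFFSET * page_size);
 */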
3881
3882static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3883{
3884	struct kvm_vcpu *vcpu = filp->private_data;
3885
3886	kvm_put_kvm(vcpu->kvm);
3887	return 0;
3888}
3889
3890static const struct file_operations kvm_vcpu_fops = {
3891	.release        = kvm_vcpu_release,
3892	.unlocked_ioctl = kvm_vcpu_ioctl,
3893	.mmap           = kvm_vcpu_mmap,
3894	.llseek		= noop_llseek,
3895	KVM_COMPAT(kvm_vcpu_compat_ioctl),
3896};
3897
3898/*
3899 * Allocates an inode for the vcpu.
3900 */
3901static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3902{
3903	char name[8 + 1 + ITOA_MAX_LEN + 1];
3904
3905	snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3906	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3907}
3908
3909#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3910static int vcpu_get_pid(void *data, u64 *val)
3911{
3912	struct kvm_vcpu *vcpu = data;
3913
3914	rcu_read_lock();
3915	*val = pid_nr(rcu_dereference(vcpu->pid));
3916	rcu_read_unlock();
3917	return 0;
3918}
3919
3920DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
3921
3922static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3923{
3924	struct dentry *debugfs_dentry;
3925	char dir_name[ITOA_MAX_LEN * 2];
3926
3927	if (!debugfs_initialized())
3928		return;
3929
3930	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3931	debugfs_dentry = debugfs_create_dir(dir_name,
3932					    vcpu->kvm->debugfs_dentry);
3933	debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
3934			    &vcpu_get_pid_fops);
3935
3936	kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3937}
3938#endif
3939
3940/*
3941 * Creates some virtual cpus.  Good luck creating more than one.
3942 */
3943static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3944{
3945	int r;
3946	struct kvm_vcpu *vcpu;
3947	struct page *page;
3948
3949	if (id >= KVM_MAX_VCPU_IDS)
3950		return -EINVAL;
3951
3952	mutex_lock(&kvm->lock);
3953	if (kvm->created_vcpus >= kvm->max_vcpus) {
3954		mutex_unlock(&kvm->lock);
3955		return -EINVAL;
3956	}
3957
3958	r = kvm_arch_vcpu_precreate(kvm, id);
3959	if (r) {
3960		mutex_unlock(&kvm->lock);
3961		return r;
3962	}
3963
3964	kvm->created_vcpus++;
3965	mutex_unlock(&kvm->lock);
3966
3967	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3968	if (!vcpu) {
3969		r = -ENOMEM;
3970		goto vcpu_decrement;
3971	}
3972
3973	BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3974	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3975	if (!page) {
3976		r = -ENOMEM;
3977		goto vcpu_free;
3978	}
3979	vcpu->run = page_address(page);
3980
3981	kvm_vcpu_init(vcpu, kvm, id);
3982
3983	r = kvm_arch_vcpu_create(vcpu);
3984	if (r)
3985		goto vcpu_free_run_page;
3986
3987	if (kvm->dirty_ring_size) {
3988		r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3989					 id, kvm->dirty_ring_size);
3990		if (r)
3991			goto arch_vcpu_destroy;
3992	}
3993
3994	mutex_lock(&kvm->lock);
3995
3996#ifdef CONFIG_LOCKDEP
3997	/* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
3998	mutex_lock(&vcpu->mutex);
3999	mutex_unlock(&vcpu->mutex);
4000#endif
4001
4002	if (kvm_get_vcpu_by_id(kvm, id)) {
4003		r = -EEXIST;
4004		goto unlock_vcpu_destroy;
4005	}
4006
4007	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
4008	r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
4009	if (r)
4010		goto unlock_vcpu_destroy;
4011
4012	/* Now it's all set up, let userspace reach it */
4013	kvm_get_kvm(kvm);
4014	r = create_vcpu_fd(vcpu);
4015	if (r < 0)
4016		goto kvm_put_xa_release;
4017
4018	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
4019		r = -EINVAL;
4020		goto kvm_put_xa_release;
4021	}
4022
4023	/*
4024	 * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
4025	 * pointer before incrementing kvm->online_vcpus.
4026	 */
4027	smp_wmb();
4028	atomic_inc(&kvm->online_vcpus);
4029
4030	mutex_unlock(&kvm->lock);
4031	kvm_arch_vcpu_postcreate(vcpu);
4032	kvm_create_vcpu_debugfs(vcpu);
4033	return r;
4034
4035kvm_put_xa_release:
4036	kvm_put_kvm_no_destroy(kvm);
4037	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
4038unlock_vcpu_destroy:
4039	mutex_unlock(&kvm->lock);
4040	kvm_dirty_ring_free(&vcpu->dirty_ring);
4041arch_vcpu_destroy:
4042	kvm_arch_vcpu_destroy(vcpu);
4043vcpu_free_run_page:
4044	free_page((unsigned long)vcpu->run);
4045vcpu_free:
4046	kmem_cache_free(kvm_vcpu_cache, vcpu);
4047vcpu_decrement:
4048	mutex_lock(&kvm->lock);
4049	kvm->created_vcpus--;
4050	mutex_unlock(&kvm->lock);
4051	return r;
4052}
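
/*
 * A minimal userspace sketch of the vCPU creation path handled above
 * (illustrative only; vm_fd and vcpu_id are hypothetical and error
 * handling is omitted):
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, (unsigned long)vcpu_id);
 *
 * The returned file descriptor is the anonymous inode created by
 * create_vcpu_fd() and accepts the vCPU ioctls handled further below.
 */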
4053
4054static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4055{
4056	if (sigset) {
4057		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4058		vcpu->sigset_active = 1;
4059		vcpu->sigset = *sigset;
4060	} else
4061		vcpu->sigset_active = 0;
4062	return 0;
4063}
4064
4065static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4066			      size_t size, loff_t *offset)
4067{
4068	struct kvm_vcpu *vcpu = file->private_data;
4069
4070	return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4071			&kvm_vcpu_stats_desc[0], &vcpu->stat,
4072			sizeof(vcpu->stat), user_buffer, size, offset);
4073}
4074
4075static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4076{
4077	struct kvm_vcpu *vcpu = file->private_data;
4078
4079	kvm_put_kvm(vcpu->kvm);
4080	return 0;
4081}
4082
4083static const struct file_operations kvm_vcpu_stats_fops = {
4084	.read = kvm_vcpu_stats_read,
4085	.release = kvm_vcpu_stats_release,
4086	.llseek = noop_llseek,
4087};
4088
4089static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4090{
4091	int fd;
4092	struct file *file;
4093	char name[15 + ITOA_MAX_LEN + 1];
4094
4095	snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4096
4097	fd = get_unused_fd_flags(O_CLOEXEC);
4098	if (fd < 0)
4099		return fd;
4100
4101	file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4102	if (IS_ERR(file)) {
4103		put_unused_fd(fd);
4104		return PTR_ERR(file);
4105	}
4106
4107	kvm_get_kvm(vcpu->kvm);
4108
4109	file->f_mode |= FMODE_PREAD;
4110	fd_install(fd, file);
4111
4112	return fd;
4113}
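
/*
 * A minimal userspace sketch of consuming the per-vCPU binary stats file
 * descriptor returned above (illustrative only; vcpu_fd is hypothetical
 * and error handling is omitted).  The file is read-only and supports
 * pread(), as indicated by FMODE_PREAD:
 *
 *	struct kvm_stats_header hdr;
 *	int stats_fd = ioctl(vcpu_fd, KVM_GET_STATS_FD, NULL);
 *
 *	pread(stats_fd, &hdr, sizeof(hdr), 0);
 */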
4114
4115static long kvm_vcpu_ioctl(struct file *filp,
4116			   unsigned int ioctl, unsigned long arg)
4117{
4118	struct kvm_vcpu *vcpu = filp->private_data;
4119	void __user *argp = (void __user *)arg;
4120	int r;
4121	struct kvm_fpu *fpu = NULL;
4122	struct kvm_sregs *kvm_sregs = NULL;
4123
4124	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4125		return -EIO;
4126
4127	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4128		return -EINVAL;
4129
4130	/*
4131	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
4132	 * execution; mutex_lock() would break them.
4133	 */
4134	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4135	if (r != -ENOIOCTLCMD)
4136		return r;
4137
4138	if (mutex_lock_killable(&vcpu->mutex))
4139		return -EINTR;
4140	switch (ioctl) {
4141	case KVM_RUN: {
4142		struct pid *oldpid;
4143		r = -EINVAL;
4144		if (arg)
4145			goto out;
4146		oldpid = rcu_access_pointer(vcpu->pid);
4147		if (unlikely(oldpid != task_pid(current))) {
4148			/* The thread running this VCPU changed. */
4149			struct pid *newpid;
4150
4151			r = kvm_arch_vcpu_run_pid_change(vcpu);
4152			if (r)
4153				break;
4154
4155			newpid = get_task_pid(current, PIDTYPE_PID);
4156			rcu_assign_pointer(vcpu->pid, newpid);
4157			if (oldpid)
4158				synchronize_rcu();
4159			put_pid(oldpid);
4160		}
4161		r = kvm_arch_vcpu_ioctl_run(vcpu);
4162		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
4163		break;
4164	}
4165	case KVM_GET_REGS: {
4166		struct kvm_regs *kvm_regs;
4167
4168		r = -ENOMEM;
4169		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
4170		if (!kvm_regs)
4171			goto out;
4172		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4173		if (r)
4174			goto out_free1;
4175		r = -EFAULT;
4176		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4177			goto out_free1;
4178		r = 0;
4179out_free1:
4180		kfree(kvm_regs);
4181		break;
4182	}
4183	case KVM_SET_REGS: {
4184		struct kvm_regs *kvm_regs;
4185
4186		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4187		if (IS_ERR(kvm_regs)) {
4188			r = PTR_ERR(kvm_regs);
4189			goto out;
4190		}
4191		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
4192		kfree(kvm_regs);
4193		break;
4194	}
4195	case KVM_GET_SREGS: {
4196		kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4197				    GFP_KERNEL_ACCOUNT);
4198		r = -ENOMEM;
4199		if (!kvm_sregs)
4200			goto out;
4201		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
4202		if (r)
4203			goto out;
4204		r = -EFAULT;
4205		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
4206			goto out;
4207		r = 0;
4208		break;
4209	}
4210	case KVM_SET_SREGS: {
4211		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4212		if (IS_ERR(kvm_sregs)) {
4213			r = PTR_ERR(kvm_sregs);
4214			kvm_sregs = NULL;
4215			goto out;
4216		}
4217		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
4218		break;
4219	}
4220	case KVM_GET_MP_STATE: {
4221		struct kvm_mp_state mp_state;
4222
4223		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4224		if (r)
4225			goto out;
4226		r = -EFAULT;
4227		if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
4228			goto out;
4229		r = 0;
4230		break;
4231	}
4232	case KVM_SET_MP_STATE: {
4233		struct kvm_mp_state mp_state;
4234
4235		r = -EFAULT;
4236		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
4237			goto out;
4238		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
4239		break;
4240	}
4241	case KVM_TRANSLATE: {
4242		struct kvm_translation tr;
4243
4244		r = -EFAULT;
4245		if (copy_from_user(&tr, argp, sizeof(tr)))
4246			goto out;
4247		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
4248		if (r)
4249			goto out;
4250		r = -EFAULT;
4251		if (copy_to_user(argp, &tr, sizeof(tr)))
4252			goto out;
4253		r = 0;
4254		break;
4255	}
4256	case KVM_SET_GUEST_DEBUG: {
4257		struct kvm_guest_debug dbg;
4258
4259		r = -EFAULT;
4260		if (copy_from_user(&dbg, argp, sizeof(dbg)))
4261			goto out;
4262		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4263		break;
4264	}
4265	case KVM_SET_SIGNAL_MASK: {
4266		struct kvm_signal_mask __user *sigmask_arg = argp;
4267		struct kvm_signal_mask kvm_sigmask;
4268		sigset_t sigset, *p;
4269
4270		p = NULL;
4271		if (argp) {
4272			r = -EFAULT;
4273			if (copy_from_user(&kvm_sigmask, argp,
4274					   sizeof(kvm_sigmask)))
4275				goto out;
4276			r = -EINVAL;
4277			if (kvm_sigmask.len != sizeof(sigset))
4278				goto out;
4279			r = -EFAULT;
4280			if (copy_from_user(&sigset, sigmask_arg->sigset,
4281					   sizeof(sigset)))
4282				goto out;
4283			p = &sigset;
4284		}
4285		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4286		break;
4287	}
4288	case KVM_GET_FPU: {
4289		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4290		r = -ENOMEM;
4291		if (!fpu)
4292			goto out;
4293		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4294		if (r)
4295			goto out;
4296		r = -EFAULT;
4297		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4298			goto out;
4299		r = 0;
4300		break;
4301	}
4302	case KVM_SET_FPU: {
4303		fpu = memdup_user(argp, sizeof(*fpu));
4304		if (IS_ERR(fpu)) {
4305			r = PTR_ERR(fpu);
4306			fpu = NULL;
4307			goto out;
4308		}
4309		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4310		break;
4311	}
4312	case KVM_GET_STATS_FD: {
4313		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4314		break;
4315	}
4316	default:
4317		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4318	}
4319out:
4320	mutex_unlock(&vcpu->mutex);
4321	kfree(fpu);
4322	kfree(kvm_sregs);
4323	return r;
4324}
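
/*
 * A minimal userspace sketch of the KVM_RUN path handled above
 * (illustrative only; vcpu_fd, run and handle_io() are hypothetical, run
 * being the mmap()ed struct kvm_run, and error handling is omitted).
 * Note that KVM_RUN rejects a non-zero argument, as checked above:
 *
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_IO:
 *		case KVM_EXIT_MMIO:
 *			handle_io(run);
 *			break;
 *		case KVM_EXIT_SHUTDOWN:
 *			return;
 *		}
 *	}
 */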
4325
4326#ifdef CONFIG_KVM_COMPAT
4327static long kvm_vcpu_compat_ioctl(struct file *filp,
4328				  unsigned int ioctl, unsigned long arg)
4329{
4330	struct kvm_vcpu *vcpu = filp->private_data;
4331	void __user *argp = compat_ptr(arg);
4332	int r;
4333
4334	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4335		return -EIO;
4336
4337	switch (ioctl) {
4338	case KVM_SET_SIGNAL_MASK: {
4339		struct kvm_signal_mask __user *sigmask_arg = argp;
4340		struct kvm_signal_mask kvm_sigmask;
4341		sigset_t sigset;
4342
4343		if (argp) {
4344			r = -EFAULT;
4345			if (copy_from_user(&kvm_sigmask, argp,
4346					   sizeof(kvm_sigmask)))
4347				goto out;
4348			r = -EINVAL;
4349			if (kvm_sigmask.len != sizeof(compat_sigset_t))
4350				goto out;
4351			r = -EFAULT;
4352			if (get_compat_sigset(&sigset,
4353					      (compat_sigset_t __user *)sigmask_arg->sigset))
4354				goto out;
4355			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4356		} else
4357			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4358		break;
4359	}
4360	default:
4361		r = kvm_vcpu_ioctl(filp, ioctl, arg);
4362	}
4363
4364out:
4365	return r;
4366}
4367#endif
4368
4369static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4370{
4371	struct kvm_device *dev = filp->private_data;
4372
4373	if (dev->ops->mmap)
4374		return dev->ops->mmap(dev, vma);
4375
4376	return -ENODEV;
4377}
4378
4379static int kvm_device_ioctl_attr(struct kvm_device *dev,
4380				 int (*accessor)(struct kvm_device *dev,
4381						 struct kvm_device_attr *attr),
4382				 unsigned long arg)
4383{
4384	struct kvm_device_attr attr;
4385
4386	if (!accessor)
4387		return -EPERM;
4388
4389	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4390		return -EFAULT;
4391
4392	return accessor(dev, &attr);
4393}
4394
4395static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4396			     unsigned long arg)
4397{
4398	struct kvm_device *dev = filp->private_data;
4399
4400	if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4401		return -EIO;
4402
4403	switch (ioctl) {
4404	case KVM_SET_DEVICE_ATTR:
4405		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4406	case KVM_GET_DEVICE_ATTR:
4407		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4408	case KVM_HAS_DEVICE_ATTR:
4409		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4410	default:
4411		if (dev->ops->ioctl)
4412			return dev->ops->ioctl(dev, ioctl, arg);
4413
4414		return -ENOTTY;
4415	}
4416}
4417
4418static int kvm_device_release(struct inode *inode, struct file *filp)
4419{
4420	struct kvm_device *dev = filp->private_data;
4421	struct kvm *kvm = dev->kvm;
4422
4423	if (dev->ops->release) {
4424		mutex_lock(&kvm->lock);
4425		list_del(&dev->vm_node);
4426		dev->ops->release(dev);
4427		mutex_unlock(&kvm->lock);
4428	}
4429
4430	kvm_put_kvm(kvm);
4431	return 0;
4432}
4433
4434static const struct file_operations kvm_device_fops = {
4435	.unlocked_ioctl = kvm_device_ioctl,
4436	.release = kvm_device_release,
4437	KVM_COMPAT(kvm_device_ioctl),
4438	.mmap = kvm_device_mmap,
4439};
4440
4441struct kvm_device *kvm_device_from_filp(struct file *filp)
4442{
4443	if (filp->f_op != &kvm_device_fops)
4444		return NULL;
4445
4446	return filp->private_data;
4447}
4448
4449static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4450#ifdef CONFIG_KVM_MPIC
4451	[KVM_DEV_TYPE_FSL_MPIC_20]	= &kvm_mpic_ops,
4452	[KVM_DEV_TYPE_FSL_MPIC_42]	= &kvm_mpic_ops,
4453#endif
4454};
4455
4456int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4457{
4458	if (type >= ARRAY_SIZE(kvm_device_ops_table))
4459		return -ENOSPC;
4460
4461	if (kvm_device_ops_table[type] != NULL)
4462		return -EEXIST;
4463
4464	kvm_device_ops_table[type] = ops;
4465	return 0;
4466}
4467
4468void kvm_unregister_device_ops(u32 type)
4469{
4470	if (kvm_device_ops_table[type] != NULL)
4471		kvm_device_ops_table[type] = NULL;
4472}
4473
4474static int kvm_ioctl_create_device(struct kvm *kvm,
4475				   struct kvm_create_device *cd)
4476{
4477	const struct kvm_device_ops *ops;
4478	struct kvm_device *dev;
4479	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4480	int type;
4481	int ret;
4482
4483	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4484		return -ENODEV;
4485
4486	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4487	ops = kvm_device_ops_table[type];
4488	if (ops == NULL)
4489		return -ENODEV;
4490
4491	if (test)
4492		return 0;
4493
4494	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4495	if (!dev)
4496		return -ENOMEM;
4497
4498	dev->ops = ops;
4499	dev->kvm = kvm;
4500
4501	mutex_lock(&kvm->lock);
4502	ret = ops->create(dev, type);
4503	if (ret < 0) {
4504		mutex_unlock(&kvm->lock);
4505		kfree(dev);
4506		return ret;
4507	}
4508	list_add(&dev->vm_node, &kvm->devices);
4509	mutex_unlock(&kvm->lock);
4510
4511	if (ops->init)
4512		ops->init(dev);
4513
4514	kvm_get_kvm(kvm);
4515	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4516	if (ret < 0) {
4517		kvm_put_kvm_no_destroy(kvm);
4518		mutex_lock(&kvm->lock);
4519		list_del(&dev->vm_node);
4520		if (ops->release)
4521			ops->release(dev);
4522		mutex_unlock(&kvm->lock);
4523		if (ops->destroy)
4524			ops->destroy(dev);
4525		return ret;
4526	}
4527
4528	cd->fd = ret;
4529	return 0;
4530}
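
/*
 * A minimal userspace sketch of the device creation path handled above
 * (illustrative only; vm_fd and device_fd are hypothetical, whether a
 * given device type is available depends on the kernel configuration,
 * and error handling is omitted).  KVM_CREATE_DEVICE_TEST only probes
 * for support and allocates nothing, as the early return above shows:
 *
 *	struct kvm_create_device cd = {
 *		.type  = KVM_DEV_TYPE_VFIO,
 *		.flags = KVM_CREATE_DEVICE_TEST,
 *	};
 *
 *	if (!ioctl(vm_fd, KVM_CREATE_DEVICE, &cd)) {
 *		cd.flags = 0;
 *		ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);
 *		device_fd = cd.fd;
 *	}
 */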
4531
4532static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4533{
4534	switch (arg) {
4535	case KVM_CAP_USER_MEMORY:
4536	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4537	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4538	case KVM_CAP_INTERNAL_ERROR_DATA:
4539#ifdef CONFIG_HAVE_KVM_MSI
4540	case KVM_CAP_SIGNAL_MSI:
4541#endif
4542#ifdef CONFIG_HAVE_KVM_IRQFD
4543	case KVM_CAP_IRQFD:
4544#endif
4545	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4546	case KVM_CAP_CHECK_EXTENSION_VM:
4547	case KVM_CAP_ENABLE_CAP_VM:
4548	case KVM_CAP_HALT_POLL:
4549		return 1;
4550#ifdef CONFIG_KVM_MMIO
4551	case KVM_CAP_COALESCED_MMIO:
4552		return KVM_COALESCED_MMIO_PAGE_OFFSET;
4553	case KVM_CAP_COALESCED_PIO:
4554		return 1;
4555#endif
4556#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4557	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4558		return KVM_DIRTY_LOG_MANUAL_CAPS;
4559#endif
4560#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4561	case KVM_CAP_IRQ_ROUTING:
4562		return KVM_MAX_IRQ_ROUTES;
4563#endif
4564#if KVM_ADDRESS_SPACE_NUM > 1
4565	case KVM_CAP_MULTI_ADDRESS_SPACE:
4566		return KVM_ADDRESS_SPACE_NUM;
4567#endif
4568	case KVM_CAP_NR_MEMSLOTS:
4569		return KVM_USER_MEM_SLOTS;
4570	case KVM_CAP_DIRTY_LOG_RING:
4571#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4572		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4573#else
4574		return 0;
4575#endif
4576	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4577#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4578		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4579#else
4580		return 0;
4581#endif
4582#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4583	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
4584#endif
4585	case KVM_CAP_BINARY_STATS_FD:
4586	case KVM_CAP_SYSTEM_EVENT_DATA:
4587		return 1;
4588	default:
4589		break;
4590	}
4591	return kvm_vm_ioctl_check_extension(kvm, arg);
4592}
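
/*
 * A minimal userspace sketch of querying the generic capabilities handled
 * above (illustrative only; vm_fd is hypothetical).  A return value of 0
 * means "unsupported", while positive values may carry extra information,
 * e.g. KVM_CAP_NR_MEMSLOTS returns the number of user memslots:
 *
 *	int nr_slots = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);
 *	int has_halt_poll = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_HALT_POLL);
 */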
4593
4594static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4595{
4596	int r;
4597
4598	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4599		return -EINVAL;
4600
4601	/* The size must be a power of 2 */
4602	if (!size || (size & (size - 1)))
4603		return -EINVAL;
4604
4605	/* Must be large enough to hold the reserved entries and at least a page */
4606	if (size < kvm_dirty_ring_get_rsvd_entries() *
4607	    sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4608		return -EINVAL;
4609
4610	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4611	    sizeof(struct kvm_dirty_gfn))
4612		return -E2BIG;
4613
4614	/* The ring size can only be set once */
4615	if (kvm->dirty_ring_size)
4616		return -EINVAL;
4617
4618	mutex_lock(&kvm->lock);
4619
4620	if (kvm->created_vcpus) {
4621		/* The size cannot be changed after vCPUs have been created */
4622		r = -EINVAL;
4623	} else {
4624		kvm->dirty_ring_size = size;
4625		r = 0;
4626	}
4627
4628	mutex_unlock(&kvm->lock);
4629	return r;
4630}
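
/*
 * A minimal userspace sketch of enabling the dirty ring subject to the
 * size checks above (illustrative only; vm_fd is hypothetical, whether
 * KVM_CAP_DIRTY_LOG_RING or KVM_CAP_DIRTY_LOG_RING_ACQ_REL is available
 * is architecture dependent, and error handling is omitted).  The size is
 * in bytes, must be a power of 2 and must be set before any vCPU exists:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap     = KVM_CAP_DIRTY_LOG_RING_ACQ_REL,
 *		.args[0] = 65536,
 *	};
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */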
4631
4632static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4633{
4634	unsigned long i;
4635	struct kvm_vcpu *vcpu;
4636	int cleared = 0;
4637
4638	if (!kvm->dirty_ring_size)
4639		return -EINVAL;
4640
4641	mutex_lock(&kvm->slots_lock);
4642
4643	kvm_for_each_vcpu(i, vcpu, kvm)
4644		cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4645
4646	mutex_unlock(&kvm->slots_lock);
4647
4648	if (cleared)
4649		kvm_flush_remote_tlbs(kvm);
4650
4651	return cleared;
4652}
4653
4654int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4655						  struct kvm_enable_cap *cap)
4656{
4657	return -EINVAL;
4658}
4659
4660bool kvm_are_all_memslots_empty(struct kvm *kvm)
4661{
4662	int i;
4663
4664	lockdep_assert_held(&kvm->slots_lock);
4665
4666	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
4667		if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4668			return false;
4669	}
4670
4671	return true;
4672}
4673EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
4674
4675static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4676					   struct kvm_enable_cap *cap)
4677{
4678	switch (cap->cap) {
4679#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4680	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4681		u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4682
4683		if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4684			allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4685
4686		if (cap->flags || (cap->args[0] & ~allowed_options))
4687			return -EINVAL;
4688		kvm->manual_dirty_log_protect = cap->args[0];
4689		return 0;
4690	}
4691#endif
4692	case KVM_CAP_HALT_POLL: {
4693		if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4694			return -EINVAL;
4695
4696		kvm->max_halt_poll_ns = cap->args[0];
4697
4698		/*
4699		 * Ensure kvm->override_halt_poll_ns does not become visible
4700		 * before kvm->max_halt_poll_ns.
4701		 *
4702		 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
4703		 */
4704		smp_wmb();
4705		kvm->override_halt_poll_ns = true;
4706
4707		return 0;
4708	}
4709	case KVM_CAP_DIRTY_LOG_RING:
4710	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4711		if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
4712			return -EINVAL;
4713
4714		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4715	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
4716		int r = -EINVAL;
4717
4718		if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
4719		    !kvm->dirty_ring_size || cap->flags)
4720			return r;
4721
4722		mutex_lock(&kvm->slots_lock);
4723
4724		/*
4725		 * For simplicity, allow enabling ring+bitmap if and only if
4726		 * there are no memslots, e.g. to ensure all memslots allocate
4727		 * a bitmap after the capability is enabled.
4728		 */
4729		if (kvm_are_all_memslots_empty(kvm)) {
4730			kvm->dirty_ring_with_bitmap = true;
4731			r = 0;
4732		}
4733
4734		mutex_unlock(&kvm->slots_lock);
4735
4736		return r;
4737	}
4738	default:
4739		return kvm_vm_ioctl_enable_cap(kvm, cap);
4740	}
4741}
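
/*
 * A minimal userspace sketch of overriding the per-VM halt-polling limit
 * via the KVM_CAP_HALT_POLL case above (illustrative only; vm_fd is
 * hypothetical and error handling is omitted).  args[0] is the new
 * maximum poll time in nanoseconds; 0 disables halt polling for the VM:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap     = KVM_CAP_HALT_POLL,
 *		.args[0] = 200000,
 *	};
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */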
4742
4743static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4744			      size_t size, loff_t *offset)
4745{
4746	struct kvm *kvm = file->private_data;
4747
4748	return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4749				&kvm_vm_stats_desc[0], &kvm->stat,
4750				sizeof(kvm->stat), user_buffer, size, offset);
4751}
4752
4753static int kvm_vm_stats_release(struct inode *inode, struct file *file)
4754{
4755	struct kvm *kvm = file->private_data;
4756
4757	kvm_put_kvm(kvm);
4758	return 0;
4759}
4760
4761static const struct file_operations kvm_vm_stats_fops = {
4762	.read = kvm_vm_stats_read,
4763	.release = kvm_vm_stats_release,
4764	.llseek = noop_llseek,
4765};
4766
4767static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4768{
4769	int fd;
4770	struct file *file;
4771
4772	fd = get_unused_fd_flags(O_CLOEXEC);
4773	if (fd < 0)
4774		return fd;
4775
4776	file = anon_inode_getfile("kvm-vm-stats",
4777			&kvm_vm_stats_fops, kvm, O_RDONLY);
4778	if (IS_ERR(file)) {
4779		put_unused_fd(fd);
4780		return PTR_ERR(file);
4781	}
4782
4783	kvm_get_kvm(kvm);
4784
4785	file->f_mode |= FMODE_PREAD;
4786	fd_install(fd, file);
4787
4788	return fd;
4789}
4790
4791static long kvm_vm_ioctl(struct file *filp,
4792			   unsigned int ioctl, unsigned long arg)
4793{
4794	struct kvm *kvm = filp->private_data;
4795	void __user *argp = (void __user *)arg;
4796	int r;
4797
4798	if (kvm->mm != current->mm || kvm->vm_dead)
4799		return -EIO;
4800	switch (ioctl) {
4801	case KVM_CREATE_VCPU:
4802		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4803		break;
4804	case KVM_ENABLE_CAP: {
4805		struct kvm_enable_cap cap;
4806
4807		r = -EFAULT;
4808		if (copy_from_user(&cap, argp, sizeof(cap)))
4809			goto out;
4810		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4811		break;
4812	}
4813	case KVM_SET_USER_MEMORY_REGION: {
4814		struct kvm_userspace_memory_region kvm_userspace_mem;
4815
4816		r = -EFAULT;
4817		if (copy_from_user(&kvm_userspace_mem, argp,
4818						sizeof(kvm_userspace_mem)))
4819			goto out;
4820
4821		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4822		break;
4823	}
4824	case KVM_GET_DIRTY_LOG: {
4825		struct kvm_dirty_log log;
4826
4827		r = -EFAULT;
4828		if (copy_from_user(&log, argp, sizeof(log)))
4829			goto out;
4830		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4831		break;
4832	}
4833#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4834	case KVM_CLEAR_DIRTY_LOG: {
4835		struct kvm_clear_dirty_log log;
4836
4837		r = -EFAULT;
4838		if (copy_from_user(&log, argp, sizeof(log)))
4839			goto out;
4840		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4841		break;
4842	}
4843#endif
4844#ifdef CONFIG_KVM_MMIO
4845	case KVM_REGISTER_COALESCED_MMIO: {
4846		struct kvm_coalesced_mmio_zone zone;
4847
4848		r = -EFAULT;
4849		if (copy_from_user(&zone, argp, sizeof(zone)))
4850			goto out;
4851		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4852		break;
4853	}
4854	case KVM_UNREGISTER_COALESCED_MMIO: {
4855		struct kvm_coalesced_mmio_zone zone;
4856
4857		r = -EFAULT;
4858		if (copy_from_user(&zone, argp, sizeof(zone)))
4859			goto out;
4860		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4861		break;
4862	}
4863#endif
4864	case KVM_IRQFD: {
4865		struct kvm_irqfd data;
4866
4867		r = -EFAULT;
4868		if (copy_from_user(&data, argp, sizeof(data)))
4869			goto out;
4870		r = kvm_irqfd(kvm, &data);
4871		break;
4872	}
4873	case KVM_IOEVENTFD: {
4874		struct kvm_ioeventfd data;
4875
4876		r = -EFAULT;
4877		if (copy_from_user(&data, argp, sizeof(data)))
4878			goto out;
4879		r = kvm_ioeventfd(kvm, &data);
4880		break;
4881	}
4882#ifdef CONFIG_HAVE_KVM_MSI
4883	case KVM_SIGNAL_MSI: {
4884		struct kvm_msi msi;
4885
4886		r = -EFAULT;
4887		if (copy_from_user(&msi, argp, sizeof(msi)))
4888			goto out;
4889		r = kvm_send_userspace_msi(kvm, &msi);
4890		break;
4891	}
4892#endif
4893#ifdef __KVM_HAVE_IRQ_LINE
4894	case KVM_IRQ_LINE_STATUS:
4895	case KVM_IRQ_LINE: {
4896		struct kvm_irq_level irq_event;
4897
4898		r = -EFAULT;
4899		if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4900			goto out;
4901
4902		r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4903					ioctl == KVM_IRQ_LINE_STATUS);
4904		if (r)
4905			goto out;
4906
4907		r = -EFAULT;
4908		if (ioctl == KVM_IRQ_LINE_STATUS) {
4909			if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4910				goto out;
4911		}
4912
4913		r = 0;
4914		break;
4915	}
4916#endif
4917#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4918	case KVM_SET_GSI_ROUTING: {
4919		struct kvm_irq_routing routing;
4920		struct kvm_irq_routing __user *urouting;
4921		struct kvm_irq_routing_entry *entries = NULL;
4922
4923		r = -EFAULT;
4924		if (copy_from_user(&routing, argp, sizeof(routing)))
4925			goto out;
4926		r = -EINVAL;
4927		if (!kvm_arch_can_set_irq_routing(kvm))
4928			goto out;
4929		if (routing.nr > KVM_MAX_IRQ_ROUTES)
4930			goto out;
4931		if (routing.flags)
4932			goto out;
4933		if (routing.nr) {
4934			urouting = argp;
4935			entries = vmemdup_user(urouting->entries,
4936					       array_size(sizeof(*entries),
4937							  routing.nr));
4938			if (IS_ERR(entries)) {
4939				r = PTR_ERR(entries);
4940				goto out;
4941			}
4942		}
4943		r = kvm_set_irq_routing(kvm, entries, routing.nr,
4944					routing.flags);
4945		kvfree(entries);
4946		break;
4947	}
4948#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4949	case KVM_CREATE_DEVICE: {
4950		struct kvm_create_device cd;
4951
4952		r = -EFAULT;
4953		if (copy_from_user(&cd, argp, sizeof(cd)))
4954			goto out;
4955
4956		r = kvm_ioctl_create_device(kvm, &cd);
4957		if (r)
4958			goto out;
4959
4960		r = -EFAULT;
4961		if (copy_to_user(argp, &cd, sizeof(cd)))
4962			goto out;
4963
4964		r = 0;
4965		break;
4966	}
4967	case KVM_CHECK_EXTENSION:
4968		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4969		break;
4970	case KVM_RESET_DIRTY_RINGS:
4971		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4972		break;
4973	case KVM_GET_STATS_FD:
4974		r = kvm_vm_ioctl_get_stats_fd(kvm);
4975		break;
4976	default:
4977		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4978	}
4979out:
4980	return r;
4981}
4982
4983#ifdef CONFIG_KVM_COMPAT
4984struct compat_kvm_dirty_log {
4985	__u32 slot;
4986	__u32 padding1;
4987	union {
4988		compat_uptr_t dirty_bitmap; /* one bit per page */
4989		__u64 padding2;
4990	};
4991};
4992
4993struct compat_kvm_clear_dirty_log {
4994	__u32 slot;
4995	__u32 num_pages;
4996	__u64 first_page;
4997	union {
4998		compat_uptr_t dirty_bitmap; /* one bit per page */
4999		__u64 padding2;
5000	};
5001};
5002
5003long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5004				     unsigned long arg)
5005{
5006	return -ENOTTY;
5007}
5008
5009static long kvm_vm_compat_ioctl(struct file *filp,
5010			   unsigned int ioctl, unsigned long arg)
5011{
5012	struct kvm *kvm = filp->private_data;
5013	int r;
5014
5015	if (kvm->mm != current->mm || kvm->vm_dead)
5016		return -EIO;
5017
5018	r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5019	if (r != -ENOTTY)
5020		return r;
5021
5022	switch (ioctl) {
5023#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5024	case KVM_CLEAR_DIRTY_LOG: {
5025		struct compat_kvm_clear_dirty_log compat_log;
5026		struct kvm_clear_dirty_log log;
5027
5028		if (copy_from_user(&compat_log, (void __user *)arg,
5029				   sizeof(compat_log)))
5030			return -EFAULT;
5031		log.slot	 = compat_log.slot;
5032		log.num_pages	 = compat_log.num_pages;
5033		log.first_page	 = compat_log.first_page;
5034		log.padding2	 = compat_log.padding2;
5035		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5036
5037		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5038		break;
5039	}
5040#endif
5041	case KVM_GET_DIRTY_LOG: {
5042		struct compat_kvm_dirty_log compat_log;
5043		struct kvm_dirty_log log;
5044
5045		if (copy_from_user(&compat_log, (void __user *)arg,
5046				   sizeof(compat_log)))
5047			return -EFAULT;
5048		log.slot	 = compat_log.slot;
5049		log.padding1	 = compat_log.padding1;
5050		log.padding2	 = compat_log.padding2;
5051		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5052
5053		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5054		break;
5055	}
5056	default:
5057		r = kvm_vm_ioctl(filp, ioctl, arg);
5058	}
5059	return r;
5060}
5061#endif
5062
5063static const struct file_operations kvm_vm_fops = {
5064	.release        = kvm_vm_release,
5065	.unlocked_ioctl = kvm_vm_ioctl,
5066	.llseek		= noop_llseek,
5067	KVM_COMPAT(kvm_vm_compat_ioctl),
5068};
5069
5070bool file_is_kvm(struct file *file)
5071{
5072	return file && file->f_op == &kvm_vm_fops;
5073}
5074EXPORT_SYMBOL_GPL(file_is_kvm);
5075
5076static int kvm_dev_ioctl_create_vm(unsigned long type)
5077{
5078	char fdname[ITOA_MAX_LEN + 1];
5079	int r, fd;
5080	struct kvm *kvm;
5081	struct file *file;
5082
5083	fd = get_unused_fd_flags(O_CLOEXEC);
5084	if (fd < 0)
5085		return fd;
5086
5087	snprintf(fdname, sizeof(fdname), "%d", fd);
5088
5089	kvm = kvm_create_vm(type, fdname);
5090	if (IS_ERR(kvm)) {
5091		r = PTR_ERR(kvm);
5092		goto put_fd;
5093	}
5094
5095	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5096	if (IS_ERR(file)) {
5097		r = PTR_ERR(file);
5098		goto put_kvm;
5099	}
5100
5101	/*
5102	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
5103	 * already set, with ->release() being kvm_vm_release().  In error
5104	 * cases it will be called by the final fput(file) and will take
5105	 * care of doing kvm_put_kvm(kvm).
5106	 */
5107	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5108
5109	fd_install(fd, file);
5110	return fd;
5111
5112put_kvm:
5113	kvm_put_kvm(kvm);
5114put_fd:
5115	put_unused_fd(fd);
5116	return r;
5117}
5118
5119static long kvm_dev_ioctl(struct file *filp,
5120			  unsigned int ioctl, unsigned long arg)
5121{
5122	int r = -EINVAL;
5123
5124	switch (ioctl) {
5125	case KVM_GET_API_VERSION:
5126		if (arg)
5127			goto out;
5128		r = KVM_API_VERSION;
5129		break;
5130	case KVM_CREATE_VM:
5131		r = kvm_dev_ioctl_create_vm(arg);
5132		break;
5133	case KVM_CHECK_EXTENSION:
5134		r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5135		break;
5136	case KVM_GET_VCPU_MMAP_SIZE:
5137		if (arg)
5138			goto out;
5139		r = PAGE_SIZE;     /* struct kvm_run */
5140#ifdef CONFIG_X86
5141		r += PAGE_SIZE;    /* pio data page */
5142#endif
5143#ifdef CONFIG_KVM_MMIO
5144		r += PAGE_SIZE;    /* coalesced mmio ring page */
5145#endif
5146		break;
5147	case KVM_TRACE_ENABLE:
5148	case KVM_TRACE_PAUSE:
5149	case KVM_TRACE_DISABLE:
5150		r = -EOPNOTSUPP;
5151		break;
5152	default:
5153		return kvm_arch_dev_ioctl(filp, ioctl, arg);
5154	}
5155out:
5156	return r;
5157}
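
/*
 * A minimal userspace sketch of the system-level ioctls handled above
 * (illustrative only; error handling is omitted).  KVM_GET_API_VERSION is
 * expected to return KVM_API_VERSION (12), and the type argument to
 * KVM_CREATE_VM is 0 for the default machine type on most architectures:
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	int version = ioctl(kvm_fd, KVM_GET_API_VERSION, 0);
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 */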
5158
5159static struct file_operations kvm_chardev_ops = {
5160	.unlocked_ioctl = kvm_dev_ioctl,
5161	.llseek		= noop_llseek,
5162	KVM_COMPAT(kvm_dev_ioctl),
5163};
5164
5165static struct miscdevice kvm_dev = {
5166	KVM_MINOR,
5167	"kvm",
5168	&kvm_chardev_ops,
5169};
5170
5171#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5172__visible bool kvm_rebooting;
5173EXPORT_SYMBOL_GPL(kvm_rebooting);
5174
5175static DEFINE_PER_CPU(bool, hardware_enabled);
5176static int kvm_usage_count;
5177
5178static int __hardware_enable_nolock(void)
5179{
5180	if (__this_cpu_read(hardware_enabled))
5181		return 0;
5182
5183	if (kvm_arch_hardware_enable()) {
5184		pr_info("kvm: enabling virtualization on CPU%d failed\n",
5185			raw_smp_processor_id());
5186		return -EIO;
5187	}
5188
5189	__this_cpu_write(hardware_enabled, true);
5190	return 0;
5191}
5192
5193static void hardware_enable_nolock(void *failed)
5194{
5195	if (__hardware_enable_nolock())
5196		atomic_inc(failed);
5197}
5198
5199static int kvm_online_cpu(unsigned int cpu)
5200{
5201	int ret = 0;
5202
5203	/*
5204	 * Abort the CPU online process if hardware virtualization cannot
5205	 * be enabled. Otherwise running VMs would encounter unrecoverable
5206	 * errors when scheduled to this CPU.
5207	 */
5208	mutex_lock(&kvm_lock);
5209	if (kvm_usage_count)
5210		ret = __hardware_enable_nolock();
5211	mutex_unlock(&kvm_lock);
5212	return ret;
5213}
5214
5215static void hardware_disable_nolock(void *junk)
5216{
5217	/*
5218	 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
5219	 * hardware, not just CPUs that successfully enabled hardware!
5220	 */
5221	if (!__this_cpu_read(hardware_enabled))
5222		return;
5223
5224	kvm_arch_hardware_disable();
5225
5226	__this_cpu_write(hardware_enabled, false);
5227}
5228
5229static int kvm_offline_cpu(unsigned int cpu)
5230{
5231	mutex_lock(&kvm_lock);
5232	if (kvm_usage_count)
5233		hardware_disable_nolock(NULL);
5234	mutex_unlock(&kvm_lock);
5235	return 0;
5236}
5237
5238static void hardware_disable_all_nolock(void)
5239{
5240	BUG_ON(!kvm_usage_count);
5241
5242	kvm_usage_count--;
5243	if (!kvm_usage_count)
5244		on_each_cpu(hardware_disable_nolock, NULL, 1);
5245}
5246
5247static void hardware_disable_all(void)
5248{
5249	cpus_read_lock();
5250	mutex_lock(&kvm_lock);
5251	hardware_disable_all_nolock();
5252	mutex_unlock(&kvm_lock);
5253	cpus_read_unlock();
5254}
5255
5256static int hardware_enable_all(void)
5257{
5258	atomic_t failed = ATOMIC_INIT(0);
5259	int r;
5260
5261	/*
5262	 * Do not enable hardware virtualization if the system is going down.
5263	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5264	 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5265	 * after kvm_reboot() is called.  Note, this relies on system_state
5266	 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5267	 * hook instead of registering a dedicated reboot notifier (the latter
5268	 * runs before system_state is updated).
5269	 */
5270	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5271	    system_state == SYSTEM_RESTART)
5272		return -EBUSY;
5273
5274	/*
5275	 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5276	 * is called, and so on_each_cpu() between them includes the CPU that
5277	 * is being onlined.  As a result, hardware_enable_nolock() may get
5278	 * invoked before kvm_online_cpu(), which also enables hardware if the
5279	 * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
5280	 * enable hardware multiple times.
5281	 */
5282	cpus_read_lock();
5283	mutex_lock(&kvm_lock);
5284
5285	r = 0;
5286
5287	kvm_usage_count++;
5288	if (kvm_usage_count == 1) {
5289		on_each_cpu(hardware_enable_nolock, &failed, 1);
5290
5291		if (atomic_read(&failed)) {
5292			hardware_disable_all_nolock();
5293			r = -EBUSY;
5294		}
5295	}
5296
5297	mutex_unlock(&kvm_lock);
5298	cpus_read_unlock();
5299
5300	return r;
5301}
5302
5303static void kvm_shutdown(void)
5304{
5305	/*
5306	 * Disable hardware virtualization and set kvm_rebooting to indicate
5307	 * that KVM has asynchronously disabled hardware virtualization, i.e.
5308	 * that relevant errors and exceptions aren't entirely unexpected.
5309	 * Some flavors of hardware virtualization need to be disabled before
5310	 * transferring control to firmware (to perform shutdown/reboot), e.g.
5311	 * on x86, virtualization can block INIT interrupts, which are used by
5312	 * firmware to pull APs back under firmware control.  Note, this path
5313	 * is used for both shutdown and reboot scenarios, i.e. neither name is
5314	 * 100% comprehensive.
5315	 */
5316	pr_info("kvm: exiting hardware virtualization\n");
5317	kvm_rebooting = true;
5318	on_each_cpu(hardware_disable_nolock, NULL, 1);
5319}
5320
5321static int kvm_suspend(void)
5322{
5323	/*
5324	 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5325	 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
5326	 * is stable.  Assert that kvm_lock is not held to ensure the system
5327	 * isn't suspended while KVM is enabling hardware.  Hardware enabling
5328	 * can be preempted, but the task cannot be frozen until it has dropped
5329	 * all locks (userspace tasks are frozen via a fake signal).
5330	 */
5331	lockdep_assert_not_held(&kvm_lock);
5332	lockdep_assert_irqs_disabled();
5333
5334	if (kvm_usage_count)
5335		hardware_disable_nolock(NULL);
5336	return 0;
5337}
5338
5339static void kvm_resume(void)
5340{
5341	lockdep_assert_not_held(&kvm_lock);
5342	lockdep_assert_irqs_disabled();
5343
5344	if (kvm_usage_count)
5345		WARN_ON_ONCE(__hardware_enable_nolock());
5346}
5347
5348static struct syscore_ops kvm_syscore_ops = {
5349	.suspend = kvm_suspend,
5350	.resume = kvm_resume,
5351	.shutdown = kvm_shutdown,
5352};
5353#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5354static int hardware_enable_all(void)
5355{
5356	return 0;
5357}
5358
5359static void hardware_disable_all(void)
5360{
5361
5362}
5363#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5364
5365static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5366{
5367	if (dev->ops->destructor)
5368		dev->ops->destructor(dev);
5369}
5370
5371static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5372{
5373	int i;
5374
5375	for (i = 0; i < bus->dev_count; i++) {
5376		struct kvm_io_device *pos = bus->range[i].dev;
5377
5378		kvm_iodevice_destructor(pos);
5379	}
5380	kfree(bus);
5381}
5382
5383static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5384				 const struct kvm_io_range *r2)
5385{
5386	gpa_t addr1 = r1->addr;
5387	gpa_t addr2 = r2->addr;
5388
5389	if (addr1 < addr2)
5390		return -1;
5391
5392	/* If r2->len == 0, match the exact address.  If r2->len != 0,
5393	 * accept any overlapping write.  Any order is acceptable for
5394	 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5395	 * we process all of them.
5396	 */
5397	if (r2->len) {
5398		addr1 += r1->len;
5399		addr2 += r2->len;
5400	}
5401
5402	if (addr1 > addr2)
5403		return 1;
5404
5405	return 0;
5406}
5407
5408static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5409{
5410	return kvm_io_bus_cmp(p1, p2);
5411}
5412
5413static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5414			     gpa_t addr, int len)
5415{
5416	struct kvm_io_range *range, key;
5417	int off;
5418
5419	key = (struct kvm_io_range) {
5420		.addr = addr,
5421		.len = len,
5422	};
5423
5424	range = bsearch(&key, bus->range, bus->dev_count,
5425			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5426	if (range == NULL)
5427		return -ENOENT;
5428
5429	off = range - bus->range;
5430
5431	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5432		off--;
5433
5434	return off;
5435}
5436
5437static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5438			      struct kvm_io_range *range, const void *val)
5439{
5440	int idx;
5441
5442	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5443	if (idx < 0)
5444		return -EOPNOTSUPP;
5445
5446	while (idx < bus->dev_count &&
5447		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5448		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5449					range->len, val))
5450			return idx;
5451		idx++;
5452	}
5453
5454	return -EOPNOTSUPP;
5455}
5456
5457/* kvm_io_bus_write - called under kvm->slots_lock */
5458int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5459		     int len, const void *val)
5460{
5461	struct kvm_io_bus *bus;
5462	struct kvm_io_range range;
5463	int r;
5464
5465	range = (struct kvm_io_range) {
5466		.addr = addr,
5467		.len = len,
5468	};
5469
5470	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5471	if (!bus)
5472		return -ENOMEM;
5473	r = __kvm_io_bus_write(vcpu, bus, &range, val);
5474	return r < 0 ? r : 0;
5475}
5476EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5477
5478/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5479int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5480			    gpa_t addr, int len, const void *val, long cookie)
5481{
5482	struct kvm_io_bus *bus;
5483	struct kvm_io_range range;
5484
5485	range = (struct kvm_io_range) {
5486		.addr = addr,
5487		.len = len,
5488	};
5489
5490	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5491	if (!bus)
5492		return -ENOMEM;
5493
5494	/* First try the device referenced by cookie. */
5495	if ((cookie >= 0) && (cookie < bus->dev_count) &&
5496	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5497		if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5498					val))
5499			return cookie;
5500
5501	/*
5502	 * cookie contained garbage; fall back to search and return the
5503	 * correct cookie value.
5504	 */
5505	return __kvm_io_bus_write(vcpu, bus, &range, val);
5506}
5507
5508static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5509			     struct kvm_io_range *range, void *val)
5510{
5511	int idx;
5512
5513	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5514	if (idx < 0)
5515		return -EOPNOTSUPP;
5516
5517	while (idx < bus->dev_count &&
5518		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5519		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5520				       range->len, val))
5521			return idx;
5522		idx++;
5523	}
5524
5525	return -EOPNOTSUPP;
5526}
5527
5528/* kvm_io_bus_read - called under kvm->slots_lock */
5529int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5530		    int len, void *val)
5531{
5532	struct kvm_io_bus *bus;
5533	struct kvm_io_range range;
5534	int r;
5535
5536	range = (struct kvm_io_range) {
5537		.addr = addr,
5538		.len = len,
5539	};
5540
5541	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5542	if (!bus)
5543		return -ENOMEM;
5544	r = __kvm_io_bus_read(vcpu, bus, &range, val);
5545	return r < 0 ? r : 0;
5546}
5547
5548/* Caller must hold slots_lock. */
5549int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5550			    int len, struct kvm_io_device *dev)
5551{
5552	int i;
5553	struct kvm_io_bus *new_bus, *bus;
5554	struct kvm_io_range range;
5555
5556	bus = kvm_get_bus(kvm, bus_idx);
5557	if (!bus)
5558		return -ENOMEM;
5559
5560	/* Exclude ioeventfds, which are already limited by the maximum number of fds */
5561	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5562		return -ENOSPC;
5563
5564	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5565			  GFP_KERNEL_ACCOUNT);
5566	if (!new_bus)
5567		return -ENOMEM;
5568
5569	range = (struct kvm_io_range) {
5570		.addr = addr,
5571		.len = len,
5572		.dev = dev,
5573	};
5574
5575	for (i = 0; i < bus->dev_count; i++)
5576		if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5577			break;
5578
5579	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5580	new_bus->dev_count++;
5581	new_bus->range[i] = range;
5582	memcpy(new_bus->range + i + 1, bus->range + i,
5583		(bus->dev_count - i) * sizeof(struct kvm_io_range));
5584	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5585	synchronize_srcu_expedited(&kvm->srcu);
5586	kfree(bus);
5587
5588	return 0;
5589}
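
/*
 * A minimal in-kernel sketch of registering a device on an I/O bus via
 * kvm_io_bus_register_dev() (illustrative only; my_mmio_dev, my_ops,
 * my_mmio_read/my_mmio_write and the address/length are hypothetical).
 * The callbacks return 0 when they handle the access and a negative error
 * otherwise, and registration must be done under kvm->slots_lock:
 *
 *	static const struct kvm_io_device_ops my_ops = {
 *		.read  = my_mmio_read,
 *		.write = my_mmio_write,
 *	};
 *
 *	kvm_iodevice_init(&my_mmio_dev->dev, &my_ops);
 *	mutex_lock(&kvm->slots_lock);
 *	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, addr, len,
 *				      &my_mmio_dev->dev);
 *	mutex_unlock(&kvm->slots_lock);
 */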
5590
5591int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5592			      struct kvm_io_device *dev)
5593{
5594	int i;
5595	struct kvm_io_bus *new_bus, *bus;
5596
5597	lockdep_assert_held(&kvm->slots_lock);
5598
5599	bus = kvm_get_bus(kvm, bus_idx);
5600	if (!bus)
5601		return 0;
5602
5603	for (i = 0; i < bus->dev_count; i++) {
5604		if (bus->range[i].dev == dev) {
5605			break;
5606		}
5607	}
5608
5609	if (i == bus->dev_count)
5610		return 0;
5611
5612	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5613			  GFP_KERNEL_ACCOUNT);
5614	if (new_bus) {
5615		memcpy(new_bus, bus, struct_size(bus, range, i));
5616		new_bus->dev_count--;
5617		memcpy(new_bus->range + i, bus->range + i + 1,
5618				flex_array_size(new_bus, range, new_bus->dev_count - i));
5619	}
5620
5621	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5622	synchronize_srcu_expedited(&kvm->srcu);
5623
5624	/*
5625	 * If NULL bus is installed, destroy the old bus, including all the
5626	 * attached devices. Otherwise, destroy the caller's device only.
5627	 */
5628	if (!new_bus) {
5629		pr_err("kvm: failed to shrink bus, removing it completely\n");
5630		kvm_io_bus_destroy(bus);
5631		return -ENOMEM;
5632	}
5633
5634	kvm_iodevice_destructor(dev);
5635	kfree(bus);
5636	return 0;
5637}
5638
5639struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5640					 gpa_t addr)
5641{
5642	struct kvm_io_bus *bus;
5643	int dev_idx, srcu_idx;
5644	struct kvm_io_device *iodev = NULL;
5645
5646	srcu_idx = srcu_read_lock(&kvm->srcu);
5647
5648	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5649	if (!bus)
5650		goto out_unlock;
5651
5652	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5653	if (dev_idx < 0)
5654		goto out_unlock;
5655
5656	iodev = bus->range[dev_idx].dev;
5657
5658out_unlock:
5659	srcu_read_unlock(&kvm->srcu, srcu_idx);
5660
5661	return iodev;
5662}
5663EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5664
5665static int kvm_debugfs_open(struct inode *inode, struct file *file,
5666			   int (*get)(void *, u64 *), int (*set)(void *, u64),
5667			   const char *fmt)
5668{
5669	int ret;
5670	struct kvm_stat_data *stat_data = inode->i_private;
5671
5672	/*
5673	 * The debugfs files are a reference to the kvm struct which
5674	 * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5675	 * avoids the race between open and the removal of the debugfs directory.
5676	 */
5677	if (!kvm_get_kvm_safe(stat_data->kvm))
5678		return -ENOENT;
5679
5680	ret = simple_attr_open(inode, file, get,
5681			       kvm_stats_debugfs_mode(stat_data->desc) & 0222
5682			       ? set : NULL, fmt);
5683	if (ret)
5684		kvm_put_kvm(stat_data->kvm);
5685
5686	return ret;
5687}
5688
5689static int kvm_debugfs_release(struct inode *inode, struct file *file)
5690{
5691	struct kvm_stat_data *stat_data = inode->i_private;
5692
5693	simple_attr_release(inode, file);
5694	kvm_put_kvm(stat_data->kvm);
5695
5696	return 0;
5697}
5698
5699static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5700{
5701	*val = *(u64 *)((void *)(&kvm->stat) + offset);
5702
5703	return 0;
5704}
5705
5706static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5707{
5708	*(u64 *)((void *)(&kvm->stat) + offset) = 0;
5709
5710	return 0;
5711}
5712
5713static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5714{
5715	unsigned long i;
5716	struct kvm_vcpu *vcpu;
5717
5718	*val = 0;
5719
5720	kvm_for_each_vcpu(i, vcpu, kvm)
5721		*val += *(u64 *)((void *)(&vcpu->stat) + offset);
5722
5723	return 0;
5724}
5725
5726static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5727{
5728	unsigned long i;
5729	struct kvm_vcpu *vcpu;
5730
5731	kvm_for_each_vcpu(i, vcpu, kvm)
5732		*(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5733
5734	return 0;
5735}
5736
5737static int kvm_stat_data_get(void *data, u64 *val)
5738{
5739	int r = -EFAULT;
5740	struct kvm_stat_data *stat_data = data;
5741
5742	switch (stat_data->kind) {
5743	case KVM_STAT_VM:
5744		r = kvm_get_stat_per_vm(stat_data->kvm,
5745					stat_data->desc->desc.offset, val);
5746		break;
5747	case KVM_STAT_VCPU:
5748		r = kvm_get_stat_per_vcpu(stat_data->kvm,
5749					  stat_data->desc->desc.offset, val);
5750		break;
5751	}
5752
5753	return r;
5754}
5755
5756static int kvm_stat_data_clear(void *data, u64 val)
5757{
5758	int r = -EFAULT;
5759	struct kvm_stat_data *stat_data = data;
5760
5761	if (val)
5762		return -EINVAL;
5763
5764	switch (stat_data->kind) {
5765	case KVM_STAT_VM:
5766		r = kvm_clear_stat_per_vm(stat_data->kvm,
5767					  stat_data->desc->desc.offset);
5768		break;
5769	case KVM_STAT_VCPU:
5770		r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5771					    stat_data->desc->desc.offset);
5772		break;
5773	}
5774
5775	return r;
5776}
5777
5778static int kvm_stat_data_open(struct inode *inode, struct file *file)
5779{
5780	__simple_attr_check_format("%llu\n", 0ull);
5781	return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5782				kvm_stat_data_clear, "%llu\n");
5783}
5784
5785static const struct file_operations stat_fops_per_vm = {
5786	.owner = THIS_MODULE,
5787	.open = kvm_stat_data_open,
5788	.release = kvm_debugfs_release,
5789	.read = simple_attr_read,
5790	.write = simple_attr_write,
5791	.llseek = no_llseek,
5792};
5793
5794static int vm_stat_get(void *_offset, u64 *val)
5795{
5796	unsigned offset = (long)_offset;
5797	struct kvm *kvm;
5798	u64 tmp_val;
5799
5800	*val = 0;
5801	mutex_lock(&kvm_lock);
5802	list_for_each_entry(kvm, &vm_list, vm_list) {
5803		kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5804		*val += tmp_val;
5805	}
5806	mutex_unlock(&kvm_lock);
5807	return 0;
5808}
5809
5810static int vm_stat_clear(void *_offset, u64 val)
5811{
5812	unsigned offset = (long)_offset;
5813	struct kvm *kvm;
5814
5815	if (val)
5816		return -EINVAL;
5817
5818	mutex_lock(&kvm_lock);
5819	list_for_each_entry(kvm, &vm_list, vm_list) {
5820		kvm_clear_stat_per_vm(kvm, offset);
5821	}
5822	mutex_unlock(&kvm_lock);
5823
5824	return 0;
5825}
5826
5827DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5828DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5829
5830static int vcpu_stat_get(void *_offset, u64 *val)
5831{
5832	unsigned offset = (long)_offset;
5833	struct kvm *kvm;
5834	u64 tmp_val;
5835
5836	*val = 0;
5837	mutex_lock(&kvm_lock);
5838	list_for_each_entry(kvm, &vm_list, vm_list) {
5839		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5840		*val += tmp_val;
5841	}
5842	mutex_unlock(&kvm_lock);
5843	return 0;
5844}
5845
5846static int vcpu_stat_clear(void *_offset, u64 val)
5847{
5848	unsigned offset = (long)_offset;
5849	struct kvm *kvm;
5850
5851	if (val)
5852		return -EINVAL;
5853
5854	mutex_lock(&kvm_lock);
5855	list_for_each_entry(kvm, &vm_list, vm_list) {
5856		kvm_clear_stat_per_vcpu(kvm, offset);
5857	}
5858	mutex_unlock(&kvm_lock);
5859
5860	return 0;
5861}
5862
5863DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5864			"%llu\n");
5865DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5866
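/*
 * Emit a KOBJ_CHANGE uevent on the kvm misc device when a VM is created or
 * destroyed.  The environment carries at most five keys: CREATED (VMs ever
 * created), COUNT (currently active VMs), EVENT ("create" or "destroy"),
 * PID (the owning userspace task) and, when available, STATS_PATH (the VM's
 * debugfs directory).
 */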
5867static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5868{
5869	struct kobj_uevent_env *env;
5870	unsigned long long created, active;
5871
5872	if (!kvm_dev.this_device || !kvm)
5873		return;
5874
5875	mutex_lock(&kvm_lock);
5876	if (type == KVM_EVENT_CREATE_VM) {
5877		kvm_createvm_count++;
5878		kvm_active_vms++;
5879	} else if (type == KVM_EVENT_DESTROY_VM) {
5880		kvm_active_vms--;
5881	}
5882	created = kvm_createvm_count;
5883	active = kvm_active_vms;
5884	mutex_unlock(&kvm_lock);
5885
5886	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5887	if (!env)
5888		return;
5889
5890	add_uevent_var(env, "CREATED=%llu", created);
5891	add_uevent_var(env, "COUNT=%llu", active);
5892
5893	if (type == KVM_EVENT_CREATE_VM) {
5894		add_uevent_var(env, "EVENT=create");
5895		kvm->userspace_pid = task_pid_nr(current);
5896	} else if (type == KVM_EVENT_DESTROY_VM) {
5897		add_uevent_var(env, "EVENT=destroy");
5898	}
5899	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5900
5901	if (!IS_ERR(kvm->debugfs_dentry)) {
5902		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5903
5904		if (p) {
5905			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5906			if (!IS_ERR(tmp))
5907				add_uevent_var(env, "STATS_PATH=%s", tmp);
5908			kfree(p);
5909		}
5910	}
5911	/* no need for checks, since we are adding at most 5 keys */
5912	env->envp[env->envp_idx++] = NULL;
5913	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5914	kfree(env);
5915}
5916
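/*
 * Create the global stat files under /sys/kernel/debug/kvm from the VM and
 * vCPU stats descriptors.  Descriptors whose mode has a write bit get the
 * clearing fops, the rest get the *_readonly variants; the descriptor's
 * offset is passed as the file's private data for the get/clear callbacks
 * above.
 */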
5917static void kvm_init_debug(void)
5918{
5919	const struct file_operations *fops;
5920	const struct _kvm_stats_desc *pdesc;
5921	int i;
5922
5923	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5924
5925	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5926		pdesc = &kvm_vm_stats_desc[i];
5927		if (kvm_stats_debugfs_mode(pdesc) & 0222)
5928			fops = &vm_stat_fops;
5929		else
5930			fops = &vm_stat_readonly_fops;
5931		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5932				kvm_debugfs_dir,
5933				(void *)(long)pdesc->desc.offset, fops);
5934	}
5935
5936	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5937		pdesc = &kvm_vcpu_stats_desc[i];
5938		if (kvm_stats_debugfs_mode(pdesc) & 0222)
5939			fops = &vcpu_stat_fops;
5940		else
5941			fops = &vcpu_stat_readonly_fops;
5942		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5943				kvm_debugfs_dir,
5944				(void *)(long)pdesc->desc.offset, fops);
5945	}
5946}
5947
5948static inline
5949struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5950{
5951	return container_of(pn, struct kvm_vcpu, preempt_notifier);
5952}
5953
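/*
 * Preempt notifier hooks: the scheduler invokes these around every context
 * switch of a task with a registered notifier (vCPU tasks register theirs
 * when the vCPU is loaded), which is how kvm_running_vcpu tracks the vCPU
 * currently loaded on each physical CPU.
 */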
5954static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5955{
5956	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5957
5958	WRITE_ONCE(vcpu->preempted, false);
5959	WRITE_ONCE(vcpu->ready, false);
5960
5961	__this_cpu_write(kvm_running_vcpu, vcpu);
5962	kvm_arch_sched_in(vcpu, cpu);
5963	kvm_arch_vcpu_load(vcpu, cpu);
5964}
5965
5966static void kvm_sched_out(struct preempt_notifier *pn,
5967			  struct task_struct *next)
5968{
5969	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5970
5971	if (current->on_rq) {
5972		WRITE_ONCE(vcpu->preempted, true);
5973		WRITE_ONCE(vcpu->ready, true);
5974	}
5975	kvm_arch_vcpu_put(vcpu);
5976	__this_cpu_write(kvm_running_vcpu, NULL);
5977}
5978
5979/**
5980 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5981 *
5982 * We can disable preemption locally around accessing the per-CPU variable,
5983 * and use the resolved vcpu pointer after enabling preemption again,
5984 * because even if the current thread is migrated to another CPU, reading
5985 * the per-CPU value later will give us the same value as we update the
5986 * the per-CPU value later will give us the same value, since we update the
5987 * per-CPU variable in the preempt notifier handlers.
5988struct kvm_vcpu *kvm_get_running_vcpu(void)
5989{
5990	struct kvm_vcpu *vcpu;
5991
5992	preempt_disable();
5993	vcpu = __this_cpu_read(kvm_running_vcpu);
5994	preempt_enable();
5995
5996	return vcpu;
5997}
5998EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5999
6000/**
6001 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6002 */
6003struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
6004{
6005	return &kvm_running_vcpu;
6006}
6007
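/*
 * Guest PMI attribution for perf: with CONFIG_GUEST_PERF_EVENTS, perf
 * consults these callbacks from its interrupt handlers to decide whether a
 * sample was taken while a guest was running and, if so, which guest IP to
 * record.  kvm_guest_state() relies on kvm_get_running_vcpu() above, and
 * architecture code supplies the optional Intel PT interrupt handler via
 * kvm_register_perf_callbacks().
 */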
6008#ifdef CONFIG_GUEST_PERF_EVENTS
6009static unsigned int kvm_guest_state(void)
6010{
6011	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6012	unsigned int state;
6013
6014	if (!kvm_arch_pmi_in_guest(vcpu))
6015		return 0;
6016
6017	state = PERF_GUEST_ACTIVE;
6018	if (!kvm_arch_vcpu_in_kernel(vcpu))
6019		state |= PERF_GUEST_USER;
6020
6021	return state;
6022}
6023
6024static unsigned long kvm_guest_get_ip(void)
6025{
6026	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6027
6028	/* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6029	if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6030		return 0;
6031
6032	return kvm_arch_vcpu_get_ip(vcpu);
6033}
6034
6035static struct perf_guest_info_callbacks kvm_guest_cbs = {
6036	.state			= kvm_guest_state,
6037	.get_ip			= kvm_guest_get_ip,
6038	.handle_intel_pt_intr	= NULL,
6039};
6040
6041void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6042{
6043	kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6044	perf_register_guest_info_callbacks(&kvm_guest_cbs);
6045}
6046void kvm_unregister_perf_callbacks(void)
6047{
6048	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6049}
6050#endif
6051
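/*
 * Module-wide initialization, called from architecture module init code once
 * all arch setup is complete.  A rough sketch of a caller (struct name
 * illustrative):
 *
 *	static int __init my_arch_kvm_init(void)
 *	{
 *		... arch-specific setup ...
 *		return kvm_init(sizeof(struct my_arch_vcpu),
 *				__alignof__(struct my_arch_vcpu), THIS_MODULE);
 *	}
 *	module_init(my_arch_kvm_init);
 *
 * vcpu_size/vcpu_align describe the arch's containing vCPU structure so that
 * the kmem cache created below allocates it with the right size/alignment.
 */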
6052int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6053{
6054	int r;
6055	int cpu;
6056
6057#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6058	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6059				      kvm_online_cpu, kvm_offline_cpu);
6060	if (r)
6061		return r;
6062
6063	register_syscore_ops(&kvm_syscore_ops);
6064#endif
6065
6066	/* A kmem cache lets us meet the alignment requirements of fx_save. */
6067	if (!vcpu_align)
6068		vcpu_align = __alignof__(struct kvm_vcpu);
6069	kvm_vcpu_cache =
6070		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6071					   SLAB_ACCOUNT,
6072					   offsetof(struct kvm_vcpu, arch),
6073					   offsetofend(struct kvm_vcpu, stats_id)
6074					   - offsetof(struct kvm_vcpu, arch),
6075					   NULL);
6076	if (!kvm_vcpu_cache) {
6077		r = -ENOMEM;
6078		goto err_vcpu_cache;
6079	}
6080
6081	for_each_possible_cpu(cpu) {
6082		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6083					    GFP_KERNEL, cpu_to_node(cpu))) {
6084			r = -ENOMEM;
6085			goto err_cpu_kick_mask;
6086		}
6087	}
6088
6089	r = kvm_irqfd_init();
6090	if (r)
6091		goto err_irqfd;
6092
6093	r = kvm_async_pf_init();
6094	if (r)
6095		goto err_async_pf;
6096
6097	kvm_chardev_ops.owner = module;
6098
6099	kvm_preempt_ops.sched_in = kvm_sched_in;
6100	kvm_preempt_ops.sched_out = kvm_sched_out;
6101
6102	kvm_init_debug();
6103
6104	r = kvm_vfio_ops_init();
6105	if (WARN_ON_ONCE(r))
6106		goto err_vfio;
6107
6108	/*
6109	 * Registration _must_ be the very last thing done, as this exposes
6110	 * /dev/kvm to userspace, i.e. all infrastructure must be set up!
6111	 */
6112	r = misc_register(&kvm_dev);
6113	if (r) {
6114		pr_err("kvm: misc device register failed\n");
6115		goto err_register;
6116	}
6117
6118	return 0;
6119
6120err_register:
6121	kvm_vfio_ops_exit();
6122err_vfio:
6123	kvm_async_pf_deinit();
6124err_async_pf:
6125	kvm_irqfd_exit();
6126err_irqfd:
6127err_cpu_kick_mask:
6128	for_each_possible_cpu(cpu)
6129		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6130	kmem_cache_destroy(kvm_vcpu_cache);
6131err_vcpu_cache:
6132#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6133	unregister_syscore_ops(&kvm_syscore_ops);
6134	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6135#endif
6136	return r;
6137}
6138EXPORT_SYMBOL_GPL(kvm_init);
6139
6140void kvm_exit(void)
6141{
6142	int cpu;
6143
6144	/*
6145	 * Note, unregistering /dev/kvm doesn't strictly need to come first;
6146	 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6147	 * to KVM while the module is being stopped.
6148	 */
6149	misc_deregister(&kvm_dev);
6150
6151	debugfs_remove_recursive(kvm_debugfs_dir);
6152	for_each_possible_cpu(cpu)
6153		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6154	kmem_cache_destroy(kvm_vcpu_cache);
6155	kvm_vfio_ops_exit();
6156	kvm_async_pf_deinit();
6157#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6158	unregister_syscore_ops(&kvm_syscore_ops);
6159	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6160#endif
6161	kvm_irqfd_exit();
6162}
6163EXPORT_SYMBOL_GPL(kvm_exit);
6164
6165struct kvm_vm_worker_thread_context {
6166	struct kvm *kvm;
6167	struct task_struct *parent;
6168	struct completion init_done;
6169	kvm_vm_thread_fn_t thread_fn;
6170	uintptr_t data;
6171	int err;
6172};
6173
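/*
 * Body of a VM worker kthread.  Setup handshake with the spawner: mark itself
 * to be parked, join the spawner's cgroups and inherit its nice level, report
 * the result via init_context and complete init_done, then park in
 * kthread_parkme() until the spawner unparks (or stops) it before running
 * thread_fn().  On exit the kthread moves back to its original cgroup, see
 * the comment at the 'out' label.
 */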
6174static int kvm_vm_worker_thread(void *context)
6175{
6176	/*
6177	 * The init_context is allocated on the stack of the parent thread, so
6178	 * we have to locally copy anything that is needed beyond initialization.
6179	 */
6180	struct kvm_vm_worker_thread_context *init_context = context;
6181	struct task_struct *parent;
6182	struct kvm *kvm = init_context->kvm;
6183	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6184	uintptr_t data = init_context->data;
6185	int err;
6186
6187	err = kthread_park(current);
6188	/* kthread_park(current) is never supposed to return an error */
6189	WARN_ON(err != 0);
6190	if (err)
6191		goto init_complete;
6192
6193	err = cgroup_attach_task_all(init_context->parent, current);
6194	if (err) {
6195		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6196			__func__, err);
6197		goto init_complete;
6198	}
6199
6200	set_user_nice(current, task_nice(init_context->parent));
6201
6202init_complete:
6203	init_context->err = err;
6204	complete(&init_context->init_done);
6205	init_context = NULL;
6206
6207	if (err)
6208		goto out;
6209
6210	/* Wait to be woken up by the spawner before proceeding. */
6211	kthread_parkme();
6212
6213	if (!kthread_should_stop())
6214		err = thread_fn(kvm, data);
6215
6216out:
6217	/*
6218	 * Move the kthread back to its original cgroup to prevent it from
6219	 * lingering in the cgroup of the VM process after the latter finishes its
6220	 * execution.
6221	 *
6222	 * kthread_stop() waits on the 'exited' completion condition which is
6223	 * set in exit_mm(), via mm_release(), in do_exit(). However, the
6224	 * kthread is removed from the cgroup in the cgroup_exit() which is
6225	 * kthread is removed from the cgroup in cgroup_exit(), which is
6226	 * called after exit_mm(). This causes kthread_stop() to return
6227	 */
6228	rcu_read_lock();
6229	parent = rcu_dereference(current->real_parent);
6230	get_task_struct(parent);
6231	rcu_read_unlock();
6232	cgroup_attach_task_all(parent, current);
6233	put_task_struct(parent);
6234
6235	return err;
6236}
6237
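/*
 * Spawn a VM-bound worker kthread and hand it to the caller only if its setup
 * succeeded.  Rough usage sketch (names illustrative; x86 uses this for its
 * NX huge page recovery worker):
 *
 *	struct task_struct *worker;
 *	int r = kvm_vm_create_worker_thread(kvm, my_worker_fn, 0,
 *					    "kvm-my-worker", &worker);
 *	if (!r)
 *		kthread_unpark(worker);
 *
 * The new thread remains parked until the caller unparks it.
 */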
6238int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6239				uintptr_t data, const char *name,
6240				struct task_struct **thread_ptr)
6241{
6242	struct kvm_vm_worker_thread_context init_context = {};
6243	struct task_struct *thread;
6244
6245	*thread_ptr = NULL;
6246	init_context.kvm = kvm;
6247	init_context.parent = current;
6248	init_context.thread_fn = thread_fn;
6249	init_context.data = data;
6250	init_completion(&init_context.init_done);
6251
6252	thread = kthread_run(kvm_vm_worker_thread, &init_context,
6253			     "%s-%d", name, task_pid_nr(current));
6254	if (IS_ERR(thread))
6255		return PTR_ERR(thread);
6256
6257	/* kthread_run is never supposed to return NULL */
6258	WARN_ON(thread == NULL);
6259
6260	wait_for_completion(&init_context.init_done);
6261
6262	if (!init_context.err)
6263		*thread_ptr = thread;
6264
6265	return init_context.err;
6266}
6267