1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 16#include <kvm/iodev.h> 17 18#include <linux/kvm_host.h> 19#include <linux/kvm.h> 20#include <linux/module.h> 21#include <linux/errno.h> 22#include <linux/percpu.h> 23#include <linux/mm.h> 24#include <linux/miscdevice.h> 25#include <linux/vmalloc.h> 26#include <linux/reboot.h> 27#include <linux/debugfs.h> 28#include <linux/highmem.h> 29#include <linux/file.h> 30#include <linux/syscore_ops.h> 31#include <linux/cpu.h> 32#include <linux/sched/signal.h> 33#include <linux/sched/mm.h> 34#include <linux/sched/stat.h> 35#include <linux/cpumask.h> 36#include <linux/smp.h> 37#include <linux/anon_inodes.h> 38#include <linux/profile.h> 39#include <linux/kvm_para.h> 40#include <linux/pagemap.h> 41#include <linux/mman.h> 42#include <linux/swap.h> 43#include <linux/bitops.h> 44#include <linux/spinlock.h> 45#include <linux/compat.h> 46#include <linux/srcu.h> 47#include <linux/hugetlb.h> 48#include <linux/slab.h> 49#include <linux/sort.h> 50#include <linux/bsearch.h> 51#include <linux/io.h> 52#include <linux/lockdep.h> 53#include <linux/kthread.h> 54 55#include <asm/processor.h> 56#include <asm/ioctl.h> 57#include <linux/uaccess.h> 58 59#include "coalesced_mmio.h" 60#include "async_pf.h" 61#include "vfio.h" 62 63#define CREATE_TRACE_POINTS 64#include <trace/events/kvm.h> 65 66/* Worst case buffer size needed for holding an integer. */ 67#define ITOA_MAX_LEN 12 68 69MODULE_AUTHOR("Qumranet"); 70MODULE_LICENSE("GPL"); 71 72/* Architectures should define their poll value according to the halt latency */ 73unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 74module_param(halt_poll_ns, uint, 0644); 75EXPORT_SYMBOL_GPL(halt_poll_ns); 76 77/* Default doubles per-vcpu halt_poll_ns. */ 78unsigned int halt_poll_ns_grow = 2; 79module_param(halt_poll_ns_grow, uint, 0644); 80EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 81 82/* The start value to grow halt_poll_ns from */ 83unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ 84module_param(halt_poll_ns_grow_start, uint, 0644); 85EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); 86 87/* Default resets per-vcpu halt_poll_ns . 
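 * A halt_poll_ns_shrink of 0 (the default) makes the shrink path reset the
 * per-vcpu halt_poll_ns to 0 instead of dividing it.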
*/ 88unsigned int halt_poll_ns_shrink; 89module_param(halt_poll_ns_shrink, uint, 0644); 90EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 91 92/* 93 * Ordering of locks: 94 * 95 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 96 */ 97 98DEFINE_MUTEX(kvm_lock); 99static DEFINE_RAW_SPINLOCK(kvm_count_lock); 100LIST_HEAD(vm_list); 101 102static cpumask_var_t cpus_hardware_enabled; 103static int kvm_usage_count; 104static atomic_t hardware_enable_failed; 105 106static struct kmem_cache *kvm_vcpu_cache; 107 108static __read_mostly struct preempt_ops kvm_preempt_ops; 109static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); 110 111struct dentry *kvm_debugfs_dir; 112EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 113 114static int kvm_debugfs_num_entries; 115static const struct file_operations stat_fops_per_vm; 116 117static struct file_operations kvm_chardev_ops; 118 119static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 120 unsigned long arg); 121#ifdef CONFIG_KVM_COMPAT 122static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 123 unsigned long arg); 124#define KVM_COMPAT(c) .compat_ioctl = (c) 125#else 126/* 127 * For architectures that don't implement a compat infrastructure, 128 * adopt a double line of defense: 129 * - Prevent a compat task from opening /dev/kvm 130 * - If the open has been done by a 64bit task, and the KVM fd 131 * passed to a compat task, let the ioctls fail. 132 */ 133static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, 134 unsigned long arg) { return -EINVAL; } 135 136static int kvm_no_compat_open(struct inode *inode, struct file *file) 137{ 138 return is_compat_task() ? -ENODEV : 0; 139} 140#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ 141 .open = kvm_no_compat_open 142#endif 143static int hardware_enable_all(void); 144static void hardware_disable_all(void); 145 146static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 147 148__visible bool kvm_rebooting; 149EXPORT_SYMBOL_GPL(kvm_rebooting); 150 151#define KVM_EVENT_CREATE_VM 0 152#define KVM_EVENT_DESTROY_VM 1 153static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 154static unsigned long long kvm_createvm_count; 155static unsigned long long kvm_active_vms; 156 157static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask); 158 159__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 160 unsigned long start, unsigned long end) 161{ 162} 163 164__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) 165{ 166} 167 168bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) 169{ 170 /* 171 * The metadata used by is_zone_device_page() to determine whether or 172 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if 173 * the device has been pinned, e.g. by get_user_pages(). WARN if the 174 * page_count() is zero to help detect bad usage of this helper. 175 */ 176 if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) 177 return false; 178 179 return is_zone_device_page(pfn_to_page(pfn)); 180} 181 182bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 183{ 184 /* 185 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting 186 * perspective they are "normal" pages, albeit with slightly different 187 * usage rules. 
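 * Hence the check below reports ZONE_DEVICE pages (and the zero page) as not
 * reserved even when PG_reserved is set on them.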
188 */ 189 if (pfn_valid(pfn)) 190 return PageReserved(pfn_to_page(pfn)) && 191 !is_zero_pfn(pfn) && 192 !kvm_is_zone_device_pfn(pfn); 193 194 return true; 195} 196 197bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) 198{ 199 struct page *page = pfn_to_page(pfn); 200 201 if (!PageTransCompoundMap(page)) 202 return false; 203 204 return is_transparent_hugepage(compound_head(page)); 205} 206 207/* 208 * Switches to specified vcpu, until a matching vcpu_put() 209 */ 210void vcpu_load(struct kvm_vcpu *vcpu) 211{ 212 int cpu = get_cpu(); 213 214 __this_cpu_write(kvm_running_vcpu, vcpu); 215 preempt_notifier_register(&vcpu->preempt_notifier); 216 kvm_arch_vcpu_load(vcpu, cpu); 217 put_cpu(); 218} 219EXPORT_SYMBOL_GPL(vcpu_load); 220 221void vcpu_put(struct kvm_vcpu *vcpu) 222{ 223 preempt_disable(); 224 kvm_arch_vcpu_put(vcpu); 225 preempt_notifier_unregister(&vcpu->preempt_notifier); 226 __this_cpu_write(kvm_running_vcpu, NULL); 227 preempt_enable(); 228} 229EXPORT_SYMBOL_GPL(vcpu_put); 230 231/* TODO: merge with kvm_arch_vcpu_should_kick */ 232static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 233{ 234 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 235 236 /* 237 * We need to wait for the VCPU to reenable interrupts and get out of 238 * READING_SHADOW_PAGE_TABLES mode. 239 */ 240 if (req & KVM_REQUEST_WAIT) 241 return mode != OUTSIDE_GUEST_MODE; 242 243 /* 244 * Need to kick a running VCPU, but otherwise there is nothing to do. 245 */ 246 return mode == IN_GUEST_MODE; 247} 248 249static void ack_flush(void *_completed) 250{ 251} 252 253static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait) 254{ 255 const struct cpumask *cpus; 256 257 if (likely(cpumask_available(tmp))) 258 cpus = tmp; 259 else 260 cpus = cpu_online_mask; 261 262 if (cpumask_empty(cpus)) 263 return false; 264 265 smp_call_function_many(cpus, ack_flush, NULL, wait); 266 return true; 267} 268 269static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu, 270 unsigned int req, cpumask_var_t tmp, 271 int current_cpu) 272{ 273 int cpu; 274 275 kvm_make_request(req, vcpu); 276 277 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 278 return; 279 280 /* 281 * tmp can be "unavailable" if cpumasks are allocated off stack as 282 * allocation of the mask is deliberately not fatal and is handled by 283 * falling back to kicking all online CPUs. 284 */ 285 if (!cpumask_available(tmp)) 286 return; 287 288 /* 289 * Note, the vCPU could get migrated to a different pCPU at any point 290 * after kvm_request_needs_ipi(), which could result in sending an IPI 291 * to the previous pCPU. But, that's OK because the purpose of the IPI 292 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is 293 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES 294 * after this point is also OK, as the requirement is only that KVM wait 295 * for vCPUs that were reading SPTEs _before_ any changes were 296 * finalized. See kvm_vcpu_kick() for more details on handling requests. 
297 */ 298 if (kvm_request_needs_ipi(vcpu, req)) { 299 cpu = READ_ONCE(vcpu->cpu); 300 if (cpu != -1 && cpu != current_cpu) 301 __cpumask_set_cpu(cpu, tmp); 302 } 303} 304 305bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, 306 struct kvm_vcpu *except, 307 unsigned long *vcpu_bitmap, cpumask_var_t tmp) 308{ 309 struct kvm_vcpu *vcpu; 310 int i, me; 311 bool called; 312 313 me = get_cpu(); 314 315 for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) { 316 vcpu = kvm_get_vcpu(kvm, i); 317 if (!vcpu || vcpu == except) 318 continue; 319 kvm_make_vcpu_request(kvm, vcpu, req, tmp, me); 320 } 321 322 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); 323 put_cpu(); 324 325 return called; 326} 327 328bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, 329 struct kvm_vcpu *except) 330{ 331 struct kvm_vcpu *vcpu; 332 struct cpumask *cpus; 333 bool called; 334 int i, me; 335 336 me = get_cpu(); 337 338 cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); 339 cpumask_clear(cpus); 340 341 kvm_for_each_vcpu(i, vcpu, kvm) { 342 if (vcpu == except) 343 continue; 344 kvm_make_vcpu_request(kvm, vcpu, req, cpus, me); 345 } 346 347 called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); 348 put_cpu(); 349 350 return called; 351} 352 353bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 354{ 355 return kvm_make_all_cpus_request_except(kvm, req, NULL); 356} 357 358#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 359void kvm_flush_remote_tlbs(struct kvm *kvm) 360{ 361 /* 362 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 363 * kvm_make_all_cpus_request. 364 */ 365 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 366 367 /* 368 * We want to publish modifications to the page tables before reading 369 * mode. Pairs with a memory barrier in arch-specific code. 370 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 371 * and smp_mb in walk_shadow_page_lockless_begin/end. 372 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 373 * 374 * There is already an smp_mb__after_atomic() before 375 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 376 * barrier here. 377 */ 378 if (!kvm_arch_flush_remote_tlb(kvm) 379 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 380 ++kvm->stat.remote_tlb_flush; 381 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 382} 383EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 384#endif 385 386void kvm_reload_remote_mmus(struct kvm *kvm) 387{ 388 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 389} 390 391static void kvm_flush_shadow_all(struct kvm *kvm) 392{ 393 kvm_arch_flush_shadow_all(kvm); 394 kvm_arch_guest_memory_reclaimed(kvm); 395} 396 397#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 398static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, 399 gfp_t gfp_flags) 400{ 401 gfp_flags |= mc->gfp_zero; 402 403 if (mc->kmem_cache) 404 return kmem_cache_alloc(mc->kmem_cache, gfp_flags); 405 else 406 return (void *)__get_free_page(gfp_flags); 407} 408 409int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) 410{ 411 void *obj; 412 413 if (mc->nobjs >= min) 414 return 0; 415 while (mc->nobjs < ARRAY_SIZE(mc->objects)) { 416 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT); 417 if (!obj) 418 return mc->nobjs >= min ? 
0 : -ENOMEM; 419 mc->objects[mc->nobjs++] = obj; 420 } 421 return 0; 422} 423 424int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) 425{ 426 return mc->nobjs; 427} 428 429void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 430{ 431 while (mc->nobjs) { 432 if (mc->kmem_cache) 433 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); 434 else 435 free_page((unsigned long)mc->objects[--mc->nobjs]); 436 } 437} 438 439void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) 440{ 441 void *p; 442 443 if (WARN_ON(!mc->nobjs)) 444 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); 445 else 446 p = mc->objects[--mc->nobjs]; 447 BUG_ON(!p); 448 return p; 449} 450#endif 451 452static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 453{ 454 mutex_init(&vcpu->mutex); 455 vcpu->cpu = -1; 456 vcpu->kvm = kvm; 457 vcpu->vcpu_id = id; 458 vcpu->pid = NULL; 459 rcuwait_init(&vcpu->wait); 460 kvm_async_pf_vcpu_init(vcpu); 461 462 vcpu->pre_pcpu = -1; 463 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 464 465 kvm_vcpu_set_in_spin_loop(vcpu, false); 466 kvm_vcpu_set_dy_eligible(vcpu, false); 467 vcpu->preempted = false; 468 vcpu->ready = false; 469 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 470} 471 472void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) 473{ 474 kvm_arch_vcpu_destroy(vcpu); 475 476 /* 477 * No need for rcu_read_lock as VCPU_RUN is the only place that changes 478 * the vcpu->pid pointer, and at destruction time all file descriptors 479 * are already gone. 480 */ 481 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 482 483 free_page((unsigned long)vcpu->run); 484 kmem_cache_free(kvm_vcpu_cache, vcpu); 485} 486EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); 487 488#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 489static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 490{ 491 return container_of(mn, struct kvm, mmu_notifier); 492} 493 494static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, 495 struct mm_struct *mm, 496 unsigned long start, unsigned long end) 497{ 498 struct kvm *kvm = mmu_notifier_to_kvm(mn); 499 int idx; 500 501 idx = srcu_read_lock(&kvm->srcu); 502 kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); 503 srcu_read_unlock(&kvm->srcu, idx); 504} 505 506static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 507 struct mm_struct *mm, 508 unsigned long address, 509 pte_t pte) 510{ 511 struct kvm *kvm = mmu_notifier_to_kvm(mn); 512 int idx; 513 514 idx = srcu_read_lock(&kvm->srcu); 515 spin_lock(&kvm->mmu_lock); 516 kvm->mmu_notifier_seq++; 517 518 if (kvm_set_spte_hva(kvm, address, pte)) 519 kvm_flush_remote_tlbs(kvm); 520 521 spin_unlock(&kvm->mmu_lock); 522 srcu_read_unlock(&kvm->srcu, idx); 523} 524 525static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 526 const struct mmu_notifier_range *range) 527{ 528 struct kvm *kvm = mmu_notifier_to_kvm(mn); 529 int need_tlb_flush = 0, idx; 530 531 idx = srcu_read_lock(&kvm->srcu); 532 spin_lock(&kvm->mmu_lock); 533 /* 534 * The count increase must become visible at unlock time as no 535 * spte can be established without taking the mmu_lock and 536 * count is also read inside the mmu_lock critical section. 
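 *
 * For reference, a fault path is expected to consume this together with
 * mmu_notifier_seq, roughly (a sketch of the usual pattern, not code from
 * this function):
 *
 *      mmu_seq = kvm->mmu_notifier_seq;
 *      smp_rmb();
 *      pfn = gfn_to_pfn(kvm, gfn);
 *      spin_lock(&kvm->mmu_lock);
 *      if (mmu_notifier_retry(kvm, mmu_seq))   // rechecks count and seq
 *              goto retry;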
537 */ 538 kvm->mmu_notifier_count++; 539 need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, 540 range->flags); 541 /* we've to flush the tlb before the pages can be freed */ 542 if (need_tlb_flush || kvm->tlbs_dirty) 543 kvm_flush_remote_tlbs(kvm); 544 545 spin_unlock(&kvm->mmu_lock); 546 kvm_arch_guest_memory_reclaimed(kvm); 547 srcu_read_unlock(&kvm->srcu, idx); 548 549 return 0; 550} 551 552static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 553 const struct mmu_notifier_range *range) 554{ 555 struct kvm *kvm = mmu_notifier_to_kvm(mn); 556 557 spin_lock(&kvm->mmu_lock); 558 /* 559 * This sequence increase will notify the kvm page fault that 560 * the page that is going to be mapped in the spte could have 561 * been freed. 562 */ 563 kvm->mmu_notifier_seq++; 564 smp_wmb(); 565 /* 566 * The above sequence increase must be visible before the 567 * below count decrease, which is ensured by the smp_wmb above 568 * in conjunction with the smp_rmb in mmu_notifier_retry(). 569 */ 570 kvm->mmu_notifier_count--; 571 spin_unlock(&kvm->mmu_lock); 572 573 BUG_ON(kvm->mmu_notifier_count < 0); 574} 575 576static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 577 struct mm_struct *mm, 578 unsigned long start, 579 unsigned long end) 580{ 581 struct kvm *kvm = mmu_notifier_to_kvm(mn); 582 int young, idx; 583 584 idx = srcu_read_lock(&kvm->srcu); 585 spin_lock(&kvm->mmu_lock); 586 587 young = kvm_age_hva(kvm, start, end); 588 if (young) 589 kvm_flush_remote_tlbs(kvm); 590 591 spin_unlock(&kvm->mmu_lock); 592 srcu_read_unlock(&kvm->srcu, idx); 593 594 return young; 595} 596 597static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 598 struct mm_struct *mm, 599 unsigned long start, 600 unsigned long end) 601{ 602 struct kvm *kvm = mmu_notifier_to_kvm(mn); 603 int young, idx; 604 605 idx = srcu_read_lock(&kvm->srcu); 606 spin_lock(&kvm->mmu_lock); 607 /* 608 * Even though we do not flush TLB, this will still adversely 609 * affect performance on pre-Haswell Intel EPT, where there is 610 * no EPT Access Bit to clear so that we have to tear down EPT 611 * tables instead. If we find this unacceptable, we can always 612 * add a parameter to kvm_age_hva so that it effectively doesn't 613 * do anything on clear_young. 614 * 615 * Also note that currently we never issue secondary TLB flushes 616 * from clear_young, leaving this job up to the regular system 617 * cadence. If we find this inaccurate, we might come up with a 618 * more sophisticated heuristic later. 
619 */ 620 young = kvm_age_hva(kvm, start, end); 621 spin_unlock(&kvm->mmu_lock); 622 srcu_read_unlock(&kvm->srcu, idx); 623 624 return young; 625} 626 627static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 628 struct mm_struct *mm, 629 unsigned long address) 630{ 631 struct kvm *kvm = mmu_notifier_to_kvm(mn); 632 int young, idx; 633 634 idx = srcu_read_lock(&kvm->srcu); 635 spin_lock(&kvm->mmu_lock); 636 young = kvm_test_age_hva(kvm, address); 637 spin_unlock(&kvm->mmu_lock); 638 srcu_read_unlock(&kvm->srcu, idx); 639 640 return young; 641} 642 643static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 644 struct mm_struct *mm) 645{ 646 struct kvm *kvm = mmu_notifier_to_kvm(mn); 647 int idx; 648 649 idx = srcu_read_lock(&kvm->srcu); 650 kvm_flush_shadow_all(kvm); 651 srcu_read_unlock(&kvm->srcu, idx); 652} 653 654static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 655 .invalidate_range = kvm_mmu_notifier_invalidate_range, 656 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 657 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 658 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 659 .clear_young = kvm_mmu_notifier_clear_young, 660 .test_young = kvm_mmu_notifier_test_young, 661 .change_pte = kvm_mmu_notifier_change_pte, 662 .release = kvm_mmu_notifier_release, 663}; 664 665static int kvm_init_mmu_notifier(struct kvm *kvm) 666{ 667 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 668 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 669} 670 671#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 672 673static int kvm_init_mmu_notifier(struct kvm *kvm) 674{ 675 return 0; 676} 677 678#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 679 680static struct kvm_memslots *kvm_alloc_memslots(void) 681{ 682 int i; 683 struct kvm_memslots *slots; 684 685 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 686 if (!slots) 687 return NULL; 688 689 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 690 slots->id_to_index[i] = -1; 691 692 return slots; 693} 694 695static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 696{ 697 if (!memslot->dirty_bitmap) 698 return; 699 700 kvfree(memslot->dirty_bitmap); 701 memslot->dirty_bitmap = NULL; 702} 703 704static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 705{ 706 kvm_destroy_dirty_bitmap(slot); 707 708 kvm_arch_free_memslot(kvm, slot); 709 710 slot->flags = 0; 711 slot->npages = 0; 712} 713 714static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 715{ 716 struct kvm_memory_slot *memslot; 717 718 if (!slots) 719 return; 720 721 kvm_for_each_memslot(memslot, slots) 722 kvm_free_memslot(kvm, memslot); 723 724 kvfree(slots); 725} 726 727static void kvm_destroy_vm_debugfs(struct kvm *kvm) 728{ 729 int i; 730 731 if (!kvm->debugfs_dentry) 732 return; 733 734 debugfs_remove_recursive(kvm->debugfs_dentry); 735 736 if (kvm->debugfs_stat_data) { 737 for (i = 0; i < kvm_debugfs_num_entries; i++) 738 kfree(kvm->debugfs_stat_data[i]); 739 kfree(kvm->debugfs_stat_data); 740 } 741} 742 743static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 744{ 745 static DEFINE_MUTEX(kvm_debugfs_lock); 746 struct dentry *dent; 747 char dir_name[ITOA_MAX_LEN * 2]; 748 struct kvm_stat_data *stat_data; 749 struct kvm_stats_debugfs_item *p; 750 751 if (!debugfs_initialized()) 752 return 0; 753 754 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 755 mutex_lock(&kvm_debugfs_lock); 756 dent = 
debugfs_lookup(dir_name, kvm_debugfs_dir); 757 if (dent) { 758 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name); 759 dput(dent); 760 mutex_unlock(&kvm_debugfs_lock); 761 return 0; 762 } 763 dent = debugfs_create_dir(dir_name, kvm_debugfs_dir); 764 mutex_unlock(&kvm_debugfs_lock); 765 if (IS_ERR(dent)) 766 return 0; 767 768 kvm->debugfs_dentry = dent; 769 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 770 sizeof(*kvm->debugfs_stat_data), 771 GFP_KERNEL_ACCOUNT); 772 if (!kvm->debugfs_stat_data) 773 return -ENOMEM; 774 775 for (p = debugfs_entries; p->name; p++) { 776 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); 777 if (!stat_data) 778 return -ENOMEM; 779 780 stat_data->kvm = kvm; 781 stat_data->dbgfs_item = p; 782 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 783 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), 784 kvm->debugfs_dentry, stat_data, 785 &stat_fops_per_vm); 786 } 787 return 0; 788} 789 790/* 791 * Called after the VM is otherwise initialized, but just before adding it to 792 * the vm_list. 793 */ 794int __weak kvm_arch_post_init_vm(struct kvm *kvm) 795{ 796 return 0; 797} 798 799/* 800 * Called just after removing the VM from the vm_list, but before doing any 801 * other destruction. 802 */ 803void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) 804{ 805} 806 807static struct kvm *kvm_create_vm(unsigned long type) 808{ 809 struct kvm *kvm = kvm_arch_alloc_vm(); 810 int r = -ENOMEM; 811 int i; 812 813 if (!kvm) 814 return ERR_PTR(-ENOMEM); 815 816 spin_lock_init(&kvm->mmu_lock); 817 mmgrab(current->mm); 818 kvm->mm = current->mm; 819 kvm_eventfd_init(kvm); 820 mutex_init(&kvm->lock); 821 mutex_init(&kvm->irq_lock); 822 mutex_init(&kvm->slots_lock); 823 INIT_LIST_HEAD(&kvm->devices); 824 825 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 826 827 if (init_srcu_struct(&kvm->srcu)) 828 goto out_err_no_srcu; 829 if (init_srcu_struct(&kvm->irq_srcu)) 830 goto out_err_no_irq_srcu; 831 832 refcount_set(&kvm->users_count, 1); 833 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 834 struct kvm_memslots *slots = kvm_alloc_memslots(); 835 836 if (!slots) 837 goto out_err_no_arch_destroy_vm; 838 /* Generations must be different for each address space. */ 839 slots->generation = i; 840 rcu_assign_pointer(kvm->memslots[i], slots); 841 } 842 843 for (i = 0; i < KVM_NR_BUSES; i++) { 844 rcu_assign_pointer(kvm->buses[i], 845 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); 846 if (!kvm->buses[i]) 847 goto out_err_no_arch_destroy_vm; 848 } 849 850 kvm->max_halt_poll_ns = halt_poll_ns; 851 852 r = kvm_arch_init_vm(kvm, type); 853 if (r) 854 goto out_err_no_arch_destroy_vm; 855 856 r = hardware_enable_all(); 857 if (r) 858 goto out_err_no_disable; 859 860#ifdef CONFIG_HAVE_KVM_IRQFD 861 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 862#endif 863 864 r = kvm_init_mmu_notifier(kvm); 865 if (r) 866 goto out_err_no_mmu_notifier; 867 868 r = kvm_arch_post_init_vm(kvm); 869 if (r) 870 goto out_err; 871 872 mutex_lock(&kvm_lock); 873 list_add(&kvm->vm_list, &vm_list); 874 mutex_unlock(&kvm_lock); 875 876 preempt_notifier_inc(); 877 878 /* 879 * When the fd passed to this ioctl() is opened it pins the module, 880 * but try_module_get() also prevents getting a reference if the module 881 * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait"). 
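 * The reference taken here is dropped by the module_put() at the end of
 * kvm_destroy_vm().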
882 */ 883 if (!try_module_get(kvm_chardev_ops.owner)) { 884 r = -ENODEV; 885 goto out_err; 886 } 887 888 return kvm; 889 890out_err: 891#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 892 if (kvm->mmu_notifier.ops) 893 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); 894#endif 895out_err_no_mmu_notifier: 896 hardware_disable_all(); 897out_err_no_disable: 898 kvm_arch_destroy_vm(kvm); 899out_err_no_arch_destroy_vm: 900 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); 901 for (i = 0; i < KVM_NR_BUSES; i++) 902 kfree(kvm_get_bus(kvm, i)); 903 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 904 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 905 cleanup_srcu_struct(&kvm->irq_srcu); 906out_err_no_irq_srcu: 907 cleanup_srcu_struct(&kvm->srcu); 908out_err_no_srcu: 909 kvm_arch_free_vm(kvm); 910 mmdrop(current->mm); 911 return ERR_PTR(r); 912} 913 914static void kvm_destroy_devices(struct kvm *kvm) 915{ 916 struct kvm_device *dev, *tmp; 917 918 /* 919 * We do not need to take the kvm->lock here, because nobody else 920 * has a reference to the struct kvm at this point and therefore 921 * cannot access the devices list anyhow. 922 */ 923 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { 924 list_del(&dev->vm_node); 925 dev->ops->destroy(dev); 926 } 927} 928 929static void kvm_destroy_vm(struct kvm *kvm) 930{ 931 int i; 932 struct mm_struct *mm = kvm->mm; 933 934 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 935 kvm_destroy_vm_debugfs(kvm); 936 kvm_arch_sync_events(kvm); 937 mutex_lock(&kvm_lock); 938 list_del(&kvm->vm_list); 939 mutex_unlock(&kvm_lock); 940 kvm_arch_pre_destroy_vm(kvm); 941 942 kvm_free_irq_routing(kvm); 943 for (i = 0; i < KVM_NR_BUSES; i++) { 944 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 945 946 if (bus) 947 kvm_io_bus_destroy(bus); 948 kvm->buses[i] = NULL; 949 } 950 kvm_coalesced_mmio_free(kvm); 951#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 952 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 953#else 954 kvm_flush_shadow_all(kvm); 955#endif 956 kvm_arch_destroy_vm(kvm); 957 kvm_destroy_devices(kvm); 958 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 959 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 960 cleanup_srcu_struct(&kvm->irq_srcu); 961 cleanup_srcu_struct(&kvm->srcu); 962 kvm_arch_free_vm(kvm); 963 preempt_notifier_dec(); 964 hardware_disable_all(); 965 mmdrop(mm); 966 module_put(kvm_chardev_ops.owner); 967} 968 969void kvm_get_kvm(struct kvm *kvm) 970{ 971 refcount_inc(&kvm->users_count); 972} 973EXPORT_SYMBOL_GPL(kvm_get_kvm); 974 975void kvm_put_kvm(struct kvm *kvm) 976{ 977 if (refcount_dec_and_test(&kvm->users_count)) 978 kvm_destroy_vm(kvm); 979} 980EXPORT_SYMBOL_GPL(kvm_put_kvm); 981 982/* 983 * Used to put a reference that was taken on behalf of an object associated 984 * with a user-visible file descriptor, e.g. a vcpu or device, if installation 985 * of the new file descriptor fails and the reference cannot be transferred to 986 * its final owner. In such cases, the caller is still actively using @kvm and 987 * will fail miserably if the refcount unexpectedly hits zero. 
988 */ 989void kvm_put_kvm_no_destroy(struct kvm *kvm) 990{ 991 WARN_ON(refcount_dec_and_test(&kvm->users_count)); 992} 993EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); 994 995static int kvm_vm_release(struct inode *inode, struct file *filp) 996{ 997 struct kvm *kvm = filp->private_data; 998 999 kvm_irqfd_release(kvm); 1000 1001 kvm_put_kvm(kvm); 1002 return 0; 1003} 1004 1005/* 1006 * Allocation size is twice as large as the actual dirty bitmap size. 1007 * See kvm_vm_ioctl_get_dirty_log() why this is needed. 1008 */ 1009static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) 1010{ 1011 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); 1012 1013 memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT); 1014 if (!memslot->dirty_bitmap) 1015 return -ENOMEM; 1016 1017 return 0; 1018} 1019 1020/* 1021 * Delete a memslot by decrementing the number of used slots and shifting all 1022 * other entries in the array forward one spot. 1023 */ 1024static inline void kvm_memslot_delete(struct kvm_memslots *slots, 1025 struct kvm_memory_slot *memslot) 1026{ 1027 struct kvm_memory_slot *mslots = slots->memslots; 1028 int i; 1029 1030 if (WARN_ON(slots->id_to_index[memslot->id] == -1)) 1031 return; 1032 1033 slots->used_slots--; 1034 1035 if (atomic_read(&slots->lru_slot) >= slots->used_slots) 1036 atomic_set(&slots->lru_slot, 0); 1037 1038 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { 1039 mslots[i] = mslots[i + 1]; 1040 slots->id_to_index[mslots[i].id] = i; 1041 } 1042 mslots[i] = *memslot; 1043 slots->id_to_index[memslot->id] = -1; 1044} 1045 1046/* 1047 * "Insert" a new memslot by incrementing the number of used slots. Returns 1048 * the new slot's initial index into the memslots array. 1049 */ 1050static inline int kvm_memslot_insert_back(struct kvm_memslots *slots) 1051{ 1052 return slots->used_slots++; 1053} 1054 1055/* 1056 * Move a changed memslot backwards in the array by shifting existing slots 1057 * with a higher GFN toward the front of the array. Note, the changed memslot 1058 * itself is not preserved in the array, i.e. not swapped at this time, only 1059 * its new index into the array is tracked. Returns the changed memslot's 1060 * current index into the memslots array. 1061 */ 1062static inline int kvm_memslot_move_backward(struct kvm_memslots *slots, 1063 struct kvm_memory_slot *memslot) 1064{ 1065 struct kvm_memory_slot *mslots = slots->memslots; 1066 int i; 1067 1068 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) || 1069 WARN_ON_ONCE(!slots->used_slots)) 1070 return -1; 1071 1072 /* 1073 * Move the target memslot backward in the array by shifting existing 1074 * memslots with a higher GFN (than the target memslot) towards the 1075 * front of the array. 1076 */ 1077 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) { 1078 if (memslot->base_gfn > mslots[i + 1].base_gfn) 1079 break; 1080 1081 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn); 1082 1083 /* Shift the next memslot forward one and update its index. */ 1084 mslots[i] = mslots[i + 1]; 1085 slots->id_to_index[mslots[i].id] = i; 1086 } 1087 return i; 1088} 1089 1090/* 1091 * Move a changed memslot forwards in the array by shifting existing slots with 1092 * a lower GFN toward the back of the array. Note, the changed memslot itself 1093 * is not preserved in the array, i.e. not swapped at this time, only its new 1094 * index into the array is tracked. Returns the changed memslot's final index 1095 * into the memslots array. 
 */
static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
                                           struct kvm_memory_slot *memslot,
                                           int start)
{
        struct kvm_memory_slot *mslots = slots->memslots;
        int i;

        for (i = start; i > 0; i--) {
                if (memslot->base_gfn < mslots[i - 1].base_gfn)
                        break;

                WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);

                /* Shift the next memslot back one and update its index. */
                mslots[i] = mslots[i - 1];
                slots->id_to_index[mslots[i].id] = i;
        }
        return i;
}

/*
 * Re-sort memslots based on their GFN to account for an added, deleted, or
 * moved memslot.  Sorting memslots by GFN allows using a binary search during
 * memslot lookup.
 *
 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
 * at memslots[0] has the highest GFN.
 *
 * The sorting algorithm takes advantage of having initially sorted memslots
 * and knowing the position of the changed memslot.  Sorting is also optimized
 * by not swapping the updated memslot and instead only shifting other memslots
 * and tracking the new index for the updated memslot.  Only once its final
 * index is known is the updated memslot copied into its position in the array.
 *
 *  - When deleting a memslot, the deleted memslot simply needs to be moved to
 *    the end of the array.
 *
 *  - When creating a memslot, the algorithm "inserts" the new memslot at the
 *    end of the array and then moves it forward to its correct location.
 *
 *  - When moving a memslot, the algorithm first moves the updated memslot
 *    backward to handle the scenario where the memslot's GFN was changed to a
 *    lower value.  update_memslots() then falls through and runs the same flow
 *    as creating a memslot to move the memslot forward to handle the scenario
 *    where its GFN was changed to a higher value.
 *
 * Note, slots are sorted from highest->lowest instead of lowest->highest for
 * historical reasons.  Originally, invalid memslots were denoted by having
 * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
 * to the end of the array.  The current algorithm uses dedicated logic to
 * delete a memslot and thus does not rely on invalid memslots having GFN=0.
 *
 * The other historical motivation for highest->lowest was to improve the
 * performance of memslot lookup.  KVM originally used a linear search starting
 * at memslots[0].  On x86, the largest memslot usually has one of the highest,
 * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
 * single memslot above the 4gb boundary.  As the largest memslot is also the
 * most likely to be referenced, sorting it to the front of the array was
 * advantageous.  The current binary search starts from the middle of the array
 * and uses an LRU pointer to improve performance for all memslots and GFNs.
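 *
 * For illustration only (made-up GFNs): with three slots whose base GFNs are
 * 0x100000, 0x1000 and 0x0, memslots[0..2] hold them in exactly that order,
 * highest first.  Creating a fourth slot with base GFN 0x2000 "inserts" it at
 * index 3, and kvm_memslot_move_forward() then shifts it up until it lands at
 * index 1, between 0x100000 and 0x1000.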
1157 */ 1158static void update_memslots(struct kvm_memslots *slots, 1159 struct kvm_memory_slot *memslot, 1160 enum kvm_mr_change change) 1161{ 1162 int i; 1163 1164 if (change == KVM_MR_DELETE) { 1165 kvm_memslot_delete(slots, memslot); 1166 } else { 1167 if (change == KVM_MR_CREATE) 1168 i = kvm_memslot_insert_back(slots); 1169 else 1170 i = kvm_memslot_move_backward(slots, memslot); 1171 i = kvm_memslot_move_forward(slots, memslot, i); 1172 1173 /* 1174 * Copy the memslot to its new position in memslots and update 1175 * its index accordingly. 1176 */ 1177 slots->memslots[i] = *memslot; 1178 slots->id_to_index[memslot->id] = i; 1179 } 1180} 1181 1182static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 1183{ 1184 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 1185 1186#ifdef __KVM_HAVE_READONLY_MEM 1187 valid_flags |= KVM_MEM_READONLY; 1188#endif 1189 1190 if (mem->flags & ~valid_flags) 1191 return -EINVAL; 1192 1193 return 0; 1194} 1195 1196static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 1197 int as_id, struct kvm_memslots *slots) 1198{ 1199 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 1200 u64 gen = old_memslots->generation; 1201 1202 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); 1203 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 1204 1205 rcu_assign_pointer(kvm->memslots[as_id], slots); 1206 synchronize_srcu_expedited(&kvm->srcu); 1207 1208 /* 1209 * Increment the new memslot generation a second time, dropping the 1210 * update in-progress flag and incrementing the generation based on 1211 * the number of address spaces. This provides a unique and easily 1212 * identifiable generation number while the memslots are in flux. 1213 */ 1214 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 1215 1216 /* 1217 * Generations must be unique even across address spaces. We do not need 1218 * a global counter for that, instead the generation space is evenly split 1219 * across address spaces. For example, with two address spaces, address 1220 * space 0 will use generations 0, 2, 4, ... while address space 1 will 1221 * use generations 1, 3, 5, ... 1222 */ 1223 gen += KVM_ADDRESS_SPACE_NUM; 1224 1225 kvm_arch_memslots_updated(kvm, gen); 1226 1227 slots->generation = gen; 1228 1229 return old_memslots; 1230} 1231 1232/* 1233 * Note, at a minimum, the current number of used slots must be allocated, even 1234 * when deleting a memslot, as we need a complete duplicate of the memslots for 1235 * use when invalidating a memslot prior to deleting/moving the memslot. 
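 *
 * For illustration only (made-up count): with used_slots = 3, old_size is
 * sizeof(struct kvm_memslots) plus room for three struct kvm_memory_slot
 * entries; a KVM_MR_CREATE allocates that plus space for exactly one more
 * slot, any other change duplicates the old size as-is.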
1236 */ 1237static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old, 1238 enum kvm_mr_change change) 1239{ 1240 struct kvm_memslots *slots; 1241 size_t old_size, new_size; 1242 1243 old_size = sizeof(struct kvm_memslots) + 1244 (sizeof(struct kvm_memory_slot) * old->used_slots); 1245 1246 if (change == KVM_MR_CREATE) 1247 new_size = old_size + sizeof(struct kvm_memory_slot); 1248 else 1249 new_size = old_size; 1250 1251 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT); 1252 if (likely(slots)) 1253 memcpy(slots, old, old_size); 1254 1255 return slots; 1256} 1257 1258static int kvm_set_memslot(struct kvm *kvm, 1259 const struct kvm_userspace_memory_region *mem, 1260 struct kvm_memory_slot *old, 1261 struct kvm_memory_slot *new, int as_id, 1262 enum kvm_mr_change change) 1263{ 1264 struct kvm_memory_slot *slot; 1265 struct kvm_memslots *slots; 1266 int r; 1267 1268 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change); 1269 if (!slots) 1270 return -ENOMEM; 1271 1272 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { 1273 /* 1274 * Note, the INVALID flag needs to be in the appropriate entry 1275 * in the freshly allocated memslots, not in @old or @new. 1276 */ 1277 slot = id_to_memslot(slots, old->id); 1278 slot->flags |= KVM_MEMSLOT_INVALID; 1279 1280 /* 1281 * We can re-use the old memslots, the only difference from the 1282 * newly installed memslots is the invalid flag, which will get 1283 * dropped by update_memslots anyway. We'll also revert to the 1284 * old memslots if preparing the new memory region fails. 1285 */ 1286 slots = install_new_memslots(kvm, as_id, slots); 1287 1288 /* From this point no new shadow pages pointing to a deleted, 1289 * or moved, memslot will be created. 1290 * 1291 * validation of sp->gfn happens in: 1292 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1293 * - kvm_is_visible_gfn (mmu_check_root) 1294 */ 1295 kvm_arch_flush_shadow_memslot(kvm, slot); 1296 kvm_arch_guest_memory_reclaimed(kvm); 1297 } 1298 1299 r = kvm_arch_prepare_memory_region(kvm, new, mem, change); 1300 if (r) 1301 goto out_slots; 1302 1303 update_memslots(slots, new, change); 1304 slots = install_new_memslots(kvm, as_id, slots); 1305 1306 kvm_arch_commit_memory_region(kvm, mem, old, new, change); 1307 1308 kvfree(slots); 1309 return 0; 1310 1311out_slots: 1312 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) 1313 slots = install_new_memslots(kvm, as_id, slots); 1314 kvfree(slots); 1315 return r; 1316} 1317 1318static int kvm_delete_memslot(struct kvm *kvm, 1319 const struct kvm_userspace_memory_region *mem, 1320 struct kvm_memory_slot *old, int as_id) 1321{ 1322 struct kvm_memory_slot new; 1323 int r; 1324 1325 if (!old->npages) 1326 return -EINVAL; 1327 1328 memset(&new, 0, sizeof(new)); 1329 new.id = old->id; 1330 /* 1331 * This is only for debugging purpose; it should never be referenced 1332 * for a removed memslot. 1333 */ 1334 new.as_id = as_id; 1335 1336 r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE); 1337 if (r) 1338 return r; 1339 1340 kvm_free_memslot(kvm, old); 1341 return 0; 1342} 1343 1344/* 1345 * Allocate some memory and give it an address in the guest physical address 1346 * space. 1347 * 1348 * Discontiguous memory is allowed, mostly for framebuffers. 1349 * 1350 * Must be called holding kvm->slots_lock for write. 
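 *
 * For illustration only, a hypothetical in-kernel caller (the real entry
 * points are the KVM_SET_USER_MEMORY_REGION ioctl and arch code) would fill
 * a struct kvm_userspace_memory_region and use the locked wrapper below:
 *
 *      struct kvm_userspace_memory_region m = {
 *              .slot = 0,
 *              .flags = KVM_MEM_LOG_DIRTY_PAGES,
 *              .guest_phys_addr = 0x100000,
 *              .memory_size = 0x200000,
 *              .userspace_addr = page_aligned_hva,     // placeholder value
 *      };
 *      r = kvm_set_memory_region(kvm, &m);     // takes kvm->slots_lock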
1351 */ 1352int __kvm_set_memory_region(struct kvm *kvm, 1353 const struct kvm_userspace_memory_region *mem) 1354{ 1355 struct kvm_memory_slot old, new; 1356 struct kvm_memory_slot *tmp; 1357 enum kvm_mr_change change; 1358 int as_id, id; 1359 int r; 1360 1361 r = check_memory_region_flags(mem); 1362 if (r) 1363 return r; 1364 1365 as_id = mem->slot >> 16; 1366 id = (u16)mem->slot; 1367 1368 /* General sanity checks */ 1369 if ((mem->memory_size & (PAGE_SIZE - 1)) || 1370 (mem->memory_size != (unsigned long)mem->memory_size)) 1371 return -EINVAL; 1372 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1373 return -EINVAL; 1374 /* We can read the guest memory with __xxx_user() later on. */ 1375 if ((mem->userspace_addr & (PAGE_SIZE - 1)) || 1376 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) || 1377 !access_ok((void __user *)(unsigned long)mem->userspace_addr, 1378 mem->memory_size)) 1379 return -EINVAL; 1380 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 1381 return -EINVAL; 1382 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1383 return -EINVAL; 1384 1385 /* 1386 * Make a full copy of the old memslot, the pointer will become stale 1387 * when the memslots are re-sorted by update_memslots(), and the old 1388 * memslot needs to be referenced after calling update_memslots(), e.g. 1389 * to free its resources and for arch specific behavior. 1390 */ 1391 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id); 1392 if (tmp) { 1393 old = *tmp; 1394 tmp = NULL; 1395 } else { 1396 memset(&old, 0, sizeof(old)); 1397 old.id = id; 1398 } 1399 1400 if (!mem->memory_size) 1401 return kvm_delete_memslot(kvm, mem, &old, as_id); 1402 1403 new.as_id = as_id; 1404 new.id = id; 1405 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1406 new.npages = mem->memory_size >> PAGE_SHIFT; 1407 new.flags = mem->flags; 1408 new.userspace_addr = mem->userspace_addr; 1409 1410 if (new.npages > KVM_MEM_MAX_NR_PAGES) 1411 return -EINVAL; 1412 1413 if (!old.npages) { 1414 change = KVM_MR_CREATE; 1415 new.dirty_bitmap = NULL; 1416 memset(&new.arch, 0, sizeof(new.arch)); 1417 } else { /* Modify an existing slot. */ 1418 if ((new.userspace_addr != old.userspace_addr) || 1419 (new.npages != old.npages) || 1420 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 1421 return -EINVAL; 1422 1423 if (new.base_gfn != old.base_gfn) 1424 change = KVM_MR_MOVE; 1425 else if (new.flags != old.flags) 1426 change = KVM_MR_FLAGS_ONLY; 1427 else /* Nothing to change. */ 1428 return 0; 1429 1430 /* Copy dirty_bitmap and arch from the current memslot. 
*/ 1431 new.dirty_bitmap = old.dirty_bitmap; 1432 memcpy(&new.arch, &old.arch, sizeof(new.arch)); 1433 } 1434 1435 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 1436 /* Check for overlaps */ 1437 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) { 1438 if (tmp->id == id) 1439 continue; 1440 if (!((new.base_gfn + new.npages <= tmp->base_gfn) || 1441 (new.base_gfn >= tmp->base_gfn + tmp->npages))) 1442 return -EEXIST; 1443 } 1444 } 1445 1446 /* Allocate/free page dirty bitmap as needed */ 1447 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1448 new.dirty_bitmap = NULL; 1449 else if (!new.dirty_bitmap) { 1450 r = kvm_alloc_dirty_bitmap(&new); 1451 if (r) 1452 return r; 1453 1454 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1455 bitmap_set(new.dirty_bitmap, 0, new.npages); 1456 } 1457 1458 r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change); 1459 if (r) 1460 goto out_bitmap; 1461 1462 if (old.dirty_bitmap && !new.dirty_bitmap) 1463 kvm_destroy_dirty_bitmap(&old); 1464 return 0; 1465 1466out_bitmap: 1467 if (new.dirty_bitmap && !old.dirty_bitmap) 1468 kvm_destroy_dirty_bitmap(&new); 1469 return r; 1470} 1471EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1472 1473int kvm_set_memory_region(struct kvm *kvm, 1474 const struct kvm_userspace_memory_region *mem) 1475{ 1476 int r; 1477 1478 mutex_lock(&kvm->slots_lock); 1479 r = __kvm_set_memory_region(kvm, mem); 1480 mutex_unlock(&kvm->slots_lock); 1481 return r; 1482} 1483EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1484 1485static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1486 struct kvm_userspace_memory_region *mem) 1487{ 1488 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1489 return -EINVAL; 1490 1491 return kvm_set_memory_region(kvm, mem); 1492} 1493 1494#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1495/** 1496 * kvm_get_dirty_log - get a snapshot of dirty pages 1497 * @kvm: pointer to kvm instance 1498 * @log: slot id and address to which we copy the log 1499 * @is_dirty: set to '1' if any dirty pages were found 1500 * @memslot: set to the associated memslot, always valid on success 1501 */ 1502int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, 1503 int *is_dirty, struct kvm_memory_slot **memslot) 1504{ 1505 struct kvm_memslots *slots; 1506 int i, as_id, id; 1507 unsigned long n; 1508 unsigned long any = 0; 1509 1510 *memslot = NULL; 1511 *is_dirty = 0; 1512 1513 as_id = log->slot >> 16; 1514 id = (u16)log->slot; 1515 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1516 return -EINVAL; 1517 1518 slots = __kvm_memslots(kvm, as_id); 1519 *memslot = id_to_memslot(slots, id); 1520 if (!(*memslot) || !(*memslot)->dirty_bitmap) 1521 return -ENOENT; 1522 1523 kvm_arch_sync_dirty_log(kvm, *memslot); 1524 1525 n = kvm_dirty_bitmap_bytes(*memslot); 1526 1527 for (i = 0; !any && i < n/sizeof(long); ++i) 1528 any = (*memslot)->dirty_bitmap[i]; 1529 1530 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n)) 1531 return -EFAULT; 1532 1533 if (any) 1534 *is_dirty = 1; 1535 return 0; 1536} 1537EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1538 1539#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ 1540/** 1541 * kvm_get_dirty_log_protect - get a snapshot of dirty pages 1542 * and reenable dirty page tracking for the corresponding pages. 1543 * @kvm: pointer to kvm instance 1544 * @log: slot id and address to which we copy the log 1545 * 1546 * We need to keep it in mind that VCPU threads can write to the bitmap 1547 * concurrently. 
So, to avoid losing track of dirty pages we keep the 1548 * following order: 1549 * 1550 * 1. Take a snapshot of the bit and clear it if needed. 1551 * 2. Write protect the corresponding page. 1552 * 3. Copy the snapshot to the userspace. 1553 * 4. Upon return caller flushes TLB's if needed. 1554 * 1555 * Between 2 and 4, the guest may write to the page using the remaining TLB 1556 * entry. This is not a problem because the page is reported dirty using 1557 * the snapshot taken before and step 4 ensures that writes done after 1558 * exiting to userspace will be logged for the next call. 1559 * 1560 */ 1561static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) 1562{ 1563 struct kvm_memslots *slots; 1564 struct kvm_memory_slot *memslot; 1565 int i, as_id, id; 1566 unsigned long n; 1567 unsigned long *dirty_bitmap; 1568 unsigned long *dirty_bitmap_buffer; 1569 bool flush; 1570 1571 as_id = log->slot >> 16; 1572 id = (u16)log->slot; 1573 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1574 return -EINVAL; 1575 1576 slots = __kvm_memslots(kvm, as_id); 1577 memslot = id_to_memslot(slots, id); 1578 if (!memslot || !memslot->dirty_bitmap) 1579 return -ENOENT; 1580 1581 dirty_bitmap = memslot->dirty_bitmap; 1582 1583 kvm_arch_sync_dirty_log(kvm, memslot); 1584 1585 n = kvm_dirty_bitmap_bytes(memslot); 1586 flush = false; 1587 if (kvm->manual_dirty_log_protect) { 1588 /* 1589 * Unlike kvm_get_dirty_log, we always return false in *flush, 1590 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There 1591 * is some code duplication between this function and 1592 * kvm_get_dirty_log, but hopefully all architecture 1593 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log 1594 * can be eliminated. 1595 */ 1596 dirty_bitmap_buffer = dirty_bitmap; 1597 } else { 1598 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1599 memset(dirty_bitmap_buffer, 0, n); 1600 1601 spin_lock(&kvm->mmu_lock); 1602 for (i = 0; i < n / sizeof(long); i++) { 1603 unsigned long mask; 1604 gfn_t offset; 1605 1606 if (!dirty_bitmap[i]) 1607 continue; 1608 1609 flush = true; 1610 mask = xchg(&dirty_bitmap[i], 0); 1611 dirty_bitmap_buffer[i] = mask; 1612 1613 offset = i * BITS_PER_LONG; 1614 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1615 offset, mask); 1616 } 1617 spin_unlock(&kvm->mmu_lock); 1618 } 1619 1620 if (flush) 1621 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 1622 1623 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1624 return -EFAULT; 1625 return 0; 1626} 1627 1628 1629/** 1630 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot 1631 * @kvm: kvm instance 1632 * @log: slot id and address to which we copy the log 1633 * 1634 * Steps 1-4 below provide general overview of dirty page logging. See 1635 * kvm_get_dirty_log_protect() function description for additional details. 1636 * 1637 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we 1638 * always flush the TLB (step 4) even if previous step failed and the dirty 1639 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API 1640 * does not preclude user space subsequent dirty log read. Flushing TLB ensures 1641 * writes will be marked dirty for next log read. 1642 * 1643 * 1. Take a snapshot of the bit and clear it if needed. 1644 * 2. Write protect the corresponding page. 1645 * 3. Copy the snapshot to the userspace. 1646 * 4. Flush TLB's if needed. 
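 *
 * Note that when manual dirty log protection is enabled,
 * kvm_get_dirty_log_protect() above defers steps 1-2; the clearing and write
 * protection are instead performed by the KVM_CLEAR_DIRTY_LOG path below.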
1647 */ 1648static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1649 struct kvm_dirty_log *log) 1650{ 1651 int r; 1652 1653 mutex_lock(&kvm->slots_lock); 1654 1655 r = kvm_get_dirty_log_protect(kvm, log); 1656 1657 mutex_unlock(&kvm->slots_lock); 1658 return r; 1659} 1660 1661/** 1662 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap 1663 * and reenable dirty page tracking for the corresponding pages. 1664 * @kvm: pointer to kvm instance 1665 * @log: slot id and address from which to fetch the bitmap of dirty pages 1666 */ 1667static int kvm_clear_dirty_log_protect(struct kvm *kvm, 1668 struct kvm_clear_dirty_log *log) 1669{ 1670 struct kvm_memslots *slots; 1671 struct kvm_memory_slot *memslot; 1672 int as_id, id; 1673 gfn_t offset; 1674 unsigned long i, n; 1675 unsigned long *dirty_bitmap; 1676 unsigned long *dirty_bitmap_buffer; 1677 bool flush; 1678 1679 as_id = log->slot >> 16; 1680 id = (u16)log->slot; 1681 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1682 return -EINVAL; 1683 1684 if (log->first_page & 63) 1685 return -EINVAL; 1686 1687 slots = __kvm_memslots(kvm, as_id); 1688 memslot = id_to_memslot(slots, id); 1689 if (!memslot || !memslot->dirty_bitmap) 1690 return -ENOENT; 1691 1692 dirty_bitmap = memslot->dirty_bitmap; 1693 1694 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; 1695 1696 if (log->first_page > memslot->npages || 1697 log->num_pages > memslot->npages - log->first_page || 1698 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) 1699 return -EINVAL; 1700 1701 kvm_arch_sync_dirty_log(kvm, memslot); 1702 1703 flush = false; 1704 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1705 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) 1706 return -EFAULT; 1707 1708 spin_lock(&kvm->mmu_lock); 1709 for (offset = log->first_page, i = offset / BITS_PER_LONG, 1710 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; 1711 i++, offset += BITS_PER_LONG) { 1712 unsigned long mask = *dirty_bitmap_buffer++; 1713 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; 1714 if (!mask) 1715 continue; 1716 1717 mask &= atomic_long_fetch_andnot(mask, p); 1718 1719 /* 1720 * mask contains the bits that really have been cleared. This 1721 * never includes any bits beyond the length of the memslot (if 1722 * the length is not aligned to 64 pages), therefore it is not 1723 * a problem if userspace sets them in log->dirty_bitmap. 
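 *
 * For illustration (made-up values): if userspace asks to clear 0b1110 but
 * only 0b0110 is currently set in the memslot bitmap,
 * atomic_long_fetch_andnot() clears those bits and returns the old word, so
 * mask becomes 0b0110 and only those two pages are write-protected again
 * below.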
1724 */ 1725 if (mask) { 1726 flush = true; 1727 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1728 offset, mask); 1729 } 1730 } 1731 spin_unlock(&kvm->mmu_lock); 1732 1733 if (flush) 1734 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 1735 1736 return 0; 1737} 1738 1739static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, 1740 struct kvm_clear_dirty_log *log) 1741{ 1742 int r; 1743 1744 mutex_lock(&kvm->slots_lock); 1745 1746 r = kvm_clear_dirty_log_protect(kvm, log); 1747 1748 mutex_unlock(&kvm->slots_lock); 1749 return r; 1750} 1751#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ 1752 1753struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1754{ 1755 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 1756} 1757EXPORT_SYMBOL_GPL(gfn_to_memslot); 1758 1759struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) 1760{ 1761 return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); 1762} 1763 1764bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1765{ 1766 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 1767 1768 return kvm_is_visible_memslot(memslot); 1769} 1770EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1771 1772bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1773{ 1774 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1775 1776 return kvm_is_visible_memslot(memslot); 1777} 1778EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); 1779 1780unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) 1781{ 1782 struct vm_area_struct *vma; 1783 unsigned long addr, size; 1784 1785 size = PAGE_SIZE; 1786 1787 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); 1788 if (kvm_is_error_hva(addr)) 1789 return PAGE_SIZE; 1790 1791 mmap_read_lock(current->mm); 1792 vma = find_vma(current->mm, addr); 1793 if (!vma) 1794 goto out; 1795 1796 size = vma_kernel_pagesize(vma); 1797 1798out: 1799 mmap_read_unlock(current->mm); 1800 1801 return size; 1802} 1803 1804static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1805{ 1806 return slot->flags & KVM_MEM_READONLY; 1807} 1808 1809static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1810 gfn_t *nr_pages, bool write) 1811{ 1812 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1813 return KVM_HVA_ERR_BAD; 1814 1815 if (memslot_is_readonly(slot) && write) 1816 return KVM_HVA_ERR_RO_BAD; 1817 1818 if (nr_pages) 1819 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1820 1821 return __gfn_to_hva_memslot(slot, gfn); 1822} 1823 1824static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1825 gfn_t *nr_pages) 1826{ 1827 return __gfn_to_hva_many(slot, gfn, nr_pages, true); 1828} 1829 1830unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 1831 gfn_t gfn) 1832{ 1833 return gfn_to_hva_many(slot, gfn, NULL); 1834} 1835EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 1836 1837unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1838{ 1839 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1840} 1841EXPORT_SYMBOL_GPL(gfn_to_hva); 1842 1843unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) 1844{ 1845 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); 1846} 1847EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); 1848 1849/* 1850 * Return the hva of a @gfn and the R/W attribute if possible. 
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: used to return the read/write attribute of the @slot if the hva
 * is valid and @writable is not NULL
 */
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
                                      gfn_t gfn, bool *writable)
{
        unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

        if (!kvm_is_error_hva(hva) && writable)
                *writable = !memslot_is_readonly(slot);

        return hva;
}

unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

        return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
{
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

        return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
        int rc, flags = FOLL_HWPOISON | FOLL_WRITE;

        rc = get_user_pages(addr, 1, flags, NULL, NULL);
        return rc == -EHWPOISON;
}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.  It's also the
 * only part that can run in atomic context.
 */
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
                            bool *writable, kvm_pfn_t *pfn)
{
        struct page *page[1];

        /*
         * Fast pin a writable pfn only if it is a write fault request
         * or the caller allows mapping a writable pfn for a read fault
         * request.
         */
        if (!(write_fault || writable))
                return false;

        if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
                *pfn = page_to_pfn(page[0]);

                if (writable)
                        *writable = true;
                return true;
        }

        return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address;
 * 1 indicates success, -errno is returned if an error is detected.
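 *
 * The page is pinned with get_user_pages_unlocked(): FOLL_HWPOISON turns a
 * poisoned page into an -EHWPOISON return, FOLL_WRITE is added for write
 * faults, and FOLL_NOWAIT is added when @async is non-NULL so the fault can
 * be completed asynchronously instead of blocking on I/O here.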
1922 */ 1923static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1924 bool *writable, kvm_pfn_t *pfn) 1925{ 1926 unsigned int flags = FOLL_HWPOISON; 1927 struct page *page; 1928 int npages = 0; 1929 1930 might_sleep(); 1931 1932 if (writable) 1933 *writable = write_fault; 1934 1935 if (write_fault) 1936 flags |= FOLL_WRITE; 1937 if (async) 1938 flags |= FOLL_NOWAIT; 1939 1940 npages = get_user_pages_unlocked(addr, 1, &page, flags); 1941 if (npages != 1) 1942 return npages; 1943 1944 /* map read fault as writable if possible */ 1945 if (unlikely(!write_fault) && writable) { 1946 struct page *wpage; 1947 1948 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { 1949 *writable = true; 1950 put_page(page); 1951 page = wpage; 1952 } 1953 } 1954 *pfn = page_to_pfn(page); 1955 return npages; 1956} 1957 1958static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1959{ 1960 if (unlikely(!(vma->vm_flags & VM_READ))) 1961 return false; 1962 1963 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1964 return false; 1965 1966 return true; 1967} 1968 1969static int kvm_try_get_pfn(kvm_pfn_t pfn) 1970{ 1971 if (kvm_is_reserved_pfn(pfn)) 1972 return 1; 1973 return get_page_unless_zero(pfn_to_page(pfn)); 1974} 1975 1976static int hva_to_pfn_remapped(struct vm_area_struct *vma, 1977 unsigned long addr, bool *async, 1978 bool write_fault, bool *writable, 1979 kvm_pfn_t *p_pfn) 1980{ 1981 kvm_pfn_t pfn; 1982 pte_t *ptep; 1983 spinlock_t *ptl; 1984 int r; 1985 1986 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); 1987 if (r) { 1988 /* 1989 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does 1990 * not call the fault handler, so do it here. 1991 */ 1992 bool unlocked = false; 1993 r = fixup_user_fault(current->mm, addr, 1994 (write_fault ? FAULT_FLAG_WRITE : 0), 1995 &unlocked); 1996 if (unlocked) 1997 return -EAGAIN; 1998 if (r) 1999 return r; 2000 2001 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); 2002 if (r) 2003 return r; 2004 } 2005 2006 if (write_fault && !pte_write(*ptep)) { 2007 pfn = KVM_PFN_ERR_RO_FAULT; 2008 goto out; 2009 } 2010 2011 if (writable) 2012 *writable = pte_write(*ptep); 2013 pfn = pte_pfn(*ptep); 2014 2015 /* 2016 * Get a reference here because callers of *hva_to_pfn* and 2017 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the 2018 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP 2019 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will 2020 * simply do nothing for reserved pfns. 2021 * 2022 * Whoever called remap_pfn_range is also going to call e.g. 2023 * unmap_mapping_range before the underlying pages are freed, 2024 * causing a call to our MMU notifier. 2025 * 2026 * Certain IO or PFNMAP mappings can be backed with valid 2027 * struct pages, but be allocated without refcounting e.g., 2028 * tail pages of non-compound higher order allocations, which 2029 * would then underflow the refcount when the caller does the 2030 * required put_page. Don't allow those pages here. 2031 */ 2032 if (!kvm_try_get_pfn(pfn)) 2033 r = -EFAULT; 2034 2035out: 2036 pte_unmap_unlock(ptep, ptl); 2037 *p_pfn = pfn; 2038 2039 return r; 2040} 2041 2042/* 2043 * Pin guest page in memory and return its pfn. 
 * @addr: host virtual address which maps memory to the guest
 * @atomic: if true, this function must not sleep and only the fast path
 *	    is attempted
 * @async: whether this function needs to wait for I/O to complete if the
 *	   host page is not in memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether mapping a writable host page is allowed for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable != NULL, in which case *@writable
 *     tells the caller whether the mapping is writable.
 */
static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
			    bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	kvm_pfn_t pfn = 0;
	int npages, r;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
		return pfn;

	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
	if (npages == 1)
		return pfn;

	mmap_read_lock(current->mm);
	if (npages == -EHWPOISON ||
	      (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

retry:
	vma = find_vma_intersection(current->mm, addr, addr + 1);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
		if (r == -EAGAIN)
			goto retry;
		if (r < 0)
			pfn = KVM_PFN_ERR_FAULT;
	} else {
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	mmap_read_unlock(current->mm);
	return pfn;
}

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
			       bool atomic, bool *async, bool write_fault,
			       bool *writable)
{
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

	if (addr == KVM_HVA_ERR_RO_BAD) {
		if (writable)
			*writable = false;
		return KVM_PFN_ERR_RO_FAULT;
	}

	if (kvm_is_error_hva(addr)) {
		if (writable)
			*writable = false;
		return KVM_PFN_NOSLOT;
	}

	/* Do not map a writable pfn into a read-only memslot.
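	 * Report *writable as false and pass a NULL @writable down so that
	 * hva_to_pfn() does not try to opportunistically map the page
	 * writable.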
*/ 2123 if (writable && memslot_is_readonly(slot)) { 2124 *writable = false; 2125 writable = NULL; 2126 } 2127 2128 return hva_to_pfn(addr, atomic, async, write_fault, 2129 writable); 2130} 2131EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 2132 2133kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 2134 bool *writable) 2135{ 2136 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 2137 write_fault, writable); 2138} 2139EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 2140 2141kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 2142{ 2143 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 2144} 2145EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 2146 2147kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 2148{ 2149 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 2150} 2151EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 2152 2153kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 2154{ 2155 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 2156} 2157EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 2158 2159kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 2160{ 2161 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 2162} 2163EXPORT_SYMBOL_GPL(gfn_to_pfn); 2164 2165kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 2166{ 2167 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 2168} 2169EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 2170 2171int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2172 struct page **pages, int nr_pages) 2173{ 2174 unsigned long addr; 2175 gfn_t entry = 0; 2176 2177 addr = gfn_to_hva_many(slot, gfn, &entry); 2178 if (kvm_is_error_hva(addr)) 2179 return -1; 2180 2181 if (entry < nr_pages) 2182 return 0; 2183 2184 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); 2185} 2186EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 2187 2188static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 2189{ 2190 if (is_error_noslot_pfn(pfn)) 2191 return KVM_ERR_PTR_BAD_PAGE; 2192 2193 if (kvm_is_reserved_pfn(pfn)) { 2194 WARN_ON(1); 2195 return KVM_ERR_PTR_BAD_PAGE; 2196 } 2197 2198 return pfn_to_page(pfn); 2199} 2200 2201struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 2202{ 2203 kvm_pfn_t pfn; 2204 2205 pfn = gfn_to_pfn(kvm, gfn); 2206 2207 return kvm_pfn_to_page(pfn); 2208} 2209EXPORT_SYMBOL_GPL(gfn_to_page); 2210 2211void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) 2212{ 2213 if (pfn == 0) 2214 return; 2215 2216 if (cache) 2217 cache->pfn = cache->gfn = 0; 2218 2219 if (dirty) 2220 kvm_release_pfn_dirty(pfn); 2221 else 2222 kvm_release_pfn_clean(pfn); 2223} 2224 2225static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, 2226 struct gfn_to_pfn_cache *cache, u64 gen) 2227{ 2228 kvm_release_pfn(cache->pfn, cache->dirty, cache); 2229 2230 cache->pfn = gfn_to_pfn_memslot(slot, gfn); 2231 cache->gfn = gfn; 2232 cache->dirty = false; 2233 cache->generation = gen; 2234} 2235 2236static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, 2237 struct kvm_host_map *map, 2238 struct gfn_to_pfn_cache *cache, 2239 bool atomic) 2240{ 2241 kvm_pfn_t pfn; 2242 void *hva = NULL; 2243 struct page *page = KVM_UNMAPPED_PAGE; 2244 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); 2245 u64 gen = slots->generation; 2246 2247 if (!map) 2248 return -EINVAL; 2249 2250 if (cache) { 2251 if (!cache->pfn || cache->gfn != gfn || 2252 cache->generation != gen) { 
2253 if (atomic) 2254 return -EAGAIN; 2255 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); 2256 } 2257 pfn = cache->pfn; 2258 } else { 2259 if (atomic) 2260 return -EAGAIN; 2261 pfn = gfn_to_pfn_memslot(slot, gfn); 2262 } 2263 if (is_error_noslot_pfn(pfn)) 2264 return -EINVAL; 2265 2266 if (pfn_valid(pfn)) { 2267 page = pfn_to_page(pfn); 2268 if (atomic) 2269 hva = kmap_atomic(page); 2270 else 2271 hva = kmap(page); 2272#ifdef CONFIG_HAS_IOMEM 2273 } else if (!atomic) { 2274 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 2275 } else { 2276 return -EINVAL; 2277#endif 2278 } 2279 2280 if (!hva) 2281 return -EFAULT; 2282 2283 map->page = page; 2284 map->hva = hva; 2285 map->pfn = pfn; 2286 map->gfn = gfn; 2287 2288 return 0; 2289} 2290 2291int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, 2292 struct gfn_to_pfn_cache *cache, bool atomic) 2293{ 2294 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, 2295 cache, atomic); 2296} 2297EXPORT_SYMBOL_GPL(kvm_map_gfn); 2298 2299int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 2300{ 2301 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, 2302 NULL, false); 2303} 2304EXPORT_SYMBOL_GPL(kvm_vcpu_map); 2305 2306static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, 2307 struct kvm_host_map *map, 2308 struct gfn_to_pfn_cache *cache, 2309 bool dirty, bool atomic) 2310{ 2311 if (!map) 2312 return; 2313 2314 if (!map->hva) 2315 return; 2316 2317 if (map->page != KVM_UNMAPPED_PAGE) { 2318 if (atomic) 2319 kunmap_atomic(map->hva); 2320 else 2321 kunmap(map->page); 2322 } 2323#ifdef CONFIG_HAS_IOMEM 2324 else if (!atomic) 2325 memunmap(map->hva); 2326 else 2327 WARN_ONCE(1, "Unexpected unmapping in atomic context"); 2328#endif 2329 2330 if (dirty) 2331 mark_page_dirty_in_slot(memslot, map->gfn); 2332 2333 if (cache) 2334 cache->dirty |= dirty; 2335 else 2336 kvm_release_pfn(map->pfn, dirty, NULL); 2337 2338 map->hva = NULL; 2339 map->page = NULL; 2340} 2341 2342int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 2343 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) 2344{ 2345 __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, 2346 cache, dirty, atomic); 2347 return 0; 2348} 2349EXPORT_SYMBOL_GPL(kvm_unmap_gfn); 2350 2351void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) 2352{ 2353 __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, 2354 dirty, false); 2355} 2356EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 2357 2358struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 2359{ 2360 kvm_pfn_t pfn; 2361 2362 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 2363 2364 return kvm_pfn_to_page(pfn); 2365} 2366EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 2367 2368void kvm_release_page_clean(struct page *page) 2369{ 2370 WARN_ON(is_error_page(page)); 2371 2372 kvm_release_pfn_clean(page_to_pfn(page)); 2373} 2374EXPORT_SYMBOL_GPL(kvm_release_page_clean); 2375 2376void kvm_release_pfn_clean(kvm_pfn_t pfn) 2377{ 2378 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 2379 put_page(pfn_to_page(pfn)); 2380} 2381EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 2382 2383void kvm_release_page_dirty(struct page *page) 2384{ 2385 WARN_ON(is_error_page(page)); 2386 2387 kvm_release_pfn_dirty(page_to_pfn(page)); 2388} 2389EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 2390 2391void kvm_release_pfn_dirty(kvm_pfn_t pfn) 2392{ 2393 kvm_set_pfn_dirty(pfn); 2394 kvm_release_pfn_clean(pfn); 2395} 2396EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 2397 
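/*
 * Illustrative sketch (not part of the build): the usual lifecycle for the
 * mapping helpers above.  A hypothetical caller pins a guest page, accesses
 * it through a temporary kernel mapping, and drops the reference, marking
 * the page dirty only if it wrote to it:
 *
 *	struct kvm_host_map map;
 *
 *	if (kvm_vcpu_map(vcpu, gfn, &map))
 *		return -EFAULT;
 *	memcpy(map.hva + offset, data, len);	// guest page is now dirty
 *	kvm_vcpu_unmap(vcpu, &map, true);	// "true" marks it dirty
 *
 * The same pairing applies to gfn_to_pfn()/kvm_release_pfn_clean() when only
 * the pfn is needed; gfn, offset, data and len above are assumptions for the
 * example, not fields of any KVM structure.
 */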
2398static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn) 2399{ 2400 if (!pfn_valid(pfn)) 2401 return false; 2402 2403 /* 2404 * Per page-flags.h, pages tagged PG_reserved "should in general not be 2405 * touched (e.g. set dirty) except by its owner". 2406 */ 2407 return !PageReserved(pfn_to_page(pfn)); 2408} 2409 2410void kvm_set_pfn_dirty(kvm_pfn_t pfn) 2411{ 2412 if (kvm_is_ad_tracked_pfn(pfn)) 2413 SetPageDirty(pfn_to_page(pfn)); 2414} 2415EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 2416 2417void kvm_set_pfn_accessed(kvm_pfn_t pfn) 2418{ 2419 if (kvm_is_ad_tracked_pfn(pfn)) 2420 mark_page_accessed(pfn_to_page(pfn)); 2421} 2422EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 2423 2424void kvm_get_pfn(kvm_pfn_t pfn) 2425{ 2426 if (!kvm_is_reserved_pfn(pfn)) 2427 get_page(pfn_to_page(pfn)); 2428} 2429EXPORT_SYMBOL_GPL(kvm_get_pfn); 2430 2431static int next_segment(unsigned long len, int offset) 2432{ 2433 if (len > PAGE_SIZE - offset) 2434 return PAGE_SIZE - offset; 2435 else 2436 return len; 2437} 2438 2439static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 2440 void *data, int offset, int len) 2441{ 2442 int r; 2443 unsigned long addr; 2444 2445 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2446 if (kvm_is_error_hva(addr)) 2447 return -EFAULT; 2448 r = __copy_from_user(data, (void __user *)addr + offset, len); 2449 if (r) 2450 return -EFAULT; 2451 return 0; 2452} 2453 2454int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 2455 int len) 2456{ 2457 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2458 2459 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2460} 2461EXPORT_SYMBOL_GPL(kvm_read_guest_page); 2462 2463int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 2464 int offset, int len) 2465{ 2466 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2467 2468 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2469} 2470EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 2471 2472int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 2473{ 2474 gfn_t gfn = gpa >> PAGE_SHIFT; 2475 int seg; 2476 int offset = offset_in_page(gpa); 2477 int ret; 2478 2479 while ((seg = next_segment(len, offset)) != 0) { 2480 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 2481 if (ret < 0) 2482 return ret; 2483 offset = 0; 2484 len -= seg; 2485 data += seg; 2486 ++gfn; 2487 } 2488 return 0; 2489} 2490EXPORT_SYMBOL_GPL(kvm_read_guest); 2491 2492int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 2493{ 2494 gfn_t gfn = gpa >> PAGE_SHIFT; 2495 int seg; 2496 int offset = offset_in_page(gpa); 2497 int ret; 2498 2499 while ((seg = next_segment(len, offset)) != 0) { 2500 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 2501 if (ret < 0) 2502 return ret; 2503 offset = 0; 2504 len -= seg; 2505 data += seg; 2506 ++gfn; 2507 } 2508 return 0; 2509} 2510EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 2511 2512static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2513 void *data, int offset, unsigned long len) 2514{ 2515 int r; 2516 unsigned long addr; 2517 2518 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2519 if (kvm_is_error_hva(addr)) 2520 return -EFAULT; 2521 pagefault_disable(); 2522 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2523 pagefault_enable(); 2524 if (r) 2525 return -EFAULT; 2526 return 0; 2527} 2528 2529int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2530 void *data, unsigned 
long len) 2531{ 2532 gfn_t gfn = gpa >> PAGE_SHIFT; 2533 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2534 int offset = offset_in_page(gpa); 2535 2536 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2537} 2538EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2539 2540static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 2541 const void *data, int offset, int len) 2542{ 2543 int r; 2544 unsigned long addr; 2545 2546 addr = gfn_to_hva_memslot(memslot, gfn); 2547 if (kvm_is_error_hva(addr)) 2548 return -EFAULT; 2549 r = __copy_to_user((void __user *)addr + offset, data, len); 2550 if (r) 2551 return -EFAULT; 2552 mark_page_dirty_in_slot(memslot, gfn); 2553 return 0; 2554} 2555 2556int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2557 const void *data, int offset, int len) 2558{ 2559 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2560 2561 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2562} 2563EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2564 2565int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2566 const void *data, int offset, int len) 2567{ 2568 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2569 2570 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2571} 2572EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2573 2574int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2575 unsigned long len) 2576{ 2577 gfn_t gfn = gpa >> PAGE_SHIFT; 2578 int seg; 2579 int offset = offset_in_page(gpa); 2580 int ret; 2581 2582 while ((seg = next_segment(len, offset)) != 0) { 2583 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 2584 if (ret < 0) 2585 return ret; 2586 offset = 0; 2587 len -= seg; 2588 data += seg; 2589 ++gfn; 2590 } 2591 return 0; 2592} 2593EXPORT_SYMBOL_GPL(kvm_write_guest); 2594 2595int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 2596 unsigned long len) 2597{ 2598 gfn_t gfn = gpa >> PAGE_SHIFT; 2599 int seg; 2600 int offset = offset_in_page(gpa); 2601 int ret; 2602 2603 while ((seg = next_segment(len, offset)) != 0) { 2604 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 2605 if (ret < 0) 2606 return ret; 2607 offset = 0; 2608 len -= seg; 2609 data += seg; 2610 ++gfn; 2611 } 2612 return 0; 2613} 2614EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 2615 2616static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 2617 struct gfn_to_hva_cache *ghc, 2618 gpa_t gpa, unsigned long len) 2619{ 2620 int offset = offset_in_page(gpa); 2621 gfn_t start_gfn = gpa >> PAGE_SHIFT; 2622 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2623 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2624 gfn_t nr_pages_avail; 2625 2626 /* Update ghc->generation before performing any error checks. */ 2627 ghc->generation = slots->generation; 2628 2629 if (start_gfn > end_gfn) { 2630 ghc->hva = KVM_HVA_ERR_BAD; 2631 return -EINVAL; 2632 } 2633 2634 /* 2635 * If the requested region crosses two memslots, we still 2636 * verify that the entire region is valid here. 2637 */ 2638 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { 2639 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2640 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2641 &nr_pages_avail); 2642 if (kvm_is_error_hva(ghc->hva)) 2643 return -EFAULT; 2644 } 2645 2646 /* Use the slow path for cross page reads and writes. 
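	 * A NULL ghc->memslot is the marker the cached accessors check:
	 * when the region spans more than one page,
	 * kvm_read_guest_offset_cached() and kvm_write_guest_offset_cached()
	 * fall back to kvm_read_guest()/kvm_write_guest() instead of using
	 * the cached hva.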
*/ 2647 if (nr_pages_needed == 1) 2648 ghc->hva += offset; 2649 else 2650 ghc->memslot = NULL; 2651 2652 ghc->gpa = gpa; 2653 ghc->len = len; 2654 return 0; 2655} 2656 2657int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2658 gpa_t gpa, unsigned long len) 2659{ 2660 struct kvm_memslots *slots = kvm_memslots(kvm); 2661 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 2662} 2663EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2664 2665int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2666 void *data, unsigned int offset, 2667 unsigned long len) 2668{ 2669 struct kvm_memslots *slots = kvm_memslots(kvm); 2670 int r; 2671 gpa_t gpa = ghc->gpa + offset; 2672 2673 if (WARN_ON_ONCE(len + offset > ghc->len)) 2674 return -EINVAL; 2675 2676 if (slots->generation != ghc->generation) { 2677 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2678 return -EFAULT; 2679 } 2680 2681 if (kvm_is_error_hva(ghc->hva)) 2682 return -EFAULT; 2683 2684 if (unlikely(!ghc->memslot)) 2685 return kvm_write_guest(kvm, gpa, data, len); 2686 2687 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 2688 if (r) 2689 return -EFAULT; 2690 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 2691 2692 return 0; 2693} 2694EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2695 2696int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2697 void *data, unsigned long len) 2698{ 2699 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2700} 2701EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2702 2703int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2704 void *data, unsigned int offset, 2705 unsigned long len) 2706{ 2707 struct kvm_memslots *slots = kvm_memslots(kvm); 2708 int r; 2709 gpa_t gpa = ghc->gpa + offset; 2710 2711 if (WARN_ON_ONCE(len + offset > ghc->len)) 2712 return -EINVAL; 2713 2714 if (slots->generation != ghc->generation) { 2715 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2716 return -EFAULT; 2717 } 2718 2719 if (kvm_is_error_hva(ghc->hva)) 2720 return -EFAULT; 2721 2722 if (unlikely(!ghc->memslot)) 2723 return kvm_read_guest(kvm, gpa, data, len); 2724 2725 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); 2726 if (r) 2727 return -EFAULT; 2728 2729 return 0; 2730} 2731EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); 2732 2733int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2734 void *data, unsigned long len) 2735{ 2736 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); 2737} 2738EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2739 2740int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2741{ 2742 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2743 2744 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2745} 2746EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2747 2748int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2749{ 2750 gfn_t gfn = gpa >> PAGE_SHIFT; 2751 int seg; 2752 int offset = offset_in_page(gpa); 2753 int ret; 2754 2755 while ((seg = next_segment(len, offset)) != 0) { 2756 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2757 if (ret < 0) 2758 return ret; 2759 offset = 0; 2760 len -= seg; 2761 ++gfn; 2762 } 2763 return 0; 2764} 2765EXPORT_SYMBOL_GPL(kvm_clear_guest); 2766 2767void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn) 2768{ 2769 if (memslot && memslot->dirty_bitmap) { 2770 unsigned long 
rel_gfn = gfn - memslot->base_gfn;

		set_bit_le(rel_gfn, memslot->dirty_bitmap);
	}
}
EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	memslot = gfn_to_memslot(kvm, gfn);
	mark_page_dirty_in_slot(memslot, gfn);
}
EXPORT_SYMBOL_GPL(mark_page_dirty);

void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	mark_page_dirty_in_slot(memslot, gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);

void kvm_sigset_activate(struct kvm_vcpu *vcpu)
{
	if (!vcpu->sigset_active)
		return;

	/*
	 * This does a lockless modification of ->real_blocked, which is fine
	 * because only current can change ->real_blocked and all readers of
	 * ->real_blocked don't care as long as ->real_blocked is always a
	 * subset of ->blocked.
	 */
	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
}

void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
{
	if (!vcpu->sigset_active)
		return;

	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
	sigemptyset(&current->real_blocked);
}

static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
{
	unsigned int old, val, grow, grow_start;

	old = val = vcpu->halt_poll_ns;
	grow_start = READ_ONCE(halt_poll_ns_grow_start);
	grow = READ_ONCE(halt_poll_ns_grow);
	if (!grow)
		goto out;

	val *= grow;
	if (val < grow_start)
		val = grow_start;

	if (val > vcpu->kvm->max_halt_poll_ns)
		val = vcpu->kvm->max_halt_poll_ns;

	vcpu->halt_poll_ns = val;
out:
	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
}

static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
{
	unsigned int old, val, shrink, grow_start;

	old = val = vcpu->halt_poll_ns;
	shrink = READ_ONCE(halt_poll_ns_shrink);
	grow_start = READ_ONCE(halt_poll_ns_grow_start);
	if (shrink == 0)
		val = 0;
	else
		val /= shrink;

	if (val < grow_start)
		val = 0;

	vcpu->halt_poll_ns = val;
	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
}

static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
{
	int ret = -EINTR;
	int idx = srcu_read_lock(&vcpu->kvm->srcu);

	if (kvm_arch_vcpu_runnable(vcpu)) {
		kvm_make_request(KVM_REQ_UNHALT, vcpu);
		goto out;
	}
	if (kvm_cpu_has_pending_timer(vcpu))
		goto out;
	if (signal_pending(current))
		goto out;

	ret = 0;
out:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

static inline void
update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
{
	if (waited)
		vcpu->stat.halt_poll_fail_ns += poll_ns;
	else
		vcpu->stat.halt_poll_success_ns += poll_ns;
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
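 * Before actually blocking, the vCPU busy-polls for up to
 * vcpu->halt_poll_ns so that a wakeup arriving almost immediately avoids a
 * full context switch.  The window is adaptive: a block that ends within
 * the window leaves it unchanged, a short halt with a small window grows
 * it via grow_halt_poll_ns() (capped at kvm->max_halt_poll_ns), and a
 * block longer than kvm->max_halt_poll_ns shrinks it via
 * shrink_halt_poll_ns(), down to zero when halt_poll_ns_shrink is 0.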
2890 */ 2891void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2892{ 2893 ktime_t start, cur, poll_end; 2894 bool waited = false; 2895 u64 block_ns; 2896 2897 kvm_arch_vcpu_blocking(vcpu); 2898 2899 start = cur = poll_end = ktime_get(); 2900 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { 2901 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2902 2903 ++vcpu->stat.halt_attempted_poll; 2904 do { 2905 /* 2906 * This sets KVM_REQ_UNHALT if an interrupt 2907 * arrives. 2908 */ 2909 if (kvm_vcpu_check_block(vcpu) < 0) { 2910 ++vcpu->stat.halt_successful_poll; 2911 if (!vcpu_valid_wakeup(vcpu)) 2912 ++vcpu->stat.halt_poll_invalid; 2913 goto out; 2914 } 2915 poll_end = cur = ktime_get(); 2916 } while (single_task_running() && !need_resched() && 2917 ktime_before(cur, stop)); 2918 } 2919 2920 prepare_to_rcuwait(&vcpu->wait); 2921 for (;;) { 2922 set_current_state(TASK_INTERRUPTIBLE); 2923 2924 if (kvm_vcpu_check_block(vcpu) < 0) 2925 break; 2926 2927 waited = true; 2928 schedule(); 2929 } 2930 finish_rcuwait(&vcpu->wait); 2931 cur = ktime_get(); 2932out: 2933 kvm_arch_vcpu_unblocking(vcpu); 2934 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2935 2936 update_halt_poll_stats( 2937 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited); 2938 2939 if (!kvm_arch_no_poll(vcpu)) { 2940 if (!vcpu_valid_wakeup(vcpu)) { 2941 shrink_halt_poll_ns(vcpu); 2942 } else if (vcpu->kvm->max_halt_poll_ns) { 2943 if (block_ns <= vcpu->halt_poll_ns) 2944 ; 2945 /* we had a long block, shrink polling */ 2946 else if (vcpu->halt_poll_ns && 2947 block_ns > vcpu->kvm->max_halt_poll_ns) 2948 shrink_halt_poll_ns(vcpu); 2949 /* we had a short halt and our poll time is too small */ 2950 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && 2951 block_ns < vcpu->kvm->max_halt_poll_ns) 2952 grow_halt_poll_ns(vcpu); 2953 } else { 2954 vcpu->halt_poll_ns = 0; 2955 } 2956 } 2957 2958 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2959 kvm_arch_vcpu_block_finish(vcpu); 2960} 2961EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2962 2963bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2964{ 2965 struct rcuwait *waitp; 2966 2967 waitp = kvm_arch_vcpu_get_wait(vcpu); 2968 if (rcuwait_wake_up(waitp)) { 2969 WRITE_ONCE(vcpu->ready, true); 2970 ++vcpu->stat.halt_wakeup; 2971 return true; 2972 } 2973 2974 return false; 2975} 2976EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2977 2978#ifndef CONFIG_S390 2979/* 2980 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2981 */ 2982void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2983{ 2984 int me, cpu; 2985 2986 if (kvm_vcpu_wake_up(vcpu)) 2987 return; 2988 2989 /* 2990 * Note, the vCPU could get migrated to a different pCPU at any point 2991 * after kvm_arch_vcpu_should_kick(), which could result in sending an 2992 * IPI to the previous pCPU. But, that's ok because the purpose of the 2993 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the 2994 * vCPU also requires it to leave IN_GUEST_MODE. 
 */
	me = get_cpu();
	if (kvm_arch_vcpu_should_kick(vcpu)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
			smp_send_reschedule(cpu);
	}
	put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
#endif /* !CONFIG_S390 */

int kvm_vcpu_yield_to(struct kvm_vcpu *target)
{
	struct pid *pid;
	struct task_struct *task = NULL;
	int ret = 0;

	rcu_read_lock();
	pid = rcu_dereference(target->pid);
	if (pid)
		task = get_pid_task(pid, PIDTYPE_PID);
	rcu_read_unlock();
	if (!task)
		return ret;
	ret = yield_to(task, 1);
	put_task_struct(task);

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);

/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * The most eligible candidate to yield to is chosen by the following
 * heuristics:
 *
 * (a) a VCPU which has not done a PLE exit or had CPU relax intercepted
 * recently (i.e. a preempted lock holder), indicated by @in_spin_loop.
 * Set at the beginning and cleared at the end of the interception/PLE
 * handler.
 *
 * (b) a VCPU which has done a PLE exit/had CPU relax intercepted but did
 * not get a chance last time (it has most likely become eligible now,
 * since we probably yielded to the lock holder in the last iteration).
 * This is tracked by toggling @dy_eligible each time a VCPU is checked
 * for eligibility.
 *
 * Yielding to a recently PLE-exited/CPU-relax-intercepted VCPU before
 * yielding to a preempted lock holder could result in selecting the wrong
 * VCPU and burning CPU time.  Giving priority to a potential lock holder
 * improves lock progress.
 *
 * Since the algorithm is based on heuristics, accessing another VCPU's
 * data without locking does no harm.  It may result in trying to yield to
 * the same VCPU, failing, and continuing with the next VCPU, and so on.
 */
static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
	bool eligible;

	eligible = !vcpu->spin_loop.in_spin_loop ||
		    vcpu->spin_loop.dy_eligible;

	if (vcpu->spin_loop.in_spin_loop)
		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);

	return eligible;
#else
	return true;
#endif
}

/*
 * Unlike kvm_arch_vcpu_runnable, this function is called outside
 * a vcpu_load/vcpu_put pair.  However, for most architectures
 * kvm_arch_vcpu_runnable does not require vcpu_load.
 */
bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
	return kvm_arch_vcpu_runnable(vcpu);
}

static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
{
	if (kvm_arch_dy_runnable(vcpu))
		return true;

#ifdef CONFIG_KVM_ASYNC_PF
	if (!list_empty_careful(&vcpu->async_pf.done))
		return true;
#endif

	return false;
}

void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
	struct kvm *kvm = me->kvm;
	struct kvm_vcpu *vcpu;
	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
	int yielded = 0;
	int try = 3;
	int pass;
	int i;

	kvm_vcpu_set_in_spin_loop(me, true);
	/*
	 * We boost the priority of a VCPU that is runnable but not
	 * currently running, because it got preempted by something
	 * else and called schedule in __vcpu_run.  Hopefully that
	 * VCPU is holding the lock that we need and will release it.
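	 * Candidates that are idle without a pending reason to run, that
	 * are in guest user mode when yielding to kernel mode was asked
	 * for, or that fail the directed-yield eligibility check above are
	 * skipped; up to three failed yield_to() attempts are tolerated
	 * per call.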
3105 * We approximate round-robin by starting at the last boosted VCPU. 3106 */ 3107 for (pass = 0; pass < 2 && !yielded && try; pass++) { 3108 kvm_for_each_vcpu(i, vcpu, kvm) { 3109 if (!pass && i <= last_boosted_vcpu) { 3110 i = last_boosted_vcpu; 3111 continue; 3112 } else if (pass && i > last_boosted_vcpu) 3113 break; 3114 if (!READ_ONCE(vcpu->ready)) 3115 continue; 3116 if (vcpu == me) 3117 continue; 3118 if (rcuwait_active(&vcpu->wait) && 3119 !vcpu_dy_runnable(vcpu)) 3120 continue; 3121 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 3122 !kvm_arch_vcpu_in_kernel(vcpu)) 3123 continue; 3124 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 3125 continue; 3126 3127 yielded = kvm_vcpu_yield_to(vcpu); 3128 if (yielded > 0) { 3129 kvm->last_boosted_vcpu = i; 3130 break; 3131 } else if (yielded < 0) { 3132 try--; 3133 if (!try) 3134 break; 3135 } 3136 } 3137 } 3138 kvm_vcpu_set_in_spin_loop(me, false); 3139 3140 /* Ensure vcpu is not eligible during next spinloop */ 3141 kvm_vcpu_set_dy_eligible(me, false); 3142} 3143EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 3144 3145static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 3146{ 3147 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 3148 struct page *page; 3149 3150 if (vmf->pgoff == 0) 3151 page = virt_to_page(vcpu->run); 3152#ifdef CONFIG_X86 3153 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 3154 page = virt_to_page(vcpu->arch.pio_data); 3155#endif 3156#ifdef CONFIG_KVM_MMIO 3157 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 3158 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 3159#endif 3160 else 3161 return kvm_arch_vcpu_fault(vcpu, vmf); 3162 get_page(page); 3163 vmf->page = page; 3164 return 0; 3165} 3166 3167static const struct vm_operations_struct kvm_vcpu_vm_ops = { 3168 .fault = kvm_vcpu_fault, 3169}; 3170 3171static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 3172{ 3173 vma->vm_ops = &kvm_vcpu_vm_ops; 3174 return 0; 3175} 3176 3177static int kvm_vcpu_release(struct inode *inode, struct file *filp) 3178{ 3179 struct kvm_vcpu *vcpu = filp->private_data; 3180 3181 kvm_put_kvm(vcpu->kvm); 3182 return 0; 3183} 3184 3185static struct file_operations kvm_vcpu_fops = { 3186 .release = kvm_vcpu_release, 3187 .unlocked_ioctl = kvm_vcpu_ioctl, 3188 .mmap = kvm_vcpu_mmap, 3189 .llseek = noop_llseek, 3190 KVM_COMPAT(kvm_vcpu_compat_ioctl), 3191}; 3192 3193/* 3194 * Allocates an inode for the vcpu. 3195 */ 3196static int create_vcpu_fd(struct kvm_vcpu *vcpu) 3197{ 3198 char name[8 + 1 + ITOA_MAX_LEN + 1]; 3199 3200 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 3201 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 3202} 3203 3204static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 3205{ 3206#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 3207 struct dentry *debugfs_dentry; 3208 char dir_name[ITOA_MAX_LEN * 2]; 3209 3210 if (!debugfs_initialized()) 3211 return; 3212 3213 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 3214 debugfs_dentry = debugfs_create_dir(dir_name, 3215 vcpu->kvm->debugfs_dentry); 3216 3217 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); 3218#endif 3219} 3220 3221/* 3222 * Creates some virtual cpus. Good luck creating more than one. 
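 *
 * For reference, the userspace side of this path is roughly (illustrative
 * only, error handling omitted; the fds and variables are hypothetical):
 *
 *	vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	size    = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	run     = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		       vcpu_fd, 0);
 *	ioctl(vcpu_fd, KVM_RUN, 0);	// handled by kvm_vcpu_ioctl() below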
3223 */ 3224static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 3225{ 3226 int r; 3227 struct kvm_vcpu *vcpu; 3228 struct page *page; 3229 3230 if (id >= KVM_MAX_VCPU_ID) 3231 return -EINVAL; 3232 3233 mutex_lock(&kvm->lock); 3234 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 3235 mutex_unlock(&kvm->lock); 3236 return -EINVAL; 3237 } 3238 3239 kvm->created_vcpus++; 3240 mutex_unlock(&kvm->lock); 3241 3242 r = kvm_arch_vcpu_precreate(kvm, id); 3243 if (r) 3244 goto vcpu_decrement; 3245 3246 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 3247 if (!vcpu) { 3248 r = -ENOMEM; 3249 goto vcpu_decrement; 3250 } 3251 3252 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); 3253 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 3254 if (!page) { 3255 r = -ENOMEM; 3256 goto vcpu_free; 3257 } 3258 vcpu->run = page_address(page); 3259 3260 kvm_vcpu_init(vcpu, kvm, id); 3261 3262 r = kvm_arch_vcpu_create(vcpu); 3263 if (r) 3264 goto vcpu_free_run_page; 3265 3266 mutex_lock(&kvm->lock); 3267 if (kvm_get_vcpu_by_id(kvm, id)) { 3268 r = -EEXIST; 3269 goto unlock_vcpu_destroy; 3270 } 3271 3272 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); 3273 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); 3274 3275 /* Now it's all set up, let userspace reach it */ 3276 kvm_get_kvm(kvm); 3277 r = create_vcpu_fd(vcpu); 3278 if (r < 0) { 3279 kvm_put_kvm_no_destroy(kvm); 3280 goto unlock_vcpu_destroy; 3281 } 3282 3283 kvm->vcpus[vcpu->vcpu_idx] = vcpu; 3284 3285 /* 3286 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 3287 * before kvm->online_vcpu's incremented value. 3288 */ 3289 smp_wmb(); 3290 atomic_inc(&kvm->online_vcpus); 3291 3292 mutex_unlock(&kvm->lock); 3293 kvm_arch_vcpu_postcreate(vcpu); 3294 kvm_create_vcpu_debugfs(vcpu); 3295 return r; 3296 3297unlock_vcpu_destroy: 3298 mutex_unlock(&kvm->lock); 3299 kvm_arch_vcpu_destroy(vcpu); 3300vcpu_free_run_page: 3301 free_page((unsigned long)vcpu->run); 3302vcpu_free: 3303 kmem_cache_free(kvm_vcpu_cache, vcpu); 3304vcpu_decrement: 3305 mutex_lock(&kvm->lock); 3306 kvm->created_vcpus--; 3307 mutex_unlock(&kvm->lock); 3308 return r; 3309} 3310 3311static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 3312{ 3313 if (sigset) { 3314 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3315 vcpu->sigset_active = 1; 3316 vcpu->sigset = *sigset; 3317 } else 3318 vcpu->sigset_active = 0; 3319 return 0; 3320} 3321 3322static long kvm_vcpu_ioctl(struct file *filp, 3323 unsigned int ioctl, unsigned long arg) 3324{ 3325 struct kvm_vcpu *vcpu = filp->private_data; 3326 void __user *argp = (void __user *)arg; 3327 int r; 3328 struct kvm_fpu *fpu = NULL; 3329 struct kvm_sregs *kvm_sregs = NULL; 3330 3331 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) 3332 return -EIO; 3333 3334 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 3335 return -EINVAL; 3336 3337 /* 3338 * Some architectures have vcpu ioctls that are asynchronous to vcpu 3339 * execution; mutex_lock() would break them. 3340 */ 3341 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 3342 if (r != -ENOIOCTLCMD) 3343 return r; 3344 3345 if (mutex_lock_killable(&vcpu->mutex)) 3346 return -EINTR; 3347 switch (ioctl) { 3348 case KVM_RUN: { 3349 struct pid *oldpid; 3350 r = -EINVAL; 3351 if (arg) 3352 goto out; 3353 oldpid = rcu_access_pointer(vcpu->pid); 3354 if (unlikely(oldpid != task_pid(current))) { 3355 /* The thread running this VCPU changed. 
*/ 3356 struct pid *newpid; 3357 3358 r = kvm_arch_vcpu_run_pid_change(vcpu); 3359 if (r) 3360 break; 3361 3362 newpid = get_task_pid(current, PIDTYPE_PID); 3363 rcu_assign_pointer(vcpu->pid, newpid); 3364 if (oldpid) 3365 synchronize_rcu(); 3366 put_pid(oldpid); 3367 } 3368 r = kvm_arch_vcpu_ioctl_run(vcpu); 3369 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 3370 break; 3371 } 3372 case KVM_GET_REGS: { 3373 struct kvm_regs *kvm_regs; 3374 3375 r = -ENOMEM; 3376 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 3377 if (!kvm_regs) 3378 goto out; 3379 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 3380 if (r) 3381 goto out_free1; 3382 r = -EFAULT; 3383 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 3384 goto out_free1; 3385 r = 0; 3386out_free1: 3387 kfree(kvm_regs); 3388 break; 3389 } 3390 case KVM_SET_REGS: { 3391 struct kvm_regs *kvm_regs; 3392 3393 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 3394 if (IS_ERR(kvm_regs)) { 3395 r = PTR_ERR(kvm_regs); 3396 goto out; 3397 } 3398 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 3399 kfree(kvm_regs); 3400 break; 3401 } 3402 case KVM_GET_SREGS: { 3403 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 3404 GFP_KERNEL_ACCOUNT); 3405 r = -ENOMEM; 3406 if (!kvm_sregs) 3407 goto out; 3408 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 3409 if (r) 3410 goto out; 3411 r = -EFAULT; 3412 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 3413 goto out; 3414 r = 0; 3415 break; 3416 } 3417 case KVM_SET_SREGS: { 3418 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 3419 if (IS_ERR(kvm_sregs)) { 3420 r = PTR_ERR(kvm_sregs); 3421 kvm_sregs = NULL; 3422 goto out; 3423 } 3424 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 3425 break; 3426 } 3427 case KVM_GET_MP_STATE: { 3428 struct kvm_mp_state mp_state; 3429 3430 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 3431 if (r) 3432 goto out; 3433 r = -EFAULT; 3434 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 3435 goto out; 3436 r = 0; 3437 break; 3438 } 3439 case KVM_SET_MP_STATE: { 3440 struct kvm_mp_state mp_state; 3441 3442 r = -EFAULT; 3443 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 3444 goto out; 3445 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 3446 break; 3447 } 3448 case KVM_TRANSLATE: { 3449 struct kvm_translation tr; 3450 3451 r = -EFAULT; 3452 if (copy_from_user(&tr, argp, sizeof(tr))) 3453 goto out; 3454 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 3455 if (r) 3456 goto out; 3457 r = -EFAULT; 3458 if (copy_to_user(argp, &tr, sizeof(tr))) 3459 goto out; 3460 r = 0; 3461 break; 3462 } 3463 case KVM_SET_GUEST_DEBUG: { 3464 struct kvm_guest_debug dbg; 3465 3466 r = -EFAULT; 3467 if (copy_from_user(&dbg, argp, sizeof(dbg))) 3468 goto out; 3469 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 3470 break; 3471 } 3472 case KVM_SET_SIGNAL_MASK: { 3473 struct kvm_signal_mask __user *sigmask_arg = argp; 3474 struct kvm_signal_mask kvm_sigmask; 3475 sigset_t sigset, *p; 3476 3477 p = NULL; 3478 if (argp) { 3479 r = -EFAULT; 3480 if (copy_from_user(&kvm_sigmask, argp, 3481 sizeof(kvm_sigmask))) 3482 goto out; 3483 r = -EINVAL; 3484 if (kvm_sigmask.len != sizeof(sigset)) 3485 goto out; 3486 r = -EFAULT; 3487 if (copy_from_user(&sigset, sigmask_arg->sigset, 3488 sizeof(sigset))) 3489 goto out; 3490 p = &sigset; 3491 } 3492 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 3493 break; 3494 } 3495 case KVM_GET_FPU: { 3496 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 3497 r = -ENOMEM; 3498 if (!fpu) 3499 goto out; 
3500 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 3501 if (r) 3502 goto out; 3503 r = -EFAULT; 3504 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 3505 goto out; 3506 r = 0; 3507 break; 3508 } 3509 case KVM_SET_FPU: { 3510 fpu = memdup_user(argp, sizeof(*fpu)); 3511 if (IS_ERR(fpu)) { 3512 r = PTR_ERR(fpu); 3513 fpu = NULL; 3514 goto out; 3515 } 3516 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 3517 break; 3518 } 3519 default: 3520 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 3521 } 3522out: 3523 mutex_unlock(&vcpu->mutex); 3524 kfree(fpu); 3525 kfree(kvm_sregs); 3526 return r; 3527} 3528 3529#ifdef CONFIG_KVM_COMPAT 3530static long kvm_vcpu_compat_ioctl(struct file *filp, 3531 unsigned int ioctl, unsigned long arg) 3532{ 3533 struct kvm_vcpu *vcpu = filp->private_data; 3534 void __user *argp = compat_ptr(arg); 3535 int r; 3536 3537 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) 3538 return -EIO; 3539 3540 switch (ioctl) { 3541 case KVM_SET_SIGNAL_MASK: { 3542 struct kvm_signal_mask __user *sigmask_arg = argp; 3543 struct kvm_signal_mask kvm_sigmask; 3544 sigset_t sigset; 3545 3546 if (argp) { 3547 r = -EFAULT; 3548 if (copy_from_user(&kvm_sigmask, argp, 3549 sizeof(kvm_sigmask))) 3550 goto out; 3551 r = -EINVAL; 3552 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 3553 goto out; 3554 r = -EFAULT; 3555 if (get_compat_sigset(&sigset, 3556 (compat_sigset_t __user *)sigmask_arg->sigset)) 3557 goto out; 3558 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 3559 } else 3560 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 3561 break; 3562 } 3563 default: 3564 r = kvm_vcpu_ioctl(filp, ioctl, arg); 3565 } 3566 3567out: 3568 return r; 3569} 3570#endif 3571 3572static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) 3573{ 3574 struct kvm_device *dev = filp->private_data; 3575 3576 if (dev->ops->mmap) 3577 return dev->ops->mmap(dev, vma); 3578 3579 return -ENODEV; 3580} 3581 3582static int kvm_device_ioctl_attr(struct kvm_device *dev, 3583 int (*accessor)(struct kvm_device *dev, 3584 struct kvm_device_attr *attr), 3585 unsigned long arg) 3586{ 3587 struct kvm_device_attr attr; 3588 3589 if (!accessor) 3590 return -EPERM; 3591 3592 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 3593 return -EFAULT; 3594 3595 return accessor(dev, &attr); 3596} 3597 3598static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 3599 unsigned long arg) 3600{ 3601 struct kvm_device *dev = filp->private_data; 3602 3603 if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged) 3604 return -EIO; 3605 3606 switch (ioctl) { 3607 case KVM_SET_DEVICE_ATTR: 3608 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 3609 case KVM_GET_DEVICE_ATTR: 3610 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 3611 case KVM_HAS_DEVICE_ATTR: 3612 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 3613 default: 3614 if (dev->ops->ioctl) 3615 return dev->ops->ioctl(dev, ioctl, arg); 3616 3617 return -ENOTTY; 3618 } 3619} 3620 3621static int kvm_device_release(struct inode *inode, struct file *filp) 3622{ 3623 struct kvm_device *dev = filp->private_data; 3624 struct kvm *kvm = dev->kvm; 3625 3626 if (dev->ops->release) { 3627 mutex_lock(&kvm->lock); 3628 list_del(&dev->vm_node); 3629 dev->ops->release(dev); 3630 mutex_unlock(&kvm->lock); 3631 } 3632 3633 kvm_put_kvm(kvm); 3634 return 0; 3635} 3636 3637static const struct file_operations kvm_device_fops = { 3638 .unlocked_ioctl = kvm_device_ioctl, 3639 .release = kvm_device_release, 3640 KVM_COMPAT(kvm_device_ioctl), 3641 
.mmap = kvm_device_mmap, 3642}; 3643 3644struct kvm_device *kvm_device_from_filp(struct file *filp) 3645{ 3646 if (filp->f_op != &kvm_device_fops) 3647 return NULL; 3648 3649 return filp->private_data; 3650} 3651 3652static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 3653#ifdef CONFIG_KVM_MPIC 3654 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 3655 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 3656#endif 3657}; 3658 3659int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) 3660{ 3661 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 3662 return -ENOSPC; 3663 3664 if (kvm_device_ops_table[type] != NULL) 3665 return -EEXIST; 3666 3667 kvm_device_ops_table[type] = ops; 3668 return 0; 3669} 3670 3671void kvm_unregister_device_ops(u32 type) 3672{ 3673 if (kvm_device_ops_table[type] != NULL) 3674 kvm_device_ops_table[type] = NULL; 3675} 3676 3677static int kvm_ioctl_create_device(struct kvm *kvm, 3678 struct kvm_create_device *cd) 3679{ 3680 const struct kvm_device_ops *ops = NULL; 3681 struct kvm_device *dev; 3682 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 3683 int type; 3684 int ret; 3685 3686 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 3687 return -ENODEV; 3688 3689 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 3690 ops = kvm_device_ops_table[type]; 3691 if (ops == NULL) 3692 return -ENODEV; 3693 3694 if (test) 3695 return 0; 3696 3697 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 3698 if (!dev) 3699 return -ENOMEM; 3700 3701 dev->ops = ops; 3702 dev->kvm = kvm; 3703 3704 mutex_lock(&kvm->lock); 3705 ret = ops->create(dev, type); 3706 if (ret < 0) { 3707 mutex_unlock(&kvm->lock); 3708 kfree(dev); 3709 return ret; 3710 } 3711 list_add(&dev->vm_node, &kvm->devices); 3712 mutex_unlock(&kvm->lock); 3713 3714 if (ops->init) 3715 ops->init(dev); 3716 3717 kvm_get_kvm(kvm); 3718 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 3719 if (ret < 0) { 3720 kvm_put_kvm_no_destroy(kvm); 3721 mutex_lock(&kvm->lock); 3722 list_del(&dev->vm_node); 3723 if (ops->release) 3724 ops->release(dev); 3725 mutex_unlock(&kvm->lock); 3726 if (ops->destroy) 3727 ops->destroy(dev); 3728 return ret; 3729 } 3730 3731 cd->fd = ret; 3732 return 0; 3733} 3734 3735static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 3736{ 3737 switch (arg) { 3738 case KVM_CAP_USER_MEMORY: 3739 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 3740 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 3741 case KVM_CAP_INTERNAL_ERROR_DATA: 3742#ifdef CONFIG_HAVE_KVM_MSI 3743 case KVM_CAP_SIGNAL_MSI: 3744#endif 3745#ifdef CONFIG_HAVE_KVM_IRQFD 3746 case KVM_CAP_IRQFD: 3747 case KVM_CAP_IRQFD_RESAMPLE: 3748#endif 3749 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3750 case KVM_CAP_CHECK_EXTENSION_VM: 3751 case KVM_CAP_ENABLE_CAP_VM: 3752 case KVM_CAP_HALT_POLL: 3753 return 1; 3754#ifdef CONFIG_KVM_MMIO 3755 case KVM_CAP_COALESCED_MMIO: 3756 return KVM_COALESCED_MMIO_PAGE_OFFSET; 3757 case KVM_CAP_COALESCED_PIO: 3758 return 1; 3759#endif 3760#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3761 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3762 return KVM_DIRTY_LOG_MANUAL_CAPS; 3763#endif 3764#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3765 case KVM_CAP_IRQ_ROUTING: 3766 return KVM_MAX_IRQ_ROUTES; 3767#endif 3768#if KVM_ADDRESS_SPACE_NUM > 1 3769 case KVM_CAP_MULTI_ADDRESS_SPACE: 3770 return KVM_ADDRESS_SPACE_NUM; 3771#endif 3772 case KVM_CAP_NR_MEMSLOTS: 3773 return KVM_USER_MEM_SLOTS; 3774 default: 3775 break; 3776 } 3777 return kvm_vm_ioctl_check_extension(kvm, 
arg); 3778} 3779 3780int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, 3781 struct kvm_enable_cap *cap) 3782{ 3783 return -EINVAL; 3784} 3785 3786static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 3787 struct kvm_enable_cap *cap) 3788{ 3789 switch (cap->cap) { 3790#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3791 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { 3792 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; 3793 3794 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) 3795 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; 3796 3797 if (cap->flags || (cap->args[0] & ~allowed_options)) 3798 return -EINVAL; 3799 kvm->manual_dirty_log_protect = cap->args[0]; 3800 return 0; 3801 } 3802#endif 3803 case KVM_CAP_HALT_POLL: { 3804 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) 3805 return -EINVAL; 3806 3807 kvm->max_halt_poll_ns = cap->args[0]; 3808 return 0; 3809 } 3810 default: 3811 return kvm_vm_ioctl_enable_cap(kvm, cap); 3812 } 3813} 3814 3815static long kvm_vm_ioctl(struct file *filp, 3816 unsigned int ioctl, unsigned long arg) 3817{ 3818 struct kvm *kvm = filp->private_data; 3819 void __user *argp = (void __user *)arg; 3820 int r; 3821 3822 if (kvm->mm != current->mm || kvm->vm_bugged) 3823 return -EIO; 3824 switch (ioctl) { 3825 case KVM_CREATE_VCPU: 3826 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3827 break; 3828 case KVM_ENABLE_CAP: { 3829 struct kvm_enable_cap cap; 3830 3831 r = -EFAULT; 3832 if (copy_from_user(&cap, argp, sizeof(cap))) 3833 goto out; 3834 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 3835 break; 3836 } 3837 case KVM_SET_USER_MEMORY_REGION: { 3838 struct kvm_userspace_memory_region kvm_userspace_mem; 3839 3840 r = -EFAULT; 3841 if (copy_from_user(&kvm_userspace_mem, argp, 3842 sizeof(kvm_userspace_mem))) 3843 goto out; 3844 3845 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 3846 break; 3847 } 3848 case KVM_GET_DIRTY_LOG: { 3849 struct kvm_dirty_log log; 3850 3851 r = -EFAULT; 3852 if (copy_from_user(&log, argp, sizeof(log))) 3853 goto out; 3854 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3855 break; 3856 } 3857#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3858 case KVM_CLEAR_DIRTY_LOG: { 3859 struct kvm_clear_dirty_log log; 3860 3861 r = -EFAULT; 3862 if (copy_from_user(&log, argp, sizeof(log))) 3863 goto out; 3864 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 3865 break; 3866 } 3867#endif 3868#ifdef CONFIG_KVM_MMIO 3869 case KVM_REGISTER_COALESCED_MMIO: { 3870 struct kvm_coalesced_mmio_zone zone; 3871 3872 r = -EFAULT; 3873 if (copy_from_user(&zone, argp, sizeof(zone))) 3874 goto out; 3875 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3876 break; 3877 } 3878 case KVM_UNREGISTER_COALESCED_MMIO: { 3879 struct kvm_coalesced_mmio_zone zone; 3880 3881 r = -EFAULT; 3882 if (copy_from_user(&zone, argp, sizeof(zone))) 3883 goto out; 3884 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3885 break; 3886 } 3887#endif 3888 case KVM_IRQFD: { 3889 struct kvm_irqfd data; 3890 3891 r = -EFAULT; 3892 if (copy_from_user(&data, argp, sizeof(data))) 3893 goto out; 3894 r = kvm_irqfd(kvm, &data); 3895 break; 3896 } 3897 case KVM_IOEVENTFD: { 3898 struct kvm_ioeventfd data; 3899 3900 r = -EFAULT; 3901 if (copy_from_user(&data, argp, sizeof(data))) 3902 goto out; 3903 r = kvm_ioeventfd(kvm, &data); 3904 break; 3905 } 3906#ifdef CONFIG_HAVE_KVM_MSI 3907 case KVM_SIGNAL_MSI: { 3908 struct kvm_msi msi; 3909 3910 r = -EFAULT; 3911 if (copy_from_user(&msi, argp, sizeof(msi))) 3912 goto out; 3913 r = 
kvm_send_userspace_msi(kvm, &msi); 3914 break; 3915 } 3916#endif 3917#ifdef __KVM_HAVE_IRQ_LINE 3918 case KVM_IRQ_LINE_STATUS: 3919 case KVM_IRQ_LINE: { 3920 struct kvm_irq_level irq_event; 3921 3922 r = -EFAULT; 3923 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3924 goto out; 3925 3926 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3927 ioctl == KVM_IRQ_LINE_STATUS); 3928 if (r) 3929 goto out; 3930 3931 r = -EFAULT; 3932 if (ioctl == KVM_IRQ_LINE_STATUS) { 3933 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3934 goto out; 3935 } 3936 3937 r = 0; 3938 break; 3939 } 3940#endif 3941#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3942 case KVM_SET_GSI_ROUTING: { 3943 struct kvm_irq_routing routing; 3944 struct kvm_irq_routing __user *urouting; 3945 struct kvm_irq_routing_entry *entries = NULL; 3946 3947 r = -EFAULT; 3948 if (copy_from_user(&routing, argp, sizeof(routing))) 3949 goto out; 3950 r = -EINVAL; 3951 if (!kvm_arch_can_set_irq_routing(kvm)) 3952 goto out; 3953 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3954 goto out; 3955 if (routing.flags) 3956 goto out; 3957 if (routing.nr) { 3958 urouting = argp; 3959 entries = vmemdup_user(urouting->entries, 3960 array_size(sizeof(*entries), 3961 routing.nr)); 3962 if (IS_ERR(entries)) { 3963 r = PTR_ERR(entries); 3964 goto out; 3965 } 3966 } 3967 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3968 routing.flags); 3969 kvfree(entries); 3970 break; 3971 } 3972#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3973 case KVM_CREATE_DEVICE: { 3974 struct kvm_create_device cd; 3975 3976 r = -EFAULT; 3977 if (copy_from_user(&cd, argp, sizeof(cd))) 3978 goto out; 3979 3980 r = kvm_ioctl_create_device(kvm, &cd); 3981 if (r) 3982 goto out; 3983 3984 r = -EFAULT; 3985 if (copy_to_user(argp, &cd, sizeof(cd))) 3986 goto out; 3987 3988 r = 0; 3989 break; 3990 } 3991 case KVM_CHECK_EXTENSION: 3992 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3993 break; 3994 default: 3995 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3996 } 3997out: 3998 return r; 3999} 4000 4001#ifdef CONFIG_KVM_COMPAT 4002struct compat_kvm_dirty_log { 4003 __u32 slot; 4004 __u32 padding1; 4005 union { 4006 compat_uptr_t dirty_bitmap; /* one bit per page */ 4007 __u64 padding2; 4008 }; 4009}; 4010 4011struct compat_kvm_clear_dirty_log { 4012 __u32 slot; 4013 __u32 num_pages; 4014 __u64 first_page; 4015 union { 4016 compat_uptr_t dirty_bitmap; /* one bit per page */ 4017 __u64 padding2; 4018 }; 4019}; 4020 4021long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, 4022 unsigned long arg) 4023{ 4024 return -ENOTTY; 4025} 4026 4027static long kvm_vm_compat_ioctl(struct file *filp, 4028 unsigned int ioctl, unsigned long arg) 4029{ 4030 struct kvm *kvm = filp->private_data; 4031 int r; 4032 4033 if (kvm->mm != current->mm || kvm->vm_bugged) 4034 return -EIO; 4035 4036 r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg); 4037 if (r != -ENOTTY) 4038 return r; 4039 4040 switch (ioctl) { 4041#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 4042 case KVM_CLEAR_DIRTY_LOG: { 4043 struct compat_kvm_clear_dirty_log compat_log; 4044 struct kvm_clear_dirty_log log; 4045 4046 if (copy_from_user(&compat_log, (void __user *)arg, 4047 sizeof(compat_log))) 4048 return -EFAULT; 4049 log.slot = compat_log.slot; 4050 log.num_pages = compat_log.num_pages; 4051 log.first_page = compat_log.first_page; 4052 log.padding2 = compat_log.padding2; 4053 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 4054 4055 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 4056 break; 4057 } 4058#endif 4059 case 
KVM_GET_DIRTY_LOG: { 4060 struct compat_kvm_dirty_log compat_log; 4061 struct kvm_dirty_log log; 4062 4063 if (copy_from_user(&compat_log, (void __user *)arg, 4064 sizeof(compat_log))) 4065 return -EFAULT; 4066 log.slot = compat_log.slot; 4067 log.padding1 = compat_log.padding1; 4068 log.padding2 = compat_log.padding2; 4069 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 4070 4071 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 4072 break; 4073 } 4074 default: 4075 r = kvm_vm_ioctl(filp, ioctl, arg); 4076 } 4077 return r; 4078} 4079#endif 4080 4081static struct file_operations kvm_vm_fops = { 4082 .release = kvm_vm_release, 4083 .unlocked_ioctl = kvm_vm_ioctl, 4084 .llseek = noop_llseek, 4085 KVM_COMPAT(kvm_vm_compat_ioctl), 4086}; 4087 4088static int kvm_dev_ioctl_create_vm(unsigned long type) 4089{ 4090 int r; 4091 struct kvm *kvm; 4092 struct file *file; 4093 4094 kvm = kvm_create_vm(type); 4095 if (IS_ERR(kvm)) 4096 return PTR_ERR(kvm); 4097#ifdef CONFIG_KVM_MMIO 4098 r = kvm_coalesced_mmio_init(kvm); 4099 if (r < 0) 4100 goto put_kvm; 4101#endif 4102 r = get_unused_fd_flags(O_CLOEXEC); 4103 if (r < 0) 4104 goto put_kvm; 4105 4106 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 4107 if (IS_ERR(file)) { 4108 put_unused_fd(r); 4109 r = PTR_ERR(file); 4110 goto put_kvm; 4111 } 4112 4113 /* 4114 * Don't call kvm_put_kvm anymore at this point; file->f_op is 4115 * already set, with ->release() being kvm_vm_release(). In error 4116 * cases it will be called by the final fput(file) and will take 4117 * care of doing kvm_put_kvm(kvm). 4118 */ 4119 if (kvm_create_vm_debugfs(kvm, r) < 0) { 4120 put_unused_fd(r); 4121 fput(file); 4122 return -ENOMEM; 4123 } 4124 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 4125 4126 fd_install(r, file); 4127 return r; 4128 4129put_kvm: 4130 kvm_put_kvm(kvm); 4131 return r; 4132} 4133 4134static long kvm_dev_ioctl(struct file *filp, 4135 unsigned int ioctl, unsigned long arg) 4136{ 4137 long r = -EINVAL; 4138 4139 switch (ioctl) { 4140 case KVM_GET_API_VERSION: 4141 if (arg) 4142 goto out; 4143 r = KVM_API_VERSION; 4144 break; 4145 case KVM_CREATE_VM: 4146 r = kvm_dev_ioctl_create_vm(arg); 4147 break; 4148 case KVM_CHECK_EXTENSION: 4149 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 4150 break; 4151 case KVM_GET_VCPU_MMAP_SIZE: 4152 if (arg) 4153 goto out; 4154 r = PAGE_SIZE; /* struct kvm_run */ 4155#ifdef CONFIG_X86 4156 r += PAGE_SIZE; /* pio data page */ 4157#endif 4158#ifdef CONFIG_KVM_MMIO 4159 r += PAGE_SIZE; /* coalesced mmio ring page */ 4160#endif 4161 break; 4162 case KVM_TRACE_ENABLE: 4163 case KVM_TRACE_PAUSE: 4164 case KVM_TRACE_DISABLE: 4165 r = -EOPNOTSUPP; 4166 break; 4167 default: 4168 return kvm_arch_dev_ioctl(filp, ioctl, arg); 4169 } 4170out: 4171 return r; 4172} 4173 4174static struct file_operations kvm_chardev_ops = { 4175 .unlocked_ioctl = kvm_dev_ioctl, 4176 .llseek = noop_llseek, 4177 KVM_COMPAT(kvm_dev_ioctl), 4178}; 4179 4180static struct miscdevice kvm_dev = { 4181 KVM_MINOR, 4182 "kvm", 4183 &kvm_chardev_ops, 4184}; 4185 4186static void hardware_enable_nolock(void *junk) 4187{ 4188 int cpu = raw_smp_processor_id(); 4189 int r; 4190 4191 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 4192 return; 4193 4194 cpumask_set_cpu(cpu, cpus_hardware_enabled); 4195 4196 r = kvm_arch_hardware_enable(); 4197 4198 if (r) { 4199 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 4200 atomic_inc(&hardware_enable_failed); 4201 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 4202 } 4203} 4204 4205static 
int kvm_starting_cpu(unsigned int cpu) 4206{ 4207 raw_spin_lock(&kvm_count_lock); 4208 if (kvm_usage_count) 4209 hardware_enable_nolock(NULL); 4210 raw_spin_unlock(&kvm_count_lock); 4211 return 0; 4212} 4213 4214static void hardware_disable_nolock(void *junk) 4215{ 4216 int cpu = raw_smp_processor_id(); 4217 4218 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 4219 return; 4220 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 4221 kvm_arch_hardware_disable(); 4222} 4223 4224static int kvm_dying_cpu(unsigned int cpu) 4225{ 4226 raw_spin_lock(&kvm_count_lock); 4227 if (kvm_usage_count) 4228 hardware_disable_nolock(NULL); 4229 raw_spin_unlock(&kvm_count_lock); 4230 return 0; 4231} 4232 4233static void hardware_disable_all_nolock(void) 4234{ 4235 BUG_ON(!kvm_usage_count); 4236 4237 kvm_usage_count--; 4238 if (!kvm_usage_count) 4239 on_each_cpu(hardware_disable_nolock, NULL, 1); 4240} 4241 4242static void hardware_disable_all(void) 4243{ 4244 raw_spin_lock(&kvm_count_lock); 4245 hardware_disable_all_nolock(); 4246 raw_spin_unlock(&kvm_count_lock); 4247} 4248 4249static int hardware_enable_all(void) 4250{ 4251 int r = 0; 4252 4253 raw_spin_lock(&kvm_count_lock); 4254 4255 kvm_usage_count++; 4256 if (kvm_usage_count == 1) { 4257 atomic_set(&hardware_enable_failed, 0); 4258 on_each_cpu(hardware_enable_nolock, NULL, 1); 4259 4260 if (atomic_read(&hardware_enable_failed)) { 4261 hardware_disable_all_nolock(); 4262 r = -EBUSY; 4263 } 4264 } 4265 4266 raw_spin_unlock(&kvm_count_lock); 4267 4268 return r; 4269} 4270 4271static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 4272 void *v) 4273{ 4274 /* 4275 * Some (well, at least mine) BIOSes hang on reboot if the CPU is in 4276 * VMX root mode. 4277 * 4278 * Intel TXT also requires VMX to be off on all CPUs when the system shuts down. 4279 */ 4280 pr_info("kvm: exiting hardware virtualization\n"); 4281 kvm_rebooting = true; 4282 on_each_cpu(hardware_disable_nolock, NULL, 1); 4283 return NOTIFY_OK; 4284} 4285 4286static struct notifier_block kvm_reboot_notifier = { 4287 .notifier_call = kvm_reboot, 4288 .priority = 0, 4289}; 4290 4291static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 4292{ 4293 int i; 4294 4295 for (i = 0; i < bus->dev_count; i++) { 4296 struct kvm_io_device *pos = bus->range[i].dev; 4297 4298 kvm_iodevice_destructor(pos); 4299 } 4300 kfree(bus); 4301} 4302 4303static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 4304 const struct kvm_io_range *r2) 4305{ 4306 gpa_t addr1 = r1->addr; 4307 gpa_t addr2 = r2->addr; 4308 4309 if (addr1 < addr2) 4310 return -1; 4311 4312 /* If r2->len == 0, match the exact address. If r2->len != 0, 4313 * accept any overlapping access. Any order is acceptable for 4314 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 4315 * we process all of them.
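 *
 * For example (illustrative): against a range registered as
 * {addr = 0x100, len = 4}, an access {addr = 0x102, len = 2} compares
 * equal because it is fully contained, while {addr = 0x103, len = 2}
 * runs past 0x104 and compares greater. A range registered with
 * len == 0 matches only an access whose address equals its start.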
4316 */ 4317 if (r2->len) { 4318 addr1 += r1->len; 4319 addr2 += r2->len; 4320 } 4321 4322 if (addr1 > addr2) 4323 return 1; 4324 4325 return 0; 4326} 4327 4328static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 4329{ 4330 return kvm_io_bus_cmp(p1, p2); 4331} 4332 4333static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 4334 gpa_t addr, int len) 4335{ 4336 struct kvm_io_range *range, key; 4337 int off; 4338 4339 key = (struct kvm_io_range) { 4340 .addr = addr, 4341 .len = len, 4342 }; 4343 4344 range = bsearch(&key, bus->range, bus->dev_count, 4345 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 4346 if (range == NULL) 4347 return -ENOENT; 4348 4349 off = range - bus->range; 4350 4351 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 4352 off--; 4353 4354 return off; 4355} 4356 4357static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 4358 struct kvm_io_range *range, const void *val) 4359{ 4360 int idx; 4361 4362 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 4363 if (idx < 0) 4364 return -EOPNOTSUPP; 4365 4366 while (idx < bus->dev_count && 4367 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 4368 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 4369 range->len, val)) 4370 return idx; 4371 idx++; 4372 } 4373 4374 return -EOPNOTSUPP; 4375} 4376 4377/* kvm_io_bus_write - called under kvm->slots_lock */ 4378int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 4379 int len, const void *val) 4380{ 4381 struct kvm_io_bus *bus; 4382 struct kvm_io_range range; 4383 int r; 4384 4385 range = (struct kvm_io_range) { 4386 .addr = addr, 4387 .len = len, 4388 }; 4389 4390 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4391 if (!bus) 4392 return -ENOMEM; 4393 r = __kvm_io_bus_write(vcpu, bus, &range, val); 4394 return r < 0 ? r : 0; 4395} 4396EXPORT_SYMBOL_GPL(kvm_io_bus_write); 4397 4398/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 4399int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 4400 gpa_t addr, int len, const void *val, long cookie) 4401{ 4402 struct kvm_io_bus *bus; 4403 struct kvm_io_range range; 4404 4405 range = (struct kvm_io_range) { 4406 .addr = addr, 4407 .len = len, 4408 }; 4409 4410 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4411 if (!bus) 4412 return -ENOMEM; 4413 4414 /* First try the device referenced by cookie. */ 4415 if ((cookie >= 0) && (cookie < bus->dev_count) && 4416 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 4417 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 4418 val)) 4419 return cookie; 4420 4421 /* 4422 * cookie contained garbage; fall back to search and return the 4423 * correct cookie value. 
4424 */ 4425 return __kvm_io_bus_write(vcpu, bus, &range, val); 4426} 4427 4428static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 4429 struct kvm_io_range *range, void *val) 4430{ 4431 int idx; 4432 4433 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 4434 if (idx < 0) 4435 return -EOPNOTSUPP; 4436 4437 while (idx < bus->dev_count && 4438 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 4439 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 4440 range->len, val)) 4441 return idx; 4442 idx++; 4443 } 4444 4445 return -EOPNOTSUPP; 4446} 4447 4448/* kvm_io_bus_read - called under kvm->slots_lock */ 4449int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 4450 int len, void *val) 4451{ 4452 struct kvm_io_bus *bus; 4453 struct kvm_io_range range; 4454 int r; 4455 4456 range = (struct kvm_io_range) { 4457 .addr = addr, 4458 .len = len, 4459 }; 4460 4461 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4462 if (!bus) 4463 return -ENOMEM; 4464 r = __kvm_io_bus_read(vcpu, bus, &range, val); 4465 return r < 0 ? r : 0; 4466} 4467 4468/* Caller must hold slots_lock. */ 4469int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 4470 int len, struct kvm_io_device *dev) 4471{ 4472 int i; 4473 struct kvm_io_bus *new_bus, *bus; 4474 struct kvm_io_range range; 4475 4476 bus = kvm_get_bus(kvm, bus_idx); 4477 if (!bus) 4478 return -ENOMEM; 4479 4480 /* exclude ioeventfd which is limited by maximum fd */ 4481 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 4482 return -ENOSPC; 4483 4484 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), 4485 GFP_KERNEL_ACCOUNT); 4486 if (!new_bus) 4487 return -ENOMEM; 4488 4489 range = (struct kvm_io_range) { 4490 .addr = addr, 4491 .len = len, 4492 .dev = dev, 4493 }; 4494 4495 for (i = 0; i < bus->dev_count; i++) 4496 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 4497 break; 4498 4499 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 4500 new_bus->dev_count++; 4501 new_bus->range[i] = range; 4502 memcpy(new_bus->range + i + 1, bus->range + i, 4503 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 4504 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 4505 synchronize_srcu_expedited(&kvm->srcu); 4506 kfree(bus); 4507 4508 return 0; 4509} 4510 4511/* Caller must hold slots_lock. */ 4512int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 4513 struct kvm_io_device *dev) 4514{ 4515 int i, j; 4516 struct kvm_io_bus *new_bus, *bus; 4517 4518 bus = kvm_get_bus(kvm, bus_idx); 4519 if (!bus) 4520 return 0; 4521 4522 for (i = 0; i < bus->dev_count; i++) 4523 if (bus->range[i].dev == dev) { 4524 break; 4525 } 4526 4527 if (i == bus->dev_count) 4528 return 0; 4529 4530 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), 4531 GFP_KERNEL_ACCOUNT); 4532 if (new_bus) { 4533 memcpy(new_bus, bus, struct_size(bus, range, i)); 4534 new_bus->dev_count--; 4535 memcpy(new_bus->range + i, bus->range + i + 1, 4536 flex_array_size(new_bus, range, new_bus->dev_count - i)); 4537 } 4538 4539 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 4540 synchronize_srcu_expedited(&kvm->srcu); 4541 4542 /* Destroy the old bus _after_ installing the (null) bus. 
*/ 4543 if (!new_bus) { 4544 pr_err("kvm: failed to shrink bus, removing it completely\n"); 4545 for (j = 0; j < bus->dev_count; j++) { 4546 if (j == i) 4547 continue; 4548 kvm_iodevice_destructor(bus->range[j].dev); 4549 } 4550 } 4551 4552 kfree(bus); 4553 return new_bus ? 0 : -ENOMEM; 4554} 4555 4556struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 4557 gpa_t addr) 4558{ 4559 struct kvm_io_bus *bus; 4560 int dev_idx, srcu_idx; 4561 struct kvm_io_device *iodev = NULL; 4562 4563 srcu_idx = srcu_read_lock(&kvm->srcu); 4564 4565 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 4566 if (!bus) 4567 goto out_unlock; 4568 4569 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 4570 if (dev_idx < 0) 4571 goto out_unlock; 4572 4573 iodev = bus->range[dev_idx].dev; 4574 4575out_unlock: 4576 srcu_read_unlock(&kvm->srcu, srcu_idx); 4577 4578 return iodev; 4579} 4580EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 4581 4582static int kvm_debugfs_open(struct inode *inode, struct file *file, 4583 int (*get)(void *, u64 *), int (*set)(void *, u64), 4584 const char *fmt) 4585{ 4586 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 4587 inode->i_private; 4588 4589 /* The debugfs files are a reference to the kvm struct which 4590 * is still valid when kvm_destroy_vm is called. 4591 * To avoid the race between open and the removal of the debugfs 4592 * directory we test against the users count. 4593 */ 4594 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 4595 return -ENOENT; 4596 4597 if (simple_attr_open(inode, file, get, 4598 KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222 4599 ? set : NULL, 4600 fmt)) { 4601 kvm_put_kvm(stat_data->kvm); 4602 return -ENOMEM; 4603 } 4604 4605 return 0; 4606} 4607 4608static int kvm_debugfs_release(struct inode *inode, struct file *file) 4609{ 4610 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 4611 inode->i_private; 4612 4613 simple_attr_release(inode, file); 4614 kvm_put_kvm(stat_data->kvm); 4615 4616 return 0; 4617} 4618 4619static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) 4620{ 4621 *val = *(ulong *)((void *)kvm + offset); 4622 4623 return 0; 4624} 4625 4626static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) 4627{ 4628 *(ulong *)((void *)kvm + offset) = 0; 4629 4630 return 0; 4631} 4632 4633static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) 4634{ 4635 int i; 4636 struct kvm_vcpu *vcpu; 4637 4638 *val = 0; 4639 4640 kvm_for_each_vcpu(i, vcpu, kvm) 4641 *val += *(u64 *)((void *)vcpu + offset); 4642 4643 return 0; 4644} 4645 4646static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) 4647{ 4648 int i; 4649 struct kvm_vcpu *vcpu; 4650 4651 kvm_for_each_vcpu(i, vcpu, kvm) 4652 *(u64 *)((void *)vcpu + offset) = 0; 4653 4654 return 0; 4655} 4656 4657static int kvm_stat_data_get(void *data, u64 *val) 4658{ 4659 int r = -EFAULT; 4660 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4661 4662 switch (stat_data->dbgfs_item->kind) { 4663 case KVM_STAT_VM: 4664 r = kvm_get_stat_per_vm(stat_data->kvm, 4665 stat_data->dbgfs_item->offset, val); 4666 break; 4667 case KVM_STAT_VCPU: 4668 r = kvm_get_stat_per_vcpu(stat_data->kvm, 4669 stat_data->dbgfs_item->offset, val); 4670 break; 4671 } 4672 4673 return r; 4674} 4675 4676static int kvm_stat_data_clear(void *data, u64 val) 4677{ 4678 int r = -EFAULT; 4679 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4680 4681 if (val) 4682 return -EINVAL; 4683 4684 switch 
(stat_data->dbgfs_item->kind) { 4685 case KVM_STAT_VM: 4686 r = kvm_clear_stat_per_vm(stat_data->kvm, 4687 stat_data->dbgfs_item->offset); 4688 break; 4689 case KVM_STAT_VCPU: 4690 r = kvm_clear_stat_per_vcpu(stat_data->kvm, 4691 stat_data->dbgfs_item->offset); 4692 break; 4693 } 4694 4695 return r; 4696} 4697 4698static int kvm_stat_data_open(struct inode *inode, struct file *file) 4699{ 4700 __simple_attr_check_format("%llu\n", 0ull); 4701 return kvm_debugfs_open(inode, file, kvm_stat_data_get, 4702 kvm_stat_data_clear, "%llu\n"); 4703} 4704 4705static const struct file_operations stat_fops_per_vm = { 4706 .owner = THIS_MODULE, 4707 .open = kvm_stat_data_open, 4708 .release = kvm_debugfs_release, 4709 .read = simple_attr_read, 4710 .write = simple_attr_write, 4711 .llseek = no_llseek, 4712}; 4713 4714static int vm_stat_get(void *_offset, u64 *val) 4715{ 4716 unsigned offset = (long)_offset; 4717 struct kvm *kvm; 4718 u64 tmp_val; 4719 4720 *val = 0; 4721 mutex_lock(&kvm_lock); 4722 list_for_each_entry(kvm, &vm_list, vm_list) { 4723 kvm_get_stat_per_vm(kvm, offset, &tmp_val); 4724 *val += tmp_val; 4725 } 4726 mutex_unlock(&kvm_lock); 4727 return 0; 4728} 4729 4730static int vm_stat_clear(void *_offset, u64 val) 4731{ 4732 unsigned offset = (long)_offset; 4733 struct kvm *kvm; 4734 4735 if (val) 4736 return -EINVAL; 4737 4738 mutex_lock(&kvm_lock); 4739 list_for_each_entry(kvm, &vm_list, vm_list) { 4740 kvm_clear_stat_per_vm(kvm, offset); 4741 } 4742 mutex_unlock(&kvm_lock); 4743 4744 return 0; 4745} 4746 4747DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 4748 4749static int vcpu_stat_get(void *_offset, u64 *val) 4750{ 4751 unsigned offset = (long)_offset; 4752 struct kvm *kvm; 4753 u64 tmp_val; 4754 4755 *val = 0; 4756 mutex_lock(&kvm_lock); 4757 list_for_each_entry(kvm, &vm_list, vm_list) { 4758 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val); 4759 *val += tmp_val; 4760 } 4761 mutex_unlock(&kvm_lock); 4762 return 0; 4763} 4764 4765static int vcpu_stat_clear(void *_offset, u64 val) 4766{ 4767 unsigned offset = (long)_offset; 4768 struct kvm *kvm; 4769 4770 if (val) 4771 return -EINVAL; 4772 4773 mutex_lock(&kvm_lock); 4774 list_for_each_entry(kvm, &vm_list, vm_list) { 4775 kvm_clear_stat_per_vcpu(kvm, offset); 4776 } 4777 mutex_unlock(&kvm_lock); 4778 4779 return 0; 4780} 4781 4782DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, 4783 "%llu\n"); 4784 4785static const struct file_operations *stat_fops[] = { 4786 [KVM_STAT_VCPU] = &vcpu_stat_fops, 4787 [KVM_STAT_VM] = &vm_stat_fops, 4788}; 4789 4790static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 4791{ 4792 struct kobj_uevent_env *env; 4793 unsigned long long created, active; 4794 4795 if (!kvm_dev.this_device || !kvm) 4796 return; 4797 4798 mutex_lock(&kvm_lock); 4799 if (type == KVM_EVENT_CREATE_VM) { 4800 kvm_createvm_count++; 4801 kvm_active_vms++; 4802 } else if (type == KVM_EVENT_DESTROY_VM) { 4803 kvm_active_vms--; 4804 } 4805 created = kvm_createvm_count; 4806 active = kvm_active_vms; 4807 mutex_unlock(&kvm_lock); 4808 4809 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); 4810 if (!env) 4811 return; 4812 4813 add_uevent_var(env, "CREATED=%llu", created); 4814 add_uevent_var(env, "COUNT=%llu", active); 4815 4816 if (type == KVM_EVENT_CREATE_VM) { 4817 add_uevent_var(env, "EVENT=create"); 4818 kvm->userspace_pid = task_pid_nr(current); 4819 } else if (type == KVM_EVENT_DESTROY_VM) { 4820 add_uevent_var(env, "EVENT=destroy"); 4821 } 4822 add_uevent_var(env, 
"PID=%d", kvm->userspace_pid); 4823 4824 if (kvm->debugfs_dentry) { 4825 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); 4826 4827 if (p) { 4828 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 4829 if (!IS_ERR(tmp)) 4830 add_uevent_var(env, "STATS_PATH=%s", tmp); 4831 kfree(p); 4832 } 4833 } 4834 /* no need for checks, since we are adding at most only 5 keys */ 4835 env->envp[env->envp_idx++] = NULL; 4836 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 4837 kfree(env); 4838} 4839 4840static void kvm_init_debug(void) 4841{ 4842 struct kvm_stats_debugfs_item *p; 4843 4844 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 4845 4846 kvm_debugfs_num_entries = 0; 4847 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 4848 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), 4849 kvm_debugfs_dir, (void *)(long)p->offset, 4850 stat_fops[p->kind]); 4851 } 4852} 4853 4854static int kvm_suspend(void) 4855{ 4856 if (kvm_usage_count) 4857 hardware_disable_nolock(NULL); 4858 return 0; 4859} 4860 4861static void kvm_resume(void) 4862{ 4863 if (kvm_usage_count) { 4864#ifdef CONFIG_LOCKDEP 4865 WARN_ON(lockdep_is_held(&kvm_count_lock)); 4866#endif 4867 hardware_enable_nolock(NULL); 4868 } 4869} 4870 4871static struct syscore_ops kvm_syscore_ops = { 4872 .suspend = kvm_suspend, 4873 .resume = kvm_resume, 4874}; 4875 4876static inline 4877struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 4878{ 4879 return container_of(pn, struct kvm_vcpu, preempt_notifier); 4880} 4881 4882static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 4883{ 4884 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 4885 4886 WRITE_ONCE(vcpu->preempted, false); 4887 WRITE_ONCE(vcpu->ready, false); 4888 4889 __this_cpu_write(kvm_running_vcpu, vcpu); 4890 kvm_arch_sched_in(vcpu, cpu); 4891 kvm_arch_vcpu_load(vcpu, cpu); 4892} 4893 4894static void kvm_sched_out(struct preempt_notifier *pn, 4895 struct task_struct *next) 4896{ 4897 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 4898 4899 if (current->state == TASK_RUNNING) { 4900 WRITE_ONCE(vcpu->preempted, true); 4901 WRITE_ONCE(vcpu->ready, true); 4902 } 4903 kvm_arch_vcpu_put(vcpu); 4904 __this_cpu_write(kvm_running_vcpu, NULL); 4905} 4906 4907/** 4908 * kvm_get_running_vcpu - get the vcpu running on the current CPU. 4909 * 4910 * We can disable preemption locally around accessing the per-CPU variable, 4911 * and use the resolved vcpu pointer after enabling preemption again, 4912 * because even if the current thread is migrated to another CPU, reading 4913 * the per-CPU value later will give us the same value as we update the 4914 * per-CPU variable in the preempt notifier handlers. 4915 */ 4916struct kvm_vcpu *kvm_get_running_vcpu(void) 4917{ 4918 struct kvm_vcpu *vcpu; 4919 4920 preempt_disable(); 4921 vcpu = __this_cpu_read(kvm_running_vcpu); 4922 preempt_enable(); 4923 4924 return vcpu; 4925} 4926EXPORT_SYMBOL_GPL(kvm_get_running_vcpu); 4927 4928/** 4929 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. 
4930 */ 4931struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) 4932{ 4933 return &kvm_running_vcpu; 4934} 4935 4936struct kvm_cpu_compat_check { 4937 void *opaque; 4938 int *ret; 4939}; 4940 4941static void check_processor_compat(void *data) 4942{ 4943 struct kvm_cpu_compat_check *c = data; 4944 4945 *c->ret = kvm_arch_check_processor_compat(c->opaque); 4946} 4947 4948int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 4949 struct module *module) 4950{ 4951 struct kvm_cpu_compat_check c; 4952 int r; 4953 int cpu; 4954 4955 r = kvm_arch_init(opaque); 4956 if (r) 4957 goto out_fail; 4958 4959 /* 4960 * kvm_arch_init makes sure there's at most one caller 4961 * for architectures that support multiple implementations, 4962 * like intel and amd on x86. 4963 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 4964 * conflicts in case kvm is already setup for another implementation. 4965 */ 4966 r = kvm_irqfd_init(); 4967 if (r) 4968 goto out_irqfd; 4969 4970 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 4971 r = -ENOMEM; 4972 goto out_free_0; 4973 } 4974 4975 r = kvm_arch_hardware_setup(opaque); 4976 if (r < 0) 4977 goto out_free_1; 4978 4979 c.ret = &r; 4980 c.opaque = opaque; 4981 for_each_online_cpu(cpu) { 4982 smp_call_function_single(cpu, check_processor_compat, &c, 1); 4983 if (r < 0) 4984 goto out_free_2; 4985 } 4986 4987 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", 4988 kvm_starting_cpu, kvm_dying_cpu); 4989 if (r) 4990 goto out_free_2; 4991 register_reboot_notifier(&kvm_reboot_notifier); 4992 4993 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 4994 if (!vcpu_align) 4995 vcpu_align = __alignof__(struct kvm_vcpu); 4996 kvm_vcpu_cache = 4997 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, 4998 SLAB_ACCOUNT, 4999 offsetof(struct kvm_vcpu, arch), 5000 sizeof_field(struct kvm_vcpu, arch), 5001 NULL); 5002 if (!kvm_vcpu_cache) { 5003 r = -ENOMEM; 5004 goto out_free_3; 5005 } 5006 5007 for_each_possible_cpu(cpu) { 5008 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), 5009 GFP_KERNEL, cpu_to_node(cpu))) { 5010 r = -ENOMEM; 5011 goto out_free_4; 5012 } 5013 } 5014 5015 r = kvm_async_pf_init(); 5016 if (r) 5017 goto out_free_4; 5018 5019 kvm_chardev_ops.owner = module; 5020 kvm_vm_fops.owner = module; 5021 kvm_vcpu_fops.owner = module; 5022 5023 register_syscore_ops(&kvm_syscore_ops); 5024 5025 kvm_preempt_ops.sched_in = kvm_sched_in; 5026 kvm_preempt_ops.sched_out = kvm_sched_out; 5027 5028 kvm_init_debug(); 5029 5030 r = kvm_vfio_ops_init(); 5031 if (WARN_ON_ONCE(r)) 5032 goto err_vfio; 5033 5034 /* 5035 * Registration _must_ be the very last thing done, as this exposes 5036 * /dev/kvm to userspace, i.e. all infrastructure must be setup! 
 5037 */ 5038 r = misc_register(&kvm_dev); 5039 if (r) { 5040 pr_err("kvm: misc device register failed\n"); 5041 goto err_register; 5042 } 5043 5044 return 0; 5045 5046err_register: 5047 kvm_vfio_ops_exit(); 5048err_vfio: 5049 kvm_async_pf_deinit(); 5050out_free_4: 5051 for_each_possible_cpu(cpu) 5052 free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); 5053 kmem_cache_destroy(kvm_vcpu_cache); 5054out_free_3: 5055 unregister_reboot_notifier(&kvm_reboot_notifier); 5056 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 5057out_free_2: 5058 kvm_arch_hardware_unsetup(); 5059out_free_1: 5060 free_cpumask_var(cpus_hardware_enabled); 5061out_free_0: 5062 kvm_irqfd_exit(); 5063out_irqfd: 5064 kvm_arch_exit(); 5065out_fail: 5066 return r; 5067} 5068EXPORT_SYMBOL_GPL(kvm_init); 5069 5070void kvm_exit(void) 5071{ 5072 int cpu; 5073 5074 /* 5075 * Note, unregistering /dev/kvm doesn't strictly need to come first, as 5076 * fops_get(), a.k.a. try_module_get(), prevents acquiring new references 5077 * to KVM while the module is being stopped. 5078 */ 5079 misc_deregister(&kvm_dev); 5080 5081 debugfs_remove_recursive(kvm_debugfs_dir); 5082 for_each_possible_cpu(cpu) 5083 free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); 5084 kmem_cache_destroy(kvm_vcpu_cache); 5085 kvm_async_pf_deinit(); 5086 unregister_syscore_ops(&kvm_syscore_ops); 5087 unregister_reboot_notifier(&kvm_reboot_notifier); 5088 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 5089 on_each_cpu(hardware_disable_nolock, NULL, 1); 5090 kvm_arch_hardware_unsetup(); 5091 kvm_arch_exit(); 5092 kvm_irqfd_exit(); 5093 free_cpumask_var(cpus_hardware_enabled); 5094 kvm_vfio_ops_exit(); 5095} 5096EXPORT_SYMBOL_GPL(kvm_exit); 5097 5098struct kvm_vm_worker_thread_context { 5099 struct kvm *kvm; 5100 struct task_struct *parent; 5101 struct completion init_done; 5102 kvm_vm_thread_fn_t thread_fn; 5103 uintptr_t data; 5104 int err; 5105}; 5106 5107static int kvm_vm_worker_thread(void *context) 5108{ 5109 /* 5110 * The init_context is allocated on the stack of the parent thread, so 5111 * we have to locally copy anything that is needed beyond initialization. 5112 */ 5113 struct kvm_vm_worker_thread_context *init_context = context; 5114 struct kvm *kvm = init_context->kvm; 5115 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; 5116 uintptr_t data = init_context->data; 5117 int err; 5118 5119 err = kthread_park(current); 5120 /* kthread_park(current) is never supposed to return an error */ 5121 WARN_ON(err != 0); 5122 if (err) 5123 goto init_complete; 5124 5125 err = cgroup_attach_task_all(init_context->parent, current); 5126 if (err) { 5127 kvm_err("%s: cgroup_attach_task_all failed with err %d\n", 5128 __func__, err); 5129 goto init_complete; 5130 } 5131 5132 set_user_nice(current, task_nice(init_context->parent)); 5133 5134init_complete: 5135 init_context->err = err; 5136 complete(&init_context->init_done); 5137 init_context = NULL; 5138 5139 if (err) 5140 return err; 5141 5142 /* Wait to be woken up by the spawner before proceeding.
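 * kthread_park(current) above only marks the thread as to-be-parked; the
 * actual parking happens here in kthread_parkme(), and the thread stays
 * parked until its creator unparks it with kthread_unpark() (or stops it),
 * at which point thread_fn is allowed to run.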
*/ 5143 kthread_parkme(); 5144 5145 if (!kthread_should_stop()) 5146 err = thread_fn(kvm, data); 5147 5148 return err; 5149} 5150 5151int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, 5152 uintptr_t data, const char *name, 5153 struct task_struct **thread_ptr) 5154{ 5155 struct kvm_vm_worker_thread_context init_context = {}; 5156 struct task_struct *thread; 5157 5158 *thread_ptr = NULL; 5159 init_context.kvm = kvm; 5160 init_context.parent = current; 5161 init_context.thread_fn = thread_fn; 5162 init_context.data = data; 5163 init_completion(&init_context.init_done); 5164 5165 thread = kthread_run(kvm_vm_worker_thread, &init_context, 5166 "%s-%d", name, task_pid_nr(current)); 5167 if (IS_ERR(thread)) 5168 return PTR_ERR(thread); 5169 5170 /* kthread_run is never supposed to return NULL */ 5171 WARN_ON(thread == NULL); 5172 5173 wait_for_completion(&init_context.init_done); 5174 5175 if (!init_context.err) 5176 *thread_ptr = thread; 5177 5178 return init_context.err; 5179} 5180
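
/*
 * Usage sketch for kvm_vm_create_worker_thread() (illustrative only;
 * recovery_worker_fn and kvm->arch.recovery_thread are hypothetical names,
 * not part of KVM). An architecture that wants a per-VM housekeeping
 * thread could do something like:
 *
 *	static int recovery_worker_fn(struct kvm *kvm, uintptr_t data)
 *	{
 *		while (!kthread_should_stop())
 *			schedule_timeout_interruptible(HZ);	// periodic per-VM work here
 *		return 0;
 *	}
 *
 *	err = kvm_vm_create_worker_thread(kvm, recovery_worker_fn, 0,
 *					  "example-worker",
 *					  &kvm->arch.recovery_thread);
 *	if (!err)
 *		kthread_unpark(kvm->arch.recovery_thread);
 *
 * The worker parks itself before invoking thread_fn, so the creator is
 * expected to unpark it once any remaining setup is done and to stop it
 * with kthread_stop() during VM teardown.
 */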