// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
 */

#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/page-flags.h>
#include <linux/kvm_host.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include "kvm_compat.h"

/*
 * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels
 * for which pages need to be cached.
 */
#if defined(__PAGETABLE_PMD_FOLDED)
#define KVM_MMU_CACHE_MIN_PAGES 1
#else
#define KVM_MMU_CACHE_MIN_PAGES 2
#endif

static inline int kvm_pte_huge(pte_t pte)
{
	return pte_val(pte) & _PAGE_HUGE;
}

static inline pte_t kvm_pte_mksmall(pte_t pte)
{
	pte_val(pte) &= ~_PAGE_HUGE;
	return pte;
}

static inline void kvm_set_pte(pte_t *ptep, pte_t val)
{
	WRITE_ONCE(*ptep, val);
}

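/*
 * kvm_tlb_flush_gpa() - Invalidate guest TLB entries for a guest physical
 * address.
 *
 * The GPA is aligned down to the even/odd page pair covered by a single TLB
 * entry, then invalidated for the current guest ID read from the GSTAT CSR.
 */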
static int kvm_tlb_flush_gpa(struct kvm_vcpu *vcpu, unsigned long gpa)
{
	preempt_disable();
	gpa &= (PAGE_MASK << 1);
	invtlb(INVTLB_GID_ADDR, kvm_read_csr_gstat() & KVM_GSTAT_GID, gpa);
	preempt_enable();
	return 0;
}

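/*
 * PMD attribute helpers. With CONFIG_TRANSPARENT_HUGEPAGE the generic pmd_*()
 * helpers are available and used directly; with only CONFIG_HUGETLB_PAGE the
 * PMD value is adjusted via the equivalent pte_*() helpers instead.
 */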
static inline pmd_t kvm_pmd_mkhuge(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkhuge(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkhuge(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

static inline pmd_t kvm_pmd_mkclean(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkclean(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkclean(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

static inline pmd_t kvm_pmd_mkold(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkold(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkold(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

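/**
 * kvm_mmu_free_memory_caches() - Free the vCPU MMU memory cache.
 * @vcpu: VCPU pointer.
 *
 * Release any pages still held in the per-vCPU cache used to allocate GPA
 * page tables.
 */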
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

/**
 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
 *
 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
 * to host physical page mappings.
 *
 * Returns:	Pointer to new KVM GPA page directory.
 *		NULL on allocation failure.
 */
pgd_t *kvm_pgd_alloc(void)
{
	pgd_t *ret;
	struct page *page;

	page = alloc_pages(GFP_KERNEL, 0);
	if (!page)
		return NULL;
	ret = (pgd_t *) page_address(page);
	if (ret)
		pgd_init((unsigned long)ret);

	return ret;
}

/**
 * kvm_walk_pgd() - Walk page table with optional allocation.
 * @pgd:	Page directory pointer.
 * @addr:	Address to index page table using.
 * @cache:	MMU page cache to allocate new page tables from, or NULL.
 *
 * Walk the page tables pointed to by @pgd to find the PTE corresponding to the
 * address @addr. If page tables don't exist for @addr, they will be created
 * from the MMU cache if @cache is not NULL.
 *
 * Returns:	Pointer to pte_t corresponding to @addr.
 *		NULL if a page table doesn't exist for @addr and !@cache.
 *		NULL if a page table allocation failed.
 */
static pte_t *kvm_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
			   unsigned long addr)
{
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd += pgd_index(addr);
	if (pgd_none(*pgd)) {
		/* Not used yet */
		BUG();
		return NULL;
	}
	p4d = p4d_offset(pgd, addr);
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		pmd_t *new_pmd;

		if (!cache)
			return NULL;
		new_pmd = kvm_mmu_memory_cache_alloc(cache);
		pmd_init((unsigned long)new_pmd,
			 (unsigned long)invalid_pte_table);
		pud_populate(NULL, pud, new_pmd);
	}
	pmd = pmd_offset(pud, addr);
#ifdef CONFIG_HUGETLB_PAGE
	if (pmd_huge(*pmd))
		return (pte_t *)pmd;
#endif
	if (pmd_none(*pmd)) {
		pte_t *new_pte;

		if (!cache)
			return NULL;
		new_pte = kvm_mmu_memory_cache_alloc(cache);
		clear_page(new_pte);
		pmd_populate_kernel(NULL, pmd, new_pte);
	}
	return pte_offset_kernel(pmd, addr);
}

/* Caller must hold @kvm->mmu_lock */
static pte_t *kvm_pte_for_gpa(struct kvm *kvm,
			      struct kvm_mmu_memory_cache *cache,
			      unsigned long addr)
{
	return kvm_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr);
}

/*
 * kvm_flush_gpa_{pte,pmd,pud,pgd,pt}.
 * Flush a range of guest physical address space from the VM's GPA page tables.
 */

static bool kvm_flush_gpa_pte(pte_t *pte, unsigned long start_gpa,
			      unsigned long end_gpa, unsigned long *data)
{
	int i_min = pte_index(start_gpa);
	int i_max = pte_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
	int i;

	for (i = i_min; i <= i_max; ++i) {
		if (!pte_present(pte[i]))
			continue;

		set_pte(pte + i, __pte(0));
		if (data)
			*data = *data + 1;
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa,
			      unsigned long end_gpa, unsigned long *data)
{
	pte_t *pte;
	unsigned long end = ~0ul;
	int i_min = pmd_index(start_gpa);
	int i_max = pmd_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i]) && pmd_present(pmd[i])) {
			pmd_clear(pmd + i);
			if (data)
				*data += PTRS_PER_PMD;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pte(pte, start_gpa, end, data)) {
			pmd_clear(pmd + i);
			pte_free_kernel(NULL, pte);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pud(pud_t *pud, unsigned long start_gpa,
			      unsigned long end_gpa, unsigned long *data)
{
	pmd_t *pmd;
	unsigned long end = ~0ul;
	int i_min = pud_index(start_gpa);
	int i_max = pud_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pmd(pmd, start_gpa, end, data)) {
			pud_clear(pud + i);
			pmd_free(NULL, pmd);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa,
			      unsigned long end_gpa, unsigned long *data)
{
	p4d_t *p4d;
	pud_t *pud;
	unsigned long end = ~0ul;
	int i_min = pgd_index(start_gpa);
	int i_max = pgd_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pud(pud, start_gpa, end, data)) {
			pgd_clear(pgd + i);
			pud_free(NULL, pud);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

/**
 * kvm_flush_gpa_pt() - Flush a range of guest physical addresses.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 *
 * Flushes a range of GPA mappings from the GPA page tables.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether it's safe to remove the top level page directory because
 *		all lower levels have been removed.
 */
static bool kvm_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, void *data)
{
	return kvm_flush_gpa_pgd(kvm->arch.gpa_mm.pgd,
				 start_gfn << PAGE_SHIFT,
				 end_gfn << PAGE_SHIFT, (unsigned long *)data);
}

/*
 * kvm_mkclean_gpa_pt.
 * Mark a range of guest physical address space clean (writes fault) in the VM's
 * GPA page table to allow dirty page tracking.
 */

static int kvm_mkclean_pte(pte_t *pte, unsigned long start, unsigned long end)
{
	int ret = 0;
	int i_min = pte_index(start);
	int i_max = pte_index(end);
	int i;
	pte_t val;

	for (i = i_min; i <= i_max; ++i) {
		val = pte[i];
		if (pte_present(val) && pte_dirty(val)) {
			set_pte(pte + i, pte_mkclean(val));
			ret = 1;
		}
	}
	return ret;
}

static int kvm_mkclean_pmd(pmd_t *pmd, unsigned long start, unsigned long end)
{
	int ret = 0;
	pte_t *pte;
	unsigned long cur_end = ~0ul;
	int i_min = pmd_index(start);
	int i_max = pmd_index(end);
	int i;
	pmd_t old, new;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i])) {
			old = pmd[i];
			new = kvm_pmd_mkclean(old);
			if (pmd_val(new) == pmd_val(old))
				continue;
			set_pmd(pmd + i, new);
			ret = 1;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pte(pte, start, cur_end);
	}

	return ret;
}

static int kvm_mkclean_pud(pud_t *pud, unsigned long start, unsigned long end)
{
	int ret = 0;
	pmd_t *pmd;
	unsigned long cur_end = ~0ul;
	int i_min = pud_index(start);
	int i_max = pud_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pmd(pmd, start, cur_end);
	}
	return ret;
}

static int kvm_mkclean_pgd(pgd_t *pgd, unsigned long start, unsigned long end)
{
	int ret = 0;
	p4d_t *p4d;
	pud_t *pud;
	unsigned long cur_end = ~0ul;
	int i_min = pgd_index(start);
	int i_max = pgd_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pud(pud, start, cur_end);
	}
	return ret;
}

/**
 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 *
 * Make a range of GPA mappings clean so that guest writes will fault and
 * trigger dirty page logging.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether any GPA mappings were modified, which would require
 *		derived mappings (GVA page tables & TLB entries) to be
 *		invalidated.
 */
static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
{
	return kvm_mkclean_pgd(kvm->arch.gpa_mm.pgd, start_gfn << PAGE_SHIFT,
			       end_gfn << PAGE_SHIFT);
}

/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write-protects the associated PTEs. The
 * caller must acquire @kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset, unsigned long mask)
{
	gfn_t base_gfn = slot->base_gfn + gfn_offset;
	gfn_t start = base_gfn + __ffs(mask);
	gfn_t end = base_gfn + __fls(mask);

	kvm_mkclean_gpa_pt(kvm, start, end);
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	int needs_flush;

	kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n",
		  __func__, kvm, mem->slot, mem->guest_phys_addr,
		  mem->memory_size, mem->userspace_addr);

	/*
	 * If dirty page logging is enabled, write protect all pages in the slot
	 * ready for dirty logging.
	 *
	 * There is no need to do this in any of the following cases:
	 * CREATE:	No dirty mappings will already exist.
	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot()
	 */
	if (change == KVM_MR_FLAGS_ONLY &&
	    (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
	     new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
		spin_lock(&kvm->mmu_lock);
		/* Write protect GPA page table entries */
		needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
						 new->base_gfn + new->npages - 1);
		/* Let implementation do the rest */
		if (needs_flush)
			kvm_flush_remote_tlbs(kvm);
		spin_unlock(&kvm->mmu_lock);
	}
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	/* Flush whole GPA */
	kvm_flush_gpa_pt(kvm, 0, ~0UL, NULL);

	/* Flush vpid for each VCPU individually */
	kvm_flush_remote_tlbs(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	unsigned long npages;

	/*
	 * The slot has been made invalid (ready for moving or deletion), so we
	 * need to ensure that it can no longer be accessed by any guest VCPUs.
	 */

	npages = 0;
	spin_lock(&kvm->mmu_lock);
	/* Flush slot from GPA */
	kvm_flush_gpa_pt(kvm, slot->base_gfn,
			 slot->base_gfn + slot->npages - 1, &npages);
	/* Let implementation do the rest */
	if (npages)
		kvm_flush_remote_tlbs(kvm);
	spin_unlock(&kvm->mmu_lock);
}

void _kvm_destroy_mm(struct kvm *kvm)
{
	/* It should always be safe to remove after flushing the whole range */
	WARN_ON(!kvm_flush_gpa_pt(kvm, 0, ~0UL, NULL));
	pgd_free(NULL, kvm->arch.gpa_mm.pgd);
	kvm->arch.gpa_mm.pgd = NULL;
}

/*
 * Mark a range of guest physical address space old (all accesses fault) in the
 * VM's GPA page table to allow detection of commonly used pages.
 */

static int kvm_mkold_pte(pte_t *pte, unsigned long start,
			 unsigned long end)
{
	int ret = 0;
	int i_min = pte_index(start);
	int i_max = pte_index(end);
	int i;
	pte_t old, new;

	for (i = i_min; i <= i_max; ++i) {
		if (!pte_present(pte[i]))
			continue;

		old = pte[i];
		new = pte_mkold(old);
		if (pte_val(new) == pte_val(old))
			continue;
		set_pte(pte + i, new);
		ret = 1;
	}

	return ret;
}

static int kvm_mkold_pmd(pmd_t *pmd, unsigned long start, unsigned long end)
{
	int ret = 0;
	pte_t *pte;
	unsigned long cur_end = ~0ul;
	int i_min = pmd_index(start);
	int i_max = pmd_index(end);
	int i;
	pmd_t old, new;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i])) {
			old = pmd[i];
			new = kvm_pmd_mkold(old);
			if (pmd_val(new) == pmd_val(old))
				continue;
			set_pmd(pmd + i, new);
			ret = 1;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pte(pte, start, cur_end);
	}

	return ret;
}

static int kvm_mkold_pud(pud_t *pud, unsigned long start, unsigned long end)
{
	int ret = 0;
	pmd_t *pmd;
	unsigned long cur_end = ~0ul;
	int i_min = pud_index(start);
	int i_max = pud_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pmd(pmd, start, cur_end);
	}

	return ret;
}

static int kvm_mkold_pgd(pgd_t *pgd, unsigned long start, unsigned long end)
{
	int ret = 0;
	p4d_t *p4d;
	pud_t *pud;
	unsigned long cur_end = ~0ul;
	int i_min = pgd_index(start);
	int i_max = pgd_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pud(pud, start, cur_end);
	}

	return ret;
}

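/**
 * handle_hva_to_gpa() - Apply a handler to each memslot overlapping an HVA
 * range.
 * @kvm:	KVM pointer.
 * @start:	Start of host virtual address range.
 * @end:	End of host virtual address range.
 * @handler:	Function called with the GFN range of each intersecting memslot.
 * @data:	Opaque pointer passed through to @handler.
 *
 * Returns:	The bitwise OR of the handler return values.
 */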
static int handle_hva_to_gpa(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     int (*handler)(struct kvm *kvm, gfn_t gfn,
					    gpa_t gfn_end,
					    struct kvm_memory_slot *memslot,
					    void *data),
			     void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int ret = 0;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
			      (memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
		ret |= handler(kvm, gfn, gfn_end, memslot, data);
	}

	return ret;
}

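/*
 * MMU notifier unmap path: remove the GPA mappings backing an HVA range.
 * kvm_unmap_hva_handler() accumulates the number of pages removed in *data
 * and reports whether any mapping was torn down.
 */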
static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				 struct kvm_memory_slot *memslot, void *data)
{
	unsigned long npages;

	npages = 0;
	kvm_flush_gpa_pt(kvm, gfn, gfn_end - 1, &npages);
	*(unsigned long *)data = *(unsigned long *)data + npages;

	return npages > 0;
}

int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable)
{
	unsigned long npages;

	npages = 0;
	return handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &npages);
}

static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				struct kvm_memory_slot *memslot, void *data)
{
	gpa_t gpa = gfn << PAGE_SHIFT;
	pte_t hva_pte = *(pte_t *)data;
	pte_t *gpa_pte = kvm_pte_for_gpa(kvm, NULL, gpa);
	pte_t old_pte;

	if (!gpa_pte)
		return 0;

	/* Mapping may need adjusting depending on memslot flags */
	old_pte = *gpa_pte;
	if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
		hva_pte = pte_mkclean(hva_pte);
	else if (memslot->flags & KVM_MEM_READONLY)
		hva_pte = pte_wrprotect(hva_pte);

	set_pte(gpa_pte, hva_pte);

	/* Replacing an absent or old page doesn't need flushes */
	if (!pte_present(old_pte) || !pte_young(old_pte))
		return 0;

	/* Pages swapped, aged, moved, or cleaned require flushes */
	return !pte_present(hva_pte) ||
	       !pte_young(hva_pte) ||
	       pte_pfn(old_pte) != pte_pfn(hva_pte) ||
	       (pte_dirty(old_pte) && !pte_dirty(hva_pte));
}

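/*
 * MMU notifier change_pte callback: update the GPA mapping of the single page
 * at @hva to match the new host PTE, flushing remote TLBs if a live mapping
 * was replaced.
 */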
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	int ret;

	ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
	if (ret)
		/* Flush vpid for each VCPU individually */
		kvm_flush_remote_tlbs(kvm);
	return 0;
}

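/*
 * MMU notifier aging: kvm_age_hva() clears the young (accessed) bit over an
 * HVA range so that future accesses fault and can be detected, while
 * kvm_test_age_hva() only reports whether the page is still marked young.
 */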
static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
			       struct kvm_memory_slot *memslot, void *data)
{
	return kvm_mkold_pgd(kvm->arch.gpa_mm.pgd, gfn << PAGE_SHIFT,
			     gfn_end << PAGE_SHIFT);
}

static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				    struct kvm_memory_slot *memslot, void *data)
{
	gpa_t gpa = gfn << PAGE_SHIFT;
	pte_t *gpa_pte = kvm_pte_for_gpa(kvm, NULL, gpa);

	if (!gpa_pte)
		return 0;
	return pte_young(*gpa_pte);
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
}

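/*
 * kvm_get_pud()/kvm_get_pmd() - Look up the PUD/PMD entries for a GPA in the
 * GPA page tables. kvm_get_pmd() will allocate a missing PMD table from
 * @cache when one is supplied.
 */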
static pud_t *kvm_get_pud(struct kvm *kvm,
			  struct kvm_mmu_memory_cache *cache, phys_addr_t addr)
{
	pgd_t *pgd;

	pgd = kvm->arch.gpa_mm.pgd + pgd_index(addr);
	if (pgd_none(*pgd)) {
		/* Not used yet */
		BUG();
		return NULL;
	}

	return pud_offset(p4d_offset(pgd, addr), addr);
}

static pmd_t *kvm_get_pmd(struct kvm *kvm,
			  struct kvm_mmu_memory_cache *cache, phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = kvm_get_pud(kvm, cache, addr);
	if (!pud || pud_huge(*pud))
		return NULL;

	if (pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = kvm_mmu_memory_cache_alloc(cache);
		pmd_init((unsigned long)pmd,
			 (unsigned long)invalid_pte_table);
		pud_populate(NULL, pud, pmd);
	}

	return pmd_offset(pud, addr);
}

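/*
 * kvm_set_pmd_huge() - Install a huge (PMD level) mapping for @addr in the
 * GPA page tables, tearing down any existing PTE level table for the range
 * first and skipping the update entirely if the entry is unchanged.
 */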
static int kvm_set_pmd_huge(struct kvm_vcpu *vcpu, struct kvm_mmu_memory_cache
			    *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

retry:
	pmd = kvm_get_pmd(vcpu->kvm, cache, addr);
	VM_BUG_ON(!pmd);

	old_pmd = *pmd;
	/*
	 * Multiple vcpus faulting on the same PMD entry can
	 * lead to them sequentially updating the PMD with the
	 * same value. Following the break-before-make
	 * (pmd_clear() followed by tlb_flush()) process can
	 * hinder forward progress due to refaults generated
	 * on missing translations.
	 *
	 * Skip updating the page table if the entry is
	 * unchanged.
	 */
	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
		return 0;

	if (pmd_present(old_pmd)) {
		/*
		 * If we already have a PTE level mapping for this block,
		 * we must unmap it to avoid inconsistent TLB state and
		 * leaking the table page. We could end up in this situation
		 * if the memory slot was marked for dirty logging and was
		 * reverted, leaving PTE level mappings for the pages accessed
		 * during the period. So, unmap the PTE level mapping for this
		 * block and retry, as we could have released the upper level
		 * table in the process.
		 *
		 * Normal THP split/merge follows mmu_notifier callbacks and is
		 * handled accordingly.
		 */
		if (!pmd_huge(old_pmd)) {
			++vcpu->stat.huge_merge_exits;
			kvm_flush_gpa_pt(vcpu->kvm,
					 (addr & PMD_MASK) >> PAGE_SHIFT,
					 ((addr & PMD_MASK) + PMD_SIZE - 1) >> PAGE_SHIFT, NULL);
			goto retry;
		}
		/*
		 * Mapping in huge pages should only happen through a
		 * fault. If a page is merged into a transparent huge
		 * page, the individual subpages of that huge page
		 * should be unmapped through MMU notifiers before we
		 * get here.
		 *
		 * Merging of CompoundPages is not supported; they
		 * should be split first, unmapped, merged,
		 * and mapped back in on-demand.
		 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
#endif
		pmd_clear(pmd);
	}

	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
	set_pmd(pmd, *new_pmd);
	return 0;
}

/*
 * Adjust the pfn start boundary if transparent hugepage is supported.
 */
static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, unsigned long *gpap)
{
	kvm_pfn_t pfn = *pfnp;
	gfn_t gfn = *gpap >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);

	/*
	 * PageTransCompound() returns true for THP and
	 * hugetlbfs. Make sure the adjustment is done only for THP
	 * pages.
	 */
	if ((!PageHuge(page)) && PageTransCompound(page) &&
	    (atomic_read(&page->_mapcount) < 0)) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page. However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page. We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*gpap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}

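/*
 * fault_supports_huge_mapping() - Check whether the fault at @hva may be
 * mapped with a huge (PMD sized) page: the memslot must not be dirty logged
 * for a write fault, the GPA and HVA must share the same offset within a PMD,
 * and the whole PMD sized block must lie inside the memslot.
 */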
static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
					unsigned long hva, bool write)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	unsigned long map_size;
	size_t size;

	map_size = PMD_SIZE;
	/* Disable dirty logging on HugePages */
	if ((memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) && write)
		return false;

	size = memslot->npages * PAGE_SIZE;
	gpa_start = memslot->base_gfn << PAGE_SHIFT;
	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and GPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 * memslot->userspace_addr:
	 * +-----+--------------------+--------------------+---+
	 * |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 * +-----+--------------------+--------------------+---+
	 *
	 * memslot->base_gfn << PAGE_SHIFT:
	 * +---+--------------------+--------------------+-----+
	 * |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 * +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/**
 * kvm_map_page_fast() - Fast path GPA fault handler.
 * @vcpu:	VCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Perform fast path GPA fault handling, doing all that can be done without
 * calling into KVM. This handles marking old pages young (for idle page
 * tracking), and dirtying of clean pages (for dirty page logging).
 *
 * Returns:	0 on success, in which case we can update derived mappings and
 *		resume guest execution.
 *		-EFAULT on failure due to absent GPA mapping or write to
 *		read-only page, in which case KVM must be consulted.
 */
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa,
			     bool write)
{
	struct kvm *kvm = vcpu->kvm;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	pte_t *ptep;
	kvm_pfn_t pfn = 0;	/* silence bogus GCC warning */
	bool pfn_valid = false;
	int ret = 0;
	struct kvm_memory_slot *slot;

	spin_lock(&kvm->mmu_lock);

	/* Fast path - just check GPA page table for an existing entry */
	ptep = kvm_pte_for_gpa(kvm, NULL, gpa);
	if (!ptep || !pte_present(*ptep)) {
		ret = -EFAULT;
		goto out;
	}

	/* Track access to pages marked old */
	if (!pte_young(*ptep)) {
		set_pte(ptep, pte_mkyoung(*ptep));
		pfn = pte_pfn(*ptep);
		pfn_valid = true;
		/* call kvm_set_pfn_accessed() after unlock */
	}
	if (write && !pte_dirty(*ptep)) {
		if (!pte_write(*ptep)) {
			ret = -EFAULT;
			goto out;
		}

		if (kvm_pte_huge(*ptep)) {
			/*
			 * Do not set write permission when dirty logging is
			 * enabled for HugePages
			 */
			slot = gfn_to_memslot(kvm, gfn);
			if (slot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EFAULT;
				goto out;
			}
		}

		/* Track dirtying of writeable pages */
		set_pte(ptep, pte_mkdirty(*ptep));
		pfn = pte_pfn(*ptep);
		if (pmd_huge(*((pmd_t *)ptep))) {
			int i;
			gfn_t base_gfn = (gpa & PMD_MASK) >> PAGE_SHIFT;

			for (i = 0; i < PTRS_PER_PTE; i++)
				mark_page_dirty(kvm, base_gfn + i);
		} else
			mark_page_dirty(kvm, gfn);
		kvm_set_pfn_dirty(pfn);
	}

out:
	spin_unlock(&kvm->mmu_lock);
	if (pfn_valid)
		kvm_set_pfn_accessed(pfn);
	return ret;
}

/*
 * Split a huge page mapping into PAGE_SIZE PTE mappings.
 */
static pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, pte_t *ptep, gfn_t gfn,
			     struct vm_area_struct *vma, unsigned long hva)
{
	int i;
	pte_t val, *child;
	struct kvm_mmu_memory_cache *memcache;

	memcache = &vcpu->arch.mmu_page_cache;
	child = kvm_mmu_memory_cache_alloc(memcache);
	val = kvm_pte_mksmall(*ptep);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		kvm_set_pte(child + i, val);
		pte_val(val) += PAGE_SIZE;
	}

	/* The later kvm_tlb_flush_gpa() will flush the hugepage TLB entry */
	pte_val(val) = (unsigned long)child;
	kvm_set_pte(ptep, val);
	return child + (gfn & (PTRS_PER_PTE - 1));
}

/**
 * kvm_map_page() - Map a guest physical page.
 * @vcpu:	VCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Handle GPA faults by creating a new GPA mapping (or updating an existing
 * one).
 *
 * This takes care of marking pages young or dirty (idle/dirty page tracking),
 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 * caller.
 *
 * Returns:	0 on success, in which case the caller may update derived
 *		mappings and resume guest execution.
 *		-EFAULT if there is no memory region at @gpa or a write was
 *		attempted to a read-only memory region. This is usually handled
 *		as an MMIO access.
 */
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
			bool write)
{
	bool writeable;
	bool force_pte = false;
	int i, srcu_idx, err = 0, retry_no = 0;
	unsigned long hva;
	unsigned long mmu_seq;
	unsigned long prot_bits;
	unsigned long vma_pagesize;
	pte_t *ptep;
	kvm_pfn_t pfn;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

	/* Try the fast path to handle old / clean pages */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	err = kvm_map_page_fast(vcpu, gpa, write);
	if (!err)
		goto out;

	memslot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
	if (kvm_is_error_hva(hva) || (write && !writeable))
		goto out;

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	mmap_read_lock(current->mm);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		err = -EFAULT;
		goto out;
	}

	vma_pagesize = vma_kernel_pagesize(vma);
	if ((vma_pagesize == PMD_SIZE) &&
	    !fault_supports_huge_mapping(memslot, hva, write)) {
		force_pte = true;
		vma_pagesize = PAGE_SIZE;
		++vcpu->stat.huge_dec_exits;
	}

	/* PMD is not folded, adjust gfn to new boundary */
	if (vma_pagesize == PMD_SIZE)
		gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;

	mmap_read_unlock(current->mm);

	/* We need a minimum of cached pages ready for page table creation */
	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
	if (err)
		goto out;

retry:
	/*
	 * Used to check for invalidations in progress, of the pfn that is
	 * returned by pfn_to_pfn_prot below.
	 */
	mmu_seq = kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
	 * risk the page we get a reference to getting unmapped before we have a
	 * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
	 *
	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
	 * of the pte_unmap_unlock() after the PTE is zapped, and the
	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
	 * mmu_notifier_seq is incremented.
	 */
	smp_rmb();

	/* Slow path - ask KVM core whether we can access this GPA */
	pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
	if (is_error_noslot_pfn(pfn)) {
		err = -EFAULT;
		goto out;
	}

	spin_lock(&kvm->mmu_lock);
	/* Check if an invalidation has taken place since we got pfn */
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		/*
		 * This can happen when mappings are changed asynchronously, but
		 * also synchronously if a COW is triggered by
		 * gfn_to_pfn_prot().
		 */
		spin_unlock(&kvm->mmu_lock);
		kvm_set_pfn_accessed(pfn);
		kvm_release_pfn_clean(pfn);
		if (retry_no > 100) {
			retry_no = 0;
			schedule();
		}
		retry_no++;
		goto retry;
	}

	if (vma_pagesize == PAGE_SIZE && !force_pte) {
		/*
		 * Only PMD_SIZE transparent hugepages(THP) are
		 * currently supported. This code will need to be
		 * updated to support other THP sizes.
		 *
		 * Make sure the host VA and the guest IPA are sufficiently
		 * aligned and that the block is contained within the memslot.
		 */
		++vcpu->stat.huge_thp_exits;
		if (fault_supports_huge_mapping(memslot, hva, write) &&
		    transparent_hugepage_adjust(&pfn, &gpa)) {
			++vcpu->stat.huge_adjust_exits;
			vma_pagesize = PMD_SIZE;
		}
	}

	/* Set up the prot bits */
	prot_bits = _PAGE_PRESENT | __READABLE;
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		prot_bits |= _CACHE_SUC;
	else
		prot_bits |= _CACHE_CC;

	if (writeable) {
		prot_bits |= _PAGE_WRITE;
		if (write) {
			prot_bits |= __WRITEABLE;
			mark_page_dirty(kvm, gfn);
			kvm_set_pfn_dirty(pfn);
		}
	}

	if (vma_pagesize == PMD_SIZE) {
		pmd_t new_pmd = pfn_pmd(pfn, __pgprot(prot_bits));
		new_pmd = pmd_mkhuge(new_pmd);
		if (writeable && write) {
			gfn_t base_gfn = (gpa & PMD_MASK) >> PAGE_SHIFT;
			for (i = 0; i < PTRS_PER_PTE; i++)
				mark_page_dirty(kvm, base_gfn + i);
		}

		++vcpu->stat.huge_set_exits;
		kvm_set_pmd_huge(vcpu, memcache, gpa, &new_pmd);
	} else {
		pte_t new_pte = pfn_pte(pfn, __pgprot(prot_bits));
		if (writeable && write)
			mark_page_dirty(kvm, gfn);

		/* Ensure page tables are allocated */
		ptep = kvm_pte_for_gpa(kvm, memcache, gpa);
		if (ptep && kvm_pte_huge(*ptep) && write)
			ptep = kvm_split_huge(vcpu, ptep, gfn, vma, hva);

		set_pte(ptep, new_pte);
		err = 0;
	}

	spin_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	kvm_set_pfn_accessed(pfn);
out:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return err;
}

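/*
 * kvm_handle_mm_fault() - Handle a guest fault on guest physical address
 * @badv by mapping the page into the GPA page tables and then invalidating
 * the stale guest TLB entry for it.
 */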
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv,
			bool write)
{
	int ret;

	ret = kvm_map_page(vcpu, badv, write);
	if (ret)
		return ret;

	/* Invalidate this entry in the TLB */
	return kvm_tlb_flush_gpa(vcpu, badv);
}

/**
 * kvm_flush_tlb_all() - Flush all root TLB entries for guests.
 *
 * Invalidate all entries including GVA-->GPA and GPA-->HPA mappings.
 */
void kvm_flush_tlb_all(void)
{
	unsigned long flags;

	local_irq_save(flags);
	invtlb_all(INVTLB_ALLGID, 0, 0);
	local_irq_restore(flags);
}