// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
 */

#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/page-flags.h>
#include <linux/kvm_host.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include "kvm_compat.h"

/*
 * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels
 * for which pages need to be cached.
 */
#if defined(__PAGETABLE_PMD_FOLDED)
#define KVM_MMU_CACHE_MIN_PAGES 1
#else
#define KVM_MMU_CACHE_MIN_PAGES 2
#endif

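/*
 * Small helpers around the architecture PTE accessors: test the _PAGE_HUGE
 * bit, clear it to turn a huge mapping into a base-page mapping, and write a
 * PTE with WRITE_ONCE() so concurrent lockless walkers see a consistent value.
 */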
static inline int kvm_pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_HUGE; }

static inline pte_t kvm_pte_mksmall(pte_t pte)
{
	pte_val(pte) &= ~_PAGE_HUGE;
	return pte;
}

static inline void kvm_set_pte(pte_t *ptep, pte_t val)
{
	WRITE_ONCE(*ptep, val);
}

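/*
 * Invalidate the guest TLB entries covering @gpa for the guest ID currently
 * held in CSR.GSTAT. The address is rounded down to the even/odd page pair
 * mapped by a single TLB entry, and preemption is disabled so the GID read
 * matches the CPU on which the INVTLB is issued.
 */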
static int kvm_tlb_flush_gpa(struct kvm_vcpu *vcpu, unsigned long gpa)
{
	preempt_disable();
	gpa &= (PAGE_MASK << 1);
	invtlb(INVTLB_GID_ADDR, kvm_read_csr_gstat() & KVM_GSTAT_GID, gpa);
	preempt_enable();
	return 0;
}

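/*
 * PMD helpers mirroring pmd_mkhuge()/pmd_mkclean()/pmd_mkold(). With
 * CONFIG_TRANSPARENT_HUGEPAGE the generic PMD operations are used directly;
 * with only CONFIG_HUGETLB_PAGE the PMD value is round-tripped through a
 * pte_t so the PTE-level helpers can do the work.
 */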
static inline pmd_t kvm_pmd_mkhuge(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkhuge(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkhuge(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

static inline pmd_t kvm_pmd_mkclean(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkclean(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkclean(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

static inline pmd_t kvm_pmd_mkold(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return pmd_mkold(pmd);
#elif defined(CONFIG_HUGETLB_PAGE)
	pte_t entry;

	pte_val(entry) = pmd_val(pmd);
	entry = pte_mkold(entry);
	pmd_val(pmd) = pte_val(entry);
#endif
	return pmd;
}

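/*
 * Release any pages still held in the vCPU's MMU page cache, typically on
 * vCPU teardown.
 */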
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

/**
 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
 *
 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
 * to host physical page mappings.
 *
 * Returns:	Pointer to new KVM GPA page directory.
 *		NULL on allocation failure.
 */
pgd_t *kvm_pgd_alloc(void)
{
	pgd_t *ret;
	struct page *page;

	page = alloc_pages(GFP_KERNEL, 0);
	if (!page)
		return NULL;
	ret = (pgd_t *) page_address(page);
	if (ret)
		pgd_init((unsigned long)ret);

	return ret;
}

/**
 * kvm_walk_pgd() - Walk page table with optional allocation.
 * @pgd:	Page directory pointer.
 * @addr:	Address to index page table using.
 * @cache:	MMU page cache to allocate new page tables from, or NULL.
 *
 * Walk the page tables pointed to by @pgd to find the PTE corresponding to the
 * address @addr. If page tables don't exist for @addr, they will be created
 * from the MMU cache if @cache is not NULL.
 *
 * Returns:	Pointer to pte_t corresponding to @addr.
 *		NULL if a page table doesn't exist for @addr and !@cache.
 *		NULL if a page table allocation failed.
 */
static pte_t *kvm_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
				unsigned long addr)
{
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd += pgd_index(addr);
	if (pgd_none(*pgd)) {
		/* Not used yet */
		BUG();
		return NULL;
	}
	p4d = p4d_offset(pgd, addr);
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		pmd_t *new_pmd;

		if (!cache)
			return NULL;
		new_pmd = kvm_mmu_memory_cache_alloc(cache);
		pmd_init((unsigned long)new_pmd,
			 (unsigned long)invalid_pte_table);
		pud_populate(NULL, pud, new_pmd);
	}
	pmd = pmd_offset(pud, addr);
#ifdef CONFIG_HUGETLB_PAGE
	if (pmd_huge(*pmd))
		return (pte_t *)pmd;
#endif
	if (pmd_none(*pmd)) {
		pte_t *new_pte;

		if (!cache)
			return NULL;
		new_pte = kvm_mmu_memory_cache_alloc(cache);
		clear_page(new_pte);
		pmd_populate_kernel(NULL, pmd, new_pte);
	}
	return pte_offset_kernel(pmd, addr);
}

/* Caller must hold kvm->mmu_lock */
static pte_t *kvm_pte_for_gpa(struct kvm *kvm,
				struct kvm_mmu_memory_cache *cache,
				unsigned long addr)
{
	return kvm_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr);
}

/*
 * kvm_flush_gpa_{pte,pmd,pud,pgd,pt}.
 * Flush a range of guest physical address space from the VM's GPA page tables.
 */

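/*
 * Each level's helper clears the present entries in [start_gpa, end_gpa] and
 * returns true only when the range spanned the entire table at that level,
 * i.e. the caller may free the table page and clear its own entry. When @data
 * is non-NULL it accumulates the number of base pages unmapped.
 */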
static bool kvm_flush_gpa_pte(pte_t *pte, unsigned long start_gpa,
				   unsigned long end_gpa, unsigned long *data)
{
	int i_min = pte_index(start_gpa);
	int i_max = pte_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
	int i;

	for (i = i_min; i <= i_max; ++i) {
		if (!pte_present(pte[i]))
			continue;

		set_pte(pte + i, __pte(0));
		if (data)
			*data = *data + 1;
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa,
				   unsigned long end_gpa, unsigned long *data)
{
	pte_t *pte;
	unsigned long end = ~0ul;
	int i_min = pmd_index(start_gpa);
	int i_max = pmd_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i])) {
			pmd_clear(pmd + i);
			if (data)
				*data += PTRS_PER_PMD;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pte(pte, start_gpa, end, data)) {
			pmd_clear(pmd + i);
			pte_free_kernel(NULL, pte);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pud(pud_t *pud, unsigned long start_gpa,
				   unsigned long end_gpa, unsigned long *data)
{
	pmd_t *pmd;
	unsigned long end = ~0ul;
	int i_min = pud_index(start_gpa);
	int i_max = pud_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pmd(pmd, start_gpa, end, data)) {
			pud_clear(pud + i);
			pmd_free(NULL, pmd);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

static bool kvm_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa,
				unsigned long end_gpa, unsigned long *data)
{
	p4d_t *p4d;
	pud_t *pud;
	unsigned long end = ~0ul;
	int i_min = pgd_index(start_gpa);
	int i_max = pgd_index(end_gpa);
	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
	int i;

	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			end = end_gpa;

		if (kvm_flush_gpa_pud(pud, start_gpa, end, data)) {
			pgd_clear(pgd + i);
			pud_free(NULL, pud);
		} else {
			safe_to_remove = false;
		}
	}
	return safe_to_remove;
}

/**
 * kvm_flush_gpa_pt() - Flush a range of guest physical addresses.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 *
 * Flushes a range of GPA mappings from the GPA page tables.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether it is safe to remove the top level page directory
 *		because all lower levels have been removed.
 */
static bool kvm_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, void *data)
{
	return kvm_flush_gpa_pgd(kvm->arch.gpa_mm.pgd,
				start_gfn << PAGE_SHIFT,
				end_gfn << PAGE_SHIFT, (unsigned long *)data);
}

/*
 * kvm_mkclean_gpa_pt.
 * Mark a range of guest physical address space clean (writes fault) in the
 * VM's GPA page table to allow dirty page tracking.
 */

static int kvm_mkclean_pte(pte_t *pte, unsigned long start, unsigned long end)
{
	int ret = 0;
	int i_min = pte_index(start);
	int i_max = pte_index(end);
	int i;
	pte_t val;

	for (i = i_min; i <= i_max; ++i) {
		val = pte[i];
		if (pte_present(val) && pte_dirty(val)) {
			set_pte(pte + i, pte_mkclean(val));
			ret = 1;
		}
	}
	return ret;
}

static int kvm_mkclean_pmd(pmd_t *pmd, unsigned long start, unsigned long end)
{
	int ret = 0;
	pte_t *pte;
	unsigned long cur_end = ~0ul;
	int i_min = pmd_index(start);
	int i_max = pmd_index(end);
	int i;
	pmd_t old, new;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i])) {
			old = pmd[i];
			new = kvm_pmd_mkclean(old);
			if (pmd_val(new) == pmd_val(old))
				continue;
			set_pmd(pmd + i, new);
			ret = 1;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pte(pte, start, cur_end);
	}

	return ret;
}

static int kvm_mkclean_pud(pud_t *pud, unsigned long start, unsigned long end)
{
	int ret = 0;
	pmd_t *pmd;
	unsigned long cur_end = ~0ul;
	int i_min = pud_index(start);
	int i_max = pud_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pmd(pmd, start, cur_end);
	}
	return ret;
}

static int kvm_mkclean_pgd(pgd_t *pgd, unsigned long start, unsigned long end)
{
	int ret = 0;
	p4d_t *p4d;
	pud_t *pud;
	unsigned long cur_end = ~0ul;
	int i_min = pgd_index(start);
	int i_max = pgd_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkclean_pud(pud, start, cur_end);
	}
	return ret;
}

/**
 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 *
 * Make a range of GPA mappings clean so that guest writes will fault and
 * trigger dirty page logging.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether any GPA mappings were modified, which would require
 *		derived mappings (GVA page tables & TLB entries) to be
 *		invalidated.
 */
static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
{
	return kvm_mkclean_pgd(kvm->arch.gpa_mm.pgd, start_gfn << PAGE_SHIFT,
				end_gfn << PAGE_SHIFT);
}

/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write-protects the associated PTEs. The
 * caller must acquire @kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	gfn_t base_gfn = slot->base_gfn + gfn_offset;
	gfn_t start = base_gfn + __ffs(mask);
	gfn_t end = base_gfn + __fls(mask);

	kvm_mkclean_gpa_pt(kvm, start, end);
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	int needs_flush;

	kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, UVA: %llx\n",
		  __func__, kvm, mem->slot, mem->guest_phys_addr,
		  mem->memory_size, mem->userspace_addr);

	/*
	 * If dirty page logging is enabled, write protect all pages in the slot
	 * ready for dirty logging.
	 *
	 * There is no need to do this in any of the following cases:
	 * CREATE:	No dirty mappings will already exist.
	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot()
	 */
	if (change == KVM_MR_FLAGS_ONLY &&
	    (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
	     new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
		spin_lock(&kvm->mmu_lock);
		/* Write protect GPA page table entries */
		needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
					new->base_gfn + new->npages - 1);
		/* Let implementation do the rest */
		if (needs_flush)
			kvm_flush_remote_tlbs(kvm);
		spin_unlock(&kvm->mmu_lock);
	}
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	/* Flush whole GPA */
	kvm_flush_gpa_pt(kvm, 0, ~0UL, NULL);

	/* Flush vpid for each VCPU individually */
	kvm_flush_remote_tlbs(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
		struct kvm_memory_slot *slot)
{
	unsigned long npages;

	/*
	 * The slot has been made invalid (ready for moving or deletion), so we
	 * need to ensure that it can no longer be accessed by any guest VCPUs.
	 */

	npages = 0;
	spin_lock(&kvm->mmu_lock);
	/* Flush slot from GPA */
	kvm_flush_gpa_pt(kvm, slot->base_gfn,
			slot->base_gfn + slot->npages - 1, &npages);
	/* Let implementation do the rest */
	if (npages)
		kvm_flush_remote_tlbs(kvm);
	spin_unlock(&kvm->mmu_lock);
}

void _kvm_destroy_mm(struct kvm *kvm)
{
	/* It should always be safe to remove after flushing the whole range */
	WARN_ON(!kvm_flush_gpa_pt(kvm, 0, ~0UL, NULL));
	pgd_free(NULL, kvm->arch.gpa_mm.pgd);
	kvm->arch.gpa_mm.pgd = NULL;
}

/*
 * Mark a range of guest physical address space old (all accesses fault) in the
 * VM's GPA page table to allow detection of commonly used pages.
 */

static int kvm_mkold_pte(pte_t *pte, unsigned long start,
				 unsigned long end)
{
	int ret = 0;
	int i_min = pte_index(start);
	int i_max = pte_index(end);
	int i;
	pte_t old, new;

	for (i = i_min; i <= i_max; ++i) {
		if (!pte_present(pte[i]))
			continue;

		old = pte[i];
		new = pte_mkold(old);
		if (pte_val(new) == pte_val(old))
			continue;
		set_pte(pte + i, new);
		ret = 1;
	}

	return ret;
}

static int kvm_mkold_pmd(pmd_t *pmd, unsigned long start, unsigned long end)
{
	int ret = 0;
	pte_t *pte;
	unsigned long cur_end = ~0ul;
	int i_min = pmd_index(start);
	int i_max = pmd_index(end);
	int i;
	pmd_t old, new;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pmd_present(pmd[i]))
			continue;

		if (pmd_huge(pmd[i])) {
			old = pmd[i];
			new = kvm_pmd_mkold(old);
			if (pmd_val(new) == pmd_val(old))
				continue;
			set_pmd(pmd + i, new);
			ret = 1;
			continue;
		}

		pte = pte_offset_kernel(pmd + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pte(pte, start, cur_end);
	}

	return ret;
}

static int kvm_mkold_pud(pud_t *pud, unsigned long start, unsigned long end)
{
	int ret = 0;
	pmd_t *pmd;
	unsigned long cur_end = ~0ul;
	int i_min = pud_index(start);
	int i_max = pud_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pud_present(pud[i]))
			continue;

		pmd = pmd_offset(pud + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pmd(pmd, start, cur_end);
	}

	return ret;
}

static int kvm_mkold_pgd(pgd_t *pgd, unsigned long start, unsigned long end)
{
	int ret = 0;
	p4d_t *p4d;
	pud_t *pud;
	unsigned long cur_end = ~0ul;
	int i_min = pgd_index(start);
	int i_max = pgd_index(end);
	int i;

	for (i = i_min; i <= i_max; ++i, start = 0) {
		if (!pgd_present(pgd[i]))
			continue;

		p4d = p4d_offset(pgd, 0);
		pud = pud_offset(p4d + i, 0);
		if (i == i_max)
			cur_end = end;

		ret |= kvm_mkold_pud(pud, start, cur_end);
	}

	return ret;
}

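/*
 * Iterate over all memslots that intersect the HVA range [start, end) and
 * invoke @handler on the corresponding GFN range of each slot. The handlers'
 * return values are OR'ed together.
 */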
static int handle_hva_to_gpa(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     int (*handler)(struct kvm *kvm, gfn_t gfn,
					    gpa_t gfn_end,
					    struct kvm_memory_slot *memslot,
					    void *data),
			     void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int ret = 0;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
		ret |= handler(kvm, gfn, gfn_end, memslot, data);
	}

	return ret;
}

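/*
 * MMU notifier unmap path: drop every GPA mapping that overlaps the HVA
 * range. A non-zero return tells the caller that mappings were removed and
 * the guest TLBs need flushing.
 */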
static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				 struct kvm_memory_slot *memslot, void *data)
{
	unsigned long npages;

	npages = 0;
	kvm_flush_gpa_pt(kvm, gfn, gfn_end - 1, &npages);
	*(unsigned long *)data = *(unsigned long *)data + npages;

	return npages > 0;
}

int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable)
{
	unsigned long npages;

	npages = 0;
	return handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &npages);
}

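/*
 * MMU notifier change_pte path: mirror an updated host PTE into the GPA page
 * table, downgrading it as the memslot flags require (clean for dirty
 * logging, write-protected for read-only slots). The handler's return value
 * indicates whether stale translations must be flushed.
 */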
static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				struct kvm_memory_slot *memslot, void *data)
{
	gpa_t gpa = gfn << PAGE_SHIFT;
	pte_t hva_pte = *(pte_t *)data;
	pte_t *gpa_pte = kvm_pte_for_gpa(kvm, NULL, gpa);
	pte_t old_pte;

	if (!gpa_pte)
		return 0;

	/* Mapping may need adjusting depending on memslot flags */
	old_pte = *gpa_pte;
	if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
		hva_pte = pte_mkclean(hva_pte);
	else if (memslot->flags & KVM_MEM_READONLY)
		hva_pte = pte_wrprotect(hva_pte);

	set_pte(gpa_pte, hva_pte);

	/* Replacing an absent or old page doesn't need flushes */
	if (!pte_present(old_pte) || !pte_young(old_pte))
		return 0;

	/* Pages swapped, aged, moved, or cleaned require flushes */
	return !pte_present(hva_pte) ||
	       !pte_young(hva_pte) ||
	       pte_pfn(old_pte) != pte_pfn(hva_pte) ||
	       (pte_dirty(old_pte) && !pte_dirty(hva_pte));
}

int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	int ret;

	ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
	if (ret)
		/* Flush vpid for each VCPU individually */
		kvm_flush_remote_tlbs(kvm);
	return 0;
}

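/*
 * MMU notifier aging path: clear the accessed (young) bit over a GFN range so
 * that later guest accesses fault and re-mark the pages young, and test
 * whether a single GPA mapping is currently young.
 */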
static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
			       struct kvm_memory_slot *memslot, void *data)
{
	return kvm_mkold_pgd(kvm->arch.gpa_mm.pgd, gfn << PAGE_SHIFT,
				gfn_end << PAGE_SHIFT);
}

static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
				    struct kvm_memory_slot *memslot, void *data)
{
	gpa_t gpa = gfn << PAGE_SHIFT;
	pte_t *gpa_pte = kvm_pte_for_gpa(kvm, NULL, gpa);

	if (!gpa_pte)
		return 0;
	return pte_young(*gpa_pte);
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
}

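/*
 * kvm_get_pud() looks up the PUD entry covering @addr in the VM's GPA page
 * table; kvm_get_pmd() additionally allocates a PMD table from @cache when
 * the PUD entry is empty. Both stop above the PTE level so callers can
 * install huge mappings.
 */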
static pud_t *kvm_get_pud(struct kvm *kvm,
		 struct kvm_mmu_memory_cache *cache, phys_addr_t addr)
{
	pgd_t *pgd;

	pgd = kvm->arch.gpa_mm.pgd + pgd_index(addr);
	if (pgd_none(*pgd)) {
		/* Not used yet */
		BUG();
		return NULL;
	}

	return pud_offset(p4d_offset(pgd, addr), addr);
}

static pmd_t *kvm_get_pmd(struct kvm *kvm,
		 struct kvm_mmu_memory_cache *cache, phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = kvm_get_pud(kvm, cache, addr);
	if (!pud || pud_huge(*pud))
		return NULL;

	if (pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = kvm_mmu_memory_cache_alloc(cache);
		pmd_init((unsigned long)pmd,
				(unsigned long)invalid_pte_table);
		pud_populate(NULL, pud, pmd);
	}

	return pmd_offset(pud, addr);
}

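/*
 * Install a huge (PMD-level) mapping at @addr, first tearing down any
 * existing PTE-level table for the range, and request a TLB flush when the
 * entry actually changes.
 */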
static int kvm_set_pmd_huge(struct kvm_vcpu *vcpu, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

retry:
	pmd = kvm_get_pmd(vcpu->kvm, cache, addr);
	VM_BUG_ON(!pmd);

	old_pmd = *pmd;
	/*
	 * Multiple vcpus faulting on the same PMD entry can
	 * lead to them sequentially updating the PMD with the
	 * same value. Following the break-before-make
	 * (pmd_clear() followed by tlb_flush()) process can
	 * hinder forward progress due to refaults generated
	 * on missing translations.
	 *
	 * Skip updating the page table if the entry is
	 * unchanged.
	 */
	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
		return 0;

	if (pmd_present(old_pmd)) {
		/*
		 * If we already have a PTE level mapping for this block,
		 * we must unmap it to avoid inconsistent TLB state and
		 * leaking the table page. We could end up in this situation
		 * if the memory slot was marked for dirty logging and was
		 * reverted, leaving PTE level mappings for the pages accessed
		 * during the period. So, unmap the PTE level mapping for this
		 * block and retry, as we could have released the upper level
		 * table in the process.
		 *
		 * Normal THP split/merge follows mmu_notifier callbacks and is
		 * handled accordingly.
		 */
		if (!pmd_huge(old_pmd)) {
			++vcpu->stat.huge_merge_exits;
			kvm_flush_gpa_pt(vcpu->kvm,
				(addr & PMD_MASK) >> PAGE_SHIFT,
				((addr & PMD_MASK) + PMD_SIZE - 1) >> PAGE_SHIFT, NULL);
			goto retry;
		}
		/*
		 * Mapping in huge pages should only happen through a
		 * fault.  If a page is merged into a transparent huge
		 * page, the individual subpages of that huge page
		 * should be unmapped through MMU notifiers before we
		 * get here.
		 *
		 * Merging of CompoundPages is not supported; they
		 * should be split first, unmapped, merged,
		 * and mapped back in on-demand.
		 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
#endif
		pmd_clear(pmd);
	}

	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
	set_pmd(pmd, *new_pmd);
	return 0;
}

/*
 * Adjust the pfn and gpa to the start of the backing transparent hugepage,
 * if there is one.
 */
static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, unsigned long *gpap)
{
	kvm_pfn_t pfn = *pfnp;
	gfn_t gfn = *gpap >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);

	/*
	 * PageTransCompoundMap() returns true for THP and
	 * hugetlbfs. Make sure the adjustment is done only for THP
	 * pages.
	 */
	if ((!PageHuge(page)) && PageTransCompound(page) &&
			 (atomic_read(&page->_mapcount) < 0)) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page.  However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page.  We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*gpap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}

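/*
 * Decide whether a fault at @hva may be mapped with a PMD-sized block:
 * dirty logging must not be active for a write, and the host virtual and
 * guest physical layouts of the memslot must be equally aligned within
 * PMD_SIZE so that a block mapping covers only pages owned by the memslot.
 */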
static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
					unsigned long hva, bool write)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	unsigned long map_size;
	size_t size;

	map_size = PMD_SIZE;
	/* Do not allow huge write mappings while dirty logging is enabled */
	if ((memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) && write)
		return false;

	size = memslot->npages * PAGE_SIZE;
	gpa_start = memslot->base_gfn << PAGE_SHIFT;
	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and GPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/**
 * kvm_map_page_fast() - Fast path GPA fault handler.
 * @vcpu:	VCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Perform fast path GPA fault handling, doing all that can be done without
 * calling into KVM. This handles marking old pages young (for idle page
 * tracking), and dirtying of clean pages (for dirty page logging).
 *
 * Returns:	0 on success, in which case we can update derived mappings and
 *		resume guest execution.
 *		-EFAULT on failure due to absent GPA mapping or write to
 *		read-only page, in which case KVM must be consulted.
 */
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa,
				   bool write)
{
	struct kvm *kvm = vcpu->kvm;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	pte_t *ptep;
	kvm_pfn_t pfn = 0;	/* silence bogus GCC warning */
	bool pfn_valid = false;
	int ret = 0;
	struct kvm_memory_slot *slot;

	spin_lock(&kvm->mmu_lock);

	/* Fast path - just check GPA page table for an existing entry */
	ptep = kvm_pte_for_gpa(kvm, NULL, gpa);
	if (!ptep || !pte_present(*ptep)) {
		ret = -EFAULT;
		goto out;
	}

	/* Track access to pages marked old */
	if (!pte_young(*ptep)) {
		set_pte(ptep, pte_mkyoung(*ptep));
		pfn = pte_pfn(*ptep);
		pfn_valid = true;
		/* call kvm_set_pfn_accessed() after unlock */
	}
	if (write && !pte_dirty(*ptep)) {
		if (!pte_write(*ptep)) {
			ret = -EFAULT;
			goto out;
		}

		if (kvm_pte_huge(*ptep)) {
			/*
			 * Do not set write permission when dirty logging is
			 * enabled for HugePages
			 */
			slot = gfn_to_memslot(kvm, gfn);
			if (slot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EFAULT;
				goto out;
			}
		}

		/* Track dirtying of writeable pages */
		set_pte(ptep, pte_mkdirty(*ptep));
		pfn = pte_pfn(*ptep);
		if (pmd_huge(*((pmd_t *)ptep))) {
			int i;
			gfn_t base_gfn = (gpa & PMD_MASK) >> PAGE_SHIFT;

			for (i = 0; i < PTRS_PER_PTE; i++)
				mark_page_dirty(kvm, base_gfn + i);
		} else
			mark_page_dirty(kvm, gfn);
		kvm_set_pfn_dirty(pfn);
	}

out:
	spin_unlock(&kvm->mmu_lock);
	if (pfn_valid)
		kvm_set_pfn_accessed(pfn);
	return ret;
}

/*
 * Split a huge mapping into PAGE_SIZE PTEs: populate a new PTE table from the
 * old huge entry and return the child PTE that covers @gfn.
 */
static pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, pte_t *ptep, gfn_t gfn,
		struct vm_area_struct *vma, unsigned long hva)
{
	int i;
	pte_t val, *child;
	struct kvm_mmu_memory_cache *memcache;

	memcache = &vcpu->arch.mmu_page_cache;
	child = kvm_mmu_memory_cache_alloc(memcache);
	val = kvm_pte_mksmall(*ptep);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		kvm_set_pte(child + i, val);
		pte_val(val) += PAGE_SIZE;
	}

	/* The later kvm_tlb_flush_gpa() will flush the hugepage TLB entry */
	pte_val(val) = (unsigned long)child;
	kvm_set_pte(ptep, val);
	return child + (gfn & (PTRS_PER_PTE - 1));
}

/**
 * kvm_map_page() - Map a guest physical page.
 * @vcpu:	VCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Handle GPA faults by creating a new GPA mapping (or updating an existing
 * one).
 *
 * This takes care of marking pages young or dirty (idle/dirty page tracking),
 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 * caller.
 *
 * Returns:	0 on success, in which case the GPA page table has been updated
 *		and the caller may update derived mappings and resume guest
 *		execution.
 *		-EFAULT if there is no memory region at @gpa or a write was
 *		attempted to a read-only memory region. This is usually handled
 *		as an MMIO access.
 */
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
			     bool write)
{
	bool writeable;
	bool force_pte = false;
	int i, srcu_idx, err = 0, retry_no = 0;
	unsigned long hva;
	unsigned long mmu_seq;
	unsigned long prot_bits;
	unsigned long vma_pagesize;
	pte_t *ptep;
	kvm_pfn_t pfn;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

	/* Try the fast path to handle old / clean pages */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	err = kvm_map_page_fast(vcpu, gpa, write);
	if (!err)
		goto out;

	memslot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
	if (kvm_is_error_hva(hva) || (write && !writeable))
		goto out;

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	mmap_read_lock(current->mm);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		err = -EFAULT;
		goto out;
	}

	vma_pagesize = vma_kernel_pagesize(vma);
	if ((vma_pagesize == PMD_SIZE) &&
		!fault_supports_huge_mapping(memslot, hva, write)) {
		force_pte = true;
		vma_pagesize = PAGE_SIZE;
		++vcpu->stat.huge_dec_exits;
	}

	/* PMD is not folded, adjust gfn to new boundary */
	if (vma_pagesize == PMD_SIZE)
		gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;

	mmap_read_unlock(current->mm);

	/* We need a minimum of cached pages ready for page table creation */
	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
	if (err)
		goto out;

retry:
	/*
	 * Used to check for invalidations in progress, of the pfn that is
	 * returned by pfn_to_pfn_prot below.
	 */
	mmu_seq = kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
	 * risk the page we get a reference to getting unmapped before we have a
	 * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
	 *
	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
	 * of the pte_unmap_unlock() after the PTE is zapped, and the
	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
	 * mmu_notifier_seq is incremented.
	 */
	smp_rmb();

	/* Slow path - ask KVM core whether we can access this GPA */
	pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
	if (is_error_noslot_pfn(pfn)) {
		err = -EFAULT;
		goto out;
	}

	spin_lock(&kvm->mmu_lock);
	/* Check if an invalidation has taken place since we got pfn */
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		/*
		 * This can happen when mappings are changed asynchronously, but
		 * also synchronously if a COW is triggered by
		 * gfn_to_pfn_prot().
		 */
		spin_unlock(&kvm->mmu_lock);
		kvm_set_pfn_accessed(pfn);
		kvm_release_pfn_clean(pfn);
		if (retry_no > 100) {
			retry_no = 0;
			schedule();
		}
		retry_no++;
		goto retry;
	}

	if (vma_pagesize == PAGE_SIZE && !force_pte) {
		/*
		 * Only PMD_SIZE transparent hugepages (THP) are
		 * currently supported. This code will need to be
		 * updated to support other THP sizes.
		 *
		 * Make sure the host VA and the guest IPA are sufficiently
		 * aligned and that the block is contained within the memslot.
		 */
		++vcpu->stat.huge_thp_exits;
		if (fault_supports_huge_mapping(memslot, hva, write) &&
		    transparent_hugepage_adjust(&pfn, &gpa)) {
			++vcpu->stat.huge_adjust_exits;
			vma_pagesize = PMD_SIZE;
		}
	}

	/* Set up the prot bits */
	prot_bits = _PAGE_PRESENT | __READABLE;
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		prot_bits |= _CACHE_SUC;
	else
		prot_bits |= _CACHE_CC;

	if (writeable) {
		prot_bits |= _PAGE_WRITE;
		if (write) {
			prot_bits |= __WRITEABLE;
			mark_page_dirty(kvm, gfn);
			kvm_set_pfn_dirty(pfn);
		}
	}

	if (vma_pagesize == PMD_SIZE) {
		pmd_t new_pmd = pfn_pmd(pfn, __pgprot(prot_bits));

		new_pmd = kvm_pmd_mkhuge(new_pmd);
		if (writeable && write) {
			gfn_t base_gfn = (gpa & PMD_MASK) >> PAGE_SHIFT;

			for (i = 0; i < PTRS_PER_PTE; i++)
				mark_page_dirty(kvm, base_gfn + i);
		}

		++vcpu->stat.huge_set_exits;
		kvm_set_pmd_huge(vcpu, memcache, gpa, &new_pmd);
	} else {
		pte_t new_pte = pfn_pte(pfn, __pgprot(prot_bits));

		if (writeable && write)
			mark_page_dirty(kvm, gfn);

		/* Ensure page tables are allocated */
		ptep = kvm_pte_for_gpa(kvm, memcache, gpa);
		if (ptep && kvm_pte_huge(*ptep) && write)
			ptep = kvm_split_huge(vcpu, ptep, gfn, vma, hva);

		set_pte(ptep, new_pte);
		err = 0;
	}

	spin_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	kvm_set_pfn_accessed(pfn);
out:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return err;
}

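/*
 * Top-level guest page fault handler: establish or update the GPA mapping for
 * the faulting address, then invalidate the stale guest TLB entry for it.
 */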
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv,
				      bool write)
{
	int ret;

	ret = kvm_map_page(vcpu, badv, write);
	if (ret)
		return ret;

	/* Invalidate this entry in the TLB */
	return kvm_tlb_flush_gpa(vcpu, badv);
}

/**
 * kvm_flush_tlb_all() - Flush all root TLB entries for guests.
 *
 * Invalidate all entries including GVA-->GPA and GPA-->HPA mappings.
 */
void kvm_flush_tlb_all(void)
{
	unsigned long flags;

	local_irq_save(flags);
	invtlb_all(INVTLB_ALLGID, 0, 0);
	local_irq_restore(flags);
}