// SPDX-License-Identifier: GPL-2.0
/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);

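/*
 * IPTE helpers: with guest-TLB support the invalidation options are derived
 * from the gmap ASCE of the mm. IPTE_NODAT is used when no gmap ASCE is set
 * up (or the caller requested it), and IPTE_GUEST_ASCE passes the gmap ASCE
 * (falling back to the mm ASCE) unless it was invalidated with -1UL.
 */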
static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (MACHINE_HAS_TLB_GUEST) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
	}
}

static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (MACHINE_HAS_TLB_GUEST) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	}
}

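/*
 * ptep_flush_direct() invalidates the PTE and its TLB entries immediately,
 * using a CPU-local IPTE when local TLB clearing is available and only the
 * current CPU has used the mm, and a global IPTE otherwise.
 * ptep_flush_lazy() below avoids the hardware flush when only the current
 * CPU has the mm attached: it merely marks the PTE invalid and sets
 * flush_mm so the TLB gets flushed later.
 */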
static inline pte_t ptep_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep,
				      int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		ptep_ipte_local(mm, addr, ptep, nodat);
	else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		pte_val(*ptep) |= _PAGE_INVALID;
		mm->context.flush_mm = 1;
	} else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

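/*
 * The PGSTE of a PTE lives at ptep + PTRS_PER_PTE. pgste_get_lock() acquires
 * the PGSTE lock by setting the PCL bit with a compare-and-swap (csg) loop
 * and returns the locked PGSTE value; pgste_set_unlock() clears the PCL bit
 * again while storing the updated value back.
 */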
static inline pgste_t pgste_get_lock(pte_t *ptep)
{
	unsigned long new = 0;
#ifdef CONFIG_PGSTE
	unsigned long old;

	asm(
		"	lg	%0,%2\n"
		"0:	lgr	%1,%0\n"
		"	nihh	%0,0xff7f\n"	/* clear PCL bit in old */
		"	oihh	%1,0x0080\n"	/* set PCL bit in new */
		"	csg	%0,%1,%2\n"
		"	jl	0b\n"
		: "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
		: "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory");
#endif
	return __pgste(new);
}

static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	asm(
		"	nihh	%1,0xff7f\n"	/* clear PCL bit */
		"	stg	%1,%0\n"
		: "=Q" (ptep[PTRS_PER_PTE])
		: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
		: "cc", "memory");
#endif
}

static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;
#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}

static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}

static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste_val(pgste) |= bits << 48;		/* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
#endif
	return pgste;
}

static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}

static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!MACHINE_HAS_ESOP) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			pte_val(entry) |= _PAGE_DIRTY;
			pte_val(entry) &= ~_PAGE_PROTECT;
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste_val(pgste) |= PGSTE_UC_BIT;
	}
#endif
	*ptep = entry;
	return pgste;
}

static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste_val(pgste) ^= bits;
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}

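/*
 * ptep_xchg_start()/ptep_xchg_commit() bracket a PTE exchange for mms with
 * PGSTEs: start takes the PGSTE lock and delivers any pending notification
 * bits, commit propagates storage key and usage information between PTE and
 * PGSTE, installs the new PTE and drops the lock. Without PGSTEs the new
 * PTE is simply stored.
 */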
static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}

static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm);
		if (pte_val(new) & _PAGE_INVALID) {
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				pte_val(old) |= _PAGE_UNUSED;
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		*ptep = new;
	}
	return old;
}

pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_direct(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);

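/*
 * Typical callers (see asm/pgtable.h): ptep_clear_flush() is expected to
 * come down to ptep_xchg_direct(mm, addr, ptep, __pte(_PAGE_INVALID)),
 * while ptep_get_and_clear() can defer the flush via ptep_xchg_lazy()
 * below.
 */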
pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);

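/*
 * ptep_modify_prot_start() disables preemption and, for mms with PGSTEs,
 * leaves the PGSTE locked; the matching ptep_modify_prot_commit() installs
 * the new PTE, unlocks the PGSTE and re-enables preemption. The two must
 * therefore always be used as a pair.
 */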
pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	pgste_t pgste;
	pte_t old;
	int nodat;
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	if (mm_has_pgste(mm)) {
		pgste = pgste_update_all(old, pgste, mm);
		pgste_set(ptep, pgste);
	}
	return old;
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	pgste_t pgste;
	struct mm_struct *mm = vma->vm_mm;

	if (!MACHINE_HAS_NX)
		pte_val(pte) &= ~_PAGE_NOEXEC;
	if (mm_has_pgste(mm)) {
		pgste = pgste_get(ptep);
		pgste_set_key(ptep, pgste, pte, mm);
		pgste = pgste_set_pte(ptep, pgste, pte);
		pgste_set_unlock(ptep, pgste);
	} else {
		*ptep = pte;
	}
	preempt_enable();
}

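/*
 * The pmd variants mirror the pte IPTE helpers but use IDTE (falling back
 * to CSP when IDTE is not available) and additionally notify the
 * guest-mapping (gmap) code when the mm has PGSTEs and 1M huge pages are
 * allowed for the gmap, so the gmap tables stay in sync.
 */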
static inline void pmdp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pmd_t *pmdp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
		gmap_pmdp_idte_local(mm, addr);
}

static inline void pmdp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	if (MACHINE_HAS_TLB_GUEST) {
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else if (MACHINE_HAS_IDTE) {
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else {
		__pmdp_csp(pmdp);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_csp(mm, addr);
	}
}

static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pmdp_idte_local(mm, addr, pmdp);
	else
		pmdp_idte_global(mm, addr, pmdp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
		mm->context.flush_mm = 1;
		if (mm_has_pgste(mm))
			gmap_pmdp_invalidate(mm, addr);
	} else {
		pmdp_idte_global(mm, addr, pmdp);
	}
	atomic_dec(&mm->context.flush_count);
	return old;
}

#ifdef CONFIG_PGSTE
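/*
 * Walk (and, if necessary, allocate) the page table levels down to the pmd
 * for the given address. Returns NULL if one of the levels could not be
 * allocated.
 */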
static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	return pmd;
}
#endif

pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_direct(mm, addr, pmdp);
	*pmdp = new;
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);

pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_lazy(mm, addr, pmdp);
	*pmdp = new;
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);

static inline void pudp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pud_t *pudp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
}

static inline void pudp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pud_t *pudp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
	else if (MACHINE_HAS_IDTE)
		__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
	else
		/*
		 * Invalid bit position is the same for pmd and pud, so we can
		 * re-use __pmdp_csp() here
		 */
		__pmdp_csp((pmd_t *) pudp);
}

static inline pud_t pudp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pud_t *pudp)
{
	pud_t old;

	old = *pudp;
	if (pud_val(old) & _REGION_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pudp_idte_local(mm, addr, pudp);
	else
		pudp_idte_global(mm, addr, pudp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pud_t *pudp, pud_t new)
{
	pud_t old;

	preempt_disable();
	old = pudp_flush_direct(mm, addr, pudp);
	*pudp = new;
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
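/*
 * Deposit/withdraw of pre-allocated page tables for THP: the spare page
 * table pages are kept on a list anchored at pmd_huge_pte() so they can be
 * handed back by pgtable_trans_huge_withdraw(), e.g. when a huge pmd is
 * split again. Both operations require the pmd lock to be held.
 */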
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_PGSTE
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) |= PGSTE_IN_BIT;
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pgste = pgste_update_all(entry, pgste, mm);
		pte_val(entry) |= _PAGE_INVALID;
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pte_val(entry) &= ~_PAGE_INVALID;
		pte_val(entry) |= _PAGE_PROTECT;
	}
	pgste_val(pgste) |= bit;
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

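/*
 * ptep_shadow_pte() creates a shadow (VSIE) PTE: the target PTE takes over
 * the page frame of the source PTE, the protection bit is combined from the
 * source and the requested pte, and the source PGSTE is marked with
 * PGSTE_VSIE_BIT so later invalidations are propagated. Returns 1 if a
 * shadow PTE was written, 0 if one was already present and -EAGAIN if the
 * source PTE is not usable.
 */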
int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0;	/* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		pgste_val(spgste) |= PGSTE_VSIE_BIT;
		tpgste = pgste_get_lock(tptep);
		pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
				(pte_val(pte) & _PAGE_PROTECT);
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}

void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}

static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		dec_mm_counter(mm, mm_counter(page));
	}
	free_swap_and_cache(entry);
}

void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
	ptev = pte_val(*ptep);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/*
 * Test and reset if a guest page is dirty
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste_val(pgste) &= ~PGSTE_UC_BIT;
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
			pte_val(pte) |= _PAGE_PROTECT;
		else
			pte_val(pte) |= _PAGE_INVALID;
		*ptep = pte;
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);

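/*
 * set_guest_storage_key() sets the storage key the guest sees for the page
 * backing @addr: for large pmds the key is set directly on the backing page
 * frame, otherwise the key and the guest R/C state are kept in the PGSTE
 * and the host storage key is updated accordingly.
 */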
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	pmdp = pmd_alloc_map(mm, addr);
	if (unlikely(!pmdp))
		return -EFAULT;

	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return -EFAULT;
	}

	if (pmd_large(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations, they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

/**
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
 * storage key was updated and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);

/**
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	pmdp = pmd_alloc_map(mm, addr);
	if (unlikely(!pmdp))
		return -EFAULT;

	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return -EFAULT;
	}

	if (pmd_large(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;

	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	pgste_val(new) &= ~PGSTE_GR_BIT;

	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);

int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	pmdp = pmd_alloc_map(mm, addr);
	if (unlikely(!pmdp))
		return -EFAULT;

	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		/* Not yet mapped memory has a zero key */
		spin_unlock(ptl);
		*key = 0;
		return 0;
	}

	if (pmd_large(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;

	pgste = pgste_get_lock(ptep);
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);

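/*
 * The PGSTE accessors below (pgste_perform_essa, set_pgste_bits, get_pgste)
 * look up the VMA and PTE themselves; since they rely on find_vma() and
 * get_locked_pte(), callers are expected to hold mmap_lock.
 */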
/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 *	   or < 0 in case of error. -EINVAL is returned for invalid values
 *	   of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
			unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;

	vma = find_vma(mm, hva);
	if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste_val(pgste) = pgstev;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);

/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
			unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	vma = find_vma(mm, hva);
	if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);

	pgste_val(new) &= ~bits;
	pgste_val(new) |= value & bits;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);

/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	vma = find_vma(mm, hva);
	if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif