xref: /kernel/linux/linux-5.10/fs/proc/task_mmu.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/pagewalk.h>
3#include <linux/vmacache.h>
4#include <linux/mm_inline.h>
5#include <linux/hugetlb.h>
6#include <linux/huge_mm.h>
7#include <linux/mount.h>
8#include <linux/seq_file.h>
9#include <linux/highmem.h>
10#include <linux/ptrace.h>
11#include <linux/slab.h>
12#include <linux/pagemap.h>
13#include <linux/mempolicy.h>
14#include <linux/rmap.h>
15#include <linux/swap.h>
16#include <linux/sched/mm.h>
17#include <linux/swapops.h>
18#include <linux/mmu_notifier.h>
19#include <linux/page_idle.h>
20#include <linux/shmem_fs.h>
21#include <linux/uaccess.h>
22#include <linux/pkeys.h>
23#ifdef CONFIG_MEM_PURGEABLE
24#include <linux/mm_purgeable.h>
25#endif
26
27#include <asm/elf.h>
28#include <asm/tlb.h>
29#include <asm/tlbflush.h>
30#include "internal.h"
31#include <linux/hck/lite_hck_hideaddr.h>
32
33#define SEQ_PUT_DEC(str, val) \
34		seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
35void task_mem(struct seq_file *m, struct mm_struct *mm)
36{
37	unsigned long text, lib, swap, anon, file, shmem;
38	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
39#ifdef CONFIG_MEM_PURGEABLE
40	unsigned long nr_purg_sum = 0, nr_purg_pin = 0;
41
42	mm_purg_pages_info(mm, &nr_purg_sum, &nr_purg_pin);
43#endif
44
45	anon = get_mm_counter(mm, MM_ANONPAGES);
46	file = get_mm_counter(mm, MM_FILEPAGES);
47	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
48
49	/*
50	 * Note: to minimize their overhead, mm maintains hiwater_vm and
51	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
52	 * collector of these hiwater stats must therefore get total_vm
53	 * and rss too, which will usually be the higher.  Barriers? Not
54	 * worth the effort; such snapshots can always be inconsistent.
55	 */
56	hiwater_vm = total_vm = mm->total_vm;
57	if (hiwater_vm < mm->hiwater_vm)
58		hiwater_vm = mm->hiwater_vm;
59	hiwater_rss = total_rss = anon + file + shmem;
60	if (hiwater_rss < mm->hiwater_rss)
61		hiwater_rss = mm->hiwater_rss;
62
63	/* split executable areas between text and lib */
64	text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
65	text = min(text, mm->exec_vm << PAGE_SHIFT);
66	lib = (mm->exec_vm << PAGE_SHIFT) - text;
67
68	swap = get_mm_counter(mm, MM_SWAPENTS);
69	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
70	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
71	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
72	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
73	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
74	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
75	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
76	SEQ_PUT_DEC(" kB\nRssFile:\t", file);
77	SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
78	SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
79	SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
80	seq_put_decimal_ull_width(m,
81		    " kB\nVmExe:\t", text >> 10, 8);
82	seq_put_decimal_ull_width(m,
83		    " kB\nVmLib:\t", lib >> 10, 8);
84	seq_put_decimal_ull_width(m,
85		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
86	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
87#ifdef CONFIG_MEM_PURGEABLE
88	SEQ_PUT_DEC(" kB\nPurgSum:\t", nr_purg_sum);
89	SEQ_PUT_DEC(" kB\nPurgPin:\t", nr_purg_pin);
90#endif
91	seq_puts(m, " kB\n");
92	hugetlb_report_usage(m, mm);
93}
94#undef SEQ_PUT_DEC
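
/*
 * Worked example (illustrative only, assuming 4K pages): SEQ_PUT_DEC above
 * converts page counts to kB via (val) << (PAGE_SHIFT - 10).  With
 * PAGE_SHIFT == 12 that is a shift by 2, i.e. pages * 4:
 *
 *	2560 pages << (12 - 10) = 10240, printed as "VmRSS:\t   10240 kB"
 */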
95
96unsigned long task_vsize(struct mm_struct *mm)
97{
98	return PAGE_SIZE * mm->total_vm;
99}
100
101unsigned long task_statm(struct mm_struct *mm,
102			 unsigned long *shared, unsigned long *text,
103			 unsigned long *data, unsigned long *resident)
104{
105	*shared = get_mm_counter(mm, MM_FILEPAGES) +
106			get_mm_counter(mm, MM_SHMEMPAGES);
107	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
108								>> PAGE_SHIFT;
109	*data = mm->data_vm + mm->stack_vm;
110	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
111	return mm->total_vm;
112}
113
114#ifdef CONFIG_NUMA
115/*
116 * Save get_task_policy() for show_numa_map().
117 */
118static void hold_task_mempolicy(struct proc_maps_private *priv)
119{
120	struct task_struct *task = priv->task;
121
122	task_lock(task);
123	priv->task_mempolicy = get_task_policy(task);
124	mpol_get(priv->task_mempolicy);
125	task_unlock(task);
126}
127static void release_task_mempolicy(struct proc_maps_private *priv)
128{
129	mpol_put(priv->task_mempolicy);
130}
131#else
132static void hold_task_mempolicy(struct proc_maps_private *priv)
133{
134}
135static void release_task_mempolicy(struct proc_maps_private *priv)
136{
137}
138#endif
139
140static void *m_start(struct seq_file *m, loff_t *ppos)
141{
142	struct proc_maps_private *priv = m->private;
143	unsigned long last_addr = *ppos;
144	struct mm_struct *mm;
145	struct vm_area_struct *vma;
146
147	/* See m_next(). Zero at the start or after lseek. */
148	if (last_addr == -1UL)
149		return NULL;
150
151	priv->task = get_proc_task(priv->inode);
152	if (!priv->task)
153		return ERR_PTR(-ESRCH);
154
155	mm = priv->mm;
156	if (!mm || !mmget_not_zero(mm)) {
157		put_task_struct(priv->task);
158		priv->task = NULL;
159		return NULL;
160	}
161
162	if (mmap_read_lock_killable(mm)) {
163		mmput(mm);
164		put_task_struct(priv->task);
165		priv->task = NULL;
166		return ERR_PTR(-EINTR);
167	}
168
169	hold_task_mempolicy(priv);
170	priv->tail_vma = get_gate_vma(mm);
171
172	vma = find_vma(mm, last_addr);
173	if (vma)
174		return vma;
175
176	return priv->tail_vma;
177}
178
179static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
180{
181	struct proc_maps_private *priv = m->private;
182	struct vm_area_struct *next, *vma = v;
183
184	if (vma == priv->tail_vma)
185		next = NULL;
186	else if (vma->vm_next)
187		next = vma->vm_next;
188	else
189		next = priv->tail_vma;
190
191	*ppos = next ? next->vm_start : -1UL;
192
193	return next;
194}
195
196static void m_stop(struct seq_file *m, void *v)
197{
198	struct proc_maps_private *priv = m->private;
199	struct mm_struct *mm = priv->mm;
200
201	if (!priv->task)
202		return;
203
204	release_task_mempolicy(priv);
205	mmap_read_unlock(mm);
206	mmput(mm);
207	put_task_struct(priv->task);
208	priv->task = NULL;
209}
210
211static int proc_maps_open(struct inode *inode, struct file *file,
212			const struct seq_operations *ops, int psize)
213{
214	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
215
216	if (!priv)
217		return -ENOMEM;
218
219	priv->inode = inode;
220	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
221	if (IS_ERR(priv->mm)) {
222		int err = PTR_ERR(priv->mm);
223
224		seq_release_private(inode, file);
225		return err;
226	}
227
228	return 0;
229}
230
231static int proc_map_release(struct inode *inode, struct file *file)
232{
233	struct seq_file *seq = file->private_data;
234	struct proc_maps_private *priv = seq->private;
235
236	if (priv->mm)
237		mmdrop(priv->mm);
238
239	return seq_release_private(inode, file);
240}
241
242static int do_maps_open(struct inode *inode, struct file *file,
243			const struct seq_operations *ops)
244{
245	return proc_maps_open(inode, file, ops,
246				sizeof(struct proc_maps_private));
247}
248
249/*
250 * Indicate if the VMA is a stack for the given task; for
251 * /proc/PID/maps that is the stack of the main task.
252 */
253static int is_stack(struct vm_area_struct *vma)
254{
255	/*
256	 * We make no effort to guess what a given thread considers to be
257	 * its "stack".  It's not even well-defined for programs written
258	 * in languages like Go.
259	 */
260	return vma->vm_start <= vma->vm_mm->start_stack &&
261		vma->vm_end >= vma->vm_mm->start_stack;
262}
263
264static void show_vma_header_prefix(struct seq_file *m,
265				   unsigned long start, unsigned long end,
266				   vm_flags_t flags, unsigned long long pgoff,
267				   dev_t dev, unsigned long ino)
268{
269	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
270	seq_put_hex_ll(m, NULL, start, 8);
271	seq_put_hex_ll(m, "-", end, 8);
272	seq_putc(m, ' ');
273	seq_putc(m, flags & VM_READ ? 'r' : '-');
274	seq_putc(m, flags & VM_WRITE ? 'w' : '-');
275	seq_putc(m, flags & VM_EXEC ? 'x' : '-');
276	seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
277	seq_put_hex_ll(m, " ", pgoff, 8);
278	seq_put_hex_ll(m, " ", MAJOR(dev), 2);
279	seq_put_hex_ll(m, ":", MINOR(dev), 2);
280	seq_put_decimal_ull(m, " ", ino);
281	seq_putc(m, ' ');
282}
283
284static void
285show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
286{
287	struct mm_struct *mm = vma->vm_mm;
288	struct file *file = vma->vm_file;
289	vm_flags_t flags = vma->vm_flags;
290	unsigned long ino = 0;
291	unsigned long long pgoff = 0;
292	unsigned long start, end;
293	dev_t dev = 0;
294	const char *name = NULL;
295
296	if (file) {
297		struct inode *inode = file_inode(vma->vm_file);
298		dev = inode->i_sb->s_dev;
299		ino = inode->i_ino;
300		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
301	}
302
303	start = vma->vm_start;
304	end = vma->vm_end;
305	CALL_HCK_LITE_HOOK(hideaddr_header_prefix_lhck, &start, &end, &flags, m, vma);
306	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
307
308	/*
309	 * Print the dentry name for named mappings, and a
310	 * special [heap] marker for the heap:
311	 */
312	if (file) {
313		seq_pad(m, ' ');
314		seq_file_path(m, file, "\n");
315		goto done;
316	}
317
318	if (vma->vm_ops && vma->vm_ops->name) {
319		name = vma->vm_ops->name(vma);
320		if (name)
321			goto done;
322	}
323
324	name = arch_vma_name(vma);
325	if (!name) {
326		struct anon_vma_name *anon_name;
327
328		if (!mm) {
329			name = "[vdso]";
330			goto done;
331		}
332
333		if (vma->vm_start <= mm->brk &&
334		    vma->vm_end >= mm->start_brk) {
335			name = "[heap]";
336			goto done;
337		}
338
339		if (is_stack(vma)) {
340			name = "[stack]";
341			goto done;
342		}
343
344		anon_name = anon_vma_name(vma);
345		if (anon_name) {
346			seq_pad(m, ' ');
347			seq_printf(m, "[anon:%s]", anon_name->name);
348		}
349	}
350
351done:
352	if (name) {
353		seq_pad(m, ' ');
354		seq_puts(m, name);
355	}
356	seq_putc(m, '\n');
357}
358
359static int show_map(struct seq_file *m, void *v)
360{
361	show_map_vma(m, v);
362	return 0;
363}
364
365static const struct seq_operations proc_pid_maps_op = {
366	.start	= m_start,
367	.next	= m_next,
368	.stop	= m_stop,
369	.show	= show_map
370};
371
372static int pid_maps_open(struct inode *inode, struct file *file)
373{
374	return do_maps_open(inode, file, &proc_pid_maps_op);
375}
376
377const struct file_operations proc_pid_maps_operations = {
378	.open		= pid_maps_open,
379	.read		= seq_read,
380	.llseek		= seq_lseek,
381	.release	= proc_map_release,
382};
383
384/*
385 * Proportional Set Size (PSS): my share of RSS.
386 *
387 * PSS of a process is the count of pages it has in memory, where each
388 * page is divided by the number of processes sharing it.  So if a
389 * process has 1000 pages all to itself, and 1000 shared with one other
390 * process, its PSS will be 1500.
391 *
392 * To keep accumulated division errors low, we adopt a 64-bit
393 * fixed-point pss counter, so (pss >> PSS_SHIFT) is the real
394 * byte count.
395 *
396 * A shift of 12 before division means (assuming 4K page size):
397 * 	- 1M 3-user-pages add up to 8KB errors;
398 * 	- supports mapcount up to 2^24, or 16M;
399 * 	- supports PSS up to 2^52 bytes, or 4PB.
400 */
401#define PSS_SHIFT 12
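
/*
 * Illustrative sketch (not kernel code): how the fixed-point accounting
 * described above works out for the example in the comment, 1000 pages
 * mapped only by this process plus 1000 pages shared with one other
 * process.  The loop and variables are hypothetical.
 *
 *	u64 pss = 0;
 *	int i;
 *
 *	for (i = 0; i < 1000; i++)	// private, mapcount == 1
 *		pss += (u64)PAGE_SIZE << PSS_SHIFT;
 *	for (i = 0; i < 1000; i++)	// shared, mapcount == 2
 *		pss += ((u64)PAGE_SIZE << PSS_SHIFT) / 2;
 *
 *	// pss >> PSS_SHIFT == 1500 * PAGE_SIZE, i.e. the 1500-page PSS
 *	// from the comment (6000 kB with 4K pages).
 */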
402
403#ifdef CONFIG_PROC_PAGE_MONITOR
404struct mem_size_stats {
405	unsigned long resident;
406	unsigned long shared_clean;
407	unsigned long shared_dirty;
408	unsigned long private_clean;
409	unsigned long private_dirty;
410	unsigned long referenced;
411	unsigned long anonymous;
412	unsigned long lazyfree;
413	unsigned long anonymous_thp;
414	unsigned long shmem_thp;
415	unsigned long file_thp;
416	unsigned long swap;
417	unsigned long shared_hugetlb;
418	unsigned long private_hugetlb;
419	u64 pss;
420	u64 pss_anon;
421	u64 pss_file;
422	u64 pss_shmem;
423	u64 pss_locked;
424	u64 swap_pss;
425	bool check_shmem_swap;
426};
427
428static void smaps_page_accumulate(struct mem_size_stats *mss,
429		struct page *page, unsigned long size, unsigned long pss,
430		bool dirty, bool locked, bool private)
431{
432	mss->pss += pss;
433
434	if (PageAnon(page))
435		mss->pss_anon += pss;
436	else if (PageSwapBacked(page))
437		mss->pss_shmem += pss;
438	else
439		mss->pss_file += pss;
440
441	if (locked)
442		mss->pss_locked += pss;
443
444	if (dirty || PageDirty(page)) {
445		if (private)
446			mss->private_dirty += size;
447		else
448			mss->shared_dirty += size;
449	} else {
450		if (private)
451			mss->private_clean += size;
452		else
453			mss->shared_clean += size;
454	}
455}
456
457static void smaps_account(struct mem_size_stats *mss, struct page *page,
458		bool compound, bool young, bool dirty, bool locked,
459		bool migration)
460{
461	int i, nr = compound ? compound_nr(page) : 1;
462	unsigned long size = nr * PAGE_SIZE;
463
464	/*
465	 * First accumulate quantities that depend only on |size| and the type
466	 * of the compound page.
467	 */
468	if (PageAnon(page)) {
469		mss->anonymous += size;
470		if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
471			mss->lazyfree += size;
472	}
473
474	mss->resident += size;
475	/* Accumulate the size in pages that have been accessed. */
476	if (young || page_is_young(page) || PageReferenced(page))
477		mss->referenced += size;
478
479	/*
480	 * Then accumulate quantities that may depend on sharing, or that may
481	 * differ page-by-page.
482	 *
483	 * page_count(page) == 1 guarantees the page is mapped exactly once.
484	 * If any subpage of the compound page is mapped via a PTE, it
485	 * would elevate page_count().
486	 *
487	 * The page_mapcount() is called to get a snapshot of the mapcount.
488	 * Without holding the page lock this snapshot can be slightly wrong as
489	 * we cannot always read the mapcount atomically.  It is not safe to
490	 * call page_mapcount() even with PTL held if the page is not mapped,
491	 * especially for migration entries.  Treat regular migration entries
492	 * as mapcount == 1.
493	 */
494	if ((page_count(page) == 1) || migration) {
495		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
496			locked, true);
497		return;
498	}
499	for (i = 0; i < nr; i++, page++) {
500		int mapcount = page_mapcount(page);
501		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
502		if (mapcount >= 2)
503			pss /= mapcount;
504		smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
505				      mapcount < 2);
506	}
507}
508
509#ifdef CONFIG_SHMEM
510static int smaps_pte_hole(unsigned long addr, unsigned long end,
511			  __always_unused int depth, struct mm_walk *walk)
512{
513	struct mem_size_stats *mss = walk->private;
514
515	mss->swap += shmem_partial_swap_usage(
516			walk->vma->vm_file->f_mapping, addr, end);
517
518	return 0;
519}
520#else
521#define smaps_pte_hole		NULL
522#endif /* CONFIG_SHMEM */
523
524static void smaps_pte_entry(pte_t *pte, unsigned long addr,
525		struct mm_walk *walk)
526{
527	struct mem_size_stats *mss = walk->private;
528	struct vm_area_struct *vma = walk->vma;
529	bool locked = !!(vma->vm_flags & VM_LOCKED);
530	struct page *page = NULL;
531	bool migration = false, young = false, dirty = false;
532
533	if (pte_present(*pte)) {
534		page = vm_normal_page(vma, addr, *pte);
535		young = pte_young(*pte);
536		dirty = pte_dirty(*pte);
537	} else if (is_swap_pte(*pte)) {
538		swp_entry_t swpent = pte_to_swp_entry(*pte);
539
540		if (!non_swap_entry(swpent)) {
541			int mapcount;
542
543			mss->swap += PAGE_SIZE;
544			mapcount = swp_swapcount(swpent);
545			if (mapcount >= 2) {
546				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
547
548				do_div(pss_delta, mapcount);
549				mss->swap_pss += pss_delta;
550			} else {
551				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
552			}
553		} else if (is_migration_entry(swpent)) {
554			migration = true;
555			page = migration_entry_to_page(swpent);
556		} else if (is_device_private_entry(swpent))
557			page = device_private_entry_to_page(swpent);
558	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
559							&& pte_none(*pte))) {
560		page = xa_load(&vma->vm_file->f_mapping->i_pages,
561						linear_page_index(vma, addr));
562		if (xa_is_value(page))
563			mss->swap += PAGE_SIZE;
564		return;
565	}
566
567	if (!page)
568		return;
569
570	smaps_account(mss, page, false, young, dirty, locked, migration);
571}
572
573#ifdef CONFIG_TRANSPARENT_HUGEPAGE
574static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
575		struct mm_walk *walk)
576{
577	struct mem_size_stats *mss = walk->private;
578	struct vm_area_struct *vma = walk->vma;
579	bool locked = !!(vma->vm_flags & VM_LOCKED);
580	struct page *page = NULL;
581	bool migration = false;
582
583	if (pmd_present(*pmd)) {
584		/* FOLL_DUMP will return -EFAULT on huge zero page */
585		page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
586	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
587		swp_entry_t entry = pmd_to_swp_entry(*pmd);
588
589		if (is_migration_entry(entry)) {
590			migration = true;
591			page = migration_entry_to_page(entry);
592		}
593	}
594	if (IS_ERR_OR_NULL(page))
595		return;
596	if (PageAnon(page))
597		mss->anonymous_thp += HPAGE_PMD_SIZE;
598	else if (PageSwapBacked(page))
599		mss->shmem_thp += HPAGE_PMD_SIZE;
600	else if (is_zone_device_page(page))
601		/* pass */;
602	else
603		mss->file_thp += HPAGE_PMD_SIZE;
604
605	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
606		      locked, migration);
607}
608#else
609static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
610		struct mm_walk *walk)
611{
612}
613#endif
614
615static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
616			   struct mm_walk *walk)
617{
618	struct vm_area_struct *vma = walk->vma;
619	pte_t *pte;
620	spinlock_t *ptl;
621
622	ptl = pmd_trans_huge_lock(pmd, vma);
623	if (ptl) {
624		smaps_pmd_entry(pmd, addr, walk);
625		spin_unlock(ptl);
626		goto out;
627	}
628
629	if (pmd_trans_unstable(pmd))
630		goto out;
631	/*
632	 * The mmap_lock held all the way back in m_start() is what
633	 * keeps khugepaged out of here and from collapsing things
634	 * in here.
635	 */
636	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
637	for (; addr != end; pte++, addr += PAGE_SIZE)
638		smaps_pte_entry(pte, addr, walk);
639	pte_unmap_unlock(pte - 1, ptl);
640out:
641	cond_resched();
642	return 0;
643}
644
645static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
646{
647	/*
648	 * Don't forget to update Documentation/ on changes.
649	 */
650	static const char mnemonics[BITS_PER_LONG][2] = {
651		/*
652		 * In case we meet a flag we don't know about.
653		 */
654		[0 ... (BITS_PER_LONG-1)] = "??",
655
656		[ilog2(VM_READ)]	= "rd",
657		[ilog2(VM_WRITE)]	= "wr",
658		[ilog2(VM_EXEC)]	= "ex",
659		[ilog2(VM_SHARED)]	= "sh",
660		[ilog2(VM_MAYREAD)]	= "mr",
661		[ilog2(VM_MAYWRITE)]	= "mw",
662		[ilog2(VM_MAYEXEC)]	= "me",
663		[ilog2(VM_MAYSHARE)]	= "ms",
664		[ilog2(VM_GROWSDOWN)]	= "gd",
665		[ilog2(VM_PFNMAP)]	= "pf",
666		[ilog2(VM_DENYWRITE)]	= "dw",
667		[ilog2(VM_LOCKED)]	= "lo",
668		[ilog2(VM_IO)]		= "io",
669		[ilog2(VM_SEQ_READ)]	= "sr",
670		[ilog2(VM_RAND_READ)]	= "rr",
671		[ilog2(VM_DONTCOPY)]	= "dc",
672		[ilog2(VM_DONTEXPAND)]	= "de",
673		[ilog2(VM_ACCOUNT)]	= "ac",
674		[ilog2(VM_NORESERVE)]	= "nr",
675		[ilog2(VM_HUGETLB)]	= "ht",
676		[ilog2(VM_SYNC)]	= "sf",
677		[ilog2(VM_ARCH_1)]	= "ar",
678		[ilog2(VM_WIPEONFORK)]	= "wf",
679		[ilog2(VM_DONTDUMP)]	= "dd",
680#ifdef CONFIG_ARM64_BTI
681		[ilog2(VM_ARM64_BTI)]	= "bt",
682#endif
683#ifdef CONFIG_MEM_SOFT_DIRTY
684		[ilog2(VM_SOFTDIRTY)]	= "sd",
685#endif
686		[ilog2(VM_MIXEDMAP)]	= "mm",
687		[ilog2(VM_HUGEPAGE)]	= "hg",
688		[ilog2(VM_NOHUGEPAGE)]	= "nh",
689		[ilog2(VM_MERGEABLE)]	= "mg",
690		[ilog2(VM_UFFD_MISSING)]= "um",
691		[ilog2(VM_UFFD_WP)]	= "uw",
692#ifdef CONFIG_ARM64_MTE
693		[ilog2(VM_MTE)]		= "mt",
694		[ilog2(VM_MTE_ALLOWED)]	= "",
695#endif
696#ifdef CONFIG_ARCH_HAS_PKEYS
697		/* These come out via ProtectionKey: */
698		[ilog2(VM_PKEY_BIT0)]	= "",
699		[ilog2(VM_PKEY_BIT1)]	= "",
700		[ilog2(VM_PKEY_BIT2)]	= "",
701		[ilog2(VM_PKEY_BIT3)]	= "",
702#if VM_PKEY_BIT4
703		[ilog2(VM_PKEY_BIT4)]	= "",
704#endif
705#endif /* CONFIG_ARCH_HAS_PKEYS */
706	};
707	size_t i;
708
709	seq_puts(m, "VmFlags: ");
710	for (i = 0; i < BITS_PER_LONG; i++) {
711		if (!mnemonics[i][0])
712			continue;
713		if (vma->vm_flags & (1UL << i)) {
714			seq_putc(m, mnemonics[i][0]);
715			seq_putc(m, mnemonics[i][1]);
716			seq_putc(m, ' ');
717		}
718	}
719	seq_putc(m, '\n');
720}
721
722#ifdef CONFIG_HUGETLB_PAGE
723static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
724				 unsigned long addr, unsigned long end,
725				 struct mm_walk *walk)
726{
727	struct mem_size_stats *mss = walk->private;
728	struct vm_area_struct *vma = walk->vma;
729	struct page *page = NULL;
730
731	if (pte_present(*pte)) {
732		page = vm_normal_page(vma, addr, *pte);
733	} else if (is_swap_pte(*pte)) {
734		swp_entry_t swpent = pte_to_swp_entry(*pte);
735
736		if (is_migration_entry(swpent))
737			page = migration_entry_to_page(swpent);
738		else if (is_device_private_entry(swpent))
739			page = device_private_entry_to_page(swpent);
740	}
741	if (page) {
742		if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
743			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
744		else
745			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
746	}
747	return 0;
748}
749#else
750#define smaps_hugetlb_range	NULL
751#endif /* CONFIG_HUGETLB_PAGE */
752
753static const struct mm_walk_ops smaps_walk_ops = {
754	.pmd_entry		= smaps_pte_range,
755	.hugetlb_entry		= smaps_hugetlb_range,
756};
757
758static const struct mm_walk_ops smaps_shmem_walk_ops = {
759	.pmd_entry		= smaps_pte_range,
760	.hugetlb_entry		= smaps_hugetlb_range,
761	.pte_hole		= smaps_pte_hole,
762};
763
764/*
765 * Gather mem stats from @vma with the indicated beginning
766 * address @start, and keep them in @mss.
767 *
768 * Use vm_start of @vma as the beginning address if @start is 0.
769 */
770static void smap_gather_stats(struct vm_area_struct *vma,
771		struct mem_size_stats *mss, unsigned long start)
772{
773	const struct mm_walk_ops *ops = &smaps_walk_ops;
774
775	/* Invalid start */
776	if (start >= vma->vm_end)
777		return;
778
779#ifdef CONFIG_SHMEM
780	/* In case of smaps_rollup, reset the value from previous vma */
781	mss->check_shmem_swap = false;
782	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
783		/*
784		 * For shared or readonly shmem mappings we know that all
785		 * swapped out pages belong to the shmem object, and we can
786		 * obtain the swap value much more efficiently. For private
787		 * writable mappings, we might have COW pages that are
788		 * not affected by the parent swapped out pages of the shmem
789		 * object, so we have to distinguish them during the page walk.
790		 * Unless we know that the shmem object (or the part mapped by
791		 * our VMA) has no swapped out pages at all.
792		 */
793		unsigned long shmem_swapped = shmem_swap_usage(vma);
794
795		if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
796					!(vma->vm_flags & VM_WRITE))) {
797			mss->swap += shmem_swapped;
798		} else {
799			mss->check_shmem_swap = true;
800			ops = &smaps_shmem_walk_ops;
801		}
802	}
803#endif
804	/* mmap_lock is held in m_start */
805	if (!start)
806		walk_page_vma(vma, ops, mss);
807	else
808		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
809}
810
811#define SEQ_PUT_DEC(str, val) \
812		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
813
814/* Show the contents common to smaps and smaps_rollup */
815static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
816	bool rollup_mode)
817{
818	SEQ_PUT_DEC("Rss:            ", mss->resident);
819	SEQ_PUT_DEC(" kB\nPss:            ", mss->pss >> PSS_SHIFT);
820	if (rollup_mode) {
821		/*
822		 * These are meaningful only for smaps_rollup, otherwise two of
823		 * them are zero, and the other one is the same as Pss.
824		 */
825		SEQ_PUT_DEC(" kB\nPss_Anon:       ",
826			mss->pss_anon >> PSS_SHIFT);
827		SEQ_PUT_DEC(" kB\nPss_File:       ",
828			mss->pss_file >> PSS_SHIFT);
829		SEQ_PUT_DEC(" kB\nPss_Shmem:      ",
830			mss->pss_shmem >> PSS_SHIFT);
831	}
832	SEQ_PUT_DEC(" kB\nShared_Clean:   ", mss->shared_clean);
833	SEQ_PUT_DEC(" kB\nShared_Dirty:   ", mss->shared_dirty);
834	SEQ_PUT_DEC(" kB\nPrivate_Clean:  ", mss->private_clean);
835	SEQ_PUT_DEC(" kB\nPrivate_Dirty:  ", mss->private_dirty);
836	SEQ_PUT_DEC(" kB\nReferenced:     ", mss->referenced);
837	SEQ_PUT_DEC(" kB\nAnonymous:      ", mss->anonymous);
838	SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
839	SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
840	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
841	SEQ_PUT_DEC(" kB\nFilePmdMapped:  ", mss->file_thp);
842	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
843	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
844				  mss->private_hugetlb >> 10, 7);
845	SEQ_PUT_DEC(" kB\nSwap:           ", mss->swap);
846	SEQ_PUT_DEC(" kB\nSwapPss:        ",
847					mss->swap_pss >> PSS_SHIFT);
848	SEQ_PUT_DEC(" kB\nLocked:         ",
849					mss->pss_locked >> PSS_SHIFT);
850	seq_puts(m, " kB\n");
851}
852
853static int show_smap(struct seq_file *m, void *v)
854{
855	struct vm_area_struct *vma = v;
856	struct mem_size_stats mss;
857
858	memset(&mss, 0, sizeof(mss));
859
860	smap_gather_stats(vma, &mss, 0);
861
862	show_map_vma(m, vma);
863
864	SEQ_PUT_DEC("Size:           ", vma->vm_end - vma->vm_start);
865	SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
866	SEQ_PUT_DEC(" kB\nMMUPageSize:    ", vma_mmu_pagesize(vma));
867	seq_puts(m, " kB\n");
868
869	__show_smap(m, &mss, false);
870
871	seq_printf(m, "THPeligible:    %d\n",
872		   transparent_hugepage_active(vma));
873
874	if (arch_pkeys_enabled())
875		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
876	show_smap_vma_flags(m, vma);
877
878	return 0;
879}
880
881static int show_smaps_rollup(struct seq_file *m, void *v)
882{
883	struct proc_maps_private *priv = m->private;
884	struct mem_size_stats mss;
885	struct mm_struct *mm;
886	struct vm_area_struct *vma;
887	unsigned long last_vma_end = 0;
888	int ret = 0;
889
890	priv->task = get_proc_task(priv->inode);
891	if (!priv->task)
892		return -ESRCH;
893
894	mm = priv->mm;
895	if (!mm || !mmget_not_zero(mm)) {
896		ret = -ESRCH;
897		goto out_put_task;
898	}
899
900	memset(&mss, 0, sizeof(mss));
901
902	ret = mmap_read_lock_killable(mm);
903	if (ret)
904		goto out_put_mm;
905
906	hold_task_mempolicy(priv);
907
908	for (vma = priv->mm->mmap; vma;) {
909		smap_gather_stats(vma, &mss, 0);
910		last_vma_end = vma->vm_end;
911
912		/*
913		 * Release mmap_lock temporarily if someone else is waiting
914		 * to take it for a write.
915		 */
916		if (mmap_lock_is_contended(mm)) {
917			mmap_read_unlock(mm);
918			ret = mmap_read_lock_killable(mm);
919			if (ret) {
920				release_task_mempolicy(priv);
921				goto out_put_mm;
922			}
923
924			/*
925			 * After dropping the lock, there are four cases to
926			 * consider. See the following example for explanation.
927			 *
928			 *   +------+------+-----------+
929			 *   | VMA1 | VMA2 | VMA3      |
930			 *   +------+------+-----------+
931			 *   |      |      |           |
932			 *  4k     8k     16k         400k
933			 *
934			 * Suppose we drop the lock after reading VMA2 due to
935			 * contention, then we get:
936			 *
937			 *	last_vma_end = 16k
938			 *
939			 * 1) VMA2 is freed, but VMA3 exists:
940			 *
941			 *    find_vma(mm, 16k - 1) will return VMA3.
942			 *    In this case, just continue from VMA3.
943			 *
944			 * 2) VMA2 still exists:
945			 *
946			 *    find_vma(mm, 16k - 1) will return VMA2.
947			 *    Iterate the loop like the original one.
948			 *
949			 * 3) No more VMAs can be found:
950			 *
951			 *    find_vma(mm, 16k - 1) will return NULL.
952			 *    No more things to do, just break.
953			 *
954			 * 4) (last_vma_end - 1) is in the middle of a vma (VMA'):
955			 *
956			 *    find_vma(mm, 16k - 1) will return VMA' whose range
957			 *    contains last_vma_end.
958			 *    Iterate VMA' from last_vma_end.
959			 */
960			vma = find_vma(mm, last_vma_end - 1);
961			/* Case 3 above */
962			if (!vma)
963				break;
964
965			/* Case 1 above */
966			if (vma->vm_start >= last_vma_end)
967				continue;
968
969			/* Case 4 above */
970			if (vma->vm_end > last_vma_end)
971				smap_gather_stats(vma, &mss, last_vma_end);
972		}
973		/* Case 2 above */
974		vma = vma->vm_next;
975	}
976
977	show_vma_header_prefix(m, priv->mm->mmap ? priv->mm->mmap->vm_start : 0,
978			       last_vma_end, 0, 0, 0, 0);
979	seq_pad(m, ' ');
980	seq_puts(m, "[rollup]\n");
981
982	__show_smap(m, &mss, true);
983
984	release_task_mempolicy(priv);
985	mmap_read_unlock(mm);
986
987out_put_mm:
988	mmput(mm);
989out_put_task:
990	put_task_struct(priv->task);
991	priv->task = NULL;
992
993	return ret;
994}
995#undef SEQ_PUT_DEC
996
997static const struct seq_operations proc_pid_smaps_op = {
998	.start	= m_start,
999	.next	= m_next,
1000	.stop	= m_stop,
1001	.show	= show_smap
1002};
1003
1004static int pid_smaps_open(struct inode *inode, struct file *file)
1005{
1006	return do_maps_open(inode, file, &proc_pid_smaps_op);
1007}
1008
1009static int smaps_rollup_open(struct inode *inode, struct file *file)
1010{
1011	int ret;
1012	struct proc_maps_private *priv;
1013
1014	priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
1015	if (!priv)
1016		return -ENOMEM;
1017
1018	ret = single_open(file, show_smaps_rollup, priv);
1019	if (ret)
1020		goto out_free;
1021
1022	priv->inode = inode;
1023	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
1024	if (IS_ERR(priv->mm)) {
1025		ret = PTR_ERR(priv->mm);
1026
1027		single_release(inode, file);
1028		goto out_free;
1029	}
1030
1031	return 0;
1032
1033out_free:
1034	kfree(priv);
1035	return ret;
1036}
1037
1038static int smaps_rollup_release(struct inode *inode, struct file *file)
1039{
1040	struct seq_file *seq = file->private_data;
1041	struct proc_maps_private *priv = seq->private;
1042
1043	if (priv->mm)
1044		mmdrop(priv->mm);
1045
1046	kfree(priv);
1047	return single_release(inode, file);
1048}
1049
1050const struct file_operations proc_pid_smaps_operations = {
1051	.open		= pid_smaps_open,
1052	.read		= seq_read,
1053	.llseek		= seq_lseek,
1054	.release	= proc_map_release,
1055};
1056
1057const struct file_operations proc_pid_smaps_rollup_operations = {
1058	.open		= smaps_rollup_open,
1059	.read		= seq_read,
1060	.llseek		= seq_lseek,
1061	.release	= smaps_rollup_release,
1062};
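
/*
 * Usage sketch (userspace, illustrative only): pulling the whole-process
 * Pss value out of the rollup file registered above.  read_pss_kb() and
 * its buffer sizes are hypothetical.
 *
 *	#include <stdio.h>
 *	#include <sys/types.h>
 *
 *	static long read_pss_kb(pid_t pid)
 *	{
 *		char path[64], line[256];
 *		long pss_kb = -1;
 *		FILE *f;
 *
 *		snprintf(path, sizeof(path), "/proc/%d/smaps_rollup", (int)pid);
 *		f = fopen(path, "r");
 *		if (!f)
 *			return -1;
 *		while (fgets(line, sizeof(line), f))
 *			if (sscanf(line, "Pss: %ld kB", &pss_kb) == 1)
 *				break;
 *		fclose(f);
 *		return pss_kb;
 *	}
 */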
1063
1064enum clear_refs_types {
1065	CLEAR_REFS_ALL = 1,
1066	CLEAR_REFS_ANON,
1067	CLEAR_REFS_MAPPED,
1068	CLEAR_REFS_SOFT_DIRTY,
1069	CLEAR_REFS_MM_HIWATER_RSS,
1070	CLEAR_REFS_LAST,
1071};
1072
1073struct clear_refs_private {
1074	enum clear_refs_types type;
1075};
1076
1077#ifdef CONFIG_MEM_SOFT_DIRTY
1078
1079#define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE)
1080
1081static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1082{
1083	struct page *page;
1084
1085	if (!pte_write(pte))
1086		return false;
1087	if (!is_cow_mapping(vma->vm_flags))
1088		return false;
1089	if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
1090		return false;
1091	page = vm_normal_page(vma, addr, pte);
1092	if (!page)
1093		return false;
1094	return page_maybe_dma_pinned(page);
1095}
1096
1097static inline void clear_soft_dirty(struct vm_area_struct *vma,
1098		unsigned long addr, pte_t *pte)
1099{
1100	/*
1101	 * The soft-dirty tracker uses page faults (#PF) to catch writes
1102	 * to pages, so write-protect the pte as well. See
1103	 * Documentation/admin-guide/mm/soft-dirty.rst for the full
1104	 * description of how soft-dirty works.
1105	 */
1106	pte_t ptent = *pte;
1107
1108	if (pte_present(ptent)) {
1109		pte_t old_pte;
1110
1111		if (pte_is_pinned(vma, addr, ptent))
1112			return;
1113		old_pte = ptep_modify_prot_start(vma, addr, pte);
1114		ptent = pte_wrprotect(old_pte);
1115		ptent = pte_clear_soft_dirty(ptent);
1116		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
1117	} else if (is_swap_pte(ptent)) {
1118		ptent = pte_swp_clear_soft_dirty(ptent);
1119		set_pte_at(vma->vm_mm, addr, pte, ptent);
1120	}
1121}
1122#else
1123static inline void clear_soft_dirty(struct vm_area_struct *vma,
1124		unsigned long addr, pte_t *pte)
1125{
1126}
1127#endif
1128
1129#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1130static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1131		unsigned long addr, pmd_t *pmdp)
1132{
1133	pmd_t old, pmd = *pmdp;
1134
1135	if (pmd_present(pmd)) {
1136		/* See comment in change_huge_pmd() */
1137		old = pmdp_invalidate(vma, addr, pmdp);
1138		if (pmd_dirty(old))
1139			pmd = pmd_mkdirty(pmd);
1140		if (pmd_young(old))
1141			pmd = pmd_mkyoung(pmd);
1142
1143		pmd = pmd_wrprotect(pmd);
1144		pmd = pmd_clear_soft_dirty(pmd);
1145
1146		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1147	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
1148		pmd = pmd_swp_clear_soft_dirty(pmd);
1149		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1150	}
1151}
1152#else
1153static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1154		unsigned long addr, pmd_t *pmdp)
1155{
1156}
1157#endif
1158
1159static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
1160				unsigned long end, struct mm_walk *walk)
1161{
1162	struct clear_refs_private *cp = walk->private;
1163	struct vm_area_struct *vma = walk->vma;
1164	pte_t *pte, ptent;
1165	spinlock_t *ptl;
1166	struct page *page;
1167
1168	ptl = pmd_trans_huge_lock(pmd, vma);
1169	if (ptl) {
1170		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1171			clear_soft_dirty_pmd(vma, addr, pmd);
1172			goto out;
1173		}
1174
1175		if (!pmd_present(*pmd))
1176			goto out;
1177
1178		page = pmd_page(*pmd);
1179
1180		/* Clear accessed and referenced bits. */
1181		pmdp_test_and_clear_young(vma, addr, pmd);
1182		test_and_clear_page_young(page);
1183		ClearPageReferenced(page);
1184out:
1185		spin_unlock(ptl);
1186		return 0;
1187	}
1188
1189	if (pmd_trans_unstable(pmd))
1190		return 0;
1191
1192	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1193	for (; addr != end; pte++, addr += PAGE_SIZE) {
1194		ptent = *pte;
1195
1196		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1197			clear_soft_dirty(vma, addr, pte);
1198			continue;
1199		}
1200
1201		if (!pte_present(ptent))
1202			continue;
1203
1204		page = vm_normal_page(vma, addr, ptent);
1205		if (!page)
1206			continue;
1207
1208		/* Clear accessed and referenced bits. */
1209		ptep_test_and_clear_young(vma, addr, pte);
1210		test_and_clear_page_young(page);
1211		ClearPageReferenced(page);
1212	}
1213	pte_unmap_unlock(pte - 1, ptl);
1214	cond_resched();
1215	return 0;
1216}
1217
1218static int clear_refs_test_walk(unsigned long start, unsigned long end,
1219				struct mm_walk *walk)
1220{
1221	struct clear_refs_private *cp = walk->private;
1222	struct vm_area_struct *vma = walk->vma;
1223
1224	if (vma->vm_flags & VM_PFNMAP)
1225		return 1;
1226
1227	/*
1228	 * Writing 1 to /proc/pid/clear_refs affects all pages.
1229	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
1230	 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
1231	 * Writing 4 to /proc/pid/clear_refs affects all pages.
1232	 */
1233	if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
1234		return 1;
1235	if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
1236		return 1;
1237	return 0;
1238}
1239
1240static const struct mm_walk_ops clear_refs_walk_ops = {
1241	.pmd_entry		= clear_refs_pte_range,
1242	.test_walk		= clear_refs_test_walk,
1243};
1244
1245static ssize_t clear_refs_write(struct file *file, const char __user *buf,
1246				size_t count, loff_t *ppos)
1247{
1248	struct task_struct *task;
1249	char buffer[PROC_NUMBUF];
1250	struct mm_struct *mm;
1251	struct vm_area_struct *vma;
1252	enum clear_refs_types type;
1253	int itype;
1254	int rv;
1255
1256	memset(buffer, 0, sizeof(buffer));
1257	if (count > sizeof(buffer) - 1)
1258		count = sizeof(buffer) - 1;
1259	if (copy_from_user(buffer, buf, count))
1260		return -EFAULT;
1261	rv = kstrtoint(strstrip(buffer), 10, &itype);
1262	if (rv < 0)
1263		return rv;
1264	type = (enum clear_refs_types)itype;
1265	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
1266		return -EINVAL;
1267
1268	task = get_proc_task(file_inode(file));
1269	if (!task)
1270		return -ESRCH;
1271	mm = get_task_mm(task);
1272	if (mm) {
1273		struct mmu_notifier_range range;
1274		struct clear_refs_private cp = {
1275			.type = type,
1276		};
1277
1278		if (mmap_write_lock_killable(mm)) {
1279			count = -EINTR;
1280			goto out_mm;
1281		}
1282		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
1283			/*
1284			 * Writing 5 to /proc/pid/clear_refs resets the peak
1285			 * resident set size to this mm's current rss value.
1286			 */
1287			reset_mm_hiwater_rss(mm);
1288			goto out_unlock;
1289		}
1290
1291		if (type == CLEAR_REFS_SOFT_DIRTY) {
1292			for (vma = mm->mmap; vma; vma = vma->vm_next) {
1293				if (!(vma->vm_flags & VM_SOFTDIRTY))
1294					continue;
1295				vma->vm_flags &= ~VM_SOFTDIRTY;
1296				vma_set_page_prot(vma);
1297			}
1298
1299			inc_tlb_flush_pending(mm);
1300			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
1301						0, NULL, mm, 0, -1UL);
1302			mmu_notifier_invalidate_range_start(&range);
1303		}
1304		walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
1305				&cp);
1306		if (type == CLEAR_REFS_SOFT_DIRTY) {
1307			mmu_notifier_invalidate_range_end(&range);
1308			flush_tlb_mm(mm);
1309			dec_tlb_flush_pending(mm);
1310		}
1311out_unlock:
1312		mmap_write_unlock(mm);
1313out_mm:
1314		mmput(mm);
1315	}
1316	put_task_struct(task);
1317
1318	return count;
1319}
1320
1321const struct file_operations proc_clear_refs_operations = {
1322	.write		= clear_refs_write,
1323	.llseek		= noop_llseek,
1324};
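
/*
 * Usage sketch (userspace, illustrative only): clearing the soft-dirty
 * bits of a task so that later writes can be sampled again via pagemap.
 * The clear_soft_dirty_bits() helper and its error handling are
 * hypothetical, not part of any kernel or libc API.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	static int clear_soft_dirty_bits(pid_t pid)
 *	{
 *		char path[64];
 *		int fd, ret;
 *
 *		snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
 *		fd = open(path, O_WRONLY);
 *		if (fd < 0)
 *			return -1;
 *		// "4" == CLEAR_REFS_SOFT_DIRTY; "1".."3" clear referenced
 *		// bits as described in clear_refs_test_walk(), "5" resets
 *		// the peak RSS.
 *		ret = write(fd, "4", 1) == 1 ? 0 : -1;
 *		close(fd);
 *		return ret;
 *	}
 */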
1325
1326typedef struct {
1327	u64 pme;
1328} pagemap_entry_t;
1329
1330struct pagemapread {
1331	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
1332	pagemap_entry_t *buffer;
1333	bool show_pfn;
1334};
1335
1336#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
1337#define PAGEMAP_WALK_MASK	(PMD_MASK)
1338
1339#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
1340#define PM_PFRAME_BITS		55
1341#define PM_PFRAME_MASK		GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
1342#define PM_SOFT_DIRTY		BIT_ULL(55)
1343#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
1344#define PM_FILE			BIT_ULL(61)
1345#define PM_SWAP			BIT_ULL(62)
1346#define PM_PRESENT		BIT_ULL(63)
1347
1348#define PM_END_OF_BUFFER    1
1349
1350static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
1351{
1352	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
1353}
1354
1355static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
1356			  struct pagemapread *pm)
1357{
1358	pm->buffer[pm->pos++] = *pme;
1359	if (pm->pos >= pm->len)
1360		return PM_END_OF_BUFFER;
1361	return 0;
1362}
1363
1364static int pagemap_pte_hole(unsigned long start, unsigned long end,
1365			    __always_unused int depth, struct mm_walk *walk)
1366{
1367	struct pagemapread *pm = walk->private;
1368	unsigned long addr = start;
1369	int err = 0;
1370
1371	while (addr < end) {
1372		struct vm_area_struct *vma = find_vma(walk->mm, addr);
1373		pagemap_entry_t pme = make_pme(0, 0);
1374		/* End of address space hole, which we mark as non-present. */
1375		unsigned long hole_end;
1376
1377		if (vma)
1378			hole_end = min(end, vma->vm_start);
1379		else
1380			hole_end = end;
1381
1382		for (; addr < hole_end; addr += PAGE_SIZE) {
1383			err = add_to_pagemap(addr, &pme, pm);
1384			if (err)
1385				goto out;
1386		}
1387
1388		if (!vma)
1389			break;
1390
1391		/* Addresses in the VMA. */
1392		if (vma->vm_flags & VM_SOFTDIRTY)
1393			pme = make_pme(0, PM_SOFT_DIRTY);
1394		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1395			err = add_to_pagemap(addr, &pme, pm);
1396			if (err)
1397				goto out;
1398		}
1399	}
1400out:
1401	return err;
1402}
1403
1404static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
1405		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1406{
1407	u64 frame = 0, flags = 0;
1408	struct page *page = NULL;
1409	bool migration = false;
1410
1411	if (pte_present(pte)) {
1412		if (pm->show_pfn)
1413			frame = pte_pfn(pte);
1414		flags |= PM_PRESENT;
1415		page = vm_normal_page(vma, addr, pte);
1416		if (pte_soft_dirty(pte))
1417			flags |= PM_SOFT_DIRTY;
1418	} else if (is_swap_pte(pte)) {
1419		swp_entry_t entry;
1420		if (pte_swp_soft_dirty(pte))
1421			flags |= PM_SOFT_DIRTY;
1422		entry = pte_to_swp_entry(pte);
1423		if (pm->show_pfn)
1424			frame = swp_type(entry) |
1425				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
1426		flags |= PM_SWAP;
1427		if (is_migration_entry(entry)) {
1428			migration = true;
1429			page = migration_entry_to_page(entry);
1430		}
1431
1432		if (is_device_private_entry(entry))
1433			page = device_private_entry_to_page(entry);
1434	}
1435
1436	if (page && !PageAnon(page))
1437		flags |= PM_FILE;
1438	if (page && !migration && page_mapcount(page) == 1)
1439		flags |= PM_MMAP_EXCLUSIVE;
1440	if (vma->vm_flags & VM_SOFTDIRTY)
1441		flags |= PM_SOFT_DIRTY;
1442
1443	return make_pme(frame, flags);
1444}
1445
1446static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
1447			     struct mm_walk *walk)
1448{
1449	struct vm_area_struct *vma = walk->vma;
1450	struct pagemapread *pm = walk->private;
1451	spinlock_t *ptl;
1452	pte_t *pte, *orig_pte;
1453	int err = 0;
1454#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1455	bool migration = false;
1456
1457	ptl = pmd_trans_huge_lock(pmdp, vma);
1458	if (ptl) {
1459		u64 flags = 0, frame = 0;
1460		pmd_t pmd = *pmdp;
1461		struct page *page = NULL;
1462
1463		if (vma->vm_flags & VM_SOFTDIRTY)
1464			flags |= PM_SOFT_DIRTY;
1465
1466		if (pmd_present(pmd)) {
1467			page = pmd_page(pmd);
1468
1469			flags |= PM_PRESENT;
1470			if (pmd_soft_dirty(pmd))
1471				flags |= PM_SOFT_DIRTY;
1472			if (pm->show_pfn)
1473				frame = pmd_pfn(pmd) +
1474					((addr & ~PMD_MASK) >> PAGE_SHIFT);
1475		}
1476#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1477		else if (is_swap_pmd(pmd)) {
1478			swp_entry_t entry = pmd_to_swp_entry(pmd);
1479			unsigned long offset;
1480
1481			if (pm->show_pfn) {
1482				offset = swp_offset(entry) +
1483					((addr & ~PMD_MASK) >> PAGE_SHIFT);
1484				frame = swp_type(entry) |
1485					(offset << MAX_SWAPFILES_SHIFT);
1486			}
1487			flags |= PM_SWAP;
1488			if (pmd_swp_soft_dirty(pmd))
1489				flags |= PM_SOFT_DIRTY;
1490			VM_BUG_ON(!is_pmd_migration_entry(pmd));
1491			migration = is_migration_entry(entry);
1492			page = migration_entry_to_page(entry);
1493		}
1494#endif
1495
1496		if (page && !migration && page_mapcount(page) == 1)
1497			flags |= PM_MMAP_EXCLUSIVE;
1498
1499		for (; addr != end; addr += PAGE_SIZE) {
1500			pagemap_entry_t pme = make_pme(frame, flags);
1501
1502			err = add_to_pagemap(addr, &pme, pm);
1503			if (err)
1504				break;
1505			if (pm->show_pfn) {
1506				if (flags & PM_PRESENT)
1507					frame++;
1508				else if (flags & PM_SWAP)
1509					frame += (1 << MAX_SWAPFILES_SHIFT);
1510			}
1511		}
1512		spin_unlock(ptl);
1513		return err;
1514	}
1515
1516	if (pmd_trans_unstable(pmdp))
1517		return 0;
1518#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1519
1520	/*
1521	 * We can assume that @vma always points to a valid VMA and @end never
1522	 * goes beyond vma->vm_end.
1523	 */
1524	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
1525	for (; addr < end; pte++, addr += PAGE_SIZE) {
1526		pagemap_entry_t pme;
1527
1528		pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
1529		err = add_to_pagemap(addr, &pme, pm);
1530		if (err)
1531			break;
1532	}
1533	pte_unmap_unlock(orig_pte, ptl);
1534
1535	cond_resched();
1536
1537	return err;
1538}
1539
1540#ifdef CONFIG_HUGETLB_PAGE
1541/* This function walks within one hugetlb entry in a single call */
1542static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
1543				 unsigned long addr, unsigned long end,
1544				 struct mm_walk *walk)
1545{
1546	struct pagemapread *pm = walk->private;
1547	struct vm_area_struct *vma = walk->vma;
1548	u64 flags = 0, frame = 0;
1549	int err = 0;
1550	pte_t pte;
1551
1552	if (vma->vm_flags & VM_SOFTDIRTY)
1553		flags |= PM_SOFT_DIRTY;
1554
1555	pte = huge_ptep_get(ptep);
1556	if (pte_present(pte)) {
1557		struct page *page = pte_page(pte);
1558
1559		if (!PageAnon(page))
1560			flags |= PM_FILE;
1561
1562		if (page_mapcount(page) == 1)
1563			flags |= PM_MMAP_EXCLUSIVE;
1564
1565		flags |= PM_PRESENT;
1566		if (pm->show_pfn)
1567			frame = pte_pfn(pte) +
1568				((addr & ~hmask) >> PAGE_SHIFT);
1569	}
1570
1571	for (; addr != end; addr += PAGE_SIZE) {
1572		pagemap_entry_t pme = make_pme(frame, flags);
1573
1574		err = add_to_pagemap(addr, &pme, pm);
1575		if (err)
1576			return err;
1577		if (pm->show_pfn && (flags & PM_PRESENT))
1578			frame++;
1579	}
1580
1581	cond_resched();
1582
1583	return err;
1584}
1585#else
1586#define pagemap_hugetlb_range	NULL
1587#endif /* CONFIG_HUGETLB_PAGE */
1588
1589static const struct mm_walk_ops pagemap_ops = {
1590	.pmd_entry	= pagemap_pmd_range,
1591	.pte_hole	= pagemap_pte_hole,
1592	.hugetlb_entry	= pagemap_hugetlb_range,
1593};
1594
1595/*
1596 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1597 *
1598 * For each page in the address space, this file contains one 64-bit entry
1599 * consisting of the following:
1600 *
1601 * Bits 0-54  page frame number (PFN) if present
1602 * Bits 0-4   swap type if swapped
1603 * Bits 5-54  swap offset if swapped
1604 * Bit  55    pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
1605 * Bit  56    page exclusively mapped
1606 * Bits 57-60 zero
1607 * Bit  61    page is file-page or shared-anon
1608 * Bit  62    page swapped
1609 * Bit  63    page present
1610 *
1611 * If the page is not present but in swap, then the PFN contains an
1612 * encoding of the swap file number and the page's offset into the
1613 * swap. Unmapped pages return a null PFN. This allows determining
1614 * precisely which pages are mapped (or in swap) and comparing mapped
1615 * pages between processes.
1616 *
1617 * Efficient users of this interface will use /proc/pid/maps to
1618 * determine which areas of memory are actually mapped and llseek to
1619 * skip over unmapped regions.
1620 */
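
/*
 * Usage sketch (userspace, illustrative only): decoding one entry using
 * the bit layout listed above.  @fd is assumed to be open on
 * /proc/<pid>/pagemap and @page_size to hold sysconf(_SC_PAGESIZE); the
 * helper name is hypothetical.  Without CAP_SYS_ADMIN the PFN field
 * reads back as zero, as enforced in pagemap_read() below.
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	static int dump_pagemap_entry(int fd, unsigned long vaddr,
 *				      unsigned long page_size)
 *	{
 *		uint64_t ent;
 *		off_t off = (off_t)(vaddr / page_size) * sizeof(ent);
 *
 *		if (pread(fd, &ent, sizeof(ent), off) != sizeof(ent))
 *			return -1;
 *		printf("present=%d swap=%d file=%d excl=%d soft-dirty=%d pfn=0x%llx\n",
 *		       (int)(ent >> 63 & 1), (int)(ent >> 62 & 1),
 *		       (int)(ent >> 61 & 1), (int)(ent >> 56 & 1),
 *		       (int)(ent >> 55 & 1),
 *		       (unsigned long long)(ent & ((1ULL << 55) - 1)));
 *		return 0;
 *	}
 */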
1621static ssize_t pagemap_read(struct file *file, char __user *buf,
1622			    size_t count, loff_t *ppos)
1623{
1624	struct mm_struct *mm = file->private_data;
1625	struct pagemapread pm;
1626	unsigned long src;
1627	unsigned long svpfn;
1628	unsigned long start_vaddr;
1629	unsigned long end_vaddr;
1630	int ret = 0, copied = 0;
1631
1632	if (!mm || !mmget_not_zero(mm))
1633		goto out;
1634
1635	ret = -EINVAL;
1636	/* file position must be aligned */
1637	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1638		goto out_mm;
1639
1640	ret = 0;
1641	if (!count)
1642		goto out_mm;
1643
1644	/* do not disclose physical addresses: attack vector */
1645	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
1646
1647	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1648	pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
1649	ret = -ENOMEM;
1650	if (!pm.buffer)
1651		goto out_mm;
1652
1653	src = *ppos;
1654	svpfn = src / PM_ENTRY_BYTES;
1655	end_vaddr = mm->task_size;
1656
1657	/* watch out for wraparound */
1658	start_vaddr = end_vaddr;
1659	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
1660		start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
1661
1662	/* Ensure the address is inside the task */
1663	if (start_vaddr > mm->task_size)
1664		start_vaddr = end_vaddr;
1665
1666	/*
1667	 * The odds are that this will stop walking way
1668	 * before end_vaddr, because the length of the
1669	 * user buffer is tracked in "pm", and the walk
1670	 * will stop when we hit the end of the buffer.
1671	 */
1672	ret = 0;
1673	while (count && (start_vaddr < end_vaddr)) {
1674		int len;
1675		unsigned long end;
1676
1677		pm.pos = 0;
1678		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1679		/* overflow ? */
1680		if (end < start_vaddr || end > end_vaddr)
1681			end = end_vaddr;
1682		ret = mmap_read_lock_killable(mm);
1683		if (ret)
1684			goto out_free;
1685		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
1686		mmap_read_unlock(mm);
1687		start_vaddr = end;
1688
1689		len = min(count, PM_ENTRY_BYTES * pm.pos);
1690		if (copy_to_user(buf, pm.buffer, len)) {
1691			ret = -EFAULT;
1692			goto out_free;
1693		}
1694		copied += len;
1695		buf += len;
1696		count -= len;
1697	}
1698	*ppos += copied;
1699	if (!ret || ret == PM_END_OF_BUFFER)
1700		ret = copied;
1701
1702out_free:
1703	kfree(pm.buffer);
1704out_mm:
1705	mmput(mm);
1706out:
1707	return ret;
1708}
1709
1710static int pagemap_open(struct inode *inode, struct file *file)
1711{
1712	struct mm_struct *mm;
1713
1714	mm = proc_mem_open(inode, PTRACE_MODE_READ);
1715	if (IS_ERR(mm))
1716		return PTR_ERR(mm);
1717	file->private_data = mm;
1718	return 0;
1719}
1720
1721static int pagemap_release(struct inode *inode, struct file *file)
1722{
1723	struct mm_struct *mm = file->private_data;
1724
1725	if (mm)
1726		mmdrop(mm);
1727	return 0;
1728}
1729
1730const struct file_operations proc_pagemap_operations = {
1731	.llseek		= mem_lseek, /* borrow this */
1732	.read		= pagemap_read,
1733	.open		= pagemap_open,
1734	.release	= pagemap_release,
1735};
1736#endif /* CONFIG_PROC_PAGE_MONITOR */
1737
1738#ifdef CONFIG_NUMA
1739
1740struct numa_maps {
1741	unsigned long pages;
1742	unsigned long anon;
1743	unsigned long active;
1744	unsigned long writeback;
1745	unsigned long mapcount_max;
1746	unsigned long dirty;
1747	unsigned long swapcache;
1748	unsigned long node[MAX_NUMNODES];
1749};
1750
1751struct numa_maps_private {
1752	struct proc_maps_private proc_maps;
1753	struct numa_maps md;
1754};
1755
1756static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
1757			unsigned long nr_pages)
1758{
1759	int count = page_mapcount(page);
1760
1761	md->pages += nr_pages;
1762	if (pte_dirty || PageDirty(page))
1763		md->dirty += nr_pages;
1764
1765	if (PageSwapCache(page))
1766		md->swapcache += nr_pages;
1767
1768	if (PageActive(page) || PageUnevictable(page))
1769		md->active += nr_pages;
1770
1771	if (PageWriteback(page))
1772		md->writeback += nr_pages;
1773
1774	if (PageAnon(page))
1775		md->anon += nr_pages;
1776
1777	if (count > md->mapcount_max)
1778		md->mapcount_max = count;
1779
1780	md->node[page_to_nid(page)] += nr_pages;
1781}
1782
1783static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1784		unsigned long addr)
1785{
1786	struct page *page;
1787	int nid;
1788
1789	if (!pte_present(pte))
1790		return NULL;
1791
1792	page = vm_normal_page(vma, addr, pte);
1793	if (!page)
1794		return NULL;
1795
1796	if (PageReserved(page))
1797		return NULL;
1798
1799	nid = page_to_nid(page);
1800	if (!node_isset(nid, node_states[N_MEMORY]))
1801		return NULL;
1802
1803	return page;
1804}
1805
1806#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1807static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
1808					      struct vm_area_struct *vma,
1809					      unsigned long addr)
1810{
1811	struct page *page;
1812	int nid;
1813
1814	if (!pmd_present(pmd))
1815		return NULL;
1816
1817	page = vm_normal_page_pmd(vma, addr, pmd);
1818	if (!page)
1819		return NULL;
1820
1821	if (PageReserved(page))
1822		return NULL;
1823
1824	nid = page_to_nid(page);
1825	if (!node_isset(nid, node_states[N_MEMORY]))
1826		return NULL;
1827
1828	return page;
1829}
1830#endif
1831
1832static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1833		unsigned long end, struct mm_walk *walk)
1834{
1835	struct numa_maps *md = walk->private;
1836	struct vm_area_struct *vma = walk->vma;
1837	spinlock_t *ptl;
1838	pte_t *orig_pte;
1839	pte_t *pte;
1840
1841#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1842	ptl = pmd_trans_huge_lock(pmd, vma);
1843	if (ptl) {
1844		struct page *page;
1845
1846		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
1847		if (page)
1848			gather_stats(page, md, pmd_dirty(*pmd),
1849				     HPAGE_PMD_SIZE/PAGE_SIZE);
1850		spin_unlock(ptl);
1851		return 0;
1852	}
1853
1854	if (pmd_trans_unstable(pmd))
1855		return 0;
1856#endif
1857	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1858	do {
1859		struct page *page = can_gather_numa_stats(*pte, vma, addr);
1860		if (!page)
1861			continue;
1862		gather_stats(page, md, pte_dirty(*pte), 1);
1863
1864	} while (pte++, addr += PAGE_SIZE, addr != end);
1865	pte_unmap_unlock(orig_pte, ptl);
1866	cond_resched();
1867	return 0;
1868}
1869#ifdef CONFIG_HUGETLB_PAGE
1870static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1871		unsigned long addr, unsigned long end, struct mm_walk *walk)
1872{
1873	pte_t huge_pte = huge_ptep_get(pte);
1874	struct numa_maps *md;
1875	struct page *page;
1876
1877	if (!pte_present(huge_pte))
1878		return 0;
1879
1880	page = pte_page(huge_pte);
1881	if (!page)
1882		return 0;
1883
1884	md = walk->private;
1885	gather_stats(page, md, pte_dirty(huge_pte), 1);
1886	return 0;
1887}
1888
1889#else
1890static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1891		unsigned long addr, unsigned long end, struct mm_walk *walk)
1892{
1893	return 0;
1894}
1895#endif
1896
1897static const struct mm_walk_ops show_numa_ops = {
1898	.hugetlb_entry = gather_hugetlb_stats,
1899	.pmd_entry = gather_pte_stats,
1900};
1901
1902/*
1903 * Display pages allocated per node and memory policy via /proc.
1904 */
1905static int show_numa_map(struct seq_file *m, void *v)
1906{
1907	struct numa_maps_private *numa_priv = m->private;
1908	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1909	struct vm_area_struct *vma = v;
1910	struct numa_maps *md = &numa_priv->md;
1911	struct file *file = vma->vm_file;
1912	struct mm_struct *mm = vma->vm_mm;
1913	struct mempolicy *pol;
1914	char buffer[64];
1915	int nid;
1916
1917	if (!mm)
1918		return 0;
1919
1920	/* Ensure we start with an empty set of numa_maps statistics. */
1921	memset(md, 0, sizeof(*md));
1922
1923	pol = __get_vma_policy(vma, vma->vm_start);
1924	if (pol) {
1925		mpol_to_str(buffer, sizeof(buffer), pol);
1926		mpol_cond_put(pol);
1927	} else {
1928		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
1929	}
1930
1931	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1932
1933	if (file) {
1934		seq_puts(m, " file=");
1935		seq_file_path(m, file, "\n\t= ");
1936	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1937		seq_puts(m, " heap");
1938	} else if (is_stack(vma)) {
1939		seq_puts(m, " stack");
1940	}
1941
1942	if (is_vm_hugetlb_page(vma))
1943		seq_puts(m, " huge");
1944
1945	/* mmap_lock is held by m_start */
1946	walk_page_vma(vma, &show_numa_ops, md);
1947
1948	if (!md->pages)
1949		goto out;
1950
1951	if (md->anon)
1952		seq_printf(m, " anon=%lu", md->anon);
1953
1954	if (md->dirty)
1955		seq_printf(m, " dirty=%lu", md->dirty);
1956
1957	if (md->pages != md->anon && md->pages != md->dirty)
1958		seq_printf(m, " mapped=%lu", md->pages);
1959
1960	if (md->mapcount_max > 1)
1961		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1962
1963	if (md->swapcache)
1964		seq_printf(m, " swapcache=%lu", md->swapcache);
1965
1966	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1967		seq_printf(m, " active=%lu", md->active);
1968
1969	if (md->writeback)
1970		seq_printf(m, " writeback=%lu", md->writeback);
1971
1972	for_each_node_state(nid, N_MEMORY)
1973		if (md->node[nid])
1974			seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1975
1976	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
1977out:
1978	seq_putc(m, '\n');
1979	return 0;
1980}
1981
1982static const struct seq_operations proc_pid_numa_maps_op = {
1983	.start  = m_start,
1984	.next   = m_next,
1985	.stop   = m_stop,
1986	.show   = show_numa_map,
1987};
1988
1989static int pid_numa_maps_open(struct inode *inode, struct file *file)
1990{
1991	return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
1992				sizeof(struct numa_maps_private));
1993}
1994
1995const struct file_operations proc_pid_numa_maps_operations = {
1996	.open		= pid_numa_maps_open,
1997	.read		= seq_read,
1998	.llseek		= seq_lseek,
1999	.release	= proc_map_release,
2000};
2001
2002#endif /* CONFIG_NUMA */
2003