// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
 * failure.
 *
 * In addition there is a "soft offline" entry point that allows the kernel
 * to stop using suspicious, not-yet-corrupted pages without killing
 * anything.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. This could violate some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that means
 * the error handling takes potentially a long time.
 *
 * It can be very tempting to add handling for obscure cases here.
 * In general any code for handling new cases should only be added iff:
 * - You know how to test it.
 * - You have a test that can be added to mce-test
 *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
 * - The case actually shows up as a frequent (top 10) page state in
 *   tools/mm/page-types when running a real workload.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * has non-linear complexity in the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids impacting
 * the core VM.
 */

#define pr_fmt(fmt) "Memory failure: " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"

static int sysctl_memory_failure_early_kill __read_mostly;

static int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

static bool hw_memory_failure __read_mostly = false;

static DEFINE_MUTEX(mf_mutex);

void num_poisoned_pages_inc(unsigned long pfn)
{
	atomic_long_inc(&num_poisoned_pages);
	memblk_nr_poison_inc(pfn);
}

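/*
 * A pfn of -1UL means "no specific memory block": only the global counter
 * is adjusted and the per-memory-block poison accounting is skipped.
 */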
void num_poisoned_pages_sub(unsigned long pfn, long i)
{
	atomic_long_sub(i, &num_poisoned_pages);
	if (pfn != -1UL)
		memblk_nr_poison_sub(pfn, i);
}

/**
 * MF_ATTR_RO - Create a sysfs entry for each memory failure statistic.
 * @_name: name of the file in the per-NUMA-node sysfs directory.
 */
#define MF_ATTR_RO(_name)					\
static ssize_t _name##_show(struct device *dev,			\
			    struct device_attribute *attr,	\
			    char *buf)				\
{								\
	struct memory_failure_stats *mf_stats =			\
		&NODE_DATA(dev->id)->mf_stats;			\
	return sprintf(buf, "%lu\n", mf_stats->_name);		\
}								\
static DEVICE_ATTR_RO(_name)

MF_ATTR_RO(total);
MF_ATTR_RO(ignored);
MF_ATTR_RO(failed);
MF_ATTR_RO(delayed);
MF_ATTR_RO(recovered);

static struct attribute *memory_failure_attr[] = {
	&dev_attr_total.attr,
	&dev_attr_ignored.attr,
	&dev_attr_failed.attr,
	&dev_attr_delayed.attr,
	&dev_attr_recovered.attr,
	NULL,
};

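/*
 * The group below is attached to each NUMA node's device, so the counters
 * show up as e.g. /sys/devices/system/node/node0/memory_failure/total.
 */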
const struct attribute_group memory_failure_attr_group = {
	.name = "memory_failure",
	.attrs = memory_failure_attr,
};

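/*
 * Both knobs below are registered under the "vm" sysctl table, so they
 * are tunable at runtime, e.g.:
 *
 *	echo 1 > /proc/sys/vm/memory_failure_early_kill
 */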
static struct ctl_table memory_failure_table[] = {
	{
		.procname	= "memory_failure_early_kill",
		.data		= &sysctl_memory_failure_early_kill,
		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "memory_failure_recovery",
		.data		= &sysctl_memory_failure_recovery,
		.maxlen		= sizeof(sysctl_memory_failure_recovery),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};

/*
 * Return values:
 *   1:   the page is dissolved (if needed) and taken off the buddy list,
 *   0:   the page is dissolved (if needed) but not taken off the buddy list,
 *   < 0: failed to dissolve.
 */
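/*
 * The zone's per-cpu pools are drained and disabled for the duration of
 * the operation so that the target free page cannot hide in (or be handed
 * out from) a pcp list while take_page_off_buddy() walks the buddy free
 * lists.
 */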
static int __page_handle_poison(struct page *page)
{
	int ret;

	zone_pcp_disable(page_zone(page));
	ret = dissolve_free_huge_page(page);
	if (!ret)
		ret = take_page_off_buddy(page);
	zone_pcp_enable(page_zone(page));

	return ret;
}

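/*
 * page_handle_poison - mark a page as poisoned and pin it.
 * @hugepage_or_freepage: the page must first be dissolved / taken off the
 *	buddy free lists via __page_handle_poison().
 * @release: drop the reference the caller was already holding before
 *	taking the poison reference.
 */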
static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{
	if (hugepage_or_freepage) {
		/*
		 * Doing this check for free pages is also fine since dissolve_free_huge_page
		 * returns 0 for non-hugetlb pages as well.
		 */
		if (__page_handle_poison(page) <= 0)
			/*
			 * We could fail to take the target page off the buddy
			 * list, for example due to racy page allocation, but
			 * that's acceptable because a soft-offlined page is
			 * not broken and if someone really wants to use it,
			 * they should take it.
			 */
			return false;
	}

	SetPageHWPoison(page);
	if (release)
		put_page(page);
	page_ref_inc(page);
	num_poisoned_pages_inc(page_to_pfn(page));

	return true;
}

#if IS_ENABLED(CONFIG_HWPOISON_INJECT)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

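/*
 * These filter knobs are exposed through debugfs
 * (/sys/kernel/debug/hwpoison/) by the hwpoison injector, letting test
 * tools restrict injected errors to pages of a given backing device,
 * page-flag pattern, or memcg.
 */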
static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (eg. libc text), which is harmless. If the target task
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Finally, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away.
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handling it.
 */

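/*
 * One pending SIGBUS delivery: the task to signal, the user virtual
 * address of the poisoned mapping, and the mapping size as a shift
 * (which ends up as si_addr_lsb in the siginfo payload).
 */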
struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	short size_shift;
};

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if the error happened in the current execution context
 */
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
	struct task_struct *t = tk->tsk;
	short addr_lsb = tk->size_shift;
	int ret = 0;

	pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
			pfn, t->comm, t->pid);

	if ((flags & MF_ACTION_REQUIRED) && (t == current))
		ret = force_sig_mceerr(BUS_MCEERR_AR,
				 (void __user *)tk->addr, addr_lsb);
	else
		/*
		 * Signal other processes sharing the page if they have
		 * PF_MCE_EARLY set.
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
				      addr_lsb, t);
	if (ret < 0)
		pr_info("Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}

/*
 * Unknown page type encountered. Try to turn it into an LRU page by
 * draining the per-cpu LRU caches (lru_add_drain_all).
 */
void shake_page(struct page *p)
{
	if (PageHuge(p))
		return;
	/*
	 * TODO: Could shrink slab caches here if a lightweight range-based
	 * shrinker will be available.
	 */
	if (PageSlab(p))
		return;

	lru_add_drain_all();
}
EXPORT_SYMBOL_GPL(shake_page);

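/*
 * Walk the page tables to find at which level @address is mapped in @vma
 * and return the matching size shift (PAGE_SHIFT, PMD_SHIFT or PUD_SHIFT).
 * Returns 0 when no devmap mapping is present at that address.
 */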
static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
		unsigned long address)
{
	unsigned long ret = 0;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t ptent;

	VM_BUG_ON_VMA(address == -EFAULT, vma);
	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return 0;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;
	if (pud_devmap(*pud))
		return PUD_SHIFT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_devmap(*pmd))
		return PMD_SHIFT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return 0;
	ptent = ptep_get(pte);
	if (pte_present(ptent) && pte_devmap(ptent))
		ret = PAGE_SHIFT;
	pte_unmap(pte);
	return ret;
}

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do. We just print a message and otherwise ignore it.
 */

#define FSDAX_INVALID_PGOFF ULONG_MAX

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 *
 * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
 * filesystem with a memory failure handler has claimed the
 * memory_failure event. In all other cases, page->index and
 * page->mapping are sufficient for mapping the page back to its
 * corresponding user virtual address.
 */
static void __add_to_kill(struct task_struct *tsk, struct page *p,
			  struct vm_area_struct *vma, struct list_head *to_kill,
			  unsigned long ksm_addr, pgoff_t fsdax_pgoff)
{
	struct to_kill *tk;

	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
	if (!tk) {
		pr_err("Out of memory while machine check handling\n");
		return;
	}

	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
	if (is_zone_device_page(p)) {
		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
	} else
		tk->size_shift = page_shift(compound_head(p));

	/*
	 * Send SIGKILL if "tk->addr == -EFAULT". Also, since
	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
	 * "tk->size_shift == 0" effectively checks for no mapping on
	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
	 * to a process' address space, it's possible not all N VMAs
	 * contain mappings for the page, but at least one VMA does.
	 * Only deliver SIGBUS with payload derived from the VMA that
	 * has a mapping for the page.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
	} else if (tk->size_shift == 0) {
		kfree(tk);
		return;
	}

	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
				  struct vm_area_struct *vma,
				  struct list_head *to_kill)
{
	__add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
}

#ifdef CONFIG_KSM
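/*
 * A KSM page can be mapped into the same mm many times; make sure each
 * task gets queued for killing only once.
 */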
static bool task_in_to_kill_list(struct list_head *to_kill,
				 struct task_struct *tsk)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (tk->tsk == tsk)
			return true;
	}

	return false;
}
void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
		     struct vm_area_struct *vma, struct list_head *to_kill,
		     unsigned long ksm_addr)
{
	if (!task_in_to_kill_list(to_kill, tsk))
		__add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
}
#endif
/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
		unsigned long pfn, int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr == -EFAULT) {
				pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
						 tk->tsk, PIDTYPE_PID);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk, pfn, flags) < 0)
				pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		list_del(&tk->nd);
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold the rcu lock in the caller, so we don't have to call
 * rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t) {
		if (t->flags & PF_MCE_PROCESS) {
			if (t->flags & PF_MCE_EARLY)
				return t;
		} else {
			if (sysctl_memory_failure_early_kill)
				return t;
		}
	}
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill" and otherwise return NULL.
 *
 * Note that the above is true for the Action Optional case. For the Action
 * Required case, the error is only meaningful to the current thread, which
 * needs to be signaled with SIGBUS. The same error is Action Optional for
 * other, non-current processes sharing the error page; for those, if the
 * process is "early kill", the task_struct of the dedicated thread will
 * also be returned.
 */
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
{
	if (!tsk->mm)
		return NULL;
	/*
	 * Comparing ->mm here because current task might represent
	 * a subthread, while tsk always points to the main thread.
	 */
	if (force_early && tsk->mm == current->mm)
		return current;

	return find_early_kill_thread(tsk);
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct folio *folio, struct page *page,
		struct list_head *to_kill, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = folio_lock_anon_vma_read(folio, NULL);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	rcu_read_lock();
	for_each_process(tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (vma->vm_mm != t->mm)
				continue;
			if (!page_mapped_in_vma(page, vma))
				continue;
			add_to_kill_anon_file(t, page, vma, to_kill);
		}
	}
	rcu_read_unlock();
	anon_vma_unlock_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct folio *folio, struct page *page,
		struct list_head *to_kill, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = folio->mapping;
	pgoff_t pgoff;

	i_mmap_lock_read(mapping);
	rcu_read_lock();
	pgoff = page_to_pgoff(page);
	for_each_process(tsk) {
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill_anon_file(t, page, vma, to_kill);
		}
	}
	rcu_read_unlock();
	i_mmap_unlock_read(mapping);
}

#ifdef CONFIG_FS_DAX
static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
			      struct vm_area_struct *vma,
			      struct list_head *to_kill, pgoff_t pgoff)
{
	__add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
}

/*
 * Collect processes when the error hit a fsdax page.
 */
static void collect_procs_fsdax(struct page *page,
		struct address_space *mapping, pgoff_t pgoff,
		struct list_head *to_kill)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;

	i_mmap_lock_read(mapping);
	rcu_read_lock();
	for_each_process(tsk) {
		struct task_struct *t = task_early_kill(tsk, true);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
			if (vma->vm_mm == t->mm)
				add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
		}
	}
	rcu_read_unlock();
	i_mmap_unlock_read(mapping);
}
#endif /* CONFIG_FS_DAX */

/*
 * Collect the processes who have the corrupted page mapped to kill.
 */
static void collect_procs(struct folio *folio, struct page *page,
		struct list_head *tokill, int force_early)
{
	if (!folio->mapping)
		return;
	if (unlikely(PageKsm(page)))
		collect_procs_ksm(page, tokill, force_early);
	else if (PageAnon(page))
		collect_procs_anon(folio, page, tokill, force_early);
	else
		collect_procs_file(folio, page, tokill, force_early);
}

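/*
 * State for the page table walk that hunts the current process' mappings
 * of a poisoned pfn (see kill_accessing_process() below).
 */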
struct hwpoison_walk {
	struct to_kill tk;
	unsigned long pfn;
	int flags;
};

static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
{
	tk->addr = addr;
	tk->size_shift = shift;
}

static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
				unsigned long poisoned_pfn, struct to_kill *tk)
{
	unsigned long pfn = 0;

	if (pte_present(pte)) {
		pfn = pte_pfn(pte);
	} else {
		swp_entry_t swp = pte_to_swp_entry(pte);

		if (is_hwpoison_entry(swp))
			pfn = swp_offset_pfn(swp);
	}

	if (!pfn || pfn != poisoned_pfn)
		return 0;

	set_to_kill(tk, addr, shift);
	return 1;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      struct hwpoison_walk *hwp)
{
	pmd_t pmd = *pmdp;
	unsigned long pfn;
	unsigned long hwpoison_vaddr;

	if (!pmd_present(pmd))
		return 0;
	pfn = pmd_pfn(pmd);
	if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
		hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
		return 1;
	}
	return 0;
}
#else
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      struct hwpoison_walk *hwp)
{
	return 0;
}
#endif

static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
			      unsigned long end, struct mm_walk *walk)
{
	struct hwpoison_walk *hwp = walk->private;
	int ret = 0;
	pte_t *ptep, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
	if (ptl) {
		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
						addr, &ptl);
	if (!ptep)
		goto out;

	for (; addr != end; ptep++, addr += PAGE_SIZE) {
		ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
					     hwp->pfn, &hwp->tk);
		if (ret == 1)
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
out:
	cond_resched();
	return ret;
}

#ifdef CONFIG_HUGETLB_PAGE
static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
			    unsigned long addr, unsigned long end,
			    struct mm_walk *walk)
{
	struct hwpoison_walk *hwp = walk->private;
	pte_t pte = huge_ptep_get(ptep);
	struct hstate *h = hstate_vma(walk->vma);

	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
				      hwp->pfn, &hwp->tk);
}
#else
#define hwpoison_hugetlb_range	NULL
#endif

static const struct mm_walk_ops hwpoison_walk_ops = {
	.pmd_entry = hwpoison_pte_range,
	.hugetlb_entry = hwpoison_hugetlb_range,
	.walk_lock = PGWALK_RDLOCK,
};

/*
 * Sends SIGBUS to the current process with error info.
 *
 * This function is intended to handle "Action Required" MCEs on already
 * hardware poisoned pages. They could happen, for example, when
 * memory_failure() failed to unmap the error page at the first call, or
 * when multiple local machine checks happened on different CPUs.
 *
 * The MCE handler currently has no easy access to the error virtual address,
 * so this function walks the page tables to find it. The returned virtual
 * address is proper in most cases, but it could be wrong when the application
 * process has multiple entries mapping the error page.
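 *
 * Returns -EHWPOISON when a SIGBUS has been queued for the process, and
 * -EFAULT when no mapping of the pfn was found (or the process has no mm).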
 */
static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
				  int flags)
{
	int ret;
	struct hwpoison_walk priv = {
		.pfn = pfn,
	};
	priv.tk.tsk = p;

	if (!p->mm)
		return -EFAULT;

	mmap_read_lock(p->mm);
	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
			      (void *)&priv);
	if (ret == 1 && priv.tk.addr)
		kill_proc(&priv.tk, pfn, flags);
	else
		ret = 0;
	mmap_read_unlock(p->mm);
	return ret > 0 ? -EHWPOISON : -EFAULT;
}

static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_DAX]			= "dax page",
	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from the LRU cache and
 * then kept in the swap cache, or that removing it from the page cache
 * fails. The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (isolate_lru_page(p)) {
		/*
		 * Clear page flags that would otherwise make the buddy
		 * system complain when the page is unpoisoned and freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(page_folio(p));

		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

static int truncate_error_page(struct page *p, unsigned long pfn,
				struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_page) {
		struct folio *folio = page_folio(p);
		int err = mapping->a_ops->error_remove_page(mapping, p);

		if (err != 0)
			pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
		else if (!filemap_release_folio(folio, GFP_NOIO))
			pr_info("%#lx: failed to release buffers\n", pfn);
		else
			ret = MF_RECOVERED;
	} else {
		/*
		 * If the file system doesn't support it, just invalidate.
		 * This fails on dirty pages or anything with private buffers.
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("%#lx: Failed to invalidate\n", pfn);
	}

	return ret;
}

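/*
 * A page state is matched by masking the page flags: a page is in state
 * @ps when (page flags & mask) == res. The first matching entry in the
 * page state table decides which ->action() handler runs.
 */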
struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;

	/* Callback ->action() has to unlock the relevant page inside it. */
	int (*action)(struct page_state *ps, struct page *p);
};

/*
 * Return true if page is still referenced by others, otherwise return
 * false.
 *
 * The extra_pins is true when one extra refcount is expected.
 */
static bool has_extra_refcount(struct page_state *ps, struct page *p,
			       bool extra_pins)
{
	int count = page_count(p) - 1;

	if (extra_pins)
		count -= 1;

	if (count > 0) {
		pr_err("%#lx: %s still referenced by %d users\n",
		       page_to_pfn(p), action_page_types[ps->type], count);
		return true;
	}

	return false;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page_state *ps, struct page *p)
{
	unlock_page(p);
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page_state *ps, struct page *p)
{
	pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
	unlock_page(p);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
	int ret;
	struct address_space *mapping;
	bool extra_pins;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done; the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p)) {
		ret = MF_RECOVERED;
		goto out;
	}

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meantime
		 */
		ret = MF_FAILED;
		goto out;
	}

	/*
	 * The shmem page is kept in the page cache instead of being
	 * truncated, so it is expected to have an extra refcount after
	 * error-handling.
	 */
	extra_pins = shmem_mapping(mapping);

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_rwsem or not for this? Right now we don't.
	 */
	ret = truncate_error_page(p, page_to_pfn(p), mapping);
	if (has_extra_refcount(ps, p, extra_pins))
		ret = MF_FAILED;

out:
	unlock_page(p);

	return ret;
}
106762306a36Sopenharmony_ci
106862306a36Sopenharmony_ci/*
106962306a36Sopenharmony_ci * Dirty pagecache page
107062306a36Sopenharmony_ci * Issues: when the error hit a hole page the error is not properly
107162306a36Sopenharmony_ci * propagated.
107262306a36Sopenharmony_ci */
107362306a36Sopenharmony_cistatic int me_pagecache_dirty(struct page_state *ps, struct page *p)
107462306a36Sopenharmony_ci{
107562306a36Sopenharmony_ci	struct address_space *mapping = page_mapping(p);
107662306a36Sopenharmony_ci
107762306a36Sopenharmony_ci	SetPageError(p);
107862306a36Sopenharmony_ci	/* TBD: print more information about the file. */
107962306a36Sopenharmony_ci	if (mapping) {
108062306a36Sopenharmony_ci		/*
108162306a36Sopenharmony_ci		 * The IO error will be reported by write(), fsync(), etc.,
108262306a36Sopenharmony_ci		 * which check the mapping.
108362306a36Sopenharmony_ci		 * This way the application knows that something went
108462306a36Sopenharmony_ci		 * wrong with its dirty file data.
108562306a36Sopenharmony_ci		 *
108662306a36Sopenharmony_ci		 * There's one open issue:
108762306a36Sopenharmony_ci		 *
108862306a36Sopenharmony_ci		 * The EIO will only be reported on the next IO
108962306a36Sopenharmony_ci		 * operation and then cleared through the IO map.
109062306a36Sopenharmony_ci		 * Normally Linux has two mechanisms to pass an IO error:
109162306a36Sopenharmony_ci		 * first through the AS_EIO flag in the address space,
109262306a36Sopenharmony_ci		 * and then through the PageError flag in the page.
109362306a36Sopenharmony_ci		 * Since we drop pages on memory failure handling the
109462306a36Sopenharmony_ci		 * only mechanism open to use is AS_EIO.
109562306a36Sopenharmony_ci		 *
109662306a36Sopenharmony_ci		 * This has the disadvantage that it gets cleared on
109762306a36Sopenharmony_ci		 * the first operation that returns an error, while
109862306a36Sopenharmony_ci		 * the PageError bit is more sticky and only cleared
109962306a36Sopenharmony_ci		 * when the page is reread or dropped.  If an
110062306a36Sopenharmony_ci		 * application assumes it will always get an error on
110162306a36Sopenharmony_ci		 * fsync, but does other operations on the fd first
110262306a36Sopenharmony_ci		 * and the page is dropped in between, then the error
110362306a36Sopenharmony_ci		 * will not be properly reported.
110462306a36Sopenharmony_ci		 *
110562306a36Sopenharmony_ci		 * This can already happen even without hwpoisoned
110662306a36Sopenharmony_ci		 * pages: first on metadata IO errors (which only
110762306a36Sopenharmony_ci		 * report through AS_EIO) or when the page is dropped
110862306a36Sopenharmony_ci		 * at the wrong time.
110962306a36Sopenharmony_ci		 *
111062306a36Sopenharmony_ci		 * So right now we assume that the application DTRT on
111162306a36Sopenharmony_ci		 * the first EIO, but we're not worse than other parts
111262306a36Sopenharmony_ci		 * of the kernel.
111362306a36Sopenharmony_ci		 */
111462306a36Sopenharmony_ci		mapping_set_error(mapping, -EIO);
111562306a36Sopenharmony_ci	}
111662306a36Sopenharmony_ci
111762306a36Sopenharmony_ci	return me_pagecache_clean(ps, p);
111862306a36Sopenharmony_ci}
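/*
 * Illustrative userspace sketch (not part of this file) of how the
 * AS_EIO error set above surfaces: the first fsync() on the file after
 * the dirty page was dropped fails with EIO, and that report clears the
 * flag. The path, buffer and recovery helper are hypothetical.
 *
 *	int fd = open("/data/file", O_RDWR);	// hypothetical path
 *	write(fd, buf, len);			// dirties the page cache
 *	// ...memory failure drops the dirty page and sets AS_EIO...
 *	if (fsync(fd) < 0 && errno == EIO)
 *		recover_lost_write();		// hypothetical application hook
 */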
111962306a36Sopenharmony_ci
112062306a36Sopenharmony_ci/*
112162306a36Sopenharmony_ci * Clean and dirty swap cache.
112262306a36Sopenharmony_ci *
112362306a36Sopenharmony_ci * Dirty swap cache page is tricky to handle. The page could live both in page
112462306a36Sopenharmony_ci * cache and swap cache (i.e. the page is freshly swapped in). So it could be
112562306a36Sopenharmony_ci * referenced concurrently by 2 types of PTEs:
112662306a36Sopenharmony_ci * normal PTEs and swap PTEs. We try to handle them consistently by calling
112762306a36Sopenharmony_ci * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
112862306a36Sopenharmony_ci * and then
112962306a36Sopenharmony_ci *      - clear dirty bit to prevent IO
113062306a36Sopenharmony_ci *      - remove from LRU
113162306a36Sopenharmony_ci *      - but keep in the swap cache, so that when we return to it on
113262306a36Sopenharmony_ci *        a later page fault, we know the application is accessing
113362306a36Sopenharmony_ci *        corrupted data and shall be killed (we installed simple
113462306a36Sopenharmony_ci *        interception code in do_swap_page to catch it).
113562306a36Sopenharmony_ci *
113662306a36Sopenharmony_ci * Clean swap cache pages can be directly isolated. A later page fault will
113762306a36Sopenharmony_ci * bring in the known good data from disk.
113862306a36Sopenharmony_ci */
113962306a36Sopenharmony_cistatic int me_swapcache_dirty(struct page_state *ps, struct page *p)
114062306a36Sopenharmony_ci{
114162306a36Sopenharmony_ci	int ret;
114262306a36Sopenharmony_ci	bool extra_pins = false;
114362306a36Sopenharmony_ci
114462306a36Sopenharmony_ci	ClearPageDirty(p);
114562306a36Sopenharmony_ci	/* Trigger EIO in shmem: */
114662306a36Sopenharmony_ci	ClearPageUptodate(p);
114762306a36Sopenharmony_ci
114862306a36Sopenharmony_ci	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
114962306a36Sopenharmony_ci	unlock_page(p);
115062306a36Sopenharmony_ci
115162306a36Sopenharmony_ci	if (ret == MF_DELAYED)
115262306a36Sopenharmony_ci		extra_pins = true;
115362306a36Sopenharmony_ci
115462306a36Sopenharmony_ci	if (has_extra_refcount(ps, p, extra_pins))
115562306a36Sopenharmony_ci		ret = MF_FAILED;
115662306a36Sopenharmony_ci
115762306a36Sopenharmony_ci	return ret;
115862306a36Sopenharmony_ci}
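/*
 * Minimal sketch of the do_swap_page() interception referenced in the
 * comment above (shape assumed, not copied verbatim): on a later fault
 * the swap cache lookup returns the page kept poisoned here, and the
 * fault fails so the arch fault handler delivers SIGBUS.
 *
 *	if (PageHWPoison(page)) {
 *		// hwpoisoned dirty swapcache page: the accessor dies
 *		ret = VM_FAULT_HWPOISON;
 *		goto out_release;
 *	}
 */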
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_cistatic int me_swapcache_clean(struct page_state *ps, struct page *p)
116162306a36Sopenharmony_ci{
116262306a36Sopenharmony_ci	struct folio *folio = page_folio(p);
116362306a36Sopenharmony_ci	int ret;
116462306a36Sopenharmony_ci
116562306a36Sopenharmony_ci	delete_from_swap_cache(folio);
116662306a36Sopenharmony_ci
116762306a36Sopenharmony_ci	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
116862306a36Sopenharmony_ci	folio_unlock(folio);
116962306a36Sopenharmony_ci
117062306a36Sopenharmony_ci	if (has_extra_refcount(ps, p, false))
117162306a36Sopenharmony_ci		ret = MF_FAILED;
117262306a36Sopenharmony_ci
117362306a36Sopenharmony_ci	return ret;
117462306a36Sopenharmony_ci}
117562306a36Sopenharmony_ci
117662306a36Sopenharmony_ci/*
117762306a36Sopenharmony_ci * Huge pages. Needs work.
117862306a36Sopenharmony_ci * Issues:
117962306a36Sopenharmony_ci * - Error on hugepage is contained in hugepage unit (not in raw page unit).
118062306a36Sopenharmony_ci *   To narrow down the kill region to one page, we need to break up the pmd.
118162306a36Sopenharmony_ci */
118262306a36Sopenharmony_cistatic int me_huge_page(struct page_state *ps, struct page *p)
118362306a36Sopenharmony_ci{
118462306a36Sopenharmony_ci	int res;
118562306a36Sopenharmony_ci	struct page *hpage = compound_head(p);
118662306a36Sopenharmony_ci	struct address_space *mapping;
118762306a36Sopenharmony_ci	bool extra_pins = false;
118862306a36Sopenharmony_ci
118962306a36Sopenharmony_ci	mapping = page_mapping(hpage);
119062306a36Sopenharmony_ci	if (mapping) {
119162306a36Sopenharmony_ci		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
119262306a36Sopenharmony_ci		/* The page is kept in page cache. */
119362306a36Sopenharmony_ci		extra_pins = true;
119462306a36Sopenharmony_ci		unlock_page(hpage);
119562306a36Sopenharmony_ci	} else {
119662306a36Sopenharmony_ci		unlock_page(hpage);
119762306a36Sopenharmony_ci		/*
119862306a36Sopenharmony_ci		 * migration entry prevents later access on error hugepage,
119962306a36Sopenharmony_ci		 * so we can free and dissolve it into buddy to save healthy
120062306a36Sopenharmony_ci		 * subpages.
120162306a36Sopenharmony_ci		 */
120262306a36Sopenharmony_ci		put_page(hpage);
120362306a36Sopenharmony_ci		if (__page_handle_poison(p) >= 0) {
120462306a36Sopenharmony_ci			page_ref_inc(p);
120562306a36Sopenharmony_ci			res = MF_RECOVERED;
120662306a36Sopenharmony_ci		} else {
120762306a36Sopenharmony_ci			res = MF_FAILED;
120862306a36Sopenharmony_ci		}
120962306a36Sopenharmony_ci	}
121062306a36Sopenharmony_ci
121162306a36Sopenharmony_ci	if (has_extra_refcount(ps, p, extra_pins))
121262306a36Sopenharmony_ci		res = MF_FAILED;
121362306a36Sopenharmony_ci
121462306a36Sopenharmony_ci	return res;
121562306a36Sopenharmony_ci}
121662306a36Sopenharmony_ci
121762306a36Sopenharmony_ci/*
121862306a36Sopenharmony_ci * Various page states we can handle.
121962306a36Sopenharmony_ci *
122062306a36Sopenharmony_ci * A page state is defined by its current page->flags bits.
122162306a36Sopenharmony_ci * The table matches them in order and calls the right handler.
122262306a36Sopenharmony_ci *
122362306a36Sopenharmony_ci * This is quite tricky because we can access a page at any time
122462306a36Sopenharmony_ci * in its life cycle, so all accesses have to be extremely careful.
122562306a36Sopenharmony_ci *
122662306a36Sopenharmony_ci * This is not complete. More states could be added.
122762306a36Sopenharmony_ci * For any missing state don't attempt recovery.
122862306a36Sopenharmony_ci */
122962306a36Sopenharmony_ci
123062306a36Sopenharmony_ci#define dirty		(1UL << PG_dirty)
123162306a36Sopenharmony_ci#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
123262306a36Sopenharmony_ci#define unevict		(1UL << PG_unevictable)
123362306a36Sopenharmony_ci#define mlock		(1UL << PG_mlocked)
123462306a36Sopenharmony_ci#define lru		(1UL << PG_lru)
123562306a36Sopenharmony_ci#define head		(1UL << PG_head)
123662306a36Sopenharmony_ci#define slab		(1UL << PG_slab)
123762306a36Sopenharmony_ci#define reserved	(1UL << PG_reserved)
123862306a36Sopenharmony_ci
123962306a36Sopenharmony_cistatic struct page_state error_states[] = {
124062306a36Sopenharmony_ci	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
124162306a36Sopenharmony_ci	/*
124262306a36Sopenharmony_ci	 * free pages are specially detected outside this table:
124362306a36Sopenharmony_ci	 * PG_buddy pages make up only a small fraction of all free pages.
124462306a36Sopenharmony_ci	 */
124562306a36Sopenharmony_ci
124662306a36Sopenharmony_ci	/*
124762306a36Sopenharmony_ci	 * Could in theory check if slab page is free or if we can drop
124862306a36Sopenharmony_ci	 * currently unused objects without touching them. But just
124962306a36Sopenharmony_ci	 * treat it as standard kernel for now.
125062306a36Sopenharmony_ci	 */
125162306a36Sopenharmony_ci	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },
125262306a36Sopenharmony_ci
125362306a36Sopenharmony_ci	{ head,		head,		MF_MSG_HUGE,		me_huge_page },
125462306a36Sopenharmony_ci
125562306a36Sopenharmony_ci	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
125662306a36Sopenharmony_ci	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },
125762306a36Sopenharmony_ci
125862306a36Sopenharmony_ci	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
125962306a36Sopenharmony_ci	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },
126062306a36Sopenharmony_ci
126162306a36Sopenharmony_ci	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
126262306a36Sopenharmony_ci	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },
126362306a36Sopenharmony_ci
126462306a36Sopenharmony_ci	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
126562306a36Sopenharmony_ci	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },
126662306a36Sopenharmony_ci
126762306a36Sopenharmony_ci	/*
126862306a36Sopenharmony_ci	 * Catchall entry: must be at end.
126962306a36Sopenharmony_ci	 */
127062306a36Sopenharmony_ci	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
127162306a36Sopenharmony_ci};
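/*
 * Worked example of how this table is matched (see identify_page_state()
 * below): entries are scanned in order and the first one satisfying
 *
 *	(page->flags & ps->mask) == ps->res
 *
 * wins. A dirty LRU page has PG_lru and PG_dirty set but neither
 * PG_swapcache nor PG_mlocked, so it falls through the swapcache and
 * mlocked rows and lands on { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU,
 * me_pagecache_dirty }. The catchall { 0, 0, ... } always matches
 * because (flags & 0) == 0.
 */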
127262306a36Sopenharmony_ci
127362306a36Sopenharmony_ci#undef dirty
127462306a36Sopenharmony_ci#undef sc
127562306a36Sopenharmony_ci#undef unevict
127662306a36Sopenharmony_ci#undef mlock
127762306a36Sopenharmony_ci#undef lru
127862306a36Sopenharmony_ci#undef head
127962306a36Sopenharmony_ci#undef slab
128062306a36Sopenharmony_ci#undef reserved
128162306a36Sopenharmony_ci
128262306a36Sopenharmony_cistatic void update_per_node_mf_stats(unsigned long pfn,
128362306a36Sopenharmony_ci				     enum mf_result result)
128462306a36Sopenharmony_ci{
128562306a36Sopenharmony_ci	int nid = MAX_NUMNODES;
128662306a36Sopenharmony_ci	struct memory_failure_stats *mf_stats = NULL;
128762306a36Sopenharmony_ci
128862306a36Sopenharmony_ci	nid = pfn_to_nid(pfn);
128962306a36Sopenharmony_ci	if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) {
129062306a36Sopenharmony_ci		WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid);
129162306a36Sopenharmony_ci		return;
129262306a36Sopenharmony_ci	}
129362306a36Sopenharmony_ci
129462306a36Sopenharmony_ci	mf_stats = &NODE_DATA(nid)->mf_stats;
129562306a36Sopenharmony_ci	switch (result) {
129662306a36Sopenharmony_ci	case MF_IGNORED:
129762306a36Sopenharmony_ci		++mf_stats->ignored;
129862306a36Sopenharmony_ci		break;
129962306a36Sopenharmony_ci	case MF_FAILED:
130062306a36Sopenharmony_ci		++mf_stats->failed;
130162306a36Sopenharmony_ci		break;
130262306a36Sopenharmony_ci	case MF_DELAYED:
130362306a36Sopenharmony_ci		++mf_stats->delayed;
130462306a36Sopenharmony_ci		break;
130562306a36Sopenharmony_ci	case MF_RECOVERED:
130662306a36Sopenharmony_ci		++mf_stats->recovered;
130762306a36Sopenharmony_ci		break;
130862306a36Sopenharmony_ci	default:
130962306a36Sopenharmony_ci		WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result);
131062306a36Sopenharmony_ci		break;
131162306a36Sopenharmony_ci	}
131262306a36Sopenharmony_ci	++mf_stats->total;
131362306a36Sopenharmony_ci}
131462306a36Sopenharmony_ci
131562306a36Sopenharmony_ci/*
131662306a36Sopenharmony_ci * "Dirty/Clean" indication is not 100% accurate due to the possibility of
131762306a36Sopenharmony_ci * setting PG_dirty outside page lock. See also comment above set_page_dirty().
131862306a36Sopenharmony_ci */
131962306a36Sopenharmony_cistatic int action_result(unsigned long pfn, enum mf_action_page_type type,
132062306a36Sopenharmony_ci			 enum mf_result result)
132162306a36Sopenharmony_ci{
132262306a36Sopenharmony_ci	trace_memory_failure_event(pfn, type, result);
132362306a36Sopenharmony_ci
132462306a36Sopenharmony_ci	num_poisoned_pages_inc(pfn);
132562306a36Sopenharmony_ci
132662306a36Sopenharmony_ci	update_per_node_mf_stats(pfn, result);
132762306a36Sopenharmony_ci
132862306a36Sopenharmony_ci	pr_err("%#lx: recovery action for %s: %s\n",
132962306a36Sopenharmony_ci		pfn, action_page_types[type], action_name[result]);
133062306a36Sopenharmony_ci
133162306a36Sopenharmony_ci	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
133262306a36Sopenharmony_ci}
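/*
 * Example of the log line produced above (pfn value hypothetical, the
 * type and result strings come from the action_page_types/action_name
 * tables earlier in this file):
 *
 *	Memory failure: 0x3e8000: recovery action for dirty LRU page: Recovered
 *
 * Only MF_RECOVERED and MF_DELAYED count as success (return 0);
 * MF_IGNORED and MF_FAILED yield -EBUSY.
 */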
133362306a36Sopenharmony_ci
133462306a36Sopenharmony_cistatic int page_action(struct page_state *ps, struct page *p,
133562306a36Sopenharmony_ci			unsigned long pfn)
133662306a36Sopenharmony_ci{
133762306a36Sopenharmony_ci	int result;
133862306a36Sopenharmony_ci
133962306a36Sopenharmony_ci	/* page p should be unlocked after returning from ps->action().  */
134062306a36Sopenharmony_ci	result = ps->action(ps, p);
134162306a36Sopenharmony_ci
134262306a36Sopenharmony_ci	/* Could do more checks here if page looks ok */
134362306a36Sopenharmony_ci	/*
134462306a36Sopenharmony_ci	 * Could adjust zone counters here to correct for the missing page.
134562306a36Sopenharmony_ci	 */
134662306a36Sopenharmony_ci
134762306a36Sopenharmony_ci	return action_result(pfn, ps->type, result);
134862306a36Sopenharmony_ci}
134962306a36Sopenharmony_ci
135062306a36Sopenharmony_cistatic inline bool PageHWPoisonTakenOff(struct page *page)
135162306a36Sopenharmony_ci{
135262306a36Sopenharmony_ci	return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
135362306a36Sopenharmony_ci}
135462306a36Sopenharmony_ci
135562306a36Sopenharmony_civoid SetPageHWPoisonTakenOff(struct page *page)
135662306a36Sopenharmony_ci{
135762306a36Sopenharmony_ci	set_page_private(page, MAGIC_HWPOISON);
135862306a36Sopenharmony_ci}
135962306a36Sopenharmony_ci
136062306a36Sopenharmony_civoid ClearPageHWPoisonTakenOff(struct page *page)
136162306a36Sopenharmony_ci{
136262306a36Sopenharmony_ci	if (PageHWPoison(page))
136362306a36Sopenharmony_ci		set_page_private(page, 0);
136462306a36Sopenharmony_ci}
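/*
 * Pairing sketch for the "taken off" state (the consumer is
 * __get_unpoison_page() below): a poisoned page that was removed from
 * the buddy allocator gets tagged through page_private(), and a later
 * unpoison recognizes the tag so it can cancel both the poisoning and
 * the isolation.
 *
 *	SetPageHWPoisonTakenOff(page);		// when taking the page off buddy
 *	...
 *	if (PageHWPoisonTakenOff(page))		// in the unpoison path
 *		return -EHWPOISON;		// caller undoes both states
 */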
136562306a36Sopenharmony_ci
136662306a36Sopenharmony_ci/*
136762306a36Sopenharmony_ci * Return true if a page type of a given page is supported by hwpoison
136862306a36Sopenharmony_ci * mechanism (while handling could fail), otherwise false.  This function
136962306a36Sopenharmony_ci * does not return true for hugetlb or device memory pages, so it's assumed
137062306a36Sopenharmony_ci * to be called only in contexts where we never have such pages.
137162306a36Sopenharmony_ci */
137262306a36Sopenharmony_cistatic inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
137362306a36Sopenharmony_ci{
137462306a36Sopenharmony_ci	/* Soft offline could migrate non-LRU movable pages */
137562306a36Sopenharmony_ci	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
137662306a36Sopenharmony_ci		return true;
137762306a36Sopenharmony_ci
137862306a36Sopenharmony_ci	return PageLRU(page) || is_free_buddy_page(page);
137962306a36Sopenharmony_ci}
138062306a36Sopenharmony_ci
138162306a36Sopenharmony_cistatic int __get_hwpoison_page(struct page *page, unsigned long flags)
138262306a36Sopenharmony_ci{
138362306a36Sopenharmony_ci	struct folio *folio = page_folio(page);
138462306a36Sopenharmony_ci	int ret = 0;
138562306a36Sopenharmony_ci	bool hugetlb = false;
138662306a36Sopenharmony_ci
138762306a36Sopenharmony_ci	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
138862306a36Sopenharmony_ci	if (hugetlb) {
138962306a36Sopenharmony_ci		/* Make sure hugetlb demotion did not happen from under us. */
139062306a36Sopenharmony_ci		if (folio == page_folio(page))
139162306a36Sopenharmony_ci			return ret;
139262306a36Sopenharmony_ci		if (ret > 0) {
139362306a36Sopenharmony_ci			folio_put(folio);
139462306a36Sopenharmony_ci			folio = page_folio(page);
139562306a36Sopenharmony_ci		}
139662306a36Sopenharmony_ci	}
139762306a36Sopenharmony_ci
139862306a36Sopenharmony_ci	/*
139962306a36Sopenharmony_ci	 * This check prevents calling folio_try_get() for any
140062306a36Sopenharmony_ci	 * unsupported type of folio, in order to reduce the risk of unexpected
140162306a36Sopenharmony_ci	 * races caused by taking a folio refcount.
140262306a36Sopenharmony_ci	 */
140362306a36Sopenharmony_ci	if (!HWPoisonHandlable(&folio->page, flags))
140462306a36Sopenharmony_ci		return -EBUSY;
140562306a36Sopenharmony_ci
140662306a36Sopenharmony_ci	if (folio_try_get(folio)) {
140762306a36Sopenharmony_ci		if (folio == page_folio(page))
140862306a36Sopenharmony_ci			return 1;
140962306a36Sopenharmony_ci
141062306a36Sopenharmony_ci		pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
141162306a36Sopenharmony_ci		folio_put(folio);
141262306a36Sopenharmony_ci	}
141362306a36Sopenharmony_ci
141462306a36Sopenharmony_ci	return 0;
141562306a36Sopenharmony_ci}
141662306a36Sopenharmony_ci
141762306a36Sopenharmony_cistatic int get_any_page(struct page *p, unsigned long flags)
141862306a36Sopenharmony_ci{
141962306a36Sopenharmony_ci	int ret = 0, pass = 0;
142062306a36Sopenharmony_ci	bool count_increased = false;
142162306a36Sopenharmony_ci
142262306a36Sopenharmony_ci	if (flags & MF_COUNT_INCREASED)
142362306a36Sopenharmony_ci		count_increased = true;
142462306a36Sopenharmony_ci
142562306a36Sopenharmony_citry_again:
142662306a36Sopenharmony_ci	if (!count_increased) {
142762306a36Sopenharmony_ci		ret = __get_hwpoison_page(p, flags);
142862306a36Sopenharmony_ci		if (!ret) {
142962306a36Sopenharmony_ci			if (page_count(p)) {
143062306a36Sopenharmony_ci				/* We raced with an allocation, retry. */
143162306a36Sopenharmony_ci				if (pass++ < 3)
143262306a36Sopenharmony_ci					goto try_again;
143362306a36Sopenharmony_ci				ret = -EBUSY;
143462306a36Sopenharmony_ci			} else if (!PageHuge(p) && !is_free_buddy_page(p)) {
143562306a36Sopenharmony_ci				/* We raced with put_page, retry. */
143662306a36Sopenharmony_ci				if (pass++ < 3)
143762306a36Sopenharmony_ci					goto try_again;
143862306a36Sopenharmony_ci				ret = -EIO;
143962306a36Sopenharmony_ci			}
144062306a36Sopenharmony_ci			goto out;
144162306a36Sopenharmony_ci		} else if (ret == -EBUSY) {
144262306a36Sopenharmony_ci			/*
144362306a36Sopenharmony_ci			 * We raced with a (possibly temporarily)
144462306a36Sopenharmony_ci			 * unhandlable page, retry.
144562306a36Sopenharmony_ci			 */
144662306a36Sopenharmony_ci			if (pass++ < 3) {
144762306a36Sopenharmony_ci				shake_page(p);
144862306a36Sopenharmony_ci				goto try_again;
144962306a36Sopenharmony_ci			}
145062306a36Sopenharmony_ci			ret = -EIO;
145162306a36Sopenharmony_ci			goto out;
145262306a36Sopenharmony_ci		}
145362306a36Sopenharmony_ci	}
145462306a36Sopenharmony_ci
145562306a36Sopenharmony_ci	if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
145662306a36Sopenharmony_ci		ret = 1;
145762306a36Sopenharmony_ci	} else {
145862306a36Sopenharmony_ci		/*
145962306a36Sopenharmony_ci		 * A page we cannot handle. Check whether we can turn
146062306a36Sopenharmony_ci		 * it into something we can handle.
146162306a36Sopenharmony_ci		 */
146262306a36Sopenharmony_ci		if (pass++ < 3) {
146362306a36Sopenharmony_ci			put_page(p);
146462306a36Sopenharmony_ci			shake_page(p);
146562306a36Sopenharmony_ci			count_increased = false;
146662306a36Sopenharmony_ci			goto try_again;
146762306a36Sopenharmony_ci		}
146862306a36Sopenharmony_ci		put_page(p);
146962306a36Sopenharmony_ci		ret = -EIO;
147062306a36Sopenharmony_ci	}
147162306a36Sopenharmony_ciout:
147262306a36Sopenharmony_ci	if (ret == -EIO)
147362306a36Sopenharmony_ci		pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
147462306a36Sopenharmony_ci
147562306a36Sopenharmony_ci	return ret;
147662306a36Sopenharmony_ci}
147762306a36Sopenharmony_ci
147862306a36Sopenharmony_cistatic int __get_unpoison_page(struct page *page)
147962306a36Sopenharmony_ci{
148062306a36Sopenharmony_ci	struct folio *folio = page_folio(page);
148162306a36Sopenharmony_ci	int ret = 0;
148262306a36Sopenharmony_ci	bool hugetlb = false;
148362306a36Sopenharmony_ci
148462306a36Sopenharmony_ci	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
148562306a36Sopenharmony_ci	if (hugetlb) {
148662306a36Sopenharmony_ci		/* Make sure hugetlb demotion did not happen from under us. */
148762306a36Sopenharmony_ci		if (folio == page_folio(page))
148862306a36Sopenharmony_ci			return ret;
148962306a36Sopenharmony_ci		if (ret > 0)
149062306a36Sopenharmony_ci			folio_put(folio);
149162306a36Sopenharmony_ci	}
149262306a36Sopenharmony_ci
149362306a36Sopenharmony_ci	/*
149462306a36Sopenharmony_ci	 * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
149562306a36Sopenharmony_ci	 * but also isolated from the buddy freelist, so we need to identify
149662306a36Sopenharmony_ci	 * that state and cancel both operations to unpoison.
149762306a36Sopenharmony_ci	 */
149862306a36Sopenharmony_ci	if (PageHWPoisonTakenOff(page))
149962306a36Sopenharmony_ci		return -EHWPOISON;
150062306a36Sopenharmony_ci
150162306a36Sopenharmony_ci	return get_page_unless_zero(page) ? 1 : 0;
150262306a36Sopenharmony_ci}
150362306a36Sopenharmony_ci
150462306a36Sopenharmony_ci/**
150562306a36Sopenharmony_ci * get_hwpoison_page() - Get refcount for memory error handling
150662306a36Sopenharmony_ci * @p:		Raw error page (hit by memory error)
150762306a36Sopenharmony_ci * @flags:	Flags controlling behavior of error handling
150862306a36Sopenharmony_ci *
150962306a36Sopenharmony_ci * get_hwpoison_page() takes a page refcount of an error page to handle memory
151062306a36Sopenharmony_ci * error on it, after checking that the error page is in a well-defined state
151162306a36Sopenharmony_ci * (defined as a page type whose memory errors we can successfully handle,
151262306a36Sopenharmony_ci * such as LRU pages and hugetlb pages).
151362306a36Sopenharmony_ci *
151462306a36Sopenharmony_ci * Memory error handling could be triggered at any time on any type of page,
151562306a36Sopenharmony_ci * so it's prone to race with typical memory management lifecycle (like
151662306a36Sopenharmony_ci * allocation and free).  So to avoid such races, get_hwpoison_page() takes
151762306a36Sopenharmony_ci * extra care for the error page's state (as done in __get_hwpoison_page()),
151862306a36Sopenharmony_ci * and has some retry logic in get_any_page().
151962306a36Sopenharmony_ci *
152062306a36Sopenharmony_ci * When called from unpoison_memory(), the caller should already ensure that
152162306a36Sopenharmony_ci * the given page has PG_hwpoison. So it's never reused for other page
152262306a36Sopenharmony_ci * allocations, and __get_unpoison_page() never races with them.
152362306a36Sopenharmony_ci *
152462306a36Sopenharmony_ci * Return: 0 on failure,
152562306a36Sopenharmony_ci *         1 on success for in-use pages in a well-defined state,
152662306a36Sopenharmony_ci *         -EIO for pages on which we cannot handle memory errors,
152762306a36Sopenharmony_ci *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
152862306a36Sopenharmony_ci *         operations like allocation and free,
152962306a36Sopenharmony_ci *         -EHWPOISON when the page is hwpoisoned and taken off from buddy.
153062306a36Sopenharmony_ci */
153162306a36Sopenharmony_cistatic int get_hwpoison_page(struct page *p, unsigned long flags)
153262306a36Sopenharmony_ci{
153362306a36Sopenharmony_ci	int ret;
153462306a36Sopenharmony_ci
153562306a36Sopenharmony_ci	zone_pcp_disable(page_zone(p));
153662306a36Sopenharmony_ci	if (flags & MF_UNPOISON)
153762306a36Sopenharmony_ci		ret = __get_unpoison_page(p);
153862306a36Sopenharmony_ci	else
153962306a36Sopenharmony_ci		ret = get_any_page(p, flags);
154062306a36Sopenharmony_ci	zone_pcp_enable(page_zone(p));
154162306a36Sopenharmony_ci
154262306a36Sopenharmony_ci	return ret;
154362306a36Sopenharmony_ci}
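/*
 * Caller-side sketch (memory_failure() below follows this shape): the
 * return value picks the handling path, and a successful grab must be
 * balanced by a put_page() once handling is done.
 *
 *	res = get_hwpoison_page(p, flags);
 *	if (res == 1) {
 *		// in-use page with refcount held: lock it and run the
 *		// error_states handler via identify_page_state()
 *	} else if (!res) {
 *		// free page: can be handled without holding a refcount
 *	} else {
 *		// -EBUSY/-EIO/-EHWPOISON: report the failure and bail out
 *	}
 */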
154462306a36Sopenharmony_ci
154562306a36Sopenharmony_ci/*
154662306a36Sopenharmony_ci * Do all that is necessary to remove user space mappings. Unmap
154762306a36Sopenharmony_ci * the pages and send SIGBUS to the processes if the data was dirty.
154862306a36Sopenharmony_ci */
154962306a36Sopenharmony_cistatic bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
155062306a36Sopenharmony_ci				  int flags, struct page *hpage)
155162306a36Sopenharmony_ci{
155262306a36Sopenharmony_ci	struct folio *folio = page_folio(hpage);
155362306a36Sopenharmony_ci	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
155462306a36Sopenharmony_ci	struct address_space *mapping;
155562306a36Sopenharmony_ci	LIST_HEAD(tokill);
155662306a36Sopenharmony_ci	bool unmap_success;
155762306a36Sopenharmony_ci	int forcekill;
155862306a36Sopenharmony_ci	bool mlocked = PageMlocked(hpage);
155962306a36Sopenharmony_ci
156062306a36Sopenharmony_ci	/*
156162306a36Sopenharmony_ci	 * Here we are interested only in user-mapped pages, so skip any
156262306a36Sopenharmony_ci	 * other types of pages.
156362306a36Sopenharmony_ci	 */
156462306a36Sopenharmony_ci	if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p))
156562306a36Sopenharmony_ci		return true;
156662306a36Sopenharmony_ci	if (!(PageLRU(hpage) || PageHuge(p)))
156762306a36Sopenharmony_ci		return true;
156862306a36Sopenharmony_ci
156962306a36Sopenharmony_ci	/*
157062306a36Sopenharmony_ci	 * This check implies we don't kill processes if their pages
157162306a36Sopenharmony_ci	 * are in the swap cache early. Those are always late kills.
157262306a36Sopenharmony_ci	 */
157362306a36Sopenharmony_ci	if (!page_mapped(p))
157462306a36Sopenharmony_ci		return true;
157562306a36Sopenharmony_ci
157662306a36Sopenharmony_ci	if (PageSwapCache(p)) {
157762306a36Sopenharmony_ci		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
157862306a36Sopenharmony_ci		ttu &= ~TTU_HWPOISON;
157962306a36Sopenharmony_ci	}
158062306a36Sopenharmony_ci
158162306a36Sopenharmony_ci	/*
158262306a36Sopenharmony_ci	 * Propagate the dirty bit from PTEs to struct page first, because we
158362306a36Sopenharmony_ci	 * need this to decide if we should kill or just drop the page.
158462306a36Sopenharmony_ci	 * XXX: the dirty test could be racy: set_page_dirty() may not always
158562306a36Sopenharmony_ci	 * be called inside page lock (it's recommended but not enforced).
158662306a36Sopenharmony_ci	 */
158762306a36Sopenharmony_ci	mapping = page_mapping(hpage);
158862306a36Sopenharmony_ci	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
158962306a36Sopenharmony_ci	    mapping_can_writeback(mapping)) {
159062306a36Sopenharmony_ci		if (page_mkclean(hpage)) {
159162306a36Sopenharmony_ci			SetPageDirty(hpage);
159262306a36Sopenharmony_ci		} else {
159362306a36Sopenharmony_ci			ttu &= ~TTU_HWPOISON;
159462306a36Sopenharmony_ci			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
159562306a36Sopenharmony_ci				pfn);
159662306a36Sopenharmony_ci		}
159762306a36Sopenharmony_ci	}
159862306a36Sopenharmony_ci
159962306a36Sopenharmony_ci	/*
160062306a36Sopenharmony_ci	 * First collect all the processes that have the page
160162306a36Sopenharmony_ci	 * mapped in dirty form.  This has to be done before try_to_unmap,
160262306a36Sopenharmony_ci	 * because ttu takes the rmap data structures down.
160362306a36Sopenharmony_ci	 */
160462306a36Sopenharmony_ci	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
160562306a36Sopenharmony_ci
160662306a36Sopenharmony_ci	if (PageHuge(hpage) && !PageAnon(hpage)) {
160762306a36Sopenharmony_ci		/*
160862306a36Sopenharmony_ci		 * For hugetlb pages in shared mappings, try_to_unmap
160962306a36Sopenharmony_ci		 * could potentially call huge_pmd_unshare.  Because of
161062306a36Sopenharmony_ci		 * this, take semaphore in write mode here and set
161162306a36Sopenharmony_ci		 * TTU_RMAP_LOCKED to indicate we have taken the lock
161262306a36Sopenharmony_ci		 * at this higher level.
161362306a36Sopenharmony_ci		 */
161462306a36Sopenharmony_ci		mapping = hugetlb_page_mapping_lock_write(hpage);
161562306a36Sopenharmony_ci		if (mapping) {
161662306a36Sopenharmony_ci			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
161762306a36Sopenharmony_ci			i_mmap_unlock_write(mapping);
161862306a36Sopenharmony_ci		} else
161962306a36Sopenharmony_ci			pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
162062306a36Sopenharmony_ci	} else {
162162306a36Sopenharmony_ci		try_to_unmap(folio, ttu);
162262306a36Sopenharmony_ci	}
162362306a36Sopenharmony_ci
162462306a36Sopenharmony_ci	unmap_success = !page_mapped(p);
162562306a36Sopenharmony_ci	if (!unmap_success)
162662306a36Sopenharmony_ci		pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
162762306a36Sopenharmony_ci		       pfn, page_mapcount(p));
162862306a36Sopenharmony_ci
162962306a36Sopenharmony_ci	/*
163062306a36Sopenharmony_ci	 * try_to_unmap() might put an mlocked page in the lru cache, so call
163162306a36Sopenharmony_ci	 * shake_page() again to ensure that it's flushed.
163262306a36Sopenharmony_ci	 */
163362306a36Sopenharmony_ci	if (mlocked)
163462306a36Sopenharmony_ci		shake_page(hpage);
163562306a36Sopenharmony_ci
163662306a36Sopenharmony_ci	/*
163762306a36Sopenharmony_ci	 * Now that the dirty bit has been propagated to the
163862306a36Sopenharmony_ci	 * struct page and all unmaps done we can decide if
163962306a36Sopenharmony_ci	 * killing is needed or not.  Only kill when the page
164062306a36Sopenharmony_ci	 * was dirty or the process is not restartable,
164162306a36Sopenharmony_ci	 * otherwise the tokill list is merely
164262306a36Sopenharmony_ci	 * freed.  When there was a problem unmapping earlier,
164362306a36Sopenharmony_ci	 * use a more forceful uncatchable kill to prevent
164462306a36Sopenharmony_ci	 * any accesses to the poisoned memory.
164562306a36Sopenharmony_ci	 */
164662306a36Sopenharmony_ci	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
164762306a36Sopenharmony_ci		    !unmap_success;
164862306a36Sopenharmony_ci	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
164962306a36Sopenharmony_ci
165062306a36Sopenharmony_ci	return unmap_success;
165162306a36Sopenharmony_ci}
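/*
 * Decision sketch for the kill above: with
 *
 *	forcekill = dirty || MF_MUST_KILL || !unmap_success
 *
 * a clean, successfully unmapped page lets its (restartable) mappers
 * continue and the tokill list is merely freed, while dirty data, an
 * explicit MF_MUST_KILL, or a failed unmap escalates to signalling
 * every process collected above.
 */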
165262306a36Sopenharmony_ci
165362306a36Sopenharmony_cistatic int identify_page_state(unsigned long pfn, struct page *p,
165462306a36Sopenharmony_ci				unsigned long page_flags)
165562306a36Sopenharmony_ci{
165662306a36Sopenharmony_ci	struct page_state *ps;
165762306a36Sopenharmony_ci
165862306a36Sopenharmony_ci	/*
165962306a36Sopenharmony_ci	 * The first check uses the current page flags which may not have any
166062306a36Sopenharmony_ci	 * relevant information. The second check with the saved page flags is
166162306a36Sopenharmony_ci	 * carried out only if the first check can't determine the page status.
166262306a36Sopenharmony_ci	 */
166362306a36Sopenharmony_ci	for (ps = error_states;; ps++)
166462306a36Sopenharmony_ci		if ((p->flags & ps->mask) == ps->res)
166562306a36Sopenharmony_ci			break;
166662306a36Sopenharmony_ci
166762306a36Sopenharmony_ci	page_flags |= (p->flags & (1UL << PG_dirty));
166862306a36Sopenharmony_ci
166962306a36Sopenharmony_ci	if (!ps->mask)
167062306a36Sopenharmony_ci		for (ps = error_states;; ps++)
167162306a36Sopenharmony_ci			if ((page_flags & ps->mask) == ps->res)
167262306a36Sopenharmony_ci				break;
167362306a36Sopenharmony_ci	return page_action(ps, p, pfn);
167462306a36Sopenharmony_ci}
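/*
 * Example of why the second pass can matter (sketch): if the error hit
 * an LRU page that was truncated before we got here, p->flags may have
 * lost PG_lru and match only the catchall entry (ps->mask == 0). The
 * flags snapshotted earlier into page_flags still carry PG_lru, so the
 * second scan can still pick the pagecache handler.
 */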
167562306a36Sopenharmony_ci
167662306a36Sopenharmony_cistatic int try_to_split_thp_page(struct page *page)
167762306a36Sopenharmony_ci{
167862306a36Sopenharmony_ci	int ret;
167962306a36Sopenharmony_ci
168062306a36Sopenharmony_ci	lock_page(page);
168162306a36Sopenharmony_ci	ret = split_huge_page(page);
168262306a36Sopenharmony_ci	unlock_page(page);
168362306a36Sopenharmony_ci
168462306a36Sopenharmony_ci	if (unlikely(ret))
168562306a36Sopenharmony_ci		put_page(page);
168662306a36Sopenharmony_ci
168762306a36Sopenharmony_ci	return ret;
168862306a36Sopenharmony_ci}
168962306a36Sopenharmony_ci
169062306a36Sopenharmony_cistatic void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
169162306a36Sopenharmony_ci		struct address_space *mapping, pgoff_t index, int flags)
169262306a36Sopenharmony_ci{
169362306a36Sopenharmony_ci	struct to_kill *tk;
169462306a36Sopenharmony_ci	unsigned long size = 0;
169562306a36Sopenharmony_ci
169662306a36Sopenharmony_ci	list_for_each_entry(tk, to_kill, nd)
169762306a36Sopenharmony_ci		if (tk->size_shift)
169862306a36Sopenharmony_ci			size = max(size, 1UL << tk->size_shift);
169962306a36Sopenharmony_ci
170062306a36Sopenharmony_ci	if (size) {
170162306a36Sopenharmony_ci		/*
170262306a36Sopenharmony_ci		 * Unmap the largest mapping to avoid breaking up device-dax
170362306a36Sopenharmony_ci		 * mappings which are constant size. The actual size of the
170462306a36Sopenharmony_ci		 * mapping being torn down is communicated in siginfo, see
170562306a36Sopenharmony_ci		 * kill_proc().
170662306a36Sopenharmony_ci		 */
170762306a36Sopenharmony_ci		loff_t start = ((loff_t)index << PAGE_SHIFT) & ~(size - 1);
170862306a36Sopenharmony_ci
170962306a36Sopenharmony_ci		unmap_mapping_range(mapping, start, size, 0);
171062306a36Sopenharmony_ci	}
171162306a36Sopenharmony_ci
171262306a36Sopenharmony_ci	kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
171362306a36Sopenharmony_ci}
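/*
 * Worked example for the alignment above, assuming PAGE_SHIFT == 12 and
 * a 2MB device-dax mapping (size == 0x200000): for index == 0x213 the
 * byte offset is 0x213000, and
 *
 *	start = 0x213000 & ~(0x200000 - 1) = 0x200000
 *
 * so the whole constant-size 2MB mapping containing the poisoned page
 * is torn down, not just one 4kB subpage.
 */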
171462306a36Sopenharmony_ci
171562306a36Sopenharmony_ci/*
171662306a36Sopenharmony_ci * Only dev_pagemap pages get here, such as fsdax when the filesystem
171762306a36Sopenharmony_ci * either does not claim or fails to claim a hwpoison event, or devdax.
171862306a36Sopenharmony_ci * The fsdax pages are initialized per base page, and the devdax pages
171962306a36Sopenharmony_ci * could be initialized either as base pages, or as compound pages with
172062306a36Sopenharmony_ci * vmemmap optimization enabled. Devdax is simplistic in its dealing with
172162306a36Sopenharmony_ci * hwpoison, such that, if a subpage of a compound page is poisoned,
172262306a36Sopenharmony_ci * simply marking the compound head page is by far sufficient.
172362306a36Sopenharmony_ci */
172462306a36Sopenharmony_cistatic int mf_generic_kill_procs(unsigned long long pfn, int flags,
172562306a36Sopenharmony_ci		struct dev_pagemap *pgmap)
172662306a36Sopenharmony_ci{
172762306a36Sopenharmony_ci	struct folio *folio = pfn_folio(pfn);
172862306a36Sopenharmony_ci	LIST_HEAD(to_kill);
172962306a36Sopenharmony_ci	dax_entry_t cookie;
173062306a36Sopenharmony_ci	int rc = 0;
173162306a36Sopenharmony_ci
173262306a36Sopenharmony_ci	/*
173362306a36Sopenharmony_ci	 * Prevent the inode from being freed while we are interrogating
173462306a36Sopenharmony_ci	 * the address_space, typically this would be handled by
173562306a36Sopenharmony_ci	 * lock_page(), but dax pages do not use the page lock. This
173662306a36Sopenharmony_ci	 * also prevents changes to the mapping of this pfn until
173762306a36Sopenharmony_ci	 * poison signaling is complete.
173862306a36Sopenharmony_ci	 */
173962306a36Sopenharmony_ci	cookie = dax_lock_folio(folio);
174062306a36Sopenharmony_ci	if (!cookie)
174162306a36Sopenharmony_ci		return -EBUSY;
174262306a36Sopenharmony_ci
174362306a36Sopenharmony_ci	if (hwpoison_filter(&folio->page)) {
174462306a36Sopenharmony_ci		rc = -EOPNOTSUPP;
174562306a36Sopenharmony_ci		goto unlock;
174662306a36Sopenharmony_ci	}
174762306a36Sopenharmony_ci
174862306a36Sopenharmony_ci	switch (pgmap->type) {
174962306a36Sopenharmony_ci	case MEMORY_DEVICE_PRIVATE:
175062306a36Sopenharmony_ci	case MEMORY_DEVICE_COHERENT:
175162306a36Sopenharmony_ci		/*
175262306a36Sopenharmony_ci		 * TODO: Handle device pages which may need coordination
175362306a36Sopenharmony_ci		 * with device-side memory.
175462306a36Sopenharmony_ci		 */
175562306a36Sopenharmony_ci		rc = -ENXIO;
175662306a36Sopenharmony_ci		goto unlock;
175762306a36Sopenharmony_ci	default:
175862306a36Sopenharmony_ci		break;
175962306a36Sopenharmony_ci	}
176062306a36Sopenharmony_ci
176162306a36Sopenharmony_ci	/*
176262306a36Sopenharmony_ci	 * Use this flag as an indication that the dax page has been
176362306a36Sopenharmony_ci	 * remapped UC to prevent speculative consumption of poison.
176462306a36Sopenharmony_ci	 */
176562306a36Sopenharmony_ci	SetPageHWPoison(&folio->page);
176662306a36Sopenharmony_ci
176762306a36Sopenharmony_ci	/*
176862306a36Sopenharmony_ci	 * Unlike System-RAM there is no possibility to swap in a
176962306a36Sopenharmony_ci	 * different physical page at a given virtual address, so all
177062306a36Sopenharmony_ci	 * userspace consumption of ZONE_DEVICE memory necessitates
177162306a36Sopenharmony_ci	 * SIGBUS (i.e. MF_MUST_KILL)
177262306a36Sopenharmony_ci	 */
177362306a36Sopenharmony_ci	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
177462306a36Sopenharmony_ci	collect_procs(folio, &folio->page, &to_kill, true);
177562306a36Sopenharmony_ci
177662306a36Sopenharmony_ci	unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags);
177762306a36Sopenharmony_ciunlock:
177862306a36Sopenharmony_ci	dax_unlock_folio(folio, cookie);
177962306a36Sopenharmony_ci	return rc;
178062306a36Sopenharmony_ci}
178162306a36Sopenharmony_ci
178262306a36Sopenharmony_ci#ifdef CONFIG_FS_DAX
178362306a36Sopenharmony_ci/**
178462306a36Sopenharmony_ci * mf_dax_kill_procs - Collect and kill processes who are using this file range
178562306a36Sopenharmony_ci * @mapping:	address_space of the file in use
178662306a36Sopenharmony_ci * @index:	start pgoff of the range within the file
178762306a36Sopenharmony_ci * @count:	length of the range, in unit of PAGE_SIZE
178862306a36Sopenharmony_ci * @mf_flags:	memory failure flags
178962306a36Sopenharmony_ci */
179062306a36Sopenharmony_ciint mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
179162306a36Sopenharmony_ci		unsigned long count, int mf_flags)
179262306a36Sopenharmony_ci{
179362306a36Sopenharmony_ci	LIST_HEAD(to_kill);
179462306a36Sopenharmony_ci	dax_entry_t cookie;
179562306a36Sopenharmony_ci	struct page *page;
179662306a36Sopenharmony_ci	size_t end = index + count;
179762306a36Sopenharmony_ci
179862306a36Sopenharmony_ci	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
179962306a36Sopenharmony_ci
180062306a36Sopenharmony_ci	for (; index < end; index++) {
180162306a36Sopenharmony_ci		page = NULL;
180262306a36Sopenharmony_ci		cookie = dax_lock_mapping_entry(mapping, index, &page);
180362306a36Sopenharmony_ci		if (!cookie)
180462306a36Sopenharmony_ci			return -EBUSY;
180562306a36Sopenharmony_ci		if (!page)
180662306a36Sopenharmony_ci			goto unlock;
180762306a36Sopenharmony_ci
180862306a36Sopenharmony_ci		SetPageHWPoison(page);
180962306a36Sopenharmony_ci
181062306a36Sopenharmony_ci		collect_procs_fsdax(page, mapping, index, &to_kill);
181162306a36Sopenharmony_ci		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
181262306a36Sopenharmony_ci				index, mf_flags);
181362306a36Sopenharmony_ciunlock:
181462306a36Sopenharmony_ci		dax_unlock_mapping_entry(mapping, index, cookie);
181562306a36Sopenharmony_ci	}
181662306a36Sopenharmony_ci	return 0;
181762306a36Sopenharmony_ci}
181862306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(mf_dax_kill_procs);
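/*
 * Usage sketch (caller and variables hypothetical; in-tree users are
 * filesystem ->notify_failure() handlers that resolved a poisoned pfn
 * back to a file range): kill every user of two pages of a file,
 * insisting on SIGBUS delivery:
 *
 *	rc = mf_dax_kill_procs(inode->i_mapping, pgoff, 2, MF_MUST_KILL);
 */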
181962306a36Sopenharmony_ci#endif /* CONFIG_FS_DAX */
182062306a36Sopenharmony_ci
182162306a36Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE
182262306a36Sopenharmony_ci
182362306a36Sopenharmony_ci/*
182462306a36Sopenharmony_ci * Struct raw_hwp_page represents information about a "raw error page",
182562306a36Sopenharmony_ci * kept on a singly linked list built from folio->_hugetlb_hwpoison.
182662306a36Sopenharmony_ci */
182762306a36Sopenharmony_cistruct raw_hwp_page {
182862306a36Sopenharmony_ci	struct llist_node node;
182962306a36Sopenharmony_ci	struct page *page;
183062306a36Sopenharmony_ci};
183162306a36Sopenharmony_ci
183262306a36Sopenharmony_cistatic inline struct llist_head *raw_hwp_list_head(struct folio *folio)
183362306a36Sopenharmony_ci{
183462306a36Sopenharmony_ci	return (struct llist_head *)&folio->_hugetlb_hwpoison;
183562306a36Sopenharmony_ci}
183662306a36Sopenharmony_ci
183762306a36Sopenharmony_cibool is_raw_hwpoison_page_in_hugepage(struct page *page)
183862306a36Sopenharmony_ci{
183962306a36Sopenharmony_ci	struct llist_head *raw_hwp_head;
184062306a36Sopenharmony_ci	struct raw_hwp_page *p;
184162306a36Sopenharmony_ci	struct folio *folio = page_folio(page);
184262306a36Sopenharmony_ci	bool ret = false;
184362306a36Sopenharmony_ci
184462306a36Sopenharmony_ci	if (!folio_test_hwpoison(folio))
184562306a36Sopenharmony_ci		return false;
184662306a36Sopenharmony_ci
184762306a36Sopenharmony_ci	if (!folio_test_hugetlb(folio))
184862306a36Sopenharmony_ci		return PageHWPoison(page);
184962306a36Sopenharmony_ci
185062306a36Sopenharmony_ci	/*
185162306a36Sopenharmony_ci	 * When RawHwpUnreliable is set, the kernel has lost track of which
185262306a36Sopenharmony_ci	 * subpages are HWPOISONed. So return as if ALL subpages are.
185362306a36Sopenharmony_ci	 */
185462306a36Sopenharmony_ci	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
185562306a36Sopenharmony_ci		return true;
185662306a36Sopenharmony_ci
185762306a36Sopenharmony_ci	mutex_lock(&mf_mutex);
185862306a36Sopenharmony_ci
185962306a36Sopenharmony_ci	raw_hwp_head = raw_hwp_list_head(folio);
186062306a36Sopenharmony_ci	llist_for_each_entry(p, raw_hwp_head->first, node) {
186162306a36Sopenharmony_ci		if (page == p->page) {
186262306a36Sopenharmony_ci			ret = true;
186362306a36Sopenharmony_ci			break;
186462306a36Sopenharmony_ci		}
186562306a36Sopenharmony_ci	}
186662306a36Sopenharmony_ci
186762306a36Sopenharmony_ci	mutex_unlock(&mf_mutex);
186862306a36Sopenharmony_ci
186962306a36Sopenharmony_ci	return ret;
187062306a36Sopenharmony_ci}
187162306a36Sopenharmony_ci
187262306a36Sopenharmony_cistatic unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
187362306a36Sopenharmony_ci{
187462306a36Sopenharmony_ci	struct llist_node *head;
187562306a36Sopenharmony_ci	struct raw_hwp_page *p, *next;
187662306a36Sopenharmony_ci	unsigned long count = 0;
187762306a36Sopenharmony_ci
187862306a36Sopenharmony_ci	head = llist_del_all(raw_hwp_list_head(folio));
187962306a36Sopenharmony_ci	llist_for_each_entry_safe(p, next, head, node) {
188062306a36Sopenharmony_ci		if (move_flag)
188162306a36Sopenharmony_ci			SetPageHWPoison(p->page);
188262306a36Sopenharmony_ci		else
188362306a36Sopenharmony_ci			num_poisoned_pages_sub(page_to_pfn(p->page), 1);
188462306a36Sopenharmony_ci		kfree(p);
188562306a36Sopenharmony_ci		count++;
188662306a36Sopenharmony_ci	}
188762306a36Sopenharmony_ci	return count;
188862306a36Sopenharmony_ci}
188962306a36Sopenharmony_ci
189062306a36Sopenharmony_cistatic int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
189162306a36Sopenharmony_ci{
189262306a36Sopenharmony_ci	struct llist_head *head;
189362306a36Sopenharmony_ci	struct raw_hwp_page *raw_hwp;
189462306a36Sopenharmony_ci	struct raw_hwp_page *p, *next;
189562306a36Sopenharmony_ci	int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
189662306a36Sopenharmony_ci
189762306a36Sopenharmony_ci	/*
189862306a36Sopenharmony_ci	 * Once the hwpoison hugepage has lost reliable raw error info,
189962306a36Sopenharmony_ci	 * there is little point in keeping additional error info precisely,
190062306a36Sopenharmony_ci	 * so skip adding additional raw error info.
190162306a36Sopenharmony_ci	 */
190262306a36Sopenharmony_ci	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
190362306a36Sopenharmony_ci		return -EHWPOISON;
190462306a36Sopenharmony_ci	head = raw_hwp_list_head(folio);
190562306a36Sopenharmony_ci	llist_for_each_entry_safe(p, next, head->first, node) {
190662306a36Sopenharmony_ci		if (p->page == page)
190762306a36Sopenharmony_ci			return -EHWPOISON;
190862306a36Sopenharmony_ci	}
190962306a36Sopenharmony_ci
191062306a36Sopenharmony_ci	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
191162306a36Sopenharmony_ci	if (raw_hwp) {
191262306a36Sopenharmony_ci		raw_hwp->page = page;
191362306a36Sopenharmony_ci		llist_add(&raw_hwp->node, head);
191462306a36Sopenharmony_ci		/* the first error event will be counted in action_result(). */
191562306a36Sopenharmony_ci		if (ret)
191662306a36Sopenharmony_ci			num_poisoned_pages_inc(page_to_pfn(page));
191762306a36Sopenharmony_ci	} else {
191862306a36Sopenharmony_ci		/*
191962306a36Sopenharmony_ci		 * Failed to save raw error info.  We no longer trace all
192062306a36Sopenharmony_ci		 * hwpoisoned subpages, and we must refuse to free/dissolve
192162306a36Sopenharmony_ci		 * this hwpoisoned hugepage.
192262306a36Sopenharmony_ci		 */
192362306a36Sopenharmony_ci		folio_set_hugetlb_raw_hwp_unreliable(folio);
192462306a36Sopenharmony_ci		/*
192562306a36Sopenharmony_ci		 * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not
192662306a36Sopenharmony_ci		 * used any more, so free it.
192762306a36Sopenharmony_ci		 */
192862306a36Sopenharmony_ci		__folio_free_raw_hwp(folio, false);
192962306a36Sopenharmony_ci	}
193062306a36Sopenharmony_ci	return ret;
193162306a36Sopenharmony_ci}
193262306a36Sopenharmony_ci
193362306a36Sopenharmony_cistatic unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
193462306a36Sopenharmony_ci{
193562306a36Sopenharmony_ci	/*
193662306a36Sopenharmony_ci	 * hugetlb_vmemmap_optimized hugepages can't be freed because struct
193762306a36Sopenharmony_ci	 * pages for tail pages are required but they don't exist.
193862306a36Sopenharmony_ci	 */
193962306a36Sopenharmony_ci	if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio))
194062306a36Sopenharmony_ci		return 0;
194162306a36Sopenharmony_ci
194262306a36Sopenharmony_ci	/*
194362306a36Sopenharmony_ci	 * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by
194462306a36Sopenharmony_ci	 * definition.
194562306a36Sopenharmony_ci	 */
194662306a36Sopenharmony_ci	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
194762306a36Sopenharmony_ci		return 0;
194862306a36Sopenharmony_ci
194962306a36Sopenharmony_ci	return __folio_free_raw_hwp(folio, move_flag);
195062306a36Sopenharmony_ci}
195162306a36Sopenharmony_ci
195262306a36Sopenharmony_civoid folio_clear_hugetlb_hwpoison(struct folio *folio)
195362306a36Sopenharmony_ci{
195462306a36Sopenharmony_ci	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
195562306a36Sopenharmony_ci		return;
195662306a36Sopenharmony_ci	if (folio_test_hugetlb_vmemmap_optimized(folio))
195762306a36Sopenharmony_ci		return;
195862306a36Sopenharmony_ci	folio_clear_hwpoison(folio);
195962306a36Sopenharmony_ci	folio_free_raw_hwp(folio, true);
196062306a36Sopenharmony_ci}
196162306a36Sopenharmony_ci
196262306a36Sopenharmony_ci/*
196362306a36Sopenharmony_ci * Called from hugetlb code with hugetlb_lock held.
196462306a36Sopenharmony_ci *
196562306a36Sopenharmony_ci * Return values:
196662306a36Sopenharmony_ci *   0             - free hugepage
196762306a36Sopenharmony_ci *   1             - in-use hugepage
196862306a36Sopenharmony_ci *   2             - not a hugepage
196962306a36Sopenharmony_ci *   -EBUSY        - the hugepage is busy (try to retry)
197062306a36Sopenharmony_ci *   -EHWPOISON    - the hugepage is already hwpoisoned
197162306a36Sopenharmony_ci */
197262306a36Sopenharmony_ciint __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
197362306a36Sopenharmony_ci				 bool *migratable_cleared)
197462306a36Sopenharmony_ci{
197562306a36Sopenharmony_ci	struct page *page = pfn_to_page(pfn);
197662306a36Sopenharmony_ci	struct folio *folio = page_folio(page);
197762306a36Sopenharmony_ci	int ret = 2;	/* fallback to normal page handling */
197862306a36Sopenharmony_ci	bool count_increased = false;
197962306a36Sopenharmony_ci
198062306a36Sopenharmony_ci	if (!folio_test_hugetlb(folio))
198162306a36Sopenharmony_ci		goto out;
198262306a36Sopenharmony_ci
198362306a36Sopenharmony_ci	if (flags & MF_COUNT_INCREASED) {
198462306a36Sopenharmony_ci		ret = 1;
198562306a36Sopenharmony_ci		count_increased = true;
198662306a36Sopenharmony_ci	} else if (folio_test_hugetlb_freed(folio)) {
198762306a36Sopenharmony_ci		ret = 0;
198862306a36Sopenharmony_ci	} else if (folio_test_hugetlb_migratable(folio)) {
198962306a36Sopenharmony_ci		ret = folio_try_get(folio);
199062306a36Sopenharmony_ci		if (ret)
199162306a36Sopenharmony_ci			count_increased = true;
199262306a36Sopenharmony_ci	} else {
199362306a36Sopenharmony_ci		ret = -EBUSY;
199462306a36Sopenharmony_ci		if (!(flags & MF_NO_RETRY))
199562306a36Sopenharmony_ci			goto out;
199662306a36Sopenharmony_ci	}
199762306a36Sopenharmony_ci
199862306a36Sopenharmony_ci	if (folio_set_hugetlb_hwpoison(folio, page)) {
199962306a36Sopenharmony_ci		ret = -EHWPOISON;
200062306a36Sopenharmony_ci		goto out;
200162306a36Sopenharmony_ci	}
200262306a36Sopenharmony_ci
200362306a36Sopenharmony_ci	/*
200462306a36Sopenharmony_ci	 * Clear hugetlb_migratable for hwpoisoned hugepages to prevent them
200562306a36Sopenharmony_ci	 * from being migrated by memory hotremove.
200662306a36Sopenharmony_ci	 */
200762306a36Sopenharmony_ci	if (count_increased && folio_test_hugetlb_migratable(folio)) {
200862306a36Sopenharmony_ci		folio_clear_hugetlb_migratable(folio);
200962306a36Sopenharmony_ci		*migratable_cleared = true;
201062306a36Sopenharmony_ci	}
201162306a36Sopenharmony_ci
201262306a36Sopenharmony_ci	return ret;
201362306a36Sopenharmony_ciout:
201462306a36Sopenharmony_ci	if (count_increased)
201562306a36Sopenharmony_ci		folio_put(folio);
201662306a36Sopenharmony_ci	return ret;
201762306a36Sopenharmony_ci}
201862306a36Sopenharmony_ci
201962306a36Sopenharmony_ci/*
202062306a36Sopenharmony_ci * Taking a refcount on hugetlb pages needs extra care about race conditions
202162306a36Sopenharmony_ci * with basic operations like hugepage allocation/free/demotion.
202262306a36Sopenharmony_ci * So some of the prechecks for hwpoison (pinning, and testing/setting
202362306a36Sopenharmony_ci * PageHWPoison) should be done within a single hugetlb_lock section.
202462306a36Sopenharmony_ci */
202562306a36Sopenharmony_cistatic int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
202662306a36Sopenharmony_ci{
202762306a36Sopenharmony_ci	int res;
202862306a36Sopenharmony_ci	struct page *p = pfn_to_page(pfn);
202962306a36Sopenharmony_ci	struct folio *folio;
203062306a36Sopenharmony_ci	unsigned long page_flags;
203162306a36Sopenharmony_ci	bool migratable_cleared = false;
203262306a36Sopenharmony_ci
203362306a36Sopenharmony_ci	*hugetlb = 1;
203462306a36Sopenharmony_ciretry:
203562306a36Sopenharmony_ci	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
203662306a36Sopenharmony_ci	if (res == 2) { /* fallback to normal page handling */
203762306a36Sopenharmony_ci		*hugetlb = 0;
203862306a36Sopenharmony_ci		return 0;
203962306a36Sopenharmony_ci	} else if (res == -EHWPOISON) {
204062306a36Sopenharmony_ci		pr_err("%#lx: already hardware poisoned\n", pfn);
204162306a36Sopenharmony_ci		if (flags & MF_ACTION_REQUIRED) {
204262306a36Sopenharmony_ci			folio = page_folio(p);
204362306a36Sopenharmony_ci			res = kill_accessing_process(current, folio_pfn(folio), flags);
204462306a36Sopenharmony_ci		}
204562306a36Sopenharmony_ci		return res;
204662306a36Sopenharmony_ci	} else if (res == -EBUSY) {
204762306a36Sopenharmony_ci		if (!(flags & MF_NO_RETRY)) {
204862306a36Sopenharmony_ci			flags |= MF_NO_RETRY;
204962306a36Sopenharmony_ci			goto retry;
205062306a36Sopenharmony_ci		}
205162306a36Sopenharmony_ci		return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
205262306a36Sopenharmony_ci	}
205362306a36Sopenharmony_ci
205462306a36Sopenharmony_ci	folio = page_folio(p);
205562306a36Sopenharmony_ci	folio_lock(folio);
205662306a36Sopenharmony_ci
205762306a36Sopenharmony_ci	if (hwpoison_filter(p)) {
205862306a36Sopenharmony_ci		folio_clear_hugetlb_hwpoison(folio);
205962306a36Sopenharmony_ci		if (migratable_cleared)
206062306a36Sopenharmony_ci			folio_set_hugetlb_migratable(folio);
206162306a36Sopenharmony_ci		folio_unlock(folio);
206262306a36Sopenharmony_ci		if (res == 1)
206362306a36Sopenharmony_ci			folio_put(folio);
206462306a36Sopenharmony_ci		return -EOPNOTSUPP;
206562306a36Sopenharmony_ci	}
206662306a36Sopenharmony_ci
206762306a36Sopenharmony_ci	/*
206862306a36Sopenharmony_ci	 * Handle a free hugepage.  The possible race with hugepage allocation
206962306a36Sopenharmony_ci	 * or demotion is prevented by the PageHWPoison flag.
207062306a36Sopenharmony_ci	 */
207162306a36Sopenharmony_ci	if (res == 0) {
207262306a36Sopenharmony_ci		folio_unlock(folio);
207362306a36Sopenharmony_ci		if (__page_handle_poison(p) >= 0) {
207462306a36Sopenharmony_ci			page_ref_inc(p);
207562306a36Sopenharmony_ci			res = MF_RECOVERED;
207662306a36Sopenharmony_ci		} else {
207762306a36Sopenharmony_ci			res = MF_FAILED;
207862306a36Sopenharmony_ci		}
207962306a36Sopenharmony_ci		return action_result(pfn, MF_MSG_FREE_HUGE, res);
208062306a36Sopenharmony_ci	}
208162306a36Sopenharmony_ci
208262306a36Sopenharmony_ci	page_flags = folio->flags;
208362306a36Sopenharmony_ci
208462306a36Sopenharmony_ci	if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) {
208562306a36Sopenharmony_ci		folio_unlock(folio);
208662306a36Sopenharmony_ci		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
208762306a36Sopenharmony_ci	}
208862306a36Sopenharmony_ci
208962306a36Sopenharmony_ci	return identify_page_state(pfn, p, page_flags);
209062306a36Sopenharmony_ci}
209162306a36Sopenharmony_ci
209262306a36Sopenharmony_ci#else
209362306a36Sopenharmony_cistatic inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
209462306a36Sopenharmony_ci{
209562306a36Sopenharmony_ci	return 0;
209662306a36Sopenharmony_ci}
209762306a36Sopenharmony_ci
209862306a36Sopenharmony_cistatic inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
209962306a36Sopenharmony_ci{
210062306a36Sopenharmony_ci	return 0;
210162306a36Sopenharmony_ci}
210262306a36Sopenharmony_ci#endif	/* CONFIG_HUGETLB_PAGE */
210362306a36Sopenharmony_ci
210462306a36Sopenharmony_ci/* Drop the extra refcount in case we come from madvise() */
210562306a36Sopenharmony_cistatic void put_ref_page(unsigned long pfn, int flags)
210662306a36Sopenharmony_ci{
210762306a36Sopenharmony_ci	struct page *page;
210862306a36Sopenharmony_ci
210962306a36Sopenharmony_ci	if (!(flags & MF_COUNT_INCREASED))
211062306a36Sopenharmony_ci		return;
211162306a36Sopenharmony_ci
211262306a36Sopenharmony_ci	page = pfn_to_page(pfn);
211362306a36Sopenharmony_ci	if (page)
211462306a36Sopenharmony_ci		put_page(page);
211562306a36Sopenharmony_ci}
211662306a36Sopenharmony_ci
211762306a36Sopenharmony_cistatic int memory_failure_dev_pagemap(unsigned long pfn, int flags,
211862306a36Sopenharmony_ci		struct dev_pagemap *pgmap)
211962306a36Sopenharmony_ci{
212062306a36Sopenharmony_ci	int rc = -ENXIO;
212162306a36Sopenharmony_ci
212262306a36Sopenharmony_ci	/* device metadata space is not recoverable */
212362306a36Sopenharmony_ci	if (!pgmap_pfn_valid(pgmap, pfn))
212462306a36Sopenharmony_ci		goto out;
212562306a36Sopenharmony_ci
212662306a36Sopenharmony_ci	/*
212762306a36Sopenharmony_ci	 * Call the driver's implementation to handle the memory failure,
212862306a36Sopenharmony_ci	 * otherwise fall back to the generic handler.
212962306a36Sopenharmony_ci	 */
213062306a36Sopenharmony_ci	if (pgmap_has_memory_failure(pgmap)) {
213162306a36Sopenharmony_ci		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
213262306a36Sopenharmony_ci		/*
213362306a36Sopenharmony_ci		 * Fall back to generic handler too if operation is not
213462306a36Sopenharmony_ci		 * supported inside the driver/device/filesystem.
213562306a36Sopenharmony_ci		 */
213662306a36Sopenharmony_ci		if (rc != -EOPNOTSUPP)
213762306a36Sopenharmony_ci			goto out;
213862306a36Sopenharmony_ci	}
213962306a36Sopenharmony_ci
214062306a36Sopenharmony_ci	rc = mf_generic_kill_procs(pfn, flags, pgmap);
214162306a36Sopenharmony_ciout:
214262306a36Sopenharmony_ci	/* drop pgmap ref acquired in caller */
214362306a36Sopenharmony_ci	put_dev_pagemap(pgmap);
214462306a36Sopenharmony_ci	if (rc != -EOPNOTSUPP)
214562306a36Sopenharmony_ci		action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
214662306a36Sopenharmony_ci	return rc;
214762306a36Sopenharmony_ci}
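
/*
 * Illustrative sketch only (the "foo" names are hypothetical): a pgmap
 * owner opts in to the callback used above roughly like
 *
 *	static int foo_memory_failure(struct dev_pagemap *pgmap,
 *				      unsigned long pfn,
 *				      unsigned long nr_pages, int mf_flags)
 *	{
 *		// e.g. notify the filesystem about the poisoned range
 *		return -EOPNOTSUPP;	// ask for the generic fallback
 *	}
 *
 *	static const struct dev_pagemap_ops foo_pgmap_ops = {
 *		.memory_failure	= foo_memory_failure,
 *	};
 *
 * Returning 0 means the failure was fully handled by the driver.
 */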

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 *
 * Return: 0 if the memory error was successfully handled,
 *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
 *         < 0 (other than -EOPNOTSUPP) on failure.
 */
int memory_failure(unsigned long pfn, int flags)
{
	struct page *p;
	struct page *hpage;
	struct dev_pagemap *pgmap;
	int res = 0;
	unsigned long page_flags;
	bool retry = true;
	int hugetlb = 0;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure on page %lx", pfn);

	mutex_lock(&mf_mutex);

	if (!(flags & MF_SW_SIMULATED))
		hw_memory_failure = true;

	p = pfn_to_online_page(pfn);
	if (!p) {
		res = arch_memory_failure(pfn, flags);
		if (res == 0)
			goto unlock_mutex;

		if (pfn_valid(pfn)) {
			pgmap = get_dev_pagemap(pfn, NULL);
			put_ref_page(pfn, flags);
			if (pgmap) {
				res = memory_failure_dev_pagemap(pfn, flags,
								 pgmap);
				goto unlock_mutex;
			}
		}
		pr_err("%#lx: memory outside kernel control\n", pfn);
		res = -ENXIO;
		goto unlock_mutex;
	}

try_again:
	res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
	if (hugetlb)
		goto unlock_mutex;

	if (TestSetPageHWPoison(p)) {
		pr_err("%#lx: already hardware poisoned\n", pfn);
		res = -EHWPOISON;
		if (flags & MF_ACTION_REQUIRED)
			res = kill_accessing_process(current, pfn, flags);
		if (flags & MF_COUNT_INCREASED)
			put_page(p);
		goto unlock_mutex;
	}

	/*
	 * There is nothing we need to, or can, do about count=0 pages:
	 * 1) it's a free page, and therefore in safe hands:
	 *    check_new_page() will be the gatekeeper.
	 * 2) it's part of a non-compound high order page.
	 *    That implies some kernel user: we cannot stop them from
	 *    reading/writing the page; let's hope that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up the page count from 0,
	 * as that may cause a page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED)) {
		res = get_hwpoison_page(p, flags);
		if (!res) {
			if (is_free_buddy_page(p)) {
				if (take_page_off_buddy(p)) {
					page_ref_inc(p);
					res = MF_RECOVERED;
				} else {
					/* We lost the race, try again */
					if (retry) {
						ClearPageHWPoison(p);
						retry = false;
						goto try_again;
					}
					res = MF_FAILED;
				}
				res = action_result(pfn, MF_MSG_BUDDY, res);
			} else {
				res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			}
			goto unlock_mutex;
		} else if (res < 0) {
			res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
			goto unlock_mutex;
		}
	}

	hpage = compound_head(p);
	if (PageTransHuge(hpage)) {
		/*
		 * The flag must be set after the refcount is bumped,
		 * otherwise it may race with THP split.
		 * And the flag can't be set in get_hwpoison_page() since
		 * it is called by soft offline too and it is just called
		 * for !MF_COUNT_INCREASED.  So here seems to be the best
		 * place.
		 *
		 * We don't need to care about the error handling paths
		 * above for get_hwpoison_page() since they handle either
		 * a free page or an unhandlable page.  The refcount is
		 * bumped iff the page is a valid handlable page.
		 */
		SetPageHasHWPoisoned(hpage);
		if (try_to_split_thp_page(p) < 0) {
			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
			goto unlock_mutex;
		}
		VM_BUG_ON_PAGE(!page_count(p), p);
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __SetPageLocked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, but that's not a big loss.
	 */
	shake_page(p);

	lock_page(p);

	/*
	 * We only intend to deal with non-compound pages here.  However,
	 * the page could have become compound due to a race window.  If
	 * this happens, try again in the hope of handling the page on the
	 * next round.
	 */
	if (PageCompound(p)) {
		if (retry) {
			ClearPageHWPoison(p);
			unlock_page(p);
			put_page(p);
			flags &= ~MF_COUNT_INCREASED;
			retry = false;
			goto try_again;
		}
		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		goto unlock_page;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
	 * correctly, we save a copy of the page flags at this time.
	 */
	page_flags = p->flags;

	if (hwpoison_filter(p)) {
		ClearPageHWPoison(p);
		unlock_page(p);
		put_page(p);
		res = -EOPNOTSUPP;
		goto unlock_mutex;
	}

	/*
	 * __munlock_folio() may clear the LRU flag of a page under writeback
	 * without holding the page lock.  We need to wait for writeback
	 * completion on this page, or it may trigger a VFS BUG while
	 * evicting the inode.
	 */
	if (!PageLRU(p) && !PageWriteback(p))
		goto identify_page_state;

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on failure: __filemap_remove_folio() assumes an unmapped page.
	 */
	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		goto unlock_page;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		goto unlock_page;
	}

identify_page_state:
	res = identify_page_state(pfn, p, page_flags);
	mutex_unlock(&mf_mutex);
	return res;
unlock_page:
	unlock_page(p);
unlock_mutex:
	mutex_unlock(&mf_mutex);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
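
/*
 * Illustrative call sequence (a sketch, not taken from any particular
 * caller): from process context, with interrupts enabled,
 *
 *	ret = memory_failure(pfn, 0);
 *	if (ret == -EHWPOISON)
 *		;	// page was already poisoned by an earlier event
 *	else if (ret)
 *		;	// recovery failed, the page remains PG_hwpoison
 *
 * Callers pass MF_ACTION_REQUIRED instead when the current task
 * synchronously consumed the error.
 */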

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
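
/*
 * Design note: each CPU owns a small (16-entry) kfifo so that
 * memory_failure_queue() can run from interrupt context without
 * allocating memory; entries that do not fit are reported and dropped,
 * and memory_failure_work_func() later drains the fifo from process
 * context.
 */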

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * recovery of the error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);
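
/*
 * Example (sketch): a firmware-first error handler running in interrupt
 * context can defer recovery with
 *
 *	memory_failure_queue(pfn, 0);
 *
 * and the queued work then runs memory_failure() from process context.
 * (The GHES/APEI code is one such caller; shown here purely as an
 * illustration.)
 */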

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = container_of(work, struct memory_failure_cpu, work);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(entry.pfn, entry.flags);
		else
			memory_failure(entry.pfn, entry.flags);
	}
}

/*
 * Process memory_failure work queued on the specified CPU.
 * Used to avoid return-to-userspace racing with the memory_failure workqueue.
 */
void memory_failure_queue_kick(int cpu)
{
	struct memory_failure_cpu *mf_cpu;

	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
	cancel_work_sync(&mf_cpu->work);
	memory_failure_work_func(&mf_cpu->work);
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	register_sysctl_init("vm", memory_failure_table);

	return 0;
}
core_initcall(memory_failure_init);

#undef pr_fmt
#define pr_fmt(fmt)	"" fmt
#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is done only at the software level, so it only works for
 * Linux-injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct folio *folio;
	struct page *p;
	int ret = -EBUSY, ghp;
	unsigned long count = 1;
	bool huge = false;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	folio = page_folio(p);

	mutex_lock(&mf_mutex);

	if (hw_memory_failure) {
		unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
				 pfn, &unpoison_rs);
		ret = -EOPNOTSUPP;
		goto unlock_mutex;
	}

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		goto unlock_mutex;
	}

	if (folio_ref_count(folio) > 1) {
		unpoison_pr_info("Unpoison: Someone has grabbed the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		goto unlock_mutex;
	}

	if (folio_test_slab(folio) || PageTable(&folio->page) ||
	    folio_test_reserved(folio) || PageOffline(&folio->page))
		goto unlock_mutex;

	/*
	 * Note that folio->_mapcount is overloaded in SLAB, so the simple test
	 * in folio_mapped() has to be done after the folio_test_slab() check.
	 */
	if (folio_mapped(folio)) {
		unpoison_pr_info("Unpoison: Someone has mapped the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		goto unlock_mutex;
	}

	if (folio_mapping(folio)) {
		unpoison_pr_info("Unpoison: the hwpoison page has a non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		goto unlock_mutex;
	}

	ghp = get_hwpoison_page(p, MF_UNPOISON);
	if (!ghp) {
		if (PageHuge(p)) {
			huge = true;
			count = folio_free_raw_hwp(folio, false);
			if (count == 0)
				goto unlock_mutex;
		}
		ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
	} else if (ghp < 0) {
		if (ghp == -EHWPOISON) {
			ret = put_page_back_buddy(p) ? 0 : -EBUSY;
		} else {
			ret = ghp;
			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
					 pfn, &unpoison_rs);
		}
	} else {
		if (PageHuge(p)) {
			huge = true;
			count = folio_free_raw_hwp(folio, false);
			if (count == 0) {
				folio_put(folio);
				goto unlock_mutex;
			}
		}

		folio_put(folio);
		if (TestClearPageHWPoison(p)) {
			folio_put(folio);
			ret = 0;
		}
	}

unlock_mutex:
	mutex_unlock(&mf_mutex);
	if (!ret) {
		if (!huge)
			num_poisoned_pages_sub(pfn, 1);
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 page_to_pfn(p), &unpoison_rs);
	}
	return ret;
}
EXPORT_SYMBOL(unpoison_memory);
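
/*
 * Typical software-injection round trip (assuming CONFIG_HWPOISON_INJECT;
 * the paths belong to the hwpoison-inject debugfs module and are shown
 * only as an illustration):
 *
 *	echo $pfn > /sys/kernel/debug/hwpoison/corrupt-pfn	# poison
 *	echo $pfn > /sys/kernel/debug/hwpoison/unpoison-pfn	# undo
 */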

static bool isolate_page(struct page *page, struct list_head *pagelist)
{
	bool isolated = false;

	if (PageHuge(page)) {
		isolated = isolate_hugetlb(page_folio(page), pagelist);
	} else {
		bool lru = !__PageMovable(page);

		if (lru)
			isolated = isolate_lru_page(page);
		else
			isolated = isolate_movable_page(page,
							ISOLATE_UNEVICTABLE);

		if (isolated) {
			list_add(&page->lru, pagelist);
			if (lru)
				inc_node_page_state(page, NR_ISOLATED_ANON +
						    page_is_file_lru(page));
		}
	}

	/*
	 * If we succeed in isolating the page, we grabbed another refcount on
	 * the page, so we can safely drop the one we got from get_any_page().
	 * If we failed to isolate the page, it means that we cannot go further
	 * and we will return an error, so drop the reference we got from
	 * get_any_page() as well.
	 */
	put_page(page);
	return isolated;
}

/*
 * soft_offline_in_use_page handles hugetlb pages and non-hugetlb pages.
 * If the page is a non-dirty, unmapped page-cache page, it simply invalidates
 * it. If the page is mapped, it migrates the contents over.
 */
static int soft_offline_in_use_page(struct page *page)
{
	long ret = 0;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	char const *msg_page[] = {"page", "hugepage"};
	bool huge = PageHuge(page);
	LIST_HEAD(pagelist);
	struct migration_target_control mtc = {
		.nid = NUMA_NO_NODE,
		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
	};

	if (!huge && PageTransHuge(hpage)) {
		if (try_to_split_thp_page(page)) {
			pr_info("soft offline: %#lx: thp split failed\n", pfn);
			return -EBUSY;
		}
		hpage = page;
	}

	lock_page(page);
	if (!huge)
		wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return 0;
	}

	if (!huge && PageLRU(page) && !PageSwapCache(page))
		/*
		 * Try to invalidate first. This should work for
		 * non-dirty, unmapped page-cache pages.
		 */
		ret = invalidate_inode_page(page);
	unlock_page(page);

	if (ret) {
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		page_handle_poison(page, false, true);
		return 0;
	}

	if (isolate_page(hpage, &pagelist)) {
		ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
		if (!ret) {
			bool release = !huge;

			if (!page_handle_poison(page, huge, release))
				ret = -EBUSY;
		} else {
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
				pfn, msg_page[huge], ret, &page->flags);
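			/*
			 * A positive return from migrate_pages() is the
			 * number of pages that could not be migrated, not
			 * an errno, so fold it into -EBUSY.
			 */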
			if (ret > 0)
				ret = -EBUSY;
		}
	} else {
		pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
			pfn, msg_page[huge], page_count(page), &page->flags);
		ret = -EBUSY;
	}
	return ret;
}

/**
 * soft_offline_page - Soft offline a page.
 * @pfn: pfn to soft-offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success,
 *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
 *         or a negative errno otherwise.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(unsigned long pfn, int flags)
{
	int ret;
	bool try_again = true;
	struct page *page;

	if (!pfn_valid(pfn)) {
		WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
		return -ENXIO;
	}

	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
	page = pfn_to_online_page(pfn);
	if (!page) {
		put_ref_page(pfn, flags);
		return -EIO;
	}

	mutex_lock(&mf_mutex);

	if (PageHWPoison(page)) {
		pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
		put_ref_page(pfn, flags);
		mutex_unlock(&mf_mutex);
		return 0;
	}

retry:
	get_online_mems();
	ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
	put_online_mems();

	if (hwpoison_filter(page)) {
		if (ret > 0)
			put_page(page);

		mutex_unlock(&mf_mutex);
		return -EOPNOTSUPP;
	}

	if (ret > 0) {
		ret = soft_offline_in_use_page(page);
	} else if (ret == 0) {
		if (!page_handle_poison(page, true, false)) {
			if (try_again) {
				try_again = false;
				flags &= ~MF_COUNT_INCREASED;
				goto retry;
			}
			ret = -EBUSY;
		}
	}

	mutex_unlock(&mf_mutex);

	return ret;
}
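
/*
 * Illustrative entry points (not exhaustive): soft offlining is usually
 * requested from user space, e.g.
 *
 *	madvise(addr, length, MADV_SOFT_OFFLINE);
 *	echo $phys_addr > /sys/devices/system/memory/soft_offline_page
 *
 * or queued from a corrected-error policy such as the RAS CEC via
 * memory_failure_queue(pfn, MF_SOFT_OFFLINE).
 */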