162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2008, 2009 Intel Corporation 462306a36Sopenharmony_ci * Authors: Andi Kleen, Fengguang Wu 562306a36Sopenharmony_ci * 662306a36Sopenharmony_ci * High level machine check handler. Handles pages reported by the 762306a36Sopenharmony_ci * hardware as being corrupted usually due to a multi-bit ECC memory or cache 862306a36Sopenharmony_ci * failure. 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * In addition there is a "soft offline" entry point that allows stop using 1162306a36Sopenharmony_ci * not-yet-corrupted-by-suspicious pages without killing anything. 1262306a36Sopenharmony_ci * 1362306a36Sopenharmony_ci * Handles page cache pages in various states. The tricky part 1462306a36Sopenharmony_ci * here is that we can access any page asynchronously in respect to 1562306a36Sopenharmony_ci * other VM users, because memory failures could happen anytime and 1662306a36Sopenharmony_ci * anywhere. This could violate some of their assumptions. This is why 1762306a36Sopenharmony_ci * this code has to be extremely careful. Generally it tries to use 1862306a36Sopenharmony_ci * normal locking rules, as in get the standard locks, even if that means 1962306a36Sopenharmony_ci * the error handling takes potentially a long time. 2062306a36Sopenharmony_ci * 2162306a36Sopenharmony_ci * It can be very tempting to add handling for obscure cases here. 2262306a36Sopenharmony_ci * In general any code for handling new cases should only be added iff: 2362306a36Sopenharmony_ci * - You know how to test it. 2462306a36Sopenharmony_ci * - You have a test that can be added to mce-test 2562306a36Sopenharmony_ci * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ 2662306a36Sopenharmony_ci * - The case actually shows up as a frequent (top 10) page state in 2762306a36Sopenharmony_ci * tools/mm/page-types when running a real workload. 
2862306a36Sopenharmony_ci * 2962306a36Sopenharmony_ci * There are several operations here with exponential complexity because 3062306a36Sopenharmony_ci * of unsuitable VM data structures. For example the operation to map back 3162306a36Sopenharmony_ci * from RMAP chains to processes has to walk the complete process list and 3262306a36Sopenharmony_ci * has non linear complexity with the number. But since memory corruptions 3362306a36Sopenharmony_ci * are rare we hope to get away with this. This avoids impacting the core 3462306a36Sopenharmony_ci * VM. 3562306a36Sopenharmony_ci */ 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_ci#define pr_fmt(fmt) "Memory failure: " fmt 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_ci#include <linux/kernel.h> 4062306a36Sopenharmony_ci#include <linux/mm.h> 4162306a36Sopenharmony_ci#include <linux/page-flags.h> 4262306a36Sopenharmony_ci#include <linux/sched/signal.h> 4362306a36Sopenharmony_ci#include <linux/sched/task.h> 4462306a36Sopenharmony_ci#include <linux/dax.h> 4562306a36Sopenharmony_ci#include <linux/ksm.h> 4662306a36Sopenharmony_ci#include <linux/rmap.h> 4762306a36Sopenharmony_ci#include <linux/export.h> 4862306a36Sopenharmony_ci#include <linux/pagemap.h> 4962306a36Sopenharmony_ci#include <linux/swap.h> 5062306a36Sopenharmony_ci#include <linux/backing-dev.h> 5162306a36Sopenharmony_ci#include <linux/migrate.h> 5262306a36Sopenharmony_ci#include <linux/slab.h> 5362306a36Sopenharmony_ci#include <linux/swapops.h> 5462306a36Sopenharmony_ci#include <linux/hugetlb.h> 5562306a36Sopenharmony_ci#include <linux/memory_hotplug.h> 5662306a36Sopenharmony_ci#include <linux/mm_inline.h> 5762306a36Sopenharmony_ci#include <linux/memremap.h> 5862306a36Sopenharmony_ci#include <linux/kfifo.h> 5962306a36Sopenharmony_ci#include <linux/ratelimit.h> 6062306a36Sopenharmony_ci#include <linux/pagewalk.h> 6162306a36Sopenharmony_ci#include <linux/shmem_fs.h> 6262306a36Sopenharmony_ci#include <linux/sysctl.h> 6362306a36Sopenharmony_ci#include "swap.h" 
#include "internal.h"
#include "ras/ras_event.h"

/*
 * vm.memory_failure_early_kill: when set, MCE-unaware processes mapping a
 * poisoned page are signalled as soon as the failure is handled, not only
 * when they actually touch the page (consulted by find_early_kill_thread()).
 */
static int sysctl_memory_failure_early_kill __read_mostly;

/*
 * vm.memory_failure_recovery: 0/1 master switch for recovery attempts.
 * NOTE(review): its consumer is outside this chunk -- confirm against the
 * rest of the file.
 */
static int sysctl_memory_failure_recovery __read_mostly = 1;

/* Global count of currently hw-poisoned pages; see num_poisoned_pages_inc/_sub. */
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

/*
 * NOTE(review): set/read outside this chunk; from the name it records that a
 * real (non-injected) hardware memory failure was seen -- confirm against the
 * rest of the file.
 */
static bool hw_memory_failure __read_mostly = false;

/* Serializes memory-failure handling (taken by code outside this chunk). */
static DEFINE_MUTEX(mf_mutex);

/*
 * Account one newly poisoned page, both in the global counter and in the
 * per-memory-block counter covering @pfn.
 */
void num_poisoned_pages_inc(unsigned long pfn)
{
	atomic_long_inc(&num_poisoned_pages);
	memblk_nr_poison_inc(pfn);
}

/*
 * Drop @i pages from the poison accounting.  A @pfn of -1UL skips the
 * per-memory-block update.
 */
void num_poisoned_pages_sub(unsigned long pfn, long i)
{
	atomic_long_sub(i, &num_poisoned_pages);
	if (pfn != -1UL)
		memblk_nr_poison_sub(pfn, i);
}

/**
 * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
 * @_name: name of the file in the per NUMA sysfs directory.
 */
#define MF_ATTR_RO(_name)					\
static ssize_t _name##_show(struct device *dev,			\
			    struct device_attribute *attr,	\
			    char *buf)				\
{								\
	struct memory_failure_stats *mf_stats =			\
		&NODE_DATA(dev->id)->mf_stats;			\
	return sprintf(buf, "%lu\n", mf_stats->_name);		\
}								\
static DEVICE_ATTR_RO(_name)

MF_ATTR_RO(total);
MF_ATTR_RO(ignored);
MF_ATTR_RO(failed);
MF_ATTR_RO(delayed);
MF_ATTR_RO(recovered);

/* Per-node "memory_failure" sysfs group exposing the counters above. */
static struct attribute *memory_failure_attr[] = {
	&dev_attr_total.attr,
	&dev_attr_ignored.attr,
	&dev_attr_failed.attr,
	&dev_attr_delayed.attr,
	&dev_attr_recovered.attr,
	NULL,
};

const struct attribute_group memory_failure_attr_group = {
	.name = "memory_failure",
	.attrs = memory_failure_attr,
};

/* Boolean (0/1) sysctl knobs; registration happens outside this chunk. */
static struct ctl_table memory_failure_table[] = {
	{
		.procname	= "memory_failure_early_kill",
		.data		= &sysctl_memory_failure_early_kill,
		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "memory_failure_recovery",
		.data		= &sysctl_memory_failure_recovery,
		.maxlen		= sizeof(sysctl_memory_failure_recovery),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};

/*
 * Return values:
 *   1: the page is dissolved (if needed) and taken off from buddy,
 *   0: the page is dissolved (if needed) and not taken off from buddy,
 * < 0: failed to dissolve.
 */
static int __page_handle_poison(struct page *page)
{
	int ret;

	/*
	 * PCP lists are disabled around the buddy removal so a free page
	 * cannot be sitting on a per-cpu list while we try to isolate it.
	 */
	zone_pcp_disable(page_zone(page));
	ret = dissolve_free_huge_page(page);
	if (!ret)
		ret = take_page_off_buddy(page);
	zone_pcp_enable(page_zone(page));

	return ret;
}

/*
 * Mark @page hwpoisoned and pin it so it is never handed out again.
 * @hugepage_or_freepage: the page must first be dissolved and/or taken off
 * the buddy allocator via __page_handle_poison().
 * @release: drop the caller's page reference before taking our own.
 * Returns true on success, false when the page could not be isolated.
 */
static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{
	if (hugepage_or_freepage) {
		/*
		 * Doing this check for free pages is also fine since dissolve_free_huge_page
		 * returns 0 for non-hugetlb pages as well.
		 */
		if (__page_handle_poison(page) <= 0)
			/*
			 * We could fail to take off the target page from buddy
			 * for example due to racy page allocation, but that's
			 * acceptable because soft-offlined page is not broken
			 * and if someone really wants to use it, they should
			 * take it.
			 */
			return false;
	}

	/*
	 * Mark the page poisoned and keep an extra reference of our own so
	 * the page can never be returned to the allocator.  When @release is
	 * set, the caller's reference is returned first.
	 */
	SetPageHWPoison(page);
	if (release)
		put_page(page);
	page_ref_inc(page);
	num_poisoned_pages_inc(page_to_pfn(page));

	return true;
}

#if IS_ENABLED(CONFIG_HWPOISON_INJECT)

/*
 * Tunables for the hwpoison injection filter.  A dev major/minor of ~0U
 * acts as a wildcard (no filtering on that component).
 */
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

/*
 * Filter by backing device of the page's mapping.
 * Returns 0 when @p passes the filter, -EINVAL otherwise.
 */
static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/* Anonymous or unmapped pages have no backing device to match. */
	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

/*
 * Filter by page flags: pass when the masked stable flags match the
 * configured value.  Returns 0 on pass, -EINVAL otherwise.
 */
static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (eg. libc text), which is harmless. If the target task
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Lastly, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

/*
 * Returns 0 when @p passes every enabled filter (the error should be
 * handled), -EINVAL when any filter rejects it.
 */
int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
/* Injection support compiled out: never filter anything. */
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handle it.
 */

/* One process queued for killing; see kill_procs(). */
struct to_kill {
	struct list_head nd;		/* link in the caller's to_kill list */
	struct task_struct *tsk;	/* task to signal (we hold a reference) */
	unsigned long addr;		/* user virtual address of the bad page */
	short size_shift;		/* log2 of mapping size, sent as si_addr_lsb */
};

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 *
 * Returns 0 on success or a negative error from signal delivery.
 */
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
	struct task_struct *t = tk->tsk;
	short addr_lsb = tk->size_shift;
	int ret = 0;

	pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
			pfn, t->comm, t->pid);

	if ((flags & MF_ACTION_REQUIRED) && (t == current))
		ret = force_sig_mceerr(BUS_MCEERR_AR,
				(void __user *)tk->addr, addr_lsb);
	else
		/*
		 * Signal other processes sharing the page if they have
		 * PF_MCE_EARLY set.
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
				      addr_lsb, t);
	if (ret < 0)
		pr_info("Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}

/*
 * Unknown page type encountered. Try to check whether it can turn PageLRU by
 * lru_add_drain_all.
 */
void shake_page(struct page *p)
{
	if (PageHuge(p))
		return;
	/*
	 * TODO: Could shrink slab caches here if a lightweight range-based
	 * shrinker will be available.
	 */
	if (PageSlab(p))
		return;

	lru_add_drain_all();
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Walk the page tables for @address in @vma and return the mapping-level
 * shift (PUD_SHIFT, PMD_SHIFT or PAGE_SHIFT) of a present devmap mapping,
 * or 0 when nothing suitable is mapped at that address.
 */
static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
					       unsigned long address)
{
	unsigned long ret = 0;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t ptent;

	/* -EFAULT here would mean the caller passed an unmapped address. */
	VM_BUG_ON_VMA(address == -EFAULT, vma);
	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return 0;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;
	if (pud_devmap(*pud))
		return PUD_SHIFT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_devmap(*pmd))
		return PMD_SHIFT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return 0;
	ptent = ptep_get(pte);
	if (pte_present(ptent) && pte_devmap(ptent))
		ret = PAGE_SHIFT;
	pte_unmap(pte);
	return ret;
}

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do.  We just print a message and ignore otherwise.
 */

/* Sentinel: "no fsdax page offset supplied" for __add_to_kill(). */
#define FSDAX_INVALID_PGOFF	ULONG_MAX

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 *
 * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
 * filesystem with a memory failure handler has claimed the
 * memory_failure event. In all other cases, page->index and
 * page->mapping are sufficient for mapping the page back to its
 * corresponding user virtual address.
 */
static void __add_to_kill(struct task_struct *tsk, struct page *p,
			  struct vm_area_struct *vma, struct list_head *to_kill,
			  unsigned long ksm_addr, pgoff_t fsdax_pgoff)
{
	struct to_kill *tk;

	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
	if (!tk) {
		pr_err("Out of memory while machine check handling\n");
		return;
	}

	/* A non-zero @ksm_addr overrides the VMA-derived address (KSM pages). */
	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
	if (is_zone_device_page(p)) {
		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
	} else
		tk->size_shift = page_shift(compound_head(p));

	/*
	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
	 * so "tk->size_shift == 0" effectively checks no mapping on
	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
	 * to a process' address space, it's possible not all N VMAs
	 * contain mappings for the page, but at least one VMA does.
	 * Only deliver SIGBUS with payload derived from the VMA that
	 * has a mapping for the page.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
	} else if (tk->size_shift == 0) {
		kfree(tk);
		return;
	}

	/* Pin the task; the reference is dropped in kill_procs(). */
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/* Queue @tsk for an anon/file page: no KSM address, no fsdax offset. */
static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
				  struct vm_area_struct *vma,
				  struct list_head *to_kill)
{
	__add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
}

#ifdef CONFIG_KSM
/* Has @tsk already been queued on @to_kill? */
static bool task_in_to_kill_list(struct list_head *to_kill,
				 struct task_struct *tsk)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (tk->tsk == tsk)
			return true;
	}

	return false;
}

/*
 * Queue @tsk for a KSM page at @ksm_addr, skipping tasks that are
 * already on the list (a KSM page may be mapped many times by one task).
 */
void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
		     struct vm_area_struct *vma, struct list_head *to_kill,
		     unsigned long ksm_addr)
{
	if (!task_in_to_kill_list(to_kill, tsk))
		__add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
}
#endif
/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing)
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
		unsigned long pfn, int flags)
{
	struct to_kill *tk, *next;

	/* Always drains the list: every entry is unlinked and freed below. */
	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr == -EFAULT) {
				pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
						 tk->tsk, PIDTYPE_PID);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk, pfn, flags) < 0)
				pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		/* Drop the reference taken in __add_to_kill(). */
		list_del(&tk->nd);
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold rcu lock in the caller, so we don't have to call
 * rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t) {
		if (t->flags & PF_MCE_PROCESS) {
			/* Thread opted in/out explicitly via prctl flags. */
			if (t->flags & PF_MCE_EARLY)
				return t;
		} else {
			/* No per-thread policy: fall back to the sysctl. */
			if (sysctl_memory_failure_early_kill)
				return t;
		}
	}
	return NULL;
}

/*
 * Determine whether a given process is "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill" and otherwise returns NULL.
 *
 * Note that the above is true for Action Optional case. For Action Required
 * case, it's only meaningful to the current thread which need to be signaled
 * with SIGBUS, this error is Action Optional for other non current
 * processes sharing the same error page, if the process is "early kill", the
 * task_struct of the dedicated thread will also be returned.
 */
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
{
	/* Kernel threads (no mm) cannot map user pages. */
	if (!tsk->mm)
		return NULL;
	/*
	 * Comparing ->mm here because current task might represent
	 * a subthread, while tsk always points to the main thread.
	 */
	if (force_early && tsk->mm == current->mm)
		return current;

	return find_early_kill_thread(tsk);
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct folio *folio, struct page *page,
		struct list_head *to_kill, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = folio_lock_anon_vma_read(folio, NULL);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	/* RCU protects the brute-force walk over the global process list. */
	rcu_read_lock();
	for_each_process(tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (vma->vm_mm != t->mm)
				continue;
			/* For anon pages, only signal tasks with a real PTE. */
			if (!page_mapped_in_vma(page, vma))
				continue;
			add_to_kill_anon_file(t, page, vma, to_kill);
		}
	}
	rcu_read_unlock();
	anon_vma_unlock_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct folio *folio, struct page *page,
		struct list_head *to_kill, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = folio->mapping;
	pgoff_t pgoff;

	i_mmap_lock_read(mapping);
	/* RCU protects the brute-force walk over the global process list. */
	rcu_read_lock();
	pgoff = page_to_pgoff(page);
	for_each_process(tsk) {
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
65962306a36Sopenharmony_ci */ 66062306a36Sopenharmony_ci if (vma->vm_mm == t->mm) 66162306a36Sopenharmony_ci add_to_kill_anon_file(t, page, vma, to_kill); 66262306a36Sopenharmony_ci } 66362306a36Sopenharmony_ci } 66462306a36Sopenharmony_ci rcu_read_unlock(); 66562306a36Sopenharmony_ci i_mmap_unlock_read(mapping); 66662306a36Sopenharmony_ci} 66762306a36Sopenharmony_ci 66862306a36Sopenharmony_ci#ifdef CONFIG_FS_DAX 66962306a36Sopenharmony_cistatic void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, 67062306a36Sopenharmony_ci struct vm_area_struct *vma, 67162306a36Sopenharmony_ci struct list_head *to_kill, pgoff_t pgoff) 67262306a36Sopenharmony_ci{ 67362306a36Sopenharmony_ci __add_to_kill(tsk, p, vma, to_kill, 0, pgoff); 67462306a36Sopenharmony_ci} 67562306a36Sopenharmony_ci 67662306a36Sopenharmony_ci/* 67762306a36Sopenharmony_ci * Collect processes when the error hit a fsdax page. 67862306a36Sopenharmony_ci */ 67962306a36Sopenharmony_cistatic void collect_procs_fsdax(struct page *page, 68062306a36Sopenharmony_ci struct address_space *mapping, pgoff_t pgoff, 68162306a36Sopenharmony_ci struct list_head *to_kill) 68262306a36Sopenharmony_ci{ 68362306a36Sopenharmony_ci struct vm_area_struct *vma; 68462306a36Sopenharmony_ci struct task_struct *tsk; 68562306a36Sopenharmony_ci 68662306a36Sopenharmony_ci i_mmap_lock_read(mapping); 68762306a36Sopenharmony_ci rcu_read_lock(); 68862306a36Sopenharmony_ci for_each_process(tsk) { 68962306a36Sopenharmony_ci struct task_struct *t = task_early_kill(tsk, true); 69062306a36Sopenharmony_ci 69162306a36Sopenharmony_ci if (!t) 69262306a36Sopenharmony_ci continue; 69362306a36Sopenharmony_ci vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 69462306a36Sopenharmony_ci if (vma->vm_mm == t->mm) 69562306a36Sopenharmony_ci add_to_kill_fsdax(t, page, vma, to_kill, pgoff); 69662306a36Sopenharmony_ci } 69762306a36Sopenharmony_ci } 69862306a36Sopenharmony_ci rcu_read_unlock(); 69962306a36Sopenharmony_ci 
i_mmap_unlock_read(mapping); 70062306a36Sopenharmony_ci} 70162306a36Sopenharmony_ci#endif /* CONFIG_FS_DAX */ 70262306a36Sopenharmony_ci 70362306a36Sopenharmony_ci/* 70462306a36Sopenharmony_ci * Collect the processes who have the corrupted page mapped to kill. 70562306a36Sopenharmony_ci */ 70662306a36Sopenharmony_cistatic void collect_procs(struct folio *folio, struct page *page, 70762306a36Sopenharmony_ci struct list_head *tokill, int force_early) 70862306a36Sopenharmony_ci{ 70962306a36Sopenharmony_ci if (!folio->mapping) 71062306a36Sopenharmony_ci return; 71162306a36Sopenharmony_ci if (unlikely(PageKsm(page))) 71262306a36Sopenharmony_ci collect_procs_ksm(page, tokill, force_early); 71362306a36Sopenharmony_ci else if (PageAnon(page)) 71462306a36Sopenharmony_ci collect_procs_anon(folio, page, tokill, force_early); 71562306a36Sopenharmony_ci else 71662306a36Sopenharmony_ci collect_procs_file(folio, page, tokill, force_early); 71762306a36Sopenharmony_ci} 71862306a36Sopenharmony_ci 71962306a36Sopenharmony_cistruct hwpoison_walk { 72062306a36Sopenharmony_ci struct to_kill tk; 72162306a36Sopenharmony_ci unsigned long pfn; 72262306a36Sopenharmony_ci int flags; 72362306a36Sopenharmony_ci}; 72462306a36Sopenharmony_ci 72562306a36Sopenharmony_cistatic void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) 72662306a36Sopenharmony_ci{ 72762306a36Sopenharmony_ci tk->addr = addr; 72862306a36Sopenharmony_ci tk->size_shift = shift; 72962306a36Sopenharmony_ci} 73062306a36Sopenharmony_ci 73162306a36Sopenharmony_cistatic int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, 73262306a36Sopenharmony_ci unsigned long poisoned_pfn, struct to_kill *tk) 73362306a36Sopenharmony_ci{ 73462306a36Sopenharmony_ci unsigned long pfn = 0; 73562306a36Sopenharmony_ci 73662306a36Sopenharmony_ci if (pte_present(pte)) { 73762306a36Sopenharmony_ci pfn = pte_pfn(pte); 73862306a36Sopenharmony_ci } else { 73962306a36Sopenharmony_ci swp_entry_t swp = pte_to_swp_entry(pte); 
74062306a36Sopenharmony_ci 74162306a36Sopenharmony_ci if (is_hwpoison_entry(swp)) 74262306a36Sopenharmony_ci pfn = swp_offset_pfn(swp); 74362306a36Sopenharmony_ci } 74462306a36Sopenharmony_ci 74562306a36Sopenharmony_ci if (!pfn || pfn != poisoned_pfn) 74662306a36Sopenharmony_ci return 0; 74762306a36Sopenharmony_ci 74862306a36Sopenharmony_ci set_to_kill(tk, addr, shift); 74962306a36Sopenharmony_ci return 1; 75062306a36Sopenharmony_ci} 75162306a36Sopenharmony_ci 75262306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 75362306a36Sopenharmony_cistatic int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, 75462306a36Sopenharmony_ci struct hwpoison_walk *hwp) 75562306a36Sopenharmony_ci{ 75662306a36Sopenharmony_ci pmd_t pmd = *pmdp; 75762306a36Sopenharmony_ci unsigned long pfn; 75862306a36Sopenharmony_ci unsigned long hwpoison_vaddr; 75962306a36Sopenharmony_ci 76062306a36Sopenharmony_ci if (!pmd_present(pmd)) 76162306a36Sopenharmony_ci return 0; 76262306a36Sopenharmony_ci pfn = pmd_pfn(pmd); 76362306a36Sopenharmony_ci if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { 76462306a36Sopenharmony_ci hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); 76562306a36Sopenharmony_ci set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); 76662306a36Sopenharmony_ci return 1; 76762306a36Sopenharmony_ci } 76862306a36Sopenharmony_ci return 0; 76962306a36Sopenharmony_ci} 77062306a36Sopenharmony_ci#else 77162306a36Sopenharmony_cistatic int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, 77262306a36Sopenharmony_ci struct hwpoison_walk *hwp) 77362306a36Sopenharmony_ci{ 77462306a36Sopenharmony_ci return 0; 77562306a36Sopenharmony_ci} 77662306a36Sopenharmony_ci#endif 77762306a36Sopenharmony_ci 77862306a36Sopenharmony_cistatic int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, 77962306a36Sopenharmony_ci unsigned long end, struct mm_walk *walk) 78062306a36Sopenharmony_ci{ 78162306a36Sopenharmony_ci struct hwpoison_walk *hwp = walk->private; 
78262306a36Sopenharmony_ci int ret = 0; 78362306a36Sopenharmony_ci pte_t *ptep, *mapped_pte; 78462306a36Sopenharmony_ci spinlock_t *ptl; 78562306a36Sopenharmony_ci 78662306a36Sopenharmony_ci ptl = pmd_trans_huge_lock(pmdp, walk->vma); 78762306a36Sopenharmony_ci if (ptl) { 78862306a36Sopenharmony_ci ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); 78962306a36Sopenharmony_ci spin_unlock(ptl); 79062306a36Sopenharmony_ci goto out; 79162306a36Sopenharmony_ci } 79262306a36Sopenharmony_ci 79362306a36Sopenharmony_ci mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, 79462306a36Sopenharmony_ci addr, &ptl); 79562306a36Sopenharmony_ci if (!ptep) 79662306a36Sopenharmony_ci goto out; 79762306a36Sopenharmony_ci 79862306a36Sopenharmony_ci for (; addr != end; ptep++, addr += PAGE_SIZE) { 79962306a36Sopenharmony_ci ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT, 80062306a36Sopenharmony_ci hwp->pfn, &hwp->tk); 80162306a36Sopenharmony_ci if (ret == 1) 80262306a36Sopenharmony_ci break; 80362306a36Sopenharmony_ci } 80462306a36Sopenharmony_ci pte_unmap_unlock(mapped_pte, ptl); 80562306a36Sopenharmony_ciout: 80662306a36Sopenharmony_ci cond_resched(); 80762306a36Sopenharmony_ci return ret; 80862306a36Sopenharmony_ci} 80962306a36Sopenharmony_ci 81062306a36Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 81162306a36Sopenharmony_cistatic int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, 81262306a36Sopenharmony_ci unsigned long addr, unsigned long end, 81362306a36Sopenharmony_ci struct mm_walk *walk) 81462306a36Sopenharmony_ci{ 81562306a36Sopenharmony_ci struct hwpoison_walk *hwp = walk->private; 81662306a36Sopenharmony_ci pte_t pte = huge_ptep_get(ptep); 81762306a36Sopenharmony_ci struct hstate *h = hstate_vma(walk->vma); 81862306a36Sopenharmony_ci 81962306a36Sopenharmony_ci return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), 82062306a36Sopenharmony_ci hwp->pfn, &hwp->tk); 82162306a36Sopenharmony_ci} 82262306a36Sopenharmony_ci#else 
82362306a36Sopenharmony_ci#define hwpoison_hugetlb_range NULL 82462306a36Sopenharmony_ci#endif 82562306a36Sopenharmony_ci 82662306a36Sopenharmony_cistatic const struct mm_walk_ops hwpoison_walk_ops = { 82762306a36Sopenharmony_ci .pmd_entry = hwpoison_pte_range, 82862306a36Sopenharmony_ci .hugetlb_entry = hwpoison_hugetlb_range, 82962306a36Sopenharmony_ci .walk_lock = PGWALK_RDLOCK, 83062306a36Sopenharmony_ci}; 83162306a36Sopenharmony_ci 83262306a36Sopenharmony_ci/* 83362306a36Sopenharmony_ci * Sends SIGBUS to the current process with error info. 83462306a36Sopenharmony_ci * 83562306a36Sopenharmony_ci * This function is intended to handle "Action Required" MCEs on already 83662306a36Sopenharmony_ci * hardware poisoned pages. They could happen, for example, when 83762306a36Sopenharmony_ci * memory_failure() failed to unmap the error page at the first call, or 83862306a36Sopenharmony_ci * when multiple local machine checks happened on different CPUs. 83962306a36Sopenharmony_ci * 84062306a36Sopenharmony_ci * MCE handler currently has no easy access to the error virtual address, 84162306a36Sopenharmony_ci * so this function walks page table to find it. The returned virtual address 84262306a36Sopenharmony_ci * is proper in most cases, but it could be wrong when the application 84362306a36Sopenharmony_ci * process has multiple entries mapping the error page. 
84462306a36Sopenharmony_ci */ 84562306a36Sopenharmony_cistatic int kill_accessing_process(struct task_struct *p, unsigned long pfn, 84662306a36Sopenharmony_ci int flags) 84762306a36Sopenharmony_ci{ 84862306a36Sopenharmony_ci int ret; 84962306a36Sopenharmony_ci struct hwpoison_walk priv = { 85062306a36Sopenharmony_ci .pfn = pfn, 85162306a36Sopenharmony_ci }; 85262306a36Sopenharmony_ci priv.tk.tsk = p; 85362306a36Sopenharmony_ci 85462306a36Sopenharmony_ci if (!p->mm) 85562306a36Sopenharmony_ci return -EFAULT; 85662306a36Sopenharmony_ci 85762306a36Sopenharmony_ci mmap_read_lock(p->mm); 85862306a36Sopenharmony_ci ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops, 85962306a36Sopenharmony_ci (void *)&priv); 86062306a36Sopenharmony_ci if (ret == 1 && priv.tk.addr) 86162306a36Sopenharmony_ci kill_proc(&priv.tk, pfn, flags); 86262306a36Sopenharmony_ci else 86362306a36Sopenharmony_ci ret = 0; 86462306a36Sopenharmony_ci mmap_read_unlock(p->mm); 86562306a36Sopenharmony_ci return ret > 0 ? 
-EHWPOISON : -EFAULT; 86662306a36Sopenharmony_ci} 86762306a36Sopenharmony_ci 86862306a36Sopenharmony_cistatic const char *action_name[] = { 86962306a36Sopenharmony_ci [MF_IGNORED] = "Ignored", 87062306a36Sopenharmony_ci [MF_FAILED] = "Failed", 87162306a36Sopenharmony_ci [MF_DELAYED] = "Delayed", 87262306a36Sopenharmony_ci [MF_RECOVERED] = "Recovered", 87362306a36Sopenharmony_ci}; 87462306a36Sopenharmony_ci 87562306a36Sopenharmony_cistatic const char * const action_page_types[] = { 87662306a36Sopenharmony_ci [MF_MSG_KERNEL] = "reserved kernel page", 87762306a36Sopenharmony_ci [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", 87862306a36Sopenharmony_ci [MF_MSG_SLAB] = "kernel slab page", 87962306a36Sopenharmony_ci [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", 88062306a36Sopenharmony_ci [MF_MSG_HUGE] = "huge page", 88162306a36Sopenharmony_ci [MF_MSG_FREE_HUGE] = "free huge page", 88262306a36Sopenharmony_ci [MF_MSG_UNMAP_FAILED] = "unmapping failed page", 88362306a36Sopenharmony_ci [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", 88462306a36Sopenharmony_ci [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", 88562306a36Sopenharmony_ci [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", 88662306a36Sopenharmony_ci [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", 88762306a36Sopenharmony_ci [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", 88862306a36Sopenharmony_ci [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", 88962306a36Sopenharmony_ci [MF_MSG_DIRTY_LRU] = "dirty LRU page", 89062306a36Sopenharmony_ci [MF_MSG_CLEAN_LRU] = "clean LRU page", 89162306a36Sopenharmony_ci [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", 89262306a36Sopenharmony_ci [MF_MSG_BUDDY] = "free buddy page", 89362306a36Sopenharmony_ci [MF_MSG_DAX] = "dax page", 89462306a36Sopenharmony_ci [MF_MSG_UNSPLIT_THP] = "unsplit thp", 89562306a36Sopenharmony_ci [MF_MSG_UNKNOWN] = "unknown page", 89662306a36Sopenharmony_ci}; 
89762306a36Sopenharmony_ci 89862306a36Sopenharmony_ci/* 89962306a36Sopenharmony_ci * XXX: It is possible that a page is isolated from LRU cache, 90062306a36Sopenharmony_ci * and then kept in swap cache or failed to remove from page cache. 90162306a36Sopenharmony_ci * The page count will stop it from being freed by unpoison. 90262306a36Sopenharmony_ci * Stress tests should be aware of this memory leak problem. 90362306a36Sopenharmony_ci */ 90462306a36Sopenharmony_cistatic int delete_from_lru_cache(struct page *p) 90562306a36Sopenharmony_ci{ 90662306a36Sopenharmony_ci if (isolate_lru_page(p)) { 90762306a36Sopenharmony_ci /* 90862306a36Sopenharmony_ci * Clear sensible page flags, so that the buddy system won't 90962306a36Sopenharmony_ci * complain when the page is unpoison-and-freed. 91062306a36Sopenharmony_ci */ 91162306a36Sopenharmony_ci ClearPageActive(p); 91262306a36Sopenharmony_ci ClearPageUnevictable(p); 91362306a36Sopenharmony_ci 91462306a36Sopenharmony_ci /* 91562306a36Sopenharmony_ci * Poisoned page might never drop its ref count to 0 so we have 91662306a36Sopenharmony_ci * to uncharge it manually from its memcg. 
91762306a36Sopenharmony_ci */ 91862306a36Sopenharmony_ci mem_cgroup_uncharge(page_folio(p)); 91962306a36Sopenharmony_ci 92062306a36Sopenharmony_ci /* 92162306a36Sopenharmony_ci * drop the page count elevated by isolate_lru_page() 92262306a36Sopenharmony_ci */ 92362306a36Sopenharmony_ci put_page(p); 92462306a36Sopenharmony_ci return 0; 92562306a36Sopenharmony_ci } 92662306a36Sopenharmony_ci return -EIO; 92762306a36Sopenharmony_ci} 92862306a36Sopenharmony_ci 92962306a36Sopenharmony_cistatic int truncate_error_page(struct page *p, unsigned long pfn, 93062306a36Sopenharmony_ci struct address_space *mapping) 93162306a36Sopenharmony_ci{ 93262306a36Sopenharmony_ci int ret = MF_FAILED; 93362306a36Sopenharmony_ci 93462306a36Sopenharmony_ci if (mapping->a_ops->error_remove_page) { 93562306a36Sopenharmony_ci struct folio *folio = page_folio(p); 93662306a36Sopenharmony_ci int err = mapping->a_ops->error_remove_page(mapping, p); 93762306a36Sopenharmony_ci 93862306a36Sopenharmony_ci if (err != 0) 93962306a36Sopenharmony_ci pr_info("%#lx: Failed to punch page: %d\n", pfn, err); 94062306a36Sopenharmony_ci else if (!filemap_release_folio(folio, GFP_NOIO)) 94162306a36Sopenharmony_ci pr_info("%#lx: failed to release buffers\n", pfn); 94262306a36Sopenharmony_ci else 94362306a36Sopenharmony_ci ret = MF_RECOVERED; 94462306a36Sopenharmony_ci } else { 94562306a36Sopenharmony_ci /* 94662306a36Sopenharmony_ci * If the file system doesn't support it just invalidate 94762306a36Sopenharmony_ci * This fails on dirty or anything with private pages 94862306a36Sopenharmony_ci */ 94962306a36Sopenharmony_ci if (invalidate_inode_page(p)) 95062306a36Sopenharmony_ci ret = MF_RECOVERED; 95162306a36Sopenharmony_ci else 95262306a36Sopenharmony_ci pr_info("%#lx: Failed to invalidate\n", pfn); 95362306a36Sopenharmony_ci } 95462306a36Sopenharmony_ci 95562306a36Sopenharmony_ci return ret; 95662306a36Sopenharmony_ci} 95762306a36Sopenharmony_ci 95862306a36Sopenharmony_cistruct page_state { 
95962306a36Sopenharmony_ci unsigned long mask; 96062306a36Sopenharmony_ci unsigned long res; 96162306a36Sopenharmony_ci enum mf_action_page_type type; 96262306a36Sopenharmony_ci 96362306a36Sopenharmony_ci /* Callback ->action() has to unlock the relevant page inside it. */ 96462306a36Sopenharmony_ci int (*action)(struct page_state *ps, struct page *p); 96562306a36Sopenharmony_ci}; 96662306a36Sopenharmony_ci 96762306a36Sopenharmony_ci/* 96862306a36Sopenharmony_ci * Return true if page is still referenced by others, otherwise return 96962306a36Sopenharmony_ci * false. 97062306a36Sopenharmony_ci * 97162306a36Sopenharmony_ci * The extra_pins is true when one extra refcount is expected. 97262306a36Sopenharmony_ci */ 97362306a36Sopenharmony_cistatic bool has_extra_refcount(struct page_state *ps, struct page *p, 97462306a36Sopenharmony_ci bool extra_pins) 97562306a36Sopenharmony_ci{ 97662306a36Sopenharmony_ci int count = page_count(p) - 1; 97762306a36Sopenharmony_ci 97862306a36Sopenharmony_ci if (extra_pins) 97962306a36Sopenharmony_ci count -= 1; 98062306a36Sopenharmony_ci 98162306a36Sopenharmony_ci if (count > 0) { 98262306a36Sopenharmony_ci pr_err("%#lx: %s still referenced by %d users\n", 98362306a36Sopenharmony_ci page_to_pfn(p), action_page_types[ps->type], count); 98462306a36Sopenharmony_ci return true; 98562306a36Sopenharmony_ci } 98662306a36Sopenharmony_ci 98762306a36Sopenharmony_ci return false; 98862306a36Sopenharmony_ci} 98962306a36Sopenharmony_ci 99062306a36Sopenharmony_ci/* 99162306a36Sopenharmony_ci * Error hit kernel page. 99262306a36Sopenharmony_ci * Do nothing, try to be lucky and not touch this instead. For a few cases we 99362306a36Sopenharmony_ci * could be more sophisticated. 
99462306a36Sopenharmony_ci */ 99562306a36Sopenharmony_cistatic int me_kernel(struct page_state *ps, struct page *p) 99662306a36Sopenharmony_ci{ 99762306a36Sopenharmony_ci unlock_page(p); 99862306a36Sopenharmony_ci return MF_IGNORED; 99962306a36Sopenharmony_ci} 100062306a36Sopenharmony_ci 100162306a36Sopenharmony_ci/* 100262306a36Sopenharmony_ci * Page in unknown state. Do nothing. 100362306a36Sopenharmony_ci */ 100462306a36Sopenharmony_cistatic int me_unknown(struct page_state *ps, struct page *p) 100562306a36Sopenharmony_ci{ 100662306a36Sopenharmony_ci pr_err("%#lx: Unknown page state\n", page_to_pfn(p)); 100762306a36Sopenharmony_ci unlock_page(p); 100862306a36Sopenharmony_ci return MF_FAILED; 100962306a36Sopenharmony_ci} 101062306a36Sopenharmony_ci 101162306a36Sopenharmony_ci/* 101262306a36Sopenharmony_ci * Clean (or cleaned) page cache page. 101362306a36Sopenharmony_ci */ 101462306a36Sopenharmony_cistatic int me_pagecache_clean(struct page_state *ps, struct page *p) 101562306a36Sopenharmony_ci{ 101662306a36Sopenharmony_ci int ret; 101762306a36Sopenharmony_ci struct address_space *mapping; 101862306a36Sopenharmony_ci bool extra_pins; 101962306a36Sopenharmony_ci 102062306a36Sopenharmony_ci delete_from_lru_cache(p); 102162306a36Sopenharmony_ci 102262306a36Sopenharmony_ci /* 102362306a36Sopenharmony_ci * For anonymous pages we're done the only reference left 102462306a36Sopenharmony_ci * should be the one m_f() holds. 102562306a36Sopenharmony_ci */ 102662306a36Sopenharmony_ci if (PageAnon(p)) { 102762306a36Sopenharmony_ci ret = MF_RECOVERED; 102862306a36Sopenharmony_ci goto out; 102962306a36Sopenharmony_ci } 103062306a36Sopenharmony_ci 103162306a36Sopenharmony_ci /* 103262306a36Sopenharmony_ci * Now truncate the page in the page cache. 
This is really 103362306a36Sopenharmony_ci * more like a "temporary hole punch" 103462306a36Sopenharmony_ci * Don't do this for block devices when someone else 103562306a36Sopenharmony_ci * has a reference, because it could be file system metadata 103662306a36Sopenharmony_ci * and that's not safe to truncate. 103762306a36Sopenharmony_ci */ 103862306a36Sopenharmony_ci mapping = page_mapping(p); 103962306a36Sopenharmony_ci if (!mapping) { 104062306a36Sopenharmony_ci /* 104162306a36Sopenharmony_ci * Page has been teared down in the meanwhile 104262306a36Sopenharmony_ci */ 104362306a36Sopenharmony_ci ret = MF_FAILED; 104462306a36Sopenharmony_ci goto out; 104562306a36Sopenharmony_ci } 104662306a36Sopenharmony_ci 104762306a36Sopenharmony_ci /* 104862306a36Sopenharmony_ci * The shmem page is kept in page cache instead of truncating 104962306a36Sopenharmony_ci * so is expected to have an extra refcount after error-handling. 105062306a36Sopenharmony_ci */ 105162306a36Sopenharmony_ci extra_pins = shmem_mapping(mapping); 105262306a36Sopenharmony_ci 105362306a36Sopenharmony_ci /* 105462306a36Sopenharmony_ci * Truncation is a bit tricky. Enable it per file system for now. 105562306a36Sopenharmony_ci * 105662306a36Sopenharmony_ci * Open: to take i_rwsem or not for this? Right now we don't. 105762306a36Sopenharmony_ci */ 105862306a36Sopenharmony_ci ret = truncate_error_page(p, page_to_pfn(p), mapping); 105962306a36Sopenharmony_ci if (has_extra_refcount(ps, p, extra_pins)) 106062306a36Sopenharmony_ci ret = MF_FAILED; 106162306a36Sopenharmony_ci 106262306a36Sopenharmony_ciout: 106362306a36Sopenharmony_ci unlock_page(p); 106462306a36Sopenharmony_ci 106562306a36Sopenharmony_ci return ret; 106662306a36Sopenharmony_ci} 106762306a36Sopenharmony_ci 106862306a36Sopenharmony_ci/* 106962306a36Sopenharmony_ci * Dirty pagecache page 107062306a36Sopenharmony_ci * Issues: when the error hit a hole page the error is not properly 107162306a36Sopenharmony_ci * propagated. 
107262306a36Sopenharmony_ci */ 107362306a36Sopenharmony_cistatic int me_pagecache_dirty(struct page_state *ps, struct page *p) 107462306a36Sopenharmony_ci{ 107562306a36Sopenharmony_ci struct address_space *mapping = page_mapping(p); 107662306a36Sopenharmony_ci 107762306a36Sopenharmony_ci SetPageError(p); 107862306a36Sopenharmony_ci /* TBD: print more information about the file. */ 107962306a36Sopenharmony_ci if (mapping) { 108062306a36Sopenharmony_ci /* 108162306a36Sopenharmony_ci * IO error will be reported by write(), fsync(), etc. 108262306a36Sopenharmony_ci * who check the mapping. 108362306a36Sopenharmony_ci * This way the application knows that something went 108462306a36Sopenharmony_ci * wrong with its dirty file data. 108562306a36Sopenharmony_ci * 108662306a36Sopenharmony_ci * There's one open issue: 108762306a36Sopenharmony_ci * 108862306a36Sopenharmony_ci * The EIO will be only reported on the next IO 108962306a36Sopenharmony_ci * operation and then cleared through the IO map. 109062306a36Sopenharmony_ci * Normally Linux has two mechanisms to pass IO error 109162306a36Sopenharmony_ci * first through the AS_EIO flag in the address space 109262306a36Sopenharmony_ci * and then through the PageError flag in the page. 109362306a36Sopenharmony_ci * Since we drop pages on memory failure handling the 109462306a36Sopenharmony_ci * only mechanism open to use is through AS_AIO. 109562306a36Sopenharmony_ci * 109662306a36Sopenharmony_ci * This has the disadvantage that it gets cleared on 109762306a36Sopenharmony_ci * the first operation that returns an error, while 109862306a36Sopenharmony_ci * the PageError bit is more sticky and only cleared 109962306a36Sopenharmony_ci * when the page is reread or dropped. 
If an 110062306a36Sopenharmony_ci * application assumes it will always get error on 110162306a36Sopenharmony_ci * fsync, but does other operations on the fd before 110262306a36Sopenharmony_ci * and the page is dropped between then the error 110362306a36Sopenharmony_ci * will not be properly reported. 110462306a36Sopenharmony_ci * 110562306a36Sopenharmony_ci * This can already happen even without hwpoisoned 110662306a36Sopenharmony_ci * pages: first on metadata IO errors (which only 110762306a36Sopenharmony_ci * report through AS_EIO) or when the page is dropped 110862306a36Sopenharmony_ci * at the wrong time. 110962306a36Sopenharmony_ci * 111062306a36Sopenharmony_ci * So right now we assume that the application DTRT on 111162306a36Sopenharmony_ci * the first EIO, but we're not worse than other parts 111262306a36Sopenharmony_ci * of the kernel. 111362306a36Sopenharmony_ci */ 111462306a36Sopenharmony_ci mapping_set_error(mapping, -EIO); 111562306a36Sopenharmony_ci } 111662306a36Sopenharmony_ci 111762306a36Sopenharmony_ci return me_pagecache_clean(ps, p); 111862306a36Sopenharmony_ci} 111962306a36Sopenharmony_ci 112062306a36Sopenharmony_ci/* 112162306a36Sopenharmony_ci * Clean and dirty swap cache. 112262306a36Sopenharmony_ci * 112362306a36Sopenharmony_ci * Dirty swap cache page is tricky to handle. The page could live both in page 112462306a36Sopenharmony_ci * cache and swap cache(ie. page is freshly swapped in). So it could be 112562306a36Sopenharmony_ci * referenced concurrently by 2 types of PTEs: 112662306a36Sopenharmony_ci * normal PTEs and swap PTEs. 
We try to handle them consistently by calling 112762306a36Sopenharmony_ci * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs, 112862306a36Sopenharmony_ci * and then 112962306a36Sopenharmony_ci * - clear dirty bit to prevent IO 113062306a36Sopenharmony_ci * - remove from LRU 113162306a36Sopenharmony_ci * - but keep in the swap cache, so that when we return to it on 113262306a36Sopenharmony_ci * a later page fault, we know the application is accessing 113362306a36Sopenharmony_ci * corrupted data and shall be killed (we installed simple 113462306a36Sopenharmony_ci * interception code in do_swap_page to catch it). 113562306a36Sopenharmony_ci * 113662306a36Sopenharmony_ci * Clean swap cache pages can be directly isolated. A later page fault will 113762306a36Sopenharmony_ci * bring in the known good data from disk. 113862306a36Sopenharmony_ci */ 113962306a36Sopenharmony_cistatic int me_swapcache_dirty(struct page_state *ps, struct page *p) 114062306a36Sopenharmony_ci{ 114162306a36Sopenharmony_ci int ret; 114262306a36Sopenharmony_ci bool extra_pins = false; 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci ClearPageDirty(p); 114562306a36Sopenharmony_ci /* Trigger EIO in shmem: */ 114662306a36Sopenharmony_ci ClearPageUptodate(p); 114762306a36Sopenharmony_ci 114862306a36Sopenharmony_ci ret = delete_from_lru_cache(p) ? 
MF_FAILED : MF_DELAYED; 114962306a36Sopenharmony_ci unlock_page(p); 115062306a36Sopenharmony_ci 115162306a36Sopenharmony_ci if (ret == MF_DELAYED) 115262306a36Sopenharmony_ci extra_pins = true; 115362306a36Sopenharmony_ci 115462306a36Sopenharmony_ci if (has_extra_refcount(ps, p, extra_pins)) 115562306a36Sopenharmony_ci ret = MF_FAILED; 115662306a36Sopenharmony_ci 115762306a36Sopenharmony_ci return ret; 115862306a36Sopenharmony_ci} 115962306a36Sopenharmony_ci 116062306a36Sopenharmony_cistatic int me_swapcache_clean(struct page_state *ps, struct page *p) 116162306a36Sopenharmony_ci{ 116262306a36Sopenharmony_ci struct folio *folio = page_folio(p); 116362306a36Sopenharmony_ci int ret; 116462306a36Sopenharmony_ci 116562306a36Sopenharmony_ci delete_from_swap_cache(folio); 116662306a36Sopenharmony_ci 116762306a36Sopenharmony_ci ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; 116862306a36Sopenharmony_ci folio_unlock(folio); 116962306a36Sopenharmony_ci 117062306a36Sopenharmony_ci if (has_extra_refcount(ps, p, false)) 117162306a36Sopenharmony_ci ret = MF_FAILED; 117262306a36Sopenharmony_ci 117362306a36Sopenharmony_ci return ret; 117462306a36Sopenharmony_ci} 117562306a36Sopenharmony_ci 117662306a36Sopenharmony_ci/* 117762306a36Sopenharmony_ci * Huge pages. Needs work. 117862306a36Sopenharmony_ci * Issues: 117962306a36Sopenharmony_ci * - Error on hugepage is contained in hugepage unit (not in raw page unit.) 118062306a36Sopenharmony_ci * To narrow down kill region to one page, we need to break up pmd. 
 */

/*
 * Error handler for hugetlb pages (MF_MSG_HUGE).  Called with the page
 * locked; the page is unlocked on every path before returning, as
 * required by page_action() (see comment there).
 *
 * If the hugepage is still in a mapping, try to truncate the error page
 * out of the page cache; otherwise free the hugepage and dissolve it
 * into buddy pages so the healthy subpages can be reused.
 */
static int me_huge_page(struct page_state *ps, struct page *p)
{
	int res;
	struct page *hpage = compound_head(p);
	struct address_space *mapping;
	bool extra_pins = false;

	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
		/* The page is kept in page cache. */
		extra_pins = true;
		unlock_page(hpage);
	} else {
		unlock_page(hpage);
		/*
		 * migration entry prevents later access on error hugepage,
		 * so we can free and dissolve it into buddy to save healthy
		 * subpages.
		 */
		put_page(hpage);
		if (__page_handle_poison(p) >= 0) {
			page_ref_inc(p);
			res = MF_RECOVERED;
		} else {
			res = MF_FAILED;
		}
	}

	/* Any reference beyond the expected ones means someone else uses it. */
	if (has_extra_refcount(ps, p, extra_pins))
		res = MF_FAILED;

	return res;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

/*
 * State table: { mask, result-after-mask, message type, handler }.
 * Matched top to bottom by identify_page_state(), so more specific
 * entries must come before more general ones.
 */
static struct page_state error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,		me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef lru
#undef head
#undef slab
#undef reserved

/*
 * Account one memory-failure result in the per-node statistics of the
 * node that owns @pfn.  Unknown nodes or results are warned about once
 * and otherwise ignored.
 */
static void update_per_node_mf_stats(unsigned long pfn,
				     enum mf_result result)
{
	int nid = MAX_NUMNODES;
	struct memory_failure_stats *mf_stats = NULL;

	nid = pfn_to_nid(pfn);
	if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) {
		WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid);
		return;
	}

	mf_stats = &NODE_DATA(nid)->mf_stats;
	switch (result) {
	case MF_IGNORED:
		++mf_stats->ignored;
		break;
	case MF_FAILED:
		++mf_stats->failed;
		break;
	case MF_DELAYED:
		++mf_stats->delayed;
		break;
	case MF_RECOVERED:
		++mf_stats->recovered;
		break;
	default:
		WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result);
		break;
	}
	++mf_stats->total;
}

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
/*
 * Record the outcome of a recovery action: emit a trace event, bump the
 * poisoned-page counters and per-node stats, and log the result.
 *
 * Return: 0 when the result counts as handled (MF_RECOVERED or
 * MF_DELAYED), -EBUSY otherwise.
 */
static int action_result(unsigned long pfn, enum mf_action_page_type type,
			 enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	num_poisoned_pages_inc(pfn);

	update_per_node_mf_stats(pfn, result);

	pr_err("%#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);

	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

/*
 * Run the state-specific handler for page @p and report its outcome
 * through action_result().
 */
static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;

	/* page p should be unlocked after returning from ps->action(). */
	result = ps->action(ps, p);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return action_result(pfn, ps->type, result);
}

/*
 * A page that is both hwpoisoned and taken off the buddy freelist is
 * marked by MAGIC_HWPOISON stored in page private (see also the comment
 * in __get_unpoison_page()).
 */
static inline bool PageHWPoisonTakenOff(struct page *page)
{
	return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
}

/* Mark @page as hwpoisoned-and-taken-off by stashing the magic value. */
void SetPageHWPoisonTakenOff(struct page *page)
{
	set_page_private(page, MAGIC_HWPOISON);
}

/* Clear the taken-off marker, but only while the page is still poisoned. */
void ClearPageHWPoisonTakenOff(struct page *page)
{
	if (PageHWPoison(page))
		set_page_private(page, 0);
}

/*
 * Return true if a page type of a given page is supported by hwpoison
 * mechanism (while handling could fail), otherwise false. This function
 * does not return true for hugetlb or device memory pages, so it's assumed
 * to be called only in the context where we never have such pages.
 */
static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
{
	/* Soft offline could migrate non-LRU movable pages */
	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
		return true;

	return PageLRU(page) || is_free_buddy_page(page);
}

/*
 * Try to take a refcount on the folio containing @page for memory error
 * handling.  Hugetlb folios are delegated to
 * get_hwpoison_hugetlb_folio(); if a hugetlb demotion raced with us the
 * folio is re-resolved and handled as a normal page.
 *
 * Return: 1 if a reference was taken, 0 if the page has no reference to
 * take, -EBUSY if the page type is not handlable, or the
 * get_hwpoison_hugetlb_folio() result for stable hugetlb folios.
 */
static int __get_hwpoison_page(struct page *page, unsigned long flags)
{
	struct folio *folio = page_folio(page);
	int ret = 0;
	bool hugetlb = false;

	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
	if (hugetlb) {
		/* Make sure hugetlb demotion did not happen from under us. */
		if (folio == page_folio(page))
			return ret;
		if (ret > 0) {
			folio_put(folio);
			folio = page_folio(page);
		}
	}

	/*
	 * This check prevents from calling folio_try_get() for any
	 * unsupported type of folio in order to reduce the risk of unexpected
	 * races caused by taking a folio refcount.
	 */
	if (!HWPoisonHandlable(&folio->page, flags))
		return -EBUSY;

	if (folio_try_get(folio)) {
		if (folio == page_folio(page))
			return 1;

		/* Raced with a split/free: drop the ref on the stale folio. */
		pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
		folio_put(folio);
	}

	return 0;
}

/*
 * Retry wrapper around __get_hwpoison_page().  Races with allocation,
 * free, or temporarily-unhandlable page states are retried up to three
 * times (with shake_page() to flush per-cpu/LRU caches in between)
 * before giving up.  See get_hwpoison_page() below for the meaning of
 * the return values.
 */
static int get_any_page(struct page *p, unsigned long flags)
{
	int ret = 0, pass = 0;
	bool count_increased = false;

	if (flags & MF_COUNT_INCREASED)
		count_increased = true;

try_again:
	if (!count_increased) {
		ret = __get_hwpoison_page(p, flags);
		if (!ret) {
			if (page_count(p)) {
				/* We raced with an allocation, retry. */
				if (pass++ < 3)
					goto try_again;
				ret = -EBUSY;
			} else if (!PageHuge(p) && !is_free_buddy_page(p)) {
				/* We raced with put_page, retry. */
				if (pass++ < 3)
					goto try_again;
				ret = -EIO;
			}
			goto out;
		} else if (ret == -EBUSY) {
			/*
			 * We raced with (possibly temporary) unhandlable
			 * page, retry.
			 */
			if (pass++ < 3) {
				shake_page(p);
				goto try_again;
			}
			ret = -EIO;
			goto out;
		}
	}

	if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
		ret = 1;
	} else {
		/*
		 * A page we cannot handle. Check whether we can turn
		 * it into something we can handle.
		 */
		if (pass++ < 3) {
			put_page(p);
			shake_page(p);
			count_increased = false;
			goto try_again;
		}
		put_page(p);
		ret = -EIO;
	}
out:
	if (ret == -EIO)
		pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));

	return ret;
}

/*
 * Take a refcount on @page for unpoisoning (MF_UNPOISON).  Hugetlb
 * folios are delegated to get_hwpoison_hugetlb_folio().
 */
static int __get_unpoison_page(struct page *page)
{
	struct folio *folio = page_folio(page);
	int ret = 0;
	bool hugetlb = false;

	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
	if (hugetlb) {
		/* Make sure hugetlb demotion did not happen from under us. */
		if (folio == page_folio(page))
			return ret;
		if (ret > 0)
			folio_put(folio);
	}

	/*
	 * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
	 * but also isolated from buddy freelist, so need to identify the
	 * state and have to cancel both operations to unpoison.
	 */
	if (PageHWPoisonTakenOff(page))
		return -EHWPOISON;

	return get_page_unless_zero(page) ? 1 : 0;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling
 * @p:		Raw error page (hit by memory error)
 * @flags:	Flags controlling behavior of error handling
 *
 * get_hwpoison_page() takes a page refcount of an error page to handle memory
 * error on it, after checking that the error page is in a well-defined state
 * (defined as a page-type we can successfully handle the memory error on it,
 * such as LRU page and hugetlb page).
 *
 * Memory error handling could be triggered at any time on any type of page,
 * so it's prone to race with typical memory management lifecycle (like
 * allocation and free). So to avoid such races, get_hwpoison_page() takes
 * extra care for the error page's state (as done in __get_hwpoison_page()),
 * and has some retry logic in get_any_page().
 *
 * When called from unpoison_memory(), the caller should already ensure that
 * the given page has PG_hwpoison. So it's never reused for other page
 * allocations, and __get_unpoison_page() never races with them.
 *
 * Return: 0 on failure,
 *         1 on success for in-use pages in a well-defined state,
 *         -EIO for pages on which we can not handle memory errors,
 *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
 *         operations like allocation and free,
 *         -EHWPOISON when the page is hwpoisoned and taken off from buddy.
 */
static int get_hwpoison_page(struct page *p, unsigned long flags)
{
	int ret;

	/* Drain/disable per-cpu page lists while we inspect the page. */
	zone_pcp_disable(page_zone(p));
	if (flags & MF_UNPOISON)
		ret = __get_unpoison_page(p);
	else
		ret = get_any_page(p, flags);
	zone_pcp_enable(page_zone(p));

	return ret;
}

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */

/*
 * Unmap poisoned page @p (whose compound head is @hpage) from all user
 * address spaces and kill the affected processes where necessary.
 *
 * Return: true if the page is no longer user-mapped on return (or was
 * never user-mapped to begin with), false if some mappings remain.
 */
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int flags, struct page *hpage)
{
	struct folio *folio = page_folio(hpage);
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success;
	int forcekill;
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(p))
		return true;

	if (PageSwapCache(p)) {
		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
		/* Unmap without installing hwpoison entries. */
		ttu &= ~TTU_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_can_writeback(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			/* Page was clean: safe to drop without killing. */
			ttu &= ~TTU_HWPOISON;
			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 */
	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);

	if (PageHuge(hpage) && !PageAnon(hpage)) {
		/*
		 * For hugetlb pages in shared mappings, try_to_unmap
		 * could potentially call huge_pmd_unshare.  Because of
		 * this, take semaphore in write mode here and set
		 * TTU_RMAP_LOCKED to indicate we have taken the lock
		 * at this higher level.
		 */
		mapping = hugetlb_page_mapping_lock_write(hpage);
		if (mapping) {
			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
			i_mmap_unlock_write(mapping);
		} else
			pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
	} else {
		try_to_unmap(folio, ttu);
	}

	unmap_success = !page_mapped(p);
	if (!unmap_success)
		pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(p));

	/*
	 * try_to_unmap() might put mlocked page in lru cache, so call
	 * shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
		    !unmap_success;
	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);

	return unmap_success;
}

/*
 * Match page @p against error_states[] and run the handler for the
 * first matching entry.
 */
static int identify_page_state(unsigned long pfn, struct page *p,
				unsigned long page_flags)
{
	struct page_state *ps;

	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	/*
	 * Fold the live dirty bit into the saved flags: PG_dirty may be
	 * set outside the page lock (see comment above action_result()).
	 */
	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	return page_action(ps, p, pfn);
}

/*
 * Try to split a transparent hugepage so the error can be contained in
 * a single base page.  On failure the page reference is dropped.
 *
 * Return: the split_huge_page() result (0 on success).
 */
static int try_to_split_thp_page(struct page *page)
{
	int ret;

	lock_page(page);
	ret = split_huge_page(page);
	unlock_page(page);

	if (unlikely(ret))
		put_page(page);

	return ret;
}

/*
 * Unmap the file range covering the error and kill the processes on
 * @to_kill.  Used by the dev_pagemap/fsdax paths below.
 */
static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
		struct address_space *mapping, pgoff_t index, int flags)
{
	struct to_kill *tk;
	unsigned long size = 0;

	list_for_each_entry(tk, to_kill, nd)
		if (tk->size_shift)
			size = max(size, 1UL << tk->size_shift);

	if (size) {
		/*
		 * Unmap the largest mapping to avoid breaking up device-dax
		 * mappings which are constant size. The actual size of the
		 * mapping being torn down is communicated in siginfo, see
		 * kill_proc()
		 */
		loff_t start = ((loff_t)index << PAGE_SHIFT) & ~(size - 1);

		unmap_mapping_range(mapping, start, size, 0);
	}

	kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
}

/*
 * Only dev_pagemap pages get here, such as fsdax when the filesystem
 * either do not claim or fails to claim a hwpoison event, or devdax.
 * The fsdax pages are initialized per base page, and the devdax pages
 * could be initialized either as base pages, or as compound pages with
 * vmemmap optimization enabled. Devdax is simplistic in its dealing with
 * hwpoison, such that, if a subpage of a compound page is poisoned,
 * simply mark the compound head page is by far sufficient.
 *
 * Return: 0 on success, -EBUSY if the folio could not be dax-locked,
 * -EOPNOTSUPP if the hwpoison filter rejects the page, -ENXIO for
 * device-private/coherent memory (not handled yet).
 */
static int mf_generic_kill_procs(unsigned long long pfn, int flags,
		struct dev_pagemap *pgmap)
{
	struct folio *folio = pfn_folio(pfn);
	LIST_HEAD(to_kill);
	dax_entry_t cookie;
	int rc = 0;

	/*
	 * Prevent the inode from being freed while we are interrogating
	 * the address_space, typically this would be handled by
	 * lock_page(), but dax pages do not use the page lock. This
	 * also prevents changes to the mapping of this pfn until
	 * poison signaling is complete.
	 */
	cookie = dax_lock_folio(folio);
	if (!cookie)
		return -EBUSY;

	if (hwpoison_filter(&folio->page)) {
		rc = -EOPNOTSUPP;
		goto unlock;
	}

	switch (pgmap->type) {
	case MEMORY_DEVICE_PRIVATE:
	case MEMORY_DEVICE_COHERENT:
		/*
		 * TODO: Handle device pages which may need coordination
		 * with device-side memory.
		 */
		rc = -ENXIO;
		goto unlock;
	default:
		break;
	}

	/*
	 * Use this flag as an indication that the dax page has been
	 * remapped UC to prevent speculative consumption of poison.
	 */
	SetPageHWPoison(&folio->page);

	/*
	 * Unlike System-RAM there is no possibility to swap in a
	 * different physical page at a given virtual address, so all
	 * userspace consumption of ZONE_DEVICE memory necessitates
	 * SIGBUS (i.e. MF_MUST_KILL)
	 */
	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
	collect_procs(folio, &folio->page, &to_kill, true);

	unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags);
unlock:
	dax_unlock_folio(folio, cookie);
	return rc;
}

#ifdef CONFIG_FS_DAX
/**
 * mf_dax_kill_procs - Collect and kill processes who are using this file range
 * @mapping:	address_space of the file in use
 * @index:	start pgoff of the range within the file
 * @count:	length of the range, in unit of PAGE_SIZE
 * @mf_flags:	memory failure flags
 *
 * Return: 0 on success, -EBUSY if a mapping entry could not be dax-locked.
 */
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
		unsigned long count, int mf_flags)
{
	LIST_HEAD(to_kill);
	dax_entry_t cookie;
	struct page *page;
	size_t end = index + count;

	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;

	for (; index < end; index++) {
		page = NULL;
		cookie = dax_lock_mapping_entry(mapping, index, &page);
		if (!cookie)
			return -EBUSY;
		/* Holes in the range have no page to poison; skip them. */
		if (!page)
			goto unlock;

		SetPageHWPoison(page);

		collect_procs_fsdax(page, mapping, index, &to_kill);
		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
				index, mf_flags);
unlock:
		dax_unlock_mapping_entry(mapping, index, cookie);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
#endif /* CONFIG_FS_DAX */

#ifdef CONFIG_HUGETLB_PAGE

/*
 * Struct raw_hwp_page represents information about "raw error page",
 * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
 */
struct raw_hwp_page {
	struct llist_node node;
	/* the hwpoisoned subpage within the hugetlb folio */
	struct page *page;
};

/* The raw_hwp list is overlaid on the folio's _hugetlb_hwpoison field. */
static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
{
	return (struct llist_head *)&folio->_hugetlb_hwpoison;
}

/*
 * Return true if @page itself (not merely its folio) is recorded as
 * hwpoisoned.  For hugetlb folios this walks the raw_hwp list under
 * mf_mutex.
 */
bool is_raw_hwpoison_page_in_hugepage(struct page *page)
{
	struct llist_head *raw_hwp_head;
	struct raw_hwp_page *p;
	struct folio *folio = page_folio(page);
	bool ret = false;

	if (!folio_test_hwpoison(folio))
		return false;

	if (!folio_test_hugetlb(folio))
		return PageHWPoison(page);

	/*
	 * When RawHwpUnreliable is set, kernel lost track of which subpages
	 * are HWPOISON. So return as if ALL subpages are HWPOISONed.
	 */
	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
		return true;

	mutex_lock(&mf_mutex);

	raw_hwp_head = raw_hwp_list_head(folio);
	llist_for_each_entry(p, raw_hwp_head->first, node) {
		if (page == p->page) {
			ret = true;
			break;
		}
	}

	mutex_unlock(&mf_mutex);

	return ret;
}

/*
 * Empty the folio's raw_hwp list and free its entries.  When @move_flag
 * is true the per-subpage PG_hwpoison flag is restored on each recorded
 * page; otherwise the poisoned-page counters are decremented instead.
 *
 * Return: the number of entries that were on the list.
 */
static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
{
	struct llist_node *head;
	struct raw_hwp_page *p, *next;
	unsigned long count = 0;

	head = llist_del_all(raw_hwp_list_head(folio));
	llist_for_each_entry_safe(p, next, head, node) {
		if (move_flag)
			SetPageHWPoison(p->page);
		else
			num_poisoned_pages_sub(page_to_pfn(p->page), 1);
		kfree(p);
		count++;
	}
	return count;
}

static int
folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
{
	struct llist_head *head;
	struct raw_hwp_page *raw_hwp;
	struct raw_hwp_page *p, *next;
	/* -EHWPOISON if the folio as a whole was already marked. */
	int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;

	/*
	 * Once the hwpoison hugepage has lost reliable raw error info,
	 * there is little meaning to keep additional error info precisely,
	 * so skip to add additional raw error info.
	 */
	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
		return -EHWPOISON;
	/* Reject duplicates: this subpage may already be on the list. */
	head = raw_hwp_list_head(folio);
	llist_for_each_entry_safe(p, next, head->first, node) {
		if (p->page == page)
			return -EHWPOISON;
	}

	/* GFP_ATOMIC: may be reached from contexts that must not sleep. */
	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
	if (raw_hwp) {
		raw_hwp->page = page;
		llist_add(&raw_hwp->node, head);
		/* the first error event will be counted in action_result(). */
		if (ret)
			num_poisoned_pages_inc(page_to_pfn(page));
	} else {
		/*
		 * Failed to save raw error info. We no longer trace all
		 * hwpoisoned subpages, and we need refuse to free/dissolve
		 * this hwpoisoned hugepage.
		 */
		folio_set_hugetlb_raw_hwp_unreliable(folio);
		/*
		 * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not
		 * used any more, so free it.
		 */
		__folio_free_raw_hwp(folio, false);
	}
	return ret;
}

static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
{
	/*
	 * hugetlb_vmemmap_optimized hugepages can't be freed because struct
	 * pages for tail pages are required but they don't exist.
	 */
	if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	/*
	 * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by
	 * definition.
	 */
	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
		return 0;

	return __folio_free_raw_hwp(folio, move_flag);
}

/*
 * Unpoison the hugetlb folio, moving the raw error records back onto the
 * individual subpages.  No-op when the raw list is unreliable or when the
 * vmemmap is optimized (tail struct pages are absent).
 */
void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
		return;
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return;
	folio_clear_hwpoison(folio);
	folio_free_raw_hwp(folio, true);
}

/*
 * Called from hugetlb code with hugetlb_lock held.
 *
 * Return values:
 *   0		- free hugepage
 *   1		- in-use hugepage
 *   2		- not a hugepage
 *   -EBUSY	- the hugepage is busy (try to retry)
 *   -EHWPOISON	- the hugepage is already hwpoisoned
 */
int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
				 bool *migratable_cleared)
{
	struct page *page = pfn_to_page(pfn);
	struct folio *folio = page_folio(page);
	int ret = 2;	/* fallback to normal page handling */
	bool count_increased = false;

	if (!folio_test_hugetlb(folio))
		goto out;

	if (flags & MF_COUNT_INCREASED) {
		/* Caller already holds a reference on the page. */
		ret = 1;
		count_increased = true;
	} else if (folio_test_hugetlb_freed(folio)) {
		ret = 0;
	} else if (folio_test_hugetlb_migratable(folio)) {
		ret = folio_try_get(folio);
		if (ret)
			count_increased = true;
	} else {
		/* Neither freed nor migratable: likely mid-transition. */
		ret = -EBUSY;
		if (!(flags & MF_NO_RETRY))
			goto out;
	}

	if (folio_set_hugetlb_hwpoison(folio, page)) {
		ret = -EHWPOISON;
		goto out;
	}

	/*
	 * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them
	 * from being migrated by memory hotremove.
	 */
	if (count_increased && folio_test_hugetlb_migratable(folio)) {
		folio_clear_hugetlb_migratable(folio);
		*migratable_cleared = true;
	}

	return ret;
out:
	/* Error paths drop the reference we (or the caller) took. */
	if (count_increased)
		folio_put(folio);
	return ret;
}

/*
 * Taking refcount of hugetlb pages needs extra care about race conditions
 * with basic operations like hugepage allocation/free/demotion.
 * So some of prechecks for hwpoison (pinning, and testing/setting
 * PageHWPoison) should be done in single hugetlb_lock range.
 */
static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{
	int res;
	struct page *p = pfn_to_page(pfn);
	struct folio *folio;
	unsigned long page_flags;
	bool migratable_cleared = false;

	*hugetlb = 1;
retry:
	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
	if (res == 2) {	/* fallback to normal page handling */
		*hugetlb = 0;
		return 0;
	} else if (res == -EHWPOISON) {
		pr_err("%#lx: already hardware poisoned\n", pfn);
		if (flags & MF_ACTION_REQUIRED) {
			/* Kill based on the head pfn of the hugepage. */
			folio = page_folio(p);
			res = kill_accessing_process(current, folio_pfn(folio), flags);
		}
		return res;
	} else if (res == -EBUSY) {
		/* Retry exactly once before giving up on a busy hugepage. */
		if (!(flags & MF_NO_RETRY)) {
			flags |= MF_NO_RETRY;
			goto retry;
		}
		return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
	}

	folio = page_folio(p);
	folio_lock(folio);

	if (hwpoison_filter(p)) {
		/* Filtered out: roll back poison marking and migratability. */
		folio_clear_hugetlb_hwpoison(folio);
		if (migratable_cleared)
			folio_set_hugetlb_migratable(folio);
		folio_unlock(folio);
		if (res == 1)
			folio_put(folio);
		return -EOPNOTSUPP;
	}

	/*
	 * Handling free hugepage. The possible race with hugepage allocation
	 * or demotion can be prevented by PageHWPoison flag.
	 */
	if (res == 0) {
		folio_unlock(folio);
		if (__page_handle_poison(p) >= 0) {
			page_ref_inc(p);
			res = MF_RECOVERED;
		} else {
			res = MF_FAILED;
		}
		return action_result(pfn, MF_MSG_FREE_HUGE, res);
	}

	/* Snapshot flags before unmapping may alter them. */
	page_flags = folio->flags;

	if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) {
		folio_unlock(folio);
		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
	}

	return identify_page_state(pfn, p, page_flags);
}

#else
/* !CONFIG_HUGETLB_PAGE stubs: report "not a hugetlb page" / nothing freed. */
static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{
	return 0;
}

static inline unsigned long folio_free_raw_hwp(struct folio
*folio, bool flag)
{
	return 0;
}
#endif	/* CONFIG_HUGETLB_PAGE */

/* Drop the extra refcount in case we come from madvise() */
static void put_ref_page(unsigned long pfn, int flags)
{
	struct page *page;

	if (!(flags & MF_COUNT_INCREASED))
		return;

	page = pfn_to_page(pfn);
	/* NOTE(review): defensive check; pfn_to_page() on a valid pfn is
	 * not expected to yield NULL here. */
	if (page)
		put_page(page);
}

/*
 * Handle a memory failure on a ZONE_DEVICE page: let the driver handle it
 * if it can, otherwise fall back to the generic kill path.  Consumes the
 * pgmap reference taken by the caller.
 */
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
		struct dev_pagemap *pgmap)
{
	int rc = -ENXIO;

	/* device metadata space is not recoverable */
	if (!pgmap_pfn_valid(pgmap, pfn))
		goto out;

	/*
	 * Call driver's implementation to handle the memory failure, otherwise
	 * fall back to generic handler.
	 */
	if (pgmap_has_memory_failure(pgmap)) {
		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
		/*
		 * Fall back to generic handler too if operation is not
		 * supported inside the driver/device/filesystem.
		 */
		if (rc != -EOPNOTSUPP)
			goto out;
	}

	rc = mf_generic_kill_procs(pfn, flags, pgmap);
out:
	/* drop pgmap ref acquired in caller */
	put_dev_pagemap(pgmap);
	/* -EOPNOTSUPP means the event was filtered: don't log an action. */
	if (rc != -EOPNOTSUPP)
		action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
	return rc;
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 *
 * Return: 0 for successfully handled the memory error,
 *         -EOPNOTSUPP for hwpoison_filter() filtered the error event,
 *         < 0(except -EOPNOTSUPP) on failure.
 */
int memory_failure(unsigned long pfn, int flags)
{
	struct page *p;
	struct page *hpage;
	struct dev_pagemap *pgmap;
	int res = 0;
	unsigned long page_flags;
	bool retry = true;
	int hugetlb = 0;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure on page %lx", pfn);

	/* mf_mutex serializes all memory failure handling. */
	mutex_lock(&mf_mutex);

	/* Record that a real (non-injected) hardware error was seen. */
	if (!(flags & MF_SW_SIMULATED))
		hw_memory_failure = true;

	p = pfn_to_online_page(pfn);
	if (!p) {
		/* Give the architecture a chance to handle it first. */
		res = arch_memory_failure(pfn, flags);
		if (res == 0)
			goto unlock_mutex;

		/* Valid but offline pfn: maybe a ZONE_DEVICE page. */
		if (pfn_valid(pfn)) {
			pgmap = get_dev_pagemap(pfn, NULL);
			put_ref_page(pfn, flags);
			if (pgmap) {
				res = memory_failure_dev_pagemap(pfn, flags,
								 pgmap);
				goto unlock_mutex;
			}
		}
		pr_err("%#lx: memory outside kernel control\n", pfn);
		res = -ENXIO;
		goto unlock_mutex;
	}

try_again:
	res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
	if (hugetlb)
		goto unlock_mutex;

	if (TestSetPageHWPoison(p)) {
		pr_err("%#lx: already hardware poisoned\n", pfn);
		res = -EHWPOISON;
		if (flags & MF_ACTION_REQUIRED)
			res = kill_accessing_process(current, pfn, flags);
		if (flags & MF_COUNT_INCREASED)
			put_page(p);
		goto unlock_mutex;
	}

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hand:
	 *    check_new_page() will be the gate keeper.
	 * 2) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED)) {
		res = get_hwpoison_page(p, flags);
		if (!res) {
			if (is_free_buddy_page(p)) {
				if (take_page_off_buddy(p)) {
					page_ref_inc(p);
					res = MF_RECOVERED;
				} else {
					/* We lost the race, try again */
					if (retry) {
						ClearPageHWPoison(p);
						retry = false;
						goto try_again;
					}
					res = MF_FAILED;
				}
				res = action_result(pfn, MF_MSG_BUDDY, res);
			} else {
				res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			}
			goto unlock_mutex;
		} else if (res < 0) {
			res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
			goto unlock_mutex;
		}
	}

	hpage = compound_head(p);
	if (PageTransHuge(hpage)) {
		/*
		 * The flag must be set after the refcount is bumped
		 * otherwise it may race with THP split.
		 * And the flag can't be set in get_hwpoison_page() since
		 * it is called by soft offline too and it is just called
		 * for !MF_COUNT_INCREASED. So here seems to be the best
		 * place.
		 *
		 * Don't need care about the above error handling paths for
		 * get_hwpoison_page() since they handle either free page
		 * or unhandlable page. The refcount is bumped iff the
		 * page is a valid handlable page.
		 */
		SetPageHasHWPoisoned(hpage);
		if (try_to_split_thp_page(p) < 0) {
			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
			goto unlock_mutex;
		}
		VM_BUG_ON_PAGE(!page_count(p), p);
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __SetPageLocked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	shake_page(p);

	lock_page(p);

	/*
	 * We're only intended to deal with the non-Compound page here.
	 * However, the page could have changed compound pages due to
	 * race window. If this happens, we could try again to hopefully
	 * handle the page next round.
	 */
	if (PageCompound(p)) {
		if (retry) {
			ClearPageHWPoison(p);
			unlock_page(p);
			put_page(p);
			flags &= ~MF_COUNT_INCREASED;
			retry = false;
			goto try_again;
		}
		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		goto unlock_page;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
	 * correctly, we save a copy of the page flags at this time.
	 */
	page_flags = p->flags;

	if (hwpoison_filter(p)) {
		/* Event filtered: undo the poison marking and back out. */
		ClearPageHWPoison(p);
		unlock_page(p);
		put_page(p);
		res = -EOPNOTSUPP;
		goto unlock_mutex;
	}

	/*
	 * __munlock_folio() may clear a writeback page's LRU flag without
	 * page_lock. We need wait writeback completion for this page or it
	 * may trigger vfs BUG while evict inode.
	 */
	if (!PageLRU(p) && !PageWriteback(p))
		goto identify_page_state;

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __filemap_remove_folio() assumes unmapped page.
	 */
	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		goto unlock_page;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		goto unlock_page;
	}

identify_page_state:
	res = identify_page_state(pfn, p, page_flags);
	mutex_unlock(&mf_mutex);
	return res;
unlock_page:
	unlock_page(p);
unlock_mutex:
	mutex_unlock(&mf_mutex);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

/* One queued memory-failure event. */
struct memory_failure_entry {
	unsigned long pfn;
	int flags;
};

/* Per-CPU queue of pending memory-failure events plus its worker. */
struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.flags =	flags,
	};

	/* get_cpu_var() disables preemption while we touch this CPU's fifo. */
	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		/* Fifo full: the event is dropped, only logged. */
		pr_err("buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);

/* Drain this CPU's fifo, handling each queued event in process context. */
static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = container_of(work, struct memory_failure_cpu, work);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(entry.pfn, entry.flags);
		else
			memory_failure(entry.pfn, entry.flags);
	}
}

/*
 * Process memory_failure work queued on the specified CPU.
 * Used to avoid return-to-userspace racing with the memory_failure workqueue.
 */
void memory_failure_queue_kick(int cpu)
{
	struct memory_failure_cpu *mf_cpu;

	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
	cancel_work_sync(&mf_cpu->work);
	/* Run the drain synchronously on behalf of the cancelled work. */
	memory_failure_work_func(&mf_cpu->work);
}

/* Set up the per-CPU event queues and the vm.memory_failure_* sysctls. */
static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	register_sysctl_init("vm", memory_failure_table);

	return 0;
}
core_initcall(memory_failure_init);

/* Drop the "Memory failure: " prefix for the unpoison messages below. */
#undef pr_fmt
#define pr_fmt(fmt)	"" fmt
#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been
 * poisoned by memory_failure() earlier.
 *
 * This is only done on the software-level, so it only works
 * for linux injected failures, not real hardware failures
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct folio *folio;
	struct page *p;
	int ret = -EBUSY, ghp;
	unsigned long count = 1;
	bool huge = false;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	folio = page_folio(p);

	mutex_lock(&mf_mutex);

	/* Unpoisoning is disabled once a real hardware error has been seen. */
	if (hw_memory_failure) {
		unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
				 pfn, &unpoison_rs);
		ret = -EOPNOTSUPP;
		goto unlock_mutex;
	}

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		goto unlock_mutex;
	}

	/* Any reference beyond the hwpoison one means someone else uses it. */
	if (folio_ref_count(folio) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		goto unlock_mutex;
	}

	/* Kernel-internal pages (slab, page tables, ...) are not unpoisonable. */
	if (folio_test_slab(folio) || PageTable(&folio->page) ||
	    folio_test_reserved(folio) || PageOffline(&folio->page))
		goto unlock_mutex;

	/*
	 * Note that folio->_mapcount is overloaded in SLAB, so the simple test
	 * in folio_mapped() has to be done after folio_test_slab() is checked.
	 */
	if (folio_mapped(folio)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		goto unlock_mutex;
	}

	if (folio_mapping(folio)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		goto unlock_mutex;
	}

	ghp = get_hwpoison_page(p, MF_UNPOISON);
	if (!ghp) {
		/* Page is free; for hugetlb release recorded raw hwp entries first. */
		if (PageHuge(p)) {
			huge = true;
			count = folio_free_raw_hwp(folio, false);
			if (count == 0)
				goto unlock_mutex;
		}
		ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
	} else if (ghp < 0) {
		if (ghp == -EHWPOISON) {
			/* Free hwpoisoned page: hand it back to the buddy allocator. */
			ret = put_page_back_buddy(p) ? 0 : -EBUSY;
		} else {
			ret = ghp;
			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
					 pfn, &unpoison_rs);
		}
	} else {
		if (PageHuge(p)) {
			huge = true;
			count = folio_free_raw_hwp(folio, false);
			if (count == 0) {
				/* Drop the ref taken by get_hwpoison_page(). */
				folio_put(folio);
				goto unlock_mutex;
			}
		}

		/* Drop the reference taken by get_hwpoison_page() above. */
		folio_put(folio);
		if (TestClearPageHWPoison(p)) {
			/* Second put drops the extra ref held for the poisoned page. */
			folio_put(folio);
			ret = 0;
		}
	}

unlock_mutex:
	mutex_unlock(&mf_mutex);
	if (!ret) {
		/* Hugetlb accounting was already adjusted via folio_free_raw_hwp(). */
		if (!huge)
			num_poisoned_pages_sub(pfn, 1);
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 page_to_pfn(p), &unpoison_rs);
	}
	return ret;
}
EXPORT_SYMBOL(unpoison_memory);

/*
 * Isolate @page onto @pagelist in preparation for migration, picking the
 * hugetlb, LRU or movable isolation path as appropriate.  Returns true on
 * success.  The caller's page reference is consumed in all cases.
 */
static bool isolate_page(struct page *page, struct list_head *pagelist)
{
	bool isolated = false;

	if (PageHuge(page)) {
		isolated = isolate_hugetlb(page_folio(page), pagelist);
	} else {
		bool lru = !__PageMovable(page);

		if (lru)
			isolated = isolate_lru_page(page);
		else
			isolated = isolate_movable_page(page,
							ISOLATE_UNEVICTABLE);

		if (isolated) {
			list_add(&page->lru, pagelist);
			if (lru)
				inc_node_page_state(page, NR_ISOLATED_ANON +
						    page_is_file_lru(page));
		}
	}

	/*
	 * If we succeed to isolate the page, we grabbed another refcount on
	 * the page, so we can safely drop the one we got from get_any_page().
	 * If we failed to isolate the page, it means that we cannot go further
	 * and we will return an error, so drop the reference we got from
	 * get_any_page() as well.
	 */
	put_page(page);
	return isolated;
}

/*
 * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
 * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
 * If the page is mapped, it migrates the contents over.
 */
static int soft_offline_in_use_page(struct page *page)
{
	long ret = 0;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	char const *msg_page[] = {"page", "hugepage"};
	bool huge = PageHuge(page);
	LIST_HEAD(pagelist);
	struct migration_target_control mtc = {
		.nid = NUMA_NO_NODE,
		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
	};

	/* A THP (non-hugetlb compound page) must be split to base pages first. */
	if (!huge && PageTransHuge(hpage)) {
		if (try_to_split_thp_page(page)) {
			pr_info("soft offline: %#lx: thp split failed\n", pfn);
			return -EBUSY;
		}
		hpage = page;
	}

	lock_page(page);
	if (!huge)
		wait_on_page_writeback(page);
	/* A racing memory_failure() may have poisoned the page meanwhile. */
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return 0;
	}

	if (!huge && PageLRU(page) && !PageSwapCache(page))
		/*
		 * Try to invalidate first. This should work for
		 * non dirty unmapped page cache pages.
		 */
		ret = invalidate_inode_page(page);
	unlock_page(page);

	/* Invalidation succeeded: poison the now-free page and we are done. */
	if (ret) {
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		page_handle_poison(page, false, true);
		return 0;
	}

	if (isolate_page(hpage, &pagelist)) {
		ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
		if (!ret) {
			bool release = !huge;

			if (!page_handle_poison(page, huge, release))
				ret = -EBUSY;
		} else {
			/* Migration failed: put unmigrated pages back on the LRU. */
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
				pfn, msg_page[huge], ret, &page->flags);
			/* migrate_pages() > 0 means "pages left over", map to -EBUSY. */
			if (ret > 0)
				ret = -EBUSY;
		}
	} else {
		pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
			pfn, msg_page[huge], page_count(page), &page->flags);
		ret = -EBUSY;
	}
	return ret;
}

/**
 * soft_offline_page - Soft offline a page.
 * @pfn: pfn to soft-offline
 * @flags: flags.
 *	Same as memory_failure().
 *
 * Returns 0 on success
 *         -EOPNOTSUPP for hwpoison_filter() filtered the error event
 *         < 0 otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(unsigned long pfn, int flags)
{
	int ret;
	bool try_again = true;
	struct page *page;

	if (!pfn_valid(pfn)) {
		WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
		return -ENXIO;
	}

	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
	page = pfn_to_online_page(pfn);
	if (!page) {
		put_ref_page(pfn, flags);
		return -EIO;
	}

	mutex_lock(&mf_mutex);

	/* Already hard-poisoned: nothing left to do, report success. */
	if (PageHWPoison(page)) {
		pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
		put_ref_page(pfn, flags);
		mutex_unlock(&mf_mutex);
		return 0;
	}

retry:
	/* Hold off memory hotplug while taking a reference on the page. */
	get_online_mems();
	ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
	put_online_mems();

	if (hwpoison_filter(page)) {
		if (ret > 0)
			put_page(page);

		mutex_unlock(&mf_mutex);
		return -EOPNOTSUPP;
	}

	if (ret > 0) {
		/* Page is in use: migrate or invalidate it. */
		ret = soft_offline_in_use_page(page);
	} else if (ret == 0) {
		/*
		 * Page is free: poison it directly.  On failure retry once
		 * without MF_COUNT_INCREASED, since the first attempt
		 * consumed any caller-provided reference.
		 */
		if (!page_handle_poison(page, true, false)) {
			if (try_again) {
				try_again = false;
				flags &= ~MF_COUNT_INCREASED;
				goto retry;
			}
			ret = -EBUSY;
		}
	}

	mutex_unlock(&mf_mutex);

	return ret;
}