18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Copyright (C) 2008, 2009 Intel Corporation 48c2ecf20Sopenharmony_ci * Authors: Andi Kleen, Fengguang Wu 58c2ecf20Sopenharmony_ci * 68c2ecf20Sopenharmony_ci * High level machine check handler. Handles pages reported by the 78c2ecf20Sopenharmony_ci * hardware as being corrupted usually due to a multi-bit ECC memory or cache 88c2ecf20Sopenharmony_ci * failure. 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * In addition there is a "soft offline" entry point that allows stop using 118c2ecf20Sopenharmony_ci * not-yet-corrupted-by-suspicious pages without killing anything. 128c2ecf20Sopenharmony_ci * 138c2ecf20Sopenharmony_ci * Handles page cache pages in various states. The tricky part 148c2ecf20Sopenharmony_ci * here is that we can access any page asynchronously in respect to 158c2ecf20Sopenharmony_ci * other VM users, because memory failures could happen anytime and 168c2ecf20Sopenharmony_ci * anywhere. This could violate some of their assumptions. This is why 178c2ecf20Sopenharmony_ci * this code has to be extremely careful. Generally it tries to use 188c2ecf20Sopenharmony_ci * normal locking rules, as in get the standard locks, even if that means 198c2ecf20Sopenharmony_ci * the error handling takes potentially a long time. 208c2ecf20Sopenharmony_ci * 218c2ecf20Sopenharmony_ci * It can be very tempting to add handling for obscure cases here. 228c2ecf20Sopenharmony_ci * In general any code for handling new cases should only be added iff: 238c2ecf20Sopenharmony_ci * - You know how to test it. 248c2ecf20Sopenharmony_ci * - You have a test that can be added to mce-test 258c2ecf20Sopenharmony_ci * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ 268c2ecf20Sopenharmony_ci * - The case actually shows up as a frequent (top 10) page state in 278c2ecf20Sopenharmony_ci * tools/vm/page-types when running a real workload. 
288c2ecf20Sopenharmony_ci * 298c2ecf20Sopenharmony_ci * There are several operations here with exponential complexity because 308c2ecf20Sopenharmony_ci * of unsuitable VM data structures. For example the operation to map back 318c2ecf20Sopenharmony_ci * from RMAP chains to processes has to walk the complete process list and 328c2ecf20Sopenharmony_ci * has non linear complexity with the number. But since memory corruptions 338c2ecf20Sopenharmony_ci * are rare we hope to get away with this. This avoids impacting the core 348c2ecf20Sopenharmony_ci * VM. 358c2ecf20Sopenharmony_ci */ 368c2ecf20Sopenharmony_ci#include <linux/kernel.h> 378c2ecf20Sopenharmony_ci#include <linux/mm.h> 388c2ecf20Sopenharmony_ci#include <linux/page-flags.h> 398c2ecf20Sopenharmony_ci#include <linux/kernel-page-flags.h> 408c2ecf20Sopenharmony_ci#include <linux/sched/signal.h> 418c2ecf20Sopenharmony_ci#include <linux/sched/task.h> 428c2ecf20Sopenharmony_ci#include <linux/ksm.h> 438c2ecf20Sopenharmony_ci#include <linux/rmap.h> 448c2ecf20Sopenharmony_ci#include <linux/export.h> 458c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 468c2ecf20Sopenharmony_ci#include <linux/swap.h> 478c2ecf20Sopenharmony_ci#include <linux/backing-dev.h> 488c2ecf20Sopenharmony_ci#include <linux/migrate.h> 498c2ecf20Sopenharmony_ci#include <linux/suspend.h> 508c2ecf20Sopenharmony_ci#include <linux/slab.h> 518c2ecf20Sopenharmony_ci#include <linux/swapops.h> 528c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 538c2ecf20Sopenharmony_ci#include <linux/memory_hotplug.h> 548c2ecf20Sopenharmony_ci#include <linux/mm_inline.h> 558c2ecf20Sopenharmony_ci#include <linux/memremap.h> 568c2ecf20Sopenharmony_ci#include <linux/kfifo.h> 578c2ecf20Sopenharmony_ci#include <linux/ratelimit.h> 588c2ecf20Sopenharmony_ci#include <linux/page-isolation.h> 598c2ecf20Sopenharmony_ci#include "internal.h" 608c2ecf20Sopenharmony_ci#include "ras/ras_event.h" 618c2ecf20Sopenharmony_ci 628c2ecf20Sopenharmony_ciint sysctl_memory_failure_early_kill 
__read_mostly = 0; 638c2ecf20Sopenharmony_ci 648c2ecf20Sopenharmony_ciint sysctl_memory_failure_recovery __read_mostly = 1; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ciatomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); 678c2ecf20Sopenharmony_ci 688c2ecf20Sopenharmony_cistatic bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) 698c2ecf20Sopenharmony_ci{ 708c2ecf20Sopenharmony_ci if (hugepage_or_freepage) { 718c2ecf20Sopenharmony_ci /* 728c2ecf20Sopenharmony_ci * Doing this check for free pages is also fine since dissolve_free_huge_page 738c2ecf20Sopenharmony_ci * returns 0 for non-hugetlb pages as well. 748c2ecf20Sopenharmony_ci */ 758c2ecf20Sopenharmony_ci if (dissolve_free_huge_page(page) || !take_page_off_buddy(page)) 768c2ecf20Sopenharmony_ci /* 778c2ecf20Sopenharmony_ci * We could fail to take off the target page from buddy 788c2ecf20Sopenharmony_ci * for example due to racy page allocaiton, but that's 798c2ecf20Sopenharmony_ci * acceptable because soft-offlined page is not broken 808c2ecf20Sopenharmony_ci * and if someone really want to use it, they should 818c2ecf20Sopenharmony_ci * take it. 
828c2ecf20Sopenharmony_ci */ 838c2ecf20Sopenharmony_ci return false; 848c2ecf20Sopenharmony_ci } 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_ci SetPageHWPoison(page); 878c2ecf20Sopenharmony_ci if (release) 888c2ecf20Sopenharmony_ci put_page(page); 898c2ecf20Sopenharmony_ci page_ref_inc(page); 908c2ecf20Sopenharmony_ci num_poisoned_pages_inc(); 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_ci return true; 938c2ecf20Sopenharmony_ci} 948c2ecf20Sopenharmony_ci 958c2ecf20Sopenharmony_ci#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) 968c2ecf20Sopenharmony_ci 978c2ecf20Sopenharmony_ciu32 hwpoison_filter_enable = 0; 988c2ecf20Sopenharmony_ciu32 hwpoison_filter_dev_major = ~0U; 998c2ecf20Sopenharmony_ciu32 hwpoison_filter_dev_minor = ~0U; 1008c2ecf20Sopenharmony_ciu64 hwpoison_filter_flags_mask; 1018c2ecf20Sopenharmony_ciu64 hwpoison_filter_flags_value; 1028c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(hwpoison_filter_enable); 1038c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); 1048c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); 1058c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); 1068c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_cistatic int hwpoison_filter_dev(struct page *p) 1098c2ecf20Sopenharmony_ci{ 1108c2ecf20Sopenharmony_ci struct address_space *mapping; 1118c2ecf20Sopenharmony_ci dev_t dev; 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci if (hwpoison_filter_dev_major == ~0U && 1148c2ecf20Sopenharmony_ci hwpoison_filter_dev_minor == ~0U) 1158c2ecf20Sopenharmony_ci return 0; 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_ci /* 1188c2ecf20Sopenharmony_ci * page_mapping() does not accept slab pages. 
1198c2ecf20Sopenharmony_ci */ 1208c2ecf20Sopenharmony_ci if (PageSlab(p)) 1218c2ecf20Sopenharmony_ci return -EINVAL; 1228c2ecf20Sopenharmony_ci 1238c2ecf20Sopenharmony_ci mapping = page_mapping(p); 1248c2ecf20Sopenharmony_ci if (mapping == NULL || mapping->host == NULL) 1258c2ecf20Sopenharmony_ci return -EINVAL; 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci dev = mapping->host->i_sb->s_dev; 1288c2ecf20Sopenharmony_ci if (hwpoison_filter_dev_major != ~0U && 1298c2ecf20Sopenharmony_ci hwpoison_filter_dev_major != MAJOR(dev)) 1308c2ecf20Sopenharmony_ci return -EINVAL; 1318c2ecf20Sopenharmony_ci if (hwpoison_filter_dev_minor != ~0U && 1328c2ecf20Sopenharmony_ci hwpoison_filter_dev_minor != MINOR(dev)) 1338c2ecf20Sopenharmony_ci return -EINVAL; 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_ci return 0; 1368c2ecf20Sopenharmony_ci} 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_cistatic int hwpoison_filter_flags(struct page *p) 1398c2ecf20Sopenharmony_ci{ 1408c2ecf20Sopenharmony_ci if (!hwpoison_filter_flags_mask) 1418c2ecf20Sopenharmony_ci return 0; 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_ci if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == 1448c2ecf20Sopenharmony_ci hwpoison_filter_flags_value) 1458c2ecf20Sopenharmony_ci return 0; 1468c2ecf20Sopenharmony_ci else 1478c2ecf20Sopenharmony_ci return -EINVAL; 1488c2ecf20Sopenharmony_ci} 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci/* 1518c2ecf20Sopenharmony_ci * This allows stress tests to limit test scope to a collection of tasks 1528c2ecf20Sopenharmony_ci * by putting them under some memcg. This prevents killing unrelated/important 1538c2ecf20Sopenharmony_ci * processes such as /sbin/init. Note that the target task may share clean 1548c2ecf20Sopenharmony_ci * pages with init (eg. libc text), which is harmless. 
If the target task 1558c2ecf20Sopenharmony_ci * share _dirty_ pages with another task B, the test scheme must make sure B 1568c2ecf20Sopenharmony_ci * is also included in the memcg. At last, due to race conditions this filter 1578c2ecf20Sopenharmony_ci * can only guarantee that the page either belongs to the memcg tasks, or is 1588c2ecf20Sopenharmony_ci * a freed page. 1598c2ecf20Sopenharmony_ci */ 1608c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG 1618c2ecf20Sopenharmony_ciu64 hwpoison_filter_memcg; 1628c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 1638c2ecf20Sopenharmony_cistatic int hwpoison_filter_task(struct page *p) 1648c2ecf20Sopenharmony_ci{ 1658c2ecf20Sopenharmony_ci if (!hwpoison_filter_memcg) 1668c2ecf20Sopenharmony_ci return 0; 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_ci if (page_cgroup_ino(p) != hwpoison_filter_memcg) 1698c2ecf20Sopenharmony_ci return -EINVAL; 1708c2ecf20Sopenharmony_ci 1718c2ecf20Sopenharmony_ci return 0; 1728c2ecf20Sopenharmony_ci} 1738c2ecf20Sopenharmony_ci#else 1748c2ecf20Sopenharmony_cistatic int hwpoison_filter_task(struct page *p) { return 0; } 1758c2ecf20Sopenharmony_ci#endif 1768c2ecf20Sopenharmony_ci 1778c2ecf20Sopenharmony_ciint hwpoison_filter(struct page *p) 1788c2ecf20Sopenharmony_ci{ 1798c2ecf20Sopenharmony_ci if (!hwpoison_filter_enable) 1808c2ecf20Sopenharmony_ci return 0; 1818c2ecf20Sopenharmony_ci 1828c2ecf20Sopenharmony_ci if (hwpoison_filter_dev(p)) 1838c2ecf20Sopenharmony_ci return -EINVAL; 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci if (hwpoison_filter_flags(p)) 1868c2ecf20Sopenharmony_ci return -EINVAL; 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_ci if (hwpoison_filter_task(p)) 1898c2ecf20Sopenharmony_ci return -EINVAL; 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci return 0; 1928c2ecf20Sopenharmony_ci} 1938c2ecf20Sopenharmony_ci#else 1948c2ecf20Sopenharmony_ciint hwpoison_filter(struct page *p) 1958c2ecf20Sopenharmony_ci{ 1968c2ecf20Sopenharmony_ci return 0; 
1978c2ecf20Sopenharmony_ci} 1988c2ecf20Sopenharmony_ci#endif 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(hwpoison_filter); 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci/* 2038c2ecf20Sopenharmony_ci * Kill all processes that have a poisoned page mapped and then isolate 2048c2ecf20Sopenharmony_ci * the page. 2058c2ecf20Sopenharmony_ci * 2068c2ecf20Sopenharmony_ci * General strategy: 2078c2ecf20Sopenharmony_ci * Find all processes having the page mapped and kill them. 2088c2ecf20Sopenharmony_ci * But we keep a page reference around so that the page is not 2098c2ecf20Sopenharmony_ci * actually freed yet. 2108c2ecf20Sopenharmony_ci * Then stash the page away 2118c2ecf20Sopenharmony_ci * 2128c2ecf20Sopenharmony_ci * There's no convenient way to get back to mapped processes 2138c2ecf20Sopenharmony_ci * from the VMAs. So do a brute-force search over all 2148c2ecf20Sopenharmony_ci * running processes. 2158c2ecf20Sopenharmony_ci * 2168c2ecf20Sopenharmony_ci * Remember that machine checks are not common (or rather 2178c2ecf20Sopenharmony_ci * if they are common you have other problems), so this shouldn't 2188c2ecf20Sopenharmony_ci * be a performance issue. 2198c2ecf20Sopenharmony_ci * 2208c2ecf20Sopenharmony_ci * Also there are some races possible while we get from the 2218c2ecf20Sopenharmony_ci * error detection to actually handle it. 2228c2ecf20Sopenharmony_ci */ 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_cistruct to_kill { 2258c2ecf20Sopenharmony_ci struct list_head nd; 2268c2ecf20Sopenharmony_ci struct task_struct *tsk; 2278c2ecf20Sopenharmony_ci unsigned long addr; 2288c2ecf20Sopenharmony_ci short size_shift; 2298c2ecf20Sopenharmony_ci}; 2308c2ecf20Sopenharmony_ci 2318c2ecf20Sopenharmony_ci/* 2328c2ecf20Sopenharmony_ci * Send all the processes who have the page mapped a signal. 
2338c2ecf20Sopenharmony_ci * ``action optional'' if they are not immediately affected by the error 2348c2ecf20Sopenharmony_ci * ``action required'' if error happened in current execution context 2358c2ecf20Sopenharmony_ci */ 2368c2ecf20Sopenharmony_cistatic int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) 2378c2ecf20Sopenharmony_ci{ 2388c2ecf20Sopenharmony_ci struct task_struct *t = tk->tsk; 2398c2ecf20Sopenharmony_ci short addr_lsb = tk->size_shift; 2408c2ecf20Sopenharmony_ci int ret = 0; 2418c2ecf20Sopenharmony_ci 2428c2ecf20Sopenharmony_ci pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", 2438c2ecf20Sopenharmony_ci pfn, t->comm, t->pid); 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_ci if (flags & MF_ACTION_REQUIRED) { 2468c2ecf20Sopenharmony_ci WARN_ON_ONCE(t != current); 2478c2ecf20Sopenharmony_ci ret = force_sig_mceerr(BUS_MCEERR_AR, 2488c2ecf20Sopenharmony_ci (void __user *)tk->addr, addr_lsb); 2498c2ecf20Sopenharmony_ci } else { 2508c2ecf20Sopenharmony_ci /* 2518c2ecf20Sopenharmony_ci * Don't use force here, it's convenient if the signal 2528c2ecf20Sopenharmony_ci * can be temporarily blocked. 2538c2ecf20Sopenharmony_ci * This could cause a loop when the user sets SIGBUS 2548c2ecf20Sopenharmony_ci * to SIG_IGN, but hopefully no one will do that? 2558c2ecf20Sopenharmony_ci */ 2568c2ecf20Sopenharmony_ci ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, 2578c2ecf20Sopenharmony_ci addr_lsb, t); /* synchronous? 
*/ 2588c2ecf20Sopenharmony_ci } 2598c2ecf20Sopenharmony_ci if (ret < 0) 2608c2ecf20Sopenharmony_ci pr_info("Memory failure: Error sending signal to %s:%d: %d\n", 2618c2ecf20Sopenharmony_ci t->comm, t->pid, ret); 2628c2ecf20Sopenharmony_ci return ret; 2638c2ecf20Sopenharmony_ci} 2648c2ecf20Sopenharmony_ci 2658c2ecf20Sopenharmony_ci/* 2668c2ecf20Sopenharmony_ci * When a unknown page type is encountered drain as many buffers as possible 2678c2ecf20Sopenharmony_ci * in the hope to turn the page into a LRU or free page, which we can handle. 2688c2ecf20Sopenharmony_ci */ 2698c2ecf20Sopenharmony_civoid shake_page(struct page *p, int access) 2708c2ecf20Sopenharmony_ci{ 2718c2ecf20Sopenharmony_ci if (PageHuge(p)) 2728c2ecf20Sopenharmony_ci return; 2738c2ecf20Sopenharmony_ci 2748c2ecf20Sopenharmony_ci if (!PageSlab(p)) { 2758c2ecf20Sopenharmony_ci lru_add_drain_all(); 2768c2ecf20Sopenharmony_ci if (PageLRU(p)) 2778c2ecf20Sopenharmony_ci return; 2788c2ecf20Sopenharmony_ci drain_all_pages(page_zone(p)); 2798c2ecf20Sopenharmony_ci if (PageLRU(p) || is_free_buddy_page(p)) 2808c2ecf20Sopenharmony_ci return; 2818c2ecf20Sopenharmony_ci } 2828c2ecf20Sopenharmony_ci 2838c2ecf20Sopenharmony_ci /* 2848c2ecf20Sopenharmony_ci * Only call shrink_node_slabs here (which would also shrink 2858c2ecf20Sopenharmony_ci * other caches) if access is not potentially fatal. 
2868c2ecf20Sopenharmony_ci */ 2878c2ecf20Sopenharmony_ci if (access) 2888c2ecf20Sopenharmony_ci drop_slab_node(page_to_nid(p)); 2898c2ecf20Sopenharmony_ci} 2908c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(shake_page); 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_cistatic unsigned long dev_pagemap_mapping_shift(struct page *page, 2938c2ecf20Sopenharmony_ci struct vm_area_struct *vma) 2948c2ecf20Sopenharmony_ci{ 2958c2ecf20Sopenharmony_ci unsigned long address = vma_address(page, vma); 2968c2ecf20Sopenharmony_ci pgd_t *pgd; 2978c2ecf20Sopenharmony_ci p4d_t *p4d; 2988c2ecf20Sopenharmony_ci pud_t *pud; 2998c2ecf20Sopenharmony_ci pmd_t *pmd; 3008c2ecf20Sopenharmony_ci pte_t *pte; 3018c2ecf20Sopenharmony_ci 3028c2ecf20Sopenharmony_ci pgd = pgd_offset(vma->vm_mm, address); 3038c2ecf20Sopenharmony_ci if (!pgd_present(*pgd)) 3048c2ecf20Sopenharmony_ci return 0; 3058c2ecf20Sopenharmony_ci p4d = p4d_offset(pgd, address); 3068c2ecf20Sopenharmony_ci if (!p4d_present(*p4d)) 3078c2ecf20Sopenharmony_ci return 0; 3088c2ecf20Sopenharmony_ci pud = pud_offset(p4d, address); 3098c2ecf20Sopenharmony_ci if (!pud_present(*pud)) 3108c2ecf20Sopenharmony_ci return 0; 3118c2ecf20Sopenharmony_ci if (pud_devmap(*pud)) 3128c2ecf20Sopenharmony_ci return PUD_SHIFT; 3138c2ecf20Sopenharmony_ci pmd = pmd_offset(pud, address); 3148c2ecf20Sopenharmony_ci if (!pmd_present(*pmd)) 3158c2ecf20Sopenharmony_ci return 0; 3168c2ecf20Sopenharmony_ci if (pmd_devmap(*pmd)) 3178c2ecf20Sopenharmony_ci return PMD_SHIFT; 3188c2ecf20Sopenharmony_ci pte = pte_offset_map(pmd, address); 3198c2ecf20Sopenharmony_ci if (!pte_present(*pte)) 3208c2ecf20Sopenharmony_ci return 0; 3218c2ecf20Sopenharmony_ci if (pte_devmap(*pte)) 3228c2ecf20Sopenharmony_ci return PAGE_SHIFT; 3238c2ecf20Sopenharmony_ci return 0; 3248c2ecf20Sopenharmony_ci} 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_ci/* 3278c2ecf20Sopenharmony_ci * Failure handling: if we can't find or can't kill a process there's 3288c2ecf20Sopenharmony_ci * not much we 
can do. We just print a message and ignore otherwise. 3298c2ecf20Sopenharmony_ci */ 3308c2ecf20Sopenharmony_ci 3318c2ecf20Sopenharmony_ci/* 3328c2ecf20Sopenharmony_ci * Schedule a process for later kill. 3338c2ecf20Sopenharmony_ci * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. 3348c2ecf20Sopenharmony_ci */ 3358c2ecf20Sopenharmony_cistatic void add_to_kill(struct task_struct *tsk, struct page *p, 3368c2ecf20Sopenharmony_ci struct vm_area_struct *vma, 3378c2ecf20Sopenharmony_ci struct list_head *to_kill) 3388c2ecf20Sopenharmony_ci{ 3398c2ecf20Sopenharmony_ci struct to_kill *tk; 3408c2ecf20Sopenharmony_ci 3418c2ecf20Sopenharmony_ci tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); 3428c2ecf20Sopenharmony_ci if (!tk) { 3438c2ecf20Sopenharmony_ci pr_err("Memory failure: Out of memory while machine check handling\n"); 3448c2ecf20Sopenharmony_ci return; 3458c2ecf20Sopenharmony_ci } 3468c2ecf20Sopenharmony_ci 3478c2ecf20Sopenharmony_ci tk->addr = page_address_in_vma(p, vma); 3488c2ecf20Sopenharmony_ci if (is_zone_device_page(p)) 3498c2ecf20Sopenharmony_ci tk->size_shift = dev_pagemap_mapping_shift(p, vma); 3508c2ecf20Sopenharmony_ci else 3518c2ecf20Sopenharmony_ci tk->size_shift = page_shift(compound_head(p)); 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci /* 3548c2ecf20Sopenharmony_ci * Send SIGKILL if "tk->addr == -EFAULT". Also, as 3558c2ecf20Sopenharmony_ci * "tk->size_shift" is always non-zero for !is_zone_device_page(), 3568c2ecf20Sopenharmony_ci * so "tk->size_shift == 0" effectively checks no mapping on 3578c2ecf20Sopenharmony_ci * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times 3588c2ecf20Sopenharmony_ci * to a process' address space, it's possible not all N VMAs 3598c2ecf20Sopenharmony_ci * contain mappings for the page, but at least one VMA does. 3608c2ecf20Sopenharmony_ci * Only deliver SIGBUS with payload derived from the VMA that 3618c2ecf20Sopenharmony_ci * has a mapping for the page. 
3628c2ecf20Sopenharmony_ci */ 3638c2ecf20Sopenharmony_ci if (tk->addr == -EFAULT) { 3648c2ecf20Sopenharmony_ci pr_info("Memory failure: Unable to find user space address %lx in %s\n", 3658c2ecf20Sopenharmony_ci page_to_pfn(p), tsk->comm); 3668c2ecf20Sopenharmony_ci } else if (tk->size_shift == 0) { 3678c2ecf20Sopenharmony_ci kfree(tk); 3688c2ecf20Sopenharmony_ci return; 3698c2ecf20Sopenharmony_ci } 3708c2ecf20Sopenharmony_ci 3718c2ecf20Sopenharmony_ci get_task_struct(tsk); 3728c2ecf20Sopenharmony_ci tk->tsk = tsk; 3738c2ecf20Sopenharmony_ci list_add_tail(&tk->nd, to_kill); 3748c2ecf20Sopenharmony_ci} 3758c2ecf20Sopenharmony_ci 3768c2ecf20Sopenharmony_ci/* 3778c2ecf20Sopenharmony_ci * Kill the processes that have been collected earlier. 3788c2ecf20Sopenharmony_ci * 3798c2ecf20Sopenharmony_ci * Only do anything when DOIT is set, otherwise just free the list 3808c2ecf20Sopenharmony_ci * (this is used for clean pages which do not need killing) 3818c2ecf20Sopenharmony_ci * Also when FAIL is set do a force kill because something went 3828c2ecf20Sopenharmony_ci * wrong earlier. 3838c2ecf20Sopenharmony_ci */ 3848c2ecf20Sopenharmony_cistatic void kill_procs(struct list_head *to_kill, int forcekill, bool fail, 3858c2ecf20Sopenharmony_ci unsigned long pfn, int flags) 3868c2ecf20Sopenharmony_ci{ 3878c2ecf20Sopenharmony_ci struct to_kill *tk, *next; 3888c2ecf20Sopenharmony_ci 3898c2ecf20Sopenharmony_ci list_for_each_entry_safe (tk, next, to_kill, nd) { 3908c2ecf20Sopenharmony_ci if (forcekill) { 3918c2ecf20Sopenharmony_ci /* 3928c2ecf20Sopenharmony_ci * In case something went wrong with munmapping 3938c2ecf20Sopenharmony_ci * make sure the process doesn't catch the 3948c2ecf20Sopenharmony_ci * signal and then access the memory. Just kill it. 
3958c2ecf20Sopenharmony_ci */ 3968c2ecf20Sopenharmony_ci if (fail || tk->addr == -EFAULT) { 3978c2ecf20Sopenharmony_ci pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", 3988c2ecf20Sopenharmony_ci pfn, tk->tsk->comm, tk->tsk->pid); 3998c2ecf20Sopenharmony_ci do_send_sig_info(SIGKILL, SEND_SIG_PRIV, 4008c2ecf20Sopenharmony_ci tk->tsk, PIDTYPE_PID); 4018c2ecf20Sopenharmony_ci } 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_ci /* 4048c2ecf20Sopenharmony_ci * In theory the process could have mapped 4058c2ecf20Sopenharmony_ci * something else on the address in-between. We could 4068c2ecf20Sopenharmony_ci * check for that, but we need to tell the 4078c2ecf20Sopenharmony_ci * process anyways. 4088c2ecf20Sopenharmony_ci */ 4098c2ecf20Sopenharmony_ci else if (kill_proc(tk, pfn, flags) < 0) 4108c2ecf20Sopenharmony_ci pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n", 4118c2ecf20Sopenharmony_ci pfn, tk->tsk->comm, tk->tsk->pid); 4128c2ecf20Sopenharmony_ci } 4138c2ecf20Sopenharmony_ci put_task_struct(tk->tsk); 4148c2ecf20Sopenharmony_ci kfree(tk); 4158c2ecf20Sopenharmony_ci } 4168c2ecf20Sopenharmony_ci} 4178c2ecf20Sopenharmony_ci 4188c2ecf20Sopenharmony_ci/* 4198c2ecf20Sopenharmony_ci * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) 4208c2ecf20Sopenharmony_ci * on behalf of the thread group. Return task_struct of the (first found) 4218c2ecf20Sopenharmony_ci * dedicated thread if found, and return NULL otherwise. 4228c2ecf20Sopenharmony_ci * 4238c2ecf20Sopenharmony_ci * We already hold read_lock(&tasklist_lock) in the caller, so we don't 4248c2ecf20Sopenharmony_ci * have to call rcu_read_lock/unlock() in this function. 
4258c2ecf20Sopenharmony_ci */ 4268c2ecf20Sopenharmony_cistatic struct task_struct *find_early_kill_thread(struct task_struct *tsk) 4278c2ecf20Sopenharmony_ci{ 4288c2ecf20Sopenharmony_ci struct task_struct *t; 4298c2ecf20Sopenharmony_ci 4308c2ecf20Sopenharmony_ci for_each_thread(tsk, t) { 4318c2ecf20Sopenharmony_ci if (t->flags & PF_MCE_PROCESS) { 4328c2ecf20Sopenharmony_ci if (t->flags & PF_MCE_EARLY) 4338c2ecf20Sopenharmony_ci return t; 4348c2ecf20Sopenharmony_ci } else { 4358c2ecf20Sopenharmony_ci if (sysctl_memory_failure_early_kill) 4368c2ecf20Sopenharmony_ci return t; 4378c2ecf20Sopenharmony_ci } 4388c2ecf20Sopenharmony_ci } 4398c2ecf20Sopenharmony_ci return NULL; 4408c2ecf20Sopenharmony_ci} 4418c2ecf20Sopenharmony_ci 4428c2ecf20Sopenharmony_ci/* 4438c2ecf20Sopenharmony_ci * Determine whether a given process is "early kill" process which expects 4448c2ecf20Sopenharmony_ci * to be signaled when some page under the process is hwpoisoned. 4458c2ecf20Sopenharmony_ci * Return task_struct of the dedicated thread (main thread unless explicitly 4468c2ecf20Sopenharmony_ci * specified) if the process is "early kill," and otherwise returns NULL. 4478c2ecf20Sopenharmony_ci * 4488c2ecf20Sopenharmony_ci * Note that the above is true for Action Optional case, but not for Action 4498c2ecf20Sopenharmony_ci * Required case where SIGBUS should sent only to the current thread. 4508c2ecf20Sopenharmony_ci */ 4518c2ecf20Sopenharmony_cistatic struct task_struct *task_early_kill(struct task_struct *tsk, 4528c2ecf20Sopenharmony_ci int force_early) 4538c2ecf20Sopenharmony_ci{ 4548c2ecf20Sopenharmony_ci if (!tsk->mm) 4558c2ecf20Sopenharmony_ci return NULL; 4568c2ecf20Sopenharmony_ci if (force_early) { 4578c2ecf20Sopenharmony_ci /* 4588c2ecf20Sopenharmony_ci * Comparing ->mm here because current task might represent 4598c2ecf20Sopenharmony_ci * a subthread, while tsk always points to the main thread. 
4608c2ecf20Sopenharmony_ci */ 4618c2ecf20Sopenharmony_ci if (tsk->mm == current->mm) 4628c2ecf20Sopenharmony_ci return current; 4638c2ecf20Sopenharmony_ci else 4648c2ecf20Sopenharmony_ci return NULL; 4658c2ecf20Sopenharmony_ci } 4668c2ecf20Sopenharmony_ci return find_early_kill_thread(tsk); 4678c2ecf20Sopenharmony_ci} 4688c2ecf20Sopenharmony_ci 4698c2ecf20Sopenharmony_ci/* 4708c2ecf20Sopenharmony_ci * Collect processes when the error hit an anonymous page. 4718c2ecf20Sopenharmony_ci */ 4728c2ecf20Sopenharmony_cistatic void collect_procs_anon(struct page *page, struct list_head *to_kill, 4738c2ecf20Sopenharmony_ci int force_early) 4748c2ecf20Sopenharmony_ci{ 4758c2ecf20Sopenharmony_ci struct vm_area_struct *vma; 4768c2ecf20Sopenharmony_ci struct task_struct *tsk; 4778c2ecf20Sopenharmony_ci struct anon_vma *av; 4788c2ecf20Sopenharmony_ci pgoff_t pgoff; 4798c2ecf20Sopenharmony_ci 4808c2ecf20Sopenharmony_ci av = page_lock_anon_vma_read(page); 4818c2ecf20Sopenharmony_ci if (av == NULL) /* Not actually mapped anymore */ 4828c2ecf20Sopenharmony_ci return; 4838c2ecf20Sopenharmony_ci 4848c2ecf20Sopenharmony_ci pgoff = page_to_pgoff(page); 4858c2ecf20Sopenharmony_ci read_lock(&tasklist_lock); 4868c2ecf20Sopenharmony_ci for_each_process (tsk) { 4878c2ecf20Sopenharmony_ci struct anon_vma_chain *vmac; 4888c2ecf20Sopenharmony_ci struct task_struct *t = task_early_kill(tsk, force_early); 4898c2ecf20Sopenharmony_ci 4908c2ecf20Sopenharmony_ci if (!t) 4918c2ecf20Sopenharmony_ci continue; 4928c2ecf20Sopenharmony_ci anon_vma_interval_tree_foreach(vmac, &av->rb_root, 4938c2ecf20Sopenharmony_ci pgoff, pgoff) { 4948c2ecf20Sopenharmony_ci vma = vmac->vma; 4958c2ecf20Sopenharmony_ci if (!page_mapped_in_vma(page, vma)) 4968c2ecf20Sopenharmony_ci continue; 4978c2ecf20Sopenharmony_ci if (vma->vm_mm == t->mm) 4988c2ecf20Sopenharmony_ci add_to_kill(t, page, vma, to_kill); 4998c2ecf20Sopenharmony_ci } 5008c2ecf20Sopenharmony_ci } 5018c2ecf20Sopenharmony_ci read_unlock(&tasklist_lock); 
5028c2ecf20Sopenharmony_ci page_unlock_anon_vma_read(av); 5038c2ecf20Sopenharmony_ci} 5048c2ecf20Sopenharmony_ci 5058c2ecf20Sopenharmony_ci/* 5068c2ecf20Sopenharmony_ci * Collect processes when the error hit a file mapped page. 5078c2ecf20Sopenharmony_ci */ 5088c2ecf20Sopenharmony_cistatic void collect_procs_file(struct page *page, struct list_head *to_kill, 5098c2ecf20Sopenharmony_ci int force_early) 5108c2ecf20Sopenharmony_ci{ 5118c2ecf20Sopenharmony_ci struct vm_area_struct *vma; 5128c2ecf20Sopenharmony_ci struct task_struct *tsk; 5138c2ecf20Sopenharmony_ci struct address_space *mapping = page->mapping; 5148c2ecf20Sopenharmony_ci pgoff_t pgoff; 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_ci i_mmap_lock_read(mapping); 5178c2ecf20Sopenharmony_ci read_lock(&tasklist_lock); 5188c2ecf20Sopenharmony_ci pgoff = page_to_pgoff(page); 5198c2ecf20Sopenharmony_ci for_each_process(tsk) { 5208c2ecf20Sopenharmony_ci struct task_struct *t = task_early_kill(tsk, force_early); 5218c2ecf20Sopenharmony_ci 5228c2ecf20Sopenharmony_ci if (!t) 5238c2ecf20Sopenharmony_ci continue; 5248c2ecf20Sopenharmony_ci vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, 5258c2ecf20Sopenharmony_ci pgoff) { 5268c2ecf20Sopenharmony_ci /* 5278c2ecf20Sopenharmony_ci * Send early kill signal to tasks where a vma covers 5288c2ecf20Sopenharmony_ci * the page but the corrupted page is not necessarily 5298c2ecf20Sopenharmony_ci * mapped it in its pte. 5308c2ecf20Sopenharmony_ci * Assume applications who requested early kill want 5318c2ecf20Sopenharmony_ci * to be informed of all such data corruptions. 
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * Anonymous pages are resolved through the anon_vma rmap chains,
 * file pages through the address_space i_mmap tree.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	/* No mapping means nobody has it mapped: nothing to collect. */
	if (!page->mapping)
		return;

	if (PageAnon(page))
		collect_procs_anon(page, tokill, force_early);
	else
		collect_procs_file(page, tokill, force_early);
}

/* Human-readable names for enum mf_result, used in recovery messages. */
static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

/* Human-readable names for enum mf_action_page_type. */
static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_DAX]			= "dax page",
	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(p);

		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		put_page(p);
		return 0;
	}
	/* Isolation failed: page was not on the LRU (or raced away). */
	return -EIO;
}

/*
 * Punch the poisoned page out of the page cache, preferably through the
 * filesystem's ->error_remove_page(), falling back to plain invalidation.
 * Returns MF_RECOVERED when the page was removed, MF_FAILED otherwise.
 */
static int truncate_error_page(struct page *p, unsigned long pfn,
				struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_page) {
		int err = mapping->a_ops->error_remove_page(mapping, p);

		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
			   !try_to_release_page(p, GFP_NOIO)) {
			/* Buffers still attached: can't declare it recovered. */
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate
		 * This fails on dirty or anything with private pages
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}

	return ret;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done the only reference left
	 * should be the one m_f() holds.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch"
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile.
		 */
		return MF_FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	return truncate_error_page(p, pfn, mapping);
}

/*
 * Dirty pagecache page
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO error
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_AIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped.  If an
		 * application assumes it will always get error on
		 * fsync, but does other operations on the fd before
		 * and the page is dropped between then the error
		 * will not be properly reported.
		 *
		 * This can already happen even without hwpoisoned
		 * pages: first on metadata IO errors (which only
		 * report through AS_EIO) or when the page is dropped
		 * at the wrong time.
		 *
		 * So right now we assume that the application DTRT on
		 * the first EIO, but we're not worse than other parts
		 * of the kernel.
		 */
		mapping_set_error(mapping, -EIO);
	}

	/* The actual removal is the same as for the clean case. */
	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache(ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *	- clear dirty bit to prevent IO
 *	- remove from LRU
 *	- but keep in the swap cache, so that when we return to it on
 *	  a later page fault, we know the application is accessing
 *	  corrupted data and shall be killed (we installed simple
 *	  interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	/* MF_DELAYED: page stays in swap cache to intercept later faults. */
	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	/* Safe to drop: a later fault re-reads the good data from disk. */
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);
	struct address_space *mapping;

	/* Not a hugetlb page (anymore): let a later pass deal with it. */
	if (!PageHuge(hpage))
		return MF_DELAYED;

	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, pfn, mapping);
	} else {
		/* Caller holds the page lock; drop it around the dissolve. */
		unlock_page(hpage);
		/*
		 * migration entry prevents later access on error anonymous
		 * hugepage, so we can free and dissolve it into buddy to
		 * save healthy subpages.
		 */
		if (PageAnon(hpage))
			put_page(hpage);
		dissolve_free_huge_page(p);
		res = MF_RECOVERED;
		lock_page(hpage);
	}

	return res;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

/*
 * Each entry: a page matches when (page->flags & mask) == res; the first
 * matching entry's handler is invoked. Order therefore matters.
 */
static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,		me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef lru
#undef head
#undef slab
#undef reserved

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

/*
 * Run the matched handler for @p and report the outcome.
 * A handler result of MF_RECOVERED/MF_DELAYED maps to 0; anything else
 * (or a leftover elevated refcount) maps to -EBUSY.
 */
static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	/* One reference is ours (taken by memory_failure()); any more is a leak. */
	count = page_count(p) - 1;
	/* Dirty swap cache keeps the page intentionally (see me_swapcache_dirty). */
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count > 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == MF_RECOVERED || result == MF_DELAYED) ?
		0 : -EBUSY;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page: raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value.)
 */
static int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * Non anonymous thp exists only in allocation/free time. We
		 * can't handle such a case correctly, so let's give it up.
		 * This should be better than triggering BUG_ON when kernel
		 * tries to touch the "partially handled" page.
		 */
		if (!PageAnon(head)) {
			pr_err("Memory failure: %#lx: non anonymous thp\n",
				page_to_pfn(page));
			return 0;
		}
	}

	if (get_page_unless_zero(head)) {
		/*
		 * Re-check the head: a THP split may have raced with us, in
		 * which case we pinned the wrong compound page and must back
		 * off.
		 */
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 *
 * Returns true when the page was successfully unmapped (or there was
 * nothing to unmap); false when the poison could not be isolated from
 * user space. @hpagep points at the compound head page.
 */
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_IGNORE_MLOCK;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success = true;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	/* Remember mlocked state: try_to_unmap() clears it (see below). */
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(p))
		return true;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return false;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
10328c2ecf20Sopenharmony_ci */ 10338c2ecf20Sopenharmony_ci mapping = page_mapping(hpage); 10348c2ecf20Sopenharmony_ci if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && 10358c2ecf20Sopenharmony_ci mapping_can_writeback(mapping)) { 10368c2ecf20Sopenharmony_ci if (page_mkclean(hpage)) { 10378c2ecf20Sopenharmony_ci SetPageDirty(hpage); 10388c2ecf20Sopenharmony_ci } else { 10398c2ecf20Sopenharmony_ci kill = 0; 10408c2ecf20Sopenharmony_ci ttu |= TTU_IGNORE_HWPOISON; 10418c2ecf20Sopenharmony_ci pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n", 10428c2ecf20Sopenharmony_ci pfn); 10438c2ecf20Sopenharmony_ci } 10448c2ecf20Sopenharmony_ci } 10458c2ecf20Sopenharmony_ci 10468c2ecf20Sopenharmony_ci /* 10478c2ecf20Sopenharmony_ci * First collect all the processes that have the page 10488c2ecf20Sopenharmony_ci * mapped in dirty form. This has to be done before try_to_unmap, 10498c2ecf20Sopenharmony_ci * because ttu takes the rmap data structures down. 10508c2ecf20Sopenharmony_ci * 10518c2ecf20Sopenharmony_ci * Error handling: We ignore errors here because 10528c2ecf20Sopenharmony_ci * there's nothing that can be done. 10538c2ecf20Sopenharmony_ci */ 10548c2ecf20Sopenharmony_ci if (kill) 10558c2ecf20Sopenharmony_ci collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); 10568c2ecf20Sopenharmony_ci 10578c2ecf20Sopenharmony_ci if (!PageHuge(hpage)) { 10588c2ecf20Sopenharmony_ci unmap_success = try_to_unmap(hpage, ttu); 10598c2ecf20Sopenharmony_ci } else { 10608c2ecf20Sopenharmony_ci if (!PageAnon(hpage)) { 10618c2ecf20Sopenharmony_ci /* 10628c2ecf20Sopenharmony_ci * For hugetlb pages in shared mappings, try_to_unmap 10638c2ecf20Sopenharmony_ci * could potentially call huge_pmd_unshare. Because of 10648c2ecf20Sopenharmony_ci * this, take semaphore in write mode here and set 10658c2ecf20Sopenharmony_ci * TTU_RMAP_LOCKED to indicate we have taken the lock 10668c2ecf20Sopenharmony_ci * at this higer level. 
10678c2ecf20Sopenharmony_ci */ 10688c2ecf20Sopenharmony_ci mapping = hugetlb_page_mapping_lock_write(hpage); 10698c2ecf20Sopenharmony_ci if (mapping) { 10708c2ecf20Sopenharmony_ci unmap_success = try_to_unmap(hpage, 10718c2ecf20Sopenharmony_ci ttu|TTU_RMAP_LOCKED); 10728c2ecf20Sopenharmony_ci i_mmap_unlock_write(mapping); 10738c2ecf20Sopenharmony_ci } else { 10748c2ecf20Sopenharmony_ci pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); 10758c2ecf20Sopenharmony_ci unmap_success = false; 10768c2ecf20Sopenharmony_ci } 10778c2ecf20Sopenharmony_ci } else { 10788c2ecf20Sopenharmony_ci unmap_success = try_to_unmap(p, ttu); 10798c2ecf20Sopenharmony_ci } 10808c2ecf20Sopenharmony_ci } 10818c2ecf20Sopenharmony_ci if (!unmap_success) 10828c2ecf20Sopenharmony_ci pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", 10838c2ecf20Sopenharmony_ci pfn, page_mapcount(p)); 10848c2ecf20Sopenharmony_ci 10858c2ecf20Sopenharmony_ci /* 10868c2ecf20Sopenharmony_ci * try_to_unmap() might put mlocked page in lru cache, so call 10878c2ecf20Sopenharmony_ci * shake_page() again to ensure that it's flushed. 10888c2ecf20Sopenharmony_ci */ 10898c2ecf20Sopenharmony_ci if (mlocked) 10908c2ecf20Sopenharmony_ci shake_page(hpage, 0); 10918c2ecf20Sopenharmony_ci 10928c2ecf20Sopenharmony_ci /* 10938c2ecf20Sopenharmony_ci * Now that the dirty bit has been propagated to the 10948c2ecf20Sopenharmony_ci * struct page and all unmaps done we can decide if 10958c2ecf20Sopenharmony_ci * killing is needed or not. Only kill when the page 10968c2ecf20Sopenharmony_ci * was dirty or the process is not restartable, 10978c2ecf20Sopenharmony_ci * otherwise the tokill list is merely 10988c2ecf20Sopenharmony_ci * freed. When there was a problem unmapping earlier 10998c2ecf20Sopenharmony_ci * use a more force-full uncatchable kill to prevent 11008c2ecf20Sopenharmony_ci * any accesses to the poisoned memory. 
11018c2ecf20Sopenharmony_ci */ 11028c2ecf20Sopenharmony_ci forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); 11038c2ecf20Sopenharmony_ci kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); 11048c2ecf20Sopenharmony_ci 11058c2ecf20Sopenharmony_ci return unmap_success; 11068c2ecf20Sopenharmony_ci} 11078c2ecf20Sopenharmony_ci 11088c2ecf20Sopenharmony_cistatic int identify_page_state(unsigned long pfn, struct page *p, 11098c2ecf20Sopenharmony_ci unsigned long page_flags) 11108c2ecf20Sopenharmony_ci{ 11118c2ecf20Sopenharmony_ci struct page_state *ps; 11128c2ecf20Sopenharmony_ci 11138c2ecf20Sopenharmony_ci /* 11148c2ecf20Sopenharmony_ci * The first check uses the current page flags which may not have any 11158c2ecf20Sopenharmony_ci * relevant information. The second check with the saved page flags is 11168c2ecf20Sopenharmony_ci * carried out only if the first check can't determine the page status. 11178c2ecf20Sopenharmony_ci */ 11188c2ecf20Sopenharmony_ci for (ps = error_states;; ps++) 11198c2ecf20Sopenharmony_ci if ((p->flags & ps->mask) == ps->res) 11208c2ecf20Sopenharmony_ci break; 11218c2ecf20Sopenharmony_ci 11228c2ecf20Sopenharmony_ci page_flags |= (p->flags & (1UL << PG_dirty)); 11238c2ecf20Sopenharmony_ci 11248c2ecf20Sopenharmony_ci if (!ps->mask) 11258c2ecf20Sopenharmony_ci for (ps = error_states;; ps++) 11268c2ecf20Sopenharmony_ci if ((page_flags & ps->mask) == ps->res) 11278c2ecf20Sopenharmony_ci break; 11288c2ecf20Sopenharmony_ci return page_action(ps, p, pfn); 11298c2ecf20Sopenharmony_ci} 11308c2ecf20Sopenharmony_ci 11318c2ecf20Sopenharmony_cistatic int try_to_split_thp_page(struct page *page, const char *msg) 11328c2ecf20Sopenharmony_ci{ 11338c2ecf20Sopenharmony_ci lock_page(page); 11348c2ecf20Sopenharmony_ci if (!PageAnon(page) || unlikely(split_huge_page(page))) { 11358c2ecf20Sopenharmony_ci unsigned long pfn = page_to_pfn(page); 11368c2ecf20Sopenharmony_ci 11378c2ecf20Sopenharmony_ci unlock_page(page); 11388c2ecf20Sopenharmony_ci if 
(!PageAnon(page))
			pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
		else
			pr_info("%s: %#lx: thp split failed\n", msg, pfn);
		put_page(page);
		return -EBUSY;
	}
	unlock_page(page);

	return 0;
}

/*
 * Handle a memory failure hitting a hugetlb page.
 * Returns 0 when handled (including the free-hugepage and filter-hit
 * cases), -EHWPOISON when the page was already poisoned, -EBUSY on
 * failure to contain the error.
 */
static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
	struct page *p = pfn_to_page(pfn);
	struct page *head = compound_head(p);
	int res;
	unsigned long page_flags;

	if (TestSetPageHWPoison(head)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
		       pfn);
		return -EHWPOISON;
	}

	num_poisoned_pages_inc();

	/* No refcount could be taken: the hugepage is (probably) free. */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		/*
		 * Check "filter hit" and "race with other subpage."
		 */
		lock_page(head);
		if (PageHWPoison(head)) {
			if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
			    || (p != head && TestSetPageHWPoison(head))) {
				num_poisoned_pages_dec();
				unlock_page(head);
				return 0;
			}
		}
		unlock_page(head);
		/* Free hugepage: break it up so healthy subpages survive. */
		dissolve_free_huge_page(p);
		action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
		return 0;
	}

	lock_page(head);
	/* Snapshot the flags before unmapping mangles them. */
	page_flags = head->flags;

	if (!PageHWPoison(head)) {
		/* Unpoison raced with us: undo our accounting and bail. */
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(head);
		put_page(head);
		return 0;
	}

	/*
	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
	 * simply disable it. In order to make it work properly, we need
	 * to make sure that:
	 *  - conversion of a pud that maps an error hugetlb into hwpoison
	 *    entry properly works, and
	 *  - other mm code walking over page table is aware of pud-aligned
	 *    hwpoison entries.
	 */
	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(head);
	return res;
}

/*
 * Handle a memory failure on a ZONE_DEVICE (e.g. dax) page. These pages
 * are not page-cache/LRU pages and are never returned to the page
 * allocator, so handling differs from the generic path.
 */
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
		struct dev_pagemap *pgmap)
{
	struct page *page = pfn_to_page(pfn);
	const bool unmap_success = true;
	unsigned long size = 0;
	struct to_kill *tk;
	LIST_HEAD(tokill);
	int rc = -EBUSY;
	loff_t start;
	dax_entry_t cookie;

	if (flags & MF_COUNT_INCREASED)
		/*
		 * Drop the extra refcount in case we come from madvise().
12378c2ecf20Sopenharmony_ci */ 12388c2ecf20Sopenharmony_ci put_page(page); 12398c2ecf20Sopenharmony_ci 12408c2ecf20Sopenharmony_ci /* device metadata space is not recoverable */ 12418c2ecf20Sopenharmony_ci if (!pgmap_pfn_valid(pgmap, pfn)) { 12428c2ecf20Sopenharmony_ci rc = -ENXIO; 12438c2ecf20Sopenharmony_ci goto out; 12448c2ecf20Sopenharmony_ci } 12458c2ecf20Sopenharmony_ci 12468c2ecf20Sopenharmony_ci /* 12478c2ecf20Sopenharmony_ci * Prevent the inode from being freed while we are interrogating 12488c2ecf20Sopenharmony_ci * the address_space, typically this would be handled by 12498c2ecf20Sopenharmony_ci * lock_page(), but dax pages do not use the page lock. This 12508c2ecf20Sopenharmony_ci * also prevents changes to the mapping of this pfn until 12518c2ecf20Sopenharmony_ci * poison signaling is complete. 12528c2ecf20Sopenharmony_ci */ 12538c2ecf20Sopenharmony_ci cookie = dax_lock_page(page); 12548c2ecf20Sopenharmony_ci if (!cookie) 12558c2ecf20Sopenharmony_ci goto out; 12568c2ecf20Sopenharmony_ci 12578c2ecf20Sopenharmony_ci if (hwpoison_filter(page)) { 12588c2ecf20Sopenharmony_ci rc = 0; 12598c2ecf20Sopenharmony_ci goto unlock; 12608c2ecf20Sopenharmony_ci } 12618c2ecf20Sopenharmony_ci 12628c2ecf20Sopenharmony_ci if (pgmap->type == MEMORY_DEVICE_PRIVATE) { 12638c2ecf20Sopenharmony_ci /* 12648c2ecf20Sopenharmony_ci * TODO: Handle HMM pages which may need coordination 12658c2ecf20Sopenharmony_ci * with device-side memory. 12668c2ecf20Sopenharmony_ci */ 12678c2ecf20Sopenharmony_ci goto unlock; 12688c2ecf20Sopenharmony_ci } 12698c2ecf20Sopenharmony_ci 12708c2ecf20Sopenharmony_ci /* 12718c2ecf20Sopenharmony_ci * Use this flag as an indication that the dax page has been 12728c2ecf20Sopenharmony_ci * remapped UC to prevent speculative consumption of poison. 
12738c2ecf20Sopenharmony_ci */ 12748c2ecf20Sopenharmony_ci SetPageHWPoison(page); 12758c2ecf20Sopenharmony_ci 12768c2ecf20Sopenharmony_ci /* 12778c2ecf20Sopenharmony_ci * Unlike System-RAM there is no possibility to swap in a 12788c2ecf20Sopenharmony_ci * different physical page at a given virtual address, so all 12798c2ecf20Sopenharmony_ci * userspace consumption of ZONE_DEVICE memory necessitates 12808c2ecf20Sopenharmony_ci * SIGBUS (i.e. MF_MUST_KILL) 12818c2ecf20Sopenharmony_ci */ 12828c2ecf20Sopenharmony_ci flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; 12838c2ecf20Sopenharmony_ci collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED); 12848c2ecf20Sopenharmony_ci 12858c2ecf20Sopenharmony_ci list_for_each_entry(tk, &tokill, nd) 12868c2ecf20Sopenharmony_ci if (tk->size_shift) 12878c2ecf20Sopenharmony_ci size = max(size, 1UL << tk->size_shift); 12888c2ecf20Sopenharmony_ci if (size) { 12898c2ecf20Sopenharmony_ci /* 12908c2ecf20Sopenharmony_ci * Unmap the largest mapping to avoid breaking up 12918c2ecf20Sopenharmony_ci * device-dax mappings which are constant size. The 12928c2ecf20Sopenharmony_ci * actual size of the mapping being torn down is 12938c2ecf20Sopenharmony_ci * communicated in siginfo, see kill_proc() 12948c2ecf20Sopenharmony_ci */ 12958c2ecf20Sopenharmony_ci start = (page->index << PAGE_SHIFT) & ~(size - 1); 12968c2ecf20Sopenharmony_ci unmap_mapping_range(page->mapping, start, size, 0); 12978c2ecf20Sopenharmony_ci } 12988c2ecf20Sopenharmony_ci kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); 12998c2ecf20Sopenharmony_ci rc = 0; 13008c2ecf20Sopenharmony_ciunlock: 13018c2ecf20Sopenharmony_ci dax_unlock_page(page, cookie); 13028c2ecf20Sopenharmony_ciout: 13038c2ecf20Sopenharmony_ci /* drop pgmap ref acquired in caller */ 13048c2ecf20Sopenharmony_ci put_dev_pagemap(pgmap); 13058c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_DAX, rc ? 
MF_FAILED : MF_RECOVERED); 13068c2ecf20Sopenharmony_ci return rc; 13078c2ecf20Sopenharmony_ci} 13088c2ecf20Sopenharmony_ci 13098c2ecf20Sopenharmony_ci/** 13108c2ecf20Sopenharmony_ci * memory_failure - Handle memory failure of a page. 13118c2ecf20Sopenharmony_ci * @pfn: Page Number of the corrupted page 13128c2ecf20Sopenharmony_ci * @flags: fine tune action taken 13138c2ecf20Sopenharmony_ci * 13148c2ecf20Sopenharmony_ci * This function is called by the low level machine check code 13158c2ecf20Sopenharmony_ci * of an architecture when it detects hardware memory corruption 13168c2ecf20Sopenharmony_ci * of a page. It tries its best to recover, which includes 13178c2ecf20Sopenharmony_ci * dropping pages, killing processes etc. 13188c2ecf20Sopenharmony_ci * 13198c2ecf20Sopenharmony_ci * The function is primarily of use for corruptions that 13208c2ecf20Sopenharmony_ci * happen outside the current execution context (e.g. when 13218c2ecf20Sopenharmony_ci * detected by a background scrubber) 13228c2ecf20Sopenharmony_ci * 13238c2ecf20Sopenharmony_ci * Must run in process context (e.g. a work queue) with interrupts 13248c2ecf20Sopenharmony_ci * enabled and no spinlocks hold. 
13258c2ecf20Sopenharmony_ci */ 13268c2ecf20Sopenharmony_ciint memory_failure(unsigned long pfn, int flags) 13278c2ecf20Sopenharmony_ci{ 13288c2ecf20Sopenharmony_ci struct page *p; 13298c2ecf20Sopenharmony_ci struct page *hpage; 13308c2ecf20Sopenharmony_ci struct page *orig_head; 13318c2ecf20Sopenharmony_ci struct dev_pagemap *pgmap; 13328c2ecf20Sopenharmony_ci int res = 0; 13338c2ecf20Sopenharmony_ci unsigned long page_flags; 13348c2ecf20Sopenharmony_ci static DEFINE_MUTEX(mf_mutex); 13358c2ecf20Sopenharmony_ci 13368c2ecf20Sopenharmony_ci if (!sysctl_memory_failure_recovery) 13378c2ecf20Sopenharmony_ci panic("Memory failure on page %lx", pfn); 13388c2ecf20Sopenharmony_ci 13398c2ecf20Sopenharmony_ci p = pfn_to_online_page(pfn); 13408c2ecf20Sopenharmony_ci if (!p) { 13418c2ecf20Sopenharmony_ci if (pfn_valid(pfn)) { 13428c2ecf20Sopenharmony_ci pgmap = get_dev_pagemap(pfn, NULL); 13438c2ecf20Sopenharmony_ci if (pgmap) 13448c2ecf20Sopenharmony_ci return memory_failure_dev_pagemap(pfn, flags, 13458c2ecf20Sopenharmony_ci pgmap); 13468c2ecf20Sopenharmony_ci } 13478c2ecf20Sopenharmony_ci pr_err("Memory failure: %#lx: memory outside kernel control\n", 13488c2ecf20Sopenharmony_ci pfn); 13498c2ecf20Sopenharmony_ci return -ENXIO; 13508c2ecf20Sopenharmony_ci } 13518c2ecf20Sopenharmony_ci 13528c2ecf20Sopenharmony_ci mutex_lock(&mf_mutex); 13538c2ecf20Sopenharmony_ci 13548c2ecf20Sopenharmony_ci if (PageHuge(p)) { 13558c2ecf20Sopenharmony_ci res = memory_failure_hugetlb(pfn, flags); 13568c2ecf20Sopenharmony_ci goto unlock_mutex; 13578c2ecf20Sopenharmony_ci } 13588c2ecf20Sopenharmony_ci 13598c2ecf20Sopenharmony_ci if (TestSetPageHWPoison(p)) { 13608c2ecf20Sopenharmony_ci pr_err("Memory failure: %#lx: already hardware poisoned\n", 13618c2ecf20Sopenharmony_ci pfn); 13628c2ecf20Sopenharmony_ci res = -EHWPOISON; 13638c2ecf20Sopenharmony_ci goto unlock_mutex; 13648c2ecf20Sopenharmony_ci } 13658c2ecf20Sopenharmony_ci 13668c2ecf20Sopenharmony_ci orig_head = hpage = compound_head(p); 
13678c2ecf20Sopenharmony_ci num_poisoned_pages_inc(); 13688c2ecf20Sopenharmony_ci 13698c2ecf20Sopenharmony_ci /* 13708c2ecf20Sopenharmony_ci * We need/can do nothing about count=0 pages. 13718c2ecf20Sopenharmony_ci * 1) it's a free page, and therefore in safe hand: 13728c2ecf20Sopenharmony_ci * prep_new_page() will be the gate keeper. 13738c2ecf20Sopenharmony_ci * 2) it's part of a non-compound high order page. 13748c2ecf20Sopenharmony_ci * Implies some kernel user: cannot stop them from 13758c2ecf20Sopenharmony_ci * R/W the page; let's pray that the page has been 13768c2ecf20Sopenharmony_ci * used and will be freed some time later. 13778c2ecf20Sopenharmony_ci * In fact it's dangerous to directly bump up page count from 0, 13788c2ecf20Sopenharmony_ci * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. 13798c2ecf20Sopenharmony_ci */ 13808c2ecf20Sopenharmony_ci if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { 13818c2ecf20Sopenharmony_ci if (is_free_buddy_page(p)) { 13828c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); 13838c2ecf20Sopenharmony_ci } else { 13848c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); 13858c2ecf20Sopenharmony_ci res = -EBUSY; 13868c2ecf20Sopenharmony_ci } 13878c2ecf20Sopenharmony_ci goto unlock_mutex; 13888c2ecf20Sopenharmony_ci } 13898c2ecf20Sopenharmony_ci 13908c2ecf20Sopenharmony_ci if (PageTransHuge(hpage)) { 13918c2ecf20Sopenharmony_ci if (try_to_split_thp_page(p, "Memory Failure") < 0) { 13928c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); 13938c2ecf20Sopenharmony_ci res = -EBUSY; 13948c2ecf20Sopenharmony_ci goto unlock_mutex; 13958c2ecf20Sopenharmony_ci } 13968c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!page_count(p), p); 13978c2ecf20Sopenharmony_ci } 13988c2ecf20Sopenharmony_ci 13998c2ecf20Sopenharmony_ci /* 14008c2ecf20Sopenharmony_ci * We ignore non-LRU pages for good reasons. 
14018c2ecf20Sopenharmony_ci * - PG_locked is only well defined for LRU pages and a few others 14028c2ecf20Sopenharmony_ci * - to avoid races with __SetPageLocked() 14038c2ecf20Sopenharmony_ci * - to avoid races with __SetPageSlab*() (and more non-atomic ops) 14048c2ecf20Sopenharmony_ci * The check (unnecessarily) ignores LRU pages being isolated and 14058c2ecf20Sopenharmony_ci * walked by the page reclaim code, however that's not a big loss. 14068c2ecf20Sopenharmony_ci */ 14078c2ecf20Sopenharmony_ci shake_page(p, 0); 14088c2ecf20Sopenharmony_ci /* shake_page could have turned it free. */ 14098c2ecf20Sopenharmony_ci if (!PageLRU(p) && is_free_buddy_page(p)) { 14108c2ecf20Sopenharmony_ci if (flags & MF_COUNT_INCREASED) 14118c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); 14128c2ecf20Sopenharmony_ci else 14138c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED); 14148c2ecf20Sopenharmony_ci goto unlock_mutex; 14158c2ecf20Sopenharmony_ci } 14168c2ecf20Sopenharmony_ci 14178c2ecf20Sopenharmony_ci lock_page(p); 14188c2ecf20Sopenharmony_ci 14198c2ecf20Sopenharmony_ci /* 14208c2ecf20Sopenharmony_ci * The page could have changed compound pages during the locking. 14218c2ecf20Sopenharmony_ci * If this happens just bail out. 14228c2ecf20Sopenharmony_ci */ 14238c2ecf20Sopenharmony_ci if (PageCompound(p) && compound_head(p) != orig_head) { 14248c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); 14258c2ecf20Sopenharmony_ci res = -EBUSY; 14268c2ecf20Sopenharmony_ci goto unlock_page; 14278c2ecf20Sopenharmony_ci } 14288c2ecf20Sopenharmony_ci 14298c2ecf20Sopenharmony_ci /* 14308c2ecf20Sopenharmony_ci * We use page flags to determine what action should be taken, but 14318c2ecf20Sopenharmony_ci * the flags can be modified by the error containment action. One 14328c2ecf20Sopenharmony_ci * example is an mlocked page, where PG_mlocked is cleared by 14338c2ecf20Sopenharmony_ci * page_remove_rmap() in try_to_unmap_one(). 
So to determine page status 14348c2ecf20Sopenharmony_ci * correctly, we save a copy of the page flags at this time. 14358c2ecf20Sopenharmony_ci */ 14368c2ecf20Sopenharmony_ci page_flags = p->flags; 14378c2ecf20Sopenharmony_ci 14388c2ecf20Sopenharmony_ci /* 14398c2ecf20Sopenharmony_ci * unpoison always clear PG_hwpoison inside page lock 14408c2ecf20Sopenharmony_ci */ 14418c2ecf20Sopenharmony_ci if (!PageHWPoison(p)) { 14428c2ecf20Sopenharmony_ci pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); 14438c2ecf20Sopenharmony_ci num_poisoned_pages_dec(); 14448c2ecf20Sopenharmony_ci unlock_page(p); 14458c2ecf20Sopenharmony_ci put_page(p); 14468c2ecf20Sopenharmony_ci goto unlock_mutex; 14478c2ecf20Sopenharmony_ci } 14488c2ecf20Sopenharmony_ci if (hwpoison_filter(p)) { 14498c2ecf20Sopenharmony_ci if (TestClearPageHWPoison(p)) 14508c2ecf20Sopenharmony_ci num_poisoned_pages_dec(); 14518c2ecf20Sopenharmony_ci unlock_page(p); 14528c2ecf20Sopenharmony_ci put_page(p); 14538c2ecf20Sopenharmony_ci goto unlock_mutex; 14548c2ecf20Sopenharmony_ci } 14558c2ecf20Sopenharmony_ci 14568c2ecf20Sopenharmony_ci /* 14578c2ecf20Sopenharmony_ci * __munlock_pagevec may clear a writeback page's LRU flag without 14588c2ecf20Sopenharmony_ci * page_lock. We need wait writeback completion for this page or it 14598c2ecf20Sopenharmony_ci * may trigger vfs BUG while evict inode. 14608c2ecf20Sopenharmony_ci */ 14618c2ecf20Sopenharmony_ci if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) 14628c2ecf20Sopenharmony_ci goto identify_page_state; 14638c2ecf20Sopenharmony_ci 14648c2ecf20Sopenharmony_ci /* 14658c2ecf20Sopenharmony_ci * It's very difficult to mess with pages currently under IO 14668c2ecf20Sopenharmony_ci * and in many cases impossible, so we just avoid it here. 14678c2ecf20Sopenharmony_ci */ 14688c2ecf20Sopenharmony_ci wait_on_page_writeback(p); 14698c2ecf20Sopenharmony_ci 14708c2ecf20Sopenharmony_ci /* 14718c2ecf20Sopenharmony_ci * Now take care of user space mappings. 
14728c2ecf20Sopenharmony_ci * Abort on fail: __delete_from_page_cache() assumes unmapped page. 14738c2ecf20Sopenharmony_ci */ 14748c2ecf20Sopenharmony_ci if (!hwpoison_user_mappings(p, pfn, flags, &p)) { 14758c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); 14768c2ecf20Sopenharmony_ci res = -EBUSY; 14778c2ecf20Sopenharmony_ci goto unlock_page; 14788c2ecf20Sopenharmony_ci } 14798c2ecf20Sopenharmony_ci 14808c2ecf20Sopenharmony_ci /* 14818c2ecf20Sopenharmony_ci * Torn down by someone else? 14828c2ecf20Sopenharmony_ci */ 14838c2ecf20Sopenharmony_ci if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 14848c2ecf20Sopenharmony_ci action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); 14858c2ecf20Sopenharmony_ci res = -EBUSY; 14868c2ecf20Sopenharmony_ci goto unlock_page; 14878c2ecf20Sopenharmony_ci } 14888c2ecf20Sopenharmony_ci 14898c2ecf20Sopenharmony_ciidentify_page_state: 14908c2ecf20Sopenharmony_ci res = identify_page_state(pfn, p, page_flags); 14918c2ecf20Sopenharmony_ciunlock_page: 14928c2ecf20Sopenharmony_ci unlock_page(p); 14938c2ecf20Sopenharmony_ciunlock_mutex: 14948c2ecf20Sopenharmony_ci mutex_unlock(&mf_mutex); 14958c2ecf20Sopenharmony_ci return res; 14968c2ecf20Sopenharmony_ci} 14978c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(memory_failure); 14988c2ecf20Sopenharmony_ci 14998c2ecf20Sopenharmony_ci#define MEMORY_FAILURE_FIFO_ORDER 4 15008c2ecf20Sopenharmony_ci#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) 15018c2ecf20Sopenharmony_ci 15028c2ecf20Sopenharmony_cistruct memory_failure_entry { 15038c2ecf20Sopenharmony_ci unsigned long pfn; 15048c2ecf20Sopenharmony_ci int flags; 15058c2ecf20Sopenharmony_ci}; 15068c2ecf20Sopenharmony_ci 15078c2ecf20Sopenharmony_cistruct memory_failure_cpu { 15088c2ecf20Sopenharmony_ci DECLARE_KFIFO(fifo, struct memory_failure_entry, 15098c2ecf20Sopenharmony_ci MEMORY_FAILURE_FIFO_SIZE); 15108c2ecf20Sopenharmony_ci spinlock_t lock; 15118c2ecf20Sopenharmony_ci struct work_struct work; 
15128c2ecf20Sopenharmony_ci}; 15138c2ecf20Sopenharmony_ci 15148c2ecf20Sopenharmony_cistatic DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); 15158c2ecf20Sopenharmony_ci 15168c2ecf20Sopenharmony_ci/** 15178c2ecf20Sopenharmony_ci * memory_failure_queue - Schedule handling memory failure of a page. 15188c2ecf20Sopenharmony_ci * @pfn: Page Number of the corrupted page 15198c2ecf20Sopenharmony_ci * @flags: Flags for memory failure handling 15208c2ecf20Sopenharmony_ci * 15218c2ecf20Sopenharmony_ci * This function is called by the low level hardware error handler 15228c2ecf20Sopenharmony_ci * when it detects hardware memory corruption of a page. It schedules 15238c2ecf20Sopenharmony_ci * the recovering of error page, including dropping pages, killing 15248c2ecf20Sopenharmony_ci * processes etc. 15258c2ecf20Sopenharmony_ci * 15268c2ecf20Sopenharmony_ci * The function is primarily of use for corruptions that 15278c2ecf20Sopenharmony_ci * happen outside the current execution context (e.g. when 15288c2ecf20Sopenharmony_ci * detected by a background scrubber) 15298c2ecf20Sopenharmony_ci * 15308c2ecf20Sopenharmony_ci * Can run in IRQ context. 
15318c2ecf20Sopenharmony_ci */ 15328c2ecf20Sopenharmony_civoid memory_failure_queue(unsigned long pfn, int flags) 15338c2ecf20Sopenharmony_ci{ 15348c2ecf20Sopenharmony_ci struct memory_failure_cpu *mf_cpu; 15358c2ecf20Sopenharmony_ci unsigned long proc_flags; 15368c2ecf20Sopenharmony_ci struct memory_failure_entry entry = { 15378c2ecf20Sopenharmony_ci .pfn = pfn, 15388c2ecf20Sopenharmony_ci .flags = flags, 15398c2ecf20Sopenharmony_ci }; 15408c2ecf20Sopenharmony_ci 15418c2ecf20Sopenharmony_ci mf_cpu = &get_cpu_var(memory_failure_cpu); 15428c2ecf20Sopenharmony_ci spin_lock_irqsave(&mf_cpu->lock, proc_flags); 15438c2ecf20Sopenharmony_ci if (kfifo_put(&mf_cpu->fifo, entry)) 15448c2ecf20Sopenharmony_ci schedule_work_on(smp_processor_id(), &mf_cpu->work); 15458c2ecf20Sopenharmony_ci else 15468c2ecf20Sopenharmony_ci pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", 15478c2ecf20Sopenharmony_ci pfn); 15488c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 15498c2ecf20Sopenharmony_ci put_cpu_var(memory_failure_cpu); 15508c2ecf20Sopenharmony_ci} 15518c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(memory_failure_queue); 15528c2ecf20Sopenharmony_ci 15538c2ecf20Sopenharmony_cistatic void memory_failure_work_func(struct work_struct *work) 15548c2ecf20Sopenharmony_ci{ 15558c2ecf20Sopenharmony_ci struct memory_failure_cpu *mf_cpu; 15568c2ecf20Sopenharmony_ci struct memory_failure_entry entry = { 0, }; 15578c2ecf20Sopenharmony_ci unsigned long proc_flags; 15588c2ecf20Sopenharmony_ci int gotten; 15598c2ecf20Sopenharmony_ci 15608c2ecf20Sopenharmony_ci mf_cpu = container_of(work, struct memory_failure_cpu, work); 15618c2ecf20Sopenharmony_ci for (;;) { 15628c2ecf20Sopenharmony_ci spin_lock_irqsave(&mf_cpu->lock, proc_flags); 15638c2ecf20Sopenharmony_ci gotten = kfifo_get(&mf_cpu->fifo, &entry); 15648c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 15658c2ecf20Sopenharmony_ci if (!gotten) 15668c2ecf20Sopenharmony_ci 
break; 15678c2ecf20Sopenharmony_ci if (entry.flags & MF_SOFT_OFFLINE) 15688c2ecf20Sopenharmony_ci soft_offline_page(entry.pfn, entry.flags); 15698c2ecf20Sopenharmony_ci else 15708c2ecf20Sopenharmony_ci memory_failure(entry.pfn, entry.flags); 15718c2ecf20Sopenharmony_ci } 15728c2ecf20Sopenharmony_ci} 15738c2ecf20Sopenharmony_ci 15748c2ecf20Sopenharmony_ci/* 15758c2ecf20Sopenharmony_ci * Process memory_failure work queued on the specified CPU. 15768c2ecf20Sopenharmony_ci * Used to avoid return-to-userspace racing with the memory_failure workqueue. 15778c2ecf20Sopenharmony_ci */ 15788c2ecf20Sopenharmony_civoid memory_failure_queue_kick(int cpu) 15798c2ecf20Sopenharmony_ci{ 15808c2ecf20Sopenharmony_ci struct memory_failure_cpu *mf_cpu; 15818c2ecf20Sopenharmony_ci 15828c2ecf20Sopenharmony_ci mf_cpu = &per_cpu(memory_failure_cpu, cpu); 15838c2ecf20Sopenharmony_ci cancel_work_sync(&mf_cpu->work); 15848c2ecf20Sopenharmony_ci memory_failure_work_func(&mf_cpu->work); 15858c2ecf20Sopenharmony_ci} 15868c2ecf20Sopenharmony_ci 15878c2ecf20Sopenharmony_cistatic int __init memory_failure_init(void) 15888c2ecf20Sopenharmony_ci{ 15898c2ecf20Sopenharmony_ci struct memory_failure_cpu *mf_cpu; 15908c2ecf20Sopenharmony_ci int cpu; 15918c2ecf20Sopenharmony_ci 15928c2ecf20Sopenharmony_ci for_each_possible_cpu(cpu) { 15938c2ecf20Sopenharmony_ci mf_cpu = &per_cpu(memory_failure_cpu, cpu); 15948c2ecf20Sopenharmony_ci spin_lock_init(&mf_cpu->lock); 15958c2ecf20Sopenharmony_ci INIT_KFIFO(mf_cpu->fifo); 15968c2ecf20Sopenharmony_ci INIT_WORK(&mf_cpu->work, memory_failure_work_func); 15978c2ecf20Sopenharmony_ci } 15988c2ecf20Sopenharmony_ci 15998c2ecf20Sopenharmony_ci return 0; 16008c2ecf20Sopenharmony_ci} 16018c2ecf20Sopenharmony_cicore_initcall(memory_failure_init); 16028c2ecf20Sopenharmony_ci 16038c2ecf20Sopenharmony_ci#define unpoison_pr_info(fmt, pfn, rs) \ 16048c2ecf20Sopenharmony_ci({ \ 16058c2ecf20Sopenharmony_ci if (__ratelimit(rs)) \ 16068c2ecf20Sopenharmony_ci pr_info(fmt, pfn); \ 
16078c2ecf20Sopenharmony_ci}) 16088c2ecf20Sopenharmony_ci 16098c2ecf20Sopenharmony_ci/** 16108c2ecf20Sopenharmony_ci * unpoison_memory - Unpoison a previously poisoned page 16118c2ecf20Sopenharmony_ci * @pfn: Page number of the to be unpoisoned page 16128c2ecf20Sopenharmony_ci * 16138c2ecf20Sopenharmony_ci * Software-unpoison a page that has been poisoned by 16148c2ecf20Sopenharmony_ci * memory_failure() earlier. 16158c2ecf20Sopenharmony_ci * 16168c2ecf20Sopenharmony_ci * This is only done on the software-level, so it only works 16178c2ecf20Sopenharmony_ci * for linux injected failures, not real hardware failures 16188c2ecf20Sopenharmony_ci * 16198c2ecf20Sopenharmony_ci * Returns 0 for success, otherwise -errno. 16208c2ecf20Sopenharmony_ci */ 16218c2ecf20Sopenharmony_ciint unpoison_memory(unsigned long pfn) 16228c2ecf20Sopenharmony_ci{ 16238c2ecf20Sopenharmony_ci struct page *page; 16248c2ecf20Sopenharmony_ci struct page *p; 16258c2ecf20Sopenharmony_ci int freeit = 0; 16268c2ecf20Sopenharmony_ci static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, 16278c2ecf20Sopenharmony_ci DEFAULT_RATELIMIT_BURST); 16288c2ecf20Sopenharmony_ci 16298c2ecf20Sopenharmony_ci if (!pfn_valid(pfn)) 16308c2ecf20Sopenharmony_ci return -ENXIO; 16318c2ecf20Sopenharmony_ci 16328c2ecf20Sopenharmony_ci p = pfn_to_page(pfn); 16338c2ecf20Sopenharmony_ci page = compound_head(p); 16348c2ecf20Sopenharmony_ci 16358c2ecf20Sopenharmony_ci if (!PageHWPoison(p)) { 16368c2ecf20Sopenharmony_ci unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", 16378c2ecf20Sopenharmony_ci pfn, &unpoison_rs); 16388c2ecf20Sopenharmony_ci return 0; 16398c2ecf20Sopenharmony_ci } 16408c2ecf20Sopenharmony_ci 16418c2ecf20Sopenharmony_ci if (page_count(page) > 1) { 16428c2ecf20Sopenharmony_ci unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", 16438c2ecf20Sopenharmony_ci pfn, &unpoison_rs); 16448c2ecf20Sopenharmony_ci return 0; 16458c2ecf20Sopenharmony_ci } 
16468c2ecf20Sopenharmony_ci 16478c2ecf20Sopenharmony_ci if (page_mapped(page)) { 16488c2ecf20Sopenharmony_ci unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", 16498c2ecf20Sopenharmony_ci pfn, &unpoison_rs); 16508c2ecf20Sopenharmony_ci return 0; 16518c2ecf20Sopenharmony_ci } 16528c2ecf20Sopenharmony_ci 16538c2ecf20Sopenharmony_ci if (page_mapping(page)) { 16548c2ecf20Sopenharmony_ci unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", 16558c2ecf20Sopenharmony_ci pfn, &unpoison_rs); 16568c2ecf20Sopenharmony_ci return 0; 16578c2ecf20Sopenharmony_ci } 16588c2ecf20Sopenharmony_ci 16598c2ecf20Sopenharmony_ci /* 16608c2ecf20Sopenharmony_ci * unpoison_memory() can encounter thp only when the thp is being 16618c2ecf20Sopenharmony_ci * worked by memory_failure() and the page lock is not held yet. 16628c2ecf20Sopenharmony_ci * In such case, we yield to memory_failure() and make unpoison fail. 16638c2ecf20Sopenharmony_ci */ 16648c2ecf20Sopenharmony_ci if (!PageHuge(page) && PageTransHuge(page)) { 16658c2ecf20Sopenharmony_ci unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", 16668c2ecf20Sopenharmony_ci pfn, &unpoison_rs); 16678c2ecf20Sopenharmony_ci return 0; 16688c2ecf20Sopenharmony_ci } 16698c2ecf20Sopenharmony_ci 16708c2ecf20Sopenharmony_ci if (!get_hwpoison_page(p)) { 16718c2ecf20Sopenharmony_ci if (TestClearPageHWPoison(p)) 16728c2ecf20Sopenharmony_ci num_poisoned_pages_dec(); 16738c2ecf20Sopenharmony_ci unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", 16748c2ecf20Sopenharmony_ci pfn, &unpoison_rs); 16758c2ecf20Sopenharmony_ci return 0; 16768c2ecf20Sopenharmony_ci } 16778c2ecf20Sopenharmony_ci 16788c2ecf20Sopenharmony_ci lock_page(page); 16798c2ecf20Sopenharmony_ci /* 16808c2ecf20Sopenharmony_ci * This test is racy because PG_hwpoison is set outside of page lock. 16818c2ecf20Sopenharmony_ci * That's acceptable because that won't trigger kernel panic. 
Instead, 16828c2ecf20Sopenharmony_ci * the PG_hwpoison page will be caught and isolated on the entrance to 16838c2ecf20Sopenharmony_ci * the free buddy page pool. 16848c2ecf20Sopenharmony_ci */ 16858c2ecf20Sopenharmony_ci if (TestClearPageHWPoison(page)) { 16868c2ecf20Sopenharmony_ci unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", 16878c2ecf20Sopenharmony_ci pfn, &unpoison_rs); 16888c2ecf20Sopenharmony_ci num_poisoned_pages_dec(); 16898c2ecf20Sopenharmony_ci freeit = 1; 16908c2ecf20Sopenharmony_ci } 16918c2ecf20Sopenharmony_ci unlock_page(page); 16928c2ecf20Sopenharmony_ci 16938c2ecf20Sopenharmony_ci put_page(page); 16948c2ecf20Sopenharmony_ci if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) 16958c2ecf20Sopenharmony_ci put_page(page); 16968c2ecf20Sopenharmony_ci 16978c2ecf20Sopenharmony_ci return 0; 16988c2ecf20Sopenharmony_ci} 16998c2ecf20Sopenharmony_ciEXPORT_SYMBOL(unpoison_memory); 17008c2ecf20Sopenharmony_ci 17018c2ecf20Sopenharmony_ci/* 17028c2ecf20Sopenharmony_ci * Safely get reference count of an arbitrary page. 17038c2ecf20Sopenharmony_ci * Returns 0 for a free page, 1 for an in-use page, -EIO for a page-type we 17048c2ecf20Sopenharmony_ci * cannot handle and -EBUSY if we raced with an allocation. 17058c2ecf20Sopenharmony_ci * We only incremented refcount in case the page was already in-use and it is 17068c2ecf20Sopenharmony_ci * a known type we can handle. 
17078c2ecf20Sopenharmony_ci */ 17088c2ecf20Sopenharmony_cistatic int get_any_page(struct page *p, int flags) 17098c2ecf20Sopenharmony_ci{ 17108c2ecf20Sopenharmony_ci int ret = 0, pass = 0; 17118c2ecf20Sopenharmony_ci bool count_increased = false; 17128c2ecf20Sopenharmony_ci 17138c2ecf20Sopenharmony_ci if (flags & MF_COUNT_INCREASED) 17148c2ecf20Sopenharmony_ci count_increased = true; 17158c2ecf20Sopenharmony_ci 17168c2ecf20Sopenharmony_citry_again: 17178c2ecf20Sopenharmony_ci if (!count_increased && !get_hwpoison_page(p)) { 17188c2ecf20Sopenharmony_ci if (page_count(p)) { 17198c2ecf20Sopenharmony_ci /* We raced with an allocation, retry. */ 17208c2ecf20Sopenharmony_ci if (pass++ < 3) 17218c2ecf20Sopenharmony_ci goto try_again; 17228c2ecf20Sopenharmony_ci ret = -EBUSY; 17238c2ecf20Sopenharmony_ci } else if (!PageHuge(p) && !is_free_buddy_page(p)) { 17248c2ecf20Sopenharmony_ci /* We raced with put_page, retry. */ 17258c2ecf20Sopenharmony_ci if (pass++ < 3) 17268c2ecf20Sopenharmony_ci goto try_again; 17278c2ecf20Sopenharmony_ci ret = -EIO; 17288c2ecf20Sopenharmony_ci } 17298c2ecf20Sopenharmony_ci } else { 17308c2ecf20Sopenharmony_ci if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) { 17318c2ecf20Sopenharmony_ci ret = 1; 17328c2ecf20Sopenharmony_ci } else { 17338c2ecf20Sopenharmony_ci /* 17348c2ecf20Sopenharmony_ci * A page we cannot handle. Check whether we can turn 17358c2ecf20Sopenharmony_ci * it into something we can handle. 
 */
			if (pass++ < 3) {
				/* Drop our ref and let shake_page() try to free/settle it. */
				put_page(p);
				shake_page(p, 1);
				count_increased = false;
				goto try_again;
			}
			put_page(p);
			ret = -EIO;
		}
	}

	return ret;
}

/*
 * Isolate @page so it can be handed to migrate_pages(): take it off the
 * LRU list, or use the movable-page path for non-LRU movable pages.
 * Hugetlb pages go through isolate_hugetlb(), which is passed @pagelist
 * directly (note the non-huge paths do the list_add() themselves).
 *
 * Returns true if the page was isolated and queued on @pagelist.
 * In all cases the reference the caller got from get_any_page() is
 * dropped here — see the comment above put_page() below.
 */
static bool isolate_page(struct page *page, struct list_head *pagelist)
{
	bool isolated = false;
	bool lru = PageLRU(page);

	if (PageHuge(page)) {
		isolated = !isolate_hugetlb(page, pagelist);
	} else {
		if (lru)
			isolated = !isolate_lru_page(page);
		else
			isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);

		if (isolated)
			list_add(&page->lru, pagelist);
	}

	/* Keep the per-node isolated-page counters in sync for LRU pages. */
	if (isolated && lru)
		inc_node_page_state(page, NR_ISOLATED_ANON +
				    page_is_file_lru(page));

	/*
	 * If we succeed to isolate the page, we grabbed another refcount on
	 * the page, so we can safely drop the one we got from get_any_pages().
	 * If we failed to isolate the page, it means that we cannot go further
	 * and we will return an error, so drop the reference we got from
	 * get_any_pages() as well.
	 */
	put_page(page);
	return isolated;
}

/*
 * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
 * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
 * If the page is mapped, it migrates the contents over.
 *
 * Returns 0 on success (page invalidated or migrated and marked poisoned),
 * -EBUSY if the page could not be isolated/poisoned, or the (negated)
 * migrate_pages() error otherwise.
 */
static int __soft_offline_page(struct page *page)
{
	int ret = 0;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	/* Indexed by 'huge' below: used only for log messages. */
	char const *msg_page[] = {"page", "hugepage"};
	bool huge = PageHuge(page);
	LIST_HEAD(pagelist);
	struct migration_target_control mtc = {
		.nid = NUMA_NO_NODE,
		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
	};

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	if (!PageHuge(page))
		wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return 0;
	}

	if (!PageHuge(page))
		/*
		 * Try to invalidate first. This should work for
		 * non dirty unmapped page cache pages.
		 */
		ret = invalidate_inode_page(page);
	unlock_page(page);

	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret) {
		/* Invalidation succeeded — no migration needed. */
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		page_handle_poison(page, false, true);
		return 0;
	}

	/* Isolate the compound head; isolate_page() drops our reference. */
	if (isolate_page(hpage, &pagelist)) {
		ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (!ret) {
			bool release = !huge;

			if (!page_handle_poison(page, huge, release))
				ret = -EBUSY;
		} else {
			/* Migration failed: put the page back where it came from. */
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
				pfn, msg_page[huge], ret, page->flags, &page->flags);
			/* migrate_pages() > 0 means "pages left unmigrated". */
			if (ret > 0)
				ret = -EBUSY;
		}
	} else {
		pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
			pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
		ret = -EBUSY;
	}
	return ret;
}

/*
 * Prepare an in-use page for soft offline: a transparent huge page must be
 * split into base pages first (hugetlb pages are offlined as a whole).
 * Returns -EBUSY if the THP split fails, otherwise the result of
 * __soft_offline_page().
 */
static int soft_offline_in_use_page(struct page *page)
{
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage))
		if (try_to_split_thp_page(page, "soft offline") < 0)
			return -EBUSY;
	return __soft_offline_page(page);
}

/* Drop the extra reference taken by MF_COUNT_INCREASED callers, if any. */
static void put_ref_page(struct page *page)
{
	if (page)
		put_page(page);
}

/**
 * soft_offline_page - Soft offline a page.
 * @pfn: pfn to soft-offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
18978c2ecf20Sopenharmony_ci */ 18988c2ecf20Sopenharmony_ciint soft_offline_page(unsigned long pfn, int flags) 18998c2ecf20Sopenharmony_ci{ 19008c2ecf20Sopenharmony_ci int ret; 19018c2ecf20Sopenharmony_ci bool try_again = true; 19028c2ecf20Sopenharmony_ci struct page *page, *ref_page = NULL; 19038c2ecf20Sopenharmony_ci 19048c2ecf20Sopenharmony_ci WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED)); 19058c2ecf20Sopenharmony_ci 19068c2ecf20Sopenharmony_ci if (!pfn_valid(pfn)) 19078c2ecf20Sopenharmony_ci return -ENXIO; 19088c2ecf20Sopenharmony_ci if (flags & MF_COUNT_INCREASED) 19098c2ecf20Sopenharmony_ci ref_page = pfn_to_page(pfn); 19108c2ecf20Sopenharmony_ci 19118c2ecf20Sopenharmony_ci /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ 19128c2ecf20Sopenharmony_ci page = pfn_to_online_page(pfn); 19138c2ecf20Sopenharmony_ci if (!page) { 19148c2ecf20Sopenharmony_ci put_ref_page(ref_page); 19158c2ecf20Sopenharmony_ci return -EIO; 19168c2ecf20Sopenharmony_ci } 19178c2ecf20Sopenharmony_ci 19188c2ecf20Sopenharmony_ci if (PageHWPoison(page)) { 19198c2ecf20Sopenharmony_ci pr_info("%s: %#lx page already poisoned\n", __func__, pfn); 19208c2ecf20Sopenharmony_ci put_ref_page(ref_page); 19218c2ecf20Sopenharmony_ci return 0; 19228c2ecf20Sopenharmony_ci } 19238c2ecf20Sopenharmony_ci 19248c2ecf20Sopenharmony_ciretry: 19258c2ecf20Sopenharmony_ci get_online_mems(); 19268c2ecf20Sopenharmony_ci ret = get_any_page(page, flags); 19278c2ecf20Sopenharmony_ci put_online_mems(); 19288c2ecf20Sopenharmony_ci 19298c2ecf20Sopenharmony_ci if (ret > 0) { 19308c2ecf20Sopenharmony_ci ret = soft_offline_in_use_page(page); 19318c2ecf20Sopenharmony_ci } else if (ret == 0) { 19328c2ecf20Sopenharmony_ci if (!page_handle_poison(page, true, false)) { 19338c2ecf20Sopenharmony_ci if (try_again) { 19348c2ecf20Sopenharmony_ci try_again = false; 19358c2ecf20Sopenharmony_ci flags &= ~MF_COUNT_INCREASED; 19368c2ecf20Sopenharmony_ci goto retry; 19378c2ecf20Sopenharmony_ci } 
19388c2ecf20Sopenharmony_ci ret = -EBUSY; 19398c2ecf20Sopenharmony_ci } 19408c2ecf20Sopenharmony_ci } else if (ret == -EIO) { 19418c2ecf20Sopenharmony_ci pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n", 19428c2ecf20Sopenharmony_ci __func__, pfn, page->flags, &page->flags); 19438c2ecf20Sopenharmony_ci } 19448c2ecf20Sopenharmony_ci 19458c2ecf20Sopenharmony_ci return ret; 19468c2ecf20Sopenharmony_ci} 1947