// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/oom_kill.c
 *
 * Copyright (C) 1998,2000 Rik van Riel
 * Thanks go out to Claus Fischer for some serious inspiration and
 * for goading me into coding this file...
 * Copyright (C) 2010 Google, Inc.
 * Rewritten by David Rientjes
 *
 * The routines in this file are used to kill a process when
 * we're seriously out of memory. This gets called from __alloc_pages()
 * in mm/page_alloc.c when we really run out of memory.
 *
 * Since we won't call these routines often (on a well-configured
 * machine) this file will double as a 'coding guide' and a signpost
 * for newbie kernel hackers. It features several pointers to major
 * kernel subsystems and hints as to where to find out what things do.
1962306a36Sopenharmony_ci */ 2062306a36Sopenharmony_ci 2162306a36Sopenharmony_ci#include <linux/oom.h> 2262306a36Sopenharmony_ci#include <linux/mm.h> 2362306a36Sopenharmony_ci#include <linux/err.h> 2462306a36Sopenharmony_ci#include <linux/gfp.h> 2562306a36Sopenharmony_ci#include <linux/sched.h> 2662306a36Sopenharmony_ci#include <linux/sched/mm.h> 2762306a36Sopenharmony_ci#include <linux/sched/coredump.h> 2862306a36Sopenharmony_ci#include <linux/sched/task.h> 2962306a36Sopenharmony_ci#include <linux/sched/debug.h> 3062306a36Sopenharmony_ci#include <linux/swap.h> 3162306a36Sopenharmony_ci#include <linux/syscalls.h> 3262306a36Sopenharmony_ci#include <linux/timex.h> 3362306a36Sopenharmony_ci#include <linux/jiffies.h> 3462306a36Sopenharmony_ci#include <linux/cpuset.h> 3562306a36Sopenharmony_ci#include <linux/export.h> 3662306a36Sopenharmony_ci#include <linux/notifier.h> 3762306a36Sopenharmony_ci#include <linux/memcontrol.h> 3862306a36Sopenharmony_ci#include <linux/mempolicy.h> 3962306a36Sopenharmony_ci#include <linux/security.h> 4062306a36Sopenharmony_ci#include <linux/ptrace.h> 4162306a36Sopenharmony_ci#include <linux/freezer.h> 4262306a36Sopenharmony_ci#include <linux/ftrace.h> 4362306a36Sopenharmony_ci#include <linux/ratelimit.h> 4462306a36Sopenharmony_ci#include <linux/kthread.h> 4562306a36Sopenharmony_ci#include <linux/init.h> 4662306a36Sopenharmony_ci#include <linux/mmu_notifier.h> 4762306a36Sopenharmony_ci 4862306a36Sopenharmony_ci#include <asm/tlb.h> 4962306a36Sopenharmony_ci#include "internal.h" 5062306a36Sopenharmony_ci#include "slab.h" 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_ci#define CREATE_TRACE_POINTS 5362306a36Sopenharmony_ci#include <trace/events/oom.h> 5462306a36Sopenharmony_ci 5562306a36Sopenharmony_cistatic int sysctl_panic_on_oom; 5662306a36Sopenharmony_cistatic int sysctl_oom_kill_allocating_task; 5762306a36Sopenharmony_cistatic int sysctl_oom_dump_tasks = 1; 5862306a36Sopenharmony_ci 5962306a36Sopenharmony_ci/* 6062306a36Sopenharmony_ci * 
Serializes oom killer invocations (out_of_memory()) from all contexts to 6162306a36Sopenharmony_ci * prevent from over eager oom killing (e.g. when the oom killer is invoked 6262306a36Sopenharmony_ci * from different domains). 6362306a36Sopenharmony_ci * 6462306a36Sopenharmony_ci * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled 6562306a36Sopenharmony_ci * and mark_oom_victim 6662306a36Sopenharmony_ci */ 6762306a36Sopenharmony_ciDEFINE_MUTEX(oom_lock); 6862306a36Sopenharmony_ci/* Serializes oom_score_adj and oom_score_adj_min updates */ 6962306a36Sopenharmony_ciDEFINE_MUTEX(oom_adj_mutex); 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_cistatic inline bool is_memcg_oom(struct oom_control *oc) 7262306a36Sopenharmony_ci{ 7362306a36Sopenharmony_ci return oc->memcg != NULL; 7462306a36Sopenharmony_ci} 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_ci#ifdef CONFIG_NUMA 7762306a36Sopenharmony_ci/** 7862306a36Sopenharmony_ci * oom_cpuset_eligible() - check task eligibility for kill 7962306a36Sopenharmony_ci * @start: task struct of which task to consider 8062306a36Sopenharmony_ci * @oc: pointer to struct oom_control 8162306a36Sopenharmony_ci * 8262306a36Sopenharmony_ci * Task eligibility is determined by whether or not a candidate task, @tsk, 8362306a36Sopenharmony_ci * shares the same mempolicy nodes as current if it is bound by such a policy 8462306a36Sopenharmony_ci * and whether or not it has the same set of allowed cpuset nodes. 8562306a36Sopenharmony_ci * 8662306a36Sopenharmony_ci * This function is assuming oom-killer context and 'current' has triggered 8762306a36Sopenharmony_ci * the oom-killer. 
8862306a36Sopenharmony_ci */ 8962306a36Sopenharmony_cistatic bool oom_cpuset_eligible(struct task_struct *start, 9062306a36Sopenharmony_ci struct oom_control *oc) 9162306a36Sopenharmony_ci{ 9262306a36Sopenharmony_ci struct task_struct *tsk; 9362306a36Sopenharmony_ci bool ret = false; 9462306a36Sopenharmony_ci const nodemask_t *mask = oc->nodemask; 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_ci rcu_read_lock(); 9762306a36Sopenharmony_ci for_each_thread(start, tsk) { 9862306a36Sopenharmony_ci if (mask) { 9962306a36Sopenharmony_ci /* 10062306a36Sopenharmony_ci * If this is a mempolicy constrained oom, tsk's 10162306a36Sopenharmony_ci * cpuset is irrelevant. Only return true if its 10262306a36Sopenharmony_ci * mempolicy intersects current, otherwise it may be 10362306a36Sopenharmony_ci * needlessly killed. 10462306a36Sopenharmony_ci */ 10562306a36Sopenharmony_ci ret = mempolicy_in_oom_domain(tsk, mask); 10662306a36Sopenharmony_ci } else { 10762306a36Sopenharmony_ci /* 10862306a36Sopenharmony_ci * This is not a mempolicy constrained oom, so only 10962306a36Sopenharmony_ci * check the mems of tsk's cpuset. 
11062306a36Sopenharmony_ci */ 11162306a36Sopenharmony_ci ret = cpuset_mems_allowed_intersects(current, tsk); 11262306a36Sopenharmony_ci } 11362306a36Sopenharmony_ci if (ret) 11462306a36Sopenharmony_ci break; 11562306a36Sopenharmony_ci } 11662306a36Sopenharmony_ci rcu_read_unlock(); 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci return ret; 11962306a36Sopenharmony_ci} 12062306a36Sopenharmony_ci#else 12162306a36Sopenharmony_cistatic bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc) 12262306a36Sopenharmony_ci{ 12362306a36Sopenharmony_ci return true; 12462306a36Sopenharmony_ci} 12562306a36Sopenharmony_ci#endif /* CONFIG_NUMA */ 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci/* 12862306a36Sopenharmony_ci * The process p may have detached its own ->mm while exiting or through 12962306a36Sopenharmony_ci * kthread_use_mm(), but one or more of its subthreads may still have a valid 13062306a36Sopenharmony_ci * pointer. Return p, or any of its subthreads with a valid ->mm, with 13162306a36Sopenharmony_ci * task_lock() held. 
13262306a36Sopenharmony_ci */ 13362306a36Sopenharmony_cistruct task_struct *find_lock_task_mm(struct task_struct *p) 13462306a36Sopenharmony_ci{ 13562306a36Sopenharmony_ci struct task_struct *t; 13662306a36Sopenharmony_ci 13762306a36Sopenharmony_ci rcu_read_lock(); 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci for_each_thread(p, t) { 14062306a36Sopenharmony_ci task_lock(t); 14162306a36Sopenharmony_ci if (likely(t->mm)) 14262306a36Sopenharmony_ci goto found; 14362306a36Sopenharmony_ci task_unlock(t); 14462306a36Sopenharmony_ci } 14562306a36Sopenharmony_ci t = NULL; 14662306a36Sopenharmony_cifound: 14762306a36Sopenharmony_ci rcu_read_unlock(); 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci return t; 15062306a36Sopenharmony_ci} 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci/* 15362306a36Sopenharmony_ci * order == -1 means the oom kill is required by sysrq, otherwise only 15462306a36Sopenharmony_ci * for display purposes. 15562306a36Sopenharmony_ci */ 15662306a36Sopenharmony_cistatic inline bool is_sysrq_oom(struct oom_control *oc) 15762306a36Sopenharmony_ci{ 15862306a36Sopenharmony_ci return oc->order == -1; 15962306a36Sopenharmony_ci} 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci/* return true if the task is not adequate as candidate victim task. */ 16262306a36Sopenharmony_cistatic bool oom_unkillable_task(struct task_struct *p) 16362306a36Sopenharmony_ci{ 16462306a36Sopenharmony_ci if (is_global_init(p)) 16562306a36Sopenharmony_ci return true; 16662306a36Sopenharmony_ci if (p->flags & PF_KTHREAD) 16762306a36Sopenharmony_ci return true; 16862306a36Sopenharmony_ci return false; 16962306a36Sopenharmony_ci} 17062306a36Sopenharmony_ci 17162306a36Sopenharmony_ci/* 17262306a36Sopenharmony_ci * Check whether unreclaimable slab amount is greater than 17362306a36Sopenharmony_ci * all user memory(LRU pages). 
17462306a36Sopenharmony_ci * dump_unreclaimable_slab() could help in the case that 17562306a36Sopenharmony_ci * oom due to too much unreclaimable slab used by kernel. 17662306a36Sopenharmony_ci*/ 17762306a36Sopenharmony_cistatic bool should_dump_unreclaim_slab(void) 17862306a36Sopenharmony_ci{ 17962306a36Sopenharmony_ci unsigned long nr_lru; 18062306a36Sopenharmony_ci 18162306a36Sopenharmony_ci nr_lru = global_node_page_state(NR_ACTIVE_ANON) + 18262306a36Sopenharmony_ci global_node_page_state(NR_INACTIVE_ANON) + 18362306a36Sopenharmony_ci global_node_page_state(NR_ACTIVE_FILE) + 18462306a36Sopenharmony_ci global_node_page_state(NR_INACTIVE_FILE) + 18562306a36Sopenharmony_ci global_node_page_state(NR_ISOLATED_ANON) + 18662306a36Sopenharmony_ci global_node_page_state(NR_ISOLATED_FILE) + 18762306a36Sopenharmony_ci global_node_page_state(NR_UNEVICTABLE); 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); 19062306a36Sopenharmony_ci} 19162306a36Sopenharmony_ci 19262306a36Sopenharmony_ci/** 19362306a36Sopenharmony_ci * oom_badness - heuristic function to determine which candidate task to kill 19462306a36Sopenharmony_ci * @p: task struct of which task we should calculate 19562306a36Sopenharmony_ci * @totalpages: total present RAM allowed for page allocation 19662306a36Sopenharmony_ci * 19762306a36Sopenharmony_ci * The heuristic for determining which task to kill is made to be as simple and 19862306a36Sopenharmony_ci * predictable as possible. The goal is to return the highest value for the 19962306a36Sopenharmony_ci * task consuming the most memory to avoid subsequent oom failures. 
20062306a36Sopenharmony_ci */ 20162306a36Sopenharmony_cilong oom_badness(struct task_struct *p, unsigned long totalpages) 20262306a36Sopenharmony_ci{ 20362306a36Sopenharmony_ci long points; 20462306a36Sopenharmony_ci long adj; 20562306a36Sopenharmony_ci 20662306a36Sopenharmony_ci if (oom_unkillable_task(p)) 20762306a36Sopenharmony_ci return LONG_MIN; 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ci p = find_lock_task_mm(p); 21062306a36Sopenharmony_ci if (!p) 21162306a36Sopenharmony_ci return LONG_MIN; 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci /* 21462306a36Sopenharmony_ci * Do not even consider tasks which are explicitly marked oom 21562306a36Sopenharmony_ci * unkillable or have been already oom reaped or the are in 21662306a36Sopenharmony_ci * the middle of vfork 21762306a36Sopenharmony_ci */ 21862306a36Sopenharmony_ci adj = (long)p->signal->oom_score_adj; 21962306a36Sopenharmony_ci if (adj == OOM_SCORE_ADJ_MIN || 22062306a36Sopenharmony_ci test_bit(MMF_OOM_SKIP, &p->mm->flags) || 22162306a36Sopenharmony_ci in_vfork(p)) { 22262306a36Sopenharmony_ci task_unlock(p); 22362306a36Sopenharmony_ci return LONG_MIN; 22462306a36Sopenharmony_ci } 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci /* 22762306a36Sopenharmony_ci * The baseline for the badness score is the proportion of RAM that each 22862306a36Sopenharmony_ci * task's rss, pagetable and swap space use. 
22962306a36Sopenharmony_ci */ 23062306a36Sopenharmony_ci points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + 23162306a36Sopenharmony_ci mm_pgtables_bytes(p->mm) / PAGE_SIZE; 23262306a36Sopenharmony_ci task_unlock(p); 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_ci /* Normalize to oom_score_adj units */ 23562306a36Sopenharmony_ci adj *= totalpages / 1000; 23662306a36Sopenharmony_ci points += adj; 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci return points; 23962306a36Sopenharmony_ci} 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_cistatic const char * const oom_constraint_text[] = { 24262306a36Sopenharmony_ci [CONSTRAINT_NONE] = "CONSTRAINT_NONE", 24362306a36Sopenharmony_ci [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", 24462306a36Sopenharmony_ci [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", 24562306a36Sopenharmony_ci [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", 24662306a36Sopenharmony_ci}; 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_ci/* 24962306a36Sopenharmony_ci * Determine the type of allocation constraint. 
25062306a36Sopenharmony_ci */ 25162306a36Sopenharmony_cistatic enum oom_constraint constrained_alloc(struct oom_control *oc) 25262306a36Sopenharmony_ci{ 25362306a36Sopenharmony_ci struct zone *zone; 25462306a36Sopenharmony_ci struct zoneref *z; 25562306a36Sopenharmony_ci enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); 25662306a36Sopenharmony_ci bool cpuset_limited = false; 25762306a36Sopenharmony_ci int nid; 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci if (is_memcg_oom(oc)) { 26062306a36Sopenharmony_ci oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; 26162306a36Sopenharmony_ci return CONSTRAINT_MEMCG; 26262306a36Sopenharmony_ci } 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci /* Default to all available memory */ 26562306a36Sopenharmony_ci oc->totalpages = totalram_pages() + total_swap_pages; 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_ci if (!IS_ENABLED(CONFIG_NUMA)) 26862306a36Sopenharmony_ci return CONSTRAINT_NONE; 26962306a36Sopenharmony_ci 27062306a36Sopenharmony_ci if (!oc->zonelist) 27162306a36Sopenharmony_ci return CONSTRAINT_NONE; 27262306a36Sopenharmony_ci /* 27362306a36Sopenharmony_ci * Reach here only when __GFP_NOFAIL is used. So, we should avoid 27462306a36Sopenharmony_ci * to kill current.We have to random task kill in this case. 27562306a36Sopenharmony_ci * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. 27662306a36Sopenharmony_ci */ 27762306a36Sopenharmony_ci if (oc->gfp_mask & __GFP_THISNODE) 27862306a36Sopenharmony_ci return CONSTRAINT_NONE; 27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci /* 28162306a36Sopenharmony_ci * This is not a __GFP_THISNODE allocation, so a truncated nodemask in 28262306a36Sopenharmony_ci * the page allocator means a mempolicy is in effect. Cpuset policy 28362306a36Sopenharmony_ci * is enforced in get_page_from_freelist(). 
28462306a36Sopenharmony_ci */ 28562306a36Sopenharmony_ci if (oc->nodemask && 28662306a36Sopenharmony_ci !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { 28762306a36Sopenharmony_ci oc->totalpages = total_swap_pages; 28862306a36Sopenharmony_ci for_each_node_mask(nid, *oc->nodemask) 28962306a36Sopenharmony_ci oc->totalpages += node_present_pages(nid); 29062306a36Sopenharmony_ci return CONSTRAINT_MEMORY_POLICY; 29162306a36Sopenharmony_ci } 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci /* Check this allocation failure is caused by cpuset's wall function */ 29462306a36Sopenharmony_ci for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, 29562306a36Sopenharmony_ci highest_zoneidx, oc->nodemask) 29662306a36Sopenharmony_ci if (!cpuset_zone_allowed(zone, oc->gfp_mask)) 29762306a36Sopenharmony_ci cpuset_limited = true; 29862306a36Sopenharmony_ci 29962306a36Sopenharmony_ci if (cpuset_limited) { 30062306a36Sopenharmony_ci oc->totalpages = total_swap_pages; 30162306a36Sopenharmony_ci for_each_node_mask(nid, cpuset_current_mems_allowed) 30262306a36Sopenharmony_ci oc->totalpages += node_present_pages(nid); 30362306a36Sopenharmony_ci return CONSTRAINT_CPUSET; 30462306a36Sopenharmony_ci } 30562306a36Sopenharmony_ci return CONSTRAINT_NONE; 30662306a36Sopenharmony_ci} 30762306a36Sopenharmony_ci 30862306a36Sopenharmony_cistatic int oom_evaluate_task(struct task_struct *task, void *arg) 30962306a36Sopenharmony_ci{ 31062306a36Sopenharmony_ci struct oom_control *oc = arg; 31162306a36Sopenharmony_ci long points; 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_ci if (oom_unkillable_task(task)) 31462306a36Sopenharmony_ci goto next; 31562306a36Sopenharmony_ci 31662306a36Sopenharmony_ci /* p may not have freeable memory in nodemask */ 31762306a36Sopenharmony_ci if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc)) 31862306a36Sopenharmony_ci goto next; 31962306a36Sopenharmony_ci 32062306a36Sopenharmony_ci /* 32162306a36Sopenharmony_ci * This task already has access to 
memory reserves and is being killed. 32262306a36Sopenharmony_ci * Don't allow any other task to have access to the reserves unless 32362306a36Sopenharmony_ci * the task has MMF_OOM_SKIP because chances that it would release 32462306a36Sopenharmony_ci * any memory is quite low. 32562306a36Sopenharmony_ci */ 32662306a36Sopenharmony_ci if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { 32762306a36Sopenharmony_ci if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) 32862306a36Sopenharmony_ci goto next; 32962306a36Sopenharmony_ci goto abort; 33062306a36Sopenharmony_ci } 33162306a36Sopenharmony_ci 33262306a36Sopenharmony_ci /* 33362306a36Sopenharmony_ci * If task is allocating a lot of memory and has been marked to be 33462306a36Sopenharmony_ci * killed first if it triggers an oom, then select it. 33562306a36Sopenharmony_ci */ 33662306a36Sopenharmony_ci if (oom_task_origin(task)) { 33762306a36Sopenharmony_ci points = LONG_MAX; 33862306a36Sopenharmony_ci goto select; 33962306a36Sopenharmony_ci } 34062306a36Sopenharmony_ci 34162306a36Sopenharmony_ci points = oom_badness(task, oc->totalpages); 34262306a36Sopenharmony_ci if (points == LONG_MIN || points < oc->chosen_points) 34362306a36Sopenharmony_ci goto next; 34462306a36Sopenharmony_ci 34562306a36Sopenharmony_ciselect: 34662306a36Sopenharmony_ci if (oc->chosen) 34762306a36Sopenharmony_ci put_task_struct(oc->chosen); 34862306a36Sopenharmony_ci get_task_struct(task); 34962306a36Sopenharmony_ci oc->chosen = task; 35062306a36Sopenharmony_ci oc->chosen_points = points; 35162306a36Sopenharmony_cinext: 35262306a36Sopenharmony_ci return 0; 35362306a36Sopenharmony_ciabort: 35462306a36Sopenharmony_ci if (oc->chosen) 35562306a36Sopenharmony_ci put_task_struct(oc->chosen); 35662306a36Sopenharmony_ci oc->chosen = (void *)-1UL; 35762306a36Sopenharmony_ci return 1; 35862306a36Sopenharmony_ci} 35962306a36Sopenharmony_ci 36062306a36Sopenharmony_ci/* 36162306a36Sopenharmony_ci * Simple selection loop. 
We choose the process with the highest number of 36262306a36Sopenharmony_ci * 'points'. In case scan was aborted, oc->chosen is set to -1. 36362306a36Sopenharmony_ci */ 36462306a36Sopenharmony_cistatic void select_bad_process(struct oom_control *oc) 36562306a36Sopenharmony_ci{ 36662306a36Sopenharmony_ci oc->chosen_points = LONG_MIN; 36762306a36Sopenharmony_ci 36862306a36Sopenharmony_ci if (is_memcg_oom(oc)) 36962306a36Sopenharmony_ci mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); 37062306a36Sopenharmony_ci else { 37162306a36Sopenharmony_ci struct task_struct *p; 37262306a36Sopenharmony_ci 37362306a36Sopenharmony_ci rcu_read_lock(); 37462306a36Sopenharmony_ci for_each_process(p) 37562306a36Sopenharmony_ci if (oom_evaluate_task(p, oc)) 37662306a36Sopenharmony_ci break; 37762306a36Sopenharmony_ci rcu_read_unlock(); 37862306a36Sopenharmony_ci } 37962306a36Sopenharmony_ci} 38062306a36Sopenharmony_ci 38162306a36Sopenharmony_cistatic int dump_task(struct task_struct *p, void *arg) 38262306a36Sopenharmony_ci{ 38362306a36Sopenharmony_ci struct oom_control *oc = arg; 38462306a36Sopenharmony_ci struct task_struct *task; 38562306a36Sopenharmony_ci 38662306a36Sopenharmony_ci if (oom_unkillable_task(p)) 38762306a36Sopenharmony_ci return 0; 38862306a36Sopenharmony_ci 38962306a36Sopenharmony_ci /* p may not have freeable memory in nodemask */ 39062306a36Sopenharmony_ci if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc)) 39162306a36Sopenharmony_ci return 0; 39262306a36Sopenharmony_ci 39362306a36Sopenharmony_ci task = find_lock_task_mm(p); 39462306a36Sopenharmony_ci if (!task) { 39562306a36Sopenharmony_ci /* 39662306a36Sopenharmony_ci * All of p's threads have already detached their mm's. There's 39762306a36Sopenharmony_ci * no need to report them; they can't be oom killed anyway. 
39862306a36Sopenharmony_ci */ 39962306a36Sopenharmony_ci return 0; 40062306a36Sopenharmony_ci } 40162306a36Sopenharmony_ci 40262306a36Sopenharmony_ci pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", 40362306a36Sopenharmony_ci task->pid, from_kuid(&init_user_ns, task_uid(task)), 40462306a36Sopenharmony_ci task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 40562306a36Sopenharmony_ci mm_pgtables_bytes(task->mm), 40662306a36Sopenharmony_ci get_mm_counter(task->mm, MM_SWAPENTS), 40762306a36Sopenharmony_ci task->signal->oom_score_adj, task->comm); 40862306a36Sopenharmony_ci task_unlock(task); 40962306a36Sopenharmony_ci 41062306a36Sopenharmony_ci return 0; 41162306a36Sopenharmony_ci} 41262306a36Sopenharmony_ci 41362306a36Sopenharmony_ci/** 41462306a36Sopenharmony_ci * dump_tasks - dump current memory state of all system tasks 41562306a36Sopenharmony_ci * @oc: pointer to struct oom_control 41662306a36Sopenharmony_ci * 41762306a36Sopenharmony_ci * Dumps the current memory state of all eligible tasks. Tasks not in the same 41862306a36Sopenharmony_ci * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 41962306a36Sopenharmony_ci * are not shown. 42062306a36Sopenharmony_ci * State information includes task's pid, uid, tgid, vm size, rss, 42162306a36Sopenharmony_ci * pgtables_bytes, swapents, oom_score_adj value, and name. 
42262306a36Sopenharmony_ci */ 42362306a36Sopenharmony_cistatic void dump_tasks(struct oom_control *oc) 42462306a36Sopenharmony_ci{ 42562306a36Sopenharmony_ci pr_info("Tasks state (memory values in pages):\n"); 42662306a36Sopenharmony_ci pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); 42762306a36Sopenharmony_ci 42862306a36Sopenharmony_ci if (is_memcg_oom(oc)) 42962306a36Sopenharmony_ci mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); 43062306a36Sopenharmony_ci else { 43162306a36Sopenharmony_ci struct task_struct *p; 43262306a36Sopenharmony_ci 43362306a36Sopenharmony_ci rcu_read_lock(); 43462306a36Sopenharmony_ci for_each_process(p) 43562306a36Sopenharmony_ci dump_task(p, oc); 43662306a36Sopenharmony_ci rcu_read_unlock(); 43762306a36Sopenharmony_ci } 43862306a36Sopenharmony_ci} 43962306a36Sopenharmony_ci 44062306a36Sopenharmony_cistatic void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) 44162306a36Sopenharmony_ci{ 44262306a36Sopenharmony_ci /* one line summary of the oom killer context. 
*/ 44362306a36Sopenharmony_ci pr_info("oom-kill:constraint=%s,nodemask=%*pbl", 44462306a36Sopenharmony_ci oom_constraint_text[oc->constraint], 44562306a36Sopenharmony_ci nodemask_pr_args(oc->nodemask)); 44662306a36Sopenharmony_ci cpuset_print_current_mems_allowed(); 44762306a36Sopenharmony_ci mem_cgroup_print_oom_context(oc->memcg, victim); 44862306a36Sopenharmony_ci pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, 44962306a36Sopenharmony_ci from_kuid(&init_user_ns, task_uid(victim))); 45062306a36Sopenharmony_ci} 45162306a36Sopenharmony_ci 45262306a36Sopenharmony_cistatic void dump_header(struct oom_control *oc, struct task_struct *p) 45362306a36Sopenharmony_ci{ 45462306a36Sopenharmony_ci pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", 45562306a36Sopenharmony_ci current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 45662306a36Sopenharmony_ci current->signal->oom_score_adj); 45762306a36Sopenharmony_ci if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) 45862306a36Sopenharmony_ci pr_warn("COMPACTION is disabled!!!\n"); 45962306a36Sopenharmony_ci 46062306a36Sopenharmony_ci dump_stack(); 46162306a36Sopenharmony_ci if (is_memcg_oom(oc)) 46262306a36Sopenharmony_ci mem_cgroup_print_oom_meminfo(oc->memcg); 46362306a36Sopenharmony_ci else { 46462306a36Sopenharmony_ci __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); 46562306a36Sopenharmony_ci if (should_dump_unreclaim_slab()) 46662306a36Sopenharmony_ci dump_unreclaimable_slab(); 46762306a36Sopenharmony_ci } 46862306a36Sopenharmony_ci if (sysctl_oom_dump_tasks) 46962306a36Sopenharmony_ci dump_tasks(oc); 47062306a36Sopenharmony_ci if (p) 47162306a36Sopenharmony_ci dump_oom_summary(oc, p); 47262306a36Sopenharmony_ci} 47362306a36Sopenharmony_ci 47462306a36Sopenharmony_ci/* 47562306a36Sopenharmony_ci * Number of OOM victims in flight 47662306a36Sopenharmony_ci */ 47762306a36Sopenharmony_cistatic atomic_t oom_victims = ATOMIC_INIT(0); 
47862306a36Sopenharmony_cistatic DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); 47962306a36Sopenharmony_ci 48062306a36Sopenharmony_cistatic bool oom_killer_disabled __read_mostly; 48162306a36Sopenharmony_ci 48262306a36Sopenharmony_ci/* 48362306a36Sopenharmony_ci * task->mm can be NULL if the task is the exited group leader. So to 48462306a36Sopenharmony_ci * determine whether the task is using a particular mm, we examine all the 48562306a36Sopenharmony_ci * task's threads: if one of those is using this mm then this task was also 48662306a36Sopenharmony_ci * using it. 48762306a36Sopenharmony_ci */ 48862306a36Sopenharmony_cibool process_shares_mm(struct task_struct *p, struct mm_struct *mm) 48962306a36Sopenharmony_ci{ 49062306a36Sopenharmony_ci struct task_struct *t; 49162306a36Sopenharmony_ci 49262306a36Sopenharmony_ci for_each_thread(p, t) { 49362306a36Sopenharmony_ci struct mm_struct *t_mm = READ_ONCE(t->mm); 49462306a36Sopenharmony_ci if (t_mm) 49562306a36Sopenharmony_ci return t_mm == mm; 49662306a36Sopenharmony_ci } 49762306a36Sopenharmony_ci return false; 49862306a36Sopenharmony_ci} 49962306a36Sopenharmony_ci 50062306a36Sopenharmony_ci#ifdef CONFIG_MMU 50162306a36Sopenharmony_ci/* 50262306a36Sopenharmony_ci * OOM Reaper kernel thread which tries to reap the memory used by the OOM 50362306a36Sopenharmony_ci * victim (if that is possible) to help the OOM killer to move on. 
50462306a36Sopenharmony_ci */ 50562306a36Sopenharmony_cistatic struct task_struct *oom_reaper_th; 50662306a36Sopenharmony_cistatic DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); 50762306a36Sopenharmony_cistatic struct task_struct *oom_reaper_list; 50862306a36Sopenharmony_cistatic DEFINE_SPINLOCK(oom_reaper_lock); 50962306a36Sopenharmony_ci 51062306a36Sopenharmony_cistatic bool __oom_reap_task_mm(struct mm_struct *mm) 51162306a36Sopenharmony_ci{ 51262306a36Sopenharmony_ci struct vm_area_struct *vma; 51362306a36Sopenharmony_ci bool ret = true; 51462306a36Sopenharmony_ci VMA_ITERATOR(vmi, mm, 0); 51562306a36Sopenharmony_ci 51662306a36Sopenharmony_ci /* 51762306a36Sopenharmony_ci * Tell all users of get_user/copy_from_user etc... that the content 51862306a36Sopenharmony_ci * is no longer stable. No barriers really needed because unmapping 51962306a36Sopenharmony_ci * should imply barriers already and the reader would hit a page fault 52062306a36Sopenharmony_ci * if it stumbled over a reaped memory. 52162306a36Sopenharmony_ci */ 52262306a36Sopenharmony_ci set_bit(MMF_UNSTABLE, &mm->flags); 52362306a36Sopenharmony_ci 52462306a36Sopenharmony_ci for_each_vma(vmi, vma) { 52562306a36Sopenharmony_ci if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) 52662306a36Sopenharmony_ci continue; 52762306a36Sopenharmony_ci 52862306a36Sopenharmony_ci /* 52962306a36Sopenharmony_ci * Only anonymous pages have a good chance to be dropped 53062306a36Sopenharmony_ci * without additional steps which we cannot afford as we 53162306a36Sopenharmony_ci * are OOM already. 53262306a36Sopenharmony_ci * 53362306a36Sopenharmony_ci * We do not even care about fs backed pages because all 53462306a36Sopenharmony_ci * which are reclaimable have already been reclaimed and 53562306a36Sopenharmony_ci * we do not want to block exit_mmap by keeping mm ref 53662306a36Sopenharmony_ci * count elevated without a good reason. 
53762306a36Sopenharmony_ci */ 53862306a36Sopenharmony_ci if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { 53962306a36Sopenharmony_ci struct mmu_notifier_range range; 54062306a36Sopenharmony_ci struct mmu_gather tlb; 54162306a36Sopenharmony_ci 54262306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, 54362306a36Sopenharmony_ci mm, vma->vm_start, 54462306a36Sopenharmony_ci vma->vm_end); 54562306a36Sopenharmony_ci tlb_gather_mmu(&tlb, mm); 54662306a36Sopenharmony_ci if (mmu_notifier_invalidate_range_start_nonblock(&range)) { 54762306a36Sopenharmony_ci tlb_finish_mmu(&tlb); 54862306a36Sopenharmony_ci ret = false; 54962306a36Sopenharmony_ci continue; 55062306a36Sopenharmony_ci } 55162306a36Sopenharmony_ci unmap_page_range(&tlb, vma, range.start, range.end, NULL); 55262306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 55362306a36Sopenharmony_ci tlb_finish_mmu(&tlb); 55462306a36Sopenharmony_ci } 55562306a36Sopenharmony_ci } 55662306a36Sopenharmony_ci 55762306a36Sopenharmony_ci return ret; 55862306a36Sopenharmony_ci} 55962306a36Sopenharmony_ci 56062306a36Sopenharmony_ci/* 56162306a36Sopenharmony_ci * Reaps the address space of the give task. 56262306a36Sopenharmony_ci * 56362306a36Sopenharmony_ci * Returns true on success and false if none or part of the address space 56462306a36Sopenharmony_ci * has been reclaimed and the caller should retry later. 
56562306a36Sopenharmony_ci */ 56662306a36Sopenharmony_cistatic bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) 56762306a36Sopenharmony_ci{ 56862306a36Sopenharmony_ci bool ret = true; 56962306a36Sopenharmony_ci 57062306a36Sopenharmony_ci if (!mmap_read_trylock(mm)) { 57162306a36Sopenharmony_ci trace_skip_task_reaping(tsk->pid); 57262306a36Sopenharmony_ci return false; 57362306a36Sopenharmony_ci } 57462306a36Sopenharmony_ci 57562306a36Sopenharmony_ci /* 57662306a36Sopenharmony_ci * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't 57762306a36Sopenharmony_ci * work on the mm anymore. The check for MMF_OOM_SKIP must run 57862306a36Sopenharmony_ci * under mmap_lock for reading because it serializes against the 57962306a36Sopenharmony_ci * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). 58062306a36Sopenharmony_ci */ 58162306a36Sopenharmony_ci if (test_bit(MMF_OOM_SKIP, &mm->flags)) { 58262306a36Sopenharmony_ci trace_skip_task_reaping(tsk->pid); 58362306a36Sopenharmony_ci goto out_unlock; 58462306a36Sopenharmony_ci } 58562306a36Sopenharmony_ci 58662306a36Sopenharmony_ci trace_start_task_reaping(tsk->pid); 58762306a36Sopenharmony_ci 58862306a36Sopenharmony_ci /* failed to reap part of the address space. 
Try again later */ 58962306a36Sopenharmony_ci ret = __oom_reap_task_mm(mm); 59062306a36Sopenharmony_ci if (!ret) 59162306a36Sopenharmony_ci goto out_finish; 59262306a36Sopenharmony_ci 59362306a36Sopenharmony_ci pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", 59462306a36Sopenharmony_ci task_pid_nr(tsk), tsk->comm, 59562306a36Sopenharmony_ci K(get_mm_counter(mm, MM_ANONPAGES)), 59662306a36Sopenharmony_ci K(get_mm_counter(mm, MM_FILEPAGES)), 59762306a36Sopenharmony_ci K(get_mm_counter(mm, MM_SHMEMPAGES))); 59862306a36Sopenharmony_ciout_finish: 59962306a36Sopenharmony_ci trace_finish_task_reaping(tsk->pid); 60062306a36Sopenharmony_ciout_unlock: 60162306a36Sopenharmony_ci mmap_read_unlock(mm); 60262306a36Sopenharmony_ci 60362306a36Sopenharmony_ci return ret; 60462306a36Sopenharmony_ci} 60562306a36Sopenharmony_ci 60662306a36Sopenharmony_ci#define MAX_OOM_REAP_RETRIES 10 60762306a36Sopenharmony_cistatic void oom_reap_task(struct task_struct *tsk) 60862306a36Sopenharmony_ci{ 60962306a36Sopenharmony_ci int attempts = 0; 61062306a36Sopenharmony_ci struct mm_struct *mm = tsk->signal->oom_mm; 61162306a36Sopenharmony_ci 61262306a36Sopenharmony_ci /* Retry the mmap_read_trylock(mm) a few times */ 61362306a36Sopenharmony_ci while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) 61462306a36Sopenharmony_ci schedule_timeout_idle(HZ/10); 61562306a36Sopenharmony_ci 61662306a36Sopenharmony_ci if (attempts <= MAX_OOM_REAP_RETRIES || 61762306a36Sopenharmony_ci test_bit(MMF_OOM_SKIP, &mm->flags)) 61862306a36Sopenharmony_ci goto done; 61962306a36Sopenharmony_ci 62062306a36Sopenharmony_ci pr_info("oom_reaper: unable to reap pid:%d (%s)\n", 62162306a36Sopenharmony_ci task_pid_nr(tsk), tsk->comm); 62262306a36Sopenharmony_ci sched_show_task(tsk); 62362306a36Sopenharmony_ci debug_show_all_locks(); 62462306a36Sopenharmony_ci 62562306a36Sopenharmony_cidone: 62662306a36Sopenharmony_ci tsk->oom_reaper_list = NULL; 
62762306a36Sopenharmony_ci 62862306a36Sopenharmony_ci /* 62962306a36Sopenharmony_ci * Hide this mm from OOM killer because it has been either reaped or 63062306a36Sopenharmony_ci * somebody can't call mmap_write_unlock(mm). 63162306a36Sopenharmony_ci */ 63262306a36Sopenharmony_ci set_bit(MMF_OOM_SKIP, &mm->flags); 63362306a36Sopenharmony_ci 63462306a36Sopenharmony_ci /* Drop a reference taken by queue_oom_reaper */ 63562306a36Sopenharmony_ci put_task_struct(tsk); 63662306a36Sopenharmony_ci} 63762306a36Sopenharmony_ci 63862306a36Sopenharmony_cistatic int oom_reaper(void *unused) 63962306a36Sopenharmony_ci{ 64062306a36Sopenharmony_ci set_freezable(); 64162306a36Sopenharmony_ci 64262306a36Sopenharmony_ci while (true) { 64362306a36Sopenharmony_ci struct task_struct *tsk = NULL; 64462306a36Sopenharmony_ci 64562306a36Sopenharmony_ci wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); 64662306a36Sopenharmony_ci spin_lock_irq(&oom_reaper_lock); 64762306a36Sopenharmony_ci if (oom_reaper_list != NULL) { 64862306a36Sopenharmony_ci tsk = oom_reaper_list; 64962306a36Sopenharmony_ci oom_reaper_list = tsk->oom_reaper_list; 65062306a36Sopenharmony_ci } 65162306a36Sopenharmony_ci spin_unlock_irq(&oom_reaper_lock); 65262306a36Sopenharmony_ci 65362306a36Sopenharmony_ci if (tsk) 65462306a36Sopenharmony_ci oom_reap_task(tsk); 65562306a36Sopenharmony_ci } 65662306a36Sopenharmony_ci 65762306a36Sopenharmony_ci return 0; 65862306a36Sopenharmony_ci} 65962306a36Sopenharmony_ci 66062306a36Sopenharmony_cistatic void wake_oom_reaper(struct timer_list *timer) 66162306a36Sopenharmony_ci{ 66262306a36Sopenharmony_ci struct task_struct *tsk = container_of(timer, struct task_struct, 66362306a36Sopenharmony_ci oom_reaper_timer); 66462306a36Sopenharmony_ci struct mm_struct *mm = tsk->signal->oom_mm; 66562306a36Sopenharmony_ci unsigned long flags; 66662306a36Sopenharmony_ci 66762306a36Sopenharmony_ci /* The victim managed to terminate on its own - see exit_mmap */ 66862306a36Sopenharmony_ci 
if (test_bit(MMF_OOM_SKIP, &mm->flags)) { 66962306a36Sopenharmony_ci put_task_struct(tsk); 67062306a36Sopenharmony_ci return; 67162306a36Sopenharmony_ci } 67262306a36Sopenharmony_ci 67362306a36Sopenharmony_ci spin_lock_irqsave(&oom_reaper_lock, flags); 67462306a36Sopenharmony_ci tsk->oom_reaper_list = oom_reaper_list; 67562306a36Sopenharmony_ci oom_reaper_list = tsk; 67662306a36Sopenharmony_ci spin_unlock_irqrestore(&oom_reaper_lock, flags); 67762306a36Sopenharmony_ci trace_wake_reaper(tsk->pid); 67862306a36Sopenharmony_ci wake_up(&oom_reaper_wait); 67962306a36Sopenharmony_ci} 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_ci/* 68262306a36Sopenharmony_ci * Give the OOM victim time to exit naturally before invoking the oom_reaping. 68362306a36Sopenharmony_ci * The timers timeout is arbitrary... the longer it is, the longer the worst 68462306a36Sopenharmony_ci * case scenario for the OOM can take. If it is too small, the oom_reaper can 68562306a36Sopenharmony_ci * get in the way and release resources needed by the process exit path. 68662306a36Sopenharmony_ci * e.g. The futex robust list can sit in Anon|Private memory that gets reaped 68762306a36Sopenharmony_ci * before the exit path is able to wake the futex waiters. 68862306a36Sopenharmony_ci */ 68962306a36Sopenharmony_ci#define OOM_REAPER_DELAY (2*HZ) 69062306a36Sopenharmony_cistatic void queue_oom_reaper(struct task_struct *tsk) 69162306a36Sopenharmony_ci{ 69262306a36Sopenharmony_ci /* mm is already queued? 
*/ 69362306a36Sopenharmony_ci if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) 69462306a36Sopenharmony_ci return; 69562306a36Sopenharmony_ci 69662306a36Sopenharmony_ci get_task_struct(tsk); 69762306a36Sopenharmony_ci timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); 69862306a36Sopenharmony_ci tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; 69962306a36Sopenharmony_ci add_timer(&tsk->oom_reaper_timer); 70062306a36Sopenharmony_ci} 70162306a36Sopenharmony_ci 70262306a36Sopenharmony_ci#ifdef CONFIG_SYSCTL 70362306a36Sopenharmony_cistatic struct ctl_table vm_oom_kill_table[] = { 70462306a36Sopenharmony_ci { 70562306a36Sopenharmony_ci .procname = "panic_on_oom", 70662306a36Sopenharmony_ci .data = &sysctl_panic_on_oom, 70762306a36Sopenharmony_ci .maxlen = sizeof(sysctl_panic_on_oom), 70862306a36Sopenharmony_ci .mode = 0644, 70962306a36Sopenharmony_ci .proc_handler = proc_dointvec_minmax, 71062306a36Sopenharmony_ci .extra1 = SYSCTL_ZERO, 71162306a36Sopenharmony_ci .extra2 = SYSCTL_TWO, 71262306a36Sopenharmony_ci }, 71362306a36Sopenharmony_ci { 71462306a36Sopenharmony_ci .procname = "oom_kill_allocating_task", 71562306a36Sopenharmony_ci .data = &sysctl_oom_kill_allocating_task, 71662306a36Sopenharmony_ci .maxlen = sizeof(sysctl_oom_kill_allocating_task), 71762306a36Sopenharmony_ci .mode = 0644, 71862306a36Sopenharmony_ci .proc_handler = proc_dointvec, 71962306a36Sopenharmony_ci }, 72062306a36Sopenharmony_ci { 72162306a36Sopenharmony_ci .procname = "oom_dump_tasks", 72262306a36Sopenharmony_ci .data = &sysctl_oom_dump_tasks, 72362306a36Sopenharmony_ci .maxlen = sizeof(sysctl_oom_dump_tasks), 72462306a36Sopenharmony_ci .mode = 0644, 72562306a36Sopenharmony_ci .proc_handler = proc_dointvec, 72662306a36Sopenharmony_ci }, 72762306a36Sopenharmony_ci {} 72862306a36Sopenharmony_ci}; 72962306a36Sopenharmony_ci#endif 73062306a36Sopenharmony_ci 73162306a36Sopenharmony_cistatic int __init oom_init(void) 73262306a36Sopenharmony_ci{ 
73362306a36Sopenharmony_ci oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); 73462306a36Sopenharmony_ci#ifdef CONFIG_SYSCTL 73562306a36Sopenharmony_ci register_sysctl_init("vm", vm_oom_kill_table); 73662306a36Sopenharmony_ci#endif 73762306a36Sopenharmony_ci return 0; 73862306a36Sopenharmony_ci} 73962306a36Sopenharmony_cisubsys_initcall(oom_init) 74062306a36Sopenharmony_ci#else 74162306a36Sopenharmony_cistatic inline void queue_oom_reaper(struct task_struct *tsk) 74262306a36Sopenharmony_ci{ 74362306a36Sopenharmony_ci} 74462306a36Sopenharmony_ci#endif /* CONFIG_MMU */ 74562306a36Sopenharmony_ci 74662306a36Sopenharmony_ci/** 74762306a36Sopenharmony_ci * mark_oom_victim - mark the given task as OOM victim 74862306a36Sopenharmony_ci * @tsk: task to mark 74962306a36Sopenharmony_ci * 75062306a36Sopenharmony_ci * Has to be called with oom_lock held and never after 75162306a36Sopenharmony_ci * oom has been disabled already. 75262306a36Sopenharmony_ci * 75362306a36Sopenharmony_ci * tsk->mm has to be non NULL and caller has to guarantee it is stable (either 75462306a36Sopenharmony_ci * under task_lock or operate on the current). 75562306a36Sopenharmony_ci */ 75662306a36Sopenharmony_cistatic void mark_oom_victim(struct task_struct *tsk) 75762306a36Sopenharmony_ci{ 75862306a36Sopenharmony_ci struct mm_struct *mm = tsk->mm; 75962306a36Sopenharmony_ci 76062306a36Sopenharmony_ci WARN_ON(oom_killer_disabled); 76162306a36Sopenharmony_ci /* OOM killer might race with memcg OOM */ 76262306a36Sopenharmony_ci if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) 76362306a36Sopenharmony_ci return; 76462306a36Sopenharmony_ci 76562306a36Sopenharmony_ci /* oom_mm is bound to the signal struct life time. 
*/ 76662306a36Sopenharmony_ci if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) 76762306a36Sopenharmony_ci mmgrab(tsk->signal->oom_mm); 76862306a36Sopenharmony_ci 76962306a36Sopenharmony_ci /* 77062306a36Sopenharmony_ci * Make sure that the task is woken up from uninterruptible sleep 77162306a36Sopenharmony_ci * if it is frozen because OOM killer wouldn't be able to free 77262306a36Sopenharmony_ci * any memory and livelock. freezing_slow_path will tell the freezer 77362306a36Sopenharmony_ci * that TIF_MEMDIE tasks should be ignored. 77462306a36Sopenharmony_ci */ 77562306a36Sopenharmony_ci __thaw_task(tsk); 77662306a36Sopenharmony_ci atomic_inc(&oom_victims); 77762306a36Sopenharmony_ci trace_mark_victim(tsk->pid); 77862306a36Sopenharmony_ci} 77962306a36Sopenharmony_ci 78062306a36Sopenharmony_ci/** 78162306a36Sopenharmony_ci * exit_oom_victim - note the exit of an OOM victim 78262306a36Sopenharmony_ci */ 78362306a36Sopenharmony_civoid exit_oom_victim(void) 78462306a36Sopenharmony_ci{ 78562306a36Sopenharmony_ci clear_thread_flag(TIF_MEMDIE); 78662306a36Sopenharmony_ci 78762306a36Sopenharmony_ci if (!atomic_dec_return(&oom_victims)) 78862306a36Sopenharmony_ci wake_up_all(&oom_victims_wait); 78962306a36Sopenharmony_ci} 79062306a36Sopenharmony_ci 79162306a36Sopenharmony_ci/** 79262306a36Sopenharmony_ci * oom_killer_enable - enable OOM killer 79362306a36Sopenharmony_ci */ 79462306a36Sopenharmony_civoid oom_killer_enable(void) 79562306a36Sopenharmony_ci{ 79662306a36Sopenharmony_ci oom_killer_disabled = false; 79762306a36Sopenharmony_ci pr_info("OOM killer enabled.\n"); 79862306a36Sopenharmony_ci} 79962306a36Sopenharmony_ci 80062306a36Sopenharmony_ci/** 80162306a36Sopenharmony_ci * oom_killer_disable - disable OOM killer 80262306a36Sopenharmony_ci * @timeout: maximum timeout to wait for oom victims in jiffies 80362306a36Sopenharmony_ci * 80462306a36Sopenharmony_ci * Forces all page allocations to fail rather than trigger OOM killer. 
80562306a36Sopenharmony_ci * Will block and wait until all OOM victims are killed or the given 80662306a36Sopenharmony_ci * timeout expires. 80762306a36Sopenharmony_ci * 80862306a36Sopenharmony_ci * The function cannot be called when there are runnable user tasks because 80962306a36Sopenharmony_ci * the userspace would see unexpected allocation failures as a result. Any 81062306a36Sopenharmony_ci * new usage of this function should be consulted with MM people. 81162306a36Sopenharmony_ci * 81262306a36Sopenharmony_ci * Returns true if successful and false if the OOM killer cannot be 81362306a36Sopenharmony_ci * disabled. 81462306a36Sopenharmony_ci */ 81562306a36Sopenharmony_cibool oom_killer_disable(signed long timeout) 81662306a36Sopenharmony_ci{ 81762306a36Sopenharmony_ci signed long ret; 81862306a36Sopenharmony_ci 81962306a36Sopenharmony_ci /* 82062306a36Sopenharmony_ci * Make sure to not race with an ongoing OOM killer. Check that the 82162306a36Sopenharmony_ci * current is not killed (possibly due to sharing the victim's memory). 
82262306a36Sopenharmony_ci */ 82362306a36Sopenharmony_ci if (mutex_lock_killable(&oom_lock)) 82462306a36Sopenharmony_ci return false; 82562306a36Sopenharmony_ci oom_killer_disabled = true; 82662306a36Sopenharmony_ci mutex_unlock(&oom_lock); 82762306a36Sopenharmony_ci 82862306a36Sopenharmony_ci ret = wait_event_interruptible_timeout(oom_victims_wait, 82962306a36Sopenharmony_ci !atomic_read(&oom_victims), timeout); 83062306a36Sopenharmony_ci if (ret <= 0) { 83162306a36Sopenharmony_ci oom_killer_enable(); 83262306a36Sopenharmony_ci return false; 83362306a36Sopenharmony_ci } 83462306a36Sopenharmony_ci pr_info("OOM killer disabled.\n"); 83562306a36Sopenharmony_ci 83662306a36Sopenharmony_ci return true; 83762306a36Sopenharmony_ci} 83862306a36Sopenharmony_ci 83962306a36Sopenharmony_cistatic inline bool __task_will_free_mem(struct task_struct *task) 84062306a36Sopenharmony_ci{ 84162306a36Sopenharmony_ci struct signal_struct *sig = task->signal; 84262306a36Sopenharmony_ci 84362306a36Sopenharmony_ci /* 84462306a36Sopenharmony_ci * A coredumping process may sleep for an extended period in 84562306a36Sopenharmony_ci * coredump_task_exit(), so the oom killer cannot assume that 84662306a36Sopenharmony_ci * the process will promptly exit and release memory. 84762306a36Sopenharmony_ci */ 84862306a36Sopenharmony_ci if (sig->core_state) 84962306a36Sopenharmony_ci return false; 85062306a36Sopenharmony_ci 85162306a36Sopenharmony_ci if (sig->flags & SIGNAL_GROUP_EXIT) 85262306a36Sopenharmony_ci return true; 85362306a36Sopenharmony_ci 85462306a36Sopenharmony_ci if (thread_group_empty(task) && (task->flags & PF_EXITING)) 85562306a36Sopenharmony_ci return true; 85662306a36Sopenharmony_ci 85762306a36Sopenharmony_ci return false; 85862306a36Sopenharmony_ci} 85962306a36Sopenharmony_ci 86062306a36Sopenharmony_ci/* 86162306a36Sopenharmony_ci * Checks whether the given task is dying or exiting and likely to 86262306a36Sopenharmony_ci * release its address space. 
This means that all threads and processes 86362306a36Sopenharmony_ci * sharing the same mm have to be killed or exiting. 86462306a36Sopenharmony_ci * Caller has to make sure that task->mm is stable (hold task_lock or 86562306a36Sopenharmony_ci * it operates on the current). 86662306a36Sopenharmony_ci */ 86762306a36Sopenharmony_cistatic bool task_will_free_mem(struct task_struct *task) 86862306a36Sopenharmony_ci{ 86962306a36Sopenharmony_ci struct mm_struct *mm = task->mm; 87062306a36Sopenharmony_ci struct task_struct *p; 87162306a36Sopenharmony_ci bool ret = true; 87262306a36Sopenharmony_ci 87362306a36Sopenharmony_ci /* 87462306a36Sopenharmony_ci * Skip tasks without mm because it might have passed its exit_mm and 87562306a36Sopenharmony_ci * exit_oom_victim. oom_reaper could have rescued that but do not rely 87662306a36Sopenharmony_ci * on that for now. We can consider find_lock_task_mm in future. 87762306a36Sopenharmony_ci */ 87862306a36Sopenharmony_ci if (!mm) 87962306a36Sopenharmony_ci return false; 88062306a36Sopenharmony_ci 88162306a36Sopenharmony_ci if (!__task_will_free_mem(task)) 88262306a36Sopenharmony_ci return false; 88362306a36Sopenharmony_ci 88462306a36Sopenharmony_ci /* 88562306a36Sopenharmony_ci * This task has already been drained by the oom reaper so there are 88662306a36Sopenharmony_ci * only small chances it will free some more 88762306a36Sopenharmony_ci */ 88862306a36Sopenharmony_ci if (test_bit(MMF_OOM_SKIP, &mm->flags)) 88962306a36Sopenharmony_ci return false; 89062306a36Sopenharmony_ci 89162306a36Sopenharmony_ci if (atomic_read(&mm->mm_users) <= 1) 89262306a36Sopenharmony_ci return true; 89362306a36Sopenharmony_ci 89462306a36Sopenharmony_ci /* 89562306a36Sopenharmony_ci * Make sure that all tasks which share the mm with the given tasks 89662306a36Sopenharmony_ci * are dying as well to make sure that a) nobody pins its mm and 89762306a36Sopenharmony_ci * b) the task is also reapable by the oom reaper. 
89862306a36Sopenharmony_ci */ 89962306a36Sopenharmony_ci rcu_read_lock(); 90062306a36Sopenharmony_ci for_each_process(p) { 90162306a36Sopenharmony_ci if (!process_shares_mm(p, mm)) 90262306a36Sopenharmony_ci continue; 90362306a36Sopenharmony_ci if (same_thread_group(task, p)) 90462306a36Sopenharmony_ci continue; 90562306a36Sopenharmony_ci ret = __task_will_free_mem(p); 90662306a36Sopenharmony_ci if (!ret) 90762306a36Sopenharmony_ci break; 90862306a36Sopenharmony_ci } 90962306a36Sopenharmony_ci rcu_read_unlock(); 91062306a36Sopenharmony_ci 91162306a36Sopenharmony_ci return ret; 91262306a36Sopenharmony_ci} 91362306a36Sopenharmony_ci 91462306a36Sopenharmony_cistatic void __oom_kill_process(struct task_struct *victim, const char *message) 91562306a36Sopenharmony_ci{ 91662306a36Sopenharmony_ci struct task_struct *p; 91762306a36Sopenharmony_ci struct mm_struct *mm; 91862306a36Sopenharmony_ci bool can_oom_reap = true; 91962306a36Sopenharmony_ci 92062306a36Sopenharmony_ci p = find_lock_task_mm(victim); 92162306a36Sopenharmony_ci if (!p) { 92262306a36Sopenharmony_ci pr_info("%s: OOM victim %d (%s) is already exiting. 
Skip killing the task\n", 92362306a36Sopenharmony_ci message, task_pid_nr(victim), victim->comm); 92462306a36Sopenharmony_ci put_task_struct(victim); 92562306a36Sopenharmony_ci return; 92662306a36Sopenharmony_ci } else if (victim != p) { 92762306a36Sopenharmony_ci get_task_struct(p); 92862306a36Sopenharmony_ci put_task_struct(victim); 92962306a36Sopenharmony_ci victim = p; 93062306a36Sopenharmony_ci } 93162306a36Sopenharmony_ci 93262306a36Sopenharmony_ci /* Get a reference to safely compare mm after task_unlock(victim) */ 93362306a36Sopenharmony_ci mm = victim->mm; 93462306a36Sopenharmony_ci mmgrab(mm); 93562306a36Sopenharmony_ci 93662306a36Sopenharmony_ci /* Raise event before sending signal: task reaper must see this */ 93762306a36Sopenharmony_ci count_vm_event(OOM_KILL); 93862306a36Sopenharmony_ci memcg_memory_event_mm(mm, MEMCG_OOM_KILL); 93962306a36Sopenharmony_ci 94062306a36Sopenharmony_ci /* 94162306a36Sopenharmony_ci * We should send SIGKILL before granting access to memory reserves 94262306a36Sopenharmony_ci * in order to prevent the OOM victim from depleting the memory 94362306a36Sopenharmony_ci * reserves from the user space under its control. 
94462306a36Sopenharmony_ci */ 94562306a36Sopenharmony_ci do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); 94662306a36Sopenharmony_ci mark_oom_victim(victim); 94762306a36Sopenharmony_ci pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", 94862306a36Sopenharmony_ci message, task_pid_nr(victim), victim->comm, K(mm->total_vm), 94962306a36Sopenharmony_ci K(get_mm_counter(mm, MM_ANONPAGES)), 95062306a36Sopenharmony_ci K(get_mm_counter(mm, MM_FILEPAGES)), 95162306a36Sopenharmony_ci K(get_mm_counter(mm, MM_SHMEMPAGES)), 95262306a36Sopenharmony_ci from_kuid(&init_user_ns, task_uid(victim)), 95362306a36Sopenharmony_ci mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); 95462306a36Sopenharmony_ci task_unlock(victim); 95562306a36Sopenharmony_ci 95662306a36Sopenharmony_ci /* 95762306a36Sopenharmony_ci * Kill all user processes sharing victim->mm in other thread groups, if 95862306a36Sopenharmony_ci * any. They don't get access to memory reserves, though, to avoid 95962306a36Sopenharmony_ci * depletion of all memory. This prevents mm->mmap_lock livelock when an 96062306a36Sopenharmony_ci * oom killed thread cannot exit because it requires the semaphore and 96162306a36Sopenharmony_ci * its contended by another thread trying to allocate memory itself. 96262306a36Sopenharmony_ci * That thread will now get access to memory reserves since it has a 96362306a36Sopenharmony_ci * pending fatal signal. 
96462306a36Sopenharmony_ci */ 96562306a36Sopenharmony_ci rcu_read_lock(); 96662306a36Sopenharmony_ci for_each_process(p) { 96762306a36Sopenharmony_ci if (!process_shares_mm(p, mm)) 96862306a36Sopenharmony_ci continue; 96962306a36Sopenharmony_ci if (same_thread_group(p, victim)) 97062306a36Sopenharmony_ci continue; 97162306a36Sopenharmony_ci if (is_global_init(p)) { 97262306a36Sopenharmony_ci can_oom_reap = false; 97362306a36Sopenharmony_ci set_bit(MMF_OOM_SKIP, &mm->flags); 97462306a36Sopenharmony_ci pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", 97562306a36Sopenharmony_ci task_pid_nr(victim), victim->comm, 97662306a36Sopenharmony_ci task_pid_nr(p), p->comm); 97762306a36Sopenharmony_ci continue; 97862306a36Sopenharmony_ci } 97962306a36Sopenharmony_ci /* 98062306a36Sopenharmony_ci * No kthread_use_mm() user needs to read from the userspace so 98162306a36Sopenharmony_ci * we are ok to reap it. 98262306a36Sopenharmony_ci */ 98362306a36Sopenharmony_ci if (unlikely(p->flags & PF_KTHREAD)) 98462306a36Sopenharmony_ci continue; 98562306a36Sopenharmony_ci do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID); 98662306a36Sopenharmony_ci } 98762306a36Sopenharmony_ci rcu_read_unlock(); 98862306a36Sopenharmony_ci 98962306a36Sopenharmony_ci if (can_oom_reap) 99062306a36Sopenharmony_ci queue_oom_reaper(victim); 99162306a36Sopenharmony_ci 99262306a36Sopenharmony_ci mmdrop(mm); 99362306a36Sopenharmony_ci put_task_struct(victim); 99462306a36Sopenharmony_ci} 99562306a36Sopenharmony_ci 99662306a36Sopenharmony_ci/* 99762306a36Sopenharmony_ci * Kill provided task unless it's secured by setting 99862306a36Sopenharmony_ci * oom_score_adj to OOM_SCORE_ADJ_MIN. 
99962306a36Sopenharmony_ci */ 100062306a36Sopenharmony_cistatic int oom_kill_memcg_member(struct task_struct *task, void *message) 100162306a36Sopenharmony_ci{ 100262306a36Sopenharmony_ci if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && 100362306a36Sopenharmony_ci !is_global_init(task)) { 100462306a36Sopenharmony_ci get_task_struct(task); 100562306a36Sopenharmony_ci __oom_kill_process(task, message); 100662306a36Sopenharmony_ci } 100762306a36Sopenharmony_ci return 0; 100862306a36Sopenharmony_ci} 100962306a36Sopenharmony_ci 101062306a36Sopenharmony_cistatic void oom_kill_process(struct oom_control *oc, const char *message) 101162306a36Sopenharmony_ci{ 101262306a36Sopenharmony_ci struct task_struct *victim = oc->chosen; 101362306a36Sopenharmony_ci struct mem_cgroup *oom_group; 101462306a36Sopenharmony_ci static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 101562306a36Sopenharmony_ci DEFAULT_RATELIMIT_BURST); 101662306a36Sopenharmony_ci 101762306a36Sopenharmony_ci /* 101862306a36Sopenharmony_ci * If the task is already exiting, don't alarm the sysadmin or kill 101962306a36Sopenharmony_ci * its children or threads, just give it access to memory reserves 102062306a36Sopenharmony_ci * so it can die quickly 102162306a36Sopenharmony_ci */ 102262306a36Sopenharmony_ci task_lock(victim); 102362306a36Sopenharmony_ci if (task_will_free_mem(victim)) { 102462306a36Sopenharmony_ci mark_oom_victim(victim); 102562306a36Sopenharmony_ci queue_oom_reaper(victim); 102662306a36Sopenharmony_ci task_unlock(victim); 102762306a36Sopenharmony_ci put_task_struct(victim); 102862306a36Sopenharmony_ci return; 102962306a36Sopenharmony_ci } 103062306a36Sopenharmony_ci task_unlock(victim); 103162306a36Sopenharmony_ci 103262306a36Sopenharmony_ci if (__ratelimit(&oom_rs)) 103362306a36Sopenharmony_ci dump_header(oc, victim); 103462306a36Sopenharmony_ci 103562306a36Sopenharmony_ci /* 103662306a36Sopenharmony_ci * Do we need to kill the entire memory cgroup? 
103762306a36Sopenharmony_ci * Or even one of the ancestor memory cgroups? 103862306a36Sopenharmony_ci * Check this out before killing the victim task. 103962306a36Sopenharmony_ci */ 104062306a36Sopenharmony_ci oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); 104162306a36Sopenharmony_ci 104262306a36Sopenharmony_ci __oom_kill_process(victim, message); 104362306a36Sopenharmony_ci 104462306a36Sopenharmony_ci /* 104562306a36Sopenharmony_ci * If necessary, kill all tasks in the selected memory cgroup. 104662306a36Sopenharmony_ci */ 104762306a36Sopenharmony_ci if (oom_group) { 104862306a36Sopenharmony_ci memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL); 104962306a36Sopenharmony_ci mem_cgroup_print_oom_group(oom_group); 105062306a36Sopenharmony_ci mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, 105162306a36Sopenharmony_ci (void *)message); 105262306a36Sopenharmony_ci mem_cgroup_put(oom_group); 105362306a36Sopenharmony_ci } 105462306a36Sopenharmony_ci} 105562306a36Sopenharmony_ci 105662306a36Sopenharmony_ci/* 105762306a36Sopenharmony_ci * Determines whether the kernel must panic because of the panic_on_oom sysctl. 105862306a36Sopenharmony_ci */ 105962306a36Sopenharmony_cistatic void check_panic_on_oom(struct oom_control *oc) 106062306a36Sopenharmony_ci{ 106162306a36Sopenharmony_ci if (likely(!sysctl_panic_on_oom)) 106262306a36Sopenharmony_ci return; 106362306a36Sopenharmony_ci if (sysctl_panic_on_oom != 2) { 106462306a36Sopenharmony_ci /* 106562306a36Sopenharmony_ci * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel 106662306a36Sopenharmony_ci * does not panic for cpuset, mempolicy, or memcg allocation 106762306a36Sopenharmony_ci * failures. 
106862306a36Sopenharmony_ci */ 106962306a36Sopenharmony_ci if (oc->constraint != CONSTRAINT_NONE) 107062306a36Sopenharmony_ci return; 107162306a36Sopenharmony_ci } 107262306a36Sopenharmony_ci /* Do not panic for oom kills triggered by sysrq */ 107362306a36Sopenharmony_ci if (is_sysrq_oom(oc)) 107462306a36Sopenharmony_ci return; 107562306a36Sopenharmony_ci dump_header(oc, NULL); 107662306a36Sopenharmony_ci panic("Out of memory: %s panic_on_oom is enabled\n", 107762306a36Sopenharmony_ci sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 107862306a36Sopenharmony_ci} 107962306a36Sopenharmony_ci 108062306a36Sopenharmony_cistatic BLOCKING_NOTIFIER_HEAD(oom_notify_list); 108162306a36Sopenharmony_ci 108262306a36Sopenharmony_ciint register_oom_notifier(struct notifier_block *nb) 108362306a36Sopenharmony_ci{ 108462306a36Sopenharmony_ci return blocking_notifier_chain_register(&oom_notify_list, nb); 108562306a36Sopenharmony_ci} 108662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(register_oom_notifier); 108762306a36Sopenharmony_ci 108862306a36Sopenharmony_ciint unregister_oom_notifier(struct notifier_block *nb) 108962306a36Sopenharmony_ci{ 109062306a36Sopenharmony_ci return blocking_notifier_chain_unregister(&oom_notify_list, nb); 109162306a36Sopenharmony_ci} 109262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(unregister_oom_notifier); 109362306a36Sopenharmony_ci 109462306a36Sopenharmony_ci/** 109562306a36Sopenharmony_ci * out_of_memory - kill the "best" process when we run out of memory 109662306a36Sopenharmony_ci * @oc: pointer to struct oom_control 109762306a36Sopenharmony_ci * 109862306a36Sopenharmony_ci * If we run out of memory, we have the choice between either 109962306a36Sopenharmony_ci * killing a random task (bad), letting the system crash (worse) 110062306a36Sopenharmony_ci * OR try to be smart about which process to kill. Note that we 110162306a36Sopenharmony_ci * don't have to be perfect here, we just have to be good. 
110262306a36Sopenharmony_ci */ 110362306a36Sopenharmony_cibool out_of_memory(struct oom_control *oc) 110462306a36Sopenharmony_ci{ 110562306a36Sopenharmony_ci unsigned long freed = 0; 110662306a36Sopenharmony_ci 110762306a36Sopenharmony_ci if (oom_killer_disabled) 110862306a36Sopenharmony_ci return false; 110962306a36Sopenharmony_ci 111062306a36Sopenharmony_ci if (!is_memcg_oom(oc)) { 111162306a36Sopenharmony_ci blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 111262306a36Sopenharmony_ci if (freed > 0 && !is_sysrq_oom(oc)) 111362306a36Sopenharmony_ci /* Got some memory back in the last second. */ 111462306a36Sopenharmony_ci return true; 111562306a36Sopenharmony_ci } 111662306a36Sopenharmony_ci 111762306a36Sopenharmony_ci /* 111862306a36Sopenharmony_ci * If current has a pending SIGKILL or is exiting, then automatically 111962306a36Sopenharmony_ci * select it. The goal is to allow it to allocate so that it may 112062306a36Sopenharmony_ci * quickly exit and free its memory. 112162306a36Sopenharmony_ci */ 112262306a36Sopenharmony_ci if (task_will_free_mem(current)) { 112362306a36Sopenharmony_ci mark_oom_victim(current); 112462306a36Sopenharmony_ci queue_oom_reaper(current); 112562306a36Sopenharmony_ci return true; 112662306a36Sopenharmony_ci } 112762306a36Sopenharmony_ci 112862306a36Sopenharmony_ci /* 112962306a36Sopenharmony_ci * The OOM killer does not compensate for IO-less reclaim. 113062306a36Sopenharmony_ci * But mem_cgroup_oom() has to invoke the OOM killer even 113162306a36Sopenharmony_ci * if it is a GFP_NOFS allocation. 113262306a36Sopenharmony_ci */ 113362306a36Sopenharmony_ci if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) 113462306a36Sopenharmony_ci return true; 113562306a36Sopenharmony_ci 113662306a36Sopenharmony_ci /* 113762306a36Sopenharmony_ci * Check if there were limitations on the allocation (only relevant for 113862306a36Sopenharmony_ci * NUMA and memcg) that may require different handling. 
113962306a36Sopenharmony_ci */ 114062306a36Sopenharmony_ci oc->constraint = constrained_alloc(oc); 114162306a36Sopenharmony_ci if (oc->constraint != CONSTRAINT_MEMORY_POLICY) 114262306a36Sopenharmony_ci oc->nodemask = NULL; 114362306a36Sopenharmony_ci check_panic_on_oom(oc); 114462306a36Sopenharmony_ci 114562306a36Sopenharmony_ci if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && 114662306a36Sopenharmony_ci current->mm && !oom_unkillable_task(current) && 114762306a36Sopenharmony_ci oom_cpuset_eligible(current, oc) && 114862306a36Sopenharmony_ci current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { 114962306a36Sopenharmony_ci get_task_struct(current); 115062306a36Sopenharmony_ci oc->chosen = current; 115162306a36Sopenharmony_ci oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); 115262306a36Sopenharmony_ci return true; 115362306a36Sopenharmony_ci } 115462306a36Sopenharmony_ci 115562306a36Sopenharmony_ci select_bad_process(oc); 115662306a36Sopenharmony_ci /* Found nothing?!?! */ 115762306a36Sopenharmony_ci if (!oc->chosen) { 115862306a36Sopenharmony_ci dump_header(oc, NULL); 115962306a36Sopenharmony_ci pr_warn("Out of memory and no killable processes...\n"); 116062306a36Sopenharmony_ci /* 116162306a36Sopenharmony_ci * If we got here due to an actual allocation at the 116262306a36Sopenharmony_ci * system level, we cannot survive this and will enter 116362306a36Sopenharmony_ci * an endless loop in the allocator. Bail out now. 116462306a36Sopenharmony_ci */ 116562306a36Sopenharmony_ci if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) 116662306a36Sopenharmony_ci panic("System is deadlocked on memory\n"); 116762306a36Sopenharmony_ci } 116862306a36Sopenharmony_ci if (oc->chosen && oc->chosen != (void *)-1UL) 116962306a36Sopenharmony_ci oom_kill_process(oc, !is_memcg_oom(oc) ? 
"Out of memory" : 117062306a36Sopenharmony_ci "Memory cgroup out of memory"); 117162306a36Sopenharmony_ci return !!oc->chosen; 117262306a36Sopenharmony_ci} 117362306a36Sopenharmony_ci 117462306a36Sopenharmony_ci/* 117562306a36Sopenharmony_ci * The pagefault handler calls here because some allocation has failed. We have 117662306a36Sopenharmony_ci * to take care of the memcg OOM here because this is the only safe context without 117762306a36Sopenharmony_ci * any locks held but let the oom killer triggered from the allocation context care 117862306a36Sopenharmony_ci * about the global OOM. 117962306a36Sopenharmony_ci */ 118062306a36Sopenharmony_civoid pagefault_out_of_memory(void) 118162306a36Sopenharmony_ci{ 118262306a36Sopenharmony_ci static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL, 118362306a36Sopenharmony_ci DEFAULT_RATELIMIT_BURST); 118462306a36Sopenharmony_ci 118562306a36Sopenharmony_ci if (mem_cgroup_oom_synchronize(true)) 118662306a36Sopenharmony_ci return; 118762306a36Sopenharmony_ci 118862306a36Sopenharmony_ci if (fatal_signal_pending(current)) 118962306a36Sopenharmony_ci return; 119062306a36Sopenharmony_ci 119162306a36Sopenharmony_ci if (__ratelimit(&pfoom_rs)) 119262306a36Sopenharmony_ci pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. 
Retrying PF\n"); 119362306a36Sopenharmony_ci} 119462306a36Sopenharmony_ci 119562306a36Sopenharmony_ciSYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) 119662306a36Sopenharmony_ci{ 119762306a36Sopenharmony_ci#ifdef CONFIG_MMU 119862306a36Sopenharmony_ci struct mm_struct *mm = NULL; 119962306a36Sopenharmony_ci struct task_struct *task; 120062306a36Sopenharmony_ci struct task_struct *p; 120162306a36Sopenharmony_ci unsigned int f_flags; 120262306a36Sopenharmony_ci bool reap = false; 120362306a36Sopenharmony_ci long ret = 0; 120462306a36Sopenharmony_ci 120562306a36Sopenharmony_ci if (flags) 120662306a36Sopenharmony_ci return -EINVAL; 120762306a36Sopenharmony_ci 120862306a36Sopenharmony_ci task = pidfd_get_task(pidfd, &f_flags); 120962306a36Sopenharmony_ci if (IS_ERR(task)) 121062306a36Sopenharmony_ci return PTR_ERR(task); 121162306a36Sopenharmony_ci 121262306a36Sopenharmony_ci /* 121362306a36Sopenharmony_ci * Make sure to choose a thread which still has a reference to mm 121462306a36Sopenharmony_ci * during the group exit 121562306a36Sopenharmony_ci */ 121662306a36Sopenharmony_ci p = find_lock_task_mm(task); 121762306a36Sopenharmony_ci if (!p) { 121862306a36Sopenharmony_ci ret = -ESRCH; 121962306a36Sopenharmony_ci goto put_task; 122062306a36Sopenharmony_ci } 122162306a36Sopenharmony_ci 122262306a36Sopenharmony_ci mm = p->mm; 122362306a36Sopenharmony_ci mmgrab(mm); 122462306a36Sopenharmony_ci 122562306a36Sopenharmony_ci if (task_will_free_mem(p)) 122662306a36Sopenharmony_ci reap = true; 122762306a36Sopenharmony_ci else { 122862306a36Sopenharmony_ci /* Error only if the work has not been done already */ 122962306a36Sopenharmony_ci if (!test_bit(MMF_OOM_SKIP, &mm->flags)) 123062306a36Sopenharmony_ci ret = -EINVAL; 123162306a36Sopenharmony_ci } 123262306a36Sopenharmony_ci task_unlock(p); 123362306a36Sopenharmony_ci 123462306a36Sopenharmony_ci if (!reap) 123562306a36Sopenharmony_ci goto drop_mm; 123662306a36Sopenharmony_ci 123762306a36Sopenharmony_ci if 
(mmap_read_lock_killable(mm)) { 123862306a36Sopenharmony_ci ret = -EINTR; 123962306a36Sopenharmony_ci goto drop_mm; 124062306a36Sopenharmony_ci } 124162306a36Sopenharmony_ci /* 124262306a36Sopenharmony_ci * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure 124362306a36Sopenharmony_ci * possible change in exit_mmap is seen 124462306a36Sopenharmony_ci */ 124562306a36Sopenharmony_ci if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) 124662306a36Sopenharmony_ci ret = -EAGAIN; 124762306a36Sopenharmony_ci mmap_read_unlock(mm); 124862306a36Sopenharmony_ci 124962306a36Sopenharmony_cidrop_mm: 125062306a36Sopenharmony_ci mmdrop(mm); 125162306a36Sopenharmony_ciput_task: 125262306a36Sopenharmony_ci put_task_struct(task); 125362306a36Sopenharmony_ci return ret; 125462306a36Sopenharmony_ci#else 125562306a36Sopenharmony_ci return -ENOSYS; 125662306a36Sopenharmony_ci#endif /* CONFIG_MMU */ 125762306a36Sopenharmony_ci} 1258