162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci *  linux/mm/oom_kill.c
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci *  Copyright (C)  1998,2000  Rik van Riel
662306a36Sopenharmony_ci *	Thanks go out to Claus Fischer for some serious inspiration and
762306a36Sopenharmony_ci *	for goading me into coding this file...
862306a36Sopenharmony_ci *  Copyright (C)  2010  Google, Inc.
962306a36Sopenharmony_ci *	Rewritten by David Rientjes
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci *  The routines in this file are used to kill a process when
1262306a36Sopenharmony_ci *  we're seriously out of memory. This gets called from __alloc_pages()
1362306a36Sopenharmony_ci *  in mm/page_alloc.c when we really run out of memory.
1462306a36Sopenharmony_ci *
1562306a36Sopenharmony_ci *  Since we won't call these routines often (on a well-configured
1662306a36Sopenharmony_ci *  machine) this file will double as a 'coding guide' and a signpost
1762306a36Sopenharmony_ci *  for newbie kernel hackers. It features several pointers to major
1862306a36Sopenharmony_ci *  kernel subsystems and hints as to where to find out what things do.
1962306a36Sopenharmony_ci */
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ci#include <linux/oom.h>
2262306a36Sopenharmony_ci#include <linux/mm.h>
2362306a36Sopenharmony_ci#include <linux/err.h>
2462306a36Sopenharmony_ci#include <linux/gfp.h>
2562306a36Sopenharmony_ci#include <linux/sched.h>
2662306a36Sopenharmony_ci#include <linux/sched/mm.h>
2762306a36Sopenharmony_ci#include <linux/sched/coredump.h>
2862306a36Sopenharmony_ci#include <linux/sched/task.h>
2962306a36Sopenharmony_ci#include <linux/sched/debug.h>
3062306a36Sopenharmony_ci#include <linux/swap.h>
3162306a36Sopenharmony_ci#include <linux/syscalls.h>
3262306a36Sopenharmony_ci#include <linux/timex.h>
3362306a36Sopenharmony_ci#include <linux/jiffies.h>
3462306a36Sopenharmony_ci#include <linux/cpuset.h>
3562306a36Sopenharmony_ci#include <linux/export.h>
3662306a36Sopenharmony_ci#include <linux/notifier.h>
3762306a36Sopenharmony_ci#include <linux/memcontrol.h>
3862306a36Sopenharmony_ci#include <linux/mempolicy.h>
3962306a36Sopenharmony_ci#include <linux/security.h>
4062306a36Sopenharmony_ci#include <linux/ptrace.h>
4162306a36Sopenharmony_ci#include <linux/freezer.h>
4262306a36Sopenharmony_ci#include <linux/ftrace.h>
4362306a36Sopenharmony_ci#include <linux/ratelimit.h>
4462306a36Sopenharmony_ci#include <linux/kthread.h>
4562306a36Sopenharmony_ci#include <linux/init.h>
4662306a36Sopenharmony_ci#include <linux/mmu_notifier.h>
4762306a36Sopenharmony_ci
4862306a36Sopenharmony_ci#include <asm/tlb.h>
4962306a36Sopenharmony_ci#include "internal.h"
5062306a36Sopenharmony_ci#include "slab.h"
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_ci#define CREATE_TRACE_POINTS
5362306a36Sopenharmony_ci#include <trace/events/oom.h>
5462306a36Sopenharmony_ci
/*
 * OOM sysctl knobs (registered elsewhere in this file):
 * - panic_on_oom: name suggests panic instead of selecting a victim —
 *   see the sysctl registration for exact semantics.
 * - oom_kill_allocating_task: name suggests killing current rather than
 *   scanning for the best victim — verify against out_of_memory().
 * - oom_dump_tasks: dump the eligible task list on each kill (default on,
 *   consumed by dump_header()).
 */
static int sysctl_panic_on_oom;
static int sysctl_oom_kill_allocating_task;
static int sysctl_oom_dump_tasks = 1;

/*
 * Serializes oom killer invocations (out_of_memory()) from all contexts to
 * prevent from over eager oom killing (e.g. when the oom killer is invoked
 * from different domains).
 *
 * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
 * and mark_oom_victim
 */
DEFINE_MUTEX(oom_lock);
/* Serializes oom_score_adj and oom_score_adj_min updates */
DEFINE_MUTEX(oom_adj_mutex);
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_cistatic inline bool is_memcg_oom(struct oom_control *oc)
7262306a36Sopenharmony_ci{
7362306a36Sopenharmony_ci	return oc->memcg != NULL;
7462306a36Sopenharmony_ci}
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci#ifdef CONFIG_NUMA
7762306a36Sopenharmony_ci/**
7862306a36Sopenharmony_ci * oom_cpuset_eligible() - check task eligibility for kill
7962306a36Sopenharmony_ci * @start: task struct of which task to consider
8062306a36Sopenharmony_ci * @oc: pointer to struct oom_control
8162306a36Sopenharmony_ci *
 * Task eligibility is determined by whether or not a candidate task, @start,
8362306a36Sopenharmony_ci * shares the same mempolicy nodes as current if it is bound by such a policy
8462306a36Sopenharmony_ci * and whether or not it has the same set of allowed cpuset nodes.
8562306a36Sopenharmony_ci *
8662306a36Sopenharmony_ci * This function is assuming oom-killer context and 'current' has triggered
8762306a36Sopenharmony_ci * the oom-killer.
8862306a36Sopenharmony_ci */
8962306a36Sopenharmony_cistatic bool oom_cpuset_eligible(struct task_struct *start,
9062306a36Sopenharmony_ci				struct oom_control *oc)
9162306a36Sopenharmony_ci{
9262306a36Sopenharmony_ci	struct task_struct *tsk;
9362306a36Sopenharmony_ci	bool ret = false;
9462306a36Sopenharmony_ci	const nodemask_t *mask = oc->nodemask;
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci	rcu_read_lock();
9762306a36Sopenharmony_ci	for_each_thread(start, tsk) {
9862306a36Sopenharmony_ci		if (mask) {
9962306a36Sopenharmony_ci			/*
10062306a36Sopenharmony_ci			 * If this is a mempolicy constrained oom, tsk's
10162306a36Sopenharmony_ci			 * cpuset is irrelevant.  Only return true if its
10262306a36Sopenharmony_ci			 * mempolicy intersects current, otherwise it may be
10362306a36Sopenharmony_ci			 * needlessly killed.
10462306a36Sopenharmony_ci			 */
10562306a36Sopenharmony_ci			ret = mempolicy_in_oom_domain(tsk, mask);
10662306a36Sopenharmony_ci		} else {
10762306a36Sopenharmony_ci			/*
10862306a36Sopenharmony_ci			 * This is not a mempolicy constrained oom, so only
10962306a36Sopenharmony_ci			 * check the mems of tsk's cpuset.
11062306a36Sopenharmony_ci			 */
11162306a36Sopenharmony_ci			ret = cpuset_mems_allowed_intersects(current, tsk);
11262306a36Sopenharmony_ci		}
11362306a36Sopenharmony_ci		if (ret)
11462306a36Sopenharmony_ci			break;
11562306a36Sopenharmony_ci	}
11662306a36Sopenharmony_ci	rcu_read_unlock();
11762306a36Sopenharmony_ci
11862306a36Sopenharmony_ci	return ret;
11962306a36Sopenharmony_ci}
12062306a36Sopenharmony_ci#else
static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
{
	/* Without NUMA there is only one node, so every task is eligible. */
	return true;
}
12562306a36Sopenharmony_ci#endif /* CONFIG_NUMA */
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci/*
12862306a36Sopenharmony_ci * The process p may have detached its own ->mm while exiting or through
12962306a36Sopenharmony_ci * kthread_use_mm(), but one or more of its subthreads may still have a valid
13062306a36Sopenharmony_ci * pointer.  Return p, or any of its subthreads with a valid ->mm, with
13162306a36Sopenharmony_ci * task_lock() held.
13262306a36Sopenharmony_ci */
13362306a36Sopenharmony_cistruct task_struct *find_lock_task_mm(struct task_struct *p)
13462306a36Sopenharmony_ci{
13562306a36Sopenharmony_ci	struct task_struct *t;
13662306a36Sopenharmony_ci
13762306a36Sopenharmony_ci	rcu_read_lock();
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci	for_each_thread(p, t) {
14062306a36Sopenharmony_ci		task_lock(t);
14162306a36Sopenharmony_ci		if (likely(t->mm))
14262306a36Sopenharmony_ci			goto found;
14362306a36Sopenharmony_ci		task_unlock(t);
14462306a36Sopenharmony_ci	}
14562306a36Sopenharmony_ci	t = NULL;
14662306a36Sopenharmony_cifound:
14762306a36Sopenharmony_ci	rcu_read_unlock();
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_ci	return t;
15062306a36Sopenharmony_ci}
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci/*
15362306a36Sopenharmony_ci * order == -1 means the oom kill is required by sysrq, otherwise only
15462306a36Sopenharmony_ci * for display purposes.
15562306a36Sopenharmony_ci */
15662306a36Sopenharmony_cistatic inline bool is_sysrq_oom(struct oom_control *oc)
15762306a36Sopenharmony_ci{
15862306a36Sopenharmony_ci	return oc->order == -1;
15962306a36Sopenharmony_ci}
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci/* return true if the task is not adequate as candidate victim task. */
16262306a36Sopenharmony_cistatic bool oom_unkillable_task(struct task_struct *p)
16362306a36Sopenharmony_ci{
16462306a36Sopenharmony_ci	if (is_global_init(p))
16562306a36Sopenharmony_ci		return true;
16662306a36Sopenharmony_ci	if (p->flags & PF_KTHREAD)
16762306a36Sopenharmony_ci		return true;
16862306a36Sopenharmony_ci	return false;
16962306a36Sopenharmony_ci}
17062306a36Sopenharmony_ci
17162306a36Sopenharmony_ci/*
17262306a36Sopenharmony_ci * Check whether unreclaimable slab amount is greater than
17362306a36Sopenharmony_ci * all user memory(LRU pages).
17462306a36Sopenharmony_ci * dump_unreclaimable_slab() could help in the case that
17562306a36Sopenharmony_ci * oom due to too much unreclaimable slab used by kernel.
17662306a36Sopenharmony_ci*/
17762306a36Sopenharmony_cistatic bool should_dump_unreclaim_slab(void)
17862306a36Sopenharmony_ci{
17962306a36Sopenharmony_ci	unsigned long nr_lru;
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci	nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
18262306a36Sopenharmony_ci		 global_node_page_state(NR_INACTIVE_ANON) +
18362306a36Sopenharmony_ci		 global_node_page_state(NR_ACTIVE_FILE) +
18462306a36Sopenharmony_ci		 global_node_page_state(NR_INACTIVE_FILE) +
18562306a36Sopenharmony_ci		 global_node_page_state(NR_ISOLATED_ANON) +
18662306a36Sopenharmony_ci		 global_node_page_state(NR_ISOLATED_FILE) +
18762306a36Sopenharmony_ci		 global_node_page_state(NR_UNEVICTABLE);
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_ci	return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
19062306a36Sopenharmony_ci}
19162306a36Sopenharmony_ci
/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of which task we should calculate
 * @totalpages: total present RAM allowed for page allocation
 *
 * The heuristic for determining which task to kill is made to be as simple and
 * predictable as possible.  The goal is to return the highest value for the
 * task consuming the most memory to avoid subsequent oom failures.
 *
 * Return: badness points (higher means more likely to be selected), or
 * LONG_MIN when the task is not eligible at all.
 */
long oom_badness(struct task_struct *p, unsigned long totalpages)
{
	long points;
	long adj;

	/* init and kernel threads can never be victims. */
	if (oom_unkillable_task(p))
		return LONG_MIN;

	/* Find a live thread still holding ->mm; returns with task_lock held. */
	p = find_lock_task_mm(p);
	if (!p)
		return LONG_MIN;

	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable, have already been oom reaped, or are in the middle
	 * of a vfork.
	 */
	adj = (long)p->signal->oom_score_adj;
	if (adj == OOM_SCORE_ADJ_MIN ||
			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
			in_vfork(p)) {
		task_unlock(p);
		return LONG_MIN;
	}

	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
	task_unlock(p);

	/* Normalize to oom_score_adj units: one adj point is 0.1% of RAM. */
	adj *= totalpages / 1000;
	points += adj;

	return points;
}
24062306a36Sopenharmony_ci
/* Human-readable constraint names, indexed by enum oom_constraint. */
static const char * const oom_constraint_text[] = {
	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};
24762306a36Sopenharmony_ci
/*
 * Determine the type of allocation constraint and, as a side effect, set
 * oc->totalpages to the amount of memory the constrained domain can use.
 */
static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
	struct zone *zone;
	struct zoneref *z;
	enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
	bool cpuset_limited = false;
	int nid;

	if (is_memcg_oom(oc)) {
		/* The ?: 1 guards against a zero memcg limit. */
		oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
		return CONSTRAINT_MEMCG;
	}

	/* Default to all available memory */
	oc->totalpages = totalram_pages() + total_swap_pages;

	if (!IS_ENABLED(CONFIG_NUMA))
		return CONSTRAINT_NONE;

	if (!oc->zonelist)
		return CONSTRAINT_NONE;
	/*
	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
	 * killing current. We have to do a random task kill in this case.
	 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
	 */
	if (oc->gfp_mask & __GFP_THISNODE)
		return CONSTRAINT_NONE;

	/*
	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
	 * the page allocator means a mempolicy is in effect.  Cpuset policy
	 * is enforced in get_page_from_freelist().
	 */
	if (oc->nodemask &&
	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
		/* Only memory (plus swap) on the allowed nodes counts. */
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, *oc->nodemask)
			oc->totalpages += node_present_pages(nid);
		return CONSTRAINT_MEMORY_POLICY;
	}

	/* Check this allocation failure is caused by cpuset's wall function */
	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
			highest_zoneidx, oc->nodemask)
		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
			cpuset_limited = true;

	if (cpuset_limited) {
		/* Scale totalpages down to the cpuset's allowed nodes. */
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, cpuset_current_mems_allowed)
			oc->totalpages += node_present_pages(nid);
		return CONSTRAINT_CPUSET;
	}
	return CONSTRAINT_NONE;
}
30762306a36Sopenharmony_ci
/*
 * Per-task callback for select_bad_process(): remember the worst-scoring
 * eligible task in oc->chosen.  Returns nonzero to abort the scan.
 */
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	long points;

	/* init and kernel threads are never candidates. */
	if (oom_unkillable_task(task))
		goto next;

	/* p may not have freeable memory in nodemask */
	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
		goto next;

	/*
	 * This task already has access to memory reserves and is being killed.
	 * Don't allow any other task to have access to the reserves unless
	 * the task has MMF_OOM_SKIP because chances that it would release
	 * any memory is quite low.
	 */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			goto next;
		goto abort;
	}

	/*
	 * If task is allocating a lot of memory and has been marked to be
	 * killed first if it triggers an oom, then select it.
	 */
	if (oom_task_origin(task)) {
		points = LONG_MAX;
		goto select;
	}

	points = oom_badness(task, oc->totalpages);
	if (points == LONG_MIN || points < oc->chosen_points)
		goto next;

select:
	/* New best candidate: take a reference and drop the previous one. */
	if (oc->chosen)
		put_task_struct(oc->chosen);
	get_task_struct(task);
	oc->chosen = task;
	oc->chosen_points = points;
next:
	return 0;
abort:
	/* Abort the scan; -1 marks "an existing victim is still exiting". */
	if (oc->chosen)
		put_task_struct(oc->chosen);
	oc->chosen = (void *)-1UL;
	return 1;
}
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_ci/*
36162306a36Sopenharmony_ci * Simple selection loop. We choose the process with the highest number of
36262306a36Sopenharmony_ci * 'points'. In case scan was aborted, oc->chosen is set to -1.
36362306a36Sopenharmony_ci */
36462306a36Sopenharmony_cistatic void select_bad_process(struct oom_control *oc)
36562306a36Sopenharmony_ci{
36662306a36Sopenharmony_ci	oc->chosen_points = LONG_MIN;
36762306a36Sopenharmony_ci
36862306a36Sopenharmony_ci	if (is_memcg_oom(oc))
36962306a36Sopenharmony_ci		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
37062306a36Sopenharmony_ci	else {
37162306a36Sopenharmony_ci		struct task_struct *p;
37262306a36Sopenharmony_ci
37362306a36Sopenharmony_ci		rcu_read_lock();
37462306a36Sopenharmony_ci		for_each_process(p)
37562306a36Sopenharmony_ci			if (oom_evaluate_task(p, oc))
37662306a36Sopenharmony_ci				break;
37762306a36Sopenharmony_ci		rcu_read_unlock();
37862306a36Sopenharmony_ci	}
37962306a36Sopenharmony_ci}
38062306a36Sopenharmony_ci
/* Print one row of the OOM task table for @p; callback for dump_tasks(). */
static int dump_task(struct task_struct *p, void *arg)
{
	struct oom_control *oc = arg;
	struct task_struct *task;

	/* init and kernel threads are never shown: they cannot be killed. */
	if (oom_unkillable_task(p))
		return 0;

	/* p may not have freeable memory in nodemask */
	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
		return 0;

	task = find_lock_task_mm(p);
	if (!task) {
		/*
		 * All of p's threads have already detached their mm's. There's
		 * no need to report them; they can't be oom killed anyway.
		 */
		return 0;
	}

	/* Columns match the header printed in dump_tasks(). */
	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
		task->pid, from_kuid(&init_user_ns, task_uid(task)),
		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
		mm_pgtables_bytes(task->mm),
		get_mm_counter(task->mm, MM_SWAPENTS),
		task->signal->oom_score_adj, task->comm);
	task_unlock(task);

	return 0;
}
41262306a36Sopenharmony_ci
41362306a36Sopenharmony_ci/**
41462306a36Sopenharmony_ci * dump_tasks - dump current memory state of all system tasks
41562306a36Sopenharmony_ci * @oc: pointer to struct oom_control
41662306a36Sopenharmony_ci *
41762306a36Sopenharmony_ci * Dumps the current memory state of all eligible tasks.  Tasks not in the same
41862306a36Sopenharmony_ci * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
41962306a36Sopenharmony_ci * are not shown.
42062306a36Sopenharmony_ci * State information includes task's pid, uid, tgid, vm size, rss,
42162306a36Sopenharmony_ci * pgtables_bytes, swapents, oom_score_adj value, and name.
42262306a36Sopenharmony_ci */
42362306a36Sopenharmony_cistatic void dump_tasks(struct oom_control *oc)
42462306a36Sopenharmony_ci{
42562306a36Sopenharmony_ci	pr_info("Tasks state (memory values in pages):\n");
42662306a36Sopenharmony_ci	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
42762306a36Sopenharmony_ci
42862306a36Sopenharmony_ci	if (is_memcg_oom(oc))
42962306a36Sopenharmony_ci		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
43062306a36Sopenharmony_ci	else {
43162306a36Sopenharmony_ci		struct task_struct *p;
43262306a36Sopenharmony_ci
43362306a36Sopenharmony_ci		rcu_read_lock();
43462306a36Sopenharmony_ci		for_each_process(p)
43562306a36Sopenharmony_ci			dump_task(p, oc);
43662306a36Sopenharmony_ci		rcu_read_unlock();
43762306a36Sopenharmony_ci	}
43862306a36Sopenharmony_ci}
43962306a36Sopenharmony_ci
/* Print the one-line "oom-kill:" summary for @victim (continued by pr_cont). */
static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
{
	/* one line summary of the oom killer context. */
	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
			oom_constraint_text[oc->constraint],
			nodemask_pr_args(oc->nodemask));
	cpuset_print_current_mems_allowed();
	mem_cgroup_print_oom_context(oc->memcg, victim);
	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
		from_kuid(&init_user_ns, task_uid(victim)));
}
45162306a36Sopenharmony_ci
/* Dump the full OOM report: trigger context, memory state, task table. */
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	/* Who invoked the killer and with which allocation request. */
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
			current->signal->oom_score_adj);
	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
		pr_warn("COMPACTION is disabled!!!\n");

	dump_stack();
	/* Memory state: memcg-local for memcg ooms, system-wide otherwise. */
	if (is_memcg_oom(oc))
		mem_cgroup_print_oom_meminfo(oc->memcg);
	else {
		__show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
		if (should_dump_unreclaim_slab())
			dump_unreclaimable_slab();
	}
	/* The per-task table is gated by the oom_dump_tasks sysctl. */
	if (sysctl_oom_dump_tasks)
		dump_tasks(oc);
	/* @p may be NULL when no victim was chosen. */
	if (p)
		dump_oom_summary(oc, p);
}
47362306a36Sopenharmony_ci
/*
 * Number of OOM victims in flight
 */
static atomic_t oom_victims = ATOMIC_INIT(0);
/* Woken when an oom victim exits; used by waiters on oom_victims. */
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

/* True while the oom killer must not pick new victims (see oom_lock note). */
static bool oom_killer_disabled __read_mostly;
48162306a36Sopenharmony_ci
48262306a36Sopenharmony_ci/*
48362306a36Sopenharmony_ci * task->mm can be NULL if the task is the exited group leader.  So to
48462306a36Sopenharmony_ci * determine whether the task is using a particular mm, we examine all the
48562306a36Sopenharmony_ci * task's threads: if one of those is using this mm then this task was also
48662306a36Sopenharmony_ci * using it.
48762306a36Sopenharmony_ci */
48862306a36Sopenharmony_cibool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
48962306a36Sopenharmony_ci{
49062306a36Sopenharmony_ci	struct task_struct *t;
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci	for_each_thread(p, t) {
49362306a36Sopenharmony_ci		struct mm_struct *t_mm = READ_ONCE(t->mm);
49462306a36Sopenharmony_ci		if (t_mm)
49562306a36Sopenharmony_ci			return t_mm == mm;
49662306a36Sopenharmony_ci	}
49762306a36Sopenharmony_ci	return false;
49862306a36Sopenharmony_ci}
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci#ifdef CONFIG_MMU
/*
 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 * victim (if that is possible) to help the OOM killer to move on.
 */
static struct task_struct *oom_reaper_th;
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
/* List of victims queued for reaping; presumably guarded by oom_reaper_lock
 * — the enqueue/dequeue sides are outside this chunk, verify there. */
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
50962306a36Sopenharmony_ci
/*
 * Walk every VMA of @mm and unmap the anonymous, non-shared ranges.
 * Returns false when at least one VMA could not be reaped (blocking
 * mmu notifier) and the caller should retry.
 */
static bool __oom_reap_task_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	bool ret = true;
	VMA_ITERATOR(vmi, mm, 0);

	/*
	 * Tell all users of get_user/copy_from_user etc... that the content
	 * is no longer stable. No barriers really needed because unmapping
	 * should imply barriers already and the reader would hit a page fault
	 * if it stumbled over a reaped memory.
	 */
	set_bit(MMF_UNSTABLE, &mm->flags);

	for_each_vma(vmi, vma) {
		/* Skip hugetlb and PFN-mapped VMAs. */
		if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
			continue;

		/*
		 * Only anonymous pages have a good chance to be dropped
		 * without additional steps which we cannot afford as we
		 * are OOM already.
		 *
		 * We do not even care about fs backed pages because all
		 * which are reclaimable have already been reclaimed and
		 * we do not want to block exit_mmap by keeping mm ref
		 * count elevated without a good reason.
		 */
		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
			struct mmu_notifier_range range;
			struct mmu_gather tlb;

			mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
						mm, vma->vm_start,
						vma->vm_end);
			tlb_gather_mmu(&tlb, mm);
			if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
				/*
				 * A notifier would need to block: give up on
				 * this VMA and report partial success so the
				 * caller retries later.
				 */
				tlb_finish_mmu(&tlb);
				ret = false;
				continue;
			}
			unmap_page_range(&tlb, vma, range.start, range.end, NULL);
			mmu_notifier_invalidate_range_end(&range);
			tlb_finish_mmu(&tlb);
		}
	}

	return ret;
}
55962306a36Sopenharmony_ci
/*
 * Reaps the address space of the given task.
 *
 * Returns true on success and false if none or part of the address space
 * has been reclaimed and the caller should retry later.
 */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	bool ret = true;

	/* Never block on mmap_lock here; the caller simply retries. */
	if (!mmap_read_trylock(mm)) {
		trace_skip_task_reaping(tsk->pid);
		return false;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
	 * under mmap_lock for reading because it serializes against the
	 * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		trace_skip_task_reaping(tsk->pid);
		goto out_unlock;
	}

	trace_start_task_reaping(tsk->pid);

	/* failed to reap part of the address space. Try again later */
	ret = __oom_reap_task_mm(mm);
	if (!ret)
		goto out_finish;

	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
	trace_finish_task_reaping(tsk->pid);
out_unlock:
	mmap_read_unlock(mm);

	return ret;
}
60562306a36Sopenharmony_ci
#define MAX_OOM_REAP_RETRIES 10
/*
 * Try to reap tsk->signal->oom_mm, retrying a bounded number of times when
 * the mmap_lock cannot be taken, then mark the mm MMF_OOM_SKIP so the OOM
 * killer can select another victim. Drops the task reference taken by
 * queue_oom_reaper().
 */
static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;
	struct mm_struct *mm = tsk->signal->oom_mm;

	/* Retry the mmap_read_trylock(mm) a few times */
	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	/*
	 * attempts <= MAX_OOM_REAP_RETRIES means the loop above ended because
	 * oom_reap_task_mm() succeeded, not because retries were exhausted;
	 * MMF_OOM_SKIP means somebody else already finished with this mm.
	 */
	if (attempts <= MAX_OOM_REAP_RETRIES ||
	    test_bit(MMF_OOM_SKIP, &mm->flags))
		goto done;

	/* All retries failed: dump diagnostics about the stuck victim. */
	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
		task_pid_nr(tsk), tsk->comm);
	sched_show_task(tsk);
	debug_show_all_locks();

done:
	tsk->oom_reaper_list = NULL;

	/*
	 * Hide this mm from OOM killer because it has been either reaped or
	 * somebody can't call mmap_write_unlock(mm).
	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* Drop a reference taken by queue_oom_reaper */
	put_task_struct(tsk);
}
63762306a36Sopenharmony_ci
/*
 * Main loop of the oom_reaper kernel thread: sleep until a victim is queued
 * on oom_reaper_list, then pop entries one at a time and reap them.
 * Never returns in practice; the trailing return placates the compiler.
 */
static int oom_reaper(void *unused)
{
	set_freezable();

	while (true) {
		struct task_struct *tsk = NULL;

		/* Sleep (freezably) until wake_oom_reaper() queues a victim. */
		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
		spin_lock_irq(&oom_reaper_lock);
		/* Re-check under the lock: the list may have been drained. */
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock_irq(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);
	}

	return 0;
}
65962306a36Sopenharmony_ci
/*
 * Timer callback armed by queue_oom_reaper(): once the grace period expires,
 * push the victim onto oom_reaper_list and wake the reaper thread. Runs in
 * timer context, hence the irqsave locking.
 */
static void wake_oom_reaper(struct timer_list *timer)
{
	struct task_struct *tsk = container_of(timer, struct task_struct,
			oom_reaper_timer);
	struct mm_struct *mm = tsk->signal->oom_mm;
	unsigned long flags;

	/* The victim managed to terminate on its own - see exit_mmap */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		/* Drop the reference taken by queue_oom_reaper(). */
		put_task_struct(tsk);
		return;
	}

	/* Push onto the singly-linked reaper list and notify the reaper. */
	spin_lock_irqsave(&oom_reaper_lock, flags);
	tsk->oom_reaper_list = oom_reaper_list;
	oom_reaper_list = tsk;
	spin_unlock_irqrestore(&oom_reaper_lock, flags);
	trace_wake_reaper(tsk->pid);
	wake_up(&oom_reaper_wait);
}
68062306a36Sopenharmony_ci
/*
 * Give the OOM victim time to exit naturally before invoking the oom_reaping.
 * The timers timeout is arbitrary... the longer it is, the longer the worst
 * case scenario for the OOM can take. If it is too small, the oom_reaper can
 * get in the way and release resources needed by the process exit path.
 * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
 * before the exit path is able to wake the futex waiters.
 */
#define OOM_REAPER_DELAY (2*HZ)
static void queue_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;

	/* Pin the task; wake_oom_reaper()/oom_reap_task() drops the ref. */
	get_task_struct(tsk);
	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
	add_timer(&tsk->oom_reaper_timer);
}
70162306a36Sopenharmony_ci
#ifdef CONFIG_SYSCTL
/* OOM-kill policy knobs exported under /proc/sys/vm/. */
static struct ctl_table vm_oom_kill_table[] = {
	{
		.procname	= "panic_on_oom",
		.data		= &sysctl_panic_on_oom,
		.maxlen		= sizeof(sysctl_panic_on_oom),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		/* clamped to 0..2 — see check_panic_on_oom() for semantics */
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "oom_kill_allocating_task",
		.data		= &sysctl_oom_kill_allocating_task,
		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "oom_dump_tasks",
		.data		= &sysctl_oom_dump_tasks,
		.maxlen		= sizeof(sysctl_oom_dump_tasks),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{}	/* sentinel */
};
#endif
73062306a36Sopenharmony_ci
/* Boot-time init: start the oom_reaper kthread and register the sysctls. */
static int __init oom_init(void)
{
	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
#ifdef CONFIG_SYSCTL
	register_sysctl_init("vm", vm_oom_kill_table);
#endif
	return 0;
}
subsys_initcall(oom_init)
74062306a36Sopenharmony_ci#else
/* !CONFIG_MMU stub: there is no oom_reaper thread without an MMU. */
static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
74462306a36Sopenharmony_ci#endif /* CONFIG_MMU */
74562306a36Sopenharmony_ci
/**
 * mark_oom_victim - mark the given task as OOM victim
 * @tsk: task to mark
 *
 * Has to be called with oom_lock held and never after
 * oom has been disabled already.
 *
 * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
 * under task_lock or operate on the current).
 */
static void mark_oom_victim(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;

	WARN_ON(oom_killer_disabled);
	/* OOM killer might race with memcg OOM */
	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
		return;

	/*
	 * oom_mm is bound to the signal struct life time.
	 * Only the first caller installs the mm and takes the mm reference;
	 * the cmpxchg returns NULL exactly once.
	 */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
		mmgrab(tsk->signal->oom_mm);

	/*
	 * Make sure that the task is woken up from uninterruptible sleep
	 * if it is frozen because OOM killer wouldn't be able to free
	 * any memory and livelock. freezing_slow_path will tell the freezer
	 * that TIF_MEMDIE tasks should be ignored.
	 */
	__thaw_task(tsk);
	atomic_inc(&oom_victims);
	trace_mark_victim(tsk->pid);
}
77962306a36Sopenharmony_ci
78062306a36Sopenharmony_ci/**
78162306a36Sopenharmony_ci * exit_oom_victim - note the exit of an OOM victim
78262306a36Sopenharmony_ci */
78362306a36Sopenharmony_civoid exit_oom_victim(void)
78462306a36Sopenharmony_ci{
78562306a36Sopenharmony_ci	clear_thread_flag(TIF_MEMDIE);
78662306a36Sopenharmony_ci
78762306a36Sopenharmony_ci	if (!atomic_dec_return(&oom_victims))
78862306a36Sopenharmony_ci		wake_up_all(&oom_victims_wait);
78962306a36Sopenharmony_ci}
79062306a36Sopenharmony_ci
/**
 * oom_killer_enable - enable OOM killer
 *
 * Counterpart of oom_killer_disable(): allows the OOM killer to run again.
 */
void oom_killer_enable(void)
{
	oom_killer_disabled = false;
	pr_info("OOM killer enabled.\n");
}
79962306a36Sopenharmony_ci
/**
 * oom_killer_disable - disable OOM killer
 * @timeout: maximum timeout to wait for oom victims in jiffies
 *
 * Forces all page allocations to fail rather than trigger OOM killer.
 * Will block and wait until all OOM victims are killed or the given
 * timeout expires.
 *
 * The function cannot be called when there are runnable user tasks because
 * the userspace would see unexpected allocation failures as a result. Any
 * new usage of this function should be consulted with MM people.
 *
 * Returns true if successful and false if the OOM killer cannot be
 * disabled.
 */
bool oom_killer_disable(signed long timeout)
{
	signed long ret;

	/*
	 * Make sure to not race with an ongoing OOM killer. Check that the
	 * current is not killed (possibly due to sharing the victim's memory).
	 */
	if (mutex_lock_killable(&oom_lock))
		return false;
	oom_killer_disabled = true;
	mutex_unlock(&oom_lock);

	/* Wait for in-flight victims to exit, bounded by @timeout. */
	ret = wait_event_interruptible_timeout(oom_victims_wait,
			!atomic_read(&oom_victims), timeout);
	if (ret <= 0) {
		/* Timed out or interrupted: roll the disable back. */
		oom_killer_enable();
		return false;
	}
	pr_info("OOM killer disabled.\n");

	return true;
}
83862306a36Sopenharmony_ci
83962306a36Sopenharmony_cistatic inline bool __task_will_free_mem(struct task_struct *task)
84062306a36Sopenharmony_ci{
84162306a36Sopenharmony_ci	struct signal_struct *sig = task->signal;
84262306a36Sopenharmony_ci
84362306a36Sopenharmony_ci	/*
84462306a36Sopenharmony_ci	 * A coredumping process may sleep for an extended period in
84562306a36Sopenharmony_ci	 * coredump_task_exit(), so the oom killer cannot assume that
84662306a36Sopenharmony_ci	 * the process will promptly exit and release memory.
84762306a36Sopenharmony_ci	 */
84862306a36Sopenharmony_ci	if (sig->core_state)
84962306a36Sopenharmony_ci		return false;
85062306a36Sopenharmony_ci
85162306a36Sopenharmony_ci	if (sig->flags & SIGNAL_GROUP_EXIT)
85262306a36Sopenharmony_ci		return true;
85362306a36Sopenharmony_ci
85462306a36Sopenharmony_ci	if (thread_group_empty(task) && (task->flags & PF_EXITING))
85562306a36Sopenharmony_ci		return true;
85662306a36Sopenharmony_ci
85762306a36Sopenharmony_ci	return false;
85862306a36Sopenharmony_ci}
85962306a36Sopenharmony_ci
/*
 * Checks whether the given task is dying or exiting and likely to
 * release its address space. This means that all threads and processes
 * sharing the same mm have to be killed or exiting.
 * Caller has to make sure that task->mm is stable (hold task_lock or
 * it operates on the current).
 */
static bool task_will_free_mem(struct task_struct *task)
{
	struct mm_struct *mm = task->mm;
	struct task_struct *p;
	bool ret = true;

	/*
	 * Skip tasks without mm because it might have passed its exit_mm and
	 * exit_oom_victim. oom_reaper could have rescued that but do not rely
	 * on that for now. We can consider find_lock_task_mm in future.
	 */
	if (!mm)
		return false;

	if (!__task_will_free_mem(task))
		return false;

	/*
	 * This task has already been drained by the oom reaper so there are
	 * only small chances it will free some more
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags))
		return false;

	/* Fast path: nobody else holds a user reference to this mm. */
	if (atomic_read(&mm->mm_users) <= 1)
		return true;

	/*
	 * Make sure that all tasks which share the mm with the given tasks
	 * are dying as well to make sure that a) nobody pins its mm and
	 * b) the task is also reapable by the oom reaper.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(task, p))
			continue;
		ret = __task_will_free_mem(p);
		if (!ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}
91362306a36Sopenharmony_ci
/*
 * Actually kill @victim: send SIGKILL to it and to every other process
 * sharing its mm (so the mm can be reaped safely), then queue the mm for
 * the oom_reaper unless it is pinned by init. Consumes the caller's
 * reference on @victim.
 */
static void __oom_kill_process(struct task_struct *victim, const char *message)
{
	struct task_struct *p;
	struct mm_struct *mm;
	bool can_oom_reap = true;

	/* Find a live thread of the victim that still owns the mm. */
	p = find_lock_task_mm(victim);
	if (!p) {
		pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
			message, task_pid_nr(victim), victim->comm);
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		/* Switch to the thread that has the mm; swap task refs. */
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* Get a reference to safely compare mm after task_unlock(victim) */
	mm = victim->mm;
	mmgrab(mm);

	/* Raise event before sending signal: task reaper must see this */
	count_vm_event(OOM_KILL);
	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
	 */
	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
	mark_oom_victim(victim);
	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
		message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
		K(get_mm_counter(mm, MM_ANONPAGES)),
		K(get_mm_counter(mm, MM_FILEPAGES)),
		K(get_mm_counter(mm, MM_SHMEMPAGES)),
		from_kuid(&init_user_ns, task_uid(victim)),
		mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any.  They don't get access to memory reserves, though, to avoid
	 * depletion of all memory.  This prevents mm->mmap_lock livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * its contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(p, victim))
			continue;
		if (is_global_init(p)) {
			/* Never kill init; the mm stays pinned, so no reaping. */
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
					task_pid_nr(victim), victim->comm,
					task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * No kthread_use_mm() user needs to read from the userspace so
		 * we are ok to reap it.
		 */
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
	}
	rcu_read_unlock();

	if (can_oom_reap)
		queue_oom_reaper(victim);

	mmdrop(mm);
	put_task_struct(victim);
}
99562306a36Sopenharmony_ci
99662306a36Sopenharmony_ci/*
99762306a36Sopenharmony_ci * Kill provided task unless it's secured by setting
99862306a36Sopenharmony_ci * oom_score_adj to OOM_SCORE_ADJ_MIN.
99962306a36Sopenharmony_ci */
100062306a36Sopenharmony_cistatic int oom_kill_memcg_member(struct task_struct *task, void *message)
100162306a36Sopenharmony_ci{
100262306a36Sopenharmony_ci	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
100362306a36Sopenharmony_ci	    !is_global_init(task)) {
100462306a36Sopenharmony_ci		get_task_struct(task);
100562306a36Sopenharmony_ci		__oom_kill_process(task, message);
100662306a36Sopenharmony_ci	}
100762306a36Sopenharmony_ci	return 0;
100862306a36Sopenharmony_ci}
100962306a36Sopenharmony_ci
/*
 * Kill oc->chosen (or, when memcg group-kill applies, every task in the
 * selected memory cgroup). Consumes the reference held on oc->chosen.
 */
static void oom_kill_process(struct oom_control *oc, const char *message)
{
	struct task_struct *victim = oc->chosen;
	struct mem_cgroup *oom_group;
	/* Rate-limit the verbose OOM report, not the kill itself. */
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly
	 */
	task_lock(victim);
	if (task_will_free_mem(victim)) {
		mark_oom_victim(victim);
		queue_oom_reaper(victim);
		task_unlock(victim);
		put_task_struct(victim);
		return;
	}
	task_unlock(victim);

	if (__ratelimit(&oom_rs))
		dump_header(oc, victim);

	/*
	 * Do we need to kill the entire memory cgroup?
	 * Or even one of the ancestor memory cgroups?
	 * Check this out before killing the victim task.
	 */
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

	__oom_kill_process(victim, message);

	/*
	 * If necessary, kill all tasks in the selected memory cgroup.
	 */
	if (oom_group) {
		memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
		mem_cgroup_print_oom_group(oom_group);
		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
				      (void *)message);
		mem_cgroup_put(oom_group);
	}
}
105562306a36Sopenharmony_ci
105662306a36Sopenharmony_ci/*
105762306a36Sopenharmony_ci * Determines whether the kernel must panic because of the panic_on_oom sysctl.
105862306a36Sopenharmony_ci */
105962306a36Sopenharmony_cistatic void check_panic_on_oom(struct oom_control *oc)
106062306a36Sopenharmony_ci{
106162306a36Sopenharmony_ci	if (likely(!sysctl_panic_on_oom))
106262306a36Sopenharmony_ci		return;
106362306a36Sopenharmony_ci	if (sysctl_panic_on_oom != 2) {
106462306a36Sopenharmony_ci		/*
106562306a36Sopenharmony_ci		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
106662306a36Sopenharmony_ci		 * does not panic for cpuset, mempolicy, or memcg allocation
106762306a36Sopenharmony_ci		 * failures.
106862306a36Sopenharmony_ci		 */
106962306a36Sopenharmony_ci		if (oc->constraint != CONSTRAINT_NONE)
107062306a36Sopenharmony_ci			return;
107162306a36Sopenharmony_ci	}
107262306a36Sopenharmony_ci	/* Do not panic for oom kills triggered by sysrq */
107362306a36Sopenharmony_ci	if (is_sysrq_oom(oc))
107462306a36Sopenharmony_ci		return;
107562306a36Sopenharmony_ci	dump_header(oc, NULL);
107662306a36Sopenharmony_ci	panic("Out of memory: %s panic_on_oom is enabled\n",
107762306a36Sopenharmony_ci		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
107862306a36Sopenharmony_ci}
107962306a36Sopenharmony_ci
/* Notifier chain invoked from out_of_memory() before a victim is chosen. */
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

/* Add @nb to the OOM notifier chain; it may report freed memory. */
int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

/* Remove @nb from the OOM notifier chain. */
int unregister_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
109362306a36Sopenharmony_ci
/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @oc: pointer to struct oom_control
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 *
 * Returns true if a victim was found (or memory was otherwise made
 * available), false if the OOM killer is disabled or nothing was killable.
 */
bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;

	if (oom_killer_disabled)
		return false;

	/* Give OOM notifiers a chance to free memory (global OOM only). */
	if (!is_memcg_oom(oc)) {
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed > 0 && !is_sysrq_oom(oc))
			/* Got some memory back in the last second. */
			return true;
	}

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (task_will_free_mem(current)) {
		mark_oom_victim(current);
		queue_oom_reaper(current);
		return true;
	}

	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * But mem_cgroup_oom() has to invoke the OOM killer even
	 * if it is a GFP_NOFS allocation.
	 */
	if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
		return true;

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA and memcg) that may require different handling.
	 */
	oc->constraint = constrained_alloc(oc);
	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	check_panic_on_oom(oc);

	/*
	 * With oom_kill_allocating_task, kill current instead of scanning for
	 * a victim — unless current is unkillable or ineligible.
	 */
	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current) &&
	    oom_cpuset_eligible(current, oc) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		get_task_struct(current);
		oc->chosen = current;
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
		return true;
	}

	select_bad_process(oc);
	/* Found nothing?!?! */
	if (!oc->chosen) {
		dump_header(oc, NULL);
		pr_warn("Out of memory and no killable processes...\n");
		/*
		 * If we got here due to an actual allocation at the
		 * system level, we cannot survive this and will enter
		 * an endless loop in the allocator. Bail out now.
		 */
		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
			panic("System is deadlocked on memory\n");
	}
	/* oc->chosen == -1UL means "abort"; see select_bad_process() callers. */
	if (oc->chosen && oc->chosen != (void *)-1UL)
		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				 "Memory cgroup out of memory");
	return !!oc->chosen;
}
117362306a36Sopenharmony_ci
/*
 * The pagefault handler calls here because some allocation has failed. We have
 * to take care of the memcg OOM here because this is the only safe context without
 * any locks held but let the oom killer triggered from the allocation context care
 * about the global OOM.
 */
void pagefault_out_of_memory(void)
{
	static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	/* Handle a pending memcg OOM first; it owns this context. */
	if (mem_cgroup_oom_synchronize(true))
		return;

	/* Already dying; the fault will be retried or the task will exit. */
	if (fatal_signal_pending(current))
		return;

	/* Rate-limited breadcrumb: VM_FAULT_OOM should not normally reach here. */
	if (__ratelimit(&pfoom_rs))
		pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
}
119462306a36Sopenharmony_ci
119562306a36Sopenharmony_ciSYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
119662306a36Sopenharmony_ci{
119762306a36Sopenharmony_ci#ifdef CONFIG_MMU
119862306a36Sopenharmony_ci	struct mm_struct *mm = NULL;
119962306a36Sopenharmony_ci	struct task_struct *task;
120062306a36Sopenharmony_ci	struct task_struct *p;
120162306a36Sopenharmony_ci	unsigned int f_flags;
120262306a36Sopenharmony_ci	bool reap = false;
120362306a36Sopenharmony_ci	long ret = 0;
120462306a36Sopenharmony_ci
120562306a36Sopenharmony_ci	if (flags)
120662306a36Sopenharmony_ci		return -EINVAL;
120762306a36Sopenharmony_ci
120862306a36Sopenharmony_ci	task = pidfd_get_task(pidfd, &f_flags);
120962306a36Sopenharmony_ci	if (IS_ERR(task))
121062306a36Sopenharmony_ci		return PTR_ERR(task);
121162306a36Sopenharmony_ci
121262306a36Sopenharmony_ci	/*
121362306a36Sopenharmony_ci	 * Make sure to choose a thread which still has a reference to mm
121462306a36Sopenharmony_ci	 * during the group exit
121562306a36Sopenharmony_ci	 */
121662306a36Sopenharmony_ci	p = find_lock_task_mm(task);
121762306a36Sopenharmony_ci	if (!p) {
121862306a36Sopenharmony_ci		ret = -ESRCH;
121962306a36Sopenharmony_ci		goto put_task;
122062306a36Sopenharmony_ci	}
122162306a36Sopenharmony_ci
122262306a36Sopenharmony_ci	mm = p->mm;
122362306a36Sopenharmony_ci	mmgrab(mm);
122462306a36Sopenharmony_ci
122562306a36Sopenharmony_ci	if (task_will_free_mem(p))
122662306a36Sopenharmony_ci		reap = true;
122762306a36Sopenharmony_ci	else {
122862306a36Sopenharmony_ci		/* Error only if the work has not been done already */
122962306a36Sopenharmony_ci		if (!test_bit(MMF_OOM_SKIP, &mm->flags))
123062306a36Sopenharmony_ci			ret = -EINVAL;
123162306a36Sopenharmony_ci	}
123262306a36Sopenharmony_ci	task_unlock(p);
123362306a36Sopenharmony_ci
123462306a36Sopenharmony_ci	if (!reap)
123562306a36Sopenharmony_ci		goto drop_mm;
123662306a36Sopenharmony_ci
123762306a36Sopenharmony_ci	if (mmap_read_lock_killable(mm)) {
123862306a36Sopenharmony_ci		ret = -EINTR;
123962306a36Sopenharmony_ci		goto drop_mm;
124062306a36Sopenharmony_ci	}
124162306a36Sopenharmony_ci	/*
124262306a36Sopenharmony_ci	 * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
124362306a36Sopenharmony_ci	 * possible change in exit_mmap is seen
124462306a36Sopenharmony_ci	 */
124562306a36Sopenharmony_ci	if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
124662306a36Sopenharmony_ci		ret = -EAGAIN;
124762306a36Sopenharmony_ci	mmap_read_unlock(mm);
124862306a36Sopenharmony_ci
124962306a36Sopenharmony_cidrop_mm:
125062306a36Sopenharmony_ci	mmdrop(mm);
125162306a36Sopenharmony_ciput_task:
125262306a36Sopenharmony_ci	put_task_struct(task);
125362306a36Sopenharmony_ci	return ret;
125462306a36Sopenharmony_ci#else
125562306a36Sopenharmony_ci	return -ENOSYS;
125662306a36Sopenharmony_ci#endif /* CONFIG_MMU */
125762306a36Sopenharmony_ci}
1258