162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
262306a36Sopenharmony_ci/* memcontrol.c - Memory Controller
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * Copyright IBM Corporation, 2007
562306a36Sopenharmony_ci * Author Balbir Singh <balbir@linux.vnet.ibm.com>
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Copyright 2007 OpenVZ SWsoft Inc
862306a36Sopenharmony_ci * Author: Pavel Emelianov <xemul@openvz.org>
962306a36Sopenharmony_ci *
1062306a36Sopenharmony_ci * Memory thresholds
1162306a36Sopenharmony_ci * Copyright (C) 2009 Nokia Corporation
1262306a36Sopenharmony_ci * Author: Kirill A. Shutemov
1362306a36Sopenharmony_ci *
1462306a36Sopenharmony_ci * Kernel Memory Controller
1562306a36Sopenharmony_ci * Copyright (C) 2012 Parallels Inc. and Google Inc.
1662306a36Sopenharmony_ci * Authors: Glauber Costa and Suleiman Souhlal
1762306a36Sopenharmony_ci *
1862306a36Sopenharmony_ci * Native page reclaim
1962306a36Sopenharmony_ci * Charge lifetime sanitation
2062306a36Sopenharmony_ci * Lockless page tracking & accounting
2162306a36Sopenharmony_ci * Unified hierarchy configuration model
2262306a36Sopenharmony_ci * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
2362306a36Sopenharmony_ci *
2462306a36Sopenharmony_ci * Per memcg lru locking
2562306a36Sopenharmony_ci * Copyright (C) 2020 Alibaba, Inc, Alex Shi
2662306a36Sopenharmony_ci */
2762306a36Sopenharmony_ci
2862306a36Sopenharmony_ci#include <linux/page_counter.h>
2962306a36Sopenharmony_ci#include <linux/memcontrol.h>
3062306a36Sopenharmony_ci#include <linux/cgroup.h>
3162306a36Sopenharmony_ci#include <linux/pagewalk.h>
3262306a36Sopenharmony_ci#include <linux/sched/mm.h>
3362306a36Sopenharmony_ci#include <linux/shmem_fs.h>
3462306a36Sopenharmony_ci#include <linux/hugetlb.h>
3562306a36Sopenharmony_ci#include <linux/pagemap.h>
3662306a36Sopenharmony_ci#include <linux/vm_event_item.h>
3762306a36Sopenharmony_ci#include <linux/smp.h>
3862306a36Sopenharmony_ci#include <linux/page-flags.h>
3962306a36Sopenharmony_ci#include <linux/backing-dev.h>
4062306a36Sopenharmony_ci#include <linux/bit_spinlock.h>
4162306a36Sopenharmony_ci#include <linux/rcupdate.h>
4262306a36Sopenharmony_ci#include <linux/limits.h>
4362306a36Sopenharmony_ci#include <linux/export.h>
4462306a36Sopenharmony_ci#include <linux/mutex.h>
4562306a36Sopenharmony_ci#include <linux/rbtree.h>
4662306a36Sopenharmony_ci#include <linux/slab.h>
4762306a36Sopenharmony_ci#include <linux/swap.h>
4862306a36Sopenharmony_ci#include <linux/swapops.h>
4962306a36Sopenharmony_ci#include <linux/spinlock.h>
5062306a36Sopenharmony_ci#include <linux/eventfd.h>
5162306a36Sopenharmony_ci#include <linux/poll.h>
5262306a36Sopenharmony_ci#include <linux/sort.h>
5362306a36Sopenharmony_ci#include <linux/fs.h>
5462306a36Sopenharmony_ci#include <linux/seq_file.h>
5562306a36Sopenharmony_ci#include <linux/vmpressure.h>
5662306a36Sopenharmony_ci#include <linux/memremap.h>
5762306a36Sopenharmony_ci#include <linux/mm_inline.h>
5862306a36Sopenharmony_ci#include <linux/swap_cgroup.h>
5962306a36Sopenharmony_ci#include <linux/cpu.h>
6062306a36Sopenharmony_ci#include <linux/oom.h>
6162306a36Sopenharmony_ci#include <linux/lockdep.h>
6262306a36Sopenharmony_ci#include <linux/file.h>
6362306a36Sopenharmony_ci#include <linux/resume_user_mode.h>
6462306a36Sopenharmony_ci#include <linux/psi.h>
6562306a36Sopenharmony_ci#include <linux/seq_buf.h>
6662306a36Sopenharmony_ci#include <linux/sched/isolation.h>
6762306a36Sopenharmony_ci#include "internal.h"
6862306a36Sopenharmony_ci#include <net/sock.h>
6962306a36Sopenharmony_ci#include <net/ip.h>
7062306a36Sopenharmony_ci#include "slab.h"
7162306a36Sopenharmony_ci#include "swap.h"
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_ci#include <linux/uaccess.h>
7462306a36Sopenharmony_ci#include <linux/zswapd.h>
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci#include <trace/events/vmscan.h>
7762306a36Sopenharmony_ci
7862306a36Sopenharmony_cistruct cgroup_subsys memory_cgrp_subsys __read_mostly;
7962306a36Sopenharmony_ciEXPORT_SYMBOL(memory_cgrp_subsys);
8062306a36Sopenharmony_ci
8162306a36Sopenharmony_cistruct mem_cgroup *root_mem_cgroup __read_mostly;
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci/* Active memory cgroup to use from an interrupt context */
8462306a36Sopenharmony_ciDEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
8562306a36Sopenharmony_ciEXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci/* Socket memory accounting disabled? */
8862306a36Sopenharmony_cistatic bool cgroup_memory_nosocket __ro_after_init;
8962306a36Sopenharmony_ci
9062306a36Sopenharmony_ci/* Kernel memory accounting disabled? */
9162306a36Sopenharmony_cistatic bool cgroup_memory_nokmem = true;
9262306a36Sopenharmony_ci
9362306a36Sopenharmony_ci/* BPF memory accounting disabled? */
9462306a36Sopenharmony_cistatic bool cgroup_memory_nobpf __ro_after_init;
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK
9762306a36Sopenharmony_cistatic DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
9862306a36Sopenharmony_ci#endif
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_ci/* Whether legacy memory+swap accounting is active */
10162306a36Sopenharmony_cistatic bool do_memsw_account(void)
10262306a36Sopenharmony_ci{
10362306a36Sopenharmony_ci	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
10462306a36Sopenharmony_ci}
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci#define THRESHOLDS_EVENTS_TARGET 128
10762306a36Sopenharmony_ci#define SOFTLIMIT_EVENTS_TARGET 1024
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci/*
11062306a36Sopenharmony_ci * Cgroups above their limits are maintained in a RB-Tree, independent of
11162306a36Sopenharmony_ci * their hierarchy representation
11262306a36Sopenharmony_ci */
11362306a36Sopenharmony_ci
11462306a36Sopenharmony_cistruct mem_cgroup_tree_per_node {
11562306a36Sopenharmony_ci	struct rb_root rb_root;
11662306a36Sopenharmony_ci	struct rb_node *rb_rightmost;
11762306a36Sopenharmony_ci	spinlock_t lock;
11862306a36Sopenharmony_ci};
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_cistruct mem_cgroup_tree {
12162306a36Sopenharmony_ci	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
12262306a36Sopenharmony_ci};
12362306a36Sopenharmony_ci
12462306a36Sopenharmony_cistatic struct mem_cgroup_tree soft_limit_tree __read_mostly;
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci/* for OOM */
12762306a36Sopenharmony_cistruct mem_cgroup_eventfd_list {
12862306a36Sopenharmony_ci	struct list_head list;
12962306a36Sopenharmony_ci	struct eventfd_ctx *eventfd;
13062306a36Sopenharmony_ci};
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_ci/*
13362306a36Sopenharmony_ci * cgroup_event represents events which userspace want to receive.
13462306a36Sopenharmony_ci */
13562306a36Sopenharmony_cistruct mem_cgroup_event {
13662306a36Sopenharmony_ci	/*
13762306a36Sopenharmony_ci	 * memcg which the event belongs to.
13862306a36Sopenharmony_ci	 */
13962306a36Sopenharmony_ci	struct mem_cgroup *memcg;
14062306a36Sopenharmony_ci	/*
14162306a36Sopenharmony_ci	 * eventfd to signal userspace about the event.
14262306a36Sopenharmony_ci	 */
14362306a36Sopenharmony_ci	struct eventfd_ctx *eventfd;
14462306a36Sopenharmony_ci	/*
14562306a36Sopenharmony_ci	 * Each of these stored in a list by the cgroup.
14662306a36Sopenharmony_ci	 */
14762306a36Sopenharmony_ci	struct list_head list;
14862306a36Sopenharmony_ci	/*
14962306a36Sopenharmony_ci	 * register_event() callback will be used to add new userspace
15062306a36Sopenharmony_ci	 * waiter for changes related to this event.  Use eventfd_signal()
15162306a36Sopenharmony_ci	 * on eventfd to send notification to userspace.
15262306a36Sopenharmony_ci	 */
15362306a36Sopenharmony_ci	int (*register_event)(struct mem_cgroup *memcg,
15462306a36Sopenharmony_ci			      struct eventfd_ctx *eventfd, const char *args);
15562306a36Sopenharmony_ci	/*
15662306a36Sopenharmony_ci	 * unregister_event() callback will be called when userspace closes
15762306a36Sopenharmony_ci	 * the eventfd or on cgroup removing.  This callback must be set,
15862306a36Sopenharmony_ci	 * if you want provide notification functionality.
15962306a36Sopenharmony_ci	 */
16062306a36Sopenharmony_ci	void (*unregister_event)(struct mem_cgroup *memcg,
16162306a36Sopenharmony_ci				 struct eventfd_ctx *eventfd);
16262306a36Sopenharmony_ci	/*
16362306a36Sopenharmony_ci	 * All fields below needed to unregister event when
16462306a36Sopenharmony_ci	 * userspace closes eventfd.
16562306a36Sopenharmony_ci	 */
16662306a36Sopenharmony_ci	poll_table pt;
16762306a36Sopenharmony_ci	wait_queue_head_t *wqh;
16862306a36Sopenharmony_ci	wait_queue_entry_t wait;
16962306a36Sopenharmony_ci	struct work_struct remove;
17062306a36Sopenharmony_ci};
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_cistatic void mem_cgroup_threshold(struct mem_cgroup *memcg);
17362306a36Sopenharmony_cistatic void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci/* Stuffs for move charges at task migration. */
17662306a36Sopenharmony_ci/*
17762306a36Sopenharmony_ci * Types of charges to be moved.
17862306a36Sopenharmony_ci */
17962306a36Sopenharmony_ci#define MOVE_ANON	0x1U
18062306a36Sopenharmony_ci#define MOVE_FILE	0x2U
18162306a36Sopenharmony_ci#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
18262306a36Sopenharmony_ci
18362306a36Sopenharmony_ci/* "mc" and its members are protected by cgroup_mutex */
18462306a36Sopenharmony_cistatic struct move_charge_struct {
18562306a36Sopenharmony_ci	spinlock_t	  lock; /* for from, to */
18662306a36Sopenharmony_ci	struct mm_struct  *mm;
18762306a36Sopenharmony_ci	struct mem_cgroup *from;
18862306a36Sopenharmony_ci	struct mem_cgroup *to;
18962306a36Sopenharmony_ci	unsigned long flags;
19062306a36Sopenharmony_ci	unsigned long precharge;
19162306a36Sopenharmony_ci	unsigned long moved_charge;
19262306a36Sopenharmony_ci	unsigned long moved_swap;
19362306a36Sopenharmony_ci	struct task_struct *moving_task;	/* a task moving charges */
19462306a36Sopenharmony_ci	wait_queue_head_t waitq;		/* a waitq for other context */
19562306a36Sopenharmony_ci} mc = {
19662306a36Sopenharmony_ci	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
19762306a36Sopenharmony_ci	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
19862306a36Sopenharmony_ci};
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_ci/*
20162306a36Sopenharmony_ci * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
20262306a36Sopenharmony_ci * limit reclaim to prevent infinite loops, if they ever occur.
20362306a36Sopenharmony_ci */
20462306a36Sopenharmony_ci#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
20562306a36Sopenharmony_ci#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci/* for encoding cft->private value on file */
20862306a36Sopenharmony_cienum res_type {
20962306a36Sopenharmony_ci	_MEM,
21062306a36Sopenharmony_ci	_MEMSWAP,
21162306a36Sopenharmony_ci	_KMEM,
21262306a36Sopenharmony_ci	_TCP,
21362306a36Sopenharmony_ci};
21462306a36Sopenharmony_ci
21562306a36Sopenharmony_ci#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
21662306a36Sopenharmony_ci#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
21762306a36Sopenharmony_ci#define MEMFILE_ATTR(val)	((val) & 0xffff)
21862306a36Sopenharmony_ci
21962306a36Sopenharmony_ci/*
22062306a36Sopenharmony_ci * Iteration constructs for visiting all cgroups (under a tree).  If
22162306a36Sopenharmony_ci * loops are exited prematurely (break), mem_cgroup_iter_break() must
22262306a36Sopenharmony_ci * be used for reference counting.
22362306a36Sopenharmony_ci */
22462306a36Sopenharmony_ci#define for_each_mem_cgroup_tree(iter, root)		\
22562306a36Sopenharmony_ci	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
22662306a36Sopenharmony_ci	     iter != NULL;				\
22762306a36Sopenharmony_ci	     iter = mem_cgroup_iter(root, iter, NULL))
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_ci#define for_each_mem_cgroup(iter)			\
23062306a36Sopenharmony_ci	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
23162306a36Sopenharmony_ci	     iter != NULL;				\
23262306a36Sopenharmony_ci	     iter = mem_cgroup_iter(NULL, iter, NULL))
23362306a36Sopenharmony_ci
23462306a36Sopenharmony_cistatic inline bool task_is_dying(void)
23562306a36Sopenharmony_ci{
23662306a36Sopenharmony_ci	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
23762306a36Sopenharmony_ci		(current->flags & PF_EXITING);
23862306a36Sopenharmony_ci}
23962306a36Sopenharmony_ci
24062306a36Sopenharmony_ci/* Some nice accessors for the vmpressure. */
24162306a36Sopenharmony_cistruct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
24262306a36Sopenharmony_ci{
24362306a36Sopenharmony_ci	if (!memcg)
24462306a36Sopenharmony_ci		memcg = root_mem_cgroup;
24562306a36Sopenharmony_ci	return &memcg->vmpressure;
24662306a36Sopenharmony_ci}
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_cistruct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
24962306a36Sopenharmony_ci{
25062306a36Sopenharmony_ci	return container_of(vmpr, struct mem_cgroup, vmpressure);
25162306a36Sopenharmony_ci}
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
25462306a36Sopenharmony_cistatic DEFINE_SPINLOCK(objcg_lock);
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_cibool mem_cgroup_kmem_disabled(void)
25762306a36Sopenharmony_ci{
25862306a36Sopenharmony_ci	return cgroup_memory_nokmem;
25962306a36Sopenharmony_ci}
26062306a36Sopenharmony_ci
26162306a36Sopenharmony_cistatic void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
26262306a36Sopenharmony_ci				      unsigned int nr_pages);
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_cistatic void obj_cgroup_release(struct percpu_ref *ref)
26562306a36Sopenharmony_ci{
26662306a36Sopenharmony_ci	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
26762306a36Sopenharmony_ci	unsigned int nr_bytes;
26862306a36Sopenharmony_ci	unsigned int nr_pages;
26962306a36Sopenharmony_ci	unsigned long flags;
27062306a36Sopenharmony_ci
27162306a36Sopenharmony_ci	/*
27262306a36Sopenharmony_ci	 * At this point all allocated objects are freed, and
27362306a36Sopenharmony_ci	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
27462306a36Sopenharmony_ci	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
27562306a36Sopenharmony_ci	 *
27662306a36Sopenharmony_ci	 * The following sequence can lead to it:
27762306a36Sopenharmony_ci	 * 1) CPU0: objcg == stock->cached_objcg
27862306a36Sopenharmony_ci	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
27962306a36Sopenharmony_ci	 *          PAGE_SIZE bytes are charged
28062306a36Sopenharmony_ci	 * 3) CPU1: a process from another memcg is allocating something,
28162306a36Sopenharmony_ci	 *          the stock if flushed,
28262306a36Sopenharmony_ci	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
28362306a36Sopenharmony_ci	 * 5) CPU0: we do release this object,
28462306a36Sopenharmony_ci	 *          92 bytes are added to stock->nr_bytes
28562306a36Sopenharmony_ci	 * 6) CPU0: stock is flushed,
28662306a36Sopenharmony_ci	 *          92 bytes are added to objcg->nr_charged_bytes
28762306a36Sopenharmony_ci	 *
28862306a36Sopenharmony_ci	 * In the result, nr_charged_bytes == PAGE_SIZE.
28962306a36Sopenharmony_ci	 * This page will be uncharged in obj_cgroup_release().
29062306a36Sopenharmony_ci	 */
29162306a36Sopenharmony_ci	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
29262306a36Sopenharmony_ci	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
29362306a36Sopenharmony_ci	nr_pages = nr_bytes >> PAGE_SHIFT;
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci	if (nr_pages)
29662306a36Sopenharmony_ci		obj_cgroup_uncharge_pages(objcg, nr_pages);
29762306a36Sopenharmony_ci
29862306a36Sopenharmony_ci	spin_lock_irqsave(&objcg_lock, flags);
29962306a36Sopenharmony_ci	list_del(&objcg->list);
30062306a36Sopenharmony_ci	spin_unlock_irqrestore(&objcg_lock, flags);
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	percpu_ref_exit(ref);
30362306a36Sopenharmony_ci	kfree_rcu(objcg, rcu);
30462306a36Sopenharmony_ci}
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_cistatic struct obj_cgroup *obj_cgroup_alloc(void)
30762306a36Sopenharmony_ci{
30862306a36Sopenharmony_ci	struct obj_cgroup *objcg;
30962306a36Sopenharmony_ci	int ret;
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
31262306a36Sopenharmony_ci	if (!objcg)
31362306a36Sopenharmony_ci		return NULL;
31462306a36Sopenharmony_ci
31562306a36Sopenharmony_ci	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
31662306a36Sopenharmony_ci			      GFP_KERNEL);
31762306a36Sopenharmony_ci	if (ret) {
31862306a36Sopenharmony_ci		kfree(objcg);
31962306a36Sopenharmony_ci		return NULL;
32062306a36Sopenharmony_ci	}
32162306a36Sopenharmony_ci	INIT_LIST_HEAD(&objcg->list);
32262306a36Sopenharmony_ci	return objcg;
32362306a36Sopenharmony_ci}
32462306a36Sopenharmony_ci
32562306a36Sopenharmony_cistatic void memcg_reparent_objcgs(struct mem_cgroup *memcg,
32662306a36Sopenharmony_ci				  struct mem_cgroup *parent)
32762306a36Sopenharmony_ci{
32862306a36Sopenharmony_ci	struct obj_cgroup *objcg, *iter;
32962306a36Sopenharmony_ci
33062306a36Sopenharmony_ci	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
33162306a36Sopenharmony_ci
33262306a36Sopenharmony_ci	spin_lock_irq(&objcg_lock);
33362306a36Sopenharmony_ci
33462306a36Sopenharmony_ci	/* 1) Ready to reparent active objcg. */
33562306a36Sopenharmony_ci	list_add(&objcg->list, &memcg->objcg_list);
33662306a36Sopenharmony_ci	/* 2) Reparent active objcg and already reparented objcgs to parent. */
33762306a36Sopenharmony_ci	list_for_each_entry(iter, &memcg->objcg_list, list)
33862306a36Sopenharmony_ci		WRITE_ONCE(iter->memcg, parent);
33962306a36Sopenharmony_ci	/* 3) Move already reparented objcgs to the parent's list */
34062306a36Sopenharmony_ci	list_splice(&memcg->objcg_list, &parent->objcg_list);
34162306a36Sopenharmony_ci
34262306a36Sopenharmony_ci	spin_unlock_irq(&objcg_lock);
34362306a36Sopenharmony_ci
34462306a36Sopenharmony_ci	percpu_ref_kill(&objcg->refcnt);
34562306a36Sopenharmony_ci}
34662306a36Sopenharmony_ci
34762306a36Sopenharmony_ci/*
34862306a36Sopenharmony_ci * A lot of the calls to the cache allocation functions are expected to be
34962306a36Sopenharmony_ci * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
35062306a36Sopenharmony_ci * conditional to this static branch, we'll have to allow modules that does
35162306a36Sopenharmony_ci * kmem_cache_alloc and the such to see this symbol as well
35262306a36Sopenharmony_ci */
35362306a36Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
35462306a36Sopenharmony_ciEXPORT_SYMBOL(memcg_kmem_online_key);
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
35762306a36Sopenharmony_ciEXPORT_SYMBOL(memcg_bpf_enabled_key);
35862306a36Sopenharmony_ci#endif
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_ci/**
36162306a36Sopenharmony_ci * mem_cgroup_css_from_folio - css of the memcg associated with a folio
36262306a36Sopenharmony_ci * @folio: folio of interest
36362306a36Sopenharmony_ci *
36462306a36Sopenharmony_ci * If memcg is bound to the default hierarchy, css of the memcg associated
36562306a36Sopenharmony_ci * with @folio is returned.  The returned css remains associated with @folio
36662306a36Sopenharmony_ci * until it is released.
36762306a36Sopenharmony_ci *
36862306a36Sopenharmony_ci * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
36962306a36Sopenharmony_ci * is returned.
37062306a36Sopenharmony_ci */
37162306a36Sopenharmony_cistruct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
37262306a36Sopenharmony_ci{
37362306a36Sopenharmony_ci	struct mem_cgroup *memcg = folio_memcg(folio);
37462306a36Sopenharmony_ci
37562306a36Sopenharmony_ci	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
37662306a36Sopenharmony_ci		memcg = root_mem_cgroup;
37762306a36Sopenharmony_ci
37862306a36Sopenharmony_ci	return &memcg->css;
37962306a36Sopenharmony_ci}
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_ci/**
38262306a36Sopenharmony_ci * page_cgroup_ino - return inode number of the memcg a page is charged to
38362306a36Sopenharmony_ci * @page: the page
38462306a36Sopenharmony_ci *
38562306a36Sopenharmony_ci * Look up the closest online ancestor of the memory cgroup @page is charged to
38662306a36Sopenharmony_ci * and return its inode number or 0 if @page is not charged to any cgroup. It
38762306a36Sopenharmony_ci * is safe to call this function without holding a reference to @page.
38862306a36Sopenharmony_ci *
38962306a36Sopenharmony_ci * Note, this function is inherently racy, because there is nothing to prevent
39062306a36Sopenharmony_ci * the cgroup inode from getting torn down and potentially reallocated a moment
39162306a36Sopenharmony_ci * after page_cgroup_ino() returns, so it only should be used by callers that
39262306a36Sopenharmony_ci * do not care (such as procfs interfaces).
39362306a36Sopenharmony_ci */
39462306a36Sopenharmony_ciino_t page_cgroup_ino(struct page *page)
39562306a36Sopenharmony_ci{
39662306a36Sopenharmony_ci	struct mem_cgroup *memcg;
39762306a36Sopenharmony_ci	unsigned long ino = 0;
39862306a36Sopenharmony_ci
39962306a36Sopenharmony_ci	rcu_read_lock();
40062306a36Sopenharmony_ci	/* page_folio() is racy here, but the entire function is racy anyway */
40162306a36Sopenharmony_ci	memcg = folio_memcg_check(page_folio(page));
40262306a36Sopenharmony_ci
40362306a36Sopenharmony_ci	while (memcg && !(memcg->css.flags & CSS_ONLINE))
40462306a36Sopenharmony_ci		memcg = parent_mem_cgroup(memcg);
40562306a36Sopenharmony_ci	if (memcg)
40662306a36Sopenharmony_ci		ino = cgroup_ino(memcg->css.cgroup);
40762306a36Sopenharmony_ci	rcu_read_unlock();
40862306a36Sopenharmony_ci	return ino;
40962306a36Sopenharmony_ci}
41062306a36Sopenharmony_ci
41162306a36Sopenharmony_cistatic void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
41262306a36Sopenharmony_ci					 struct mem_cgroup_tree_per_node *mctz,
41362306a36Sopenharmony_ci					 unsigned long new_usage_in_excess)
41462306a36Sopenharmony_ci{
41562306a36Sopenharmony_ci	struct rb_node **p = &mctz->rb_root.rb_node;
41662306a36Sopenharmony_ci	struct rb_node *parent = NULL;
41762306a36Sopenharmony_ci	struct mem_cgroup_per_node *mz_node;
41862306a36Sopenharmony_ci	bool rightmost = true;
41962306a36Sopenharmony_ci
42062306a36Sopenharmony_ci	if (mz->on_tree)
42162306a36Sopenharmony_ci		return;
42262306a36Sopenharmony_ci
42362306a36Sopenharmony_ci	mz->usage_in_excess = new_usage_in_excess;
42462306a36Sopenharmony_ci	if (!mz->usage_in_excess)
42562306a36Sopenharmony_ci		return;
42662306a36Sopenharmony_ci	while (*p) {
42762306a36Sopenharmony_ci		parent = *p;
42862306a36Sopenharmony_ci		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
42962306a36Sopenharmony_ci					tree_node);
43062306a36Sopenharmony_ci		if (mz->usage_in_excess < mz_node->usage_in_excess) {
43162306a36Sopenharmony_ci			p = &(*p)->rb_left;
43262306a36Sopenharmony_ci			rightmost = false;
43362306a36Sopenharmony_ci		} else {
43462306a36Sopenharmony_ci			p = &(*p)->rb_right;
43562306a36Sopenharmony_ci		}
43662306a36Sopenharmony_ci	}
43762306a36Sopenharmony_ci
43862306a36Sopenharmony_ci	if (rightmost)
43962306a36Sopenharmony_ci		mctz->rb_rightmost = &mz->tree_node;
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci	rb_link_node(&mz->tree_node, parent, p);
44262306a36Sopenharmony_ci	rb_insert_color(&mz->tree_node, &mctz->rb_root);
44362306a36Sopenharmony_ci	mz->on_tree = true;
44462306a36Sopenharmony_ci}
44562306a36Sopenharmony_ci
44662306a36Sopenharmony_cistatic void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
44762306a36Sopenharmony_ci					 struct mem_cgroup_tree_per_node *mctz)
44862306a36Sopenharmony_ci{
44962306a36Sopenharmony_ci	if (!mz->on_tree)
45062306a36Sopenharmony_ci		return;
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ci	if (&mz->tree_node == mctz->rb_rightmost)
45362306a36Sopenharmony_ci		mctz->rb_rightmost = rb_prev(&mz->tree_node);
45462306a36Sopenharmony_ci
45562306a36Sopenharmony_ci	rb_erase(&mz->tree_node, &mctz->rb_root);
45662306a36Sopenharmony_ci	mz->on_tree = false;
45762306a36Sopenharmony_ci}
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_cistatic void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
46062306a36Sopenharmony_ci				       struct mem_cgroup_tree_per_node *mctz)
46162306a36Sopenharmony_ci{
46262306a36Sopenharmony_ci	unsigned long flags;
46362306a36Sopenharmony_ci
46462306a36Sopenharmony_ci	spin_lock_irqsave(&mctz->lock, flags);
46562306a36Sopenharmony_ci	__mem_cgroup_remove_exceeded(mz, mctz);
46662306a36Sopenharmony_ci	spin_unlock_irqrestore(&mctz->lock, flags);
46762306a36Sopenharmony_ci}
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_cistatic unsigned long soft_limit_excess(struct mem_cgroup *memcg)
47062306a36Sopenharmony_ci{
47162306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU
47262306a36Sopenharmony_ci	struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, 0);
47362306a36Sopenharmony_ci	struct lruvec *lruvec = &mz->lruvec;
47462306a36Sopenharmony_ci	unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
47562306a36Sopenharmony_ci			MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
47662306a36Sopenharmony_ci			MAX_NR_ZONES);
47762306a36Sopenharmony_ci#else
47862306a36Sopenharmony_ci	unsigned long nr_pages = page_counter_read(&memcg->memory);
47962306a36Sopenharmony_ci#endif
48062306a36Sopenharmony_ci	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
48162306a36Sopenharmony_ci	unsigned long excess = 0;
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_ci	if (nr_pages > soft_limit)
48462306a36Sopenharmony_ci		excess = nr_pages - soft_limit;
48562306a36Sopenharmony_ci
48662306a36Sopenharmony_ci	return excess;
48762306a36Sopenharmony_ci}
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_cistatic void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
49062306a36Sopenharmony_ci{
49162306a36Sopenharmony_ci	unsigned long excess;
49262306a36Sopenharmony_ci	struct mem_cgroup_per_node *mz;
49362306a36Sopenharmony_ci	struct mem_cgroup_tree_per_node *mctz;
49462306a36Sopenharmony_ci
49562306a36Sopenharmony_ci	if (lru_gen_enabled()) {
49662306a36Sopenharmony_ci		if (soft_limit_excess(memcg))
49762306a36Sopenharmony_ci			lru_gen_soft_reclaim(memcg, nid);
49862306a36Sopenharmony_ci		return;
49962306a36Sopenharmony_ci	}
50062306a36Sopenharmony_ci
50162306a36Sopenharmony_ci	mctz = soft_limit_tree.rb_tree_per_node[nid];
50262306a36Sopenharmony_ci	if (!mctz)
50362306a36Sopenharmony_ci		return;
50462306a36Sopenharmony_ci	/*
50562306a36Sopenharmony_ci	 * Necessary to update all ancestors when hierarchy is used.
50662306a36Sopenharmony_ci	 * because their event counter is not touched.
50762306a36Sopenharmony_ci	 */
50862306a36Sopenharmony_ci	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
50962306a36Sopenharmony_ci		mz = memcg->nodeinfo[nid];
51062306a36Sopenharmony_ci		excess = soft_limit_excess(memcg);
51162306a36Sopenharmony_ci		/*
51262306a36Sopenharmony_ci		 * We have to update the tree if mz is on RB-tree or
51362306a36Sopenharmony_ci		 * mem is over its softlimit.
51462306a36Sopenharmony_ci		 */
51562306a36Sopenharmony_ci		if (excess || mz->on_tree) {
51662306a36Sopenharmony_ci			unsigned long flags;
51762306a36Sopenharmony_ci
51862306a36Sopenharmony_ci			spin_lock_irqsave(&mctz->lock, flags);
51962306a36Sopenharmony_ci			/* if on-tree, remove it */
52062306a36Sopenharmony_ci			if (mz->on_tree)
52162306a36Sopenharmony_ci				__mem_cgroup_remove_exceeded(mz, mctz);
52262306a36Sopenharmony_ci			/*
52362306a36Sopenharmony_ci			 * Insert again. mz->usage_in_excess will be updated.
52462306a36Sopenharmony_ci			 * If excess is 0, no tree ops.
52562306a36Sopenharmony_ci			 */
52662306a36Sopenharmony_ci			__mem_cgroup_insert_exceeded(mz, mctz, excess);
52762306a36Sopenharmony_ci			spin_unlock_irqrestore(&mctz->lock, flags);
52862306a36Sopenharmony_ci		}
52962306a36Sopenharmony_ci	}
53062306a36Sopenharmony_ci}
53162306a36Sopenharmony_ci
53262306a36Sopenharmony_cistatic void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
53362306a36Sopenharmony_ci{
53462306a36Sopenharmony_ci	struct mem_cgroup_tree_per_node *mctz;
53562306a36Sopenharmony_ci	struct mem_cgroup_per_node *mz;
53662306a36Sopenharmony_ci	int nid;
53762306a36Sopenharmony_ci
53862306a36Sopenharmony_ci	for_each_node(nid) {
53962306a36Sopenharmony_ci		mz = memcg->nodeinfo[nid];
54062306a36Sopenharmony_ci		mctz = soft_limit_tree.rb_tree_per_node[nid];
54162306a36Sopenharmony_ci		if (mctz)
54262306a36Sopenharmony_ci			mem_cgroup_remove_exceeded(mz, mctz);
54362306a36Sopenharmony_ci	}
54462306a36Sopenharmony_ci}
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_cistatic struct mem_cgroup_per_node *
54762306a36Sopenharmony_ci__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
54862306a36Sopenharmony_ci{
54962306a36Sopenharmony_ci	struct mem_cgroup_per_node *mz;
55062306a36Sopenharmony_ci
55162306a36Sopenharmony_ciretry:
55262306a36Sopenharmony_ci	mz = NULL;
55362306a36Sopenharmony_ci	if (!mctz->rb_rightmost)
55462306a36Sopenharmony_ci		goto done;		/* Nothing to reclaim from */
55562306a36Sopenharmony_ci
55662306a36Sopenharmony_ci	mz = rb_entry(mctz->rb_rightmost,
55762306a36Sopenharmony_ci		      struct mem_cgroup_per_node, tree_node);
55862306a36Sopenharmony_ci	/*
55962306a36Sopenharmony_ci	 * Remove the node now but someone else can add it back,
56062306a36Sopenharmony_ci	 * we will to add it back at the end of reclaim to its correct
56162306a36Sopenharmony_ci	 * position in the tree.
56262306a36Sopenharmony_ci	 */
56362306a36Sopenharmony_ci	__mem_cgroup_remove_exceeded(mz, mctz);
56462306a36Sopenharmony_ci	if (!soft_limit_excess(mz->memcg) ||
56562306a36Sopenharmony_ci	    !css_tryget(&mz->memcg->css))
56662306a36Sopenharmony_ci		goto retry;
56762306a36Sopenharmony_cidone:
56862306a36Sopenharmony_ci	return mz;
56962306a36Sopenharmony_ci}
57062306a36Sopenharmony_ci
57162306a36Sopenharmony_cistatic struct mem_cgroup_per_node *
57262306a36Sopenharmony_cimem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
57362306a36Sopenharmony_ci{
57462306a36Sopenharmony_ci	struct mem_cgroup_per_node *mz;
57562306a36Sopenharmony_ci
57662306a36Sopenharmony_ci	spin_lock_irq(&mctz->lock);
57762306a36Sopenharmony_ci	mz = __mem_cgroup_largest_soft_limit_node(mctz);
57862306a36Sopenharmony_ci	spin_unlock_irq(&mctz->lock);
57962306a36Sopenharmony_ci	return mz;
58062306a36Sopenharmony_ci}
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ci/*
58362306a36Sopenharmony_ci * memcg and lruvec stats flushing
58462306a36Sopenharmony_ci *
58562306a36Sopenharmony_ci * Many codepaths leading to stats update or read are performance sensitive and
58662306a36Sopenharmony_ci * adding stats flushing in such codepaths is not desirable. So, to optimize the
58762306a36Sopenharmony_ci * flushing the kernel does:
58862306a36Sopenharmony_ci *
58962306a36Sopenharmony_ci * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
59062306a36Sopenharmony_ci *    rstat update tree grow unbounded.
59162306a36Sopenharmony_ci *
59262306a36Sopenharmony_ci * 2) Flush the stats synchronously on reader side only when there are more than
59362306a36Sopenharmony_ci *    (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
59462306a36Sopenharmony_ci *    will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
59562306a36Sopenharmony_ci *    only for 2 seconds due to (1).
59662306a36Sopenharmony_ci */
59762306a36Sopenharmony_cistatic void flush_memcg_stats_dwork(struct work_struct *w);
59862306a36Sopenharmony_cistatic DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
59962306a36Sopenharmony_cistatic DEFINE_PER_CPU(unsigned int, stats_updates);
60062306a36Sopenharmony_cistatic atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
60162306a36Sopenharmony_cistatic atomic_t stats_flush_threshold = ATOMIC_INIT(0);
60262306a36Sopenharmony_cistatic u64 flush_next_time;
60362306a36Sopenharmony_ci
60462306a36Sopenharmony_ci#define FLUSH_TIME (2UL*HZ)
60562306a36Sopenharmony_ci
/*
 * Accessors to ensure that preemption is disabled on PREEMPT_RT, because on
 * PREEMPT_RT an acquired spinlock_t lock does not imply disabled preemption.
 * These functions are never used in hardirq context on PREEMPT_RT and
 * therefore disabling preemption is sufficient.
 */
/* Enter a stats-update section; on !PREEMPT_RT callers must have IRQs off. */
static void memcg_stats_lock(void)
{
	preempt_disable_nested();
	/* Sanity check: non-RT callers are expected to have disabled IRQs. */
	VM_WARN_ON_IRQS_ENABLED();
}
61762306a36Sopenharmony_ci
/* Like memcg_stats_lock() but without the IRQs-enabled sanity check. */
static void __memcg_stats_lock(void)
{
	preempt_disable_nested();
}
62262306a36Sopenharmony_ci
/* Counterpart of memcg_stats_lock()/__memcg_stats_lock(). */
static void memcg_stats_unlock(void)
{
	preempt_enable_nested();
}
62762306a36Sopenharmony_ci
/*
 * Record @val worth of stat updates for @memcg on this CPU and mark its
 * rstat tree dirty. Once a CPU accumulates more than MEMCG_CHARGE_BATCH
 * updates, bump the global stats_flush_threshold that readers consult to
 * decide whether a synchronous flush is worthwhile.
 */
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
	unsigned int x;

	if (!val)
		return;

	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());

	/* Count the magnitude: negative deltas are updates too. */
	x = __this_cpu_add_return(stats_updates, abs(val));
	if (x > MEMCG_CHARGE_BATCH) {
		/*
		 * If stats_flush_threshold exceeds the threshold
		 * (>num_online_cpus()), cgroup stats update will be triggered
		 * in __mem_cgroup_flush_stats(). Increasing this var further
		 * is redundant and simply adds overhead in atomic update.
		 */
		if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
			atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
		__this_cpu_write(stats_updates, 0);
	}
}
65062306a36Sopenharmony_ci
/*
 * Flush the rstat tree for the whole memcg hierarchy and reset the
 * update-counting state. Only one flusher runs at a time; concurrent
 * callers return immediately.
 */
static void do_flush_stats(void)
{
	/*
	 * We always flush the entire tree, so concurrent flushers can just
	 * skip. This avoids a thundering herd problem on the rstat global lock
	 * from memcg flushers (e.g. reclaim, refault, etc).
	 */
	if (atomic_read(&stats_flush_ongoing) ||
	    atomic_xchg(&stats_flush_ongoing, 1))
		return;

	/* Push the ratelimit window forward before the (slow) flush. */
	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);

	cgroup_rstat_flush(root_mem_cgroup->css.cgroup);

	atomic_set(&stats_flush_threshold, 0);
	atomic_set(&stats_flush_ongoing, 0);
}
66962306a36Sopenharmony_ci
67062306a36Sopenharmony_civoid mem_cgroup_flush_stats(void)
67162306a36Sopenharmony_ci{
67262306a36Sopenharmony_ci	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
67362306a36Sopenharmony_ci		do_flush_stats();
67462306a36Sopenharmony_ci}
67562306a36Sopenharmony_ci
67662306a36Sopenharmony_civoid mem_cgroup_flush_stats_ratelimited(void)
67762306a36Sopenharmony_ci{
67862306a36Sopenharmony_ci	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
67962306a36Sopenharmony_ci		mem_cgroup_flush_stats();
68062306a36Sopenharmony_ci}
68162306a36Sopenharmony_ci
/* Periodic worker: flush the stats and re-arm itself every FLUSH_TIME. */
static void flush_memcg_stats_dwork(struct work_struct *w)
{
	/*
	 * Always flush here so that flushing in latency-sensitive paths is
	 * as cheap as possible.
	 */
	do_flush_stats();
	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
69162306a36Sopenharmony_ci
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
	/* paging */
	PGPGIN,
	PGPGOUT,
	/* reclaim scan/steal */
	PGSCAN_KSWAPD,
	PGSCAN_DIRECT,
	PGSCAN_KHUGEPAGED,
	PGSTEAL_KSWAPD,
	PGSTEAL_DIRECT,
	PGSTEAL_KHUGEPAGED,
	/* faults and LRU state transitions */
	PGFAULT,
	PGMAJFAULT,
	PGREFILL,
	PGACTIVATE,
	PGDEACTIVATE,
	PGLAZYFREE,
	PGLAZYFREED,
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	/* zswap */
	ZSWPIN,
	ZSWPOUT,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* transparent hugepages */
	THP_FAULT_ALLOC,
	THP_COLLAPSE_ALLOC,
#endif
};
71862306a36Sopenharmony_ci
71962306a36Sopenharmony_ci#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
72062306a36Sopenharmony_cistatic int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_cistatic void init_memcg_events(void)
72362306a36Sopenharmony_ci{
72462306a36Sopenharmony_ci	int i;
72562306a36Sopenharmony_ci
72662306a36Sopenharmony_ci	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
72762306a36Sopenharmony_ci		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
72862306a36Sopenharmony_ci}
72962306a36Sopenharmony_ci
/*
 * Map a vm_event_item to its index in the memcg event arrays, or a
 * negative value if the event is not tracked per-memcg (table entries
 * are stored +1 so that 0 means "absent").
 */
static inline int memcg_events_index(enum vm_event_item idx)
{
	return mem_cgroup_events_index[idx] - 1;
}
73462306a36Sopenharmony_ci
/* Per-CPU statistics batch for one memory cgroup. */
struct memcg_vmstats_percpu {
	/* Local (CPU and cgroup) page state & events */
	long			state[MEMCG_NR_STAT];
	unsigned long		events[NR_MEMCG_EVENTS];

	/* Delta calculation for lockless upward propagation */
	long			state_prev[MEMCG_NR_STAT];
	unsigned long		events_prev[NR_MEMCG_EVENTS];

	/* Cgroup1: threshold notifications & softlimit tree updates */
	unsigned long		nr_page_events;
	unsigned long		targets[MEM_CGROUP_NTARGETS];
};
74862306a36Sopenharmony_ci
/* Aggregated statistics for one memory cgroup, maintained by rstat flushes. */
struct memcg_vmstats {
	/* Aggregated (CPU and subtree) page state & events */
	long			state[MEMCG_NR_STAT];
	unsigned long		events[NR_MEMCG_EVENTS];

	/* Non-hierarchical (CPU aggregated) page state & events */
	long			state_local[MEMCG_NR_STAT];
	unsigned long		events_local[NR_MEMCG_EVENTS];

	/* Pending child counts during tree propagation */
	long			state_pending[MEMCG_NR_STAT];
	unsigned long		events_pending[NR_MEMCG_EVENTS];
};
76262306a36Sopenharmony_ci
76362306a36Sopenharmony_ciunsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
76462306a36Sopenharmony_ci{
76562306a36Sopenharmony_ci	long x = READ_ONCE(memcg->vmstats->state[idx]);
76662306a36Sopenharmony_ci#ifdef CONFIG_SMP
76762306a36Sopenharmony_ci	if (x < 0)
76862306a36Sopenharmony_ci		x = 0;
76962306a36Sopenharmony_ci#endif
77062306a36Sopenharmony_ci	return x;
77162306a36Sopenharmony_ci}
77262306a36Sopenharmony_ci
/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 *
 * NOTE(review): uses non-atomic __this_cpu_add(), so callers appear to be
 * expected to run with preemption/IRQs disabled — confirm against callers.
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	if (mem_cgroup_disabled())
		return;

	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
	memcg_rstat_updated(memcg, val);
}
78762306a36Sopenharmony_ci
78862306a36Sopenharmony_ci/* idx can be of type enum memcg_stat_item or node_stat_item. */
78962306a36Sopenharmony_cistatic unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
79062306a36Sopenharmony_ci{
79162306a36Sopenharmony_ci	long x = READ_ONCE(memcg->vmstats->state_local[idx]);
79262306a36Sopenharmony_ci
79362306a36Sopenharmony_ci#ifdef CONFIG_SMP
79462306a36Sopenharmony_ci	if (x < 0)
79562306a36Sopenharmony_ci		x = 0;
79662306a36Sopenharmony_ci#endif
79762306a36Sopenharmony_ci	return x;
79862306a36Sopenharmony_ci}
79962306a36Sopenharmony_ci
/*
 * Update a node stat on both the owning memcg and the lruvec's per-node
 * per-CPU counters, then mark the rstat tree dirty.
 */
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/*
	 * The callers from rmap rely on disabled preemption because they never
	 * update their counter from in-interrupt context. For the rmap-managed
	 * counters below we check that the update is never performed from an
	 * interrupt context, while other callers need to have disabled
	 * interrupts.
	 */
	__memcg_stats_lock();
	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
		switch (idx) {
		case NR_ANON_MAPPED:
		case NR_FILE_MAPPED:
		case NR_ANON_THPS:
		case NR_SHMEM_PMDMAPPED:
		case NR_FILE_PMDMAPPED:
			/* rmap counters: must come from task context */
			WARN_ON_ONCE(!in_task());
			break;
		default:
			/* everything else: IRQs must be off */
			VM_WARN_ON_IRQS_ENABLED();
		}
	}

	/* Update memcg */
	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);

	memcg_rstat_updated(memcg, val);
	memcg_stats_unlock();
}
83962306a36Sopenharmony_ci
/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates the all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled()) {
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		/* Node-wide lruvecs have no memcg counters to update. */
		if (is_node_lruvec(lruvec))
			return;
#endif
		__mod_memcg_lruvec_state(lruvec, idx, val);
	}
}
86562306a36Sopenharmony_ci
/*
 * Update a node stat for @page, crediting the memcg (if any) that the
 * page's compound head is charged to.
 */
void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
			     int val)
{
	struct page *head = compound_head(page); /* rmap on tail pages */
	struct mem_cgroup *memcg;
	pg_data_t *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
	/*
	 * Non-protected file pages are accounted at the node level only —
	 * NOTE(review): presumably they live on the node-wide file LRU;
	 * confirm against the HYPERHOLD lruvec layout.
	 */
	if (is_file_page(page) && !is_prot_page(page)) {
		__mod_node_page_state(pgdat, idx, val);
		return;
	}
#endif

	rcu_read_lock();
	memcg = page_memcg(head);
	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg) {
		rcu_read_unlock();
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);
89562306a36Sopenharmony_ci
/*
 * Update a node stat for the kernel object @p, crediting the memcg that
 * the containing slab/page is charged to (if any).
 */
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_slab_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}
91962306a36Sopenharmony_ci
/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	int index = memcg_events_index(idx);

	/* Events not in memcg_vm_event_stat (index < 0) are not tracked. */
	if (mem_cgroup_disabled() || index < 0)
		return;
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	/* NOTE(review): callers can apparently pass a NULL memcg here — confirm */
	if (!memcg)
		return;
#endif

	memcg_stats_lock();
	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
	memcg_rstat_updated(memcg, count);
	memcg_stats_unlock();
}
94362306a36Sopenharmony_ci
94462306a36Sopenharmony_cistatic unsigned long memcg_events(struct mem_cgroup *memcg, int event)
94562306a36Sopenharmony_ci{
94662306a36Sopenharmony_ci	int index = memcg_events_index(event);
94762306a36Sopenharmony_ci
94862306a36Sopenharmony_ci	if (index < 0)
94962306a36Sopenharmony_ci		return 0;
95062306a36Sopenharmony_ci	return READ_ONCE(memcg->vmstats->events[index]);
95162306a36Sopenharmony_ci}
95262306a36Sopenharmony_ci
95362306a36Sopenharmony_cistatic unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
95462306a36Sopenharmony_ci{
95562306a36Sopenharmony_ci	int index = memcg_events_index(event);
95662306a36Sopenharmony_ci
95762306a36Sopenharmony_ci	if (index < 0)
95862306a36Sopenharmony_ci		return 0;
95962306a36Sopenharmony_ci
96062306a36Sopenharmony_ci	return READ_ONCE(memcg->vmstats->events_local[index]);
96162306a36Sopenharmony_ci}
96262306a36Sopenharmony_ci
/*
 * Account a charge (@nr_pages > 0) or uncharge (@nr_pages < 0) as a
 * PGPGIN/PGPGOUT event and feed nr_page_events, which drives the cgroup1
 * threshold/softlimit ratelimiting (see mem_cgroup_event_ratelimit()).
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}
97662306a36Sopenharmony_ci
/*
 * Return true when this CPU has accumulated enough page events since the
 * last time @target fired, and if so arm the next trigger point.
 * NOTE(review): uses non-atomic __this_cpu ops, so callers appear to be
 * expected to run with preemption disabled — confirm against callers.
 */
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}
100162306a36Sopenharmony_ci
/*
 * Check events in order.
 *
 * Fires cgroup1 threshold notifications and, less frequently, softlimit
 * tree updates after batches of page events. Disabled on PREEMPT_RT.
 */
static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, nid);
	}
}
102362306a36Sopenharmony_ci
/*
 * Return the memory cgroup @p belongs to, or NULL if @p is NULL. Takes
 * no reference; callers such as get_mem_cgroup_from_mm() invoke this
 * under rcu_read_lock() and pin the result themselves.
 */
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);
103762306a36Sopenharmony_ci
103862306a36Sopenharmony_cistatic __always_inline struct mem_cgroup *active_memcg(void)
103962306a36Sopenharmony_ci{
104062306a36Sopenharmony_ci	if (!in_task())
104162306a36Sopenharmony_ci		return this_cpu_read(int_active_memcg);
104262306a36Sopenharmony_ci	else
104362306a36Sopenharmony_ci		return current->active_memcg;
104462306a36Sopenharmony_ci}
104562306a36Sopenharmony_ci
/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and returns it if successful. If mm
 * is NULL, then the memcg is chosen as follows:
 * 1) The active memcg, if set.
 * 2) current->mm->memcg, if available
 * 3) root memcg
 * If mem_cgroup is disabled, NULL is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	/*
	 * Page cache insertions can happen without an
	 * actual mm context, e.g. during disk probing
	 * on boot, loopback IO, acct() writes etc.
	 *
	 * No need to css_get on root memcg as the reference
	 * counting is disabled on the root level in the
	 * cgroup core. See CSS_NO_REF.
	 */
	if (unlikely(!mm)) {
		memcg = active_memcg();
		if (unlikely(memcg)) {
			/* remote memcg must hold a ref */
			css_get(&memcg->css);
			return memcg;
		}
		mm = current->mm;
		if (unlikely(!mm))
			return root_mem_cgroup;
	}

	rcu_read_lock();
	do {
		/*
		 * The mm's owner (and thus its memcg) can change or die
		 * concurrently; re-do the lookup until css_tryget()
		 * succeeds on a live css.
		 */
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			memcg = root_mem_cgroup;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
109562306a36Sopenharmony_ci
109662306a36Sopenharmony_cistatic __always_inline bool memcg_kmem_bypass(void)
109762306a36Sopenharmony_ci{
109862306a36Sopenharmony_ci	/* Allow remote memcg charging from any context. */
109962306a36Sopenharmony_ci	if (unlikely(active_memcg()))
110062306a36Sopenharmony_ci		return false;
110162306a36Sopenharmony_ci
110262306a36Sopenharmony_ci	/* Memcg to charge can't be determined. */
110362306a36Sopenharmony_ci	if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
110462306a36Sopenharmony_ci		return true;
110562306a36Sopenharmony_ci
110662306a36Sopenharmony_ci	return false;
110762306a36Sopenharmony_ci}
110862306a36Sopenharmony_ci
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		/* Shared walks keep their cursor in the root's per-node state. */
		mz = root->nodeinfo[reclaim->pgdat->node_id];
		iter = &mz->iter;

		/*
		 * On start, join the current reclaim iteration cycle.
		 * Exit when a concurrent walker completes it.
		 */
		if (!prev)
			reclaim->generation = iter->generation;
		else if (reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	} else if (prev) {
		pos = prev;
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		if (css == &root->css || css_tryget(css)) {
			memcg = mem_cgroup_from_css(css);
			break;
		}
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		/* Walk completed: open a new generation for future walkers. */
		if (!memcg)
			iter->generation++;
	}

out_unlock:
	rcu_read_unlock();
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}
122662306a36Sopenharmony_ci
122762306a36Sopenharmony_ci/**
122862306a36Sopenharmony_ci * mem_cgroup_iter_break - abort a hierarchy walk prematurely
122962306a36Sopenharmony_ci * @root: hierarchy root
123062306a36Sopenharmony_ci * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
123162306a36Sopenharmony_ci */
123262306a36Sopenharmony_civoid mem_cgroup_iter_break(struct mem_cgroup *root,
123362306a36Sopenharmony_ci			   struct mem_cgroup *prev)
123462306a36Sopenharmony_ci{
123562306a36Sopenharmony_ci	if (!root)
123662306a36Sopenharmony_ci		root = root_mem_cgroup;
123762306a36Sopenharmony_ci	if (prev && prev != root)
123862306a36Sopenharmony_ci		css_put(&prev->css);
123962306a36Sopenharmony_ci}
124062306a36Sopenharmony_ci
124162306a36Sopenharmony_cistatic void __invalidate_reclaim_iterators(struct mem_cgroup *from,
124262306a36Sopenharmony_ci					struct mem_cgroup *dead_memcg)
124362306a36Sopenharmony_ci{
124462306a36Sopenharmony_ci	struct mem_cgroup_reclaim_iter *iter;
124562306a36Sopenharmony_ci	struct mem_cgroup_per_node *mz;
124662306a36Sopenharmony_ci	int nid;
124762306a36Sopenharmony_ci
124862306a36Sopenharmony_ci	for_each_node(nid) {
124962306a36Sopenharmony_ci		mz = from->nodeinfo[nid];
125062306a36Sopenharmony_ci		iter = &mz->iter;
125162306a36Sopenharmony_ci		cmpxchg(&iter->position, dead_memcg, NULL);
125262306a36Sopenharmony_ci	}
125362306a36Sopenharmony_ci}
125462306a36Sopenharmony_ci
125562306a36Sopenharmony_cistatic void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
125662306a36Sopenharmony_ci{
125762306a36Sopenharmony_ci	struct mem_cgroup *memcg = dead_memcg;
125862306a36Sopenharmony_ci	struct mem_cgroup *last;
125962306a36Sopenharmony_ci
126062306a36Sopenharmony_ci	do {
126162306a36Sopenharmony_ci		__invalidate_reclaim_iterators(memcg, dead_memcg);
126262306a36Sopenharmony_ci		last = memcg;
126362306a36Sopenharmony_ci	} while ((memcg = parent_mem_cgroup(memcg)));
126462306a36Sopenharmony_ci
126562306a36Sopenharmony_ci	/*
126662306a36Sopenharmony_ci	 * When cgroup1 non-hierarchy mode is used,
126762306a36Sopenharmony_ci	 * parent_mem_cgroup() does not walk all the way up to the
126862306a36Sopenharmony_ci	 * cgroup root (root_mem_cgroup). So we have to handle
126962306a36Sopenharmony_ci	 * dead_memcg from cgroup root separately.
127062306a36Sopenharmony_ci	 */
127162306a36Sopenharmony_ci	if (!mem_cgroup_is_root(last))
127262306a36Sopenharmony_ci		__invalidate_reclaim_iterators(root_mem_cgroup,
127362306a36Sopenharmony_ci						dead_memcg);
127462306a36Sopenharmony_ci}
127562306a36Sopenharmony_ci
/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop; otherwise it visits
 * every task in the hierarchy.
 *
 * This function must not be called for the root memory cgroup.
 */
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			   int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	/* Callers must never pass the root cgroup (see kerneldoc above). */
	BUG_ON(mem_cgroup_is_root(memcg));

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		/* CSS_TASK_ITER_PROCS: iterate processes, not every thread. */
		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			/*
			 * Abort the tree walk early; iter_break drops the
			 * reference the iterator holds on @iter.
			 */
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
}
131162306a36Sopenharmony_ci
#ifdef CONFIG_DEBUG_VM
/*
 * Sanity-check that @folio really belongs on @lruvec: a charged folio
 * must sit on its own memcg's lruvec, an uncharged one on the root's.
 */
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *folio_cg;

	if (mem_cgroup_disabled())
		return;

	folio_cg = folio_memcg(folio);

	if (folio_cg)
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != folio_cg, folio);
	else
		VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
}
#endif
132862306a36Sopenharmony_ci
132962306a36Sopenharmony_ci/**
133062306a36Sopenharmony_ci * folio_lruvec_lock - Lock the lruvec for a folio.
133162306a36Sopenharmony_ci * @folio: Pointer to the folio.
133262306a36Sopenharmony_ci *
133362306a36Sopenharmony_ci * These functions are safe to use under any of the following conditions:
133462306a36Sopenharmony_ci * - folio locked
133562306a36Sopenharmony_ci * - folio_test_lru false
133662306a36Sopenharmony_ci * - folio_memcg_lock()
133762306a36Sopenharmony_ci * - folio frozen (refcount of 0)
133862306a36Sopenharmony_ci *
133962306a36Sopenharmony_ci * Return: The lruvec this folio is on with its lock held.
134062306a36Sopenharmony_ci */
134162306a36Sopenharmony_cistruct lruvec *folio_lruvec_lock(struct folio *folio)
134262306a36Sopenharmony_ci{
134362306a36Sopenharmony_ci	struct lruvec *lruvec = folio_lruvec(folio);
134462306a36Sopenharmony_ci
134562306a36Sopenharmony_ci	spin_lock(&lruvec->lru_lock);
134662306a36Sopenharmony_ci	lruvec_memcg_debug(lruvec, folio);
134762306a36Sopenharmony_ci
134862306a36Sopenharmony_ci	return lruvec;
134962306a36Sopenharmony_ci}
135062306a36Sopenharmony_ci
135162306a36Sopenharmony_ci/**
135262306a36Sopenharmony_ci * folio_lruvec_lock_irq - Lock the lruvec for a folio.
135362306a36Sopenharmony_ci * @folio: Pointer to the folio.
135462306a36Sopenharmony_ci *
135562306a36Sopenharmony_ci * These functions are safe to use under any of the following conditions:
135662306a36Sopenharmony_ci * - folio locked
135762306a36Sopenharmony_ci * - folio_test_lru false
135862306a36Sopenharmony_ci * - folio_memcg_lock()
135962306a36Sopenharmony_ci * - folio frozen (refcount of 0)
136062306a36Sopenharmony_ci *
136162306a36Sopenharmony_ci * Return: The lruvec this folio is on with its lock held and interrupts
136262306a36Sopenharmony_ci * disabled.
136362306a36Sopenharmony_ci */
136462306a36Sopenharmony_cistruct lruvec *folio_lruvec_lock_irq(struct folio *folio)
136562306a36Sopenharmony_ci{
136662306a36Sopenharmony_ci	struct lruvec *lruvec = folio_lruvec(folio);
136762306a36Sopenharmony_ci
136862306a36Sopenharmony_ci	spin_lock_irq(&lruvec->lru_lock);
136962306a36Sopenharmony_ci	lruvec_memcg_debug(lruvec, folio);
137062306a36Sopenharmony_ci
137162306a36Sopenharmony_ci	return lruvec;
137262306a36Sopenharmony_ci}
137362306a36Sopenharmony_ci
137462306a36Sopenharmony_ci/**
137562306a36Sopenharmony_ci * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
137662306a36Sopenharmony_ci * @folio: Pointer to the folio.
137762306a36Sopenharmony_ci * @flags: Pointer to irqsave flags.
137862306a36Sopenharmony_ci *
137962306a36Sopenharmony_ci * These functions are safe to use under any of the following conditions:
138062306a36Sopenharmony_ci * - folio locked
138162306a36Sopenharmony_ci * - folio_test_lru false
138262306a36Sopenharmony_ci * - folio_memcg_lock()
138362306a36Sopenharmony_ci * - folio frozen (refcount of 0)
138462306a36Sopenharmony_ci *
138562306a36Sopenharmony_ci * Return: The lruvec this folio is on with its lock held and interrupts
138662306a36Sopenharmony_ci * disabled.
138762306a36Sopenharmony_ci */
138862306a36Sopenharmony_cistruct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
138962306a36Sopenharmony_ci		unsigned long *flags)
139062306a36Sopenharmony_ci{
139162306a36Sopenharmony_ci	struct lruvec *lruvec = folio_lruvec(folio);
139262306a36Sopenharmony_ci
139362306a36Sopenharmony_ci	spin_lock_irqsave(&lruvec->lru_lock, *flags);
139462306a36Sopenharmony_ci	lruvec_memcg_debug(lruvec, folio);
139562306a36Sopenharmony_ci
139662306a36Sopenharmony_ci	return lruvec;
139762306a36Sopenharmony_ci}
139862306a36Sopenharmony_ci
/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
	/* Node-level lruvecs carry no memcg accounting; nothing to update. */
	if (is_node_lruvec(lruvec))
		return;
#endif

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	/*
	 * Removals are applied *before* the underflow check below so a
	 * bug in this very call is caught; additions are applied after
	 * the check (and after a possible clamp back to zero).
	 */
	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		/* Clamp so accounting stays usable after the warning. */
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}
144162306a36Sopenharmony_ci
144262306a36Sopenharmony_ci/**
144362306a36Sopenharmony_ci * mem_cgroup_margin - calculate chargeable space of a memory cgroup
144462306a36Sopenharmony_ci * @memcg: the memory cgroup
144562306a36Sopenharmony_ci *
144662306a36Sopenharmony_ci * Returns the maximum amount of memory @mem can be charged with, in
144762306a36Sopenharmony_ci * pages.
144862306a36Sopenharmony_ci */
144962306a36Sopenharmony_cistatic unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
145062306a36Sopenharmony_ci{
145162306a36Sopenharmony_ci	unsigned long margin = 0;
145262306a36Sopenharmony_ci	unsigned long count;
145362306a36Sopenharmony_ci	unsigned long limit;
145462306a36Sopenharmony_ci
145562306a36Sopenharmony_ci	count = page_counter_read(&memcg->memory);
145662306a36Sopenharmony_ci	limit = READ_ONCE(memcg->memory.max);
145762306a36Sopenharmony_ci	if (count < limit)
145862306a36Sopenharmony_ci		margin = limit - count;
145962306a36Sopenharmony_ci
146062306a36Sopenharmony_ci	if (do_memsw_account()) {
146162306a36Sopenharmony_ci		count = page_counter_read(&memcg->memsw);
146262306a36Sopenharmony_ci		limit = READ_ONCE(memcg->memsw.max);
146362306a36Sopenharmony_ci		if (count < limit)
146462306a36Sopenharmony_ci			margin = min(margin, limit - count);
146562306a36Sopenharmony_ci		else
146662306a36Sopenharmony_ci			margin = 0;
146762306a36Sopenharmony_ci	}
146862306a36Sopenharmony_ci
146962306a36Sopenharmony_ci	return margin;
147062306a36Sopenharmony_ci}
147162306a36Sopenharmony_ci
147262306a36Sopenharmony_ci/*
147362306a36Sopenharmony_ci * A routine for checking "mem" is under move_account() or not.
147462306a36Sopenharmony_ci *
147562306a36Sopenharmony_ci * Checking a cgroup is mc.from or mc.to or under hierarchy of
147662306a36Sopenharmony_ci * moving cgroups. This is for waiting at high-memory pressure
147762306a36Sopenharmony_ci * caused by "move".
147862306a36Sopenharmony_ci */
147962306a36Sopenharmony_cistatic bool mem_cgroup_under_move(struct mem_cgroup *memcg)
148062306a36Sopenharmony_ci{
148162306a36Sopenharmony_ci	struct mem_cgroup *from;
148262306a36Sopenharmony_ci	struct mem_cgroup *to;
148362306a36Sopenharmony_ci	bool ret = false;
148462306a36Sopenharmony_ci	/*
148562306a36Sopenharmony_ci	 * Unlike task_move routines, we access mc.to, mc.from not under
148662306a36Sopenharmony_ci	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
148762306a36Sopenharmony_ci	 */
148862306a36Sopenharmony_ci	spin_lock(&mc.lock);
148962306a36Sopenharmony_ci	from = mc.from;
149062306a36Sopenharmony_ci	to = mc.to;
149162306a36Sopenharmony_ci	if (!from)
149262306a36Sopenharmony_ci		goto unlock;
149362306a36Sopenharmony_ci
149462306a36Sopenharmony_ci	ret = mem_cgroup_is_descendant(from, memcg) ||
149562306a36Sopenharmony_ci		mem_cgroup_is_descendant(to, memcg);
149662306a36Sopenharmony_ciunlock:
149762306a36Sopenharmony_ci	spin_unlock(&mc.lock);
149862306a36Sopenharmony_ci	return ret;
149962306a36Sopenharmony_ci}
150062306a36Sopenharmony_ci
150162306a36Sopenharmony_cistatic bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
150262306a36Sopenharmony_ci{
150362306a36Sopenharmony_ci	if (mc.moving_task && current != mc.moving_task) {
150462306a36Sopenharmony_ci		if (mem_cgroup_under_move(memcg)) {
150562306a36Sopenharmony_ci			DEFINE_WAIT(wait);
150662306a36Sopenharmony_ci			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
150762306a36Sopenharmony_ci			/* moving charge context might have finished. */
150862306a36Sopenharmony_ci			if (mc.moving_task)
150962306a36Sopenharmony_ci				schedule();
151062306a36Sopenharmony_ci			finish_wait(&mc.waitq, &wait);
151162306a36Sopenharmony_ci			return true;
151262306a36Sopenharmony_ci		}
151362306a36Sopenharmony_ci	}
151462306a36Sopenharmony_ci	return false;
151562306a36Sopenharmony_ci}
151662306a36Sopenharmony_ci
/* One row of the memory.stat output: printed name plus stat item index. */
struct memory_stat {
	const char *name;	/* key emitted in memory.stat */
	unsigned int idx;	/* stat item passed to memcg_page_state() */
};
152162306a36Sopenharmony_ci
/*
 * Table driving memcg_stat_format(): entries are printed in order, so
 * the sequence below defines the layout of the memory.stat file.
 */
static const struct memory_stat memory_stats[] = {
	{ "anon",			NR_ANON_MAPPED			},
	{ "file",			NR_FILE_PAGES			},
	{ "kernel",			MEMCG_KMEM			},
	{ "kernel_stack",		NR_KERNEL_STACK_KB		},
	{ "pagetables",			NR_PAGETABLE			},
	{ "sec_pagetables",		NR_SECONDARY_PAGETABLE		},
	{ "percpu",			MEMCG_PERCPU_B			},
	{ "sock",			MEMCG_SOCK			},
	{ "vmalloc",			MEMCG_VMALLOC			},
	{ "shmem",			NR_SHMEM			},
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	{ "zswap",			MEMCG_ZSWAP_B			},
	{ "zswapped",			MEMCG_ZSWAPPED			},
#endif
	{ "file_mapped",		NR_FILE_MAPPED			},
	{ "file_dirty",			NR_FILE_DIRTY			},
	{ "file_writeback",		NR_WRITEBACK			},
#ifdef CONFIG_SWAP
	{ "swapcached",			NR_SWAPCACHE			},
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp",			NR_ANON_THPS			},
	{ "file_thp",			NR_FILE_THPS			},
	{ "shmem_thp",			NR_SHMEM_THPS			},
#endif
	{ "inactive_anon",		NR_INACTIVE_ANON		},
	{ "active_anon",		NR_ACTIVE_ANON			},
	{ "inactive_file",		NR_INACTIVE_FILE		},
	{ "active_file",		NR_ACTIVE_FILE			},
	{ "unevictable",		NR_UNEVICTABLE			},
	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B		},
	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B		},

	/* The memory events */
	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON		},
	{ "workingset_refault_file",	WORKINGSET_REFAULT_FILE		},
	{ "workingset_activate_anon",	WORKINGSET_ACTIVATE_ANON	},
	{ "workingset_activate_file",	WORKINGSET_ACTIVATE_FILE	},
	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON		},
	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE		},
	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM		},
};
156562306a36Sopenharmony_ci
/* Translate stat items to the correct unit for memory.stat output */
static int memcg_page_state_unit(int item)
{
	switch (item) {
	/* These are emitted as-is: no per-page scaling is applied. */
	case MEMCG_PERCPU_B:
	case MEMCG_ZSWAP_B:
	case NR_SLAB_RECLAIMABLE_B:
	case NR_SLAB_UNRECLAIMABLE_B:
	case WORKINGSET_REFAULT_ANON:
	case WORKINGSET_REFAULT_FILE:
	case WORKINGSET_ACTIVATE_ANON:
	case WORKINGSET_ACTIVATE_FILE:
	case WORKINGSET_RESTORE_ANON:
	case WORKINGSET_RESTORE_FILE:
	case WORKINGSET_NODERECLAIM:
		return 1;
	case NR_KERNEL_STACK_KB:
		/* Tracked in KiB, scale up to bytes. */
		return SZ_1K;
	default:
		/* Everything else is tracked as a page count. */
		return PAGE_SIZE;
	}
}
158862306a36Sopenharmony_ci
/* Scale a raw memcg stat counter into the unit used by memory.stat. */
static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
						    int item)
{
	unsigned long count = memcg_page_state(memcg, item);

	return count * memcg_page_state_unit(item);
}
159462306a36Sopenharmony_ci
static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	int i;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
	mem_cgroup_flush_stats();

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		u64 size;

		size = memcg_page_state_output(memcg, memory_stats[i].idx);
		seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);

		/* Also emit a combined "slab" line right after the parts. */
		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
			size += memcg_page_state_output(memcg,
							NR_SLAB_RECLAIMABLE_B);
			seq_buf_printf(s, "slab %llu\n", size);
		}
	}

	/* Accumulated memory events */
	seq_buf_printf(s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT) +
		       memcg_events(memcg, PGSCAN_KHUGEPAGED));
	seq_buf_printf(s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT) +
		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));

	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
		/* pgpgin/pgpgout are handled above and skipped here. */
		if (memcg_vm_event_stat[i] == PGPGIN ||
		    memcg_vm_event_stat[i] == PGPGOUT)
			continue;

		seq_buf_printf(s, "%s %lu\n",
			       vm_event_name(memcg_vm_event_stat[i]),
			       memcg_events(memcg, memcg_vm_event_stat[i]));
	}

	/* The above should easily fit into one page */
	WARN_ON_ONCE(seq_buf_has_overflowed(s));
}
164762306a36Sopenharmony_ci
164862306a36Sopenharmony_cistatic void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
164962306a36Sopenharmony_ci
165062306a36Sopenharmony_cistatic void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
165162306a36Sopenharmony_ci{
165262306a36Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
165362306a36Sopenharmony_ci		memcg_stat_format(memcg, s);
165462306a36Sopenharmony_ci	else
165562306a36Sopenharmony_ci		memcg1_stat_format(memcg, s);
165662306a36Sopenharmony_ci	WARN_ON_ONCE(seq_buf_has_overflowed(s));
165762306a36Sopenharmony_ci}
165862306a36Sopenharmony_ci
/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	/* NOTE(review): rcu_read_lock() presumably guards the task_cgroup()
	 * lookup and cgroup path printing below — confirm against callers. */
	rcu_read_lock();

	if (memcg) {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	} else
		pr_cont(",global_oom");	/* no memcg: system-wide OOM */
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}
	rcu_read_unlock();
}
168362306a36Sopenharmony_ci
168462306a36Sopenharmony_ci/**
168562306a36Sopenharmony_ci * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
168662306a36Sopenharmony_ci * memory controller.
168762306a36Sopenharmony_ci * @memcg: The memory cgroup that went over limit
168862306a36Sopenharmony_ci */
168962306a36Sopenharmony_civoid mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
169062306a36Sopenharmony_ci{
169162306a36Sopenharmony_ci	/* Use static buffer, for the caller is holding oom_lock. */
169262306a36Sopenharmony_ci	static char buf[PAGE_SIZE];
169362306a36Sopenharmony_ci	struct seq_buf s;
169462306a36Sopenharmony_ci
169562306a36Sopenharmony_ci	lockdep_assert_held(&oom_lock);
169662306a36Sopenharmony_ci
169762306a36Sopenharmony_ci	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
169862306a36Sopenharmony_ci		K((u64)page_counter_read(&memcg->memory)),
169962306a36Sopenharmony_ci		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
170062306a36Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
170162306a36Sopenharmony_ci		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
170262306a36Sopenharmony_ci			K((u64)page_counter_read(&memcg->swap)),
170362306a36Sopenharmony_ci			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
170462306a36Sopenharmony_ci	else {
170562306a36Sopenharmony_ci		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
170662306a36Sopenharmony_ci			K((u64)page_counter_read(&memcg->memsw)),
170762306a36Sopenharmony_ci			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
170862306a36Sopenharmony_ci		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
170962306a36Sopenharmony_ci			K((u64)page_counter_read(&memcg->kmem)),
171062306a36Sopenharmony_ci			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
171162306a36Sopenharmony_ci	}
171262306a36Sopenharmony_ci
171362306a36Sopenharmony_ci	pr_info("Memory cgroup stats for ");
171462306a36Sopenharmony_ci	pr_cont_cgroup_path(memcg->css.cgroup);
171562306a36Sopenharmony_ci	pr_cont(":");
171662306a36Sopenharmony_ci	seq_buf_init(&s, buf, sizeof(buf));
171762306a36Sopenharmony_ci	memory_stat_format(memcg, &s);
171862306a36Sopenharmony_ci	seq_buf_do_printk(&s, KERN_INFO);
171962306a36Sopenharmony_ci}
172062306a36Sopenharmony_ci
172162306a36Sopenharmony_ci/*
172262306a36Sopenharmony_ci * Return the memory (and swap, if configured) limit for a memcg.
172362306a36Sopenharmony_ci */
172462306a36Sopenharmony_ciunsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
172562306a36Sopenharmony_ci{
172662306a36Sopenharmony_ci	unsigned long max = READ_ONCE(memcg->memory.max);
172762306a36Sopenharmony_ci
172862306a36Sopenharmony_ci	if (do_memsw_account()) {
172962306a36Sopenharmony_ci		if (mem_cgroup_swappiness(memcg)) {
173062306a36Sopenharmony_ci			/* Calculate swap excess capacity from memsw limit */
173162306a36Sopenharmony_ci			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
173262306a36Sopenharmony_ci
173362306a36Sopenharmony_ci			max += min(swap, (unsigned long)total_swap_pages);
173462306a36Sopenharmony_ci		}
173562306a36Sopenharmony_ci	} else {
173662306a36Sopenharmony_ci		if (mem_cgroup_swappiness(memcg))
173762306a36Sopenharmony_ci			max += min(READ_ONCE(memcg->swap.max),
173862306a36Sopenharmony_ci				   (unsigned long)total_swap_pages);
173962306a36Sopenharmony_ci	}
174062306a36Sopenharmony_ci	return max;
174162306a36Sopenharmony_ci}
174262306a36Sopenharmony_ci
/* Return the number of pages currently charged to @memcg's memory counter. */
unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
	return page_counter_read(&memcg->memory);
}
174762306a36Sopenharmony_ci
174862306a36Sopenharmony_cistatic bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
174962306a36Sopenharmony_ci				     int order)
175062306a36Sopenharmony_ci{
175162306a36Sopenharmony_ci	struct oom_control oc = {
175262306a36Sopenharmony_ci		.zonelist = NULL,
175362306a36Sopenharmony_ci		.nodemask = NULL,
175462306a36Sopenharmony_ci		.memcg = memcg,
175562306a36Sopenharmony_ci		.gfp_mask = gfp_mask,
175662306a36Sopenharmony_ci		.order = order,
175762306a36Sopenharmony_ci	};
175862306a36Sopenharmony_ci	bool ret = true;
175962306a36Sopenharmony_ci
176062306a36Sopenharmony_ci	if (mutex_lock_killable(&oom_lock))
176162306a36Sopenharmony_ci		return true;
176262306a36Sopenharmony_ci
176362306a36Sopenharmony_ci	if (mem_cgroup_margin(memcg) >= (1 << order))
176462306a36Sopenharmony_ci		goto unlock;
176562306a36Sopenharmony_ci
176662306a36Sopenharmony_ci	/*
176762306a36Sopenharmony_ci	 * A few threads which were not waiting at mutex_lock_killable() can
176862306a36Sopenharmony_ci	 * fail to bail out. Therefore, check again after holding oom_lock.
176962306a36Sopenharmony_ci	 */
177062306a36Sopenharmony_ci	ret = task_is_dying() || out_of_memory(&oc);
177162306a36Sopenharmony_ci
177262306a36Sopenharmony_ciunlock:
177362306a36Sopenharmony_ci	mutex_unlock(&oom_lock);
177462306a36Sopenharmony_ci	return ret;
177562306a36Sopenharmony_ci}
177662306a36Sopenharmony_ci
/*
 * Reclaim pages from the hierarchy below @root_memcg on @pgdat until the
 * group is back under its soft limit or the walk gives up.  Returns the
 * number of reclaimed pages; scanned pages accumulate into *total_scanned.
 */
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;		/* pages reclaimed so far */
	int loop = 0;		/* completed passes over the hierarchy */
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	/* Pages over the soft limit: the reclaim target for this call. */
	excess = soft_limit_excess(root_memcg);

	while (1) {
		/* Shared iterator; a NULL victim marks the end of one pass. */
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not to excessive so as to
				 * reclaim too much, nor too less that we keep
				 * coming back to reclaim from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		/* Stop as soon as the group is back under its soft limit. */
		if (!soft_limit_excess(root_memcg))
			break;
	}
	/* Drop the iterator's reference on the last visited group. */
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}
182662306a36Sopenharmony_ci
#ifdef CONFIG_LOCKDEP
/*
 * Dummy lockdep map so the hierarchical OOM "lock" (the per-memcg
 * ->oom_lock flags below) is visible to lockdep as one lock class.
 */
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

/* Protects ->oom_lock and ->under_oom across all memory cgroups. */
static DEFINE_SPINLOCK(memcg_oom_lock);
183462306a36Sopenharmony_ci
/*
 * Try to take the hierarchical OOM lock for @memcg's subtree by setting
 * ->oom_lock on every cgroup in it.  If an OOM killer is already running
 * somewhere in our hierarchy (some ->oom_lock already set), undo the
 * partial locking and return false; return true on success.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree.
		 * Walk in the same pre-order and stop at the cgroup
		 * where the first walk stopped.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}
187762306a36Sopenharmony_ci
187862306a36Sopenharmony_cistatic void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
187962306a36Sopenharmony_ci{
188062306a36Sopenharmony_ci	struct mem_cgroup *iter;
188162306a36Sopenharmony_ci
188262306a36Sopenharmony_ci	spin_lock(&memcg_oom_lock);
188362306a36Sopenharmony_ci	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
188462306a36Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg)
188562306a36Sopenharmony_ci		iter->oom_lock = false;
188662306a36Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
188762306a36Sopenharmony_ci}
188862306a36Sopenharmony_ci
188962306a36Sopenharmony_cistatic void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
189062306a36Sopenharmony_ci{
189162306a36Sopenharmony_ci	struct mem_cgroup *iter;
189262306a36Sopenharmony_ci
189362306a36Sopenharmony_ci	spin_lock(&memcg_oom_lock);
189462306a36Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg)
189562306a36Sopenharmony_ci		iter->under_oom++;
189662306a36Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
189762306a36Sopenharmony_ci}
189862306a36Sopenharmony_ci
189962306a36Sopenharmony_cistatic void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
190062306a36Sopenharmony_ci{
190162306a36Sopenharmony_ci	struct mem_cgroup *iter;
190262306a36Sopenharmony_ci
190362306a36Sopenharmony_ci	/*
190462306a36Sopenharmony_ci	 * Be careful about under_oom underflows because a child memcg
190562306a36Sopenharmony_ci	 * could have been added after mem_cgroup_mark_under_oom.
190662306a36Sopenharmony_ci	 */
190762306a36Sopenharmony_ci	spin_lock(&memcg_oom_lock);
190862306a36Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg)
190962306a36Sopenharmony_ci		if (iter->under_oom > 0)
191062306a36Sopenharmony_ci			iter->under_oom--;
191162306a36Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
191262306a36Sopenharmony_ci}
191362306a36Sopenharmony_ci
/* Waitqueue for tasks sleeping on a userspace-handled memcg OOM. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/* Per-waiter entry: which memcg's OOM resolution the task waits for. */
struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t	wait;
};
192062306a36Sopenharmony_ci
/*
 * Wake callback for memcg_oom_waitq.  Only wake the waiter when the
 * OOMing cgroup (@arg) and the cgroup the waiter blocked on are on the
 * same hierarchy branch, i.e. one is a descendant of the other.
 */
static int memcg_oom_wake_function(wait_queue_entry_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	/* unrelated branches of the hierarchy do not affect each other */
	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}
193662306a36Sopenharmony_ci
193762306a36Sopenharmony_cistatic void memcg_oom_recover(struct mem_cgroup *memcg)
193862306a36Sopenharmony_ci{
193962306a36Sopenharmony_ci	/*
194062306a36Sopenharmony_ci	 * For the following lockless ->under_oom test, the only required
194162306a36Sopenharmony_ci	 * guarantee is that it must see the state asserted by an OOM when
194262306a36Sopenharmony_ci	 * this function is called as a result of userland actions
194362306a36Sopenharmony_ci	 * triggered by the notification of the OOM.  This is trivially
194462306a36Sopenharmony_ci	 * achieved by invoking mem_cgroup_mark_under_oom() before
194562306a36Sopenharmony_ci	 * triggering notification.
194662306a36Sopenharmony_ci	 */
194762306a36Sopenharmony_ci	if (memcg && memcg->under_oom)
194862306a36Sopenharmony_ci		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
194962306a36Sopenharmony_ci}
195062306a36Sopenharmony_ci
/*
 * Returns true if successfully killed one or more processes. Though in some
 * corner cases it can return true even without killing any process.
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked, ret;

	/* high-order (costly) allocation failures do not trigger OOM */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, in-kernel OOM killer allows for an async victim
	 * memory reclaim (oom_reaper) and that means that we are not solely
	 * relying on the oom victim to make a forward progress and we can
	 * invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (READ_ONCE(memcg->oom_kill_disable)) {
		if (current->in_user_fault) {
			/*
			 * Stash the OOM context on the task; it is consumed
			 * by mem_cgroup_oom_synchronize() at the end of the
			 * page fault.  The css reference is dropped there.
			 */
			css_get(&memcg->css);
			current->memcg_in_oom = memcg;
			current->memcg_oom_gfp_mask = mask;
			current->memcg_oom_order = order;
		}
		return false;
	}

	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	/* only the lock holder notifies, to avoid duplicate OOM events */
	if (locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);
	ret = mem_cgroup_out_of_memory(memcg, mask, order);

	if (locked)
		mem_cgroup_oom_unlock(memcg);

	return ret;
}
200762306a36Sopenharmony_ci
/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	/* OOM context stashed by mem_cgroup_oom() during the charge */
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	/* queue ourselves before marking under_oom so no wakeup is missed */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	/* only the lock holder notifies, to avoid duplicate OOM events */
	if (locked)
		mem_cgroup_oom_notify(memcg);

	schedule();
	mem_cgroup_unmark_under_oom(memcg);
	finish_wait(&memcg_oom_waitq, &owait.wait);

	if (locked)
		mem_cgroup_oom_unlock(memcg);
cleanup:
	current->memcg_in_oom = NULL;
	/* drop the reference taken in mem_cgroup_oom() */
	css_put(&memcg->css);
	return true;
}
206362306a36Sopenharmony_ci
/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
					    struct mem_cgroup *oom_domain)
{
	struct mem_cgroup *oom_group = NULL;
	struct mem_cgroup *memcg;

	/* memory.oom.group only exists on the unified (cgroup v2) hierarchy */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return NULL;

	if (!oom_domain)
		oom_domain = root_mem_cgroup;

	rcu_read_lock();

	memcg = mem_cgroup_from_task(victim);
	/* tasks in the root cgroup are never group-killed */
	if (mem_cgroup_is_root(memcg))
		goto out;

	/*
	 * If the victim task has been asynchronously moved to a different
	 * memory cgroup, we might end up killing tasks outside oom_domain.
	 * In this case it's better to ignore memory.group.oom.
	 */
	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
		goto out;

	/*
	 * Traverse the memory cgroup hierarchy from the victim task's
	 * cgroup up to the OOMing cgroup (or root) to find the
	 * highest-level memory cgroup with oom.group set.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		if (READ_ONCE(memcg->oom_group))
			oom_group = memcg;

		if (memcg == oom_domain)
			break;
	}

	if (oom_group)
		css_get(&oom_group->css);
out:
	rcu_read_unlock();

	return oom_group;
}
212062306a36Sopenharmony_ci
/* Log that @memcg's tasks will be group-killed due to memory.oom.group. */
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
	pr_info("Tasks in ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(" are going to be killed due to memory.oom.group set\n");
}
212762306a36Sopenharmony_ci
/**
 * folio_memcg_lock - Bind a folio to its memcg.
 * @folio: The folio.
 *
 * This function prevents unlocked LRU folios from being moved to
 * another cgroup.
 *
 * It ensures lifetime of the bound memcg.  The caller is responsible
 * for the lifetime of the folio.
 */
void folio_memcg_lock(struct folio *folio)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	/*
	 * The RCU lock is held throughout the transaction.  The fast
	 * path can get away without acquiring the memcg->move_lock
	 * because page moving starts with an RCU grace period.
	 */
	rcu_read_lock();

	if (mem_cgroup_disabled())
		return;
again:
	memcg = folio_memcg(folio);
	if (unlikely(!memcg))
		return;

#ifdef CONFIG_PROVE_LOCKING
	/* teach lockdep that move_lock may be taken with IRQs disabled */
	local_irq_save(flags);
	might_lock(&memcg->move_lock);
	local_irq_restore(flags);
#endif

	/* fast path: no charge migration in flight, RCU alone suffices */
	if (atomic_read(&memcg->moving_account) <= 0)
		return;

	spin_lock_irqsave(&memcg->move_lock, flags);
	/* the folio may have been moved while we waited for the lock */
	if (memcg != folio_memcg(folio)) {
		spin_unlock_irqrestore(&memcg->move_lock, flags);
		goto again;
	}

	/*
	 * When charge migration first begins, we can have multiple
	 * critical sections holding the fast-path RCU lock and one
	 * holding the slowpath move_lock. Track the task who has the
	 * move_lock for folio_memcg_unlock().
	 */
	memcg->move_lock_task = current;
	memcg->move_lock_flags = flags;
}
218162306a36Sopenharmony_ci
/*
 * Counterpart to folio_memcg_lock(): if the current task took @memcg's
 * move_lock on the slow path, release it and restore the saved IRQ
 * flags; in all cases drop the RCU read lock taken by the fast path.
 */
static void __folio_memcg_unlock(struct mem_cgroup *memcg)
{
	if (memcg && memcg->move_lock_task == current) {
		unsigned long flags = memcg->move_lock_flags;

		memcg->move_lock_task = NULL;
		memcg->move_lock_flags = 0;

		spin_unlock_irqrestore(&memcg->move_lock, flags);
	}

	rcu_read_unlock();
}
219562306a36Sopenharmony_ci
/**
 * folio_memcg_unlock - Release the binding between a folio and its memcg.
 * @folio: The folio.
 *
 * This releases the binding created by folio_memcg_lock().  This does
 * not change the accounting of this folio to its memcg, but it does
 * permit others to change it.
 */
void folio_memcg_unlock(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	__folio_memcg_unlock(memcg);
}
220862306a36Sopenharmony_ci
/* Per-CPU cache ("stock") of pre-charged pages for a single memcg. */
struct memcg_stock_pcp {
	local_lock_t stock_lock;	/* serializes access to this stock */
	struct mem_cgroup *cached; /* this never be root cgroup */
	unsigned int nr_pages;		/* pre-charged pages held for ->cached */

#ifdef CONFIG_MEMCG_KMEM
	/* byte-granular obj_cgroup stock for kmem/slab accounting */
	struct obj_cgroup *cached_objcg;
	struct pglist_data *cached_pgdat;
	unsigned int nr_bytes;
	int nr_slab_reclaimable_b;
	int nr_slab_unreclaimable_b;
#endif

	struct work_struct work;	/* deferred drain, see drain_all_stock() */
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
	.stock_lock = INIT_LOCAL_LOCK(stock_lock),
};
/* Allows only one system-wide drain_all_stock() to run at a time. */
static DEFINE_MUTEX(percpu_charge_mutex);
223062306a36Sopenharmony_ci
#ifdef CONFIG_MEMCG_KMEM
/* Implemented later in this file for the kmem-accounting case. */
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg);
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);

#else
/* !CONFIG_MEMCG_KMEM: no obj_cgroup stock exists, nothing to hand back. */
static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
	return NULL;
}
/* !CONFIG_MEMCG_KMEM: the obj stock never requires flushing. */
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg)
{
	return false;
}
/* !CONFIG_MEMCG_KMEM: kmem accounting is a no-op. */
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
{
}
#endif
225162306a36Sopenharmony_ci
/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool ret = false;

	/* oversized requests bypass the stock entirely */
	if (nr_pages > MEMCG_CHARGE_BATCH)
		return ret;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
		stock->nr_pages -= nr_pages;
		ret = true;
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);

	return ret;
}
228462306a36Sopenharmony_ci
/*
 * Returns stocks cached in percpu and reset cached information:
 * uncharge any pre-charged pages back to the page counters and drop
 * the css reference pinning the cached memcg.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = READ_ONCE(stock->cached);

	if (!old)
		return;

	if (stock->nr_pages) {
		page_counter_uncharge(&old->memory, stock->nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&old->memsw, stock->nr_pages);
		stock->nr_pages = 0;
	}

	/* release the reference taken when the memcg was cached */
	css_put(&old->css);
	WRITE_ONCE(stock->cached, NULL);
}
230562306a36Sopenharmony_ci
/* Work callback: flush this CPU's page and obj_cgroup charge caches. */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;

	/*
	 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
	 * drain_stock races is that we always operate on local CPU stock
	 * here with IRQ disabled
	 */
	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	old = drain_obj_stock(stock);
	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	/* put the objcg reference outside the IRQ-disabled section */
	if (old)
		obj_cgroup_put(old);
}
232862306a36Sopenharmony_ci
/*
 * Cache charges(val) to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 * Caller must hold memcg_stock.stock_lock (see refill_stock()).
 */
static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;

	stock = this_cpu_ptr(&memcg_stock);
	if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
		drain_stock(stock);
		/* pin the newly cached memcg; dropped in drain_stock() */
		css_get(&memcg->css);
		WRITE_ONCE(stock->cached, memcg);
	}
	stock->nr_pages += nr_pages;

	/* keep the cache bounded to one charge batch */
	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
		drain_stock(stock);
}
234862306a36Sopenharmony_ci
234962306a36Sopenharmony_cistatic void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
235062306a36Sopenharmony_ci{
235162306a36Sopenharmony_ci	unsigned long flags;
235262306a36Sopenharmony_ci
235362306a36Sopenharmony_ci	local_lock_irqsave(&memcg_stock.stock_lock, flags);
235462306a36Sopenharmony_ci	__refill_stock(memcg, nr_pages);
235562306a36Sopenharmony_ci	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
235662306a36Sopenharmony_ci}
235762306a36Sopenharmony_ci
/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid adding running more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/*
	 * Notify other cpus that system-wide "drain" is running
	 * We do not care about races with the cpu hotplug because cpu down
	 * as well as workers from this path always operate on the local
	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
	 */
	migrate_disable();
	curcpu = smp_processor_id();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;
		bool flush = false;

		/* RCU keeps the cached memcg alive while we inspect it */
		rcu_read_lock();
		memcg = READ_ONCE(stock->cached);
		if (memcg && stock->nr_pages &&
		    mem_cgroup_is_descendant(memcg, root_memcg))
			flush = true;
		else if (obj_stock_flush_required(stock, root_memcg))
			flush = true;
		rcu_read_unlock();

		/* the FLUSHING bit ensures only one drain per stock at a time */
		if (flush &&
		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else if (!cpu_is_isolated(cpu))
				schedule_work_on(cpu, &stock->work);
		}
	}
	migrate_enable();
	mutex_unlock(&percpu_charge_mutex);
}
240262306a36Sopenharmony_ci
240362306a36Sopenharmony_cistatic int memcg_hotplug_cpu_dead(unsigned int cpu)
240462306a36Sopenharmony_ci{
240562306a36Sopenharmony_ci	struct memcg_stock_pcp *stock;
240662306a36Sopenharmony_ci
240762306a36Sopenharmony_ci	stock = &per_cpu(memcg_stock, cpu);
240862306a36Sopenharmony_ci	drain_stock(stock);
240962306a36Sopenharmony_ci
241062306a36Sopenharmony_ci	return 0;
241162306a36Sopenharmony_ci}
241262306a36Sopenharmony_ci
/*
 * Walk from @memcg up towards the root (excluding the root itself) and,
 * for every level that is above its memory.high limit, reclaim up to
 * @nr_pages from it.  Returns the total number of pages reclaimed.
 */
static unsigned long reclaim_high(struct mem_cgroup *memcg,
				  unsigned int nr_pages,
				  gfp_t gfp_mask)
{
	unsigned long nr_reclaimed = 0;

	do {
		unsigned long pflags;

		/* skip levels that are still within their high limit */
		if (page_counter_read(&memcg->memory) <=
		    READ_ONCE(memcg->memory.high))
			continue;

		memcg_memory_event(memcg, MEMCG_HIGH);

		/* account the reclaim stall as memory pressure (PSI) */
		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							gfp_mask,
							MEMCG_RECLAIM_MAY_SWAP);
		psi_memstall_leave(&pflags);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return nr_reclaimed;
}
243862306a36Sopenharmony_ci
243962306a36Sopenharmony_cistatic void high_work_func(struct work_struct *work)
244062306a36Sopenharmony_ci{
244162306a36Sopenharmony_ci	struct mem_cgroup *memcg;
244262306a36Sopenharmony_ci
244362306a36Sopenharmony_ci	memcg = container_of(work, struct mem_cgroup, high_work);
244462306a36Sopenharmony_ci	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
244562306a36Sopenharmony_ci}
244662306a36Sopenharmony_ci
244762306a36Sopenharmony_ci/*
244862306a36Sopenharmony_ci * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
244962306a36Sopenharmony_ci * enough to still cause a significant slowdown in most cases, while still
245062306a36Sopenharmony_ci * allowing diagnostics and tracing to proceed without becoming stuck.
245162306a36Sopenharmony_ci */
245262306a36Sopenharmony_ci#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
245362306a36Sopenharmony_ci
/*
 * When calculating the delay, we use these either side of the exponentiation to
 * maintain precision and scale to a reasonable number of jiffies (see the table
 * below).
245862306a36Sopenharmony_ci *
245962306a36Sopenharmony_ci * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
246062306a36Sopenharmony_ci *   overage ratio to a delay.
246162306a36Sopenharmony_ci * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
246262306a36Sopenharmony_ci *   proposed penalty in order to reduce to a reasonable number of jiffies, and
246362306a36Sopenharmony_ci *   to produce a reasonable delay curve.
246462306a36Sopenharmony_ci *
246562306a36Sopenharmony_ci * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
246662306a36Sopenharmony_ci * reasonable delay curve compared to precision-adjusted overage, not
246762306a36Sopenharmony_ci * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially. For
246962306a36Sopenharmony_ci * example, with a high of 100 megabytes:
247062306a36Sopenharmony_ci *
247162306a36Sopenharmony_ci *  +-------+------------------------+
247262306a36Sopenharmony_ci *  | usage | time to allocate in ms |
247362306a36Sopenharmony_ci *  +-------+------------------------+
247462306a36Sopenharmony_ci *  | 100M  |                      0 |
247562306a36Sopenharmony_ci *  | 101M  |                      6 |
247662306a36Sopenharmony_ci *  | 102M  |                     25 |
247762306a36Sopenharmony_ci *  | 103M  |                     57 |
247862306a36Sopenharmony_ci *  | 104M  |                    102 |
247962306a36Sopenharmony_ci *  | 105M  |                    159 |
248062306a36Sopenharmony_ci *  | 106M  |                    230 |
248162306a36Sopenharmony_ci *  | 107M  |                    313 |
248262306a36Sopenharmony_ci *  | 108M  |                    409 |
248362306a36Sopenharmony_ci *  | 109M  |                    518 |
248462306a36Sopenharmony_ci *  | 110M  |                    639 |
248562306a36Sopenharmony_ci *  | 111M  |                    774 |
248662306a36Sopenharmony_ci *  | 112M  |                    921 |
248762306a36Sopenharmony_ci *  | 113M  |                   1081 |
248862306a36Sopenharmony_ci *  | 114M  |                   1254 |
248962306a36Sopenharmony_ci *  | 115M  |                   1439 |
249062306a36Sopenharmony_ci *  | 116M  |                   1638 |
249162306a36Sopenharmony_ci *  | 117M  |                   1849 |
249262306a36Sopenharmony_ci *  | 118M  |                   2000 |
249362306a36Sopenharmony_ci *  | 119M  |                   2000 |
249462306a36Sopenharmony_ci *  | 120M  |                   2000 |
249562306a36Sopenharmony_ci *  +-------+------------------------+
249662306a36Sopenharmony_ci */
249762306a36Sopenharmony_ci #define MEMCG_DELAY_PRECISION_SHIFT 20
249862306a36Sopenharmony_ci #define MEMCG_DELAY_SCALING_SHIFT 14
249962306a36Sopenharmony_ci
250062306a36Sopenharmony_cistatic u64 calculate_overage(unsigned long usage, unsigned long high)
250162306a36Sopenharmony_ci{
250262306a36Sopenharmony_ci	u64 overage;
250362306a36Sopenharmony_ci
250462306a36Sopenharmony_ci	if (usage <= high)
250562306a36Sopenharmony_ci		return 0;
250662306a36Sopenharmony_ci
250762306a36Sopenharmony_ci	/*
250862306a36Sopenharmony_ci	 * Prevent division by 0 in overage calculation by acting as if
250962306a36Sopenharmony_ci	 * it was a threshold of 1 page
251062306a36Sopenharmony_ci	 */
251162306a36Sopenharmony_ci	high = max(high, 1UL);
251262306a36Sopenharmony_ci
251362306a36Sopenharmony_ci	overage = usage - high;
251462306a36Sopenharmony_ci	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
251562306a36Sopenharmony_ci	return div64_u64(overage, high);
251662306a36Sopenharmony_ci}
251762306a36Sopenharmony_ci
251862306a36Sopenharmony_cistatic u64 mem_find_max_overage(struct mem_cgroup *memcg)
251962306a36Sopenharmony_ci{
252062306a36Sopenharmony_ci	u64 overage, max_overage = 0;
252162306a36Sopenharmony_ci
252262306a36Sopenharmony_ci	do {
252362306a36Sopenharmony_ci		overage = calculate_overage(page_counter_read(&memcg->memory),
252462306a36Sopenharmony_ci					    READ_ONCE(memcg->memory.high));
252562306a36Sopenharmony_ci		max_overage = max(overage, max_overage);
252662306a36Sopenharmony_ci	} while ((memcg = parent_mem_cgroup(memcg)) &&
252762306a36Sopenharmony_ci		 !mem_cgroup_is_root(memcg));
252862306a36Sopenharmony_ci
252962306a36Sopenharmony_ci	return max_overage;
253062306a36Sopenharmony_ci}
253162306a36Sopenharmony_ci
253262306a36Sopenharmony_cistatic u64 swap_find_max_overage(struct mem_cgroup *memcg)
253362306a36Sopenharmony_ci{
253462306a36Sopenharmony_ci	u64 overage, max_overage = 0;
253562306a36Sopenharmony_ci
253662306a36Sopenharmony_ci	do {
253762306a36Sopenharmony_ci		overage = calculate_overage(page_counter_read(&memcg->swap),
253862306a36Sopenharmony_ci					    READ_ONCE(memcg->swap.high));
253962306a36Sopenharmony_ci		if (overage)
254062306a36Sopenharmony_ci			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
254162306a36Sopenharmony_ci		max_overage = max(overage, max_overage);
254262306a36Sopenharmony_ci	} while ((memcg = parent_mem_cgroup(memcg)) &&
254362306a36Sopenharmony_ci		 !mem_cgroup_is_root(memcg));
254462306a36Sopenharmony_ci
254562306a36Sopenharmony_ci	return max_overage;
254662306a36Sopenharmony_ci}
254762306a36Sopenharmony_ci
/*
 * Get the number of jiffies that we should penalise a mischievous cgroup which
 * is exceeding its memory.high by checking both it and its ancestors.
 *
 * @memcg:       the throttled cgroup (unused here; the hierarchy's overage
 *               has already been aggregated into @max_overage)
 * @nr_pages:    size of the current charge, used to scale the penalty
 * @max_overage: worst precision-scaled overage ratio in the hierarchy, as
 *               produced by mem_find_max_overage()/swap_find_max_overage()
 */
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
					  unsigned int nr_pages,
					  u64 max_overage)
{
	unsigned long penalty_jiffies;

	if (!max_overage)
		return 0;

	/*
	 * We use overage compared to memory.high to calculate the number of
	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
	 * fairly lenient on small overages, and increasingly harsh when the
	 * memcg in question makes it clear that it has no intention of stopping
	 * its crazy behaviour, so we exponentially increase the delay based on
	 * overage amount.
	 */
	penalty_jiffies = max_overage * max_overage * HZ;
	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;

	/*
	 * Factor in the task's own contribution to the overage, such that four
	 * N-sized allocations are throttled approximately the same as one
	 * 4N-sized allocation.
	 *
	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
	 * larger the current charge batch is than that.
	 */
	return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
}
258362306a36Sopenharmony_ci
/*
 * Scheduled by try_charge() to be executed from the userland return path
 * and reclaims memory over the high limit.
 *
 * Consumes the reclaim target accumulated in
 * current->memcg_nr_pages_over_high, reclaims from current's memcg
 * hierarchy, and throttles the task with a sleep whose length scales
 * with how far memory.high/swap.high have been breached.
 */
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
	unsigned long penalty_jiffies;
	unsigned long pflags;
	unsigned long nr_reclaimed;
	unsigned int nr_pages = current->memcg_nr_pages_over_high;
	int nr_retries = MAX_RECLAIM_RETRIES;
	struct mem_cgroup *memcg;
	bool in_retry = false;

	if (likely(!nr_pages))
		return;

	/* css reference is dropped at "out:" below. */
	memcg = get_mem_cgroup_from_mm(current->mm);
	current->memcg_nr_pages_over_high = 0;

retry_reclaim:
	/*
	 * The allocating task should reclaim at least the batch size, but for
	 * subsequent retries we only want to do what's necessary to prevent oom
	 * or breaching resource isolation.
	 *
	 * This is distinct from memory.max or page allocator behaviour because
	 * memory.high is currently batched, whereas memory.max and the page
	 * allocator run every time an allocation is made.
	 */
	nr_reclaimed = reclaim_high(memcg,
				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
				    gfp_mask);

	/*
	 * memory.high is breached and reclaim is unable to keep up. Throttle
	 * allocators proactively to slow down excessive growth.
	 */
	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
					       mem_find_max_overage(memcg));

	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
						swap_find_max_overage(memcg));

	/*
	 * Clamp the max delay per usermode return so as to still keep the
	 * application moving forwards and also permit diagnostics, albeit
	 * extremely slowly.
	 */
	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);

	/*
	 * Don't sleep if the amount of jiffies this memcg owes us is so low
	 * that it's not even worth doing, in an attempt to be nice to those who
	 * go only a small amount over their memory.high value and maybe haven't
	 * been aggressively reclaimed enough yet.
	 */
	if (penalty_jiffies <= HZ / 100)
		goto out;

	/*
	 * If reclaim is making forward progress but we're still over
	 * memory.high, we want to encourage that rather than doing allocator
	 * throttling.
	 *
	 * Note the short-circuit: nr_retries is only decremented when reclaim
	 * made no progress, so the loop continues for as long as reclaim
	 * frees anything at all.
	 */
	if (nr_reclaimed || nr_retries--) {
		in_retry = true;
		goto retry_reclaim;
	}

	/*
	 * If we exit early, we're guaranteed to die (since
	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
	 * need to account for any ill-begotten jiffies to pay them off later.
	 */
	psi_memstall_enter(&pflags);
	schedule_timeout_killable(penalty_jiffies);
	psi_memstall_leave(&pflags);

out:
	css_put(&memcg->css);
}
266662306a36Sopenharmony_ci
/*
 * Charge @nr_pages to @memcg, reclaiming and retrying as needed.
 *
 * Returns 0 on success -- including when the charge is forced over the
 * limit for PF_MEMALLOC, __GFP_NOFAIL or __GFP_HIGH callers -- and
 * -ENOMEM when the charge cannot be satisfied.
 */
static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
			unsigned int nr_pages)
{
	/* Charge a full batch up front so the surplus can refill the stock. */
	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
	int nr_retries = MAX_RECLAIM_RETRIES;
	struct mem_cgroup *mem_over_limit;
	struct page_counter *counter;
	unsigned long nr_reclaimed;
	bool passed_oom = false;
	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
	bool drained = false;
	bool raised_max_event = false;
	unsigned long pflags;

retry:
	/* Fast path: satisfy the charge from the per-cpu cached stock. */
	if (consume_stock(memcg, nr_pages))
		return 0;

	/*
	 * Try memsw (memory+swap) first, then memory. If the memory charge
	 * fails, back out the memsw charge; whichever counter failed
	 * identifies the cgroup to reclaim from (mem_over_limit).
	 */
	if (!do_memsw_account() ||
	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		if (page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, batch);
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		/* memsw is what's limiting, so swapping cannot help. */
		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
	}

	/* Retry once with the exact size before resorting to reclaim. */
	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory. This might exceed the limits temporarily,
	 * but we prefer facilitating memory reclaim and getting back
	 * under the limit over triggering OOM kills in these cases.
	 */
	if (unlikely(current->flags & PF_MEMALLOC))
		goto force;

	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;

	if (!gfpflags_allow_blocking(gfp_mask))
		goto nomem;

	memcg_memory_event(mem_over_limit, MEMCG_MAX);
	raised_max_event = true;

	/* Direct reclaim; stall time is accounted as memory pressure (PSI). */
	psi_memstall_enter(&pflags);
	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, reclaim_options);
	psi_memstall_leave(&pflags);

	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;

	/* Flush other CPUs' cached stock back to the counters, once. */
	if (!drained) {
		drain_all_stock(mem_over_limit);
		drained = true;
		goto retry;
	}

	if (gfp_mask & __GFP_NORETRY)
		goto nomem;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;
	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	if (nr_retries--)
		goto retry;

	if (gfp_mask & __GFP_RETRY_MAYFAIL)
		goto nomem;

	/* Avoid endless loop for tasks bypassed by the oom killer */
	if (passed_oom && task_is_dying())
		goto nomem;

	/*
	 * keep retrying as long as the memcg oom killer is able to make
	 * a forward progress or bypass the charge if the oom killer
	 * couldn't make any progress.
	 */
	if (mem_cgroup_oom(mem_over_limit, gfp_mask,
			   get_order(nr_pages * PAGE_SIZE))) {
		passed_oom = true;
		nr_retries = MAX_RECLAIM_RETRIES;
		goto retry;
	}
nomem:
	/*
	 * Memcg doesn't have a dedicated reserve for atomic
	 * allocations. But like the global atomic pool, we need to
	 * put the burden of reclaim on regular allocation requests
	 * and let these go through as privileged allocations.
	 */
	if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
		return -ENOMEM;
force:
	/*
	 * If the allocation has to be enforced, don't forget to raise
	 * a MEMCG_MAX event.
	 */
	if (!raised_max_event)
		memcg_memory_event(mem_over_limit, MEMCG_MAX);

	/*
	 * The allocation either can't fail or will lead to more memory
	 * being freed very soon.  Allow memory usage go over the limit
	 * temporarily by force charging it.
	 */
	page_counter_charge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_charge(&memcg->memsw, nr_pages);

	return 0;

done_restock:
	/* Stash the overcharged surplus in the per-cpu stock. */
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);

	/*
	 * If the hierarchy is above the normal consumption range, schedule
	 * reclaim on returning to userland.  We can perform reclaim here
	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
	 * not recorded as it most likely matches current's and won't
	 * change in the meantime.  As high limit is checked again before
	 * reclaim, the cost of mismatch is negligible.
	 */
	do {
		bool mem_high, swap_high;

		mem_high = page_counter_read(&memcg->memory) >
			READ_ONCE(memcg->memory.high);
		swap_high = page_counter_read(&memcg->swap) >
			READ_ONCE(memcg->swap.high);

		/* Don't bother a random interrupted task */
		if (!in_task()) {
			if (mem_high) {
				/* Punt reclaim to the high_work_func() worker. */
				schedule_work(&memcg->high_work);
				break;
			}
			continue;
		}

		if (mem_high || swap_high) {
			/*
			 * The allocating tasks in this cgroup will need to do
			 * reclaim or be throttled to prevent further growth
			 * of the memory or swap footprints.
			 *
			 * Target some best-effort fairness between the tasks,
			 * and distribute reclaim work and delay penalties
			 * based on how much each task is actually allocating.
			 */
			current->memcg_nr_pages_over_high += batch;
			set_notify_resume(current);
			break;
		}
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * Reclaim over memory.high is normally deferred to the userland
	 * return path (set_notify_resume above), but if the accumulated
	 * debt is large and the context allows blocking, do it right away
	 * to bound the overrun.
	 */
	if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
	    !(current->flags & PF_MEMALLOC) &&
	    gfpflags_allow_blocking(gfp_mask)) {
		mem_cgroup_handle_over_high(gfp_mask);
	}
	return 0;
}
285662306a36Sopenharmony_ci
285762306a36Sopenharmony_cistatic inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
285862306a36Sopenharmony_ci			     unsigned int nr_pages)
285962306a36Sopenharmony_ci{
286062306a36Sopenharmony_ci	if (mem_cgroup_is_root(memcg))
286162306a36Sopenharmony_ci		return 0;
286262306a36Sopenharmony_ci
286362306a36Sopenharmony_ci	return try_charge_memcg(memcg, gfp_mask, nr_pages);
286462306a36Sopenharmony_ci}
286562306a36Sopenharmony_ci
286662306a36Sopenharmony_cistatic inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
286762306a36Sopenharmony_ci{
286862306a36Sopenharmony_ci	if (mem_cgroup_is_root(memcg))
286962306a36Sopenharmony_ci		return;
287062306a36Sopenharmony_ci
287162306a36Sopenharmony_ci	page_counter_uncharge(&memcg->memory, nr_pages);
287262306a36Sopenharmony_ci	if (do_memsw_account())
287362306a36Sopenharmony_ci		page_counter_uncharge(&memcg->memsw, nr_pages);
287462306a36Sopenharmony_ci}
287562306a36Sopenharmony_ci
/*
 * Associate an uncharged @folio with @memcg by setting folio->memcg_data.
 * The folio must not already belong to a memcg (checked by the BUG_ON).
 */
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
	/*
	 * Any of the following ensures page's memcg stability:
	 *
	 * - the page lock
	 * - LRU isolation
	 * - folio_memcg_lock()
	 * - exclusive reference
	 * - mem_cgroup_trylock_pages()
	 */
	folio->memcg_data = (unsigned long)memcg;
}
289062306a36Sopenharmony_ci
289162306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
289262306a36Sopenharmony_ci/*
289362306a36Sopenharmony_ci * The allocated objcg pointers array is not accounted directly.
289462306a36Sopenharmony_ci * Moreover, it should not come from DMA buffer and is not readily
289562306a36Sopenharmony_ci * reclaimable. So those GFP bits should be masked off.
289662306a36Sopenharmony_ci */
289762306a36Sopenharmony_ci#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
289862306a36Sopenharmony_ci				 __GFP_ACCOUNT | __GFP_NOFAIL)
289962306a36Sopenharmony_ci
290062306a36Sopenharmony_ci/*
290162306a36Sopenharmony_ci * mod_objcg_mlstate() may be called with irq enabled, so
290262306a36Sopenharmony_ci * mod_memcg_lruvec_state() should be used.
290362306a36Sopenharmony_ci */
290462306a36Sopenharmony_cistatic inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
290562306a36Sopenharmony_ci				     struct pglist_data *pgdat,
290662306a36Sopenharmony_ci				     enum node_stat_item idx, int nr)
290762306a36Sopenharmony_ci{
290862306a36Sopenharmony_ci	struct mem_cgroup *memcg;
290962306a36Sopenharmony_ci	struct lruvec *lruvec;
291062306a36Sopenharmony_ci
291162306a36Sopenharmony_ci	rcu_read_lock();
291262306a36Sopenharmony_ci	memcg = obj_cgroup_memcg(objcg);
291362306a36Sopenharmony_ci	lruvec = mem_cgroup_lruvec(memcg, pgdat);
291462306a36Sopenharmony_ci	mod_memcg_lruvec_state(lruvec, idx, nr);
291562306a36Sopenharmony_ci	rcu_read_unlock();
291662306a36Sopenharmony_ci}
291762306a36Sopenharmony_ci
/*
 * Allocate and install the obj_cgroup pointer vector for @slab: one slot
 * per object, tracking which memory cgroup each slab object is charged to.
 *
 * @s:        the kmem cache @slab belongs to
 * @gfp:      allocation flags (OBJCGS_CLEAR_MASK bits are masked off)
 * @new_slab: true if no other CPU can observe the slab yet
 *
 * Returns 0 on success -- including losing the install race to a
 * concurrent allocator, whose vector is then reused -- or -ENOMEM.
 */
int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
				 gfp_t gfp, bool new_slab)
{
	unsigned int objects = objs_per_slab(s, slab);
	unsigned long memcg_data;
	void *vec;

	gfp &= ~OBJCGS_CLEAR_MASK;
	/* Allocate on the slab's node; zeroed so every slot starts unowned. */
	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
			   slab_nid(slab));
	if (!vec)
		return -ENOMEM;

	/* Store the pointer tagged as an objcg vector. */
	memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
	if (new_slab) {
		/*
		 * If the slab is brand new and nobody can yet access its
		 * memcg_data, no synchronization is required and memcg_data can
		 * be simply assigned.
		 */
		slab->memcg_data = memcg_data;
	} else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
		/*
		 * If the slab is already in use, somebody can allocate and
		 * assign obj_cgroups in parallel. In this case the existing
		 * objcg vector should be reused.
		 */
		kfree(vec);
		return 0;
	}

	/* The vector is only referenced via the tagged memcg_data word. */
	kmemleak_not_leak(vec);
	return 0;
}
295262306a36Sopenharmony_ci
295362306a36Sopenharmony_cistatic __always_inline
295462306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
295562306a36Sopenharmony_ci{
295662306a36Sopenharmony_ci	/*
295762306a36Sopenharmony_ci	 * Slab objects are accounted individually, not per-page.
295862306a36Sopenharmony_ci	 * Memcg membership data for each individual object is saved in
295962306a36Sopenharmony_ci	 * slab->memcg_data.
296062306a36Sopenharmony_ci	 */
296162306a36Sopenharmony_ci	if (folio_test_slab(folio)) {
296262306a36Sopenharmony_ci		struct obj_cgroup **objcgs;
296362306a36Sopenharmony_ci		struct slab *slab;
296462306a36Sopenharmony_ci		unsigned int off;
296562306a36Sopenharmony_ci
296662306a36Sopenharmony_ci		slab = folio_slab(folio);
296762306a36Sopenharmony_ci		objcgs = slab_objcgs(slab);
296862306a36Sopenharmony_ci		if (!objcgs)
296962306a36Sopenharmony_ci			return NULL;
297062306a36Sopenharmony_ci
297162306a36Sopenharmony_ci		off = obj_to_index(slab->slab_cache, slab, p);
297262306a36Sopenharmony_ci		if (objcgs[off])
297362306a36Sopenharmony_ci			return obj_cgroup_memcg(objcgs[off]);
297462306a36Sopenharmony_ci
297562306a36Sopenharmony_ci		return NULL;
297662306a36Sopenharmony_ci	}
297762306a36Sopenharmony_ci
297862306a36Sopenharmony_ci	/*
297962306a36Sopenharmony_ci	 * folio_memcg_check() is used here, because in theory we can encounter
298062306a36Sopenharmony_ci	 * a folio where the slab flag has been cleared already, but
298162306a36Sopenharmony_ci	 * slab->memcg_data has not been freed yet
298262306a36Sopenharmony_ci	 * folio_memcg_check() will guarantee that a proper memory
298362306a36Sopenharmony_ci	 * cgroup pointer or NULL will be returned.
298462306a36Sopenharmony_ci	 */
298562306a36Sopenharmony_ci	return folio_memcg_check(folio);
298662306a36Sopenharmony_ci}
298762306a36Sopenharmony_ci
298862306a36Sopenharmony_ci/*
298962306a36Sopenharmony_ci * Returns a pointer to the memory cgroup to which the kernel object is charged.
299062306a36Sopenharmony_ci *
299162306a36Sopenharmony_ci * A passed kernel object can be a slab object, vmalloc object or a generic
299262306a36Sopenharmony_ci * kernel page, so different mechanisms for getting the memory cgroup pointer
299362306a36Sopenharmony_ci * should be used.
299462306a36Sopenharmony_ci *
299562306a36Sopenharmony_ci * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
299662306a36Sopenharmony_ci * can not know for sure how the kernel object is implemented.
299762306a36Sopenharmony_ci * mem_cgroup_from_obj() can be safely used in such cases.
299862306a36Sopenharmony_ci *
299962306a36Sopenharmony_ci * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
300062306a36Sopenharmony_ci * cgroup_mutex, etc.
300162306a36Sopenharmony_ci */
300262306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_obj(void *p)
300362306a36Sopenharmony_ci{
300462306a36Sopenharmony_ci	struct folio *folio;
300562306a36Sopenharmony_ci
300662306a36Sopenharmony_ci	if (mem_cgroup_disabled())
300762306a36Sopenharmony_ci		return NULL;
300862306a36Sopenharmony_ci
300962306a36Sopenharmony_ci	if (unlikely(is_vmalloc_addr(p)))
301062306a36Sopenharmony_ci		folio = page_folio(vmalloc_to_page(p));
301162306a36Sopenharmony_ci	else
301262306a36Sopenharmony_ci		folio = virt_to_folio(p);
301362306a36Sopenharmony_ci
301462306a36Sopenharmony_ci	return mem_cgroup_from_obj_folio(folio, p);
301562306a36Sopenharmony_ci}
301662306a36Sopenharmony_ci
301762306a36Sopenharmony_ci/*
301862306a36Sopenharmony_ci * Returns a pointer to the memory cgroup to which the kernel object is charged.
301962306a36Sopenharmony_ci * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
302062306a36Sopenharmony_ci * allocated using vmalloc().
302162306a36Sopenharmony_ci *
302262306a36Sopenharmony_ci * A passed kernel object must be a slab object or a generic kernel page.
302362306a36Sopenharmony_ci *
302462306a36Sopenharmony_ci * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
302562306a36Sopenharmony_ci * cgroup_mutex, etc.
302662306a36Sopenharmony_ci */
302762306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
302862306a36Sopenharmony_ci{
302962306a36Sopenharmony_ci	if (mem_cgroup_disabled())
303062306a36Sopenharmony_ci		return NULL;
303162306a36Sopenharmony_ci
303262306a36Sopenharmony_ci	return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
303362306a36Sopenharmony_ci}
303462306a36Sopenharmony_ci
303562306a36Sopenharmony_cistatic struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
303662306a36Sopenharmony_ci{
303762306a36Sopenharmony_ci	struct obj_cgroup *objcg = NULL;
303862306a36Sopenharmony_ci
303962306a36Sopenharmony_ci	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
304062306a36Sopenharmony_ci		objcg = rcu_dereference(memcg->objcg);
304162306a36Sopenharmony_ci		if (objcg && obj_cgroup_tryget(objcg))
304262306a36Sopenharmony_ci			break;
304362306a36Sopenharmony_ci		objcg = NULL;
304462306a36Sopenharmony_ci	}
304562306a36Sopenharmony_ci	return objcg;
304662306a36Sopenharmony_ci}
304762306a36Sopenharmony_ci
304862306a36Sopenharmony_ci__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
304962306a36Sopenharmony_ci{
305062306a36Sopenharmony_ci	struct obj_cgroup *objcg = NULL;
305162306a36Sopenharmony_ci	struct mem_cgroup *memcg;
305262306a36Sopenharmony_ci
305362306a36Sopenharmony_ci	if (memcg_kmem_bypass())
305462306a36Sopenharmony_ci		return NULL;
305562306a36Sopenharmony_ci
305662306a36Sopenharmony_ci	rcu_read_lock();
305762306a36Sopenharmony_ci	if (unlikely(active_memcg()))
305862306a36Sopenharmony_ci		memcg = active_memcg();
305962306a36Sopenharmony_ci	else
306062306a36Sopenharmony_ci		memcg = mem_cgroup_from_task(current);
306162306a36Sopenharmony_ci	objcg = __get_obj_cgroup_from_memcg(memcg);
306262306a36Sopenharmony_ci	rcu_read_unlock();
306362306a36Sopenharmony_ci	return objcg;
306462306a36Sopenharmony_ci}
306562306a36Sopenharmony_ci
306662306a36Sopenharmony_cistruct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
306762306a36Sopenharmony_ci{
306862306a36Sopenharmony_ci	struct obj_cgroup *objcg;
306962306a36Sopenharmony_ci
307062306a36Sopenharmony_ci	if (!memcg_kmem_online())
307162306a36Sopenharmony_ci		return NULL;
307262306a36Sopenharmony_ci
307362306a36Sopenharmony_ci	if (folio_memcg_kmem(folio)) {
307462306a36Sopenharmony_ci		objcg = __folio_objcg(folio);
307562306a36Sopenharmony_ci		obj_cgroup_get(objcg);
307662306a36Sopenharmony_ci	} else {
307762306a36Sopenharmony_ci		struct mem_cgroup *memcg;
307862306a36Sopenharmony_ci
307962306a36Sopenharmony_ci		rcu_read_lock();
308062306a36Sopenharmony_ci		memcg = __folio_memcg(folio);
308162306a36Sopenharmony_ci		if (memcg)
308262306a36Sopenharmony_ci			objcg = __get_obj_cgroup_from_memcg(memcg);
308362306a36Sopenharmony_ci		else
308462306a36Sopenharmony_ci			objcg = NULL;
308562306a36Sopenharmony_ci		rcu_read_unlock();
308662306a36Sopenharmony_ci	}
308762306a36Sopenharmony_ci	return objcg;
308862306a36Sopenharmony_ci}
308962306a36Sopenharmony_ci
309062306a36Sopenharmony_cistatic void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
309162306a36Sopenharmony_ci{
309262306a36Sopenharmony_ci	mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
309362306a36Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
309462306a36Sopenharmony_ci		if (nr_pages > 0)
309562306a36Sopenharmony_ci			page_counter_charge(&memcg->kmem, nr_pages);
309662306a36Sopenharmony_ci		else
309762306a36Sopenharmony_ci			page_counter_uncharge(&memcg->kmem, -nr_pages);
309862306a36Sopenharmony_ci	}
309962306a36Sopenharmony_ci}
310062306a36Sopenharmony_ci
310162306a36Sopenharmony_ci
/*
 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
 * @objcg: object cgroup to uncharge
 * @nr_pages: number of pages to uncharge
 *
 * Drops the kmem accounting for @nr_pages and returns the pages to the
 * per-cpu stock of the objcg's memcg.
 */
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages)
{
	struct mem_cgroup *memcg;

	/* Takes a css reference on the memcg, dropped below */
	memcg = get_mem_cgroup_from_objcg(objcg);

	memcg_account_kmem(memcg, -nr_pages);
	/* Return pages to the per-cpu stock instead of the page counter */
	refill_stock(memcg, nr_pages);

	css_put(&memcg->css);
}
311962306a36Sopenharmony_ci
312062306a36Sopenharmony_ci/*
312162306a36Sopenharmony_ci * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
312262306a36Sopenharmony_ci * @objcg: object cgroup to charge
312362306a36Sopenharmony_ci * @gfp: reclaim mode
312462306a36Sopenharmony_ci * @nr_pages: number of pages to charge
312562306a36Sopenharmony_ci *
312662306a36Sopenharmony_ci * Returns 0 on success, an error code on failure.
312762306a36Sopenharmony_ci */
312862306a36Sopenharmony_cistatic int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
312962306a36Sopenharmony_ci				   unsigned int nr_pages)
313062306a36Sopenharmony_ci{
313162306a36Sopenharmony_ci	struct mem_cgroup *memcg;
313262306a36Sopenharmony_ci	int ret;
313362306a36Sopenharmony_ci
313462306a36Sopenharmony_ci	memcg = get_mem_cgroup_from_objcg(objcg);
313562306a36Sopenharmony_ci
313662306a36Sopenharmony_ci	ret = try_charge_memcg(memcg, gfp, nr_pages);
313762306a36Sopenharmony_ci	if (ret)
313862306a36Sopenharmony_ci		goto out;
313962306a36Sopenharmony_ci
314062306a36Sopenharmony_ci	memcg_account_kmem(memcg, nr_pages);
314162306a36Sopenharmony_ciout:
314262306a36Sopenharmony_ci	css_put(&memcg->css);
314362306a36Sopenharmony_ci
314462306a36Sopenharmony_ci	return ret;
314562306a36Sopenharmony_ci}
314662306a36Sopenharmony_ci
314762306a36Sopenharmony_ci/**
314862306a36Sopenharmony_ci * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
314962306a36Sopenharmony_ci * @page: page to charge
315062306a36Sopenharmony_ci * @gfp: reclaim mode
315162306a36Sopenharmony_ci * @order: allocation order
315262306a36Sopenharmony_ci *
315362306a36Sopenharmony_ci * Returns 0 on success, an error code on failure.
315462306a36Sopenharmony_ci */
315562306a36Sopenharmony_ciint __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
315662306a36Sopenharmony_ci{
315762306a36Sopenharmony_ci	struct obj_cgroup *objcg;
315862306a36Sopenharmony_ci	int ret = 0;
315962306a36Sopenharmony_ci
316062306a36Sopenharmony_ci	objcg = get_obj_cgroup_from_current();
316162306a36Sopenharmony_ci	if (objcg) {
316262306a36Sopenharmony_ci		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
316362306a36Sopenharmony_ci		if (!ret) {
316462306a36Sopenharmony_ci			page->memcg_data = (unsigned long)objcg |
316562306a36Sopenharmony_ci				MEMCG_DATA_KMEM;
316662306a36Sopenharmony_ci			return 0;
316762306a36Sopenharmony_ci		}
316862306a36Sopenharmony_ci		obj_cgroup_put(objcg);
316962306a36Sopenharmony_ci	}
317062306a36Sopenharmony_ci	return ret;
317162306a36Sopenharmony_ci}
317262306a36Sopenharmony_ci
/**
 * __memcg_kmem_uncharge_page: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
	struct folio *folio = page_folio(page);
	struct obj_cgroup *objcg;
	unsigned int nr_pages = 1 << order;

	/* Nothing to do unless the folio was charged as kmem */
	if (!folio_memcg_kmem(folio))
		return;

	objcg = __folio_objcg(folio);
	obj_cgroup_uncharge_pages(objcg, nr_pages);
	/* Clear the memcg binding before dropping the objcg reference */
	folio->memcg_data = 0;
	obj_cgroup_put(objcg);
}
319262306a36Sopenharmony_ci
/*
 * mod_objcg_state - update a per-node slab vmstat counter on behalf of @objcg
 * @objcg: object cgroup the change is accounted to
 * @pgdat: node whose statistics are affected
 * @idx: stat item (NR_SLAB_RECLAIMABLE_B or NR_SLAB_UNRECLAIMABLE_B)
 * @nr: signed delta in bytes
 *
 * Small byte-sized deltas are batched in the per-cpu stock and only pushed
 * into the vmstat arrays once more than a page worth of data accumulated,
 * or when the cached objcg/pgdat changes.
 */
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
		     enum node_stat_item idx, int nr)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;
	int *bytes;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);
	stock = this_cpu_ptr(&memcg_stock);

	/*
	 * Save vmstat data in stock and skip vmstat array update unless
	 * accumulating over a page of vmstat data or when pgdat or idx
	 * changes.
	 */
	if (READ_ONCE(stock->cached_objcg) != objcg) {
		/* Different objcg: drain the old stock and cache the new one */
		old = drain_obj_stock(stock);
		obj_cgroup_get(objcg);
		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
		WRITE_ONCE(stock->cached_objcg, objcg);
		stock->cached_pgdat = pgdat;
	} else if (stock->cached_pgdat != pgdat) {
		/* Flush the existing cached vmstat data */
		struct pglist_data *oldpg = stock->cached_pgdat;

		if (stock->nr_slab_reclaimable_b) {
			mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
					  stock->nr_slab_reclaimable_b);
			stock->nr_slab_reclaimable_b = 0;
		}
		if (stock->nr_slab_unreclaimable_b) {
			mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
					  stock->nr_slab_unreclaimable_b);
			stock->nr_slab_unreclaimable_b = 0;
		}
		stock->cached_pgdat = pgdat;
	}

	bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
					       : &stock->nr_slab_unreclaimable_b;
	/*
	 * Even for large object >= PAGE_SIZE, the vmstat data will still be
	 * cached locally at least once before pushing it out.
	 */
	if (!*bytes) {
		*bytes = nr;
		nr = 0;
	} else {
		*bytes += nr;
		if (abs(*bytes) > PAGE_SIZE) {
			/* Accumulated over a page: flush the whole batch */
			nr = *bytes;
			*bytes = 0;
		} else {
			nr = 0;
		}
	}
	if (nr)
		mod_objcg_mlstate(objcg, pgdat, idx, nr);

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	/* Drop the old objcg reference outside of stock_lock */
	if (old)
		obj_cgroup_put(old);
}
325862306a36Sopenharmony_ci
/*
 * Try to satisfy a charge of @nr_bytes from the local per-cpu object stock.
 * Succeeds only if the stock is cached for @objcg and holds at least
 * @nr_bytes of pre-charged data.  Returns true if the charge was consumed.
 */
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool ret = false;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
		stock->nr_bytes -= nr_bytes;
		ret = true;
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);

	return ret;
}
327762306a36Sopenharmony_ci
/*
 * Flush all cached object-charge bytes and slab vmstat data out of @stock
 * and clear its cached objcg.
 *
 * Must be called with memcg_stock_pcp::stock_lock held.  Returns the
 * previously cached objcg (or NULL); the caller must release that reference
 * with obj_cgroup_put() after dropping the lock.
 */
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
	struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);

	if (!old)
		return NULL;

	if (stock->nr_bytes) {
		/* Whole pages are uncharged for real; leftover bytes below */
		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);

		if (nr_pages) {
			struct mem_cgroup *memcg;

			memcg = get_mem_cgroup_from_objcg(old);

			memcg_account_kmem(memcg, -nr_pages);
			__refill_stock(memcg, nr_pages);

			css_put(&memcg->css);
		}

		/*
		 * The leftover is flushed to the centralized per-memcg value.
		 * On the next attempt to refill obj stock it will be moved
		 * to a per-cpu stock (probably, on an other CPU), see
		 * refill_obj_stock().
		 *
		 * How often it's flushed is a trade-off between the memory
		 * limit enforcement accuracy and potential CPU contention,
		 * so it might be changed in the future.
		 */
		atomic_add(nr_bytes, &old->nr_charged_bytes);
		stock->nr_bytes = 0;
	}

	/*
	 * Flush the vmstat data in current stock
	 */
	if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
		if (stock->nr_slab_reclaimable_b) {
			mod_objcg_mlstate(old, stock->cached_pgdat,
					  NR_SLAB_RECLAIMABLE_B,
					  stock->nr_slab_reclaimable_b);
			stock->nr_slab_reclaimable_b = 0;
		}
		if (stock->nr_slab_unreclaimable_b) {
			mod_objcg_mlstate(old, stock->cached_pgdat,
					  NR_SLAB_UNRECLAIMABLE_B,
					  stock->nr_slab_unreclaimable_b);
			stock->nr_slab_unreclaimable_b = 0;
		}
		stock->cached_pgdat = NULL;
	}

	WRITE_ONCE(stock->cached_objcg, NULL);
	/*
	 * The `old' objects needs to be released by the caller via
	 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
	 */
	return old;
}
334062306a36Sopenharmony_ci
334162306a36Sopenharmony_cistatic bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
334262306a36Sopenharmony_ci				     struct mem_cgroup *root_memcg)
334362306a36Sopenharmony_ci{
334462306a36Sopenharmony_ci	struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
334562306a36Sopenharmony_ci	struct mem_cgroup *memcg;
334662306a36Sopenharmony_ci
334762306a36Sopenharmony_ci	if (objcg) {
334862306a36Sopenharmony_ci		memcg = obj_cgroup_memcg(objcg);
334962306a36Sopenharmony_ci		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
335062306a36Sopenharmony_ci			return true;
335162306a36Sopenharmony_ci	}
335262306a36Sopenharmony_ci
335362306a36Sopenharmony_ci	return false;
335462306a36Sopenharmony_ci}
335562306a36Sopenharmony_ci
/*
 * Add @nr_bytes of pre-charged object memory to the local per-cpu stock
 * for @objcg.
 * @allow_uncharge: if true, whole pages accumulated beyond PAGE_SIZE are
 *	uncharged immediately; if false, the stock may temporarily exceed
 *	the page-size limit (used right after a page charge to avoid an
 *	immediate charge/uncharge pair, see obj_cgroup_charge()).
 */
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
			     bool allow_uncharge)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;
	unsigned int nr_pages = 0;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
		old = drain_obj_stock(stock);
		obj_cgroup_get(objcg);
		WRITE_ONCE(stock->cached_objcg, objcg);
		/* Pull any centrally accumulated leftover bytes back in */
		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
		allow_uncharge = true;	/* Allow uncharge when objcg changes */
	}
	stock->nr_bytes += nr_bytes;

	/* Uncharge accumulated whole pages, keeping the sub-page remainder */
	if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
		nr_pages = stock->nr_bytes >> PAGE_SHIFT;
		stock->nr_bytes &= (PAGE_SIZE - 1);
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	/* Drop the old objcg reference outside of stock_lock */
	if (old)
		obj_cgroup_put(old);

	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);
}
338962306a36Sopenharmony_ci
/*
 * obj_cgroup_charge - charge @size bytes of kernel memory to @objcg
 * @objcg: object cgroup to charge
 * @gfp: reclaim mode
 * @size: number of bytes to charge
 *
 * Tries the per-cpu object stock first; otherwise charges whole pages and
 * stocks the remainder.  Returns 0 on success, an error code on failure.
 */
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
	unsigned int nr_pages, nr_bytes;
	int ret;

	if (consume_obj_stock(objcg, size))
		return 0;

	/*
	 * In theory, objcg->nr_charged_bytes can have enough
	 * pre-charged bytes to satisfy the allocation. However,
	 * flushing objcg->nr_charged_bytes requires two atomic
	 * operations, and objcg->nr_charged_bytes can't be big.
	 * The shared objcg->nr_charged_bytes can also become a
	 * performance bottleneck if all tasks of the same memcg are
	 * trying to update it. So it's better to ignore it and try
	 * grab some new pages. The stock's nr_bytes will be flushed to
	 * objcg->nr_charged_bytes later on when objcg changes.
	 *
	 * The stock's nr_bytes may contain enough pre-charged bytes
	 * to allow one less page from being charged, but we can't rely
	 * on the pre-charged bytes not being changed outside of
	 * consume_obj_stock() or refill_obj_stock(). So ignore those
	 * pre-charged bytes as well when charging pages. To avoid a
	 * page uncharge right after a page charge, we set the
	 * allow_uncharge flag to false when calling refill_obj_stock()
	 * to temporarily allow the pre-charged bytes to exceed the page
	 * size limit. The maximum reachable value of the pre-charged
	 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
	 * race.
	 */
	nr_pages = size >> PAGE_SHIFT;
	nr_bytes = size & (PAGE_SIZE - 1);

	/* Round partial pages up to a full page charge */
	if (nr_bytes)
		nr_pages += 1;

	ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
	/* Stock the unused remainder of the rounded-up page */
	if (!ret && nr_bytes)
		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);

	return ret;
}
343362306a36Sopenharmony_ci
/*
 * obj_cgroup_uncharge - uncharge @size bytes from @objcg
 *
 * The bytes are returned to the per-cpu object stock; accumulated whole
 * pages may then be uncharged for real by refill_obj_stock().
 */
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
	refill_obj_stock(objcg, size, true);
}
343862306a36Sopenharmony_ci
343962306a36Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */
344062306a36Sopenharmony_ci
/*
 * Because page_memcg(head) is not set on tails, set it now.
 * @head: head page of the compound page being split
 * @nr: number of pages the compound page is split into
 *
 * Copies the head's memcg_data to all tail pages and takes the extra
 * objcg/css references the tails now hold.
 */
void split_page_memcg(struct page *head, unsigned int nr)
{
	struct folio *folio = page_folio(head);
	struct mem_cgroup *memcg = folio_memcg(folio);
	int i;

	/* Nothing to propagate if memcg is disabled or the page uncharged */
	if (mem_cgroup_disabled() || !memcg)
		return;

	for (i = 1; i < nr; i++)
		folio_page(folio, i)->memcg_data = folio->memcg_data;

	/* Each of the nr - 1 tails now holds its own reference */
	if (folio_memcg_kmem(folio))
		obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
	else
		css_get_many(&memcg->css, nr - 1);
}
346162306a36Sopenharmony_ci
346262306a36Sopenharmony_ci#ifdef CONFIG_SWAP
/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge() about
 * both res and memsw, and called css_get().
 */
static int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	unsigned short old_id, new_id;

	old_id = mem_cgroup_id(from);
	new_id = mem_cgroup_id(to);

	/*
	 * The cmpxchg succeeds only if the record still points at @from,
	 * making the ownership transfer race-free.
	 */
	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
		mod_memcg_state(from, MEMCG_SWAP, -1);
		mod_memcg_state(to, MEMCG_SWAP, 1);
		return 0;
	}
	return -EINVAL;
}
349262306a36Sopenharmony_ci#else
/* !CONFIG_SWAP stub: there is no swap accounting to move, always fails. */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	return -EINVAL;
}
349862306a36Sopenharmony_ci#endif
349962306a36Sopenharmony_ci
350062306a36Sopenharmony_cistatic DEFINE_MUTEX(memcg_max_mutex);
350162306a36Sopenharmony_ci
/*
 * Set a new hard limit (@max) on @memcg's memory counter, or on its
 * memory+swap counter when @memsw is true, reclaiming pages as needed
 * until the counter fits under the new limit.
 *
 * Returns 0 on success, -EINVAL if the new limit would violate the
 * memory.max <= memsw.max invariant, -EBUSY if reclaim cannot make enough
 * room, or -EINTR if a signal is pending.
 */
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
				 unsigned long max, bool memsw)
{
	bool enlarge = false;
	bool drained = false;
	int ret;
	bool limits_invariant;
	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

	do {
		/* Allow the user to interrupt a long-running resize */
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
					   max <= memcg->memsw.max;
		if (!limits_invariant) {
			mutex_unlock(&memcg_max_mutex);
			ret = -EINVAL;
			break;
		}
		if (max > counter->max)
			enlarge = true;
		ret = page_counter_set_max(counter, max);
		mutex_unlock(&memcg_max_mutex);

		if (!ret)
			break;

		/* First failure: drain per-cpu stocks before reclaiming */
		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
			ret = -EBUSY;
			break;
		}
	} while (true);

	/* Raising the limit may unblock tasks waiting under OOM */
	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}
355562306a36Sopenharmony_ci
355662306a36Sopenharmony_ciunsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
355762306a36Sopenharmony_ci					    gfp_t gfp_mask,
355862306a36Sopenharmony_ci					    unsigned long *total_scanned)
355962306a36Sopenharmony_ci{
356062306a36Sopenharmony_ci	unsigned long nr_reclaimed = 0;
356162306a36Sopenharmony_ci	struct mem_cgroup_per_node *mz, *next_mz = NULL;
356262306a36Sopenharmony_ci	unsigned long reclaimed;
356362306a36Sopenharmony_ci	int loop = 0;
356462306a36Sopenharmony_ci	struct mem_cgroup_tree_per_node *mctz;
356562306a36Sopenharmony_ci	unsigned long excess;
356662306a36Sopenharmony_ci
356762306a36Sopenharmony_ci	if (lru_gen_enabled())
356862306a36Sopenharmony_ci		return 0;
356962306a36Sopenharmony_ci
357062306a36Sopenharmony_ci	if (order > 0)
357162306a36Sopenharmony_ci		return 0;
357262306a36Sopenharmony_ci
357362306a36Sopenharmony_ci	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
357462306a36Sopenharmony_ci
357562306a36Sopenharmony_ci	/*
357662306a36Sopenharmony_ci	 * Do not even bother to check the largest node if the root
357762306a36Sopenharmony_ci	 * is empty. Do it lockless to prevent lock bouncing. Races
357862306a36Sopenharmony_ci	 * are acceptable as soft limit is best effort anyway.
357962306a36Sopenharmony_ci	 */
358062306a36Sopenharmony_ci	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
358162306a36Sopenharmony_ci		return 0;
358262306a36Sopenharmony_ci
358362306a36Sopenharmony_ci	/*
358462306a36Sopenharmony_ci	 * This loop can run a while, specially if mem_cgroup's continuously
358562306a36Sopenharmony_ci	 * keep exceeding their soft limit and putting the system under
358662306a36Sopenharmony_ci	 * pressure
358762306a36Sopenharmony_ci	 */
358862306a36Sopenharmony_ci	do {
358962306a36Sopenharmony_ci		if (next_mz)
359062306a36Sopenharmony_ci			mz = next_mz;
359162306a36Sopenharmony_ci		else
359262306a36Sopenharmony_ci			mz = mem_cgroup_largest_soft_limit_node(mctz);
359362306a36Sopenharmony_ci		if (!mz)
359462306a36Sopenharmony_ci			break;
359562306a36Sopenharmony_ci
359662306a36Sopenharmony_ci		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
359762306a36Sopenharmony_ci						    gfp_mask, total_scanned);
359862306a36Sopenharmony_ci		nr_reclaimed += reclaimed;
359962306a36Sopenharmony_ci		spin_lock_irq(&mctz->lock);
360062306a36Sopenharmony_ci
360162306a36Sopenharmony_ci		/*
360262306a36Sopenharmony_ci		 * If we failed to reclaim anything from this memory cgroup
360362306a36Sopenharmony_ci		 * it is time to move on to the next cgroup
360462306a36Sopenharmony_ci		 */
360562306a36Sopenharmony_ci		next_mz = NULL;
360662306a36Sopenharmony_ci		if (!reclaimed)
360762306a36Sopenharmony_ci			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
360862306a36Sopenharmony_ci
360962306a36Sopenharmony_ci		excess = soft_limit_excess(mz->memcg);
361062306a36Sopenharmony_ci		/*
361162306a36Sopenharmony_ci		 * One school of thought says that we should not add
361262306a36Sopenharmony_ci		 * back the node to the tree if reclaim returns 0.
361362306a36Sopenharmony_ci		 * But our reclaim could return 0, simply because due
361462306a36Sopenharmony_ci		 * to priority we are exposing a smaller subset of
361562306a36Sopenharmony_ci		 * memory to reclaim from. Consider this as a longer
361662306a36Sopenharmony_ci		 * term TODO.
361762306a36Sopenharmony_ci		 */
361862306a36Sopenharmony_ci		/* If excess == 0, no tree ops */
361962306a36Sopenharmony_ci		__mem_cgroup_insert_exceeded(mz, mctz, excess);
362062306a36Sopenharmony_ci		spin_unlock_irq(&mctz->lock);
362162306a36Sopenharmony_ci		css_put(&mz->memcg->css);
362262306a36Sopenharmony_ci		loop++;
362362306a36Sopenharmony_ci		/*
362462306a36Sopenharmony_ci		 * Could not reclaim anything and there are no more
362562306a36Sopenharmony_ci		 * mem cgroups to try or we seem to be looping without
362662306a36Sopenharmony_ci		 * reclaiming anything.
362762306a36Sopenharmony_ci		 */
362862306a36Sopenharmony_ci		if (!nr_reclaimed &&
362962306a36Sopenharmony_ci			(next_mz == NULL ||
363062306a36Sopenharmony_ci			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
363162306a36Sopenharmony_ci			break;
363262306a36Sopenharmony_ci	} while (!nr_reclaimed);
363362306a36Sopenharmony_ci	if (next_mz)
363462306a36Sopenharmony_ci		css_put(&next_mz->memcg->css);
363562306a36Sopenharmony_ci	return nr_reclaimed;
363662306a36Sopenharmony_ci}
363762306a36Sopenharmony_ci
363862306a36Sopenharmony_ci/*
363962306a36Sopenharmony_ci * Reclaims as many pages from the given memcg as possible.
364062306a36Sopenharmony_ci *
364162306a36Sopenharmony_ci * Caller is responsible for holding css reference for memcg.
364262306a36Sopenharmony_ci */
364362306a36Sopenharmony_cistatic int mem_cgroup_force_empty(struct mem_cgroup *memcg)
364462306a36Sopenharmony_ci{
364562306a36Sopenharmony_ci	int nr_retries = MAX_RECLAIM_RETRIES;
364662306a36Sopenharmony_ci
364762306a36Sopenharmony_ci	/* we call try-to-free pages for make this cgroup empty */
364862306a36Sopenharmony_ci	lru_add_drain_all();
364962306a36Sopenharmony_ci
365062306a36Sopenharmony_ci	drain_all_stock(memcg);
365162306a36Sopenharmony_ci
365262306a36Sopenharmony_ci	/* try to free all pages in this cgroup */
365362306a36Sopenharmony_ci	while (nr_retries && page_counter_read(&memcg->memory)) {
365462306a36Sopenharmony_ci		if (signal_pending(current))
365562306a36Sopenharmony_ci			return -EINTR;
365662306a36Sopenharmony_ci
365762306a36Sopenharmony_ci		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
365862306a36Sopenharmony_ci						  MEMCG_RECLAIM_MAY_SWAP))
365962306a36Sopenharmony_ci			nr_retries--;
366062306a36Sopenharmony_ci	}
366162306a36Sopenharmony_ci
366262306a36Sopenharmony_ci	return 0;
366362306a36Sopenharmony_ci}
366462306a36Sopenharmony_ci
366562306a36Sopenharmony_cistatic ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
366662306a36Sopenharmony_ci					    char *buf, size_t nbytes,
366762306a36Sopenharmony_ci					    loff_t off)
366862306a36Sopenharmony_ci{
366962306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
367062306a36Sopenharmony_ci
367162306a36Sopenharmony_ci	if (mem_cgroup_is_root(memcg))
367262306a36Sopenharmony_ci		return -EINVAL;
367362306a36Sopenharmony_ci	return mem_cgroup_force_empty(memcg) ?: nbytes;
367462306a36Sopenharmony_ci}
367562306a36Sopenharmony_ci
/*
 * cgroup1 memory.use_hierarchy read handler: hierarchical accounting
 * is now mandatory, so reads always report 1.
 */
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return 1;
}
368162306a36Sopenharmony_ci
368262306a36Sopenharmony_cistatic int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
368362306a36Sopenharmony_ci				      struct cftype *cft, u64 val)
368462306a36Sopenharmony_ci{
368562306a36Sopenharmony_ci	if (val == 1)
368662306a36Sopenharmony_ci		return 0;
368762306a36Sopenharmony_ci
368862306a36Sopenharmony_ci	pr_warn_once("Non-hierarchical mode is deprecated. "
368962306a36Sopenharmony_ci		     "Please report your usecase to linux-mm@kvack.org if you "
369062306a36Sopenharmony_ci		     "depend on this functionality.\n");
369162306a36Sopenharmony_ci
369262306a36Sopenharmony_ci	return -EINVAL;
369362306a36Sopenharmony_ci}
369462306a36Sopenharmony_ci
369562306a36Sopenharmony_cistatic unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
369662306a36Sopenharmony_ci{
369762306a36Sopenharmony_ci	unsigned long val;
369862306a36Sopenharmony_ci
369962306a36Sopenharmony_ci	if (mem_cgroup_is_root(memcg)) {
370062306a36Sopenharmony_ci		/*
370162306a36Sopenharmony_ci		 * Approximate root's usage from global state. This isn't
370262306a36Sopenharmony_ci		 * perfect, but the root usage was always an approximation.
370362306a36Sopenharmony_ci		 */
370462306a36Sopenharmony_ci		val = global_node_page_state(NR_FILE_PAGES) +
370562306a36Sopenharmony_ci			global_node_page_state(NR_ANON_MAPPED);
370662306a36Sopenharmony_ci		if (swap)
370762306a36Sopenharmony_ci			val += total_swap_pages - get_nr_swap_pages();
370862306a36Sopenharmony_ci	} else {
370962306a36Sopenharmony_ci		if (!swap)
371062306a36Sopenharmony_ci			val = page_counter_read(&memcg->memory);
371162306a36Sopenharmony_ci		else
371262306a36Sopenharmony_ci			val = page_counter_read(&memcg->memsw);
371362306a36Sopenharmony_ci	}
371462306a36Sopenharmony_ci	return val;
371562306a36Sopenharmony_ci}
371662306a36Sopenharmony_ci
/*
 * MEMFILE_ATTR() values: which attribute of a page_counter a cgroup1
 * control file refers to.
 */
enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};
372462306a36Sopenharmony_ci
/*
 * Read handler for the cgroup1 counter files.  cft->private encodes
 * which page_counter to consult (_MEM, _MEMSWAP, _KMEM, _TCP) and
 * which attribute to report (RES_*); results are in bytes.
 */
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		/*
		 * memory and memsw usage go through mem_cgroup_usage() so
		 * the root cgroup is approximated from global state.
		 */
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		/* failcnt is a bare event count, not a byte value */
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
	default:
		BUG();
	}
}
376762306a36Sopenharmony_ci
/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
 * Any actual read attempt fails with -EINVAL.
 */
static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
				     __always_unused void *v)
{
	return -EINVAL;
}
377762306a36Sopenharmony_ci
377862306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
/*
 * Set up kernel-memory accounting when @memcg comes online: allocate
 * its obj_cgroup and arm the kmem static key.
 * Returns 0 on success or -ENOMEM.
 */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
	struct obj_cgroup *objcg;

	if (mem_cgroup_kmem_disabled())
		return 0;

	/* The root cgroup gets no per-memcg objcg. */
	if (unlikely(mem_cgroup_is_root(memcg)))
		return 0;

	objcg = obj_cgroup_alloc();
	if (!objcg)
		return -ENOMEM;

	objcg->memcg = memcg;
	/* Publish the objcg only after ->memcg is initialized. */
	rcu_assign_pointer(memcg->objcg, objcg);

	static_branch_enable(&memcg_kmem_online_key);

	memcg->kmemcg_id = memcg->id.id;

	return 0;
}
380262306a36Sopenharmony_ci
/*
 * Tear down kernel-memory accounting when @memcg goes offline by
 * reparenting its objcgs and list_lrus to the nearest ancestor
 * (or the root cgroup if there is none).
 */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent;

	if (mem_cgroup_kmem_disabled())
		return;

	if (unlikely(mem_cgroup_is_root(memcg)))
		return;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	memcg_reparent_objcgs(memcg, parent);

	/*
	 * After we have finished memcg_reparent_objcgs(), all list_lrus
	 * corresponding to this cgroup are guaranteed to remain empty.
	 * The ordering is imposed by list_lru_node->lock taken by
	 * memcg_reparent_list_lrus().
	 */
	memcg_reparent_list_lrus(memcg, parent);
}
382762306a36Sopenharmony_ci#else
/* !CONFIG_MEMCG_KMEM: kernel-memory accounting is compiled out. */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
	return 0;
}
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}
383562306a36Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */
383662306a36Sopenharmony_ci
/*
 * Set the tcpmem limit (memory.kmem.tcp.limit_in_bytes) for @memcg.
 * The first successful write also arms socket memory accounting for
 * this memcg.  Returns 0 or a negative errno from page_counter_set_max().
 */
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
	int ret;

	mutex_lock(&memcg_max_mutex);

	ret = page_counter_set_max(&memcg->tcpmem, max);
	if (ret)
		goto out;

	if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update. This is what guarantees that the socket activation
		 * function is the last one to run. See mem_cgroup_sk_alloc()
		 * for details, and note that we don't mark any socket as
		 * belonging to this memcg until that flag is up.
		 *
		 * We need to do this, because static_keys will span multiple
		 * sites, but we can't control their order. If we mark a socket
		 * as accounted, but the accounting functions are not patched in
		 * yet, we'll lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value change, the code to process it is not
		 * patched in yet.
		 */
		static_branch_inc(&memcg_sockets_enabled_key);
		memcg->tcpmem_active = true;
	}
out:
	mutex_unlock(&memcg_max_mutex);
	return ret;
}
387162306a36Sopenharmony_ci
/*
 * Write handler for the cgroup1 limit files: RES_LIMIT and
 * RES_SOFT_LIMIT.  @buf carries a byte value, or "-1" for unlimited.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		/* Which counter's limit is being set depends on the file. */
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
			break;
		case _MEMSWAP:
			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
			break;
		case _KMEM:
			/* Deprecated: accepted but deliberately ignored. */
			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
				     "Writing any value to this file has no effect. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			ret = 0;
			break;
		case _TCP:
			ret = memcg_update_tcp_max(memcg, nr_pages);
			break;
		}
		break;
	case RES_SOFT_LIMIT:
		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
			ret = -EOPNOTSUPP;
		} else {
			WRITE_ONCE(memcg->soft_limit, nr_pages);
			ret = 0;
		}
		break;
	}
	/* On success report the full write as consumed. */
	return ret ?: nbytes;
}
392462306a36Sopenharmony_ci
392562306a36Sopenharmony_cistatic ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
392662306a36Sopenharmony_ci				size_t nbytes, loff_t off)
392762306a36Sopenharmony_ci{
392862306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
392962306a36Sopenharmony_ci	struct page_counter *counter;
393062306a36Sopenharmony_ci
393162306a36Sopenharmony_ci	switch (MEMFILE_TYPE(of_cft(of)->private)) {
393262306a36Sopenharmony_ci	case _MEM:
393362306a36Sopenharmony_ci		counter = &memcg->memory;
393462306a36Sopenharmony_ci		break;
393562306a36Sopenharmony_ci	case _MEMSWAP:
393662306a36Sopenharmony_ci		counter = &memcg->memsw;
393762306a36Sopenharmony_ci		break;
393862306a36Sopenharmony_ci	case _KMEM:
393962306a36Sopenharmony_ci		counter = &memcg->kmem;
394062306a36Sopenharmony_ci		break;
394162306a36Sopenharmony_ci	case _TCP:
394262306a36Sopenharmony_ci		counter = &memcg->tcpmem;
394362306a36Sopenharmony_ci		break;
394462306a36Sopenharmony_ci	default:
394562306a36Sopenharmony_ci		BUG();
394662306a36Sopenharmony_ci	}
394762306a36Sopenharmony_ci
394862306a36Sopenharmony_ci	switch (MEMFILE_ATTR(of_cft(of)->private)) {
394962306a36Sopenharmony_ci	case RES_MAX_USAGE:
395062306a36Sopenharmony_ci		page_counter_reset_watermark(counter);
395162306a36Sopenharmony_ci		break;
395262306a36Sopenharmony_ci	case RES_FAILCNT:
395362306a36Sopenharmony_ci		counter->failcnt = 0;
395462306a36Sopenharmony_ci		break;
395562306a36Sopenharmony_ci	default:
395662306a36Sopenharmony_ci		BUG();
395762306a36Sopenharmony_ci	}
395862306a36Sopenharmony_ci
395962306a36Sopenharmony_ci	return nbytes;
396062306a36Sopenharmony_ci}
396162306a36Sopenharmony_ci
/* cgroup1 memory.move_charge_at_immigrate: report the current move flags. */
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
					struct cftype *cft)
{
	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
}
396762306a36Sopenharmony_ci
396862306a36Sopenharmony_ci#ifdef CONFIG_MMU
/*
 * cgroup1 memory.move_charge_at_immigrate write handler.  @val is a
 * bitmask of MOVE_* flags; invalid bits are rejected with -EINVAL.
 * Always warns once about the feature's deprecation, even for
 * invalid writes.
 */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	if (val & ~MOVE_MASK)
		return -EINVAL;

	/*
	 * No kind of locking is needed in here, because ->can_attach() will
	 * check this value once in the beginning of the process, and then carry
	 * on with stale data. This means that changes to this value will only
	 * affect task migrations starting after the change.
	 */
	memcg->move_charge_at_immigrate = val;
	return 0;
}
399062306a36Sopenharmony_ci#else
/* !CONFIG_MMU: charge moving requires an MMU. */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
399662306a36Sopenharmony_ci#endif
399762306a36Sopenharmony_ci
399862306a36Sopenharmony_ci#ifdef CONFIG_NUMA
399962306a36Sopenharmony_ci
400062306a36Sopenharmony_ci#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
400162306a36Sopenharmony_ci#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
400262306a36Sopenharmony_ci#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
400362306a36Sopenharmony_ci
400462306a36Sopenharmony_cistatic unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
400562306a36Sopenharmony_ci				int nid, unsigned int lru_mask, bool tree)
400662306a36Sopenharmony_ci{
400762306a36Sopenharmony_ci	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
400862306a36Sopenharmony_ci	unsigned long nr = 0;
400962306a36Sopenharmony_ci	enum lru_list lru;
401062306a36Sopenharmony_ci
401162306a36Sopenharmony_ci	VM_BUG_ON((unsigned)nid >= nr_node_ids);
401262306a36Sopenharmony_ci
401362306a36Sopenharmony_ci	for_each_lru(lru) {
401462306a36Sopenharmony_ci		if (!(BIT(lru) & lru_mask))
401562306a36Sopenharmony_ci			continue;
401662306a36Sopenharmony_ci		if (tree)
401762306a36Sopenharmony_ci			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
401862306a36Sopenharmony_ci		else
401962306a36Sopenharmony_ci			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
402062306a36Sopenharmony_ci	}
402162306a36Sopenharmony_ci	return nr;
402262306a36Sopenharmony_ci}
402362306a36Sopenharmony_ci
402462306a36Sopenharmony_cistatic unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
402562306a36Sopenharmony_ci					     unsigned int lru_mask,
402662306a36Sopenharmony_ci					     bool tree)
402762306a36Sopenharmony_ci{
402862306a36Sopenharmony_ci	unsigned long nr = 0;
402962306a36Sopenharmony_ci	enum lru_list lru;
403062306a36Sopenharmony_ci
403162306a36Sopenharmony_ci	for_each_lru(lru) {
403262306a36Sopenharmony_ci		if (!(BIT(lru) & lru_mask))
403362306a36Sopenharmony_ci			continue;
403462306a36Sopenharmony_ci		if (tree)
403562306a36Sopenharmony_ci			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
403662306a36Sopenharmony_ci		else
403762306a36Sopenharmony_ci			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
403862306a36Sopenharmony_ci	}
403962306a36Sopenharmony_ci	return nr;
404062306a36Sopenharmony_ci}
404162306a36Sopenharmony_ci
/*
 * cgroup1 memory.numa_stat: per-node LRU page counts, one line per
 * category, first local then hierarchical ("hierarchical_" prefix).
 * Line format and ordering are userspace ABI.
 */
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
	struct numa_stat {
		const char *name;
		unsigned int lru_mask;
	};

	static const struct numa_stat stats[] = {
		{ "total", LRU_ALL },
		{ "file", LRU_ALL_FILE },
		{ "anon", LRU_ALL_ANON },
		{ "unevictable", BIT(LRU_UNEVICTABLE) },
	};
	const struct numa_stat *stat;
	int nid;
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	mem_cgroup_flush_stats();

	/* Local (non-hierarchical) counts. */
	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		seq_printf(m, "%s=%lu", stat->name,
			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
						   false));
		for_each_node_state(nid, N_MEMORY)
			seq_printf(m, " N%d=%lu", nid,
				   mem_cgroup_node_nr_lru_pages(memcg, nid,
							stat->lru_mask, false));
		seq_putc(m, '\n');
	}

	/* Hierarchical totals including descendants. */
	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {

		seq_printf(m, "hierarchical_%s=%lu", stat->name,
			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
						   true));
		for_each_node_state(nid, N_MEMORY)
			seq_printf(m, " N%d=%lu", nid,
				   mem_cgroup_node_nr_lru_pages(memcg, nid,
							stat->lru_mask, true));
		seq_putc(m, '\n');
	}

	return 0;
}
408662306a36Sopenharmony_ci#endif /* CONFIG_NUMA */
408762306a36Sopenharmony_ci
/*
 * vmstat items reported by cgroup1's memory.stat; must stay
 * index-aligned with memcg1_stat_names[] below.
 */
static const unsigned int memcg1_stats[] = {
	NR_FILE_PAGES,
	NR_ANON_MAPPED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	NR_ANON_THPS,
#endif
	NR_SHMEM,
	NR_FILE_MAPPED,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	WORKINGSET_REFAULT_ANON,
	WORKINGSET_REFAULT_FILE,
	MEMCG_SWAP,
};
410262306a36Sopenharmony_ci
/*
 * Display names for memcg1_stats[]; index-aligned with that array
 * (size equality is enforced by a BUILD_BUG_ON in memcg1_stat_format()).
 */
static const char *const memcg1_stat_names[] = {
	"cache",
	"rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"rss_huge",
#endif
	"shmem",
	"mapped_file",
	"dirty",
	"writeback",
	"workingset_refault_anon",
	"workingset_refault_file",
	"swap",
};
411762306a36Sopenharmony_ci
/*
 * Universal VM events cgroup1 shows, original sort order.  The order
 * is userspace ABI for memory.stat output.
 */
static const unsigned int memcg1_events[] = {
	PGPGIN,
	PGPGOUT,
	PGFAULT,
	PGMAJFAULT,
};
412562306a36Sopenharmony_ci
/*
 * Produce the body of cgroup1's memory.stat file into @s.  Output
 * order is userspace ABI: local stats, events and LRU sizes first,
 * then the hierarchical limits and "total_*" counterparts, and
 * finally (CONFIG_DEBUG_VM) the LRU cost accounting.
 */
static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	unsigned long memory, memsw;
	struct mem_cgroup *mi;
	unsigned int i;

	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));

	mem_cgroup_flush_stats();

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		unsigned long nr;

		/* "swap" is only shown when memsw accounting is active. */
		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
			continue;
		nr = memcg_page_state_local(memcg, memcg1_stats[i]);
		seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
			   nr * memcg_page_state_unit(memcg1_stats[i]));
	}

	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
		seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
			       memcg_events_local(memcg, memcg1_events[i]));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
			       memcg_page_state_local(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

	/* Hierarchical information */
	memory = memsw = PAGE_COUNTER_MAX;
	/* Report the tightest limit anywhere up the ancestry. */
	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
		memory = min(memory, READ_ONCE(mi->memory.max));
		memsw = min(memsw, READ_ONCE(mi->memsw.max));
	}
	seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
		       (u64)memory * PAGE_SIZE);
	if (do_memsw_account())
		seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
			       (u64)memsw * PAGE_SIZE);

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		unsigned long nr;

		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
			continue;
		nr = memcg_page_state(memcg, memcg1_stats[i]);
		seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
			   (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
	}

	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
		seq_buf_printf(s, "total_%s %llu\n",
			       vm_event_name(memcg1_events[i]),
			       (u64)memcg_events(memcg, memcg1_events[i]));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

#ifdef CONFIG_DEBUG_VM
	{
		pg_data_t *pgdat;
		struct mem_cgroup_per_node *mz;
		unsigned long anon_cost = 0;
		unsigned long file_cost = 0;

		for_each_online_pgdat(pgdat) {
			mz = memcg->nodeinfo[pgdat->node_id];

			anon_cost += mz->lruvec.anon_cost;
			file_cost += mz->lruvec.file_cost;
		}
		seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
		seq_buf_printf(s, "file_cost %lu\n", file_cost);
	}
#endif
}
420562306a36Sopenharmony_ci
420662306a36Sopenharmony_cistatic u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
420762306a36Sopenharmony_ci				      struct cftype *cft)
420862306a36Sopenharmony_ci{
420962306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
421062306a36Sopenharmony_ci
421162306a36Sopenharmony_ci	return mem_cgroup_swappiness(memcg);
421262306a36Sopenharmony_ci}
421362306a36Sopenharmony_ci
421462306a36Sopenharmony_cistatic int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
421562306a36Sopenharmony_ci				       struct cftype *cft, u64 val)
421662306a36Sopenharmony_ci{
421762306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
421862306a36Sopenharmony_ci
421962306a36Sopenharmony_ci	if (val > 200)
422062306a36Sopenharmony_ci		return -EINVAL;
422162306a36Sopenharmony_ci
422262306a36Sopenharmony_ci	if (!mem_cgroup_is_root(memcg))
422362306a36Sopenharmony_ci		WRITE_ONCE(memcg->swappiness, val);
422462306a36Sopenharmony_ci	else
422562306a36Sopenharmony_ci		WRITE_ONCE(vm_swappiness, val);
422662306a36Sopenharmony_ci
422762306a36Sopenharmony_ci	return 0;
422862306a36Sopenharmony_ci}
422962306a36Sopenharmony_ci
/*
 * Signal the eventfds of all thresholds crossed (in either direction)
 * since the last invocation, and leave current_threshold pointing at the
 * highest threshold that is <= the current usage.  @swap selects the
 * memsw threshold array instead of the plain memory one.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	/* RCU protects the array against concurrent wholesale replacement */
	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to threshold just below or equal to usage.
	 * If it's not true, a threshold was crossed after last
	 * call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}
428062306a36Sopenharmony_ci
428162306a36Sopenharmony_cistatic void mem_cgroup_threshold(struct mem_cgroup *memcg)
428262306a36Sopenharmony_ci{
428362306a36Sopenharmony_ci	while (memcg) {
428462306a36Sopenharmony_ci		__mem_cgroup_threshold(memcg, false);
428562306a36Sopenharmony_ci		if (do_memsw_account())
428662306a36Sopenharmony_ci			__mem_cgroup_threshold(memcg, true);
428762306a36Sopenharmony_ci
428862306a36Sopenharmony_ci		memcg = parent_mem_cgroup(memcg);
428962306a36Sopenharmony_ci	}
429062306a36Sopenharmony_ci}
429162306a36Sopenharmony_ci
429262306a36Sopenharmony_cistatic int compare_thresholds(const void *a, const void *b)
429362306a36Sopenharmony_ci{
429462306a36Sopenharmony_ci	const struct mem_cgroup_threshold *_a = a;
429562306a36Sopenharmony_ci	const struct mem_cgroup_threshold *_b = b;
429662306a36Sopenharmony_ci
429762306a36Sopenharmony_ci	if (_a->threshold > _b->threshold)
429862306a36Sopenharmony_ci		return 1;
429962306a36Sopenharmony_ci
430062306a36Sopenharmony_ci	if (_a->threshold < _b->threshold)
430162306a36Sopenharmony_ci		return -1;
430262306a36Sopenharmony_ci
430362306a36Sopenharmony_ci	return 0;
430462306a36Sopenharmony_ci}
430562306a36Sopenharmony_ci
430662306a36Sopenharmony_cistatic int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
430762306a36Sopenharmony_ci{
430862306a36Sopenharmony_ci	struct mem_cgroup_eventfd_list *ev;
430962306a36Sopenharmony_ci
431062306a36Sopenharmony_ci	spin_lock(&memcg_oom_lock);
431162306a36Sopenharmony_ci
431262306a36Sopenharmony_ci	list_for_each_entry(ev, &memcg->oom_notify, list)
431362306a36Sopenharmony_ci		eventfd_signal(ev->eventfd, 1);
431462306a36Sopenharmony_ci
431562306a36Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
431662306a36Sopenharmony_ci	return 0;
431762306a36Sopenharmony_ci}
431862306a36Sopenharmony_ci
431962306a36Sopenharmony_cistatic void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
432062306a36Sopenharmony_ci{
432162306a36Sopenharmony_ci	struct mem_cgroup *iter;
432262306a36Sopenharmony_ci
432362306a36Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg)
432462306a36Sopenharmony_ci		mem_cgroup_oom_notify_cb(iter);
432562306a36Sopenharmony_ci}
432662306a36Sopenharmony_ci
/*
 * Register @eventfd to be signalled when usage of @memcg crosses the
 * threshold given in @args (parsed as a byte value by
 * page_counter_memparse()).  @type selects the plain memory (_MEM) or
 * memory+swap (_MEMSWAP) counter.
 *
 * The threshold array is replaced wholesale under RCU: a new, one-larger
 * copy is built under thresholds_lock, sorted, and published with
 * rcu_assign_pointer(); the previous primary array is kept as ->spare
 * for reuse by a later unregister.
 *
 * Returns 0 on success or a negative errno.
 */
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries,
		       flex_array_size(new, entries, size - 1));

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(*new->entries),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}
440662306a36Sopenharmony_ci
/* Register @eventfd against the "memory" usage threshold in @args. */
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}
441262306a36Sopenharmony_ci
/* Register @eventfd against the "memory+swap" usage threshold in @args. */
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}
441862306a36Sopenharmony_ci
/*
 * Remove every threshold entry registered with @eventfd from @memcg's
 * threshold array for @type.  Surviving entries are copied into the
 * ->spare buffer saved by the previous register/unregister cycle, which
 * is then RCU-published as the new primary array (or NULL when no
 * thresholds remain).
 */
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long usage;
	int i, j, size, entries;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	/* Check if a threshold crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate new number of threshold */
	size = entries = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
		else
			entries++;
	}

	new = thresholds->spare;

	/* If no items related to eventfd have been cleared, nothing to do */
	if (!entries)
		goto unlock;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

	/* If all events are unregistered, free the spare array */
	if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}
450362306a36Sopenharmony_ci
450462306a36Sopenharmony_cistatic void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
450562306a36Sopenharmony_ci	struct eventfd_ctx *eventfd)
450662306a36Sopenharmony_ci{
450762306a36Sopenharmony_ci	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
450862306a36Sopenharmony_ci}
450962306a36Sopenharmony_ci
451062306a36Sopenharmony_cistatic void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
451162306a36Sopenharmony_ci	struct eventfd_ctx *eventfd)
451262306a36Sopenharmony_ci{
451362306a36Sopenharmony_ci	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
451462306a36Sopenharmony_ci}
451562306a36Sopenharmony_ci
451662306a36Sopenharmony_cistatic int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
451762306a36Sopenharmony_ci	struct eventfd_ctx *eventfd, const char *args)
451862306a36Sopenharmony_ci{
451962306a36Sopenharmony_ci	struct mem_cgroup_eventfd_list *event;
452062306a36Sopenharmony_ci
452162306a36Sopenharmony_ci	event = kmalloc(sizeof(*event),	GFP_KERNEL);
452262306a36Sopenharmony_ci	if (!event)
452362306a36Sopenharmony_ci		return -ENOMEM;
452462306a36Sopenharmony_ci
452562306a36Sopenharmony_ci	spin_lock(&memcg_oom_lock);
452662306a36Sopenharmony_ci
452762306a36Sopenharmony_ci	event->eventfd = eventfd;
452862306a36Sopenharmony_ci	list_add(&event->list, &memcg->oom_notify);
452962306a36Sopenharmony_ci
453062306a36Sopenharmony_ci	/* already in OOM ? */
453162306a36Sopenharmony_ci	if (memcg->under_oom)
453262306a36Sopenharmony_ci		eventfd_signal(eventfd, 1);
453362306a36Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
453462306a36Sopenharmony_ci
453562306a36Sopenharmony_ci	return 0;
453662306a36Sopenharmony_ci}
453762306a36Sopenharmony_ci
453862306a36Sopenharmony_cistatic void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
453962306a36Sopenharmony_ci	struct eventfd_ctx *eventfd)
454062306a36Sopenharmony_ci{
454162306a36Sopenharmony_ci	struct mem_cgroup_eventfd_list *ev, *tmp;
454262306a36Sopenharmony_ci
454362306a36Sopenharmony_ci	spin_lock(&memcg_oom_lock);
454462306a36Sopenharmony_ci
454562306a36Sopenharmony_ci	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
454662306a36Sopenharmony_ci		if (ev->eventfd == eventfd) {
454762306a36Sopenharmony_ci			list_del(&ev->list);
454862306a36Sopenharmony_ci			kfree(ev);
454962306a36Sopenharmony_ci		}
455062306a36Sopenharmony_ci	}
455162306a36Sopenharmony_ci
455262306a36Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
455362306a36Sopenharmony_ci}
455462306a36Sopenharmony_ci
455562306a36Sopenharmony_cistatic int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
455662306a36Sopenharmony_ci{
455762306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
455862306a36Sopenharmony_ci
455962306a36Sopenharmony_ci	seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
456062306a36Sopenharmony_ci	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
456162306a36Sopenharmony_ci	seq_printf(sf, "oom_kill %lu\n",
456262306a36Sopenharmony_ci		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
456362306a36Sopenharmony_ci	return 0;
456462306a36Sopenharmony_ci}
456562306a36Sopenharmony_ci
456662306a36Sopenharmony_cistatic int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
456762306a36Sopenharmony_ci	struct cftype *cft, u64 val)
456862306a36Sopenharmony_ci{
456962306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
457062306a36Sopenharmony_ci
457162306a36Sopenharmony_ci	/* cannot set to root cgroup and only 0 and 1 are allowed */
457262306a36Sopenharmony_ci	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
457362306a36Sopenharmony_ci		return -EINVAL;
457462306a36Sopenharmony_ci
457562306a36Sopenharmony_ci	WRITE_ONCE(memcg->oom_kill_disable, val);
457662306a36Sopenharmony_ci	if (!val)
457762306a36Sopenharmony_ci		memcg_oom_recover(memcg);
457862306a36Sopenharmony_ci
457962306a36Sopenharmony_ci	return 0;
458062306a36Sopenharmony_ci}
458162306a36Sopenharmony_ci
458262306a36Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK
458362306a36Sopenharmony_ci
458462306a36Sopenharmony_ci#include <trace/events/writeback.h>
458562306a36Sopenharmony_ci
/* Initialize the writeback bandwidth domain backing this memcg. */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
	return wb_domain_init(&memcg->cgwb_domain, gfp);
}
459062306a36Sopenharmony_ci
/* Tear down the memcg's writeback bandwidth domain. */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
	wb_domain_exit(&memcg->cgwb_domain);
}
459562306a36Sopenharmony_ci
/* Notify the wb domain that the memcg's memory limits changed. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
	wb_domain_size_changed(&memcg->cgwb_domain);
}
460062306a36Sopenharmony_ci
460162306a36Sopenharmony_cistruct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
460262306a36Sopenharmony_ci{
460362306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
460462306a36Sopenharmony_ci
460562306a36Sopenharmony_ci	if (!memcg->css.parent)
460662306a36Sopenharmony_ci		return NULL;
460762306a36Sopenharmony_ci
460862306a36Sopenharmony_ci	return &memcg->cgwb_domain;
460962306a36Sopenharmony_ci}
461062306a36Sopenharmony_ci
461162306a36Sopenharmony_ci/**
461262306a36Sopenharmony_ci * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
461362306a36Sopenharmony_ci * @wb: bdi_writeback in question
461462306a36Sopenharmony_ci * @pfilepages: out parameter for number of file pages
461562306a36Sopenharmony_ci * @pheadroom: out parameter for number of allocatable pages according to memcg
461662306a36Sopenharmony_ci * @pdirty: out parameter for number of dirty pages
461762306a36Sopenharmony_ci * @pwriteback: out parameter for number of pages under writeback
461862306a36Sopenharmony_ci *
461962306a36Sopenharmony_ci * Determine the numbers of file, headroom, dirty, and writeback pages in
462062306a36Sopenharmony_ci * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
462162306a36Sopenharmony_ci * is a bit more involved.
462262306a36Sopenharmony_ci *
462362306a36Sopenharmony_ci * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
462462306a36Sopenharmony_ci * headroom is calculated as the lowest headroom of itself and the
462562306a36Sopenharmony_ci * ancestors.  Note that this doesn't consider the actual amount of
462662306a36Sopenharmony_ci * available memory in the system.  The caller should further cap
462762306a36Sopenharmony_ci * *@pheadroom accordingly.
462862306a36Sopenharmony_ci */
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
			 unsigned long *pheadroom, unsigned long *pdirty,
			 unsigned long *pwriteback)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
	struct mem_cgroup *parent;

	/* Make sure the per-memcg counters read below are reasonably fresh. */
	mem_cgroup_flush_stats();

	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
	*pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
			memcg_page_state(memcg, NR_ACTIVE_FILE);

	/*
	 * Walk up the hierarchy taking the minimum headroom of all
	 * ancestors; the root memcg (no parent) terminates the loop.
	 */
	*pheadroom = PAGE_COUNTER_MAX;
	while ((parent = parent_mem_cgroup(memcg))) {
		unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
					    READ_ONCE(memcg->memory.high));
		unsigned long used = page_counter_read(&memcg->memory);

		/* min(ceiling, used) avoids underflow when used > ceiling */
		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
		memcg = parent;
	}
}
465362306a36Sopenharmony_ci
465462306a36Sopenharmony_ci/*
465562306a36Sopenharmony_ci * Foreign dirty flushing
465662306a36Sopenharmony_ci *
465762306a36Sopenharmony_ci * There's an inherent mismatch between memcg and writeback.  The former
465862306a36Sopenharmony_ci * tracks ownership per-page while the latter per-inode.  This was a
465962306a36Sopenharmony_ci * deliberate design decision because honoring per-page ownership in the
466062306a36Sopenharmony_ci * writeback path is complicated, may lead to higher CPU and IO overheads
466162306a36Sopenharmony_ci * and deemed unnecessary given that write-sharing an inode across
466262306a36Sopenharmony_ci * different cgroups isn't a common use-case.
466362306a36Sopenharmony_ci *
466462306a36Sopenharmony_ci * Combined with inode majority-writer ownership switching, this works well
466562306a36Sopenharmony_ci * enough in most cases but there are some pathological cases.  For
466662306a36Sopenharmony_ci * example, let's say there are two cgroups A and B which keep writing to
466762306a36Sopenharmony_ci * different but confined parts of the same inode.  B owns the inode and
466862306a36Sopenharmony_ci * A's memory is limited far below B's.  A's dirty ratio can rise enough to
466962306a36Sopenharmony_ci * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
467062306a36Sopenharmony_ci * triggering background writeback.  A will be slowed down without a way to
467162306a36Sopenharmony_ci * make writeback of the dirty pages happen.
467262306a36Sopenharmony_ci *
467362306a36Sopenharmony_ci * Conditions like the above can lead to a cgroup getting repeatedly and
467462306a36Sopenharmony_ci * severely throttled after making some progress after each
467562306a36Sopenharmony_ci * dirty_expire_interval while the underlying IO device is almost
467662306a36Sopenharmony_ci * completely idle.
467762306a36Sopenharmony_ci *
467862306a36Sopenharmony_ci * Solving this problem completely requires matching the ownership tracking
467962306a36Sopenharmony_ci * granularities between memcg and writeback in either direction.  However,
468062306a36Sopenharmony_ci * the more egregious behaviors can be avoided by simply remembering the
468162306a36Sopenharmony_ci * most recent foreign dirtying events and initiating remote flushes on
468262306a36Sopenharmony_ci * them when local writeback isn't enough to keep the memory clean enough.
468362306a36Sopenharmony_ci *
468462306a36Sopenharmony_ci * The following two functions implement such mechanism.  When a foreign
468562306a36Sopenharmony_ci * page - a page whose memcg and writeback ownerships don't match - is
468662306a36Sopenharmony_ci * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
468762306a36Sopenharmony_ci * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
468862306a36Sopenharmony_ci * decides that the memcg needs to sleep due to high dirty ratio, it calls
468962306a36Sopenharmony_ci * mem_cgroup_flush_foreign() which queues writeback on the recorded
469062306a36Sopenharmony_ci * foreign bdi_writebacks which haven't expired.  Both the numbers of
469162306a36Sopenharmony_ci * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
469262306a36Sopenharmony_ci * limited to MEMCG_CGWB_FRN_CNT.
469362306a36Sopenharmony_ci *
469462306a36Sopenharmony_ci * The mechanism only remembers IDs and doesn't hold any object references.
469562306a36Sopenharmony_ci * As being wrong occasionally doesn't matter, updates and accesses to the
469662306a36Sopenharmony_ci * records are lockless and racy.
469762306a36Sopenharmony_ci */
/*
 * Record that @wb - which does not own @folio's memcg - dirtied the
 * folio, so mem_cgroup_flush_foreign() can later kick writeback on it.
 * See the "Foreign dirty flushing" comment above for the big picture;
 * updates here are deliberately lockless and racy.
 */
void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
					     struct bdi_writeback *wb)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct memcg_cgwb_frn *frn;
	u64 now = get_jiffies_64();
	u64 oldest_at = now;
	int oldest = -1;
	int i;

	trace_track_foreign_dirty(folio, wb);

	/*
	 * Pick the slot to use.  If there is already a slot for @wb, keep
	 * using it.  If not replace the oldest one which isn't being
	 * written out.
	 */
	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
		frn = &memcg->cgwb_frn[i];
		if (frn->bdi_id == wb->bdi->id &&
		    frn->memcg_id == wb->memcg_css->id)
			break;
		/* done.cnt == 1 means no flush is currently in flight */
		if (time_before64(frn->at, oldest_at) &&
		    atomic_read(&frn->done.cnt) == 1) {
			oldest = i;
			oldest_at = frn->at;
		}
	}

	if (i < MEMCG_CGWB_FRN_CNT) {
		/*
		 * Re-using an existing one.  Update timestamp lazily to
		 * avoid making the cacheline hot.  We want them to be
		 * reasonably up-to-date and significantly shorter than
		 * dirty_expire_interval as that's what expires the record.
		 * Use the shorter of 1s and dirty_expire_interval / 8.
		 */
		unsigned long update_intv =
			min_t(unsigned long, HZ,
			      msecs_to_jiffies(dirty_expire_interval * 10) / 8);

		if (time_before64(frn->at, now - update_intv))
			frn->at = now;
	} else if (oldest >= 0) {
		/* replace the oldest free one */
		frn = &memcg->cgwb_frn[oldest];
		frn->bdi_id = wb->bdi->id;
		frn->memcg_id = wb->memcg_css->id;
		frn->at = now;
	}
}
474962306a36Sopenharmony_ci
/* issue foreign writeback flushes for recorded foreign dirtying events */
void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
	unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
	u64 now = jiffies_64;
	int i;

	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
		struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];

		/*
		 * If the record is older than dirty_expire_interval,
		 * writeback on it has already started.  No need to kick it
		 * off again.  Also, don't start a new one if there's
		 * already one in flight.
		 */
		if (time_after64(frn->at, now - intv) &&
		    atomic_read(&frn->done.cnt) == 1) {
			/* consume the record before starting writeback */
			frn->at = 0;
			trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
					       WB_REASON_FOREIGN_FLUSH,
					       &frn->done);
		}
	}
}
477762306a36Sopenharmony_ci
477862306a36Sopenharmony_ci#else	/* CONFIG_CGROUP_WRITEBACK */
477962306a36Sopenharmony_ci
/* CONFIG_CGROUP_WRITEBACK=n stub: no per-memcg wb domain to set up. */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
	return 0;
}
478462306a36Sopenharmony_ci
/* CONFIG_CGROUP_WRITEBACK=n stub. */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
}
478862306a36Sopenharmony_ci
/* CONFIG_CGROUP_WRITEBACK=n stub. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
}
479262306a36Sopenharmony_ci
479362306a36Sopenharmony_ci#endif	/* CONFIG_CGROUP_WRITEBACK */
479462306a36Sopenharmony_ci
479562306a36Sopenharmony_ci/*
479662306a36Sopenharmony_ci * DO NOT USE IN NEW FILES.
479762306a36Sopenharmony_ci *
479862306a36Sopenharmony_ci * "cgroup.event_control" implementation.
479962306a36Sopenharmony_ci *
480062306a36Sopenharmony_ci * This is way over-engineered.  It tries to support fully configurable
480162306a36Sopenharmony_ci * events for each user.  Such level of flexibility is completely
480262306a36Sopenharmony_ci * unnecessary especially in the light of the planned unified hierarchy.
480362306a36Sopenharmony_ci *
480462306a36Sopenharmony_ci * Please deprecate this and replace with something simpler if at all
480562306a36Sopenharmony_ci * possible.
480662306a36Sopenharmony_ci */
480762306a36Sopenharmony_ci
480862306a36Sopenharmony_ci/*
480962306a36Sopenharmony_ci * Unregister event and free resources.
481062306a36Sopenharmony_ci *
481162306a36Sopenharmony_ci * Gets called from workqueue.
481262306a36Sopenharmony_ci */
static void memcg_event_remove(struct work_struct *work)
{
	struct mem_cgroup_event *event =
		container_of(work, struct mem_cgroup_event, remove);
	struct mem_cgroup *memcg = event->memcg;

	/* Detach our wait entry from the eventfd's wait queue first. */
	remove_wait_queue(event->wqh, &event->wait);

	event->unregister_event(memcg, event->eventfd);

	/* Notify userspace the event is going away. */
	eventfd_signal(event->eventfd, 1);

	/* Drop the eventfd ref taken at registration, then the event itself. */
	eventfd_ctx_put(event->eventfd);
	kfree(event);
	/* Release the css ref that kept the memcg alive for this event. */
	css_put(&memcg->css);
}
483062306a36Sopenharmony_ci
483162306a36Sopenharmony_ci/*
483262306a36Sopenharmony_ci * Gets called on EPOLLHUP on eventfd when user closes it.
483362306a36Sopenharmony_ci *
483462306a36Sopenharmony_ci * Called with wqh->lock held and interrupts disabled.
483562306a36Sopenharmony_ci */
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
			    int sync, void *key)
{
	struct mem_cgroup_event *event =
		container_of(wait, struct mem_cgroup_event, wait);
	struct mem_cgroup *memcg = event->memcg;
	__poll_t flags = key_to_poll(key);

	/* Only hangup triggers teardown; all other wakeups are ignored. */
	if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&memcg->event_list_lock);
		if (!list_empty(&event->list)) {
			/* Claim the event so only we schedule its removal. */
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&memcg->event_list_lock);
	}

	return 0;
}
486862306a36Sopenharmony_ci
/*
 * poll_table callback: remember the wait queue head (memcg_event_remove()
 * needs it to detach later) and register our wakeup-function wait entry.
 */
static void memcg_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct mem_cgroup_event *event =
		container_of(pt, struct mem_cgroup_event, pt);

	event->wqh = wqh;
	add_wait_queue(wqh, &event->wait);
}
487862306a36Sopenharmony_ci
487962306a36Sopenharmony_ci/*
488062306a36Sopenharmony_ci * DO NOT USE IN NEW FILES.
488162306a36Sopenharmony_ci *
488262306a36Sopenharmony_ci * Parse input and register new cgroup event handler.
488362306a36Sopenharmony_ci *
488462306a36Sopenharmony_ci * Input must be in format '<event_fd> <control_fd> <args>'.
488562306a36Sopenharmony_ci * Interpretation of args is defined by control file implementation.
488662306a36Sopenharmony_ci */
488762306a36Sopenharmony_cistatic ssize_t memcg_write_event_control(struct kernfs_open_file *of,
488862306a36Sopenharmony_ci					 char *buf, size_t nbytes, loff_t off)
488962306a36Sopenharmony_ci{
489062306a36Sopenharmony_ci	struct cgroup_subsys_state *css = of_css(of);
489162306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
489262306a36Sopenharmony_ci	struct mem_cgroup_event *event;
489362306a36Sopenharmony_ci	struct cgroup_subsys_state *cfile_css;
489462306a36Sopenharmony_ci	unsigned int efd, cfd;
489562306a36Sopenharmony_ci	struct fd efile;
489662306a36Sopenharmony_ci	struct fd cfile;
489762306a36Sopenharmony_ci	struct dentry *cdentry;
489862306a36Sopenharmony_ci	const char *name;
489962306a36Sopenharmony_ci	char *endp;
490062306a36Sopenharmony_ci	int ret;
490162306a36Sopenharmony_ci
490262306a36Sopenharmony_ci	if (IS_ENABLED(CONFIG_PREEMPT_RT))
490362306a36Sopenharmony_ci		return -EOPNOTSUPP;
490462306a36Sopenharmony_ci
490562306a36Sopenharmony_ci	buf = strstrip(buf);
490662306a36Sopenharmony_ci
490762306a36Sopenharmony_ci	efd = simple_strtoul(buf, &endp, 10);
490862306a36Sopenharmony_ci	if (*endp != ' ')
490962306a36Sopenharmony_ci		return -EINVAL;
491062306a36Sopenharmony_ci	buf = endp + 1;
491162306a36Sopenharmony_ci
491262306a36Sopenharmony_ci	cfd = simple_strtoul(buf, &endp, 10);
491362306a36Sopenharmony_ci	if ((*endp != ' ') && (*endp != '\0'))
491462306a36Sopenharmony_ci		return -EINVAL;
491562306a36Sopenharmony_ci	buf = endp + 1;
491662306a36Sopenharmony_ci
491762306a36Sopenharmony_ci	event = kzalloc(sizeof(*event), GFP_KERNEL);
491862306a36Sopenharmony_ci	if (!event)
491962306a36Sopenharmony_ci		return -ENOMEM;
492062306a36Sopenharmony_ci
492162306a36Sopenharmony_ci	event->memcg = memcg;
492262306a36Sopenharmony_ci	INIT_LIST_HEAD(&event->list);
492362306a36Sopenharmony_ci	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
492462306a36Sopenharmony_ci	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
492562306a36Sopenharmony_ci	INIT_WORK(&event->remove, memcg_event_remove);
492662306a36Sopenharmony_ci
492762306a36Sopenharmony_ci	efile = fdget(efd);
492862306a36Sopenharmony_ci	if (!efile.file) {
492962306a36Sopenharmony_ci		ret = -EBADF;
493062306a36Sopenharmony_ci		goto out_kfree;
493162306a36Sopenharmony_ci	}
493262306a36Sopenharmony_ci
493362306a36Sopenharmony_ci	event->eventfd = eventfd_ctx_fileget(efile.file);
493462306a36Sopenharmony_ci	if (IS_ERR(event->eventfd)) {
493562306a36Sopenharmony_ci		ret = PTR_ERR(event->eventfd);
493662306a36Sopenharmony_ci		goto out_put_efile;
493762306a36Sopenharmony_ci	}
493862306a36Sopenharmony_ci
493962306a36Sopenharmony_ci	cfile = fdget(cfd);
494062306a36Sopenharmony_ci	if (!cfile.file) {
494162306a36Sopenharmony_ci		ret = -EBADF;
494262306a36Sopenharmony_ci		goto out_put_eventfd;
494362306a36Sopenharmony_ci	}
494462306a36Sopenharmony_ci
494562306a36Sopenharmony_ci	/* the process need read permission on control file */
494662306a36Sopenharmony_ci	/* AV: shouldn't we check that it's been opened for read instead? */
494762306a36Sopenharmony_ci	ret = file_permission(cfile.file, MAY_READ);
494862306a36Sopenharmony_ci	if (ret < 0)
494962306a36Sopenharmony_ci		goto out_put_cfile;
495062306a36Sopenharmony_ci
495162306a36Sopenharmony_ci	/*
495262306a36Sopenharmony_ci	 * The control file must be a regular cgroup1 file. As a regular cgroup
495362306a36Sopenharmony_ci	 * file can't be renamed, it's safe to access its name afterwards.
495462306a36Sopenharmony_ci	 */
495562306a36Sopenharmony_ci	cdentry = cfile.file->f_path.dentry;
495662306a36Sopenharmony_ci	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
495762306a36Sopenharmony_ci		ret = -EINVAL;
495862306a36Sopenharmony_ci		goto out_put_cfile;
495962306a36Sopenharmony_ci	}
496062306a36Sopenharmony_ci
496162306a36Sopenharmony_ci	/*
496262306a36Sopenharmony_ci	 * Determine the event callbacks and set them in @event.  This used
496362306a36Sopenharmony_ci	 * to be done via struct cftype but cgroup core no longer knows
496462306a36Sopenharmony_ci	 * about these events.  The following is crude but the whole thing
496562306a36Sopenharmony_ci	 * is for compatibility anyway.
496662306a36Sopenharmony_ci	 *
496762306a36Sopenharmony_ci	 * DO NOT ADD NEW FILES.
496862306a36Sopenharmony_ci	 */
496962306a36Sopenharmony_ci	name = cdentry->d_name.name;
497062306a36Sopenharmony_ci
497162306a36Sopenharmony_ci	if (!strcmp(name, "memory.usage_in_bytes")) {
497262306a36Sopenharmony_ci		event->register_event = mem_cgroup_usage_register_event;
497362306a36Sopenharmony_ci		event->unregister_event = mem_cgroup_usage_unregister_event;
497462306a36Sopenharmony_ci	} else if (!strcmp(name, "memory.oom_control")) {
497562306a36Sopenharmony_ci		event->register_event = mem_cgroup_oom_register_event;
497662306a36Sopenharmony_ci		event->unregister_event = mem_cgroup_oom_unregister_event;
497762306a36Sopenharmony_ci	} else if (!strcmp(name, "memory.pressure_level")) {
497862306a36Sopenharmony_ci		event->register_event = vmpressure_register_event;
497962306a36Sopenharmony_ci		event->unregister_event = vmpressure_unregister_event;
498062306a36Sopenharmony_ci	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
498162306a36Sopenharmony_ci		event->register_event = memsw_cgroup_usage_register_event;
498262306a36Sopenharmony_ci		event->unregister_event = memsw_cgroup_usage_unregister_event;
498362306a36Sopenharmony_ci	} else {
498462306a36Sopenharmony_ci		ret = -EINVAL;
498562306a36Sopenharmony_ci		goto out_put_cfile;
498662306a36Sopenharmony_ci	}
498762306a36Sopenharmony_ci
498862306a36Sopenharmony_ci	/*
498962306a36Sopenharmony_ci	 * Verify @cfile should belong to @css.  Also, remaining events are
499062306a36Sopenharmony_ci	 * automatically removed on cgroup destruction but the removal is
499162306a36Sopenharmony_ci	 * asynchronous, so take an extra ref on @css.
499262306a36Sopenharmony_ci	 */
499362306a36Sopenharmony_ci	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
499462306a36Sopenharmony_ci					       &memory_cgrp_subsys);
499562306a36Sopenharmony_ci	ret = -EINVAL;
499662306a36Sopenharmony_ci	if (IS_ERR(cfile_css))
499762306a36Sopenharmony_ci		goto out_put_cfile;
499862306a36Sopenharmony_ci	if (cfile_css != css) {
499962306a36Sopenharmony_ci		css_put(cfile_css);
500062306a36Sopenharmony_ci		goto out_put_cfile;
500162306a36Sopenharmony_ci	}
500262306a36Sopenharmony_ci
500362306a36Sopenharmony_ci	ret = event->register_event(memcg, event->eventfd, buf);
500462306a36Sopenharmony_ci	if (ret)
500562306a36Sopenharmony_ci		goto out_put_css;
500662306a36Sopenharmony_ci
500762306a36Sopenharmony_ci	vfs_poll(efile.file, &event->pt);
500862306a36Sopenharmony_ci
500962306a36Sopenharmony_ci	spin_lock_irq(&memcg->event_list_lock);
501062306a36Sopenharmony_ci	list_add(&event->list, &memcg->event_list);
501162306a36Sopenharmony_ci	spin_unlock_irq(&memcg->event_list_lock);
501262306a36Sopenharmony_ci
501362306a36Sopenharmony_ci	fdput(cfile);
501462306a36Sopenharmony_ci	fdput(efile);
501562306a36Sopenharmony_ci
501662306a36Sopenharmony_ci	return nbytes;
501762306a36Sopenharmony_ci
501862306a36Sopenharmony_ciout_put_css:
501962306a36Sopenharmony_ci	css_put(css);
502062306a36Sopenharmony_ciout_put_cfile:
502162306a36Sopenharmony_ci	fdput(cfile);
502262306a36Sopenharmony_ciout_put_eventfd:
502362306a36Sopenharmony_ci	eventfd_ctx_put(event->eventfd);
502462306a36Sopenharmony_ciout_put_efile:
502562306a36Sopenharmony_ci	fdput(efile);
502662306a36Sopenharmony_ciout_kfree:
502762306a36Sopenharmony_ci	kfree(event);
502862306a36Sopenharmony_ci
502962306a36Sopenharmony_ci	return ret;
503062306a36Sopenharmony_ci}
503162306a36Sopenharmony_ci
503262306a36Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
/* Deprecated kmem.slabinfo seq_show: intentionally emits nothing. */
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	/*
	 * Deprecated.
	 * Please, take a look at tools/cgroup/memcg_slabinfo.py .
	 */
	return 0;
}
504162306a36Sopenharmony_ci#endif
504262306a36Sopenharmony_ci
504362306a36Sopenharmony_cistatic int memory_stat_show(struct seq_file *m, void *v);
504462306a36Sopenharmony_ci
/*
 * Control files exposed on the cgroup v1 (legacy) hierarchy.  Each entry
 * maps a file name to its read/write handlers; MEMFILE_PRIVATE() encodes
 * which counter (_MEM/_KMEM/_TCP) and which field (usage/limit/...) the
 * generic mem_cgroup_read_u64()/mem_cgroup_write()/mem_cgroup_reset()
 * handlers should operate on.
 */
static struct cftype mem_cgroup_legacy_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "stat",
		.seq_show = memory_stat_show,
	},
	{
		.name = "force_empty",
		.write = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "cgroup.event_control",		/* XXX: for compat */
		.write = memcg_write_event_control,
		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		.name = "oom_control",
		.seq_show = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
	},
	{
		.name = "pressure_level",
		.seq_show = mem_cgroup_dummy_seq_show,
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memcg_numa_stat_show,
	},
#endif
	{
		.name = "kmem.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.failcnt",
		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
#if defined(CONFIG_MEMCG_KMEM) && \
	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
	{
		.name = "kmem.slabinfo",
		.seq_show = mem_cgroup_slab_show,
	},
#endif
	{
		.name = "kmem.tcp.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.failcnt",
		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};
517362306a36Sopenharmony_ci
517462306a36Sopenharmony_ci/*
517562306a36Sopenharmony_ci * Private memory cgroup IDR
517662306a36Sopenharmony_ci *
517762306a36Sopenharmony_ci * Swap-out records and page cache shadow entries need to store memcg
517862306a36Sopenharmony_ci * references in constrained space, so we maintain an ID space that is
517962306a36Sopenharmony_ci * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
518062306a36Sopenharmony_ci * memory-controlled cgroups to 64k.
518162306a36Sopenharmony_ci *
518262306a36Sopenharmony_ci * However, there usually are many references to the offline CSS after
518362306a36Sopenharmony_ci * the cgroup has been destroyed, such as page cache or reclaimable
518462306a36Sopenharmony_ci * slab objects, that don't need to hang on to the ID. We want to keep
518562306a36Sopenharmony_ci * those dead CSS from occupying IDs, or we might quickly exhaust the
518662306a36Sopenharmony_ci * relatively small ID space and prevent the creation of new cgroups
518762306a36Sopenharmony_ci * even when there are much fewer than 64k cgroups - possibly none.
518862306a36Sopenharmony_ci *
518962306a36Sopenharmony_ci * Maintain a private 16-bit ID space for memcg, and allow the ID to
519062306a36Sopenharmony_ci * be freed and recycled when it's no longer needed, which is usually
519162306a36Sopenharmony_ci * when the CSS is offlined.
519262306a36Sopenharmony_ci *
519362306a36Sopenharmony_ci * The only exception to that are records of swapped out tmpfs/shmem
519462306a36Sopenharmony_ci * pages that need to be attributed to live ancestors on swapin. But
519562306a36Sopenharmony_ci * those references are manageable from userspace.
519662306a36Sopenharmony_ci */
519762306a36Sopenharmony_ci
519862306a36Sopenharmony_ci#define MEM_CGROUP_ID_MAX	((1UL << MEM_CGROUP_ID_SHIFT) - 1)
519962306a36Sopenharmony_cistatic DEFINE_IDR(mem_cgroup_idr);
520062306a36Sopenharmony_ci
520162306a36Sopenharmony_cistatic void mem_cgroup_id_remove(struct mem_cgroup *memcg)
520262306a36Sopenharmony_ci{
520362306a36Sopenharmony_ci	if (memcg->id.id > 0) {
520462306a36Sopenharmony_ci		idr_remove(&mem_cgroup_idr, memcg->id.id);
520562306a36Sopenharmony_ci		memcg->id.id = 0;
520662306a36Sopenharmony_ci	}
520762306a36Sopenharmony_ci}
520862306a36Sopenharmony_ci
/* Take @n extra references on the memcg's private ID. */
static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
						  unsigned int n)
{
	refcount_add(n, &memcg->id.ref);
}
521462306a36Sopenharmony_ci
/*
 * Drop @n references on the memcg's private ID.  When the last reference
 * goes, recycle the ID and drop the css reference the ID was pinning.
 */
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
	if (refcount_sub_and_test(n, &memcg->id.ref)) {
		mem_cgroup_id_remove(memcg);

		/* Memcg ID pins CSS */
		css_put(&memcg->css);
	}
}
522462306a36Sopenharmony_ci
/* Drop a single reference on the memcg's private ID. */
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
	mem_cgroup_id_put_many(memcg, 1);
}
522962306a36Sopenharmony_ci
523062306a36Sopenharmony_ci/**
523162306a36Sopenharmony_ci * mem_cgroup_from_id - look up a memcg from a memcg id
523262306a36Sopenharmony_ci * @id: the memcg id to look up
523362306a36Sopenharmony_ci *
523462306a36Sopenharmony_ci * Caller must hold rcu_read_lock().
523562306a36Sopenharmony_ci */
523662306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_id(unsigned short id)
523762306a36Sopenharmony_ci{
523862306a36Sopenharmony_ci	WARN_ON_ONCE(!rcu_read_lock_held());
523962306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU
524062306a36Sopenharmony_ci	if (id == -1)
524162306a36Sopenharmony_ci		return NULL;
524262306a36Sopenharmony_ci#endif
524362306a36Sopenharmony_ci	return idr_find(&mem_cgroup_idr, id);
524462306a36Sopenharmony_ci}
524562306a36Sopenharmony_ci
524662306a36Sopenharmony_ci#ifdef CONFIG_SHRINKER_DEBUG
524762306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
524862306a36Sopenharmony_ci{
524962306a36Sopenharmony_ci	struct cgroup *cgrp;
525062306a36Sopenharmony_ci	struct cgroup_subsys_state *css;
525162306a36Sopenharmony_ci	struct mem_cgroup *memcg;
525262306a36Sopenharmony_ci
525362306a36Sopenharmony_ci	cgrp = cgroup_get_from_id(ino);
525462306a36Sopenharmony_ci	if (IS_ERR(cgrp))
525562306a36Sopenharmony_ci		return ERR_CAST(cgrp);
525662306a36Sopenharmony_ci
525762306a36Sopenharmony_ci	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
525862306a36Sopenharmony_ci	if (css)
525962306a36Sopenharmony_ci		memcg = container_of(css, struct mem_cgroup, css);
526062306a36Sopenharmony_ci	else
526162306a36Sopenharmony_ci		memcg = ERR_PTR(-ENOENT);
526262306a36Sopenharmony_ci
526362306a36Sopenharmony_ci	cgroup_put(cgrp);
526462306a36Sopenharmony_ci
526562306a36Sopenharmony_ci	return memcg;
526662306a36Sopenharmony_ci}
526762306a36Sopenharmony_ci#endif
526862306a36Sopenharmony_ci
526962306a36Sopenharmony_cistatic int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
527062306a36Sopenharmony_ci{
527162306a36Sopenharmony_ci	struct mem_cgroup_per_node *pn;
527262306a36Sopenharmony_ci
527362306a36Sopenharmony_ci	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
527462306a36Sopenharmony_ci	if (!pn)
527562306a36Sopenharmony_ci		return 1;
527662306a36Sopenharmony_ci
527762306a36Sopenharmony_ci	pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
527862306a36Sopenharmony_ci						   GFP_KERNEL_ACCOUNT);
527962306a36Sopenharmony_ci	if (!pn->lruvec_stats_percpu) {
528062306a36Sopenharmony_ci		kfree(pn);
528162306a36Sopenharmony_ci		return 1;
528262306a36Sopenharmony_ci	}
528362306a36Sopenharmony_ci
528462306a36Sopenharmony_ci	lruvec_init(&pn->lruvec);
528562306a36Sopenharmony_ci#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG)
528662306a36Sopenharmony_ci	pn->lruvec.pgdat = NODE_DATA(node);
528762306a36Sopenharmony_ci#endif
528862306a36Sopenharmony_ci	pn->memcg = memcg;
528962306a36Sopenharmony_ci
529062306a36Sopenharmony_ci	memcg->nodeinfo[node] = pn;
529162306a36Sopenharmony_ci	return 0;
529262306a36Sopenharmony_ci}
529362306a36Sopenharmony_ci
529462306a36Sopenharmony_cistatic void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
529562306a36Sopenharmony_ci{
529662306a36Sopenharmony_ci	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
529762306a36Sopenharmony_ci
529862306a36Sopenharmony_ci	if (!pn)
529962306a36Sopenharmony_ci		return;
530062306a36Sopenharmony_ci
530162306a36Sopenharmony_ci	free_percpu(pn->lruvec_stats_percpu);
530262306a36Sopenharmony_ci	kfree(pn);
530362306a36Sopenharmony_ci}
530462306a36Sopenharmony_ci
/*
 * Release all memory owned by @memcg itself: per-node info, vmstats and
 * their percpu counters, then the memcg structure.  Does NOT touch the
 * ID, writeback domain or lru_gen state - see mem_cgroup_free().
 */
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
	int node;

	for_each_node(node)
		free_mem_cgroup_per_node_info(memcg, node);
	kfree(memcg->vmstats);
	free_percpu(memcg->vmstats_percpu);
	kfree(memcg);
}
531562306a36Sopenharmony_ci
/* Full teardown: lru_gen and writeback-domain state first, then the memory. */
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
	lru_gen_exit_memcg(memcg);
	memcg_wb_domain_exit(memcg);
	__mem_cgroup_free(memcg);
}
532262306a36Sopenharmony_ci
532362306a36Sopenharmony_cistatic struct mem_cgroup *mem_cgroup_alloc(void)
532462306a36Sopenharmony_ci{
532562306a36Sopenharmony_ci	struct mem_cgroup *memcg;
532662306a36Sopenharmony_ci	int node;
532762306a36Sopenharmony_ci	int __maybe_unused i;
532862306a36Sopenharmony_ci	long error = -ENOMEM;
532962306a36Sopenharmony_ci
533062306a36Sopenharmony_ci	memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
533162306a36Sopenharmony_ci	if (!memcg)
533262306a36Sopenharmony_ci		return ERR_PTR(error);
533362306a36Sopenharmony_ci
533462306a36Sopenharmony_ci	memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
533562306a36Sopenharmony_ci				 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
533662306a36Sopenharmony_ci	if (memcg->id.id < 0) {
533762306a36Sopenharmony_ci		error = memcg->id.id;
533862306a36Sopenharmony_ci		goto fail;
533962306a36Sopenharmony_ci	}
534062306a36Sopenharmony_ci
534162306a36Sopenharmony_ci	memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
534262306a36Sopenharmony_ci	if (!memcg->vmstats)
534362306a36Sopenharmony_ci		goto fail;
534462306a36Sopenharmony_ci
534562306a36Sopenharmony_ci	memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
534662306a36Sopenharmony_ci						 GFP_KERNEL_ACCOUNT);
534762306a36Sopenharmony_ci	if (!memcg->vmstats_percpu)
534862306a36Sopenharmony_ci		goto fail;
534962306a36Sopenharmony_ci
535062306a36Sopenharmony_ci	for_each_node(node)
535162306a36Sopenharmony_ci		if (alloc_mem_cgroup_per_node_info(memcg, node))
535262306a36Sopenharmony_ci			goto fail;
535362306a36Sopenharmony_ci
535462306a36Sopenharmony_ci	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
535562306a36Sopenharmony_ci		goto fail;
535662306a36Sopenharmony_ci
535762306a36Sopenharmony_ci	INIT_WORK(&memcg->high_work, high_work_func);
535862306a36Sopenharmony_ci	INIT_LIST_HEAD(&memcg->oom_notify);
535962306a36Sopenharmony_ci	mutex_init(&memcg->thresholds_lock);
536062306a36Sopenharmony_ci	spin_lock_init(&memcg->move_lock);
536162306a36Sopenharmony_ci	vmpressure_init(&memcg->vmpressure);
536262306a36Sopenharmony_ci	INIT_LIST_HEAD(&memcg->event_list);
536362306a36Sopenharmony_ci	spin_lock_init(&memcg->event_list_lock);
536462306a36Sopenharmony_ci	memcg->socket_pressure = jiffies;
536562306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
536662306a36Sopenharmony_ci	memcg->kmemcg_id = -1;
536762306a36Sopenharmony_ci	INIT_LIST_HEAD(&memcg->objcg_list);
536862306a36Sopenharmony_ci#endif
536962306a36Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK
537062306a36Sopenharmony_ci	INIT_LIST_HEAD(&memcg->cgwb_list);
537162306a36Sopenharmony_ci	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
537262306a36Sopenharmony_ci		memcg->cgwb_frn[i].done =
537362306a36Sopenharmony_ci			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
537462306a36Sopenharmony_ci#endif
537562306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
537662306a36Sopenharmony_ci	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
537762306a36Sopenharmony_ci	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
537862306a36Sopenharmony_ci	memcg->deferred_split_queue.split_queue_len = 0;
537962306a36Sopenharmony_ci#endif
538062306a36Sopenharmony_ci
538162306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
538262306a36Sopenharmony_ci	if (unlikely(!score_head_inited)) {
538362306a36Sopenharmony_ci		INIT_LIST_HEAD(&score_head);
538462306a36Sopenharmony_ci		score_head_inited = true;
538562306a36Sopenharmony_ci	}
538662306a36Sopenharmony_ci#endif
538762306a36Sopenharmony_ci
538862306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
538962306a36Sopenharmony_ci	INIT_LIST_HEAD(&memcg->score_node);
539062306a36Sopenharmony_ci#endif
539162306a36Sopenharmony_ci
539262306a36Sopenharmony_ci	lru_gen_init_memcg(memcg);
539362306a36Sopenharmony_ci	return memcg;
539462306a36Sopenharmony_cifail:
539562306a36Sopenharmony_ci	mem_cgroup_id_remove(memcg);
539662306a36Sopenharmony_ci	__mem_cgroup_free(memcg);
539762306a36Sopenharmony_ci	return ERR_PTR(error);
539862306a36Sopenharmony_ci}
539962306a36Sopenharmony_ci
540062306a36Sopenharmony_cistatic struct cgroup_subsys_state * __ref
540162306a36Sopenharmony_cimem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
540262306a36Sopenharmony_ci{
540362306a36Sopenharmony_ci	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
540462306a36Sopenharmony_ci	struct mem_cgroup *memcg, *old_memcg;
540562306a36Sopenharmony_ci
540662306a36Sopenharmony_ci	old_memcg = set_active_memcg(parent);
540762306a36Sopenharmony_ci	memcg = mem_cgroup_alloc();
540862306a36Sopenharmony_ci	set_active_memcg(old_memcg);
540962306a36Sopenharmony_ci	if (IS_ERR(memcg))
541062306a36Sopenharmony_ci		return ERR_CAST(memcg);
541162306a36Sopenharmony_ci
541262306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
541362306a36Sopenharmony_ci	atomic64_set(&memcg->memcg_reclaimed.app_score, 300);
541462306a36Sopenharmony_ci#endif
541562306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_ZSWAPD
541662306a36Sopenharmony_ci	atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10);
541762306a36Sopenharmony_ci	atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60);
541862306a36Sopenharmony_ci	atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50);
541962306a36Sopenharmony_ci#endif
542062306a36Sopenharmony_ci	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
542162306a36Sopenharmony_ci	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
542262306a36Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
542362306a36Sopenharmony_ci	memcg->zswap_max = PAGE_COUNTER_MAX;
542462306a36Sopenharmony_ci#endif
542562306a36Sopenharmony_ci	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
542662306a36Sopenharmony_ci	if (parent) {
542762306a36Sopenharmony_ci		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
542862306a36Sopenharmony_ci		WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
542962306a36Sopenharmony_ci
543062306a36Sopenharmony_ci		page_counter_init(&memcg->memory, &parent->memory);
543162306a36Sopenharmony_ci		page_counter_init(&memcg->swap, &parent->swap);
543262306a36Sopenharmony_ci		page_counter_init(&memcg->kmem, &parent->kmem);
543362306a36Sopenharmony_ci		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
543462306a36Sopenharmony_ci	} else {
543562306a36Sopenharmony_ci		init_memcg_events();
543662306a36Sopenharmony_ci		page_counter_init(&memcg->memory, NULL);
543762306a36Sopenharmony_ci		page_counter_init(&memcg->swap, NULL);
543862306a36Sopenharmony_ci		page_counter_init(&memcg->kmem, NULL);
543962306a36Sopenharmony_ci		page_counter_init(&memcg->tcpmem, NULL);
544062306a36Sopenharmony_ci
544162306a36Sopenharmony_ci		root_mem_cgroup = memcg;
544262306a36Sopenharmony_ci		return &memcg->css;
544362306a36Sopenharmony_ci	}
544462306a36Sopenharmony_ci
544562306a36Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
544662306a36Sopenharmony_ci		static_branch_inc(&memcg_sockets_enabled_key);
544762306a36Sopenharmony_ci
544862306a36Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM)
544962306a36Sopenharmony_ci	if (!cgroup_memory_nobpf)
545062306a36Sopenharmony_ci		static_branch_inc(&memcg_bpf_enabled_key);
545162306a36Sopenharmony_ci#endif
545262306a36Sopenharmony_ci
545362306a36Sopenharmony_ci	return &memcg->css;
545462306a36Sopenharmony_ci}
545562306a36Sopenharmony_ci
545662306a36Sopenharmony_cistatic int mem_cgroup_css_online(struct cgroup_subsys_state *css)
545762306a36Sopenharmony_ci{
545862306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
545962306a36Sopenharmony_ci
546062306a36Sopenharmony_ci	if (memcg_online_kmem(memcg))
546162306a36Sopenharmony_ci		goto remove_id;
546262306a36Sopenharmony_ci
546362306a36Sopenharmony_ci	/*
546462306a36Sopenharmony_ci	 * A memcg must be visible for expand_shrinker_info()
546562306a36Sopenharmony_ci	 * by the time the maps are allocated. So, we allocate maps
546662306a36Sopenharmony_ci	 * here, when for_each_mem_cgroup() can't skip it.
546762306a36Sopenharmony_ci	 */
546862306a36Sopenharmony_ci	if (alloc_shrinker_info(memcg))
546962306a36Sopenharmony_ci		goto offline_kmem;
547062306a36Sopenharmony_ci
547162306a36Sopenharmony_ci	if (unlikely(mem_cgroup_is_root(memcg)))
547262306a36Sopenharmony_ci		queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
547362306a36Sopenharmony_ci				   FLUSH_TIME);
547462306a36Sopenharmony_ci	lru_gen_online_memcg(memcg);
547562306a36Sopenharmony_ci
547662306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
547762306a36Sopenharmony_ci	memcg_app_score_update(memcg);
547862306a36Sopenharmony_ci	css_get(css);
547962306a36Sopenharmony_ci#endif
548062306a36Sopenharmony_ci
548162306a36Sopenharmony_ci	/* Online state pins memcg ID, memcg ID pins CSS */
548262306a36Sopenharmony_ci	refcount_set(&memcg->id.ref, 1);
548362306a36Sopenharmony_ci	css_get(css);
548462306a36Sopenharmony_ci
548562306a36Sopenharmony_ci	/*
548662306a36Sopenharmony_ci	 * Ensure mem_cgroup_from_id() works once we're fully online.
548762306a36Sopenharmony_ci	 *
548862306a36Sopenharmony_ci	 * We could do this earlier and require callers to filter with
548962306a36Sopenharmony_ci	 * css_tryget_online(). But right now there are no users that
549062306a36Sopenharmony_ci	 * need earlier access, and the workingset code relies on the
549162306a36Sopenharmony_ci	 * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
549262306a36Sopenharmony_ci	 * publish it here at the end of onlining. This matches the
549362306a36Sopenharmony_ci	 * regular ID destruction during offlining.
549462306a36Sopenharmony_ci	 */
549562306a36Sopenharmony_ci	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
549662306a36Sopenharmony_ci
549762306a36Sopenharmony_ci	return 0;
549862306a36Sopenharmony_cioffline_kmem:
549962306a36Sopenharmony_ci	memcg_offline_kmem(memcg);
550062306a36Sopenharmony_ciremove_id:
550162306a36Sopenharmony_ci	mem_cgroup_id_remove(memcg);
550262306a36Sopenharmony_ci	return -ENOMEM;
550362306a36Sopenharmony_ci}
550462306a36Sopenharmony_ci
550562306a36Sopenharmony_cistatic void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
550662306a36Sopenharmony_ci{
550762306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
550862306a36Sopenharmony_ci	struct mem_cgroup_event *event, *tmp;
550962306a36Sopenharmony_ci
551062306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
551162306a36Sopenharmony_ci	unsigned long flags;
551262306a36Sopenharmony_ci
551362306a36Sopenharmony_ci	write_lock_irqsave(&score_list_lock, flags);
551462306a36Sopenharmony_ci	list_del_init(&memcg->score_node);
551562306a36Sopenharmony_ci	write_unlock_irqrestore(&score_list_lock, flags);
551662306a36Sopenharmony_ci	css_put(css);
551762306a36Sopenharmony_ci#endif
551862306a36Sopenharmony_ci
551962306a36Sopenharmony_ci	/*
552062306a36Sopenharmony_ci	 * Unregister events and notify userspace.
552162306a36Sopenharmony_ci	 * Notify userspace about cgroup removing only after rmdir of cgroup
552262306a36Sopenharmony_ci	 * directory to avoid race between userspace and kernelspace.
552362306a36Sopenharmony_ci	 */
552462306a36Sopenharmony_ci	spin_lock_irq(&memcg->event_list_lock);
552562306a36Sopenharmony_ci	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
552662306a36Sopenharmony_ci		list_del_init(&event->list);
552762306a36Sopenharmony_ci		schedule_work(&event->remove);
552862306a36Sopenharmony_ci	}
552962306a36Sopenharmony_ci	spin_unlock_irq(&memcg->event_list_lock);
553062306a36Sopenharmony_ci
553162306a36Sopenharmony_ci	page_counter_set_min(&memcg->memory, 0);
553262306a36Sopenharmony_ci	page_counter_set_low(&memcg->memory, 0);
553362306a36Sopenharmony_ci
553462306a36Sopenharmony_ci	memcg_offline_kmem(memcg);
553562306a36Sopenharmony_ci	reparent_shrinker_deferred(memcg);
553662306a36Sopenharmony_ci	wb_memcg_offline(memcg);
553762306a36Sopenharmony_ci	lru_gen_offline_memcg(memcg);
553862306a36Sopenharmony_ci
553962306a36Sopenharmony_ci	drain_all_stock(memcg);
554062306a36Sopenharmony_ci
554162306a36Sopenharmony_ci	mem_cgroup_id_put(memcg);
554262306a36Sopenharmony_ci}
554362306a36Sopenharmony_ci
554462306a36Sopenharmony_cistatic void mem_cgroup_css_released(struct cgroup_subsys_state *css)
554562306a36Sopenharmony_ci{
554662306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
554762306a36Sopenharmony_ci
554862306a36Sopenharmony_ci	invalidate_reclaim_iterators(memcg);
554962306a36Sopenharmony_ci	lru_gen_release_memcg(memcg);
555062306a36Sopenharmony_ci}
555162306a36Sopenharmony_ci
555262306a36Sopenharmony_cistatic void mem_cgroup_css_free(struct cgroup_subsys_state *css)
555362306a36Sopenharmony_ci{
555462306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
555562306a36Sopenharmony_ci	int __maybe_unused i;
555662306a36Sopenharmony_ci
555762306a36Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK
555862306a36Sopenharmony_ci	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
555962306a36Sopenharmony_ci		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
556062306a36Sopenharmony_ci#endif
556162306a36Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
556262306a36Sopenharmony_ci		static_branch_dec(&memcg_sockets_enabled_key);
556362306a36Sopenharmony_ci
556462306a36Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
556562306a36Sopenharmony_ci		static_branch_dec(&memcg_sockets_enabled_key);
556662306a36Sopenharmony_ci
556762306a36Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM)
556862306a36Sopenharmony_ci	if (!cgroup_memory_nobpf)
556962306a36Sopenharmony_ci		static_branch_dec(&memcg_bpf_enabled_key);
557062306a36Sopenharmony_ci#endif
557162306a36Sopenharmony_ci
557262306a36Sopenharmony_ci	vmpressure_cleanup(&memcg->vmpressure);
557362306a36Sopenharmony_ci	cancel_work_sync(&memcg->high_work);
557462306a36Sopenharmony_ci	mem_cgroup_remove_from_trees(memcg);
557562306a36Sopenharmony_ci	free_shrinker_info(memcg);
557662306a36Sopenharmony_ci	mem_cgroup_free(memcg);
557762306a36Sopenharmony_ci}
557862306a36Sopenharmony_ci
557962306a36Sopenharmony_ci/**
558062306a36Sopenharmony_ci * mem_cgroup_css_reset - reset the states of a mem_cgroup
558162306a36Sopenharmony_ci * @css: the target css
558262306a36Sopenharmony_ci *
558362306a36Sopenharmony_ci * Reset the states of the mem_cgroup associated with @css.  This is
558462306a36Sopenharmony_ci * invoked when the userland requests disabling on the default hierarchy
558562306a36Sopenharmony_ci * but the memcg is pinned through dependency.  The memcg should stop
558662306a36Sopenharmony_ci * applying policies and should revert to the vanilla state as it may be
558762306a36Sopenharmony_ci * made visible again.
558862306a36Sopenharmony_ci *
558962306a36Sopenharmony_ci * The current implementation only resets the essential configurations.
559062306a36Sopenharmony_ci * This needs to be expanded to cover all the visible parts.
559162306a36Sopenharmony_ci */
559262306a36Sopenharmony_cistatic void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
559362306a36Sopenharmony_ci{
559462306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
559562306a36Sopenharmony_ci
559662306a36Sopenharmony_ci	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
559762306a36Sopenharmony_ci	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
559862306a36Sopenharmony_ci	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
559962306a36Sopenharmony_ci	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
560062306a36Sopenharmony_ci	page_counter_set_min(&memcg->memory, 0);
560162306a36Sopenharmony_ci	page_counter_set_low(&memcg->memory, 0);
560262306a36Sopenharmony_ci	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
560362306a36Sopenharmony_ci	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
560462306a36Sopenharmony_ci	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
560562306a36Sopenharmony_ci	memcg_wb_domain_size_changed(memcg);
560662306a36Sopenharmony_ci}
560762306a36Sopenharmony_ci
560862306a36Sopenharmony_cistatic void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
560962306a36Sopenharmony_ci{
561062306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
561162306a36Sopenharmony_ci	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
561262306a36Sopenharmony_ci	struct memcg_vmstats_percpu *statc;
561362306a36Sopenharmony_ci	long delta, delta_cpu, v;
561462306a36Sopenharmony_ci	int i, nid;
561562306a36Sopenharmony_ci
561662306a36Sopenharmony_ci	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
561762306a36Sopenharmony_ci
561862306a36Sopenharmony_ci	for (i = 0; i < MEMCG_NR_STAT; i++) {
561962306a36Sopenharmony_ci		/*
562062306a36Sopenharmony_ci		 * Collect the aggregated propagation counts of groups
562162306a36Sopenharmony_ci		 * below us. We're in a per-cpu loop here and this is
562262306a36Sopenharmony_ci		 * a global counter, so the first cycle will get them.
562362306a36Sopenharmony_ci		 */
562462306a36Sopenharmony_ci		delta = memcg->vmstats->state_pending[i];
562562306a36Sopenharmony_ci		if (delta)
562662306a36Sopenharmony_ci			memcg->vmstats->state_pending[i] = 0;
562762306a36Sopenharmony_ci
562862306a36Sopenharmony_ci		/* Add CPU changes on this level since the last flush */
562962306a36Sopenharmony_ci		delta_cpu = 0;
563062306a36Sopenharmony_ci		v = READ_ONCE(statc->state[i]);
563162306a36Sopenharmony_ci		if (v != statc->state_prev[i]) {
563262306a36Sopenharmony_ci			delta_cpu = v - statc->state_prev[i];
563362306a36Sopenharmony_ci			delta += delta_cpu;
563462306a36Sopenharmony_ci			statc->state_prev[i] = v;
563562306a36Sopenharmony_ci		}
563662306a36Sopenharmony_ci
563762306a36Sopenharmony_ci		/* Aggregate counts on this level and propagate upwards */
563862306a36Sopenharmony_ci		if (delta_cpu)
563962306a36Sopenharmony_ci			memcg->vmstats->state_local[i] += delta_cpu;
564062306a36Sopenharmony_ci
564162306a36Sopenharmony_ci		if (delta) {
564262306a36Sopenharmony_ci			memcg->vmstats->state[i] += delta;
564362306a36Sopenharmony_ci			if (parent)
564462306a36Sopenharmony_ci				parent->vmstats->state_pending[i] += delta;
564562306a36Sopenharmony_ci		}
564662306a36Sopenharmony_ci	}
564762306a36Sopenharmony_ci
564862306a36Sopenharmony_ci	for (i = 0; i < NR_MEMCG_EVENTS; i++) {
564962306a36Sopenharmony_ci		delta = memcg->vmstats->events_pending[i];
565062306a36Sopenharmony_ci		if (delta)
565162306a36Sopenharmony_ci			memcg->vmstats->events_pending[i] = 0;
565262306a36Sopenharmony_ci
565362306a36Sopenharmony_ci		delta_cpu = 0;
565462306a36Sopenharmony_ci		v = READ_ONCE(statc->events[i]);
565562306a36Sopenharmony_ci		if (v != statc->events_prev[i]) {
565662306a36Sopenharmony_ci			delta_cpu = v - statc->events_prev[i];
565762306a36Sopenharmony_ci			delta += delta_cpu;
565862306a36Sopenharmony_ci			statc->events_prev[i] = v;
565962306a36Sopenharmony_ci		}
566062306a36Sopenharmony_ci
566162306a36Sopenharmony_ci		if (delta_cpu)
566262306a36Sopenharmony_ci			memcg->vmstats->events_local[i] += delta_cpu;
566362306a36Sopenharmony_ci
566462306a36Sopenharmony_ci		if (delta) {
566562306a36Sopenharmony_ci			memcg->vmstats->events[i] += delta;
566662306a36Sopenharmony_ci			if (parent)
566762306a36Sopenharmony_ci				parent->vmstats->events_pending[i] += delta;
566862306a36Sopenharmony_ci		}
566962306a36Sopenharmony_ci	}
567062306a36Sopenharmony_ci
567162306a36Sopenharmony_ci	for_each_node_state(nid, N_MEMORY) {
567262306a36Sopenharmony_ci		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
567362306a36Sopenharmony_ci		struct mem_cgroup_per_node *ppn = NULL;
567462306a36Sopenharmony_ci		struct lruvec_stats_percpu *lstatc;
567562306a36Sopenharmony_ci
567662306a36Sopenharmony_ci		if (parent)
567762306a36Sopenharmony_ci			ppn = parent->nodeinfo[nid];
567862306a36Sopenharmony_ci
567962306a36Sopenharmony_ci		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
568062306a36Sopenharmony_ci
568162306a36Sopenharmony_ci		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
568262306a36Sopenharmony_ci			delta = pn->lruvec_stats.state_pending[i];
568362306a36Sopenharmony_ci			if (delta)
568462306a36Sopenharmony_ci				pn->lruvec_stats.state_pending[i] = 0;
568562306a36Sopenharmony_ci
568662306a36Sopenharmony_ci			delta_cpu = 0;
568762306a36Sopenharmony_ci			v = READ_ONCE(lstatc->state[i]);
568862306a36Sopenharmony_ci			if (v != lstatc->state_prev[i]) {
568962306a36Sopenharmony_ci				delta_cpu = v - lstatc->state_prev[i];
569062306a36Sopenharmony_ci				delta += delta_cpu;
569162306a36Sopenharmony_ci				lstatc->state_prev[i] = v;
569262306a36Sopenharmony_ci			}
569362306a36Sopenharmony_ci
569462306a36Sopenharmony_ci			if (delta_cpu)
569562306a36Sopenharmony_ci				pn->lruvec_stats.state_local[i] += delta_cpu;
569662306a36Sopenharmony_ci
569762306a36Sopenharmony_ci			if (delta) {
569862306a36Sopenharmony_ci				pn->lruvec_stats.state[i] += delta;
569962306a36Sopenharmony_ci				if (ppn)
570062306a36Sopenharmony_ci					ppn->lruvec_stats.state_pending[i] += delta;
570162306a36Sopenharmony_ci			}
570262306a36Sopenharmony_ci		}
570362306a36Sopenharmony_ci	}
570462306a36Sopenharmony_ci}
570562306a36Sopenharmony_ci
570662306a36Sopenharmony_ci#ifdef CONFIG_MMU
570762306a36Sopenharmony_ci/* Handlers for move charge at task migration. */
570862306a36Sopenharmony_cistatic int mem_cgroup_do_precharge(unsigned long count)
570962306a36Sopenharmony_ci{
571062306a36Sopenharmony_ci	int ret;
571162306a36Sopenharmony_ci
571262306a36Sopenharmony_ci	/* Try a single bulk charge without reclaim first, kswapd may wake */
571362306a36Sopenharmony_ci	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
571462306a36Sopenharmony_ci	if (!ret) {
571562306a36Sopenharmony_ci		mc.precharge += count;
571662306a36Sopenharmony_ci		return ret;
571762306a36Sopenharmony_ci	}
571862306a36Sopenharmony_ci
571962306a36Sopenharmony_ci	/* Try charges one by one with reclaim, but do not retry */
572062306a36Sopenharmony_ci	while (count--) {
572162306a36Sopenharmony_ci		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
572262306a36Sopenharmony_ci		if (ret)
572362306a36Sopenharmony_ci			return ret;
572462306a36Sopenharmony_ci		mc.precharge++;
572562306a36Sopenharmony_ci		cond_resched();
572662306a36Sopenharmony_ci	}
572762306a36Sopenharmony_ci	return 0;
572862306a36Sopenharmony_ci}
572962306a36Sopenharmony_ci
573062306a36Sopenharmony_ciunion mc_target {
573162306a36Sopenharmony_ci	struct page	*page;
573262306a36Sopenharmony_ci	swp_entry_t	ent;
573362306a36Sopenharmony_ci};
573462306a36Sopenharmony_ci
573562306a36Sopenharmony_cienum mc_target_type {
573662306a36Sopenharmony_ci	MC_TARGET_NONE = 0,
573762306a36Sopenharmony_ci	MC_TARGET_PAGE,
573862306a36Sopenharmony_ci	MC_TARGET_SWAP,
573962306a36Sopenharmony_ci	MC_TARGET_DEVICE,
574062306a36Sopenharmony_ci};
574162306a36Sopenharmony_ci
574262306a36Sopenharmony_cistatic struct page *mc_handle_present_pte(struct vm_area_struct *vma,
574362306a36Sopenharmony_ci						unsigned long addr, pte_t ptent)
574462306a36Sopenharmony_ci{
574562306a36Sopenharmony_ci	struct page *page = vm_normal_page(vma, addr, ptent);
574662306a36Sopenharmony_ci
574762306a36Sopenharmony_ci	if (!page)
574862306a36Sopenharmony_ci		return NULL;
574962306a36Sopenharmony_ci	if (PageAnon(page)) {
575062306a36Sopenharmony_ci		if (!(mc.flags & MOVE_ANON))
575162306a36Sopenharmony_ci			return NULL;
575262306a36Sopenharmony_ci	} else {
575362306a36Sopenharmony_ci		if (!(mc.flags & MOVE_FILE))
575462306a36Sopenharmony_ci			return NULL;
575562306a36Sopenharmony_ci	}
575662306a36Sopenharmony_ci	get_page(page);
575762306a36Sopenharmony_ci
575862306a36Sopenharmony_ci	return page;
575962306a36Sopenharmony_ci}
576062306a36Sopenharmony_ci
576162306a36Sopenharmony_ci#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
576262306a36Sopenharmony_cistatic struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
576362306a36Sopenharmony_ci			pte_t ptent, swp_entry_t *entry)
576462306a36Sopenharmony_ci{
576562306a36Sopenharmony_ci	struct page *page = NULL;
576662306a36Sopenharmony_ci	swp_entry_t ent = pte_to_swp_entry(ptent);
576762306a36Sopenharmony_ci
576862306a36Sopenharmony_ci	if (!(mc.flags & MOVE_ANON))
576962306a36Sopenharmony_ci		return NULL;
577062306a36Sopenharmony_ci
577162306a36Sopenharmony_ci	/*
577262306a36Sopenharmony_ci	 * Handle device private pages that are not accessible by the CPU, but
577362306a36Sopenharmony_ci	 * stored as special swap entries in the page table.
577462306a36Sopenharmony_ci	 */
577562306a36Sopenharmony_ci	if (is_device_private_entry(ent)) {
577662306a36Sopenharmony_ci		page = pfn_swap_entry_to_page(ent);
577762306a36Sopenharmony_ci		if (!get_page_unless_zero(page))
577862306a36Sopenharmony_ci			return NULL;
577962306a36Sopenharmony_ci		return page;
578062306a36Sopenharmony_ci	}
578162306a36Sopenharmony_ci
578262306a36Sopenharmony_ci	if (non_swap_entry(ent))
578362306a36Sopenharmony_ci		return NULL;
578462306a36Sopenharmony_ci
578562306a36Sopenharmony_ci	/*
578662306a36Sopenharmony_ci	 * Because swap_cache_get_folio() updates some statistics counter,
578762306a36Sopenharmony_ci	 * we call find_get_page() with swapper_space directly.
578862306a36Sopenharmony_ci	 */
578962306a36Sopenharmony_ci	page = find_get_page(swap_address_space(ent), swp_offset(ent));
579062306a36Sopenharmony_ci	entry->val = ent.val;
579162306a36Sopenharmony_ci
579262306a36Sopenharmony_ci	return page;
579362306a36Sopenharmony_ci}
579462306a36Sopenharmony_ci#else
579562306a36Sopenharmony_cistatic struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
579662306a36Sopenharmony_ci			pte_t ptent, swp_entry_t *entry)
579762306a36Sopenharmony_ci{
579862306a36Sopenharmony_ci	return NULL;
579962306a36Sopenharmony_ci}
580062306a36Sopenharmony_ci#endif
580162306a36Sopenharmony_ci
580262306a36Sopenharmony_cistatic struct page *mc_handle_file_pte(struct vm_area_struct *vma,
580362306a36Sopenharmony_ci			unsigned long addr, pte_t ptent)
580462306a36Sopenharmony_ci{
580562306a36Sopenharmony_ci	unsigned long index;
580662306a36Sopenharmony_ci	struct folio *folio;
580762306a36Sopenharmony_ci
580862306a36Sopenharmony_ci	if (!vma->vm_file) /* anonymous vma */
580962306a36Sopenharmony_ci		return NULL;
581062306a36Sopenharmony_ci	if (!(mc.flags & MOVE_FILE))
581162306a36Sopenharmony_ci		return NULL;
581262306a36Sopenharmony_ci
581362306a36Sopenharmony_ci	/* folio is moved even if it's not RSS of this task(page-faulted). */
581462306a36Sopenharmony_ci	/* shmem/tmpfs may report page out on swap: account for that too. */
581562306a36Sopenharmony_ci	index = linear_page_index(vma, addr);
581662306a36Sopenharmony_ci	folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
581762306a36Sopenharmony_ci	if (IS_ERR(folio))
581862306a36Sopenharmony_ci		return NULL;
581962306a36Sopenharmony_ci	return folio_file_page(folio, index);
582062306a36Sopenharmony_ci}
582162306a36Sopenharmony_ci
582262306a36Sopenharmony_ci/**
582362306a36Sopenharmony_ci * mem_cgroup_move_account - move account of the page
582462306a36Sopenharmony_ci * @page: the page
582562306a36Sopenharmony_ci * @compound: charge the page as compound or small page
582662306a36Sopenharmony_ci * @from: mem_cgroup which the page is moved from.
582762306a36Sopenharmony_ci * @to:	mem_cgroup which the page is moved to. @from != @to.
582862306a36Sopenharmony_ci *
582962306a36Sopenharmony_ci * The page must be locked and not on the LRU.
583062306a36Sopenharmony_ci *
583162306a36Sopenharmony_ci * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
583262306a36Sopenharmony_ci * from old cgroup.
583362306a36Sopenharmony_ci */
583462306a36Sopenharmony_cistatic int mem_cgroup_move_account(struct page *page,
583562306a36Sopenharmony_ci				   bool compound,
583662306a36Sopenharmony_ci				   struct mem_cgroup *from,
583762306a36Sopenharmony_ci				   struct mem_cgroup *to)
583862306a36Sopenharmony_ci{
583962306a36Sopenharmony_ci	struct folio *folio = page_folio(page);
584062306a36Sopenharmony_ci	struct lruvec *from_vec, *to_vec;
584162306a36Sopenharmony_ci	struct pglist_data *pgdat;
584262306a36Sopenharmony_ci	unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
584362306a36Sopenharmony_ci	int nid, ret;
584462306a36Sopenharmony_ci
584562306a36Sopenharmony_ci	VM_BUG_ON(from == to);
584662306a36Sopenharmony_ci	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
584762306a36Sopenharmony_ci	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
584862306a36Sopenharmony_ci	VM_BUG_ON(compound && !folio_test_large(folio));
584962306a36Sopenharmony_ci
585062306a36Sopenharmony_ci	ret = -EINVAL;
585162306a36Sopenharmony_ci	if (folio_memcg(folio) != from)
585262306a36Sopenharmony_ci		goto out;
585362306a36Sopenharmony_ci
585462306a36Sopenharmony_ci	pgdat = folio_pgdat(folio);
585562306a36Sopenharmony_ci	from_vec = mem_cgroup_lruvec(from, pgdat);
585662306a36Sopenharmony_ci	to_vec = mem_cgroup_lruvec(to, pgdat);
585762306a36Sopenharmony_ci
585862306a36Sopenharmony_ci	folio_memcg_lock(folio);
585962306a36Sopenharmony_ci
586062306a36Sopenharmony_ci	if (folio_test_anon(folio)) {
586162306a36Sopenharmony_ci		if (folio_mapped(folio)) {
586262306a36Sopenharmony_ci			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
586362306a36Sopenharmony_ci			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
586462306a36Sopenharmony_ci			if (folio_test_pmd_mappable(folio)) {
586562306a36Sopenharmony_ci				__mod_lruvec_state(from_vec, NR_ANON_THPS,
586662306a36Sopenharmony_ci						   -nr_pages);
586762306a36Sopenharmony_ci				__mod_lruvec_state(to_vec, NR_ANON_THPS,
586862306a36Sopenharmony_ci						   nr_pages);
586962306a36Sopenharmony_ci			}
587062306a36Sopenharmony_ci		}
587162306a36Sopenharmony_ci	} else {
587262306a36Sopenharmony_ci		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
587362306a36Sopenharmony_ci		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
587462306a36Sopenharmony_ci
587562306a36Sopenharmony_ci		if (folio_test_swapbacked(folio)) {
587662306a36Sopenharmony_ci			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
587762306a36Sopenharmony_ci			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
587862306a36Sopenharmony_ci		}
587962306a36Sopenharmony_ci
588062306a36Sopenharmony_ci		if (folio_mapped(folio)) {
588162306a36Sopenharmony_ci			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
588262306a36Sopenharmony_ci			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
588362306a36Sopenharmony_ci		}
588462306a36Sopenharmony_ci
588562306a36Sopenharmony_ci		if (folio_test_dirty(folio)) {
588662306a36Sopenharmony_ci			struct address_space *mapping = folio_mapping(folio);
588762306a36Sopenharmony_ci
588862306a36Sopenharmony_ci			if (mapping_can_writeback(mapping)) {
588962306a36Sopenharmony_ci				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
589062306a36Sopenharmony_ci						   -nr_pages);
589162306a36Sopenharmony_ci				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
589262306a36Sopenharmony_ci						   nr_pages);
589362306a36Sopenharmony_ci			}
589462306a36Sopenharmony_ci		}
589562306a36Sopenharmony_ci	}
589662306a36Sopenharmony_ci
589762306a36Sopenharmony_ci#ifdef CONFIG_SWAP
589862306a36Sopenharmony_ci	if (folio_test_swapcache(folio)) {
589962306a36Sopenharmony_ci		__mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
590062306a36Sopenharmony_ci		__mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
590162306a36Sopenharmony_ci	}
590262306a36Sopenharmony_ci#endif
590362306a36Sopenharmony_ci	if (folio_test_writeback(folio)) {
590462306a36Sopenharmony_ci		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
590562306a36Sopenharmony_ci		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
590662306a36Sopenharmony_ci	}
590762306a36Sopenharmony_ci
590862306a36Sopenharmony_ci	/*
590962306a36Sopenharmony_ci	 * All state has been migrated, let's switch to the new memcg.
591062306a36Sopenharmony_ci	 *
591162306a36Sopenharmony_ci	 * It is safe to change page's memcg here because the page
591262306a36Sopenharmony_ci	 * is referenced, charged, isolated, and locked: we can't race
591362306a36Sopenharmony_ci	 * with (un)charging, migration, LRU putback, or anything else
591462306a36Sopenharmony_ci	 * that would rely on a stable page's memory cgroup.
591562306a36Sopenharmony_ci	 *
591662306a36Sopenharmony_ci	 * Note that folio_memcg_lock is a memcg lock, not a page lock,
591762306a36Sopenharmony_ci	 * to save space. As soon as we switch page's memory cgroup to a
591862306a36Sopenharmony_ci	 * new memcg that isn't locked, the above state can change
591962306a36Sopenharmony_ci	 * concurrently again. Make sure we're truly done with it.
592062306a36Sopenharmony_ci	 */
592162306a36Sopenharmony_ci	smp_mb();
592262306a36Sopenharmony_ci
592362306a36Sopenharmony_ci	css_get(&to->css);
592462306a36Sopenharmony_ci	css_put(&from->css);
592562306a36Sopenharmony_ci
592662306a36Sopenharmony_ci	folio->memcg_data = (unsigned long)to;
592762306a36Sopenharmony_ci
592862306a36Sopenharmony_ci	__folio_memcg_unlock(from);
592962306a36Sopenharmony_ci
593062306a36Sopenharmony_ci	ret = 0;
593162306a36Sopenharmony_ci	nid = folio_nid(folio);
593262306a36Sopenharmony_ci
593362306a36Sopenharmony_ci	local_irq_disable();
593462306a36Sopenharmony_ci	mem_cgroup_charge_statistics(to, nr_pages);
593562306a36Sopenharmony_ci	memcg_check_events(to, nid);
593662306a36Sopenharmony_ci	mem_cgroup_charge_statistics(from, -nr_pages);
593762306a36Sopenharmony_ci	memcg_check_events(from, nid);
593862306a36Sopenharmony_ci	local_irq_enable();
593962306a36Sopenharmony_ciout:
594062306a36Sopenharmony_ci	return ret;
594162306a36Sopenharmony_ci}
594262306a36Sopenharmony_ci
594362306a36Sopenharmony_ci/**
594462306a36Sopenharmony_ci * get_mctgt_type - get target type of moving charge
594562306a36Sopenharmony_ci * @vma: the vma the pte to be checked belongs
594662306a36Sopenharmony_ci * @addr: the address corresponding to the pte to be checked
594762306a36Sopenharmony_ci * @ptent: the pte to be checked
594862306a36Sopenharmony_ci * @target: the pointer the target page or swap ent will be stored(can be NULL)
594962306a36Sopenharmony_ci *
595062306a36Sopenharmony_ci * Context: Called with pte lock held.
595162306a36Sopenharmony_ci * Return:
595262306a36Sopenharmony_ci * * MC_TARGET_NONE - If the pte is not a target for move charge.
595362306a36Sopenharmony_ci * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
595462306a36Sopenharmony_ci *   move charge. If @target is not NULL, the page is stored in target->page
595562306a36Sopenharmony_ci *   with extra refcnt taken (Caller should release it).
595662306a36Sopenharmony_ci * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
595762306a36Sopenharmony_ci *   target for charge migration.  If @target is not NULL, the entry is
595862306a36Sopenharmony_ci *   stored in target->ent.
595962306a36Sopenharmony_ci * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
596062306a36Sopenharmony_ci *   thus not on the lru.  For now such page is charged like a regular page
596162306a36Sopenharmony_ci *   would be as it is just special memory taking the place of a regular page.
596262306a36Sopenharmony_ci *   See Documentations/vm/hmm.txt and include/linux/hmm.h
596362306a36Sopenharmony_ci */
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	/*
	 * Classify the pte and look up its backing page.  Each helper
	 * returns the page with a reference held (see the put_page()
	 * calls below); mc_handle_swap_pte() may also fill @ent.
	 */
	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (pte_none_mostly(ptent))
		/*
		 * PTE markers should be treated as a none pte here, separated
		 * from other swap handling below.
		 */
		page = mc_handle_file_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, ptent, &ent);

	/* If the caller wants the page, it must be handed back locked. */
	if (target && page) {
		if (!trylock_page(page)) {
			put_page(page);
			return ret;
		}
		/*
		 * page_mapped() must be stable during the move. This
		 * pte is locked, so if it's present, the page cannot
		 * become unmapped. If it isn't, we have only partial
		 * control over the mapped state: the page lock will
		 * prevent new faults against pagecache and swapcache,
		 * so an unmapped page cannot become mapped. However,
		 * if the page is already mapped elsewhere, it can
		 * unmap, and there is nothing we can do about it.
		 * Alas, skip moving the page in this case.
		 */
		if (!pte_present(ptent) && page_mapped(page)) {
			unlock_page(page);
			put_page(page);
			return ret;
		}
	}

	if (!page && !ent.val)
		return ret;
	if (page) {
		/*
		 * Do only loose check w/o serialization.
		 * mem_cgroup_move_account() checks the page is valid or
		 * not under LRU exclusion.
		 */
		if (page_memcg(page) == mc.from) {
			ret = MC_TARGET_PAGE;
			/* Device private/coherent pages are not on the LRU. */
			if (is_device_private_page(page) ||
			    is_device_coherent_page(page))
				ret = MC_TARGET_DEVICE;
			if (target)
				target->page = page;
		}
		/*
		 * Drop the page lock and reference unless the page is
		 * being handed to the caller via @target.
		 */
		if (!ret || !target) {
			if (target)
				unlock_page(page);
			put_page(page);
		}
	}
	/*
	 * There is a swap entry and a page doesn't exist or isn't charged.
	 * But we cannot move a tail-page in a THP.
	 */
	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}
603962306a36Sopenharmony_ci
604062306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
604162306a36Sopenharmony_ci/*
604262306a36Sopenharmony_ci * We don't consider PMD mapped swapping or file mapped pages because THP does
604362306a36Sopenharmony_ci * not support them for now.
604462306a36Sopenharmony_ci * Caller should make sure that pmd_trans_huge(pmd) is true.
604562306a36Sopenharmony_ci */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;

	/* A swap pmd can only be a migration entry here; never a target. */
	if (unlikely(is_swap_pmd(pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(pmd));
		return ret;
	}
	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
	/* Only anon THPs are considered; bail unless MOVE_ANON was set. */
	if (!(mc.flags & MOVE_ANON))
		return ret;
	if (page_memcg(page) == mc.from) {
		ret = MC_TARGET_PAGE;
		if (target) {
			/* Hand the head page back referenced and locked. */
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				return MC_TARGET_NONE;
			}
			target->page = page;
		}
	}
	return ret;
}
607462306a36Sopenharmony_ci#else
/* !CONFIG_TRANSPARENT_HUGEPAGE: no PMD-mapped pages can ever be a target. */
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
608062306a36Sopenharmony_ci#endif
608162306a36Sopenharmony_ci
/*
 * Page-walk callback: count, into mc.precharge, every pte/pmd in the
 * range that would be a move target, so the attach path knows how many
 * pages to precharge to mc.to.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/*
		 * Note there cannot be MC_TARGET_DEVICE for now as we do not
		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
		 * this might change.
		 */
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(ptl);
		return 0;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)	/* no pte table to scan for this range */
		return 0;
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
			mc.precharge++;	/* increment precharge temporarily */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}
611462306a36Sopenharmony_ci
/* Read-locked mm walk that only counts prospective move targets. */
static const struct mm_walk_ops precharge_walk_ops = {
	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
	.walk_lock	= PGWALK_RDLOCK,
};
611962306a36Sopenharmony_ci
/*
 * Walk the whole address space of @mm and return the number of pages
 * that would be moved (accumulated in mc.precharge by the walk
 * callbacks).  mc.precharge itself is reset to zero for the caller.
 */
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;

	mmap_read_lock(mm);
	walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
	mmap_read_unlock(mm);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}
613362306a36Sopenharmony_ci
/*
 * Precharge mc.to for every page that may be moved out of @mm.
 * Publishes current as mc.moving_task before any charging starts.
 */
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}
614262306a36Sopenharmony_ci
614362306a36Sopenharmony_ci/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fixup refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);

		/* release the id references held for the moved swap entries */
		mem_cgroup_id_put_many(mc.from, mc.moved_swap);

		/*
		 * we charged both to->memory and to->memsw, so we
		 * should uncharge to->memory.
		 */
		if (!mem_cgroup_is_root(mc.to))
			page_counter_uncharge(&mc.to->memory, mc.moved_swap);

		mc.moved_swap = 0;
	}
	/* the uncharges above may have lifted an OOM state in either group */
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}
618362306a36Sopenharmony_ci
/*
 * Finish or abort a charge move: cancel leftover charges, clear the mc
 * state under mc.lock, and drop the mm reference taken at attach time.
 */
static void mem_cgroup_clear_mc(void)
{
	struct mm_struct *mm = mc.mm;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	mc.mm = NULL;
	spin_unlock(&mc.lock);

	mmput(mm);
}
620262306a36Sopenharmony_ci
/*
 * cgroup ->can_attach: on the legacy hierarchy, set up charge moving
 * from the leader's old memcg (mc.from) to the migration destination
 * (mc.to), and precharge the destination for the pages expected to
 * move.  Returns 0 when there is nothing to move.
 */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
	struct mem_cgroup *from;
	struct task_struct *leader, *p;
	struct mm_struct *mm;
	unsigned long move_flags;
	int ret = 0;

	/* charge immigration isn't supported on the default hierarchy */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return 0;

	/*
	 * Multi-process migrations only happen on the default hierarchy
	 * where charge immigration is not used.  Perform charge
	 * immigration if @tset contains a leader and whine if there are
	 * multiple.
	 */
	p = NULL;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		WARN_ON_ONCE(p);
		p = leader;
		memcg = mem_cgroup_from_css(css);
	}
	if (!p)
		return 0;

	/*
	 * We are now committed to this value whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
	if (!move_flags)
		return 0;

	from = mem_cgroup_from_task(p);

	VM_BUG_ON(from == memcg);

	mm = get_task_mm(p);
	if (!mm)
		return 0;
	/* We move charges only when we move a owner of the mm */
	if (mm->owner == p) {
		/* No other move may already be in flight. */
		VM_BUG_ON(mc.from);
		VM_BUG_ON(mc.to);
		VM_BUG_ON(mc.precharge);
		VM_BUG_ON(mc.moved_charge);
		VM_BUG_ON(mc.moved_swap);

		spin_lock(&mc.lock);
		mc.mm = mm;	/* keeps the reference from get_task_mm() */
		mc.from = from;
		mc.to = memcg;
		mc.flags = move_flags;
		spin_unlock(&mc.lock);
		/* We set mc.moving_task later */

		ret = mem_cgroup_precharge_mc(mm);
		if (ret)
			mem_cgroup_clear_mc();
	} else {
		mmput(mm);
	}
	return ret;
}
627262306a36Sopenharmony_ci
627362306a36Sopenharmony_cistatic void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
627462306a36Sopenharmony_ci{
627562306a36Sopenharmony_ci	if (mc.to)
627662306a36Sopenharmony_ci		mem_cgroup_clear_mc();
627762306a36Sopenharmony_ci}
627862306a36Sopenharmony_ci
/*
 * Page-walk callback: move charges for each movable pte/pmd in the
 * range from mc.from to mc.to, consuming mc.precharge.  When the
 * precharge pool runs dry, recharge one page at a time and rescan the
 * remaining range.
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/* Moving a whole THP needs a full PMD's worth of precharge. */
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(ptl);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (isolate_lru_page(page)) {
				if (!mem_cgroup_move_account(page, true,
							     mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			/* get_mctgt_type_thp() returned the page locked */
			unlock_page(page);
			put_page(page);
		} else if (target_type == MC_TARGET_DEVICE) {
			/* device pages are not on the LRU: no isolation */
			page = target.page;
			if (!mem_cgroup_move_account(page, true,
						     mc.from, mc.to)) {
				mc.precharge -= HPAGE_PMD_NR;
				mc.moved_charge += HPAGE_PMD_NR;
			}
			unlock_page(page);
			put_page(page);
		}
		spin_unlock(ptl);
		return 0;
	}

retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)
		return 0;
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(pte++);
		bool device = false;
		swp_entry_t ent;

		/* out of precharge: drop the lock and try to refill below */
		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_DEVICE:
			device = true;
			fallthrough;
		case MC_TARGET_PAGE:
			page = target.page;
			/*
			 * We can have a part of the split pmd here. Moving it
			 * can be done but it would be too convoluted so simply
			 * ignore such a partial THP and keep it in original
			 * memcg. There should be somebody mapping the head.
			 */
			if (PageTransCompound(page))
				goto put;
			if (!device && !isolate_lru_page(page))
				goto put;
			if (!mem_cgroup_move_account(page, false,
						mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			if (!device)
				putback_lru_page(page);
put:			/* get_mctgt_type() gets & locks the page */
			unlock_page(page);
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				mem_cgroup_id_get_many(mc.to, 1);
				/* we fixup other refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}
639462306a36Sopenharmony_ci
/* Read-locked mm walk that performs the actual charge moving. */
static const struct mm_walk_ops charge_walk_ops = {
	.pmd_entry	= mem_cgroup_move_charge_pte_range,
	.walk_lock	= PGWALK_RDLOCK,
};
639962306a36Sopenharmony_ci
/*
 * Walk all of mc.mm and move charges found by the walk callbacks,
 * serializing against folio_memcg_lock() users via from->moving_account.
 */
static void mem_cgroup_move_charge(void)
{
	lru_add_drain_all();
	/*
	 * Signal folio_memcg_lock() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!mmap_read_trylock(mc.mm))) {
		/*
		 * Someone who are holding the mmap_lock might be waiting in
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be able
		 * to move enough charges, but moving charge is a best-effort
		 * feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and failed in doing
	 * additional charge, the page walk just aborts.
	 */
	walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
	mmap_read_unlock(mc.mm);
	atomic_dec(&mc.from->moving_account);
}
643162306a36Sopenharmony_ci
643262306a36Sopenharmony_cistatic void mem_cgroup_move_task(void)
643362306a36Sopenharmony_ci{
643462306a36Sopenharmony_ci	if (mc.to) {
643562306a36Sopenharmony_ci		mem_cgroup_move_charge();
643662306a36Sopenharmony_ci		mem_cgroup_clear_mc();
643762306a36Sopenharmony_ci	}
643862306a36Sopenharmony_ci}
643962306a36Sopenharmony_ci#else	/* !CONFIG_MMU */
/* !CONFIG_MMU: charge moving is not supported; nothing to prepare. */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}
/* !CONFIG_MMU: nothing was prepared, so nothing to cancel. */
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
/* !CONFIG_MMU: no charges to move after attach. */
static void mem_cgroup_move_task(void)
{
}
645062306a36Sopenharmony_ci#endif
645162306a36Sopenharmony_ci
645262306a36Sopenharmony_ci#ifdef CONFIG_LRU_GEN
/*
 * cgroup ->attach for multi-gen LRU: if the first migrating leader owns
 * its mm, migrate that mm so it follows the task into the new cgroup.
 */
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	/* find the first leader if there is any */
	cgroup_taskset_for_each_leader(task, css, tset)
		break;

	if (!task)
		return;

	/* task_lock() stabilizes task->mm while we inspect ownership */
	task_lock(task);
	if (task->mm && READ_ONCE(task->mm->owner) == task)
		lru_gen_migrate_mm(task->mm);
	task_unlock(task);
}
647062306a36Sopenharmony_ci#else
/* !CONFIG_LRU_GEN: no per-mm LRU state to migrate on attach. */
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
}
647462306a36Sopenharmony_ci#endif /* CONFIG_LRU_GEN */
647562306a36Sopenharmony_ci
647662306a36Sopenharmony_cistatic int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
647762306a36Sopenharmony_ci{
647862306a36Sopenharmony_ci	if (value == PAGE_COUNTER_MAX)
647962306a36Sopenharmony_ci		seq_puts(m, "max\n");
648062306a36Sopenharmony_ci	else
648162306a36Sopenharmony_ci		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
648262306a36Sopenharmony_ci
648362306a36Sopenharmony_ci	return 0;
648462306a36Sopenharmony_ci}
648562306a36Sopenharmony_ci
648662306a36Sopenharmony_cistatic u64 memory_current_read(struct cgroup_subsys_state *css,
648762306a36Sopenharmony_ci			       struct cftype *cft)
648862306a36Sopenharmony_ci{
648962306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
649062306a36Sopenharmony_ci
649162306a36Sopenharmony_ci	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
649262306a36Sopenharmony_ci}
649362306a36Sopenharmony_ci
649462306a36Sopenharmony_cistatic u64 memory_peak_read(struct cgroup_subsys_state *css,
649562306a36Sopenharmony_ci			    struct cftype *cft)
649662306a36Sopenharmony_ci{
649762306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
649862306a36Sopenharmony_ci
649962306a36Sopenharmony_ci	return (u64)memcg->memory.watermark * PAGE_SIZE;
650062306a36Sopenharmony_ci}
650162306a36Sopenharmony_ci
650262306a36Sopenharmony_cistatic int memory_min_show(struct seq_file *m, void *v)
650362306a36Sopenharmony_ci{
650462306a36Sopenharmony_ci	return seq_puts_memcg_tunable(m,
650562306a36Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
650662306a36Sopenharmony_ci}
650762306a36Sopenharmony_ci
650862306a36Sopenharmony_cistatic ssize_t memory_min_write(struct kernfs_open_file *of,
650962306a36Sopenharmony_ci				char *buf, size_t nbytes, loff_t off)
651062306a36Sopenharmony_ci{
651162306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
651262306a36Sopenharmony_ci	unsigned long min;
651362306a36Sopenharmony_ci	int err;
651462306a36Sopenharmony_ci
651562306a36Sopenharmony_ci	buf = strstrip(buf);
651662306a36Sopenharmony_ci	err = page_counter_memparse(buf, "max", &min);
651762306a36Sopenharmony_ci	if (err)
651862306a36Sopenharmony_ci		return err;
651962306a36Sopenharmony_ci
652062306a36Sopenharmony_ci	page_counter_set_min(&memcg->memory, min);
652162306a36Sopenharmony_ci
652262306a36Sopenharmony_ci	return nbytes;
652362306a36Sopenharmony_ci}
652462306a36Sopenharmony_ci
652562306a36Sopenharmony_cistatic int memory_low_show(struct seq_file *m, void *v)
652662306a36Sopenharmony_ci{
652762306a36Sopenharmony_ci	return seq_puts_memcg_tunable(m,
652862306a36Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
652962306a36Sopenharmony_ci}
653062306a36Sopenharmony_ci
653162306a36Sopenharmony_cistatic ssize_t memory_low_write(struct kernfs_open_file *of,
653262306a36Sopenharmony_ci				char *buf, size_t nbytes, loff_t off)
653362306a36Sopenharmony_ci{
653462306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
653562306a36Sopenharmony_ci	unsigned long low;
653662306a36Sopenharmony_ci	int err;
653762306a36Sopenharmony_ci
653862306a36Sopenharmony_ci	buf = strstrip(buf);
653962306a36Sopenharmony_ci	err = page_counter_memparse(buf, "max", &low);
654062306a36Sopenharmony_ci	if (err)
654162306a36Sopenharmony_ci		return err;
654262306a36Sopenharmony_ci
654362306a36Sopenharmony_ci	page_counter_set_low(&memcg->memory, low);
654462306a36Sopenharmony_ci
654562306a36Sopenharmony_ci	return nbytes;
654662306a36Sopenharmony_ci}
654762306a36Sopenharmony_ci
654862306a36Sopenharmony_cistatic int memory_high_show(struct seq_file *m, void *v)
654962306a36Sopenharmony_ci{
655062306a36Sopenharmony_ci	return seq_puts_memcg_tunable(m,
655162306a36Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
655262306a36Sopenharmony_ci}
655362306a36Sopenharmony_ci
/*
 * Set a new memory.high limit, then try (best-effort) to reclaim the
 * group down below it before returning.  Returns @nbytes on success.
 */
static ssize_t memory_high_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
	bool drained = false;
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	page_counter_set_high(&memcg->memory, high);

	for (;;) {
		unsigned long nr_pages = page_counter_read(&memcg->memory);
		unsigned long reclaimed;

		if (nr_pages <= high)
			break;

		/* let the writer interrupt a long-running reclaim */
		if (signal_pending(current))
			break;

		/* flush per-cpu charge stock once before reclaiming */
		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);

		/* give up after MAX_RECLAIM_RETRIES fruitless passes */
		if (!reclaimed && !nr_retries--)
			break;
	}

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}
659662306a36Sopenharmony_ci
659762306a36Sopenharmony_cistatic int memory_max_show(struct seq_file *m, void *v)
659862306a36Sopenharmony_ci{
659962306a36Sopenharmony_ci	return seq_puts_memcg_tunable(m,
660062306a36Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
660162306a36Sopenharmony_ci}
660262306a36Sopenharmony_ci
/*
 * Set a new memory.max hard limit, then push usage below it: first by
 * draining per-cpu stock, then by direct reclaim (a bounded number of
 * attempts), and finally by OOM-killing within the group.  Returns
 * @nbytes on success.
 */
static ssize_t memory_max_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
	bool drained = false;
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	/* publish the new limit before shrinking usage down to it */
	xchg(&memcg->memory.max, max);

	for (;;) {
		unsigned long nr_pages = page_counter_read(&memcg->memory);

		if (nr_pages <= max)
			break;

		/* let the writer abort the shrink with a signal */
		if (signal_pending(current))
			break;

		/* flush per-cpu charge stock once before reclaiming */
		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		/* direct reclaim first, a bounded number of attempts... */
		if (nr_reclaims) {
			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
				nr_reclaims--;
			continue;
		}

		/* ...then OOM-kill until under the limit or nothing to kill */
		memcg_memory_event(memcg, MEMCG_OOM);
		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
			break;
	}

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}
664962306a36Sopenharmony_ci
665062306a36Sopenharmony_cistatic void __memory_events_show(struct seq_file *m, atomic_long_t *events)
665162306a36Sopenharmony_ci{
665262306a36Sopenharmony_ci	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
665362306a36Sopenharmony_ci	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
665462306a36Sopenharmony_ci	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
665562306a36Sopenharmony_ci	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
665662306a36Sopenharmony_ci	seq_printf(m, "oom_kill %lu\n",
665762306a36Sopenharmony_ci		   atomic_long_read(&events[MEMCG_OOM_KILL]));
665862306a36Sopenharmony_ci	seq_printf(m, "oom_group_kill %lu\n",
665962306a36Sopenharmony_ci		   atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
666062306a36Sopenharmony_ci}
666162306a36Sopenharmony_ci
666262306a36Sopenharmony_cistatic int memory_events_show(struct seq_file *m, void *v)
666362306a36Sopenharmony_ci{
666462306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
666562306a36Sopenharmony_ci
666662306a36Sopenharmony_ci	__memory_events_show(m, memcg->memory_events);
666762306a36Sopenharmony_ci	return 0;
666862306a36Sopenharmony_ci}
666962306a36Sopenharmony_ci
667062306a36Sopenharmony_cistatic int memory_events_local_show(struct seq_file *m, void *v)
667162306a36Sopenharmony_ci{
667262306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
667362306a36Sopenharmony_ci
667462306a36Sopenharmony_ci	__memory_events_show(m, memcg->memory_events_local);
667562306a36Sopenharmony_ci	return 0;
667662306a36Sopenharmony_ci}
667762306a36Sopenharmony_ci
667862306a36Sopenharmony_cistatic int memory_stat_show(struct seq_file *m, void *v)
667962306a36Sopenharmony_ci{
668062306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
668162306a36Sopenharmony_ci	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
668262306a36Sopenharmony_ci	struct seq_buf s;
668362306a36Sopenharmony_ci
668462306a36Sopenharmony_ci	if (!buf)
668562306a36Sopenharmony_ci		return -ENOMEM;
668662306a36Sopenharmony_ci	seq_buf_init(&s, buf, PAGE_SIZE);
668762306a36Sopenharmony_ci	memory_stat_format(memcg, &s);
668862306a36Sopenharmony_ci	seq_puts(m, buf);
668962306a36Sopenharmony_ci	kfree(buf);
669062306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_DEBUG
669162306a36Sopenharmony_ci	memcg_eswap_info_show(m);
669262306a36Sopenharmony_ci#endif
669362306a36Sopenharmony_ci	return 0;
669462306a36Sopenharmony_ci}
669562306a36Sopenharmony_ci
669662306a36Sopenharmony_ci#ifdef CONFIG_NUMA
669762306a36Sopenharmony_cistatic inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
669862306a36Sopenharmony_ci						     int item)
669962306a36Sopenharmony_ci{
670062306a36Sopenharmony_ci	return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
670162306a36Sopenharmony_ci}
670262306a36Sopenharmony_ci
670362306a36Sopenharmony_cistatic int memory_numa_stat_show(struct seq_file *m, void *v)
670462306a36Sopenharmony_ci{
670562306a36Sopenharmony_ci	int i;
670662306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
670762306a36Sopenharmony_ci
670862306a36Sopenharmony_ci	mem_cgroup_flush_stats();
670962306a36Sopenharmony_ci
671062306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
671162306a36Sopenharmony_ci		int nid;
671262306a36Sopenharmony_ci
671362306a36Sopenharmony_ci		if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
671462306a36Sopenharmony_ci			continue;
671562306a36Sopenharmony_ci
671662306a36Sopenharmony_ci		seq_printf(m, "%s", memory_stats[i].name);
671762306a36Sopenharmony_ci		for_each_node_state(nid, N_MEMORY) {
671862306a36Sopenharmony_ci			u64 size;
671962306a36Sopenharmony_ci			struct lruvec *lruvec;
672062306a36Sopenharmony_ci
672162306a36Sopenharmony_ci			lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
672262306a36Sopenharmony_ci			size = lruvec_page_state_output(lruvec,
672362306a36Sopenharmony_ci							memory_stats[i].idx);
672462306a36Sopenharmony_ci			seq_printf(m, " N%d=%llu", nid, size);
672562306a36Sopenharmony_ci		}
672662306a36Sopenharmony_ci		seq_putc(m, '\n');
672762306a36Sopenharmony_ci	}
672862306a36Sopenharmony_ci
672962306a36Sopenharmony_ci	return 0;
673062306a36Sopenharmony_ci}
673162306a36Sopenharmony_ci#endif
673262306a36Sopenharmony_ci
673362306a36Sopenharmony_cistatic int memory_oom_group_show(struct seq_file *m, void *v)
673462306a36Sopenharmony_ci{
673562306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
673662306a36Sopenharmony_ci
673762306a36Sopenharmony_ci	seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
673862306a36Sopenharmony_ci
673962306a36Sopenharmony_ci	return 0;
674062306a36Sopenharmony_ci}
674162306a36Sopenharmony_ci
674262306a36Sopenharmony_cistatic ssize_t memory_oom_group_write(struct kernfs_open_file *of,
674362306a36Sopenharmony_ci				      char *buf, size_t nbytes, loff_t off)
674462306a36Sopenharmony_ci{
674562306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
674662306a36Sopenharmony_ci	int ret, oom_group;
674762306a36Sopenharmony_ci
674862306a36Sopenharmony_ci	buf = strstrip(buf);
674962306a36Sopenharmony_ci	if (!buf)
675062306a36Sopenharmony_ci		return -EINVAL;
675162306a36Sopenharmony_ci
675262306a36Sopenharmony_ci	ret = kstrtoint(buf, 0, &oom_group);
675362306a36Sopenharmony_ci	if (ret)
675462306a36Sopenharmony_ci		return ret;
675562306a36Sopenharmony_ci
675662306a36Sopenharmony_ci	if (oom_group != 0 && oom_group != 1)
675762306a36Sopenharmony_ci		return -EINVAL;
675862306a36Sopenharmony_ci
675962306a36Sopenharmony_ci	WRITE_ONCE(memcg->oom_group, oom_group);
676062306a36Sopenharmony_ci
676162306a36Sopenharmony_ci	return nbytes;
676262306a36Sopenharmony_ci}
676362306a36Sopenharmony_ci
676462306a36Sopenharmony_cistatic ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
676562306a36Sopenharmony_ci			      size_t nbytes, loff_t off)
676662306a36Sopenharmony_ci{
676762306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
676862306a36Sopenharmony_ci	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
676962306a36Sopenharmony_ci	unsigned long nr_to_reclaim, nr_reclaimed = 0;
677062306a36Sopenharmony_ci	unsigned int reclaim_options;
677162306a36Sopenharmony_ci	int err;
677262306a36Sopenharmony_ci
677362306a36Sopenharmony_ci	buf = strstrip(buf);
677462306a36Sopenharmony_ci	err = page_counter_memparse(buf, "", &nr_to_reclaim);
677562306a36Sopenharmony_ci	if (err)
677662306a36Sopenharmony_ci		return err;
677762306a36Sopenharmony_ci
677862306a36Sopenharmony_ci	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
677962306a36Sopenharmony_ci	while (nr_reclaimed < nr_to_reclaim) {
678062306a36Sopenharmony_ci		unsigned long reclaimed;
678162306a36Sopenharmony_ci
678262306a36Sopenharmony_ci		if (signal_pending(current))
678362306a36Sopenharmony_ci			return -EINTR;
678462306a36Sopenharmony_ci
678562306a36Sopenharmony_ci		/*
678662306a36Sopenharmony_ci		 * This is the final attempt, drain percpu lru caches in the
678762306a36Sopenharmony_ci		 * hope of introducing more evictable pages for
678862306a36Sopenharmony_ci		 * try_to_free_mem_cgroup_pages().
678962306a36Sopenharmony_ci		 */
679062306a36Sopenharmony_ci		if (!nr_retries)
679162306a36Sopenharmony_ci			lru_add_drain_all();
679262306a36Sopenharmony_ci
679362306a36Sopenharmony_ci		reclaimed = try_to_free_mem_cgroup_pages(memcg,
679462306a36Sopenharmony_ci					min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
679562306a36Sopenharmony_ci					GFP_KERNEL, reclaim_options);
679662306a36Sopenharmony_ci
679762306a36Sopenharmony_ci		if (!reclaimed && !nr_retries--)
679862306a36Sopenharmony_ci			return -EAGAIN;
679962306a36Sopenharmony_ci
680062306a36Sopenharmony_ci		nr_reclaimed += reclaimed;
680162306a36Sopenharmony_ci	}
680262306a36Sopenharmony_ci
680362306a36Sopenharmony_ci	return nbytes;
680462306a36Sopenharmony_ci}
680562306a36Sopenharmony_ci
680662306a36Sopenharmony_cistatic struct cftype memory_files[] = {
680762306a36Sopenharmony_ci	{
680862306a36Sopenharmony_ci		.name = "current",
680962306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
681062306a36Sopenharmony_ci		.read_u64 = memory_current_read,
681162306a36Sopenharmony_ci	},
681262306a36Sopenharmony_ci	{
681362306a36Sopenharmony_ci		.name = "peak",
681462306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
681562306a36Sopenharmony_ci		.read_u64 = memory_peak_read,
681662306a36Sopenharmony_ci	},
681762306a36Sopenharmony_ci	{
681862306a36Sopenharmony_ci		.name = "min",
681962306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
682062306a36Sopenharmony_ci		.seq_show = memory_min_show,
682162306a36Sopenharmony_ci		.write = memory_min_write,
682262306a36Sopenharmony_ci	},
682362306a36Sopenharmony_ci	{
682462306a36Sopenharmony_ci		.name = "low",
682562306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
682662306a36Sopenharmony_ci		.seq_show = memory_low_show,
682762306a36Sopenharmony_ci		.write = memory_low_write,
682862306a36Sopenharmony_ci	},
682962306a36Sopenharmony_ci	{
683062306a36Sopenharmony_ci		.name = "high",
683162306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
683262306a36Sopenharmony_ci		.seq_show = memory_high_show,
683362306a36Sopenharmony_ci		.write = memory_high_write,
683462306a36Sopenharmony_ci	},
683562306a36Sopenharmony_ci	{
683662306a36Sopenharmony_ci		.name = "max",
683762306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
683862306a36Sopenharmony_ci		.seq_show = memory_max_show,
683962306a36Sopenharmony_ci		.write = memory_max_write,
684062306a36Sopenharmony_ci	},
684162306a36Sopenharmony_ci	{
684262306a36Sopenharmony_ci		.name = "events",
684362306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
684462306a36Sopenharmony_ci		.file_offset = offsetof(struct mem_cgroup, events_file),
684562306a36Sopenharmony_ci		.seq_show = memory_events_show,
684662306a36Sopenharmony_ci	},
684762306a36Sopenharmony_ci	{
684862306a36Sopenharmony_ci		.name = "events.local",
684962306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
685062306a36Sopenharmony_ci		.file_offset = offsetof(struct mem_cgroup, events_local_file),
685162306a36Sopenharmony_ci		.seq_show = memory_events_local_show,
685262306a36Sopenharmony_ci	},
685362306a36Sopenharmony_ci	{
685462306a36Sopenharmony_ci		.name = "stat",
685562306a36Sopenharmony_ci		.seq_show = memory_stat_show,
685662306a36Sopenharmony_ci	},
685762306a36Sopenharmony_ci#ifdef CONFIG_NUMA
685862306a36Sopenharmony_ci	{
685962306a36Sopenharmony_ci		.name = "numa_stat",
686062306a36Sopenharmony_ci		.seq_show = memory_numa_stat_show,
686162306a36Sopenharmony_ci	},
686262306a36Sopenharmony_ci#endif
686362306a36Sopenharmony_ci	{
686462306a36Sopenharmony_ci		.name = "oom.group",
686562306a36Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
686662306a36Sopenharmony_ci		.seq_show = memory_oom_group_show,
686762306a36Sopenharmony_ci		.write = memory_oom_group_write,
686862306a36Sopenharmony_ci	},
686962306a36Sopenharmony_ci	{
687062306a36Sopenharmony_ci		.name = "reclaim",
687162306a36Sopenharmony_ci		.flags = CFTYPE_NS_DELEGATABLE,
687262306a36Sopenharmony_ci		.write = memory_reclaim,
687362306a36Sopenharmony_ci	},
687462306a36Sopenharmony_ci	{ }	/* terminate */
687562306a36Sopenharmony_ci};
687662306a36Sopenharmony_ci
687762306a36Sopenharmony_cistruct cgroup_subsys memory_cgrp_subsys = {
687862306a36Sopenharmony_ci	.css_alloc = mem_cgroup_css_alloc,
687962306a36Sopenharmony_ci	.css_online = mem_cgroup_css_online,
688062306a36Sopenharmony_ci	.css_offline = mem_cgroup_css_offline,
688162306a36Sopenharmony_ci	.css_released = mem_cgroup_css_released,
688262306a36Sopenharmony_ci	.css_free = mem_cgroup_css_free,
688362306a36Sopenharmony_ci	.css_reset = mem_cgroup_css_reset,
688462306a36Sopenharmony_ci	.css_rstat_flush = mem_cgroup_css_rstat_flush,
688562306a36Sopenharmony_ci	.can_attach = mem_cgroup_can_attach,
688662306a36Sopenharmony_ci	.attach = mem_cgroup_attach,
688762306a36Sopenharmony_ci	.cancel_attach = mem_cgroup_cancel_attach,
688862306a36Sopenharmony_ci	.post_attach = mem_cgroup_move_task,
688962306a36Sopenharmony_ci	.dfl_cftypes = memory_files,
689062306a36Sopenharmony_ci	.legacy_cftypes = mem_cgroup_legacy_files,
689162306a36Sopenharmony_ci	.early_init = 0,
689262306a36Sopenharmony_ci};
689362306a36Sopenharmony_ci
689462306a36Sopenharmony_ci/*
689562306a36Sopenharmony_ci * This function calculates an individual cgroup's effective
689662306a36Sopenharmony_ci * protection which is derived from its own memory.min/low, its
689762306a36Sopenharmony_ci * parent's and siblings' settings, as well as the actual memory
689862306a36Sopenharmony_ci * distribution in the tree.
689962306a36Sopenharmony_ci *
690062306a36Sopenharmony_ci * The following rules apply to the effective protection values:
690162306a36Sopenharmony_ci *
690262306a36Sopenharmony_ci * 1. At the first level of reclaim, effective protection is equal to
690362306a36Sopenharmony_ci *    the declared protection in memory.min and memory.low.
690462306a36Sopenharmony_ci *
690562306a36Sopenharmony_ci * 2. To enable safe delegation of the protection configuration, at
690662306a36Sopenharmony_ci *    subsequent levels the effective protection is capped to the
690762306a36Sopenharmony_ci *    parent's effective protection.
690862306a36Sopenharmony_ci *
690962306a36Sopenharmony_ci * 3. To make complex and dynamic subtrees easier to configure, the
691062306a36Sopenharmony_ci *    user is allowed to overcommit the declared protection at a given
691162306a36Sopenharmony_ci *    level. If that is the case, the parent's effective protection is
691262306a36Sopenharmony_ci *    distributed to the children in proportion to how much protection
691362306a36Sopenharmony_ci *    they have declared and how much of it they are utilizing.
691462306a36Sopenharmony_ci *
691562306a36Sopenharmony_ci *    This makes distribution proportional, but also work-conserving:
691662306a36Sopenharmony_ci *    if one cgroup claims much more protection than it uses memory,
691762306a36Sopenharmony_ci *    the unused remainder is available to its siblings.
691862306a36Sopenharmony_ci *
691962306a36Sopenharmony_ci * 4. Conversely, when the declared protection is undercommitted at a
692062306a36Sopenharmony_ci *    given level, the distribution of the larger parental protection
692162306a36Sopenharmony_ci *    budget is NOT proportional. A cgroup's protection from a sibling
692262306a36Sopenharmony_ci *    is capped to its own memory.min/low setting.
692362306a36Sopenharmony_ci *
692462306a36Sopenharmony_ci * 5. However, to allow protecting recursive subtrees from each other
692562306a36Sopenharmony_ci *    without having to declare each individual cgroup's fixed share
692662306a36Sopenharmony_ci *    of the ancestor's claim to protection, any unutilized -
692762306a36Sopenharmony_ci *    "floating" - protection from up the tree is distributed in
692862306a36Sopenharmony_ci *    proportion to each cgroup's *usage*. This makes the protection
692962306a36Sopenharmony_ci *    neutral wrt sibling cgroups and lets them compete freely over
693062306a36Sopenharmony_ci *    the shared parental protection budget, but it protects the
693162306a36Sopenharmony_ci *    subtree as a whole from neighboring subtrees.
693262306a36Sopenharmony_ci *
693362306a36Sopenharmony_ci * Note that 4. and 5. are not in conflict: 4. is about protecting
693462306a36Sopenharmony_ci * against immediate siblings whereas 5. is about protecting against
693562306a36Sopenharmony_ci * neighboring subtrees.
693662306a36Sopenharmony_ci */
693762306a36Sopenharmony_cistatic unsigned long effective_protection(unsigned long usage,
693862306a36Sopenharmony_ci					  unsigned long parent_usage,
693962306a36Sopenharmony_ci					  unsigned long setting,
694062306a36Sopenharmony_ci					  unsigned long parent_effective,
694162306a36Sopenharmony_ci					  unsigned long siblings_protected)
694262306a36Sopenharmony_ci{
694362306a36Sopenharmony_ci	unsigned long protected;
694462306a36Sopenharmony_ci	unsigned long ep;
694562306a36Sopenharmony_ci
694662306a36Sopenharmony_ci	protected = min(usage, setting);
694762306a36Sopenharmony_ci	/*
694862306a36Sopenharmony_ci	 * If all cgroups at this level combined claim and use more
694962306a36Sopenharmony_ci	 * protection than what the parent affords them, distribute
695062306a36Sopenharmony_ci	 * shares in proportion to utilization.
695162306a36Sopenharmony_ci	 *
695262306a36Sopenharmony_ci	 * We are using actual utilization rather than the statically
695362306a36Sopenharmony_ci	 * claimed protection in order to be work-conserving: claimed
695462306a36Sopenharmony_ci	 * but unused protection is available to siblings that would
695562306a36Sopenharmony_ci	 * otherwise get a smaller chunk than what they claimed.
695662306a36Sopenharmony_ci	 */
695762306a36Sopenharmony_ci	if (siblings_protected > parent_effective)
695862306a36Sopenharmony_ci		return protected * parent_effective / siblings_protected;
695962306a36Sopenharmony_ci
696062306a36Sopenharmony_ci	/*
696162306a36Sopenharmony_ci	 * Ok, utilized protection of all children is within what the
696262306a36Sopenharmony_ci	 * parent affords them, so we know whatever this child claims
696362306a36Sopenharmony_ci	 * and utilizes is effectively protected.
696462306a36Sopenharmony_ci	 *
696562306a36Sopenharmony_ci	 * If there is unprotected usage beyond this value, reclaim
696662306a36Sopenharmony_ci	 * will apply pressure in proportion to that amount.
696762306a36Sopenharmony_ci	 *
696862306a36Sopenharmony_ci	 * If there is unutilized protection, the cgroup will be fully
696962306a36Sopenharmony_ci	 * shielded from reclaim, but we do return a smaller value for
697062306a36Sopenharmony_ci	 * protection than what the group could enjoy in theory. This
697162306a36Sopenharmony_ci	 * is okay. With the overcommit distribution above, effective
697262306a36Sopenharmony_ci	 * protection is always dependent on how memory is actually
697362306a36Sopenharmony_ci	 * consumed among the siblings anyway.
697462306a36Sopenharmony_ci	 */
697562306a36Sopenharmony_ci	ep = protected;
697662306a36Sopenharmony_ci
697762306a36Sopenharmony_ci	/*
697862306a36Sopenharmony_ci	 * If the children aren't claiming (all of) the protection
697962306a36Sopenharmony_ci	 * afforded to them by the parent, distribute the remainder in
698062306a36Sopenharmony_ci	 * proportion to the (unprotected) memory of each cgroup. That
698162306a36Sopenharmony_ci	 * way, cgroups that aren't explicitly prioritized wrt each
698262306a36Sopenharmony_ci	 * other compete freely over the allowance, but they are
698362306a36Sopenharmony_ci	 * collectively protected from neighboring trees.
698462306a36Sopenharmony_ci	 *
698562306a36Sopenharmony_ci	 * We're using unprotected memory for the weight so that if
698662306a36Sopenharmony_ci	 * some cgroups DO claim explicit protection, we don't protect
698762306a36Sopenharmony_ci	 * the same bytes twice.
698862306a36Sopenharmony_ci	 *
698962306a36Sopenharmony_ci	 * Check both usage and parent_usage against the respective
699062306a36Sopenharmony_ci	 * protected values. One should imply the other, but they
699162306a36Sopenharmony_ci	 * aren't read atomically - make sure the division is sane.
699262306a36Sopenharmony_ci	 */
699362306a36Sopenharmony_ci	if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
699462306a36Sopenharmony_ci		return ep;
699562306a36Sopenharmony_ci	if (parent_effective > siblings_protected &&
699662306a36Sopenharmony_ci	    parent_usage > siblings_protected &&
699762306a36Sopenharmony_ci	    usage > protected) {
699862306a36Sopenharmony_ci		unsigned long unclaimed;
699962306a36Sopenharmony_ci
700062306a36Sopenharmony_ci		unclaimed = parent_effective - siblings_protected;
700162306a36Sopenharmony_ci		unclaimed *= usage - protected;
700262306a36Sopenharmony_ci		unclaimed /= parent_usage - siblings_protected;
700362306a36Sopenharmony_ci
700462306a36Sopenharmony_ci		ep += unclaimed;
700562306a36Sopenharmony_ci	}
700662306a36Sopenharmony_ci
700762306a36Sopenharmony_ci	return ep;
700862306a36Sopenharmony_ci}
700962306a36Sopenharmony_ci
701062306a36Sopenharmony_ci/**
701162306a36Sopenharmony_ci * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
701262306a36Sopenharmony_ci * @root: the top ancestor of the sub-tree being checked
701362306a36Sopenharmony_ci * @memcg: the memory cgroup to check
701462306a36Sopenharmony_ci *
701562306a36Sopenharmony_ci * WARNING: This function is not stateless! It can only be used as part
701662306a36Sopenharmony_ci *          of a top-down tree iteration, not for isolated queries.
701762306a36Sopenharmony_ci */
701862306a36Sopenharmony_civoid mem_cgroup_calculate_protection(struct mem_cgroup *root,
701962306a36Sopenharmony_ci				     struct mem_cgroup *memcg)
702062306a36Sopenharmony_ci{
702162306a36Sopenharmony_ci	unsigned long usage, parent_usage;
702262306a36Sopenharmony_ci	struct mem_cgroup *parent;
702362306a36Sopenharmony_ci
702462306a36Sopenharmony_ci	if (mem_cgroup_disabled())
702562306a36Sopenharmony_ci		return;
702662306a36Sopenharmony_ci
702762306a36Sopenharmony_ci	if (!root)
702862306a36Sopenharmony_ci		root = root_mem_cgroup;
702962306a36Sopenharmony_ci
703062306a36Sopenharmony_ci	/*
703162306a36Sopenharmony_ci	 * Effective values of the reclaim targets are ignored so they
703262306a36Sopenharmony_ci	 * can be stale. Have a look at mem_cgroup_protection for more
703362306a36Sopenharmony_ci	 * details.
703462306a36Sopenharmony_ci	 * TODO: calculation should be more robust so that we do not need
703562306a36Sopenharmony_ci	 * that special casing.
703662306a36Sopenharmony_ci	 */
703762306a36Sopenharmony_ci	if (memcg == root)
703862306a36Sopenharmony_ci		return;
703962306a36Sopenharmony_ci
704062306a36Sopenharmony_ci	usage = page_counter_read(&memcg->memory);
704162306a36Sopenharmony_ci	if (!usage)
704262306a36Sopenharmony_ci		return;
704362306a36Sopenharmony_ci
704462306a36Sopenharmony_ci	parent = parent_mem_cgroup(memcg);
704562306a36Sopenharmony_ci
704662306a36Sopenharmony_ci	if (parent == root) {
704762306a36Sopenharmony_ci		memcg->memory.emin = READ_ONCE(memcg->memory.min);
704862306a36Sopenharmony_ci		memcg->memory.elow = READ_ONCE(memcg->memory.low);
704962306a36Sopenharmony_ci		return;
705062306a36Sopenharmony_ci	}
705162306a36Sopenharmony_ci
705262306a36Sopenharmony_ci	parent_usage = page_counter_read(&parent->memory);
705362306a36Sopenharmony_ci
705462306a36Sopenharmony_ci	WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
705562306a36Sopenharmony_ci			READ_ONCE(memcg->memory.min),
705662306a36Sopenharmony_ci			READ_ONCE(parent->memory.emin),
705762306a36Sopenharmony_ci			atomic_long_read(&parent->memory.children_min_usage)));
705862306a36Sopenharmony_ci
705962306a36Sopenharmony_ci	WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
706062306a36Sopenharmony_ci			READ_ONCE(memcg->memory.low),
706162306a36Sopenharmony_ci			READ_ONCE(parent->memory.elow),
706262306a36Sopenharmony_ci			atomic_long_read(&parent->memory.children_low_usage)));
706362306a36Sopenharmony_ci}
706462306a36Sopenharmony_ci
706562306a36Sopenharmony_cistatic int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
706662306a36Sopenharmony_ci			gfp_t gfp)
706762306a36Sopenharmony_ci{
706862306a36Sopenharmony_ci	long nr_pages = folio_nr_pages(folio);
706962306a36Sopenharmony_ci	int ret;
707062306a36Sopenharmony_ci
707162306a36Sopenharmony_ci	ret = try_charge(memcg, gfp, nr_pages);
707262306a36Sopenharmony_ci	if (ret)
707362306a36Sopenharmony_ci		goto out;
707462306a36Sopenharmony_ci
707562306a36Sopenharmony_ci	css_get(&memcg->css);
707662306a36Sopenharmony_ci	commit_charge(folio, memcg);
707762306a36Sopenharmony_ci
707862306a36Sopenharmony_ci	local_irq_disable();
707962306a36Sopenharmony_ci	mem_cgroup_charge_statistics(memcg, nr_pages);
708062306a36Sopenharmony_ci	memcg_check_events(memcg, folio_nid(folio));
708162306a36Sopenharmony_ci	local_irq_enable();
708262306a36Sopenharmony_ciout:
708362306a36Sopenharmony_ci	return ret;
708462306a36Sopenharmony_ci}
708562306a36Sopenharmony_ci
708662306a36Sopenharmony_ciint __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
708762306a36Sopenharmony_ci{
708862306a36Sopenharmony_ci	struct mem_cgroup *memcg;
708962306a36Sopenharmony_ci	int ret;
709062306a36Sopenharmony_ci
709162306a36Sopenharmony_ci	memcg = get_mem_cgroup_from_mm(mm);
709262306a36Sopenharmony_ci	ret = charge_memcg(folio, memcg, gfp);
709362306a36Sopenharmony_ci	css_put(&memcg->css);
709462306a36Sopenharmony_ci
709562306a36Sopenharmony_ci	return ret;
709662306a36Sopenharmony_ci}
709762306a36Sopenharmony_ci
709862306a36Sopenharmony_ci/**
709962306a36Sopenharmony_ci * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
710062306a36Sopenharmony_ci * @folio: folio to charge.
710162306a36Sopenharmony_ci * @mm: mm context of the victim
710262306a36Sopenharmony_ci * @gfp: reclaim mode
710362306a36Sopenharmony_ci * @entry: swap entry for which the folio is allocated
710462306a36Sopenharmony_ci *
710562306a36Sopenharmony_ci * This function charges a folio allocated for swapin. Please call this before
710662306a36Sopenharmony_ci * adding the folio to the swapcache.
710762306a36Sopenharmony_ci *
710862306a36Sopenharmony_ci * Returns 0 on success. Otherwise, an error code is returned.
710962306a36Sopenharmony_ci */
711062306a36Sopenharmony_ciint mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
711162306a36Sopenharmony_ci				  gfp_t gfp, swp_entry_t entry)
711262306a36Sopenharmony_ci{
711362306a36Sopenharmony_ci	struct mem_cgroup *memcg;
711462306a36Sopenharmony_ci	unsigned short id;
711562306a36Sopenharmony_ci	int ret;
711662306a36Sopenharmony_ci
711762306a36Sopenharmony_ci	if (mem_cgroup_disabled())
711862306a36Sopenharmony_ci		return 0;
711962306a36Sopenharmony_ci
712062306a36Sopenharmony_ci	id = lookup_swap_cgroup_id(entry);
712162306a36Sopenharmony_ci	rcu_read_lock();
712262306a36Sopenharmony_ci	memcg = mem_cgroup_from_id(id);
712362306a36Sopenharmony_ci	if (!memcg || !css_tryget_online(&memcg->css))
712462306a36Sopenharmony_ci		memcg = get_mem_cgroup_from_mm(mm);
712562306a36Sopenharmony_ci	rcu_read_unlock();
712662306a36Sopenharmony_ci
712762306a36Sopenharmony_ci	ret = charge_memcg(folio, memcg, gfp);
712862306a36Sopenharmony_ci
712962306a36Sopenharmony_ci	css_put(&memcg->css);
713062306a36Sopenharmony_ci	return ret;
713162306a36Sopenharmony_ci}
713262306a36Sopenharmony_ci
713362306a36Sopenharmony_ci/*
713462306a36Sopenharmony_ci * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
713562306a36Sopenharmony_ci * @entry: swap entry for which the page is charged
713662306a36Sopenharmony_ci *
713762306a36Sopenharmony_ci * Call this function after successfully adding the charged page to swapcache.
713862306a36Sopenharmony_ci *
713962306a36Sopenharmony_ci * Note: This function assumes the page for which swap slot is being uncharged
714062306a36Sopenharmony_ci * is order 0 page.
714162306a36Sopenharmony_ci */
714262306a36Sopenharmony_civoid mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
714362306a36Sopenharmony_ci{
714462306a36Sopenharmony_ci	/*
714562306a36Sopenharmony_ci	 * Cgroup1's unified memory+swap counter has been charged with the
714662306a36Sopenharmony_ci	 * new swapcache page, finish the transfer by uncharging the swap
714762306a36Sopenharmony_ci	 * slot. The swap slot would also get uncharged when it dies, but
714862306a36Sopenharmony_ci	 * it can stick around indefinitely and we'd count the page twice
714962306a36Sopenharmony_ci	 * the entire time.
715062306a36Sopenharmony_ci	 *
715162306a36Sopenharmony_ci	 * Cgroup2 has separate resource counters for memory and swap,
715262306a36Sopenharmony_ci	 * so this is a non-issue here. Memory and swap charge lifetimes
715362306a36Sopenharmony_ci	 * correspond 1:1 to page and swap slot lifetimes: we charge the
715462306a36Sopenharmony_ci	 * page to memory here, and uncharge swap when the slot is freed.
715562306a36Sopenharmony_ci	 */
715662306a36Sopenharmony_ci	if (!mem_cgroup_disabled() && do_memsw_account()) {
715762306a36Sopenharmony_ci		/*
715862306a36Sopenharmony_ci		 * The swap entry might not get freed for a long time,
715962306a36Sopenharmony_ci		 * let's not wait for it.  The page already received a
716062306a36Sopenharmony_ci		 * memory+swap charge, drop the swap entry duplicate.
716162306a36Sopenharmony_ci		 */
716262306a36Sopenharmony_ci		mem_cgroup_uncharge_swap(entry, 1);
716362306a36Sopenharmony_ci	}
716462306a36Sopenharmony_ci}
716562306a36Sopenharmony_ci
716662306a36Sopenharmony_cistruct uncharge_gather {
716762306a36Sopenharmony_ci	struct mem_cgroup *memcg;
716862306a36Sopenharmony_ci	unsigned long nr_memory;
716962306a36Sopenharmony_ci	unsigned long pgpgout;
717062306a36Sopenharmony_ci	unsigned long nr_kmem;
717162306a36Sopenharmony_ci	int nid;
717262306a36Sopenharmony_ci};
717362306a36Sopenharmony_ci
717462306a36Sopenharmony_cistatic inline void uncharge_gather_clear(struct uncharge_gather *ug)
717562306a36Sopenharmony_ci{
717662306a36Sopenharmony_ci	memset(ug, 0, sizeof(*ug));
717762306a36Sopenharmony_ci}
717862306a36Sopenharmony_ci
717962306a36Sopenharmony_cistatic void uncharge_batch(const struct uncharge_gather *ug)
718062306a36Sopenharmony_ci{
718162306a36Sopenharmony_ci	unsigned long flags;
718262306a36Sopenharmony_ci
718362306a36Sopenharmony_ci	if (ug->nr_memory) {
718462306a36Sopenharmony_ci		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
718562306a36Sopenharmony_ci		if (do_memsw_account())
718662306a36Sopenharmony_ci			page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
718762306a36Sopenharmony_ci		if (ug->nr_kmem)
718862306a36Sopenharmony_ci			memcg_account_kmem(ug->memcg, -ug->nr_kmem);
718962306a36Sopenharmony_ci		memcg_oom_recover(ug->memcg);
719062306a36Sopenharmony_ci	}
719162306a36Sopenharmony_ci
719262306a36Sopenharmony_ci	local_irq_save(flags);
719362306a36Sopenharmony_ci	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
719462306a36Sopenharmony_ci	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
719562306a36Sopenharmony_ci	memcg_check_events(ug->memcg, ug->nid);
719662306a36Sopenharmony_ci	local_irq_restore(flags);
719762306a36Sopenharmony_ci
719862306a36Sopenharmony_ci	/* drop reference from uncharge_folio */
719962306a36Sopenharmony_ci	css_put(&ug->memcg->css);
720062306a36Sopenharmony_ci}
720162306a36Sopenharmony_ci
/*
 * Account one folio into the gather state, flushing the previous batch
 * first if this folio belongs to a different memcg than the folios
 * gathered so far. Clears the folio's memcg/objcg binding.
 */
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
	long nr_pages;
	struct mem_cgroup *memcg;
	struct obj_cgroup *objcg;

	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	/*
	 * Nobody should be changing or seriously looking at
	 * folio memcg or objcg at this point, we have fully
	 * exclusive access to the folio.
	 */
	if (folio_memcg_kmem(folio)) {
		objcg = __folio_objcg(folio);
		/*
		 * This get matches the put at the end of the function and
		 * kmem pages do not hold memcg references anymore.
		 */
		memcg = get_mem_cgroup_from_objcg(objcg);
	} else {
		memcg = __folio_memcg(folio);
	}

	if (!memcg)
		return;

	if (ug->memcg != memcg) {
		/* New memcg encountered: flush what was gathered so far. */
		if (ug->memcg) {
			uncharge_batch(ug);
			uncharge_gather_clear(ug);
		}
		ug->memcg = memcg;
		ug->nid = folio_nid(folio);

		/* pairs with css_put in uncharge_batch */
		css_get(&memcg->css);
	}

	nr_pages = folio_nr_pages(folio);

	if (folio_memcg_kmem(folio)) {
		ug->nr_memory += nr_pages;
		ug->nr_kmem += nr_pages;

		folio->memcg_data = 0;
		obj_cgroup_put(objcg);
	} else {
		/* LRU pages aren't accounted at the root level */
		if (!mem_cgroup_is_root(memcg))
			ug->nr_memory += nr_pages;
		ug->pgpgout++;

		folio->memcg_data = 0;
	}

	css_put(&memcg->css);
}
726062306a36Sopenharmony_ci
726162306a36Sopenharmony_civoid __mem_cgroup_uncharge(struct folio *folio)
726262306a36Sopenharmony_ci{
726362306a36Sopenharmony_ci	struct uncharge_gather ug;
726462306a36Sopenharmony_ci
726562306a36Sopenharmony_ci	/* Don't touch folio->lru of any random page, pre-check: */
726662306a36Sopenharmony_ci	if (!folio_memcg(folio))
726762306a36Sopenharmony_ci		return;
726862306a36Sopenharmony_ci
726962306a36Sopenharmony_ci	uncharge_gather_clear(&ug);
727062306a36Sopenharmony_ci	uncharge_folio(folio, &ug);
727162306a36Sopenharmony_ci	uncharge_batch(&ug);
727262306a36Sopenharmony_ci}
727362306a36Sopenharmony_ci
/**
 * __mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * __mem_cgroup_charge().
 */
void __mem_cgroup_uncharge_list(struct list_head *page_list)
{
	struct uncharge_gather ug;
	struct folio *folio;

	uncharge_gather_clear(&ug);
	list_for_each_entry(folio, page_list, lru)
		uncharge_folio(folio, &ug);
	/* Flush the final batch, if any folio on the list was charged. */
	if (ug.memcg)
		uncharge_batch(&ug);
}
729262306a36Sopenharmony_ci
/**
 * mem_cgroup_migrate - Charge a folio's replacement.
 * @old: Currently circulating folio.
 * @new: Replacement folio.
 *
 * Charge @new as a replacement folio for @old. @old will
 * be uncharged upon free.
 *
 * Both folios must be locked, @new->mapping must be set up.
 */
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
	struct mem_cgroup *memcg;
	long nr_pages = folio_nr_pages(new);
	unsigned long flags;

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
	VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);

	if (mem_cgroup_disabled())
		return;

	/* Page cache replacement: new folio already charged? */
	if (folio_memcg(new))
		return;

	memcg = folio_memcg(old);
	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
	if (!memcg)
		return;

	/* Force-charge the new page. The old one will be freed soon */
	if (!mem_cgroup_is_root(memcg)) {
		page_counter_charge(&memcg->memory, nr_pages);
		if (do_memsw_account())
			page_counter_charge(&memcg->memsw, nr_pages);
	}

	/*
	 * Reference for @new's binding to the memcg; presumably dropped
	 * when @new is eventually uncharged (see uncharge_batch()).
	 */
	css_get(&memcg->css);
	commit_charge(new, memcg);

	/* Statistics and event checks run with interrupts disabled. */
	local_irq_save(flags);
	mem_cgroup_charge_statistics(memcg, nr_pages);
	memcg_check_events(memcg, folio_nid(new));
	local_irq_restore(flags);
}
734162306a36Sopenharmony_ci
/*
 * Static branch gating memcg socket accounting; checked (via
 * mem_cgroup_sockets_enabled) before any socket charge/association work.
 */
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
734462306a36Sopenharmony_ci
/*
 * Associate a socket with the current task's memcg so that its memory
 * can be attributed to that cgroup. The reference taken here is dropped
 * in mem_cgroup_sk_free().
 */
void mem_cgroup_sk_alloc(struct sock *sk)
{
	struct mem_cgroup *memcg;

	if (!mem_cgroup_sockets_enabled)
		return;

	/* Do not associate the sock with unrelated interrupted task's memcg. */
	if (!in_task())
		return;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(current);
	/* The root memcg gets no association. */
	if (mem_cgroup_is_root(memcg))
		goto out;
	/* On the legacy hierarchy, only associate once tcpmem is active. */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
		goto out;
	/* tryget can fail for a dying css; then leave sk_memcg unset. */
	if (css_tryget(&memcg->css))
		sk->sk_memcg = memcg;
out:
	rcu_read_unlock();
}
736762306a36Sopenharmony_ci
736862306a36Sopenharmony_civoid mem_cgroup_sk_free(struct sock *sk)
736962306a36Sopenharmony_ci{
737062306a36Sopenharmony_ci	if (sk->sk_memcg)
737162306a36Sopenharmony_ci		css_put(&sk->sk_memcg->css);
737262306a36Sopenharmony_ci}
737362306a36Sopenharmony_ci
737462306a36Sopenharmony_ci/**
737562306a36Sopenharmony_ci * mem_cgroup_charge_skmem - charge socket memory
737662306a36Sopenharmony_ci * @memcg: memcg to charge
737762306a36Sopenharmony_ci * @nr_pages: number of pages to charge
737862306a36Sopenharmony_ci * @gfp_mask: reclaim mode
737962306a36Sopenharmony_ci *
738062306a36Sopenharmony_ci * Charges @nr_pages to @memcg. Returns %true if the charge fit within
738162306a36Sopenharmony_ci * @memcg's configured limit, %false if it doesn't.
738262306a36Sopenharmony_ci */
738362306a36Sopenharmony_cibool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
738462306a36Sopenharmony_ci			     gfp_t gfp_mask)
738562306a36Sopenharmony_ci{
738662306a36Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
738762306a36Sopenharmony_ci		struct page_counter *fail;
738862306a36Sopenharmony_ci
738962306a36Sopenharmony_ci		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
739062306a36Sopenharmony_ci			memcg->tcpmem_pressure = 0;
739162306a36Sopenharmony_ci			return true;
739262306a36Sopenharmony_ci		}
739362306a36Sopenharmony_ci		memcg->tcpmem_pressure = 1;
739462306a36Sopenharmony_ci		if (gfp_mask & __GFP_NOFAIL) {
739562306a36Sopenharmony_ci			page_counter_charge(&memcg->tcpmem, nr_pages);
739662306a36Sopenharmony_ci			return true;
739762306a36Sopenharmony_ci		}
739862306a36Sopenharmony_ci		return false;
739962306a36Sopenharmony_ci	}
740062306a36Sopenharmony_ci
740162306a36Sopenharmony_ci	if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
740262306a36Sopenharmony_ci		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
740362306a36Sopenharmony_ci		return true;
740462306a36Sopenharmony_ci	}
740562306a36Sopenharmony_ci
740662306a36Sopenharmony_ci	return false;
740762306a36Sopenharmony_ci}
740862306a36Sopenharmony_ci
740962306a36Sopenharmony_ci/**
741062306a36Sopenharmony_ci * mem_cgroup_uncharge_skmem - uncharge socket memory
741162306a36Sopenharmony_ci * @memcg: memcg to uncharge
741262306a36Sopenharmony_ci * @nr_pages: number of pages to uncharge
741362306a36Sopenharmony_ci */
741462306a36Sopenharmony_civoid mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
741562306a36Sopenharmony_ci{
741662306a36Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
741762306a36Sopenharmony_ci		page_counter_uncharge(&memcg->tcpmem, nr_pages);
741862306a36Sopenharmony_ci		return;
741962306a36Sopenharmony_ci	}
742062306a36Sopenharmony_ci
742162306a36Sopenharmony_ci	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
742262306a36Sopenharmony_ci
742362306a36Sopenharmony_ci	refill_stock(memcg, nr_pages);
742462306a36Sopenharmony_ci}
742562306a36Sopenharmony_ci
742662306a36Sopenharmony_cistatic int __init cgroup_memory(char *s)
742762306a36Sopenharmony_ci{
742862306a36Sopenharmony_ci	char *token;
742962306a36Sopenharmony_ci
743062306a36Sopenharmony_ci	while ((token = strsep(&s, ",")) != NULL) {
743162306a36Sopenharmony_ci		if (!*token)
743262306a36Sopenharmony_ci			continue;
743362306a36Sopenharmony_ci		if (!strcmp(token, "nosocket"))
743462306a36Sopenharmony_ci			cgroup_memory_nosocket = true;
743562306a36Sopenharmony_ci		if (!strcmp(token, "nokmem"))
743662306a36Sopenharmony_ci			cgroup_memory_nokmem = true;
743762306a36Sopenharmony_ci		if (!strcmp(token, "nobpf"))
743862306a36Sopenharmony_ci			cgroup_memory_nobpf = true;
743962306a36Sopenharmony_ci		if (!strcmp(token, "kmem"))
744062306a36Sopenharmony_ci			cgroup_memory_nokmem = false;
744162306a36Sopenharmony_ci	}
744262306a36Sopenharmony_ci	return 1;
744362306a36Sopenharmony_ci}
744462306a36Sopenharmony_ci__setup("cgroup.memory=", cgroup_memory);
744562306a36Sopenharmony_ci
/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */
static int __init mem_cgroup_init(void)
{
	int cpu, node;

	/*
	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
	 * used for per-memcg-per-cpu caching of per-node statistics. In order
	 * to work fine, we should make sure that the overfill threshold can't
	 * exceed S32_MAX / PAGE_SIZE.
	 */
	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);

	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
				  memcg_hotplug_cpu_dead);

	/* Set up the per-cpu stock-draining work items. */
	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	/* Allocate and initialize the per-node soft limit trees. */
	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		/*
		 * NOTE(review): the allocation result is dereferenced without
		 * a NULL check; presumably boot-time allocation is assumed to
		 * succeed here — confirm against upstream convention.
		 */
		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(mem_cgroup_init);
748762306a36Sopenharmony_ci
748862306a36Sopenharmony_ci#ifdef CONFIG_SWAP
/*
 * Walk up the hierarchy from @memcg until refcount_inc_not_zero() succeeds
 * on a memcg's id refcount, and return that memcg with the id reference
 * held. Falls back toward root_mem_cgroup, which must always be pinnable.
 */
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
	while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
		if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
			VM_BUG_ON(1);
			break;
		}
		memcg = parent_mem_cgroup(memcg);
		if (!memcg)
			memcg = root_mem_cgroup;
	}
	return memcg;
}
750662306a36Sopenharmony_ci
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @folio to @entry.
 */
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
	struct mem_cgroup *memcg, *swap_memcg;
	unsigned int nr_entries;
	unsigned short oldid;

	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);

	if (mem_cgroup_disabled())
		return;

	/* Only relevant under combined memory+swap (memsw) accounting. */
	if (!do_memsw_account())
		return;

	memcg = folio_memcg(folio);

	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
	if (!memcg)
		return;

	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
	swap_memcg = mem_cgroup_id_get_online(memcg);
	nr_entries = folio_nr_pages(folio);
	/* Get references for the tail pages, too */
	if (nr_entries > 1)
		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
	/* Record the swap entry's owner; it must not be owned already. */
	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
				   nr_entries);
	VM_BUG_ON_FOLIO(oldid, folio);
	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

	/* The folio no longer carries a memcg binding. */
	folio->memcg_data = 0;

	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, nr_entries);

	/* If the charge moved to an ancestor, fix up the memsw counters. */
	if (memcg != swap_memcg) {
		if (!mem_cgroup_is_root(swap_memcg))
			page_counter_charge(&swap_memcg->memsw, nr_entries);
		page_counter_uncharge(&memcg->memsw, nr_entries);
	}

	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	memcg_stats_lock();
	mem_cgroup_charge_statistics(memcg, -nr_entries);
	memcg_stats_unlock();
	memcg_check_events(memcg, folio_nid(folio));

	css_put(&memcg->css);
}
757462306a36Sopenharmony_ci
/**
 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
 * @folio: folio being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @folio's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
{
	unsigned int nr_pages = folio_nr_pages(folio);
	struct page_counter *counter;
	struct mem_cgroup *memcg;
	unsigned short oldid;

	/* Under memsw accounting nothing is charged here — see mem_cgroup_swapout(). */
	if (do_memsw_account())
		return 0;

	memcg = folio_memcg(folio);

	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
	if (!memcg)
		return 0;

	/* A zero entry value counts as a swap failure, not an error. */
	if (!entry.val) {
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		return 0;
	}

	/* Charge the nearest pinnable ancestor; also takes an id reference. */
	memcg = mem_cgroup_id_get_online(memcg);

	if (!mem_cgroup_is_root(memcg) &&
	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		mem_cgroup_id_put(memcg);
		return -ENOMEM;
	}

	/* Get references for the tail pages, too */
	if (nr_pages > 1)
		mem_cgroup_id_get_many(memcg, nr_pages - 1);
	/* Record ownership of the entry; it must not be owned already. */
	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
	VM_BUG_ON_FOLIO(oldid, folio);
	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);

	return 0;
}
762462306a36Sopenharmony_ci
/**
 * __mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	/* Clear the entry's ownership record and recover the old owner id. */
	id = swap_cgroup_record(entry, 0, nr_pages);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg) {
		if (!mem_cgroup_is_root(memcg)) {
			/* memsw vs. standalone swap counter, depending on mode. */
			if (do_memsw_account())
				page_counter_uncharge(&memcg->memsw, nr_pages);
			else
				page_counter_uncharge(&memcg->swap, nr_pages);
		}
		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
		/* Drop the per-page id references taken at charge time. */
		mem_cgroup_id_put_many(memcg, nr_pages);
	}
	rcu_read_unlock();
}
765062306a36Sopenharmony_ci
765162306a36Sopenharmony_cilong mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
765262306a36Sopenharmony_ci{
765362306a36Sopenharmony_ci	long nr_swap_pages = get_nr_swap_pages();
765462306a36Sopenharmony_ci
765562306a36Sopenharmony_ci	if (mem_cgroup_disabled() || do_memsw_account())
765662306a36Sopenharmony_ci		return nr_swap_pages;
765762306a36Sopenharmony_ci	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
765862306a36Sopenharmony_ci		nr_swap_pages = min_t(long, nr_swap_pages,
765962306a36Sopenharmony_ci				      READ_ONCE(memcg->swap.max) -
766062306a36Sopenharmony_ci				      page_counter_read(&memcg->swap));
766162306a36Sopenharmony_ci	return nr_swap_pages;
766262306a36Sopenharmony_ci}
766362306a36Sopenharmony_ci
766462306a36Sopenharmony_cibool mem_cgroup_swap_full(struct folio *folio)
766562306a36Sopenharmony_ci{
766662306a36Sopenharmony_ci	struct mem_cgroup *memcg;
766762306a36Sopenharmony_ci
766862306a36Sopenharmony_ci	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
766962306a36Sopenharmony_ci
767062306a36Sopenharmony_ci	if (vm_swap_full())
767162306a36Sopenharmony_ci		return true;
767262306a36Sopenharmony_ci	if (do_memsw_account())
767362306a36Sopenharmony_ci		return false;
767462306a36Sopenharmony_ci
767562306a36Sopenharmony_ci	memcg = folio_memcg(folio);
767662306a36Sopenharmony_ci	if (!memcg)
767762306a36Sopenharmony_ci		return false;
767862306a36Sopenharmony_ci
767962306a36Sopenharmony_ci	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
768062306a36Sopenharmony_ci		unsigned long usage = page_counter_read(&memcg->swap);
768162306a36Sopenharmony_ci
768262306a36Sopenharmony_ci		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
768362306a36Sopenharmony_ci		    usage * 2 >= READ_ONCE(memcg->swap.max))
768462306a36Sopenharmony_ci			return true;
768562306a36Sopenharmony_ci	}
768662306a36Sopenharmony_ci
768762306a36Sopenharmony_ci	return false;
768862306a36Sopenharmony_ci}
768962306a36Sopenharmony_ci
769062306a36Sopenharmony_cistatic int __init setup_swap_account(char *s)
769162306a36Sopenharmony_ci{
769262306a36Sopenharmony_ci	bool res;
769362306a36Sopenharmony_ci
769462306a36Sopenharmony_ci	if (!kstrtobool(s, &res) && !res)
769562306a36Sopenharmony_ci		pr_warn_once("The swapaccount=0 commandline option is deprecated "
769662306a36Sopenharmony_ci			     "in favor of configuring swap control via cgroupfs. "
769762306a36Sopenharmony_ci			     "Please report your usecase to linux-mm@kvack.org if you "
769862306a36Sopenharmony_ci			     "depend on this functionality.\n");
769962306a36Sopenharmony_ci	return 1;
770062306a36Sopenharmony_ci}
770162306a36Sopenharmony_ci__setup("swapaccount=", setup_swap_account);
770262306a36Sopenharmony_ci
770362306a36Sopenharmony_cistatic u64 swap_current_read(struct cgroup_subsys_state *css,
770462306a36Sopenharmony_ci			     struct cftype *cft)
770562306a36Sopenharmony_ci{
770662306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
770762306a36Sopenharmony_ci
770862306a36Sopenharmony_ci	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
770962306a36Sopenharmony_ci}
771062306a36Sopenharmony_ci
771162306a36Sopenharmony_cistatic u64 swap_peak_read(struct cgroup_subsys_state *css,
771262306a36Sopenharmony_ci			  struct cftype *cft)
771362306a36Sopenharmony_ci{
771462306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
771562306a36Sopenharmony_ci
771662306a36Sopenharmony_ci	return (u64)memcg->swap.watermark * PAGE_SIZE;
771762306a36Sopenharmony_ci}
771862306a36Sopenharmony_ci
771962306a36Sopenharmony_cistatic int swap_high_show(struct seq_file *m, void *v)
772062306a36Sopenharmony_ci{
772162306a36Sopenharmony_ci	return seq_puts_memcg_tunable(m,
772262306a36Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
772362306a36Sopenharmony_ci}
772462306a36Sopenharmony_ci
772562306a36Sopenharmony_cistatic ssize_t swap_high_write(struct kernfs_open_file *of,
772662306a36Sopenharmony_ci			       char *buf, size_t nbytes, loff_t off)
772762306a36Sopenharmony_ci{
772862306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
772962306a36Sopenharmony_ci	unsigned long high;
773062306a36Sopenharmony_ci	int err;
773162306a36Sopenharmony_ci
773262306a36Sopenharmony_ci	buf = strstrip(buf);
773362306a36Sopenharmony_ci	err = page_counter_memparse(buf, "max", &high);
773462306a36Sopenharmony_ci	if (err)
773562306a36Sopenharmony_ci		return err;
773662306a36Sopenharmony_ci
773762306a36Sopenharmony_ci	page_counter_set_high(&memcg->swap, high);
773862306a36Sopenharmony_ci
773962306a36Sopenharmony_ci	return nbytes;
774062306a36Sopenharmony_ci}
774162306a36Sopenharmony_ci
774262306a36Sopenharmony_cistatic int swap_max_show(struct seq_file *m, void *v)
774362306a36Sopenharmony_ci{
774462306a36Sopenharmony_ci	return seq_puts_memcg_tunable(m,
774562306a36Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
774662306a36Sopenharmony_ci}
774762306a36Sopenharmony_ci
774862306a36Sopenharmony_cistatic ssize_t swap_max_write(struct kernfs_open_file *of,
774962306a36Sopenharmony_ci			      char *buf, size_t nbytes, loff_t off)
775062306a36Sopenharmony_ci{
775162306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
775262306a36Sopenharmony_ci	unsigned long max;
775362306a36Sopenharmony_ci	int err;
775462306a36Sopenharmony_ci
775562306a36Sopenharmony_ci	buf = strstrip(buf);
775662306a36Sopenharmony_ci	err = page_counter_memparse(buf, "max", &max);
775762306a36Sopenharmony_ci	if (err)
775862306a36Sopenharmony_ci		return err;
775962306a36Sopenharmony_ci
776062306a36Sopenharmony_ci	xchg(&memcg->swap.max, max);
776162306a36Sopenharmony_ci
776262306a36Sopenharmony_ci	return nbytes;
776362306a36Sopenharmony_ci}
776462306a36Sopenharmony_ci
776562306a36Sopenharmony_cistatic int swap_events_show(struct seq_file *m, void *v)
776662306a36Sopenharmony_ci{
776762306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
776862306a36Sopenharmony_ci
776962306a36Sopenharmony_ci	seq_printf(m, "high %lu\n",
777062306a36Sopenharmony_ci		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
777162306a36Sopenharmony_ci	seq_printf(m, "max %lu\n",
777262306a36Sopenharmony_ci		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
777362306a36Sopenharmony_ci	seq_printf(m, "fail %lu\n",
777462306a36Sopenharmony_ci		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
777562306a36Sopenharmony_ci
777662306a36Sopenharmony_ci	return 0;
777762306a36Sopenharmony_ci}
777862306a36Sopenharmony_ci
/* cgroup v2 (default hierarchy) swap control files. */
static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_high_show,
		.write = swap_high_write,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_peak_read,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};
781062306a36Sopenharmony_ci
781162306a36Sopenharmony_cistatic struct cftype memsw_files[] = {
781262306a36Sopenharmony_ci	{
781362306a36Sopenharmony_ci		.name = "memsw.usage_in_bytes",
781462306a36Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
781562306a36Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
781662306a36Sopenharmony_ci	},
781762306a36Sopenharmony_ci	{
781862306a36Sopenharmony_ci		.name = "memsw.max_usage_in_bytes",
781962306a36Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
782062306a36Sopenharmony_ci		.write = mem_cgroup_reset,
782162306a36Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
782262306a36Sopenharmony_ci	},
782362306a36Sopenharmony_ci	{
782462306a36Sopenharmony_ci		.name = "memsw.limit_in_bytes",
782562306a36Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
782662306a36Sopenharmony_ci		.write = mem_cgroup_write,
782762306a36Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
782862306a36Sopenharmony_ci	},
782962306a36Sopenharmony_ci	{
783062306a36Sopenharmony_ci		.name = "memsw.failcnt",
783162306a36Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
783262306a36Sopenharmony_ci		.write = mem_cgroup_reset,
783362306a36Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
783462306a36Sopenharmony_ci	},
783562306a36Sopenharmony_ci	{ },	/* terminate */
783662306a36Sopenharmony_ci};
783762306a36Sopenharmony_ci
783862306a36Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
/**
 * obj_cgroup_may_zswap - check if this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either. But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 *
 * Return: true if no ancestor's zswap limit forbids storing more,
 * false if some level has zswap disabled (max == 0) or is full.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg, *original_memcg;
	bool ret = true;

	/* The zswap.max knob only exists on the unified hierarchy */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;

	/* Walk from the objcg's memcg up to, but excluding, the root */
	original_memcg = get_mem_cgroup_from_objcg(objcg);
	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
	     memcg = parent_mem_cgroup(memcg)) {
		unsigned long max = READ_ONCE(memcg->zswap_max);
		unsigned long pages;

		/* PAGE_COUNTER_MAX means "no limit" at this level */
		if (max == PAGE_COUNTER_MAX)
			continue;
		/* max == 0 disables zswap for this entire subtree */
		if (max == 0) {
			ret = false;
			break;
		}

		/* Fold in pending per-cpu deltas before reading the stat */
		cgroup_rstat_flush(memcg->css.cgroup);
		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
		if (pages < max)
			continue;
		ret = false;
		break;
	}
	/* Drop the reference taken by get_mem_cgroup_from_objcg() */
	mem_cgroup_put(original_memcg);
	return ret;
}
788262306a36Sopenharmony_ci
/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	/* Charging only applies on the unified hierarchy */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	/* Callers are expected to be in reclaim (PF_MEMALLOC) context */
	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));

	/* PF_MEMALLOC context, charging must succeed */
	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
		VM_WARN_ON_ONCE(1);

	/* RCU keeps the objcg->memcg association stable for the update */
	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
	rcu_read_unlock();
}
791062306a36Sopenharmony_ci
/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * Uncharges zswap memory on page in.
 */
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	/* Nothing was charged on the legacy hierarchy; see charge path */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	obj_cgroup_uncharge(objcg, size);

	/* RCU keeps the objcg->memcg association stable for the update */
	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
	rcu_read_unlock();
}
793362306a36Sopenharmony_ci
793462306a36Sopenharmony_cistatic u64 zswap_current_read(struct cgroup_subsys_state *css,
793562306a36Sopenharmony_ci			      struct cftype *cft)
793662306a36Sopenharmony_ci{
793762306a36Sopenharmony_ci	cgroup_rstat_flush(css->cgroup);
793862306a36Sopenharmony_ci	return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
793962306a36Sopenharmony_ci}
794062306a36Sopenharmony_ci
794162306a36Sopenharmony_cistatic int zswap_max_show(struct seq_file *m, void *v)
794262306a36Sopenharmony_ci{
794362306a36Sopenharmony_ci	return seq_puts_memcg_tunable(m,
794462306a36Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
794562306a36Sopenharmony_ci}
794662306a36Sopenharmony_ci
794762306a36Sopenharmony_cistatic ssize_t zswap_max_write(struct kernfs_open_file *of,
794862306a36Sopenharmony_ci			       char *buf, size_t nbytes, loff_t off)
794962306a36Sopenharmony_ci{
795062306a36Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
795162306a36Sopenharmony_ci	unsigned long max;
795262306a36Sopenharmony_ci	int err;
795362306a36Sopenharmony_ci
795462306a36Sopenharmony_ci	buf = strstrip(buf);
795562306a36Sopenharmony_ci	err = page_counter_memparse(buf, "max", &max);
795662306a36Sopenharmony_ci	if (err)
795762306a36Sopenharmony_ci		return err;
795862306a36Sopenharmony_ci
795962306a36Sopenharmony_ci	xchg(&memcg->zswap_max, max);
796062306a36Sopenharmony_ci
796162306a36Sopenharmony_ci	return nbytes;
796262306a36Sopenharmony_ci}
796362306a36Sopenharmony_ci
/*
 * cgroup2 "memory.zswap.*" interface files: per-cgroup zswap usage
 * and limit. Registered only when MEMCG_KMEM and ZSWAP are enabled.
 */
static struct cftype zswap_files[] = {
	{
		.name = "zswap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = zswap_current_read,
	},
	{
		.name = "zswap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = zswap_max_show,
		.write = zswap_max_write,
	},
	{ }	/* terminate */
};
797862306a36Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
797962306a36Sopenharmony_ci
/*
 * Register the swap-related cgroup interface files: "swap.*" on the
 * unified hierarchy, "memsw.*" on the legacy hierarchy, and — when
 * built in — the zswap files on the unified hierarchy.
 */
static int __init mem_cgroup_swap_init(void)
{
	/* No control files when the memory controller is disabled */
	if (mem_cgroup_disabled())
		return 0;

	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
#endif
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);
799362306a36Sopenharmony_ci
799462306a36Sopenharmony_ci#endif /* CONFIG_SWAP */
7995