// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>
#include <linux/zswapd.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled */
static bool cgroup_memory_nokmem = true;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
bool cgroup_memory_noswap __read_mostly;
#else
#define cgroup_memory_noswap		1
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in an RB-tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for the OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

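/*
 * The task is on its way out: it is an OOM victim, has a fatal signal
 * pending, or is already exiting.
 */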
static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	struct mem_cgroup *memcg;
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * As a result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	spin_lock_irqsave(&objcg_lock, flags);
	memcg = obj_cgroup_memcg(objcg);
	if (nr_pages)
		__memcg_kmem_uncharge(memcg, nr_pages);
	list_del(&objcg->list);
	mem_cgroup_put(memcg);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

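/*
 * Allocate a new obj_cgroup; its percpu refcount drops into
 * obj_cgroup_release() once the last reference is gone.
 */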
static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

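/*
 * Reparent all obj_cgroups of @memcg, including ones already reparented
 * to it, onto @parent, and kill the refcount of memcg's own objcg.
 */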
static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* Move active objcg to the parent's list */
	xchg(&objcg->memcg, parent);
	css_get(&parent->css);
	list_add(&objcg->list, &parent->objcg_list);

	/* Move already reparented objcgs to the parent's list */
	list_for_each_entry(iter, &memcg->objcg_list, list) {
		css_get(&parent->css);
		xchg(&iter->memcg, parent);
		css_put(&memcg->css);
	}
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * This will be used as a shrinker list's index.
 * The main reason for not using the cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few of them are kmem-limited. Also, if we have, for instance,
 *  200 memcgs and none but the 200th is kmem-limited, we'd have to keep a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional on this static branch, we have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif

static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);

static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
{
	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
}

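/*
 * Grow the per-node shrinker bitmaps of @memcg to @size bytes, copying
 * the old bits and zeroing the newly added ones.
 */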
static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
					 int size, int old_size)
{
	struct memcg_shrinker_map *new, *old;
	int nid;

	lockdep_assert_held(&memcg_shrinker_map_mutex);

	for_each_node(nid) {
		old = rcu_dereference_protected(
			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
		/* Not yet online memcg */
		if (!old)
			return 0;

		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		/* Set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_size);
		memset((void *)new->map + old_size, 0, size - old_size);

		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
	}

	return 0;
}

static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct memcg_shrinker_map *map;
	int nid;

	if (mem_cgroup_is_root(memcg))
		return;

	for_each_node(nid) {
		pn = mem_cgroup_nodeinfo(memcg, nid);
		map = rcu_dereference_protected(pn->shrinker_map, true);
		if (map)
			kvfree(map);
		rcu_assign_pointer(pn->shrinker_map, NULL);
	}
}

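/* Allocate the per-node shrinker bitmaps of @memcg (the root memcg has none) */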
static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	struct memcg_shrinker_map *map;
	int nid, size, ret = 0;

	if (mem_cgroup_is_root(memcg))
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	size = memcg_shrinker_map_size;
	for_each_node(nid) {
		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
		if (!map) {
			memcg_free_shrinker_maps(memcg);
			ret = -ENOMEM;
			break;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
	}
	mutex_unlock(&memcg_shrinker_map_mutex);

	return ret;
}

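/*
 * Make sure every memcg's shrinker bitmap is large enough to hold the
 * bit for shrinker @new_id, growing the maps if necessary.
 */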
int memcg_expand_shrinker_maps(int new_id)
{
	int size, old_size, ret = 0;
	struct mem_cgroup *memcg;

	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
	old_size = memcg_shrinker_map_size;
	if (size <= old_size)
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	if (!root_mem_cgroup)
		goto unlock;

	for_each_mem_cgroup(memcg) {
		if (mem_cgroup_is_root(memcg))
			continue;
		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto unlock;
		}
	}
unlock:
	if (!ret)
		memcg_shrinker_map_size = size;
	mutex_unlock(&memcg_shrinker_map_mutex);
	return ret;
}

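/* Flag that @memcg may have objects on node @nid for shrinker @shrinker_id */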
void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct memcg_shrinker_map *map;

		rcu_read_lock();
		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, map->map);
		rcu_read_unlock();
	}
}

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it should only be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = page->mem_cgroup;

	/*
	 * The lowest bit set means that memcg isn't a valid
	 * memcg pointer, but an obj_cgroups pointer.
	 * In this case the page is shared and doesn't belong
	 * to any specific memory cgroup.
	 */
	if ((unsigned long) memcg & 0x1UL)
		memcg = NULL;

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

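/* Per-node info of @memcg for the node that @page resides on */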
static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

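/*
 * Insert @mz into the per-node soft limit tree, ordered by how far the
 * memcg exceeds its soft limit; the rightmost node is the worst offender.
 */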
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

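/* Number of pages by which @memcg currently exceeds its soft limit, or 0 */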
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, 0);
	struct lruvec *lruvec = &mz->lruvec;
	unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
			MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
			MAX_NR_ZONES);
#else
	unsigned long nr_pages = page_counter_read(&memcg->memory);
#endif
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * mem is over its soft limit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

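/* Remove @memcg from every node's soft limit tree */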
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long x, threshold = MEMCG_CHARGE_BATCH;

	if (mem_cgroup_disabled())
		return;

	if (memcg_stat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(x) > threshold)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmstats[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
}

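/* Per-node info of the parent memcg on the same node, or NULL for the root */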
static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *parent;

	parent = parent_mem_cgroup(pn->memcg);
	if (!parent)
		return NULL;
	return mem_cgroup_nodeinfo(parent, nid);
}

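/*
 * Update the memcg and lruvec (per-memcg per-node) statistics for @idx,
 * propagating batched deltas up the hierarchy.
 */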
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	long x, threshold = MEMCG_CHARGE_BATCH;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

	if (vmstat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(x) > threshold)) {
		pg_data_t *pgdat = lruvec_pgdat(lruvec);
		struct mem_cgroup_per_node *pi;

		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
			atomic_long_add(x, &pi->lruvec_stat[idx]);
		x = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled()) {
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		if (is_node_lruvec(lruvec))
			return;
#endif
		__mod_memcg_lruvec_state(lruvec, idx, val);
	}
}

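/*
 * Update statistics for a slab or other kernel object @p: if it is charged
 * to a memcg, update that memcg's lruvec, otherwise only the node counters.
 */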
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

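/* Update a memcg statistic of the memcg that kernel object @p is charged to */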
void mod_memcg_obj_state(void *p, int idx, int val)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);
	if (memcg)
		mod_memcg_state(memcg, idx, val);
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	unsigned long x;

	if (mem_cgroup_disabled())
		return;
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (!memcg)
		return;
#endif

	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->events[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmevents[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
}

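/* Hierarchical event count, including all descendant memcgs */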
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	return atomic_long_read(&memcg->vmevents[event]);
}

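/* Event count of this memcg alone, summed over all possible CPUs */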
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_local->events[event], cpu);
	return x;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

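/*
 * Returns true when the per-cpu page-event counter has passed the next
 * target for @target and advances the target for the following check.
 */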
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned. However, if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

10678c2ecf20Sopenharmony_ci/**
10688c2ecf20Sopenharmony_ci * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
10698c2ecf20Sopenharmony_ci * @page: page from which memcg should be extracted.
10708c2ecf20Sopenharmony_ci *
10718c2ecf20Sopenharmony_ci * Obtain a reference on page->memcg and returns it if successful. Otherwise
10728c2ecf20Sopenharmony_ci * root_mem_cgroup is returned.
10738c2ecf20Sopenharmony_ci */
10748c2ecf20Sopenharmony_cistruct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
10758c2ecf20Sopenharmony_ci{
10768c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = page->mem_cgroup;
10778c2ecf20Sopenharmony_ci
10788c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
10798c2ecf20Sopenharmony_ci		return NULL;
10808c2ecf20Sopenharmony_ci
10818c2ecf20Sopenharmony_ci	rcu_read_lock();
10828c2ecf20Sopenharmony_ci	/* Page should not get uncharged and freed memcg under us. */
10838c2ecf20Sopenharmony_ci	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
10848c2ecf20Sopenharmony_ci		memcg = root_mem_cgroup;
10858c2ecf20Sopenharmony_ci	rcu_read_unlock();
10868c2ecf20Sopenharmony_ci	return memcg;
10878c2ecf20Sopenharmony_ci}
10888c2ecf20Sopenharmony_ciEXPORT_SYMBOL(get_mem_cgroup_from_page);
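
/*
 * Illustrative sketch (not part of the original source): the reference rule
 * mirrors get_mem_cgroup_from_mm(); any non-NULL memcg returned by
 * get_mem_cgroup_from_page() must eventually be released with css_put().
 *
 *	struct mem_cgroup *memcg = get_mem_cgroup_from_page(page);
 *
 *	if (memcg) {
 *		... use memcg ...
 *		css_put(&memcg->css);
 *	}
 */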
10898c2ecf20Sopenharmony_ci
10908c2ecf20Sopenharmony_cistatic __always_inline struct mem_cgroup *active_memcg(void)
10918c2ecf20Sopenharmony_ci{
10928c2ecf20Sopenharmony_ci	if (in_interrupt())
10938c2ecf20Sopenharmony_ci		return this_cpu_read(int_active_memcg);
10948c2ecf20Sopenharmony_ci	else
10958c2ecf20Sopenharmony_ci		return current->active_memcg;
10968c2ecf20Sopenharmony_ci}
10978c2ecf20Sopenharmony_ci
10988c2ecf20Sopenharmony_cistatic __always_inline struct mem_cgroup *get_active_memcg(void)
10998c2ecf20Sopenharmony_ci{
11008c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
11018c2ecf20Sopenharmony_ci
11028c2ecf20Sopenharmony_ci	rcu_read_lock();
11038c2ecf20Sopenharmony_ci	memcg = active_memcg();
11048c2ecf20Sopenharmony_ci	/* remote memcg must hold a ref. */
11058c2ecf20Sopenharmony_ci	if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
11068c2ecf20Sopenharmony_ci		memcg = root_mem_cgroup;
11078c2ecf20Sopenharmony_ci	rcu_read_unlock();
11088c2ecf20Sopenharmony_ci
11098c2ecf20Sopenharmony_ci	return memcg;
11108c2ecf20Sopenharmony_ci}
11118c2ecf20Sopenharmony_ci
11128c2ecf20Sopenharmony_cistatic __always_inline bool memcg_kmem_bypass(void)
11138c2ecf20Sopenharmony_ci{
11148c2ecf20Sopenharmony_ci	/* Allow remote memcg charging from any context. */
11158c2ecf20Sopenharmony_ci	if (unlikely(active_memcg()))
11168c2ecf20Sopenharmony_ci		return false;
11178c2ecf20Sopenharmony_ci
11188c2ecf20Sopenharmony_ci	/* Memcg to charge can't be determined. */
11198c2ecf20Sopenharmony_ci	if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
11208c2ecf20Sopenharmony_ci		return true;
11218c2ecf20Sopenharmony_ci
11228c2ecf20Sopenharmony_ci	return false;
11238c2ecf20Sopenharmony_ci}
11248c2ecf20Sopenharmony_ci
11258c2ecf20Sopenharmony_ci/**
11268c2ecf20Sopenharmony_ci * If active memcg is set, do not fall back to current->mm->memcg.
11278c2ecf20Sopenharmony_ci */
11288c2ecf20Sopenharmony_cistatic __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
11298c2ecf20Sopenharmony_ci{
11308c2ecf20Sopenharmony_ci	if (memcg_kmem_bypass())
11318c2ecf20Sopenharmony_ci		return NULL;
11328c2ecf20Sopenharmony_ci
11338c2ecf20Sopenharmony_ci	if (unlikely(active_memcg()))
11348c2ecf20Sopenharmony_ci		return get_active_memcg();
11358c2ecf20Sopenharmony_ci
11368c2ecf20Sopenharmony_ci	return get_mem_cgroup_from_mm(current->mm);
11378c2ecf20Sopenharmony_ci}
11388c2ecf20Sopenharmony_ci
11398c2ecf20Sopenharmony_ci/**
11408c2ecf20Sopenharmony_ci * mem_cgroup_iter - iterate over memory cgroup hierarchy
11418c2ecf20Sopenharmony_ci * @root: hierarchy root
11428c2ecf20Sopenharmony_ci * @prev: previously returned memcg, NULL on first invocation
11438c2ecf20Sopenharmony_ci * @reclaim: cookie for shared reclaim walks, NULL for full walks
11448c2ecf20Sopenharmony_ci *
11458c2ecf20Sopenharmony_ci * Returns references to children of the hierarchy below @root, or
11468c2ecf20Sopenharmony_ci * @root itself, or %NULL after a full round-trip.
11478c2ecf20Sopenharmony_ci *
11488c2ecf20Sopenharmony_ci * Caller must pass the return value in @prev on subsequent
11498c2ecf20Sopenharmony_ci * invocations for reference counting, or use mem_cgroup_iter_break()
11508c2ecf20Sopenharmony_ci * to cancel a hierarchy walk before the round-trip is complete.
11518c2ecf20Sopenharmony_ci *
11528c2ecf20Sopenharmony_ci * Reclaimers can specify a node in @reclaim to divide up the memcgs
11538c2ecf20Sopenharmony_ci * in the hierarchy among all concurrent reclaimers operating on the
11548c2ecf20Sopenharmony_ci * same node.
11558c2ecf20Sopenharmony_ci */
11568c2ecf20Sopenharmony_cistruct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
11578c2ecf20Sopenharmony_ci				   struct mem_cgroup *prev,
11588c2ecf20Sopenharmony_ci				   struct mem_cgroup_reclaim_cookie *reclaim)
11598c2ecf20Sopenharmony_ci{
11608c2ecf20Sopenharmony_ci	struct mem_cgroup_reclaim_iter *iter;
11618c2ecf20Sopenharmony_ci	struct cgroup_subsys_state *css = NULL;
11628c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = NULL;
11638c2ecf20Sopenharmony_ci	struct mem_cgroup *pos = NULL;
11648c2ecf20Sopenharmony_ci
11658c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
11668c2ecf20Sopenharmony_ci		return NULL;
11678c2ecf20Sopenharmony_ci
11688c2ecf20Sopenharmony_ci	if (!root)
11698c2ecf20Sopenharmony_ci		root = root_mem_cgroup;
11708c2ecf20Sopenharmony_ci
11718c2ecf20Sopenharmony_ci	if (prev && !reclaim)
11728c2ecf20Sopenharmony_ci		pos = prev;
11738c2ecf20Sopenharmony_ci
11748c2ecf20Sopenharmony_ci	if (!root->use_hierarchy && root != root_mem_cgroup) {
11758c2ecf20Sopenharmony_ci		if (prev)
11768c2ecf20Sopenharmony_ci			goto out;
11778c2ecf20Sopenharmony_ci		return root;
11788c2ecf20Sopenharmony_ci	}
11798c2ecf20Sopenharmony_ci
11808c2ecf20Sopenharmony_ci	rcu_read_lock();
11818c2ecf20Sopenharmony_ci
11828c2ecf20Sopenharmony_ci	if (reclaim) {
11838c2ecf20Sopenharmony_ci		struct mem_cgroup_per_node *mz;
11848c2ecf20Sopenharmony_ci
11858c2ecf20Sopenharmony_ci		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
11868c2ecf20Sopenharmony_ci		iter = &mz->iter;
11878c2ecf20Sopenharmony_ci
11888c2ecf20Sopenharmony_ci		if (prev && reclaim->generation != iter->generation)
11898c2ecf20Sopenharmony_ci			goto out_unlock;
11908c2ecf20Sopenharmony_ci
11918c2ecf20Sopenharmony_ci		while (1) {
11928c2ecf20Sopenharmony_ci			pos = READ_ONCE(iter->position);
11938c2ecf20Sopenharmony_ci			if (!pos || css_tryget(&pos->css))
11948c2ecf20Sopenharmony_ci				break;
11958c2ecf20Sopenharmony_ci			/*
11968c2ecf20Sopenharmony_ci			 * css reference reached zero, so iter->position will
11978c2ecf20Sopenharmony_ci			 * be cleared by ->css_released. However, we should not
11988c2ecf20Sopenharmony_ci			 * rely on this happening soon, because ->css_released
11998c2ecf20Sopenharmony_ci			 * is called from a work queue, and by busy-waiting we
12008c2ecf20Sopenharmony_ci			 * might block it. So we clear iter->position right
12018c2ecf20Sopenharmony_ci			 * away.
12028c2ecf20Sopenharmony_ci			 */
12038c2ecf20Sopenharmony_ci			(void)cmpxchg(&iter->position, pos, NULL);
12048c2ecf20Sopenharmony_ci		}
12058c2ecf20Sopenharmony_ci	}
12068c2ecf20Sopenharmony_ci
12078c2ecf20Sopenharmony_ci	if (pos)
12088c2ecf20Sopenharmony_ci		css = &pos->css;
12098c2ecf20Sopenharmony_ci
12108c2ecf20Sopenharmony_ci	for (;;) {
12118c2ecf20Sopenharmony_ci		css = css_next_descendant_pre(css, &root->css);
12128c2ecf20Sopenharmony_ci		if (!css) {
12138c2ecf20Sopenharmony_ci			/*
12148c2ecf20Sopenharmony_ci			 * Reclaimers share the hierarchy walk, and a
12158c2ecf20Sopenharmony_ci			 * new one might jump in right at the end of
12168c2ecf20Sopenharmony_ci			 * the hierarchy - make sure they see at least
12178c2ecf20Sopenharmony_ci			 * one group and restart from the beginning.
12188c2ecf20Sopenharmony_ci			 */
12198c2ecf20Sopenharmony_ci			if (!prev)
12208c2ecf20Sopenharmony_ci				continue;
12218c2ecf20Sopenharmony_ci			break;
12228c2ecf20Sopenharmony_ci		}
12238c2ecf20Sopenharmony_ci
12248c2ecf20Sopenharmony_ci		/*
12258c2ecf20Sopenharmony_ci		 * Verify the css and acquire a reference.  The root
12268c2ecf20Sopenharmony_ci		 * is provided by the caller, so we know it's alive
12278c2ecf20Sopenharmony_ci		 * and kicking, and don't take an extra reference.
12288c2ecf20Sopenharmony_ci		 */
12298c2ecf20Sopenharmony_ci		memcg = mem_cgroup_from_css(css);
12308c2ecf20Sopenharmony_ci
12318c2ecf20Sopenharmony_ci		if (css == &root->css)
12328c2ecf20Sopenharmony_ci			break;
12338c2ecf20Sopenharmony_ci
12348c2ecf20Sopenharmony_ci		if (css_tryget(css))
12358c2ecf20Sopenharmony_ci			break;
12368c2ecf20Sopenharmony_ci
12378c2ecf20Sopenharmony_ci		memcg = NULL;
12388c2ecf20Sopenharmony_ci	}
12398c2ecf20Sopenharmony_ci
12408c2ecf20Sopenharmony_ci	if (reclaim) {
12418c2ecf20Sopenharmony_ci		/*
12428c2ecf20Sopenharmony_ci		 * The position could have already been updated by a competing
12438c2ecf20Sopenharmony_ci		 * thread, so check that the value hasn't changed since we read
12448c2ecf20Sopenharmony_ci		 * it to avoid reclaiming from the same cgroup twice.
12458c2ecf20Sopenharmony_ci		 */
12468c2ecf20Sopenharmony_ci		(void)cmpxchg(&iter->position, pos, memcg);
12478c2ecf20Sopenharmony_ci
12488c2ecf20Sopenharmony_ci		if (pos)
12498c2ecf20Sopenharmony_ci			css_put(&pos->css);
12508c2ecf20Sopenharmony_ci
12518c2ecf20Sopenharmony_ci		if (!memcg)
12528c2ecf20Sopenharmony_ci			iter->generation++;
12538c2ecf20Sopenharmony_ci		else if (!prev)
12548c2ecf20Sopenharmony_ci			reclaim->generation = iter->generation;
12558c2ecf20Sopenharmony_ci	}
12568c2ecf20Sopenharmony_ci
12578c2ecf20Sopenharmony_ciout_unlock:
12588c2ecf20Sopenharmony_ci	rcu_read_unlock();
12598c2ecf20Sopenharmony_ciout:
12608c2ecf20Sopenharmony_ci	if (prev && prev != root)
12618c2ecf20Sopenharmony_ci		css_put(&prev->css);
12628c2ecf20Sopenharmony_ci
12638c2ecf20Sopenharmony_ci	return memcg;
12648c2ecf20Sopenharmony_ci}
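
/*
 * Illustrative usage sketch (not part of the original source): the common
 * full-walk pattern pairs mem_cgroup_iter() with mem_cgroup_iter_break()
 * when bailing out early, so the reference on the last returned memcg is
 * dropped. visit() is a hypothetical per-memcg operation.
 *
 *	struct mem_cgroup *memcg;
 *
 *	memcg = mem_cgroup_iter(root, NULL, NULL);
 *	do {
 *		if (visit(memcg)) {
 *			mem_cgroup_iter_break(root, memcg);
 *			break;
 *		}
 *	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
 */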
12658c2ecf20Sopenharmony_ci
12668c2ecf20Sopenharmony_ci/**
12678c2ecf20Sopenharmony_ci * mem_cgroup_iter_break - abort a hierarchy walk prematurely
12688c2ecf20Sopenharmony_ci * @root: hierarchy root
12698c2ecf20Sopenharmony_ci * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
12708c2ecf20Sopenharmony_ci */
12718c2ecf20Sopenharmony_civoid mem_cgroup_iter_break(struct mem_cgroup *root,
12728c2ecf20Sopenharmony_ci			   struct mem_cgroup *prev)
12738c2ecf20Sopenharmony_ci{
12748c2ecf20Sopenharmony_ci	if (!root)
12758c2ecf20Sopenharmony_ci		root = root_mem_cgroup;
12768c2ecf20Sopenharmony_ci	if (prev && prev != root)
12778c2ecf20Sopenharmony_ci		css_put(&prev->css);
12788c2ecf20Sopenharmony_ci}
12798c2ecf20Sopenharmony_ci
12808c2ecf20Sopenharmony_cistatic void __invalidate_reclaim_iterators(struct mem_cgroup *from,
12818c2ecf20Sopenharmony_ci					struct mem_cgroup *dead_memcg)
12828c2ecf20Sopenharmony_ci{
12838c2ecf20Sopenharmony_ci	struct mem_cgroup_reclaim_iter *iter;
12848c2ecf20Sopenharmony_ci	struct mem_cgroup_per_node *mz;
12858c2ecf20Sopenharmony_ci	int nid;
12868c2ecf20Sopenharmony_ci
12878c2ecf20Sopenharmony_ci	for_each_node(nid) {
12888c2ecf20Sopenharmony_ci		mz = mem_cgroup_nodeinfo(from, nid);
12898c2ecf20Sopenharmony_ci		iter = &mz->iter;
12908c2ecf20Sopenharmony_ci		cmpxchg(&iter->position, dead_memcg, NULL);
12918c2ecf20Sopenharmony_ci	}
12928c2ecf20Sopenharmony_ci}
12938c2ecf20Sopenharmony_ci
12948c2ecf20Sopenharmony_cistatic void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
12958c2ecf20Sopenharmony_ci{
12968c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = dead_memcg;
12978c2ecf20Sopenharmony_ci	struct mem_cgroup *last;
12988c2ecf20Sopenharmony_ci
12998c2ecf20Sopenharmony_ci	do {
13008c2ecf20Sopenharmony_ci		__invalidate_reclaim_iterators(memcg, dead_memcg);
13018c2ecf20Sopenharmony_ci		last = memcg;
13028c2ecf20Sopenharmony_ci	} while ((memcg = parent_mem_cgroup(memcg)));
13038c2ecf20Sopenharmony_ci
13048c2ecf20Sopenharmony_ci	/*
13058c2ecf20Sopenharmony_ci	 * When cgroup1 non-hierarchy mode is used,
13068c2ecf20Sopenharmony_ci	 * parent_mem_cgroup() does not walk all the way up to the
13078c2ecf20Sopenharmony_ci	 * cgroup root (root_mem_cgroup). So we have to handle
13088c2ecf20Sopenharmony_ci	 * dead_memcg from cgroup root separately.
13098c2ecf20Sopenharmony_ci	 */
13108c2ecf20Sopenharmony_ci	if (last != root_mem_cgroup)
13118c2ecf20Sopenharmony_ci		__invalidate_reclaim_iterators(root_mem_cgroup,
13128c2ecf20Sopenharmony_ci						dead_memcg);
13138c2ecf20Sopenharmony_ci}
13148c2ecf20Sopenharmony_ci
13158c2ecf20Sopenharmony_ci/**
13168c2ecf20Sopenharmony_ci * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
13178c2ecf20Sopenharmony_ci * @memcg: hierarchy root
13188c2ecf20Sopenharmony_ci * @fn: function to call for each task
13198c2ecf20Sopenharmony_ci * @arg: argument passed to @fn
13208c2ecf20Sopenharmony_ci *
13218c2ecf20Sopenharmony_ci * This function iterates over tasks attached to @memcg or to any of its
13228c2ecf20Sopenharmony_ci * descendants and calls @fn for each task. If @fn returns a non-zero
13238c2ecf20Sopenharmony_ci * value, the function breaks the iteration loop and returns the value.
13248c2ecf20Sopenharmony_ci * Otherwise, it will iterate over all tasks and return 0.
13258c2ecf20Sopenharmony_ci *
13268c2ecf20Sopenharmony_ci * This function must not be called for the root memory cgroup.
13278c2ecf20Sopenharmony_ci */
13288c2ecf20Sopenharmony_ciint mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
13298c2ecf20Sopenharmony_ci			  int (*fn)(struct task_struct *, void *), void *arg)
13308c2ecf20Sopenharmony_ci{
13318c2ecf20Sopenharmony_ci	struct mem_cgroup *iter;
13328c2ecf20Sopenharmony_ci	int ret = 0;
13338c2ecf20Sopenharmony_ci
13348c2ecf20Sopenharmony_ci	BUG_ON(memcg == root_mem_cgroup);
13358c2ecf20Sopenharmony_ci
13368c2ecf20Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg) {
13378c2ecf20Sopenharmony_ci		struct css_task_iter it;
13388c2ecf20Sopenharmony_ci		struct task_struct *task;
13398c2ecf20Sopenharmony_ci
13408c2ecf20Sopenharmony_ci		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
13418c2ecf20Sopenharmony_ci		while (!ret && (task = css_task_iter_next(&it)))
13428c2ecf20Sopenharmony_ci			ret = fn(task, arg);
13438c2ecf20Sopenharmony_ci		css_task_iter_end(&it);
13448c2ecf20Sopenharmony_ci		if (ret) {
13458c2ecf20Sopenharmony_ci			mem_cgroup_iter_break(memcg, iter);
13468c2ecf20Sopenharmony_ci			break;
13478c2ecf20Sopenharmony_ci		}
13488c2ecf20Sopenharmony_ci	}
13498c2ecf20Sopenharmony_ci	return ret;
13508c2ecf20Sopenharmony_ci}
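
/*
 * Illustrative usage sketch (not part of the original source): a callback
 * returns 0 to keep iterating and non-zero to stop the walk. count_task()
 * and the "nr" counter below are hypothetical.
 *
 *	static int count_task(struct task_struct *task, void *arg)
 *	{
 *		(*(unsigned int *)arg)++;
 *		return 0;
 *	}
 *
 *	unsigned int nr = 0;
 *
 *	mem_cgroup_scan_tasks(memcg, count_task, &nr);
 */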
13518c2ecf20Sopenharmony_ci
13528c2ecf20Sopenharmony_ci/**
13538c2ecf20Sopenharmony_ci * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
13548c2ecf20Sopenharmony_ci * @page: the page
13558c2ecf20Sopenharmony_ci * @pgdat: pgdat of the page
13568c2ecf20Sopenharmony_ci *
13578c2ecf20Sopenharmony_ci * This function relies on page->mem_cgroup being stable - see the
13588c2ecf20Sopenharmony_ci * access rules in commit_charge().
13598c2ecf20Sopenharmony_ci */
13608c2ecf20Sopenharmony_cistruct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
13618c2ecf20Sopenharmony_ci{
13628c2ecf20Sopenharmony_ci	struct mem_cgroup_per_node *mz;
13638c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
13648c2ecf20Sopenharmony_ci	struct lruvec *lruvec;
13658c2ecf20Sopenharmony_ci
13668c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled()) {
13678c2ecf20Sopenharmony_ci		lruvec = &pgdat->__lruvec;
13688c2ecf20Sopenharmony_ci		goto out;
13698c2ecf20Sopenharmony_ci	}
13708c2ecf20Sopenharmony_ci
13718c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU
13728c2ecf20Sopenharmony_ci	if (page_is_file_lru(page) &&
13738c2ecf20Sopenharmony_ci	    !is_prot_page(page)) {
13748c2ecf20Sopenharmony_ci		lruvec = node_lruvec(pgdat);
13758c2ecf20Sopenharmony_ci		goto out;
13768c2ecf20Sopenharmony_ci	}
13778c2ecf20Sopenharmony_ci#endif
13788c2ecf20Sopenharmony_ci	memcg = page->mem_cgroup;
13798c2ecf20Sopenharmony_ci	/*
13808c2ecf20Sopenharmony_ci	 * Swapcache readahead pages are added to the LRU - and
13818c2ecf20Sopenharmony_ci	 * possibly migrated - before they are charged.
13828c2ecf20Sopenharmony_ci	 */
13838c2ecf20Sopenharmony_ci	if (!memcg)
13848c2ecf20Sopenharmony_ci		memcg = root_mem_cgroup;
13858c2ecf20Sopenharmony_ci
13868c2ecf20Sopenharmony_ci	mz = mem_cgroup_page_nodeinfo(memcg, page);
13878c2ecf20Sopenharmony_ci	lruvec = &mz->lruvec;
13888c2ecf20Sopenharmony_ciout:
13898c2ecf20Sopenharmony_ci	/*
13908c2ecf20Sopenharmony_ci	 * Since a node can be onlined after the mem_cgroup was created,
13918c2ecf20Sopenharmony_ci	 * we have to be prepared to initialize lruvec->pgdat here;
13928c2ecf20Sopenharmony_ci	 * and if offlined then reonlined, we need to reinitialize it.
13938c2ecf20Sopenharmony_ci	 */
13948c2ecf20Sopenharmony_ci	if (unlikely(lruvec->pgdat != pgdat))
13958c2ecf20Sopenharmony_ci		lruvec->pgdat = pgdat;
13968c2ecf20Sopenharmony_ci	return lruvec;
13978c2ecf20Sopenharmony_ci}
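
/*
 * Illustrative usage sketch (not part of the original source): LRU
 * manipulation looks up the page's lruvec while the caller already holds
 * the LRU lock, so that page->mem_cgroup cannot change underneath.
 *
 *	(lru_lock already held by the caller)
 *	struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 *
 *	... add the page to, or remove it from, one of lruvec->lists[] ...
 */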
13988c2ecf20Sopenharmony_ci
13998c2ecf20Sopenharmony_ci/**
14008c2ecf20Sopenharmony_ci * mem_cgroup_update_lru_size - account for adding or removing an lru page
14018c2ecf20Sopenharmony_ci * @lruvec: mem_cgroup per zone lru vector
14028c2ecf20Sopenharmony_ci * @lru: index of lru list the page is sitting on
14038c2ecf20Sopenharmony_ci * @zid: zone id of the accounted pages
14048c2ecf20Sopenharmony_ci * @nr_pages: positive when adding or negative when removing
14058c2ecf20Sopenharmony_ci *
14068c2ecf20Sopenharmony_ci * This function must be called under lru_lock, just before a page is added
14078c2ecf20Sopenharmony_ci * to or just after a page is removed from an lru list. That ordering allows
14088c2ecf20Sopenharmony_ci * it to check that an lru_size of 0 is consistent with list_empty.
14098c2ecf20Sopenharmony_ci */
14108c2ecf20Sopenharmony_civoid mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
14118c2ecf20Sopenharmony_ci				int zid, int nr_pages)
14128c2ecf20Sopenharmony_ci{
14138c2ecf20Sopenharmony_ci	struct mem_cgroup_per_node *mz;
14148c2ecf20Sopenharmony_ci	unsigned long *lru_size;
14158c2ecf20Sopenharmony_ci	long size;
14168c2ecf20Sopenharmony_ci
14178c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
14188c2ecf20Sopenharmony_ci		return;
14198c2ecf20Sopenharmony_ci
14208c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU
14218c2ecf20Sopenharmony_ci	if (is_node_lruvec(lruvec))
14228c2ecf20Sopenharmony_ci		return;
14238c2ecf20Sopenharmony_ci#endif
14248c2ecf20Sopenharmony_ci	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
14258c2ecf20Sopenharmony_ci	lru_size = &mz->lru_zone_size[zid][lru];
14268c2ecf20Sopenharmony_ci
14278c2ecf20Sopenharmony_ci	if (nr_pages < 0)
14288c2ecf20Sopenharmony_ci		*lru_size += nr_pages;
14298c2ecf20Sopenharmony_ci
14308c2ecf20Sopenharmony_ci	size = *lru_size;
14318c2ecf20Sopenharmony_ci	if (WARN_ONCE(size < 0,
14328c2ecf20Sopenharmony_ci		"%s(%p, %d, %d): lru_size %ld\n",
14338c2ecf20Sopenharmony_ci		__func__, lruvec, lru, nr_pages, size)) {
14348c2ecf20Sopenharmony_ci		VM_BUG_ON(1);
14358c2ecf20Sopenharmony_ci		*lru_size = 0;
14368c2ecf20Sopenharmony_ci	}
14378c2ecf20Sopenharmony_ci
14388c2ecf20Sopenharmony_ci	if (nr_pages > 0)
14398c2ecf20Sopenharmony_ci		*lru_size += nr_pages;
14408c2ecf20Sopenharmony_ci}
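
/*
 * Illustrative sketch of the ordering documented above (not part of the
 * original source): the accounting update brackets the list operation so
 * that lru_size == 0 always implies an empty list. Here "zid" and
 * "nr_pages" stand for page_zonenum(page) and the page's subpage count.
 *
 *	(adding: account first, then link)
 *	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 *	list_add(&page->lru, &lruvec->lists[lru]);
 *
 *	(removing: unlink first, then account with a negative delta)
 *	list_del(&page->lru);
 *	mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_pages);
 */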
14418c2ecf20Sopenharmony_ci
14428c2ecf20Sopenharmony_ci/**
14438c2ecf20Sopenharmony_ci * mem_cgroup_margin - calculate chargeable space of a memory cgroup
14448c2ecf20Sopenharmony_ci * @memcg: the memory cgroup
14458c2ecf20Sopenharmony_ci *
14468c2ecf20Sopenharmony_ci * Returns the maximum amount of memory @memcg can be charged with, in
14478c2ecf20Sopenharmony_ci * pages.
14488c2ecf20Sopenharmony_ci */
14498c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
14508c2ecf20Sopenharmony_ci{
14518c2ecf20Sopenharmony_ci	unsigned long margin = 0;
14528c2ecf20Sopenharmony_ci	unsigned long count;
14538c2ecf20Sopenharmony_ci	unsigned long limit;
14548c2ecf20Sopenharmony_ci
14558c2ecf20Sopenharmony_ci	count = page_counter_read(&memcg->memory);
14568c2ecf20Sopenharmony_ci	limit = READ_ONCE(memcg->memory.max);
14578c2ecf20Sopenharmony_ci	if (count < limit)
14588c2ecf20Sopenharmony_ci		margin = limit - count;
14598c2ecf20Sopenharmony_ci
14608c2ecf20Sopenharmony_ci	if (do_memsw_account()) {
14618c2ecf20Sopenharmony_ci		count = page_counter_read(&memcg->memsw);
14628c2ecf20Sopenharmony_ci		limit = READ_ONCE(memcg->memsw.max);
14638c2ecf20Sopenharmony_ci		if (count < limit)
14648c2ecf20Sopenharmony_ci			margin = min(margin, limit - count);
14658c2ecf20Sopenharmony_ci		else
14668c2ecf20Sopenharmony_ci			margin = 0;
14678c2ecf20Sopenharmony_ci	}
14688c2ecf20Sopenharmony_ci
14698c2ecf20Sopenharmony_ci	return margin;
14708c2ecf20Sopenharmony_ci}
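
/*
 * Worked example (illustrative numbers, not from the original source): with
 * memory.max == 1000 pages and 700 pages charged, the memory margin is 300
 * pages; if memsw accounting is active with memsw.max == 1000 and 980 pages
 * charged, the returned margin is min(300, 20) == 20 pages.
 */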
14718c2ecf20Sopenharmony_ci
14728c2ecf20Sopenharmony_ci/*
14738c2ecf20Sopenharmony_ci * A routine for checking whether "mem" is under move_account() or not.
14748c2ecf20Sopenharmony_ci *
14758c2ecf20Sopenharmony_ci * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of the
14768c2ecf20Sopenharmony_ci * moving cgroups. This is used for waiting at high memory pressure
14778c2ecf20Sopenharmony_ci * caused by "move".
14788c2ecf20Sopenharmony_ci */
14798c2ecf20Sopenharmony_cistatic bool mem_cgroup_under_move(struct mem_cgroup *memcg)
14808c2ecf20Sopenharmony_ci{
14818c2ecf20Sopenharmony_ci	struct mem_cgroup *from;
14828c2ecf20Sopenharmony_ci	struct mem_cgroup *to;
14838c2ecf20Sopenharmony_ci	bool ret = false;
14848c2ecf20Sopenharmony_ci	/*
14858c2ecf20Sopenharmony_ci	 * Unlike task_move routines, we access mc.to, mc.from not under
14868c2ecf20Sopenharmony_ci	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
14878c2ecf20Sopenharmony_ci	 */
14888c2ecf20Sopenharmony_ci	spin_lock(&mc.lock);
14898c2ecf20Sopenharmony_ci	from = mc.from;
14908c2ecf20Sopenharmony_ci	to = mc.to;
14918c2ecf20Sopenharmony_ci	if (!from)
14928c2ecf20Sopenharmony_ci		goto unlock;
14938c2ecf20Sopenharmony_ci
14948c2ecf20Sopenharmony_ci	ret = mem_cgroup_is_descendant(from, memcg) ||
14958c2ecf20Sopenharmony_ci		mem_cgroup_is_descendant(to, memcg);
14968c2ecf20Sopenharmony_ciunlock:
14978c2ecf20Sopenharmony_ci	spin_unlock(&mc.lock);
14988c2ecf20Sopenharmony_ci	return ret;
14998c2ecf20Sopenharmony_ci}
15008c2ecf20Sopenharmony_ci
15018c2ecf20Sopenharmony_cistatic bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
15028c2ecf20Sopenharmony_ci{
15038c2ecf20Sopenharmony_ci	if (mc.moving_task && current != mc.moving_task) {
15048c2ecf20Sopenharmony_ci		if (mem_cgroup_under_move(memcg)) {
15058c2ecf20Sopenharmony_ci			DEFINE_WAIT(wait);
15068c2ecf20Sopenharmony_ci			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
15078c2ecf20Sopenharmony_ci			/* moving charge context might have finished. */
15088c2ecf20Sopenharmony_ci			if (mc.moving_task)
15098c2ecf20Sopenharmony_ci				schedule();
15108c2ecf20Sopenharmony_ci			finish_wait(&mc.waitq, &wait);
15118c2ecf20Sopenharmony_ci			return true;
15128c2ecf20Sopenharmony_ci		}
15138c2ecf20Sopenharmony_ci	}
15148c2ecf20Sopenharmony_ci	return false;
15158c2ecf20Sopenharmony_ci}
15168c2ecf20Sopenharmony_ci
15178c2ecf20Sopenharmony_cistruct memory_stat {
15188c2ecf20Sopenharmony_ci	const char *name;
15198c2ecf20Sopenharmony_ci	unsigned int ratio;
15208c2ecf20Sopenharmony_ci	unsigned int idx;
15218c2ecf20Sopenharmony_ci};
15228c2ecf20Sopenharmony_ci
15238c2ecf20Sopenharmony_cistatic struct memory_stat memory_stats[] = {
15248c2ecf20Sopenharmony_ci	{ "anon", PAGE_SIZE, NR_ANON_MAPPED },
15258c2ecf20Sopenharmony_ci	{ "file", PAGE_SIZE, NR_FILE_PAGES },
15268c2ecf20Sopenharmony_ci	{ "kernel_stack", 1024, NR_KERNEL_STACK_KB },
15278c2ecf20Sopenharmony_ci	{ "percpu", 1, MEMCG_PERCPU_B },
15288c2ecf20Sopenharmony_ci	{ "sock", PAGE_SIZE, MEMCG_SOCK },
15298c2ecf20Sopenharmony_ci	{ "shmem", PAGE_SIZE, NR_SHMEM },
15308c2ecf20Sopenharmony_ci	{ "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
15318c2ecf20Sopenharmony_ci	{ "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
15328c2ecf20Sopenharmony_ci	{ "file_writeback", PAGE_SIZE, NR_WRITEBACK },
15338c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
15348c2ecf20Sopenharmony_ci	/*
15358c2ecf20Sopenharmony_ci	 * The ratio will be initialized in memory_stats_init(), because
15368c2ecf20Sopenharmony_ci	 * on some architectures the HPAGE_PMD_SIZE macro is not a
15378c2ecf20Sopenharmony_ci	 * compile-time constant (e.g. powerpc).
15388c2ecf20Sopenharmony_ci	 */
15398c2ecf20Sopenharmony_ci	{ "anon_thp", 0, NR_ANON_THPS },
15408c2ecf20Sopenharmony_ci#endif
15418c2ecf20Sopenharmony_ci	{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
15428c2ecf20Sopenharmony_ci	{ "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
15438c2ecf20Sopenharmony_ci	{ "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
15448c2ecf20Sopenharmony_ci	{ "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
15458c2ecf20Sopenharmony_ci	{ "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
15468c2ecf20Sopenharmony_ci
15478c2ecf20Sopenharmony_ci	/*
15488c2ecf20Sopenharmony_ci	 * Note: slab_reclaimable and slab_unreclaimable must be adjacent,
15498c2ecf20Sopenharmony_ci	 * with slab_reclaimable listed first.
15508c2ecf20Sopenharmony_ci	 */
15518c2ecf20Sopenharmony_ci	{ "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
15528c2ecf20Sopenharmony_ci	{ "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
15538c2ecf20Sopenharmony_ci
15548c2ecf20Sopenharmony_ci	/* The memory events */
15558c2ecf20Sopenharmony_ci	{ "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
15568c2ecf20Sopenharmony_ci	{ "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
15578c2ecf20Sopenharmony_ci	{ "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
15588c2ecf20Sopenharmony_ci	{ "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
15598c2ecf20Sopenharmony_ci	{ "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
15608c2ecf20Sopenharmony_ci	{ "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
15618c2ecf20Sopenharmony_ci	{ "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
15628c2ecf20Sopenharmony_ci};
15638c2ecf20Sopenharmony_ci
15648c2ecf20Sopenharmony_cistatic int __init memory_stats_init(void)
15658c2ecf20Sopenharmony_ci{
15668c2ecf20Sopenharmony_ci	int i;
15678c2ecf20Sopenharmony_ci
15688c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
15698c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
15708c2ecf20Sopenharmony_ci		if (memory_stats[i].idx == NR_ANON_THPS)
15718c2ecf20Sopenharmony_ci			memory_stats[i].ratio = HPAGE_PMD_SIZE;
15728c2ecf20Sopenharmony_ci#endif
15738c2ecf20Sopenharmony_ci		VM_BUG_ON(!memory_stats[i].ratio);
15748c2ecf20Sopenharmony_ci		VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
15758c2ecf20Sopenharmony_ci	}
15768c2ecf20Sopenharmony_ci
15778c2ecf20Sopenharmony_ci	return 0;
15788c2ecf20Sopenharmony_ci}
15798c2ecf20Sopenharmony_cipure_initcall(memory_stats_init);
15808c2ecf20Sopenharmony_ci
15818c2ecf20Sopenharmony_cistatic char *memory_stat_format(struct mem_cgroup *memcg)
15828c2ecf20Sopenharmony_ci{
15838c2ecf20Sopenharmony_ci	struct seq_buf s;
15848c2ecf20Sopenharmony_ci	int i;
15858c2ecf20Sopenharmony_ci
15868c2ecf20Sopenharmony_ci	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
15878c2ecf20Sopenharmony_ci	if (!s.buffer)
15888c2ecf20Sopenharmony_ci		return NULL;
15898c2ecf20Sopenharmony_ci
15908c2ecf20Sopenharmony_ci	/*
15918c2ecf20Sopenharmony_ci	 * Provide statistics on the state of the memory subsystem as
15928c2ecf20Sopenharmony_ci	 * well as cumulative event counters that show past behavior.
15938c2ecf20Sopenharmony_ci	 *
15948c2ecf20Sopenharmony_ci	 * This list is ordered following a combination of these gradients:
15958c2ecf20Sopenharmony_ci	 * 1) generic big picture -> specifics and details
15968c2ecf20Sopenharmony_ci	 * 2) reflecting userspace activity -> reflecting kernel heuristics
15978c2ecf20Sopenharmony_ci	 *
15988c2ecf20Sopenharmony_ci	 * Current memory state:
15998c2ecf20Sopenharmony_ci	 */
16008c2ecf20Sopenharmony_ci
16018c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
16028c2ecf20Sopenharmony_ci		u64 size;
16038c2ecf20Sopenharmony_ci
16048c2ecf20Sopenharmony_ci		size = memcg_page_state(memcg, memory_stats[i].idx);
16058c2ecf20Sopenharmony_ci		size *= memory_stats[i].ratio;
16068c2ecf20Sopenharmony_ci		seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
16078c2ecf20Sopenharmony_ci
16088c2ecf20Sopenharmony_ci		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
16098c2ecf20Sopenharmony_ci			size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
16108c2ecf20Sopenharmony_ci			       memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
16118c2ecf20Sopenharmony_ci			seq_buf_printf(&s, "slab %llu\n", size);
16128c2ecf20Sopenharmony_ci		}
16138c2ecf20Sopenharmony_ci	}
16148c2ecf20Sopenharmony_ci
16158c2ecf20Sopenharmony_ci	/* Accumulated memory events */
16168c2ecf20Sopenharmony_ci
16178c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
16188c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGFAULT));
16198c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
16208c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGMAJFAULT));
16218c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
16228c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGREFILL));
16238c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "pgscan %lu\n",
16248c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGSCAN_KSWAPD) +
16258c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGSCAN_DIRECT));
16268c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "pgsteal %lu\n",
16278c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGSTEAL_KSWAPD) +
16288c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGSTEAL_DIRECT));
16298c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
16308c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGACTIVATE));
16318c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
16328c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGDEACTIVATE));
16338c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
16348c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGLAZYFREE));
16358c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
16368c2ecf20Sopenharmony_ci		       memcg_events(memcg, PGLAZYFREED));
16378c2ecf20Sopenharmony_ci
16388c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
16398c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
16408c2ecf20Sopenharmony_ci		       memcg_events(memcg, THP_FAULT_ALLOC));
16418c2ecf20Sopenharmony_ci	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
16428c2ecf20Sopenharmony_ci		       memcg_events(memcg, THP_COLLAPSE_ALLOC));
16438c2ecf20Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
16448c2ecf20Sopenharmony_ci
16458c2ecf20Sopenharmony_ci	/* The above should easily fit into one page */
16468c2ecf20Sopenharmony_ci	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
16478c2ecf20Sopenharmony_ci
16488c2ecf20Sopenharmony_ci	return s.buffer;
16498c2ecf20Sopenharmony_ci}
16508c2ecf20Sopenharmony_ci
16518c2ecf20Sopenharmony_ci#define K(x) ((x) << (PAGE_SHIFT-10))
16528c2ecf20Sopenharmony_ci/**
16538c2ecf20Sopenharmony_ci * mem_cgroup_print_oom_context: Print OOM information relevant to
16548c2ecf20Sopenharmony_ci * the memory controller.
16558c2ecf20Sopenharmony_ci * @memcg: The memory cgroup that went over limit
16568c2ecf20Sopenharmony_ci * @p: Task that is going to be killed
16578c2ecf20Sopenharmony_ci *
16588c2ecf20Sopenharmony_ci * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
16598c2ecf20Sopenharmony_ci * enabled
16608c2ecf20Sopenharmony_ci */
16618c2ecf20Sopenharmony_civoid mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
16628c2ecf20Sopenharmony_ci{
16638c2ecf20Sopenharmony_ci	rcu_read_lock();
16648c2ecf20Sopenharmony_ci
16658c2ecf20Sopenharmony_ci	if (memcg) {
16668c2ecf20Sopenharmony_ci		pr_cont(",oom_memcg=");
16678c2ecf20Sopenharmony_ci		pr_cont_cgroup_path(memcg->css.cgroup);
16688c2ecf20Sopenharmony_ci	} else
16698c2ecf20Sopenharmony_ci		pr_cont(",global_oom");
16708c2ecf20Sopenharmony_ci	if (p) {
16718c2ecf20Sopenharmony_ci		pr_cont(",task_memcg=");
16728c2ecf20Sopenharmony_ci		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
16738c2ecf20Sopenharmony_ci	}
16748c2ecf20Sopenharmony_ci	rcu_read_unlock();
16758c2ecf20Sopenharmony_ci}
16768c2ecf20Sopenharmony_ci
16778c2ecf20Sopenharmony_ci/**
16788c2ecf20Sopenharmony_ci * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
16798c2ecf20Sopenharmony_ci * the memory controller.
16808c2ecf20Sopenharmony_ci * @memcg: The memory cgroup that went over limit
16818c2ecf20Sopenharmony_ci */
16828c2ecf20Sopenharmony_civoid mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
16838c2ecf20Sopenharmony_ci{
16848c2ecf20Sopenharmony_ci	char *buf;
16858c2ecf20Sopenharmony_ci
16868c2ecf20Sopenharmony_ci	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
16878c2ecf20Sopenharmony_ci		K((u64)page_counter_read(&memcg->memory)),
16888c2ecf20Sopenharmony_ci		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
16898c2ecf20Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
16908c2ecf20Sopenharmony_ci		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
16918c2ecf20Sopenharmony_ci			K((u64)page_counter_read(&memcg->swap)),
16928c2ecf20Sopenharmony_ci			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
16938c2ecf20Sopenharmony_ci	else {
16948c2ecf20Sopenharmony_ci		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
16958c2ecf20Sopenharmony_ci			K((u64)page_counter_read(&memcg->memsw)),
16968c2ecf20Sopenharmony_ci			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
16978c2ecf20Sopenharmony_ci		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
16988c2ecf20Sopenharmony_ci			K((u64)page_counter_read(&memcg->kmem)),
16998c2ecf20Sopenharmony_ci			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
17008c2ecf20Sopenharmony_ci	}
17018c2ecf20Sopenharmony_ci
17028c2ecf20Sopenharmony_ci	pr_info("Memory cgroup stats for ");
17038c2ecf20Sopenharmony_ci	pr_cont_cgroup_path(memcg->css.cgroup);
17048c2ecf20Sopenharmony_ci	pr_cont(":");
17058c2ecf20Sopenharmony_ci	buf = memory_stat_format(memcg);
17068c2ecf20Sopenharmony_ci	if (!buf)
17078c2ecf20Sopenharmony_ci		return;
17088c2ecf20Sopenharmony_ci	pr_info("%s", buf);
17098c2ecf20Sopenharmony_ci	kfree(buf);
17108c2ecf20Sopenharmony_ci}
17118c2ecf20Sopenharmony_ci
17128c2ecf20Sopenharmony_ci/*
17138c2ecf20Sopenharmony_ci * Return the memory (and swap, if configured) limit for a memcg.
17148c2ecf20Sopenharmony_ci */
17158c2ecf20Sopenharmony_ciunsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
17168c2ecf20Sopenharmony_ci{
17178c2ecf20Sopenharmony_ci	unsigned long max = READ_ONCE(memcg->memory.max);
17188c2ecf20Sopenharmony_ci
17198c2ecf20Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
17208c2ecf20Sopenharmony_ci		if (mem_cgroup_swappiness(memcg))
17218c2ecf20Sopenharmony_ci			max += min(READ_ONCE(memcg->swap.max),
17228c2ecf20Sopenharmony_ci				   (unsigned long)total_swap_pages);
17238c2ecf20Sopenharmony_ci	} else { /* v1 */
17248c2ecf20Sopenharmony_ci		if (mem_cgroup_swappiness(memcg)) {
17258c2ecf20Sopenharmony_ci			/* Calculate swap excess capacity from memsw limit */
17268c2ecf20Sopenharmony_ci			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
17278c2ecf20Sopenharmony_ci
17288c2ecf20Sopenharmony_ci			max += min(swap, (unsigned long)total_swap_pages);
17298c2ecf20Sopenharmony_ci		}
17308c2ecf20Sopenharmony_ci	}
17318c2ecf20Sopenharmony_ci	return max;
17328c2ecf20Sopenharmony_ci}
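
/*
 * Worked example (illustrative numbers, not from the original source): on
 * cgroup2 with swappiness enabled, memory.max == 1000 pages, swap.max ==
 * 500 pages and 2000 pages of total swap give max == 1000 + min(500, 2000)
 * == 1500. On cgroup1, memory.max == 1000 and memsw.max == 1200 give a
 * swap excess of 200, so max == 1000 + min(200, total_swap_pages).
 */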
17338c2ecf20Sopenharmony_ci
17348c2ecf20Sopenharmony_ciunsigned long mem_cgroup_size(struct mem_cgroup *memcg)
17358c2ecf20Sopenharmony_ci{
17368c2ecf20Sopenharmony_ci	return page_counter_read(&memcg->memory);
17378c2ecf20Sopenharmony_ci}
17388c2ecf20Sopenharmony_ci
17398c2ecf20Sopenharmony_cistatic bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
17408c2ecf20Sopenharmony_ci				     int order)
17418c2ecf20Sopenharmony_ci{
17428c2ecf20Sopenharmony_ci	struct oom_control oc = {
17438c2ecf20Sopenharmony_ci		.zonelist = NULL,
17448c2ecf20Sopenharmony_ci		.nodemask = NULL,
17458c2ecf20Sopenharmony_ci		.memcg = memcg,
17468c2ecf20Sopenharmony_ci		.gfp_mask = gfp_mask,
17478c2ecf20Sopenharmony_ci		.order = order,
17488c2ecf20Sopenharmony_ci	};
17498c2ecf20Sopenharmony_ci	bool ret = true;
17508c2ecf20Sopenharmony_ci
17518c2ecf20Sopenharmony_ci	if (mutex_lock_killable(&oom_lock))
17528c2ecf20Sopenharmony_ci		return true;
17538c2ecf20Sopenharmony_ci
17548c2ecf20Sopenharmony_ci	if (mem_cgroup_margin(memcg) >= (1 << order))
17558c2ecf20Sopenharmony_ci		goto unlock;
17568c2ecf20Sopenharmony_ci
17578c2ecf20Sopenharmony_ci	/*
17588c2ecf20Sopenharmony_ci	 * A few threads which were not waiting at mutex_lock_killable() can
17598c2ecf20Sopenharmony_ci	 * fail to bail out. Therefore, check again after holding oom_lock.
17608c2ecf20Sopenharmony_ci	 */
17618c2ecf20Sopenharmony_ci	ret = task_is_dying() || out_of_memory(&oc);
17628c2ecf20Sopenharmony_ci
17638c2ecf20Sopenharmony_ciunlock:
17648c2ecf20Sopenharmony_ci	mutex_unlock(&oom_lock);
17658c2ecf20Sopenharmony_ci	return ret;
17668c2ecf20Sopenharmony_ci}
17678c2ecf20Sopenharmony_ci
17688c2ecf20Sopenharmony_cistatic int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
17698c2ecf20Sopenharmony_ci				   pg_data_t *pgdat,
17708c2ecf20Sopenharmony_ci				   gfp_t gfp_mask,
17718c2ecf20Sopenharmony_ci				   unsigned long *total_scanned)
17728c2ecf20Sopenharmony_ci{
17738c2ecf20Sopenharmony_ci	struct mem_cgroup *victim = NULL;
17748c2ecf20Sopenharmony_ci	int total = 0;
17758c2ecf20Sopenharmony_ci	int loop = 0;
17768c2ecf20Sopenharmony_ci	unsigned long excess;
17778c2ecf20Sopenharmony_ci	unsigned long nr_scanned;
17788c2ecf20Sopenharmony_ci	struct mem_cgroup_reclaim_cookie reclaim = {
17798c2ecf20Sopenharmony_ci		.pgdat = pgdat,
17808c2ecf20Sopenharmony_ci	};
17818c2ecf20Sopenharmony_ci
17828c2ecf20Sopenharmony_ci	excess = soft_limit_excess(root_memcg);
17838c2ecf20Sopenharmony_ci
17848c2ecf20Sopenharmony_ci	while (1) {
17858c2ecf20Sopenharmony_ci		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
17868c2ecf20Sopenharmony_ci		if (!victim) {
17878c2ecf20Sopenharmony_ci			loop++;
17888c2ecf20Sopenharmony_ci			if (loop >= 2) {
17898c2ecf20Sopenharmony_ci				/*
17908c2ecf20Sopenharmony_ci				 * If we have not been able to reclaim
17918c2ecf20Sopenharmony_ci				 * anything, it might be because there are
17928c2ecf20Sopenharmony_ci				 * no reclaimable pages under this hierarchy.
17938c2ecf20Sopenharmony_ci				 */
17948c2ecf20Sopenharmony_ci				if (!total)
17958c2ecf20Sopenharmony_ci					break;
17968c2ecf20Sopenharmony_ci				/*
17978c2ecf20Sopenharmony_ci				 * We want to do more targeted reclaim.
17988c2ecf20Sopenharmony_ci				 * excess >> 2 is not so big that we reclaim
17998c2ecf20Sopenharmony_ci				 * too much, nor so small that we keep coming
18008c2ecf20Sopenharmony_ci				 * back to reclaim from this cgroup.
18018c2ecf20Sopenharmony_ci				 */
18028c2ecf20Sopenharmony_ci				if (total >= (excess >> 2) ||
18038c2ecf20Sopenharmony_ci					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
18048c2ecf20Sopenharmony_ci					break;
18058c2ecf20Sopenharmony_ci			}
18068c2ecf20Sopenharmony_ci			continue;
18078c2ecf20Sopenharmony_ci		}
18088c2ecf20Sopenharmony_ci		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
18098c2ecf20Sopenharmony_ci					pgdat, &nr_scanned);
18108c2ecf20Sopenharmony_ci		*total_scanned += nr_scanned;
18118c2ecf20Sopenharmony_ci		if (!soft_limit_excess(root_memcg))
18128c2ecf20Sopenharmony_ci			break;
18138c2ecf20Sopenharmony_ci	}
18148c2ecf20Sopenharmony_ci	mem_cgroup_iter_break(root_memcg, victim);
18158c2ecf20Sopenharmony_ci	return total;
18168c2ecf20Sopenharmony_ci}
18178c2ecf20Sopenharmony_ci
18188c2ecf20Sopenharmony_ci#ifdef CONFIG_LOCKDEP
18198c2ecf20Sopenharmony_cistatic struct lockdep_map memcg_oom_lock_dep_map = {
18208c2ecf20Sopenharmony_ci	.name = "memcg_oom_lock",
18218c2ecf20Sopenharmony_ci};
18228c2ecf20Sopenharmony_ci#endif
18238c2ecf20Sopenharmony_ci
18248c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(memcg_oom_lock);
18258c2ecf20Sopenharmony_ci
18268c2ecf20Sopenharmony_ci/*
18278c2ecf20Sopenharmony_ci * Check whether the OOM killer is already running under our hierarchy.
18288c2ecf20Sopenharmony_ci * If someone else is already running it, return false.
18298c2ecf20Sopenharmony_ci */
18308c2ecf20Sopenharmony_cistatic bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
18318c2ecf20Sopenharmony_ci{
18328c2ecf20Sopenharmony_ci	struct mem_cgroup *iter, *failed = NULL;
18338c2ecf20Sopenharmony_ci
18348c2ecf20Sopenharmony_ci	spin_lock(&memcg_oom_lock);
18358c2ecf20Sopenharmony_ci
18368c2ecf20Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg) {
18378c2ecf20Sopenharmony_ci		if (iter->oom_lock) {
18388c2ecf20Sopenharmony_ci			/*
18398c2ecf20Sopenharmony_ci			 * This subtree of our hierarchy is already locked,
18408c2ecf20Sopenharmony_ci			 * so we cannot grant the lock.
18418c2ecf20Sopenharmony_ci			 */
18428c2ecf20Sopenharmony_ci			failed = iter;
18438c2ecf20Sopenharmony_ci			mem_cgroup_iter_break(memcg, iter);
18448c2ecf20Sopenharmony_ci			break;
18458c2ecf20Sopenharmony_ci		} else
18468c2ecf20Sopenharmony_ci			iter->oom_lock = true;
18478c2ecf20Sopenharmony_ci	}
18488c2ecf20Sopenharmony_ci
18498c2ecf20Sopenharmony_ci	if (failed) {
18508c2ecf20Sopenharmony_ci		/*
18518c2ecf20Sopenharmony_ci		 * OK, we failed to lock the whole subtree, so we have
18528c2ecf20Sopenharmony_ci		 * to clean up what we set up, up to the failing subtree.
18538c2ecf20Sopenharmony_ci		 */
18548c2ecf20Sopenharmony_ci		for_each_mem_cgroup_tree(iter, memcg) {
18558c2ecf20Sopenharmony_ci			if (iter == failed) {
18568c2ecf20Sopenharmony_ci				mem_cgroup_iter_break(memcg, iter);
18578c2ecf20Sopenharmony_ci				break;
18588c2ecf20Sopenharmony_ci			}
18598c2ecf20Sopenharmony_ci			iter->oom_lock = false;
18608c2ecf20Sopenharmony_ci		}
18618c2ecf20Sopenharmony_ci	} else
18628c2ecf20Sopenharmony_ci		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
18638c2ecf20Sopenharmony_ci
18648c2ecf20Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
18658c2ecf20Sopenharmony_ci
18668c2ecf20Sopenharmony_ci	return !failed;
18678c2ecf20Sopenharmony_ci}
18688c2ecf20Sopenharmony_ci
18698c2ecf20Sopenharmony_cistatic void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
18708c2ecf20Sopenharmony_ci{
18718c2ecf20Sopenharmony_ci	struct mem_cgroup *iter;
18728c2ecf20Sopenharmony_ci
18738c2ecf20Sopenharmony_ci	spin_lock(&memcg_oom_lock);
18748c2ecf20Sopenharmony_ci	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
18758c2ecf20Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg)
18768c2ecf20Sopenharmony_ci		iter->oom_lock = false;
18778c2ecf20Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
18788c2ecf20Sopenharmony_ci}
18798c2ecf20Sopenharmony_ci
18808c2ecf20Sopenharmony_cistatic void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
18818c2ecf20Sopenharmony_ci{
18828c2ecf20Sopenharmony_ci	struct mem_cgroup *iter;
18838c2ecf20Sopenharmony_ci
18848c2ecf20Sopenharmony_ci	spin_lock(&memcg_oom_lock);
18858c2ecf20Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg)
18868c2ecf20Sopenharmony_ci		iter->under_oom++;
18878c2ecf20Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
18888c2ecf20Sopenharmony_ci}
18898c2ecf20Sopenharmony_ci
18908c2ecf20Sopenharmony_cistatic void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
18918c2ecf20Sopenharmony_ci{
18928c2ecf20Sopenharmony_ci	struct mem_cgroup *iter;
18938c2ecf20Sopenharmony_ci
18948c2ecf20Sopenharmony_ci	/*
18958c2ecf20Sopenharmony_ci	 * Be careful about under_oom underflows because a child memcg
18968c2ecf20Sopenharmony_ci	 * could have been added after mem_cgroup_mark_under_oom.
18978c2ecf20Sopenharmony_ci	 */
18988c2ecf20Sopenharmony_ci	spin_lock(&memcg_oom_lock);
18998c2ecf20Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg)
19008c2ecf20Sopenharmony_ci		if (iter->under_oom > 0)
19018c2ecf20Sopenharmony_ci			iter->under_oom--;
19028c2ecf20Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
19038c2ecf20Sopenharmony_ci}
19048c2ecf20Sopenharmony_ci
19058c2ecf20Sopenharmony_cistatic DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
19068c2ecf20Sopenharmony_ci
19078c2ecf20Sopenharmony_cistruct oom_wait_info {
19088c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
19098c2ecf20Sopenharmony_ci	wait_queue_entry_t	wait;
19108c2ecf20Sopenharmony_ci};
19118c2ecf20Sopenharmony_ci
19128c2ecf20Sopenharmony_cistatic int memcg_oom_wake_function(wait_queue_entry_t *wait,
19138c2ecf20Sopenharmony_ci	unsigned mode, int sync, void *arg)
19148c2ecf20Sopenharmony_ci{
19158c2ecf20Sopenharmony_ci	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
19168c2ecf20Sopenharmony_ci	struct mem_cgroup *oom_wait_memcg;
19178c2ecf20Sopenharmony_ci	struct oom_wait_info *oom_wait_info;
19188c2ecf20Sopenharmony_ci
19198c2ecf20Sopenharmony_ci	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
19208c2ecf20Sopenharmony_ci	oom_wait_memcg = oom_wait_info->memcg;
19218c2ecf20Sopenharmony_ci
19228c2ecf20Sopenharmony_ci	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
19238c2ecf20Sopenharmony_ci	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
19248c2ecf20Sopenharmony_ci		return 0;
19258c2ecf20Sopenharmony_ci	return autoremove_wake_function(wait, mode, sync, arg);
19268c2ecf20Sopenharmony_ci}
19278c2ecf20Sopenharmony_ci
19288c2ecf20Sopenharmony_cistatic void memcg_oom_recover(struct mem_cgroup *memcg)
19298c2ecf20Sopenharmony_ci{
19308c2ecf20Sopenharmony_ci	/*
19318c2ecf20Sopenharmony_ci	 * For the following lockless ->under_oom test, the only required
19328c2ecf20Sopenharmony_ci	 * guarantee is that it must see the state asserted by an OOM when
19338c2ecf20Sopenharmony_ci	 * this function is called as a result of userland actions
19348c2ecf20Sopenharmony_ci	 * triggered by the notification of the OOM.  This is trivially
19358c2ecf20Sopenharmony_ci	 * achieved by invoking mem_cgroup_mark_under_oom() before
19368c2ecf20Sopenharmony_ci	 * triggering notification.
19378c2ecf20Sopenharmony_ci	 */
19388c2ecf20Sopenharmony_ci	if (memcg && memcg->under_oom)
19398c2ecf20Sopenharmony_ci		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
19408c2ecf20Sopenharmony_ci}
19418c2ecf20Sopenharmony_ci
19428c2ecf20Sopenharmony_cienum oom_status {
19438c2ecf20Sopenharmony_ci	OOM_SUCCESS,
19448c2ecf20Sopenharmony_ci	OOM_FAILED,
19458c2ecf20Sopenharmony_ci	OOM_ASYNC,
19468c2ecf20Sopenharmony_ci	OOM_SKIPPED
19478c2ecf20Sopenharmony_ci};
19488c2ecf20Sopenharmony_ci
19498c2ecf20Sopenharmony_cistatic enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
19508c2ecf20Sopenharmony_ci{
19518c2ecf20Sopenharmony_ci	enum oom_status ret;
19528c2ecf20Sopenharmony_ci	bool locked;
19538c2ecf20Sopenharmony_ci
19548c2ecf20Sopenharmony_ci	if (order > PAGE_ALLOC_COSTLY_ORDER)
19558c2ecf20Sopenharmony_ci		return OOM_SKIPPED;
19568c2ecf20Sopenharmony_ci
19578c2ecf20Sopenharmony_ci	memcg_memory_event(memcg, MEMCG_OOM);
19588c2ecf20Sopenharmony_ci
19598c2ecf20Sopenharmony_ci	/*
19608c2ecf20Sopenharmony_ci	 * We are in the middle of the charge context here, so we
19618c2ecf20Sopenharmony_ci	 * don't want to block when potentially sitting on a callstack
19628c2ecf20Sopenharmony_ci	 * that holds all kinds of filesystem and mm locks.
19638c2ecf20Sopenharmony_ci	 *
19648c2ecf20Sopenharmony_ci	 * cgroup1 allows disabling the OOM killer and waiting for outside
19658c2ecf20Sopenharmony_ci	 * handling until the charge can succeed; remember the context and put
19668c2ecf20Sopenharmony_ci	 * the task to sleep at the end of the page fault when all locks are
19678c2ecf20Sopenharmony_ci	 * released.
19688c2ecf20Sopenharmony_ci	 *
19698c2ecf20Sopenharmony_ci	 * On the other hand, in-kernel OOM killer allows for an async victim
19708c2ecf20Sopenharmony_ci	 * memory reclaim (oom_reaper) and that means that we are not solely
19718c2ecf20Sopenharmony_ci	 * relying on the oom victim to make a forward progress and we can
19728c2ecf20Sopenharmony_ci	 * invoke the oom killer here.
19738c2ecf20Sopenharmony_ci	 *
19748c2ecf20Sopenharmony_ci	 * Please note that mem_cgroup_out_of_memory might fail to find a
19758c2ecf20Sopenharmony_ci	 * victim and then we have to bail out from the charge path.
19768c2ecf20Sopenharmony_ci	 */
19778c2ecf20Sopenharmony_ci	if (memcg->oom_kill_disable) {
19788c2ecf20Sopenharmony_ci		if (!current->in_user_fault)
19798c2ecf20Sopenharmony_ci			return OOM_SKIPPED;
19808c2ecf20Sopenharmony_ci		css_get(&memcg->css);
19818c2ecf20Sopenharmony_ci		current->memcg_in_oom = memcg;
19828c2ecf20Sopenharmony_ci		current->memcg_oom_gfp_mask = mask;
19838c2ecf20Sopenharmony_ci		current->memcg_oom_order = order;
19848c2ecf20Sopenharmony_ci
19858c2ecf20Sopenharmony_ci		return OOM_ASYNC;
19868c2ecf20Sopenharmony_ci	}
19878c2ecf20Sopenharmony_ci
19888c2ecf20Sopenharmony_ci	mem_cgroup_mark_under_oom(memcg);
19898c2ecf20Sopenharmony_ci
19908c2ecf20Sopenharmony_ci	locked = mem_cgroup_oom_trylock(memcg);
19918c2ecf20Sopenharmony_ci
19928c2ecf20Sopenharmony_ci	if (locked)
19938c2ecf20Sopenharmony_ci		mem_cgroup_oom_notify(memcg);
19948c2ecf20Sopenharmony_ci
19958c2ecf20Sopenharmony_ci	mem_cgroup_unmark_under_oom(memcg);
19968c2ecf20Sopenharmony_ci	if (mem_cgroup_out_of_memory(memcg, mask, order))
19978c2ecf20Sopenharmony_ci		ret = OOM_SUCCESS;
19988c2ecf20Sopenharmony_ci	else
19998c2ecf20Sopenharmony_ci		ret = OOM_FAILED;
20008c2ecf20Sopenharmony_ci
20018c2ecf20Sopenharmony_ci	if (locked)
20028c2ecf20Sopenharmony_ci		mem_cgroup_oom_unlock(memcg);
20038c2ecf20Sopenharmony_ci
20048c2ecf20Sopenharmony_ci	return ret;
20058c2ecf20Sopenharmony_ci}
20068c2ecf20Sopenharmony_ci
20078c2ecf20Sopenharmony_ci/**
20088c2ecf20Sopenharmony_ci * mem_cgroup_oom_synchronize - complete memcg OOM handling
20098c2ecf20Sopenharmony_ci * @handle: actually kill/wait or just clean up the OOM state
20108c2ecf20Sopenharmony_ci *
20118c2ecf20Sopenharmony_ci * This has to be called at the end of a page fault if the memcg OOM
20128c2ecf20Sopenharmony_ci * handler was enabled.
20138c2ecf20Sopenharmony_ci *
20148c2ecf20Sopenharmony_ci * Memcg supports userspace OOM handling where failed allocations must
20158c2ecf20Sopenharmony_ci * sleep on a waitqueue until the userspace task resolves the
20168c2ecf20Sopenharmony_ci * situation.  Sleeping directly in the charge context with all kinds
20178c2ecf20Sopenharmony_ci * of locks held is not a good idea, instead we remember an OOM state
20188c2ecf20Sopenharmony_ci * in the task and mem_cgroup_oom_synchronize() has to be called at
20198c2ecf20Sopenharmony_ci * the end of the page fault to complete the OOM handling.
20208c2ecf20Sopenharmony_ci *
20218c2ecf20Sopenharmony_ci * Returns %true if an ongoing memcg OOM situation was detected and
20228c2ecf20Sopenharmony_ci * completed, %false otherwise.
20238c2ecf20Sopenharmony_ci */
20248c2ecf20Sopenharmony_cibool mem_cgroup_oom_synchronize(bool handle)
20258c2ecf20Sopenharmony_ci{
20268c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = current->memcg_in_oom;
20278c2ecf20Sopenharmony_ci	struct oom_wait_info owait;
20288c2ecf20Sopenharmony_ci	bool locked;
20298c2ecf20Sopenharmony_ci
20308c2ecf20Sopenharmony_ci	/* OOM is global, do not handle */
20318c2ecf20Sopenharmony_ci	if (!memcg)
20328c2ecf20Sopenharmony_ci		return false;
20338c2ecf20Sopenharmony_ci
20348c2ecf20Sopenharmony_ci	if (!handle)
20358c2ecf20Sopenharmony_ci		goto cleanup;
20368c2ecf20Sopenharmony_ci
20378c2ecf20Sopenharmony_ci	owait.memcg = memcg;
20388c2ecf20Sopenharmony_ci	owait.wait.flags = 0;
20398c2ecf20Sopenharmony_ci	owait.wait.func = memcg_oom_wake_function;
20408c2ecf20Sopenharmony_ci	owait.wait.private = current;
20418c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&owait.wait.entry);
20428c2ecf20Sopenharmony_ci
20438c2ecf20Sopenharmony_ci	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
20448c2ecf20Sopenharmony_ci	mem_cgroup_mark_under_oom(memcg);
20458c2ecf20Sopenharmony_ci
20468c2ecf20Sopenharmony_ci	locked = mem_cgroup_oom_trylock(memcg);
20478c2ecf20Sopenharmony_ci
20488c2ecf20Sopenharmony_ci	if (locked)
20498c2ecf20Sopenharmony_ci		mem_cgroup_oom_notify(memcg);
20508c2ecf20Sopenharmony_ci
20518c2ecf20Sopenharmony_ci	if (locked && !memcg->oom_kill_disable) {
20528c2ecf20Sopenharmony_ci		mem_cgroup_unmark_under_oom(memcg);
20538c2ecf20Sopenharmony_ci		finish_wait(&memcg_oom_waitq, &owait.wait);
20548c2ecf20Sopenharmony_ci		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
20558c2ecf20Sopenharmony_ci					 current->memcg_oom_order);
20568c2ecf20Sopenharmony_ci	} else {
20578c2ecf20Sopenharmony_ci		schedule();
20588c2ecf20Sopenharmony_ci		mem_cgroup_unmark_under_oom(memcg);
20598c2ecf20Sopenharmony_ci		finish_wait(&memcg_oom_waitq, &owait.wait);
20608c2ecf20Sopenharmony_ci	}
20618c2ecf20Sopenharmony_ci
20628c2ecf20Sopenharmony_ci	if (locked) {
20638c2ecf20Sopenharmony_ci		mem_cgroup_oom_unlock(memcg);
20648c2ecf20Sopenharmony_ci		/*
20658c2ecf20Sopenharmony_ci		 * There is no guarantee that an OOM-lock contender
20668c2ecf20Sopenharmony_ci		 * sees the wakeups triggered by the OOM kill
20678c2ecf20Sopenharmony_ci		 * uncharges.  Wake any sleepers explicitly.
20688c2ecf20Sopenharmony_ci		 */
20698c2ecf20Sopenharmony_ci		memcg_oom_recover(memcg);
20708c2ecf20Sopenharmony_ci	}
20718c2ecf20Sopenharmony_cicleanup:
20728c2ecf20Sopenharmony_ci	current->memcg_in_oom = NULL;
20738c2ecf20Sopenharmony_ci	css_put(&memcg->css);
20748c2ecf20Sopenharmony_ci	return true;
20758c2ecf20Sopenharmony_ci}
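
/*
 * Illustrative usage sketch (not part of the original source): the page
 * fault exit path is expected to complete a remembered memcg OOM roughly
 * like this before resorting to global OOM handling.
 *
 *	if (mem_cgroup_oom_synchronize(true))
 *		return;		(an ongoing memcg OOM was completed)
 *	(otherwise continue with global OOM handling)
 */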
20768c2ecf20Sopenharmony_ci
20778c2ecf20Sopenharmony_ci/**
20788c2ecf20Sopenharmony_ci * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
20798c2ecf20Sopenharmony_ci * @victim: task to be killed by the OOM killer
20808c2ecf20Sopenharmony_ci * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
20818c2ecf20Sopenharmony_ci *
20828c2ecf20Sopenharmony_ci * Returns a pointer to a memory cgroup, which has to be cleaned up
20838c2ecf20Sopenharmony_ci * by killing all belonging OOM-killable tasks.
20848c2ecf20Sopenharmony_ci *
20858c2ecf20Sopenharmony_ci * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
20868c2ecf20Sopenharmony_ci */
20878c2ecf20Sopenharmony_cistruct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
20888c2ecf20Sopenharmony_ci					    struct mem_cgroup *oom_domain)
20898c2ecf20Sopenharmony_ci{
20908c2ecf20Sopenharmony_ci	struct mem_cgroup *oom_group = NULL;
20918c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
20928c2ecf20Sopenharmony_ci
20938c2ecf20Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
20948c2ecf20Sopenharmony_ci		return NULL;
20958c2ecf20Sopenharmony_ci
20968c2ecf20Sopenharmony_ci	if (!oom_domain)
20978c2ecf20Sopenharmony_ci		oom_domain = root_mem_cgroup;
20988c2ecf20Sopenharmony_ci
20998c2ecf20Sopenharmony_ci	rcu_read_lock();
21008c2ecf20Sopenharmony_ci
21018c2ecf20Sopenharmony_ci	memcg = mem_cgroup_from_task(victim);
21028c2ecf20Sopenharmony_ci	if (memcg == root_mem_cgroup)
21038c2ecf20Sopenharmony_ci		goto out;
21048c2ecf20Sopenharmony_ci
21058c2ecf20Sopenharmony_ci	/*
21068c2ecf20Sopenharmony_ci	 * If the victim task has been asynchronously moved to a different
21078c2ecf20Sopenharmony_ci	 * memory cgroup, we might end up killing tasks outside oom_domain.
21088c2ecf20Sopenharmony_ci	 * In this case it's better to ignore memory.group.oom.
21098c2ecf20Sopenharmony_ci	 */
21108c2ecf20Sopenharmony_ci	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
21118c2ecf20Sopenharmony_ci		goto out;
21128c2ecf20Sopenharmony_ci
21138c2ecf20Sopenharmony_ci	/*
21148c2ecf20Sopenharmony_ci	 * Traverse the memory cgroup hierarchy from the victim task's
21158c2ecf20Sopenharmony_ci	 * cgroup up to the OOMing cgroup (or root) to find the
21168c2ecf20Sopenharmony_ci	 * highest-level memory cgroup with oom.group set.
21178c2ecf20Sopenharmony_ci	 */
21188c2ecf20Sopenharmony_ci	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
21198c2ecf20Sopenharmony_ci		if (memcg->oom_group)
21208c2ecf20Sopenharmony_ci			oom_group = memcg;
21218c2ecf20Sopenharmony_ci
21228c2ecf20Sopenharmony_ci		if (memcg == oom_domain)
21238c2ecf20Sopenharmony_ci			break;
21248c2ecf20Sopenharmony_ci	}
21258c2ecf20Sopenharmony_ci
21268c2ecf20Sopenharmony_ci	if (oom_group)
21278c2ecf20Sopenharmony_ci		css_get(&oom_group->css);
21288c2ecf20Sopenharmony_ciout:
21298c2ecf20Sopenharmony_ci	rcu_read_unlock();
21308c2ecf20Sopenharmony_ci
21318c2ecf20Sopenharmony_ci	return oom_group;
21328c2ecf20Sopenharmony_ci}
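
/*
 * Illustrative usage sketch (not part of the original source): an OOM-kill
 * path that honours memory.oom.group would look roughly like this, dropping
 * the reference with mem_cgroup_put() as required by the comment above.
 * kill_memcg_member() is a hypothetical mem_cgroup_scan_tasks() callback.
 *
 *	struct mem_cgroup *oom_group;
 *
 *	oom_group = mem_cgroup_get_oom_group(victim, oom_domain);
 *	if (oom_group) {
 *		mem_cgroup_print_oom_group(oom_group);
 *		mem_cgroup_scan_tasks(oom_group, kill_memcg_member, NULL);
 *		mem_cgroup_put(oom_group);
 *	}
 */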
21338c2ecf20Sopenharmony_ci
21348c2ecf20Sopenharmony_civoid mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
21358c2ecf20Sopenharmony_ci{
21368c2ecf20Sopenharmony_ci	pr_info("Tasks in ");
21378c2ecf20Sopenharmony_ci	pr_cont_cgroup_path(memcg->css.cgroup);
21388c2ecf20Sopenharmony_ci	pr_cont(" are going to be killed due to memory.oom.group set\n");
21398c2ecf20Sopenharmony_ci}
21408c2ecf20Sopenharmony_ci
21418c2ecf20Sopenharmony_ci/**
21428c2ecf20Sopenharmony_ci * lock_page_memcg - lock a page->mem_cgroup binding
21438c2ecf20Sopenharmony_ci * @page: the page
21448c2ecf20Sopenharmony_ci *
21458c2ecf20Sopenharmony_ci * This function protects unlocked LRU pages from being moved to
21468c2ecf20Sopenharmony_ci * another cgroup.
21478c2ecf20Sopenharmony_ci *
21488c2ecf20Sopenharmony_ci * It ensures the lifetime of the returned memcg. The caller is responsible
21498c2ecf20Sopenharmony_ci * for the lifetime of the page; __unlock_page_memcg() is available
21508c2ecf20Sopenharmony_ci * when @page might get freed inside the locked section.
21518c2ecf20Sopenharmony_ci */
21528c2ecf20Sopenharmony_cistruct mem_cgroup *lock_page_memcg(struct page *page)
21538c2ecf20Sopenharmony_ci{
21548c2ecf20Sopenharmony_ci	struct page *head = compound_head(page); /* rmap on tail pages */
21558c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
21568c2ecf20Sopenharmony_ci	unsigned long flags;
21578c2ecf20Sopenharmony_ci
21588c2ecf20Sopenharmony_ci	/*
21598c2ecf20Sopenharmony_ci	 * The RCU lock is held throughout the transaction.  The fast
21608c2ecf20Sopenharmony_ci	 * path can get away without acquiring the memcg->move_lock
21618c2ecf20Sopenharmony_ci	 * because page moving starts with an RCU grace period.
21628c2ecf20Sopenharmony_ci	 *
21638c2ecf20Sopenharmony_ci	 * The RCU lock also protects the memcg from being freed when
21648c2ecf20Sopenharmony_ci	 * the page state that is going to change is the only thing
21658c2ecf20Sopenharmony_ci	 * preventing the page itself from being freed. E.g. writeback
21668c2ecf20Sopenharmony_ci	 * doesn't hold a page reference and relies on PG_writeback to
21678c2ecf20Sopenharmony_ci	 * keep off truncation, migration and so forth.
21688c2ecf20Sopenharmony_ci	 */
21698c2ecf20Sopenharmony_ci	rcu_read_lock();
21708c2ecf20Sopenharmony_ci
21718c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
21728c2ecf20Sopenharmony_ci		return NULL;
21738c2ecf20Sopenharmony_ciagain:
21748c2ecf20Sopenharmony_ci	memcg = head->mem_cgroup;
21758c2ecf20Sopenharmony_ci	if (unlikely(!memcg))
21768c2ecf20Sopenharmony_ci		return NULL;
21778c2ecf20Sopenharmony_ci
21788c2ecf20Sopenharmony_ci	if (atomic_read(&memcg->moving_account) <= 0)
21798c2ecf20Sopenharmony_ci		return memcg;
21808c2ecf20Sopenharmony_ci
21818c2ecf20Sopenharmony_ci	spin_lock_irqsave(&memcg->move_lock, flags);
21828c2ecf20Sopenharmony_ci	if (memcg != head->mem_cgroup) {
21838c2ecf20Sopenharmony_ci		spin_unlock_irqrestore(&memcg->move_lock, flags);
21848c2ecf20Sopenharmony_ci		goto again;
21858c2ecf20Sopenharmony_ci	}
21868c2ecf20Sopenharmony_ci
21878c2ecf20Sopenharmony_ci	/*
21888c2ecf20Sopenharmony_ci	 * When charge migration first begins, we can have locked and
21898c2ecf20Sopenharmony_ci	 * unlocked page stat updates happening concurrently.  Track
21908c2ecf20Sopenharmony_ci	 * the task that holds the lock for unlock_page_memcg().
21918c2ecf20Sopenharmony_ci	 */
21928c2ecf20Sopenharmony_ci	memcg->move_lock_task = current;
21938c2ecf20Sopenharmony_ci	memcg->move_lock_flags = flags;
21948c2ecf20Sopenharmony_ci
21958c2ecf20Sopenharmony_ci	return memcg;
21968c2ecf20Sopenharmony_ci}
21978c2ecf20Sopenharmony_ciEXPORT_SYMBOL(lock_page_memcg);
21988c2ecf20Sopenharmony_ci
21998c2ecf20Sopenharmony_ci/**
22008c2ecf20Sopenharmony_ci * __unlock_page_memcg - unlock and unpin a memcg
22018c2ecf20Sopenharmony_ci * @memcg: the memcg
22028c2ecf20Sopenharmony_ci *
22038c2ecf20Sopenharmony_ci * Unlock and unpin a memcg returned by lock_page_memcg().
22048c2ecf20Sopenharmony_ci */
22058c2ecf20Sopenharmony_civoid __unlock_page_memcg(struct mem_cgroup *memcg)
22068c2ecf20Sopenharmony_ci{
22078c2ecf20Sopenharmony_ci	if (memcg && memcg->move_lock_task == current) {
22088c2ecf20Sopenharmony_ci		unsigned long flags = memcg->move_lock_flags;
22098c2ecf20Sopenharmony_ci
22108c2ecf20Sopenharmony_ci		memcg->move_lock_task = NULL;
22118c2ecf20Sopenharmony_ci		memcg->move_lock_flags = 0;
22128c2ecf20Sopenharmony_ci
22138c2ecf20Sopenharmony_ci		spin_unlock_irqrestore(&memcg->move_lock, flags);
22148c2ecf20Sopenharmony_ci	}
22158c2ecf20Sopenharmony_ci
22168c2ecf20Sopenharmony_ci	rcu_read_unlock();
22178c2ecf20Sopenharmony_ci}
22188c2ecf20Sopenharmony_ci
22198c2ecf20Sopenharmony_ci/**
22208c2ecf20Sopenharmony_ci * unlock_page_memcg - unlock a page->mem_cgroup binding
22218c2ecf20Sopenharmony_ci * @page: the page
22228c2ecf20Sopenharmony_ci */
22238c2ecf20Sopenharmony_civoid unlock_page_memcg(struct page *page)
22248c2ecf20Sopenharmony_ci{
22258c2ecf20Sopenharmony_ci	struct page *head = compound_head(page);
22268c2ecf20Sopenharmony_ci
22278c2ecf20Sopenharmony_ci	__unlock_page_memcg(head->mem_cgroup);
22288c2ecf20Sopenharmony_ci}
22298c2ecf20Sopenharmony_ciEXPORT_SYMBOL(unlock_page_memcg);
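/*
 * A minimal usage sketch (assuming a caller updating page state that is
 * also accounted per-memcg, e.g. dirty or writeback accounting):
 *
 *	memcg = lock_page_memcg(page);
 *	... update the page flag and the matching memcg statistics ...
 *	unlock_page_memcg(page);
 *
 * The page->mem_cgroup binding cannot change in between, so the statistics
 * are guaranteed to hit the cgroup the page is actually charged to.
 */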
22308c2ecf20Sopenharmony_ci
22318c2ecf20Sopenharmony_cistruct memcg_stock_pcp {
22328c2ecf20Sopenharmony_ci	struct mem_cgroup *cached; /* must never be the root cgroup */
22338c2ecf20Sopenharmony_ci	unsigned int nr_pages;
22348c2ecf20Sopenharmony_ci
22358c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
22368c2ecf20Sopenharmony_ci	struct obj_cgroup *cached_objcg;
22378c2ecf20Sopenharmony_ci	unsigned int nr_bytes;
22388c2ecf20Sopenharmony_ci#endif
22398c2ecf20Sopenharmony_ci
22408c2ecf20Sopenharmony_ci	struct work_struct work;
22418c2ecf20Sopenharmony_ci	unsigned long flags;
22428c2ecf20Sopenharmony_ci#define FLUSHING_CACHED_CHARGE	0
22438c2ecf20Sopenharmony_ci};
22448c2ecf20Sopenharmony_cistatic DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
22458c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(percpu_charge_mutex);
22468c2ecf20Sopenharmony_ci
22478c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
22488c2ecf20Sopenharmony_cistatic void drain_obj_stock(struct memcg_stock_pcp *stock);
22498c2ecf20Sopenharmony_cistatic bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
22508c2ecf20Sopenharmony_ci				     struct mem_cgroup *root_memcg);
22518c2ecf20Sopenharmony_ci
22528c2ecf20Sopenharmony_ci#else
22538c2ecf20Sopenharmony_cistatic inline void drain_obj_stock(struct memcg_stock_pcp *stock)
22548c2ecf20Sopenharmony_ci{
22558c2ecf20Sopenharmony_ci}
22568c2ecf20Sopenharmony_cistatic bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
22578c2ecf20Sopenharmony_ci				     struct mem_cgroup *root_memcg)
22588c2ecf20Sopenharmony_ci{
22598c2ecf20Sopenharmony_ci	return false;
22608c2ecf20Sopenharmony_ci}
22618c2ecf20Sopenharmony_ci#endif
22628c2ecf20Sopenharmony_ci
22638c2ecf20Sopenharmony_ci/**
22648c2ecf20Sopenharmony_ci * consume_stock - Try to consume stocked charge on this cpu.
22658c2ecf20Sopenharmony_ci * @memcg: memcg to consume from.
22668c2ecf20Sopenharmony_ci * @nr_pages: how many pages to charge.
22678c2ecf20Sopenharmony_ci *
22688c2ecf20Sopenharmony_ci * The charges will only happen if @memcg matches the current cpu's memcg
22698c2ecf20Sopenharmony_ci * stock, and at least @nr_pages are available in that stock.  Failure to
22708c2ecf20Sopenharmony_ci * service an allocation will refill the stock.
22718c2ecf20Sopenharmony_ci *
22728c2ecf20Sopenharmony_ci * returns true if successful, false otherwise.
22738c2ecf20Sopenharmony_ci */
22748c2ecf20Sopenharmony_cistatic bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
22758c2ecf20Sopenharmony_ci{
22768c2ecf20Sopenharmony_ci	struct memcg_stock_pcp *stock;
22778c2ecf20Sopenharmony_ci	unsigned long flags;
22788c2ecf20Sopenharmony_ci	bool ret = false;
22798c2ecf20Sopenharmony_ci
22808c2ecf20Sopenharmony_ci	if (nr_pages > MEMCG_CHARGE_BATCH)
22818c2ecf20Sopenharmony_ci		return ret;
22828c2ecf20Sopenharmony_ci
22838c2ecf20Sopenharmony_ci	local_irq_save(flags);
22848c2ecf20Sopenharmony_ci
22858c2ecf20Sopenharmony_ci	stock = this_cpu_ptr(&memcg_stock);
22868c2ecf20Sopenharmony_ci	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
22878c2ecf20Sopenharmony_ci		stock->nr_pages -= nr_pages;
22888c2ecf20Sopenharmony_ci		ret = true;
22898c2ecf20Sopenharmony_ci	}
22908c2ecf20Sopenharmony_ci
22918c2ecf20Sopenharmony_ci	local_irq_restore(flags);
22928c2ecf20Sopenharmony_ci
22938c2ecf20Sopenharmony_ci	return ret;
22948c2ecf20Sopenharmony_ci}
22958c2ecf20Sopenharmony_ci
22968c2ecf20Sopenharmony_ci/*
22978c2ecf20Sopenharmony_ci * Drain stocks cached in percpu and reset the cached information.
22988c2ecf20Sopenharmony_ci */
22998c2ecf20Sopenharmony_cistatic void drain_stock(struct memcg_stock_pcp *stock)
23008c2ecf20Sopenharmony_ci{
23018c2ecf20Sopenharmony_ci	struct mem_cgroup *old = stock->cached;
23028c2ecf20Sopenharmony_ci
23038c2ecf20Sopenharmony_ci	if (!old)
23048c2ecf20Sopenharmony_ci		return;
23058c2ecf20Sopenharmony_ci
23068c2ecf20Sopenharmony_ci	if (stock->nr_pages) {
23078c2ecf20Sopenharmony_ci		page_counter_uncharge(&old->memory, stock->nr_pages);
23088c2ecf20Sopenharmony_ci		if (do_memsw_account())
23098c2ecf20Sopenharmony_ci			page_counter_uncharge(&old->memsw, stock->nr_pages);
23108c2ecf20Sopenharmony_ci		stock->nr_pages = 0;
23118c2ecf20Sopenharmony_ci	}
23128c2ecf20Sopenharmony_ci
23138c2ecf20Sopenharmony_ci	css_put(&old->css);
23148c2ecf20Sopenharmony_ci	stock->cached = NULL;
23158c2ecf20Sopenharmony_ci}
23168c2ecf20Sopenharmony_ci
23178c2ecf20Sopenharmony_cistatic void drain_local_stock(struct work_struct *dummy)
23188c2ecf20Sopenharmony_ci{
23198c2ecf20Sopenharmony_ci	struct memcg_stock_pcp *stock;
23208c2ecf20Sopenharmony_ci	unsigned long flags;
23218c2ecf20Sopenharmony_ci
23228c2ecf20Sopenharmony_ci	/*
23238c2ecf20Sopenharmony_ci	 * The only protection from cpu hotplug vs. drain_stock races is
23248c2ecf20Sopenharmony_ci	 * that we always operate on local CPU stock here with IRQ disabled
23258c2ecf20Sopenharmony_ci	 */
23268c2ecf20Sopenharmony_ci	local_irq_save(flags);
23278c2ecf20Sopenharmony_ci
23288c2ecf20Sopenharmony_ci	stock = this_cpu_ptr(&memcg_stock);
23298c2ecf20Sopenharmony_ci	drain_obj_stock(stock);
23308c2ecf20Sopenharmony_ci	drain_stock(stock);
23318c2ecf20Sopenharmony_ci	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
23328c2ecf20Sopenharmony_ci
23338c2ecf20Sopenharmony_ci	local_irq_restore(flags);
23348c2ecf20Sopenharmony_ci}
23358c2ecf20Sopenharmony_ci
23368c2ecf20Sopenharmony_ci/*
23378c2ecf20Sopenharmony_ci * Cache charges (nr_pages) in the local per-cpu area.
23388c2ecf20Sopenharmony_ci * They will be consumed by consume_stock() later.
23398c2ecf20Sopenharmony_ci */
23408c2ecf20Sopenharmony_cistatic void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23418c2ecf20Sopenharmony_ci{
23428c2ecf20Sopenharmony_ci	struct memcg_stock_pcp *stock;
23438c2ecf20Sopenharmony_ci	unsigned long flags;
23448c2ecf20Sopenharmony_ci
23458c2ecf20Sopenharmony_ci	local_irq_save(flags);
23468c2ecf20Sopenharmony_ci
23478c2ecf20Sopenharmony_ci	stock = this_cpu_ptr(&memcg_stock);
23488c2ecf20Sopenharmony_ci	if (stock->cached != memcg) { /* reset if necessary */
23498c2ecf20Sopenharmony_ci		drain_stock(stock);
23508c2ecf20Sopenharmony_ci		css_get(&memcg->css);
23518c2ecf20Sopenharmony_ci		stock->cached = memcg;
23528c2ecf20Sopenharmony_ci	}
23538c2ecf20Sopenharmony_ci	stock->nr_pages += nr_pages;
23548c2ecf20Sopenharmony_ci
23558c2ecf20Sopenharmony_ci	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
23568c2ecf20Sopenharmony_ci		drain_stock(stock);
23578c2ecf20Sopenharmony_ci
23588c2ecf20Sopenharmony_ci	local_irq_restore(flags);
23598c2ecf20Sopenharmony_ci}
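/*
 * A rough worked example of the fast path (assuming the usual
 * MEMCG_CHARGE_BATCH of 32 pages): when try_charge() charges a single page
 * it actually charges a batch of 32 pages to the page counters and hands
 * the 31 unused pages to refill_stock().  The next 31 one-page charges for
 * the same memcg on this CPU are then served by consume_stock() without
 * touching the page counters at all.
 */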
23608c2ecf20Sopenharmony_ci
23618c2ecf20Sopenharmony_ci/*
23628c2ecf20Sopenharmony_ci * Drain all per-CPU charge caches for the given root_memcg, i.e. for the
23638c2ecf20Sopenharmony_ci * whole subtree of the hierarchy under it.
23648c2ecf20Sopenharmony_ci */
23658c2ecf20Sopenharmony_cistatic void drain_all_stock(struct mem_cgroup *root_memcg)
23668c2ecf20Sopenharmony_ci{
23678c2ecf20Sopenharmony_ci	int cpu, curcpu;
23688c2ecf20Sopenharmony_ci
23698c2ecf20Sopenharmony_ci	/* If someone's already draining, avoid running more workers. */
23708c2ecf20Sopenharmony_ci	if (!mutex_trylock(&percpu_charge_mutex))
23718c2ecf20Sopenharmony_ci		return;
23728c2ecf20Sopenharmony_ci	/*
23738c2ecf20Sopenharmony_ci	 * Notify other cpus that system-wide "drain" is running
23748c2ecf20Sopenharmony_ci	 * We do not care about races with the cpu hotplug because cpu down
23758c2ecf20Sopenharmony_ci	 * as well as workers from this path always operate on the local
23768c2ecf20Sopenharmony_ci	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
23778c2ecf20Sopenharmony_ci	 */
23788c2ecf20Sopenharmony_ci	curcpu = get_cpu();
23798c2ecf20Sopenharmony_ci	for_each_online_cpu(cpu) {
23808c2ecf20Sopenharmony_ci		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
23818c2ecf20Sopenharmony_ci		struct mem_cgroup *memcg;
23828c2ecf20Sopenharmony_ci		bool flush = false;
23838c2ecf20Sopenharmony_ci
23848c2ecf20Sopenharmony_ci		rcu_read_lock();
23858c2ecf20Sopenharmony_ci		memcg = stock->cached;
23868c2ecf20Sopenharmony_ci		if (memcg && stock->nr_pages &&
23878c2ecf20Sopenharmony_ci		    mem_cgroup_is_descendant(memcg, root_memcg))
23888c2ecf20Sopenharmony_ci			flush = true;
23898c2ecf20Sopenharmony_ci		if (obj_stock_flush_required(stock, root_memcg))
23908c2ecf20Sopenharmony_ci			flush = true;
23918c2ecf20Sopenharmony_ci		rcu_read_unlock();
23928c2ecf20Sopenharmony_ci
23938c2ecf20Sopenharmony_ci		if (flush &&
23948c2ecf20Sopenharmony_ci		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
23958c2ecf20Sopenharmony_ci			if (cpu == curcpu)
23968c2ecf20Sopenharmony_ci				drain_local_stock(&stock->work);
23978c2ecf20Sopenharmony_ci			else
23988c2ecf20Sopenharmony_ci				schedule_work_on(cpu, &stock->work);
23998c2ecf20Sopenharmony_ci		}
24008c2ecf20Sopenharmony_ci	}
24018c2ecf20Sopenharmony_ci	put_cpu();
24028c2ecf20Sopenharmony_ci	mutex_unlock(&percpu_charge_mutex);
24038c2ecf20Sopenharmony_ci}
24048c2ecf20Sopenharmony_ci
24058c2ecf20Sopenharmony_cistatic int memcg_hotplug_cpu_dead(unsigned int cpu)
24068c2ecf20Sopenharmony_ci{
24078c2ecf20Sopenharmony_ci	struct memcg_stock_pcp *stock;
24088c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg, *mi;
24098c2ecf20Sopenharmony_ci
24108c2ecf20Sopenharmony_ci	stock = &per_cpu(memcg_stock, cpu);
24118c2ecf20Sopenharmony_ci	drain_stock(stock);
24128c2ecf20Sopenharmony_ci
24138c2ecf20Sopenharmony_ci	for_each_mem_cgroup(memcg) {
24148c2ecf20Sopenharmony_ci		int i;
24158c2ecf20Sopenharmony_ci
24168c2ecf20Sopenharmony_ci		for (i = 0; i < MEMCG_NR_STAT; i++) {
24178c2ecf20Sopenharmony_ci			int nid;
24188c2ecf20Sopenharmony_ci			long x;
24198c2ecf20Sopenharmony_ci
24208c2ecf20Sopenharmony_ci			x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
24218c2ecf20Sopenharmony_ci			if (x)
24228c2ecf20Sopenharmony_ci				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
24238c2ecf20Sopenharmony_ci					atomic_long_add(x, &mi->vmstats[i]);
24248c2ecf20Sopenharmony_ci
24258c2ecf20Sopenharmony_ci			if (i >= NR_VM_NODE_STAT_ITEMS)
24268c2ecf20Sopenharmony_ci				continue;
24278c2ecf20Sopenharmony_ci
24288c2ecf20Sopenharmony_ci			for_each_node(nid) {
24298c2ecf20Sopenharmony_ci				struct mem_cgroup_per_node *pn;
24308c2ecf20Sopenharmony_ci
24318c2ecf20Sopenharmony_ci				pn = mem_cgroup_nodeinfo(memcg, nid);
24328c2ecf20Sopenharmony_ci				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
24338c2ecf20Sopenharmony_ci				if (x)
24348c2ecf20Sopenharmony_ci					do {
24358c2ecf20Sopenharmony_ci						atomic_long_add(x, &pn->lruvec_stat[i]);
24368c2ecf20Sopenharmony_ci					} while ((pn = parent_nodeinfo(pn, nid)));
24378c2ecf20Sopenharmony_ci			}
24388c2ecf20Sopenharmony_ci		}
24398c2ecf20Sopenharmony_ci
24408c2ecf20Sopenharmony_ci		for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
24418c2ecf20Sopenharmony_ci			long x;
24428c2ecf20Sopenharmony_ci
24438c2ecf20Sopenharmony_ci			x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
24448c2ecf20Sopenharmony_ci			if (x)
24458c2ecf20Sopenharmony_ci				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
24468c2ecf20Sopenharmony_ci					atomic_long_add(x, &mi->vmevents[i]);
24478c2ecf20Sopenharmony_ci		}
24488c2ecf20Sopenharmony_ci	}
24498c2ecf20Sopenharmony_ci
24508c2ecf20Sopenharmony_ci	return 0;
24518c2ecf20Sopenharmony_ci}
24528c2ecf20Sopenharmony_ci
24538c2ecf20Sopenharmony_cistatic unsigned long reclaim_high(struct mem_cgroup *memcg,
24548c2ecf20Sopenharmony_ci				  unsigned int nr_pages,
24558c2ecf20Sopenharmony_ci				  gfp_t gfp_mask)
24568c2ecf20Sopenharmony_ci{
24578c2ecf20Sopenharmony_ci	unsigned long nr_reclaimed = 0;
24588c2ecf20Sopenharmony_ci
24598c2ecf20Sopenharmony_ci	do {
24608c2ecf20Sopenharmony_ci		unsigned long pflags;
24618c2ecf20Sopenharmony_ci
24628c2ecf20Sopenharmony_ci		if (page_counter_read(&memcg->memory) <=
24638c2ecf20Sopenharmony_ci		    READ_ONCE(memcg->memory.high))
24648c2ecf20Sopenharmony_ci			continue;
24658c2ecf20Sopenharmony_ci
24668c2ecf20Sopenharmony_ci		memcg_memory_event(memcg, MEMCG_HIGH);
24678c2ecf20Sopenharmony_ci
24688c2ecf20Sopenharmony_ci		psi_memstall_enter(&pflags);
24698c2ecf20Sopenharmony_ci		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
24708c2ecf20Sopenharmony_ci							     gfp_mask, true);
24718c2ecf20Sopenharmony_ci		psi_memstall_leave(&pflags);
24728c2ecf20Sopenharmony_ci	} while ((memcg = parent_mem_cgroup(memcg)) &&
24738c2ecf20Sopenharmony_ci		 !mem_cgroup_is_root(memcg));
24748c2ecf20Sopenharmony_ci
24758c2ecf20Sopenharmony_ci	return nr_reclaimed;
24768c2ecf20Sopenharmony_ci}
24778c2ecf20Sopenharmony_ci
24788c2ecf20Sopenharmony_cistatic void high_work_func(struct work_struct *work)
24798c2ecf20Sopenharmony_ci{
24808c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
24818c2ecf20Sopenharmony_ci
24828c2ecf20Sopenharmony_ci	memcg = container_of(work, struct mem_cgroup, high_work);
24838c2ecf20Sopenharmony_ci	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
24848c2ecf20Sopenharmony_ci}
24858c2ecf20Sopenharmony_ci
24868c2ecf20Sopenharmony_ci/*
24878c2ecf20Sopenharmony_ci * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
24888c2ecf20Sopenharmony_ci * long enough to cause a significant slowdown in most cases, while still
24898c2ecf20Sopenharmony_ci * allowing diagnostics and tracing to proceed without becoming stuck.
24908c2ecf20Sopenharmony_ci */
24918c2ecf20Sopenharmony_ci#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
24928c2ecf20Sopenharmony_ci
24938c2ecf20Sopenharmony_ci/*
24948c2ecf20Sopenharmony_ci * When calculating the delay, we use these on either side of the exponentiation
24958c2ecf20Sopenharmony_ci * to maintain precision and scale to a reasonable number of jiffies (see the
24968c2ecf20Sopenharmony_ci * table below).
24978c2ecf20Sopenharmony_ci *
24988c2ecf20Sopenharmony_ci * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
24998c2ecf20Sopenharmony_ci *   overage ratio to a delay.
25008c2ecf20Sopenharmony_ci * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
25018c2ecf20Sopenharmony_ci *   proposed penalty in order to reduce to a reasonable number of jiffies, and
25028c2ecf20Sopenharmony_ci *   to produce a reasonable delay curve.
25038c2ecf20Sopenharmony_ci *
25048c2ecf20Sopenharmony_ci * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
25058c2ecf20Sopenharmony_ci * reasonable delay curve compared to precision-adjusted overage, not
25068c2ecf20Sopenharmony_ci * penalising heavily at first, but still making sure that growth beyond the
25078c2ecf20Sopenharmony_ci * limit penalises misbehaving cgroups by slowing them down exponentially. For
25088c2ecf20Sopenharmony_ci * example, with a high of 100 megabytes:
25098c2ecf20Sopenharmony_ci *
25108c2ecf20Sopenharmony_ci *  +-------+------------------------+
25118c2ecf20Sopenharmony_ci *  | usage | time to allocate in ms |
25128c2ecf20Sopenharmony_ci *  +-------+------------------------+
25138c2ecf20Sopenharmony_ci *  | 100M  |                      0 |
25148c2ecf20Sopenharmony_ci *  | 101M  |                      6 |
25158c2ecf20Sopenharmony_ci *  | 102M  |                     25 |
25168c2ecf20Sopenharmony_ci *  | 103M  |                     57 |
25178c2ecf20Sopenharmony_ci *  | 104M  |                    102 |
25188c2ecf20Sopenharmony_ci *  | 105M  |                    159 |
25198c2ecf20Sopenharmony_ci *  | 106M  |                    230 |
25208c2ecf20Sopenharmony_ci *  | 107M  |                    313 |
25218c2ecf20Sopenharmony_ci *  | 108M  |                    409 |
25228c2ecf20Sopenharmony_ci *  | 109M  |                    518 |
25238c2ecf20Sopenharmony_ci *  | 110M  |                    639 |
25248c2ecf20Sopenharmony_ci *  | 111M  |                    774 |
25258c2ecf20Sopenharmony_ci *  | 112M  |                    921 |
25268c2ecf20Sopenharmony_ci *  | 113M  |                   1081 |
25278c2ecf20Sopenharmony_ci *  | 114M  |                   1254 |
25288c2ecf20Sopenharmony_ci *  | 115M  |                   1439 |
25298c2ecf20Sopenharmony_ci *  | 116M  |                   1638 |
25308c2ecf20Sopenharmony_ci *  | 117M  |                   1849 |
25318c2ecf20Sopenharmony_ci *  | 118M  |                   2000 |
25328c2ecf20Sopenharmony_ci *  | 119M  |                   2000 |
25338c2ecf20Sopenharmony_ci *  | 120M  |                   2000 |
25348c2ecf20Sopenharmony_ci *  +-------+------------------------+
25358c2ecf20Sopenharmony_ci */
25368c2ecf20Sopenharmony_ci#define MEMCG_DELAY_PRECISION_SHIFT 20
25378c2ecf20Sopenharmony_ci#define MEMCG_DELAY_SCALING_SHIFT 14
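/*
 * A rough worked example of the formula in calculate_high_delay() below
 * (assuming HZ == 1000, the 100M memory.high from the table above and a
 * charge of MEMCG_CHARGE_BATCH pages): at 101M of usage the overage is
 * roughly (1M / 100M) << 20 ~= 10486, and the penalty is
 * 10486 * 10486 * HZ >> (20 + 14) ~= 6 jiffies, i.e. the 6ms shown in the
 * 101M row.
 */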
25388c2ecf20Sopenharmony_ci
25398c2ecf20Sopenharmony_cistatic u64 calculate_overage(unsigned long usage, unsigned long high)
25408c2ecf20Sopenharmony_ci{
25418c2ecf20Sopenharmony_ci	u64 overage;
25428c2ecf20Sopenharmony_ci
25438c2ecf20Sopenharmony_ci	if (usage <= high)
25448c2ecf20Sopenharmony_ci		return 0;
25458c2ecf20Sopenharmony_ci
25468c2ecf20Sopenharmony_ci	/*
25478c2ecf20Sopenharmony_ci	 * Prevent division by 0 in overage calculation by acting as if
25488c2ecf20Sopenharmony_ci	 * it was a threshold of 1 page
25498c2ecf20Sopenharmony_ci	 */
25508c2ecf20Sopenharmony_ci	high = max(high, 1UL);
25518c2ecf20Sopenharmony_ci
25528c2ecf20Sopenharmony_ci	overage = usage - high;
25538c2ecf20Sopenharmony_ci	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
25548c2ecf20Sopenharmony_ci	return div64_u64(overage, high);
25558c2ecf20Sopenharmony_ci}
25568c2ecf20Sopenharmony_ci
25578c2ecf20Sopenharmony_cistatic u64 mem_find_max_overage(struct mem_cgroup *memcg)
25588c2ecf20Sopenharmony_ci{
25598c2ecf20Sopenharmony_ci	u64 overage, max_overage = 0;
25608c2ecf20Sopenharmony_ci
25618c2ecf20Sopenharmony_ci	do {
25628c2ecf20Sopenharmony_ci		overage = calculate_overage(page_counter_read(&memcg->memory),
25638c2ecf20Sopenharmony_ci					    READ_ONCE(memcg->memory.high));
25648c2ecf20Sopenharmony_ci		max_overage = max(overage, max_overage);
25658c2ecf20Sopenharmony_ci	} while ((memcg = parent_mem_cgroup(memcg)) &&
25668c2ecf20Sopenharmony_ci		 !mem_cgroup_is_root(memcg));
25678c2ecf20Sopenharmony_ci
25688c2ecf20Sopenharmony_ci	return max_overage;
25698c2ecf20Sopenharmony_ci}
25708c2ecf20Sopenharmony_ci
25718c2ecf20Sopenharmony_cistatic u64 swap_find_max_overage(struct mem_cgroup *memcg)
25728c2ecf20Sopenharmony_ci{
25738c2ecf20Sopenharmony_ci	u64 overage, max_overage = 0;
25748c2ecf20Sopenharmony_ci
25758c2ecf20Sopenharmony_ci	do {
25768c2ecf20Sopenharmony_ci		overage = calculate_overage(page_counter_read(&memcg->swap),
25778c2ecf20Sopenharmony_ci					    READ_ONCE(memcg->swap.high));
25788c2ecf20Sopenharmony_ci		if (overage)
25798c2ecf20Sopenharmony_ci			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
25808c2ecf20Sopenharmony_ci		max_overage = max(overage, max_overage);
25818c2ecf20Sopenharmony_ci	} while ((memcg = parent_mem_cgroup(memcg)) &&
25828c2ecf20Sopenharmony_ci		 !mem_cgroup_is_root(memcg));
25838c2ecf20Sopenharmony_ci
25848c2ecf20Sopenharmony_ci	return max_overage;
25858c2ecf20Sopenharmony_ci}
25868c2ecf20Sopenharmony_ci
25878c2ecf20Sopenharmony_ci/*
25888c2ecf20Sopenharmony_ci * Get the number of jiffies that we should penalise a mischievous cgroup which
25898c2ecf20Sopenharmony_ci * is exceeding its memory.high by checking both it and its ancestors.
25908c2ecf20Sopenharmony_ci */
25918c2ecf20Sopenharmony_cistatic unsigned long calculate_high_delay(struct mem_cgroup *memcg,
25928c2ecf20Sopenharmony_ci					  unsigned int nr_pages,
25938c2ecf20Sopenharmony_ci					  u64 max_overage)
25948c2ecf20Sopenharmony_ci{
25958c2ecf20Sopenharmony_ci	unsigned long penalty_jiffies;
25968c2ecf20Sopenharmony_ci
25978c2ecf20Sopenharmony_ci	if (!max_overage)
25988c2ecf20Sopenharmony_ci		return 0;
25998c2ecf20Sopenharmony_ci
26008c2ecf20Sopenharmony_ci	/*
26018c2ecf20Sopenharmony_ci	 * We use overage compared to memory.high to calculate the number of
26028c2ecf20Sopenharmony_ci	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
26038c2ecf20Sopenharmony_ci	 * fairly lenient on small overages, and increasingly harsh when the
26048c2ecf20Sopenharmony_ci	 * memcg in question makes it clear that it has no intention of stopping
26058c2ecf20Sopenharmony_ci	 * its crazy behaviour, so we exponentially increase the delay based on
26068c2ecf20Sopenharmony_ci	 * overage amount.
26078c2ecf20Sopenharmony_ci	 */
26088c2ecf20Sopenharmony_ci	penalty_jiffies = max_overage * max_overage * HZ;
26098c2ecf20Sopenharmony_ci	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
26108c2ecf20Sopenharmony_ci	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
26118c2ecf20Sopenharmony_ci
26128c2ecf20Sopenharmony_ci	/*
26138c2ecf20Sopenharmony_ci	 * Factor in the task's own contribution to the overage, such that four
26148c2ecf20Sopenharmony_ci	 * N-sized allocations are throttled approximately the same as one
26158c2ecf20Sopenharmony_ci	 * 4N-sized allocation.
26168c2ecf20Sopenharmony_ci	 *
26178c2ecf20Sopenharmony_ci	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
26188c2ecf20Sopenharmony_ci	 * larger the current charge batch is than that.
26198c2ecf20Sopenharmony_ci	 */
26208c2ecf20Sopenharmony_ci	return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
26218c2ecf20Sopenharmony_ci}
26228c2ecf20Sopenharmony_ci
26238c2ecf20Sopenharmony_ci/*
26248c2ecf20Sopenharmony_ci * Scheduled by try_charge() to be executed from the userland return path
26258c2ecf20Sopenharmony_ci * and reclaims memory over the high limit.
26268c2ecf20Sopenharmony_ci */
26278c2ecf20Sopenharmony_civoid mem_cgroup_handle_over_high(void)
26288c2ecf20Sopenharmony_ci{
26298c2ecf20Sopenharmony_ci	unsigned long penalty_jiffies;
26308c2ecf20Sopenharmony_ci	unsigned long pflags;
26318c2ecf20Sopenharmony_ci	unsigned long nr_reclaimed;
26328c2ecf20Sopenharmony_ci	unsigned int nr_pages = current->memcg_nr_pages_over_high;
26338c2ecf20Sopenharmony_ci	int nr_retries = MAX_RECLAIM_RETRIES;
26348c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
26358c2ecf20Sopenharmony_ci	bool in_retry = false;
26368c2ecf20Sopenharmony_ci
26378c2ecf20Sopenharmony_ci	if (likely(!nr_pages))
26388c2ecf20Sopenharmony_ci		return;
26398c2ecf20Sopenharmony_ci
26408c2ecf20Sopenharmony_ci	memcg = get_mem_cgroup_from_mm(current->mm);
26418c2ecf20Sopenharmony_ci	current->memcg_nr_pages_over_high = 0;
26428c2ecf20Sopenharmony_ci
26438c2ecf20Sopenharmony_ciretry_reclaim:
26448c2ecf20Sopenharmony_ci	/*
26458c2ecf20Sopenharmony_ci	 * The allocating task should reclaim at least the batch size, but for
26468c2ecf20Sopenharmony_ci	 * subsequent retries we only want to do what's necessary to prevent oom
26478c2ecf20Sopenharmony_ci	 * or breaching resource isolation.
26488c2ecf20Sopenharmony_ci	 *
26498c2ecf20Sopenharmony_ci	 * This is distinct from memory.max or page allocator behaviour because
26508c2ecf20Sopenharmony_ci	 * memory.high is currently batched, whereas memory.max and the page
26518c2ecf20Sopenharmony_ci	 * allocator run every time an allocation is made.
26528c2ecf20Sopenharmony_ci	 */
26538c2ecf20Sopenharmony_ci	nr_reclaimed = reclaim_high(memcg,
26548c2ecf20Sopenharmony_ci				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
26558c2ecf20Sopenharmony_ci				    GFP_KERNEL);
26568c2ecf20Sopenharmony_ci
26578c2ecf20Sopenharmony_ci	/*
26588c2ecf20Sopenharmony_ci	 * memory.high is breached and reclaim is unable to keep up. Throttle
26598c2ecf20Sopenharmony_ci	 * allocators proactively to slow down excessive growth.
26608c2ecf20Sopenharmony_ci	 */
26618c2ecf20Sopenharmony_ci	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
26628c2ecf20Sopenharmony_ci					       mem_find_max_overage(memcg));
26638c2ecf20Sopenharmony_ci
26648c2ecf20Sopenharmony_ci	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
26658c2ecf20Sopenharmony_ci						swap_find_max_overage(memcg));
26668c2ecf20Sopenharmony_ci
26678c2ecf20Sopenharmony_ci	/*
26688c2ecf20Sopenharmony_ci	 * Clamp the max delay per usermode return so as to still keep the
26698c2ecf20Sopenharmony_ci	 * application moving forwards and also permit diagnostics, albeit
26708c2ecf20Sopenharmony_ci	 * extremely slowly.
26718c2ecf20Sopenharmony_ci	 */
26728c2ecf20Sopenharmony_ci	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
26738c2ecf20Sopenharmony_ci
26748c2ecf20Sopenharmony_ci	/*
26758c2ecf20Sopenharmony_ci	 * Don't sleep if the amount of jiffies this memcg owes us is so low
26768c2ecf20Sopenharmony_ci	 * that it's not even worth doing, in an attempt to be nice to those who
26778c2ecf20Sopenharmony_ci	 * go only a small amount over their memory.high value and maybe haven't
26788c2ecf20Sopenharmony_ci	 * been aggressively reclaimed enough yet.
26798c2ecf20Sopenharmony_ci	 */
26808c2ecf20Sopenharmony_ci	if (penalty_jiffies <= HZ / 100)
26818c2ecf20Sopenharmony_ci		goto out;
26828c2ecf20Sopenharmony_ci
26838c2ecf20Sopenharmony_ci	/*
26848c2ecf20Sopenharmony_ci	 * If reclaim is making forward progress but we're still over
26858c2ecf20Sopenharmony_ci	 * memory.high, we want to encourage that rather than doing allocator
26868c2ecf20Sopenharmony_ci	 * throttling.
26878c2ecf20Sopenharmony_ci	 */
26888c2ecf20Sopenharmony_ci	if (nr_reclaimed || nr_retries--) {
26898c2ecf20Sopenharmony_ci		in_retry = true;
26908c2ecf20Sopenharmony_ci		goto retry_reclaim;
26918c2ecf20Sopenharmony_ci	}
26928c2ecf20Sopenharmony_ci
26938c2ecf20Sopenharmony_ci	/*
26948c2ecf20Sopenharmony_ci	 * If we exit early, we're guaranteed to die (since
26958c2ecf20Sopenharmony_ci	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
26968c2ecf20Sopenharmony_ci	 * need to account for any ill-begotten jiffies to pay them off later.
26978c2ecf20Sopenharmony_ci	 */
26988c2ecf20Sopenharmony_ci	psi_memstall_enter(&pflags);
26998c2ecf20Sopenharmony_ci	schedule_timeout_killable(penalty_jiffies);
27008c2ecf20Sopenharmony_ci	psi_memstall_leave(&pflags);
27018c2ecf20Sopenharmony_ci
27028c2ecf20Sopenharmony_ciout:
27038c2ecf20Sopenharmony_ci	css_put(&memcg->css);
27048c2ecf20Sopenharmony_ci}
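/*
 * For a concrete sense of scale (a sketch, not a guarantee): a task that
 * charged 64 pages past memory.high across two try_charge() calls enters
 * the function above with memcg_nr_pages_over_high == 64, so the first
 * reclaim_high() pass targets those 64 pages, while retries fall back to
 * SWAP_CLUSTER_MAX to avoid overreclaiming.
 */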
27058c2ecf20Sopenharmony_ci
27068c2ecf20Sopenharmony_cistatic int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
27078c2ecf20Sopenharmony_ci		      unsigned int nr_pages)
27088c2ecf20Sopenharmony_ci{
27098c2ecf20Sopenharmony_ci	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
27108c2ecf20Sopenharmony_ci	int nr_retries = MAX_RECLAIM_RETRIES;
27118c2ecf20Sopenharmony_ci	struct mem_cgroup *mem_over_limit;
27128c2ecf20Sopenharmony_ci	struct page_counter *counter;
27138c2ecf20Sopenharmony_ci	enum oom_status oom_status;
27148c2ecf20Sopenharmony_ci	unsigned long nr_reclaimed;
27158c2ecf20Sopenharmony_ci	bool passed_oom = false;
27168c2ecf20Sopenharmony_ci	bool may_swap = true;
27178c2ecf20Sopenharmony_ci	bool drained = false;
27188c2ecf20Sopenharmony_ci	unsigned long pflags;
27198c2ecf20Sopenharmony_ci
27208c2ecf20Sopenharmony_ci	if (mem_cgroup_is_root(memcg))
27218c2ecf20Sopenharmony_ci		return 0;
27228c2ecf20Sopenharmony_ciretry:
27238c2ecf20Sopenharmony_ci	if (consume_stock(memcg, nr_pages))
27248c2ecf20Sopenharmony_ci		return 0;
27258c2ecf20Sopenharmony_ci
27268c2ecf20Sopenharmony_ci	if (!do_memsw_account() ||
27278c2ecf20Sopenharmony_ci	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
27288c2ecf20Sopenharmony_ci		if (page_counter_try_charge(&memcg->memory, batch, &counter))
27298c2ecf20Sopenharmony_ci			goto done_restock;
27308c2ecf20Sopenharmony_ci		if (do_memsw_account())
27318c2ecf20Sopenharmony_ci			page_counter_uncharge(&memcg->memsw, batch);
27328c2ecf20Sopenharmony_ci		mem_over_limit = mem_cgroup_from_counter(counter, memory);
27338c2ecf20Sopenharmony_ci	} else {
27348c2ecf20Sopenharmony_ci		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
27358c2ecf20Sopenharmony_ci		may_swap = false;
27368c2ecf20Sopenharmony_ci	}
27378c2ecf20Sopenharmony_ci
27388c2ecf20Sopenharmony_ci	if (batch > nr_pages) {
27398c2ecf20Sopenharmony_ci		batch = nr_pages;
27408c2ecf20Sopenharmony_ci		goto retry;
27418c2ecf20Sopenharmony_ci	}
27428c2ecf20Sopenharmony_ci
27438c2ecf20Sopenharmony_ci	/*
27448c2ecf20Sopenharmony_ci	 * Memcg doesn't have a dedicated reserve for atomic
27458c2ecf20Sopenharmony_ci	 * allocations. But like the global atomic pool, we need to
27468c2ecf20Sopenharmony_ci	 * put the burden of reclaim on regular allocation requests
27478c2ecf20Sopenharmony_ci	 * and let these go through as privileged allocations.
27488c2ecf20Sopenharmony_ci	 */
27498c2ecf20Sopenharmony_ci	if (gfp_mask & __GFP_ATOMIC)
27508c2ecf20Sopenharmony_ci		goto force;
27518c2ecf20Sopenharmony_ci
27528c2ecf20Sopenharmony_ci	/*
27538c2ecf20Sopenharmony_ci	 * Prevent unbounded recursion when reclaim operations need to
27548c2ecf20Sopenharmony_ci	 * allocate memory. This might exceed the limits temporarily,
27558c2ecf20Sopenharmony_ci	 * but we prefer facilitating memory reclaim and getting back
27568c2ecf20Sopenharmony_ci	 * under the limit over triggering OOM kills in these cases.
27578c2ecf20Sopenharmony_ci	 */
27588c2ecf20Sopenharmony_ci	if (unlikely(current->flags & PF_MEMALLOC))
27598c2ecf20Sopenharmony_ci		goto force;
27608c2ecf20Sopenharmony_ci
27618c2ecf20Sopenharmony_ci	if (unlikely(task_in_memcg_oom(current)))
27628c2ecf20Sopenharmony_ci		goto nomem;
27638c2ecf20Sopenharmony_ci
27648c2ecf20Sopenharmony_ci	if (!gfpflags_allow_blocking(gfp_mask))
27658c2ecf20Sopenharmony_ci		goto nomem;
27668c2ecf20Sopenharmony_ci
27678c2ecf20Sopenharmony_ci	memcg_memory_event(mem_over_limit, MEMCG_MAX);
27688c2ecf20Sopenharmony_ci
27698c2ecf20Sopenharmony_ci	psi_memstall_enter(&pflags);
27708c2ecf20Sopenharmony_ci	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
27718c2ecf20Sopenharmony_ci						    gfp_mask, may_swap);
27728c2ecf20Sopenharmony_ci	psi_memstall_leave(&pflags);
27738c2ecf20Sopenharmony_ci
27748c2ecf20Sopenharmony_ci	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
27758c2ecf20Sopenharmony_ci		goto retry;
27768c2ecf20Sopenharmony_ci
27778c2ecf20Sopenharmony_ci	if (!drained) {
27788c2ecf20Sopenharmony_ci		drain_all_stock(mem_over_limit);
27798c2ecf20Sopenharmony_ci		drained = true;
27808c2ecf20Sopenharmony_ci		goto retry;
27818c2ecf20Sopenharmony_ci	}
27828c2ecf20Sopenharmony_ci
27838c2ecf20Sopenharmony_ci	if (gfp_mask & __GFP_NORETRY)
27848c2ecf20Sopenharmony_ci		goto nomem;
27858c2ecf20Sopenharmony_ci	/*
27868c2ecf20Sopenharmony_ci	 * Even though the limit is exceeded at this point, reclaim
27878c2ecf20Sopenharmony_ci	 * may have been able to free some pages.  Retry the charge
27888c2ecf20Sopenharmony_ci	 * before killing the task.
27898c2ecf20Sopenharmony_ci	 *
27908c2ecf20Sopenharmony_ci	 * Only for regular pages, though: huge pages are rather
27918c2ecf20Sopenharmony_ci	 * unlikely to succeed so close to the limit, and we fall back
27928c2ecf20Sopenharmony_ci	 * to regular pages anyway in case of failure.
27938c2ecf20Sopenharmony_ci	 */
27948c2ecf20Sopenharmony_ci	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
27958c2ecf20Sopenharmony_ci		goto retry;
27968c2ecf20Sopenharmony_ci	/*
27978c2ecf20Sopenharmony_ci	 * At task move, charge accounts can be doubly counted. So, it's
27988c2ecf20Sopenharmony_ci	 * better to wait until the end of task_move if something is going on.
27998c2ecf20Sopenharmony_ci	 */
28008c2ecf20Sopenharmony_ci	if (mem_cgroup_wait_acct_move(mem_over_limit))
28018c2ecf20Sopenharmony_ci		goto retry;
28028c2ecf20Sopenharmony_ci
28038c2ecf20Sopenharmony_ci	if (nr_retries--)
28048c2ecf20Sopenharmony_ci		goto retry;
28058c2ecf20Sopenharmony_ci
28068c2ecf20Sopenharmony_ci	if (gfp_mask & __GFP_RETRY_MAYFAIL)
28078c2ecf20Sopenharmony_ci		goto nomem;
28088c2ecf20Sopenharmony_ci
28098c2ecf20Sopenharmony_ci	if (gfp_mask & __GFP_NOFAIL)
28108c2ecf20Sopenharmony_ci		goto force;
28118c2ecf20Sopenharmony_ci
28128c2ecf20Sopenharmony_ci	/* Avoid endless loop for tasks bypassed by the oom killer */
28138c2ecf20Sopenharmony_ci	if (passed_oom && task_is_dying())
28148c2ecf20Sopenharmony_ci		goto nomem;
28158c2ecf20Sopenharmony_ci
28168c2ecf20Sopenharmony_ci	/*
28178c2ecf20Sopenharmony_ci	 * Keep retrying as long as the memcg oom killer is able to make
28188c2ecf20Sopenharmony_ci	 * forward progress, or bypass the charge if the oom killer
28198c2ecf20Sopenharmony_ci	 * couldn't make any progress.
28208c2ecf20Sopenharmony_ci	 */
28218c2ecf20Sopenharmony_ci	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
28228c2ecf20Sopenharmony_ci		       get_order(nr_pages * PAGE_SIZE));
28238c2ecf20Sopenharmony_ci	if (oom_status == OOM_SUCCESS) {
28248c2ecf20Sopenharmony_ci		passed_oom = true;
28258c2ecf20Sopenharmony_ci		nr_retries = MAX_RECLAIM_RETRIES;
28268c2ecf20Sopenharmony_ci		goto retry;
28278c2ecf20Sopenharmony_ci	}
28288c2ecf20Sopenharmony_cinomem:
28298c2ecf20Sopenharmony_ci	if (!(gfp_mask & __GFP_NOFAIL))
28308c2ecf20Sopenharmony_ci		return -ENOMEM;
28318c2ecf20Sopenharmony_ciforce:
28328c2ecf20Sopenharmony_ci	/*
28338c2ecf20Sopenharmony_ci	 * The allocation either can't fail or will lead to more memory
28348c2ecf20Sopenharmony_ci	 * being freed very soon.  Allow memory usage to go over the limit
28358c2ecf20Sopenharmony_ci	 * temporarily by force charging it.
28368c2ecf20Sopenharmony_ci	 */
28378c2ecf20Sopenharmony_ci	page_counter_charge(&memcg->memory, nr_pages);
28388c2ecf20Sopenharmony_ci	if (do_memsw_account())
28398c2ecf20Sopenharmony_ci		page_counter_charge(&memcg->memsw, nr_pages);
28408c2ecf20Sopenharmony_ci
28418c2ecf20Sopenharmony_ci	return 0;
28428c2ecf20Sopenharmony_ci
28438c2ecf20Sopenharmony_cidone_restock:
28448c2ecf20Sopenharmony_ci	if (batch > nr_pages)
28458c2ecf20Sopenharmony_ci		refill_stock(memcg, batch - nr_pages);
28468c2ecf20Sopenharmony_ci
28478c2ecf20Sopenharmony_ci	/*
28488c2ecf20Sopenharmony_ci	 * If the hierarchy is above the normal consumption range, schedule
28498c2ecf20Sopenharmony_ci	 * reclaim on returning to userland.  We can perform reclaim here
28508c2ecf20Sopenharmony_ci	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
28518c2ecf20Sopenharmony_ci	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
28528c2ecf20Sopenharmony_ci	 * not recorded as it most likely matches current's and won't
28538c2ecf20Sopenharmony_ci	 * change in the meantime.  As high limit is checked again before
28548c2ecf20Sopenharmony_ci	 * reclaim, the cost of mismatch is negligible.
28558c2ecf20Sopenharmony_ci	 */
28568c2ecf20Sopenharmony_ci	do {
28578c2ecf20Sopenharmony_ci		bool mem_high, swap_high;
28588c2ecf20Sopenharmony_ci
28598c2ecf20Sopenharmony_ci		mem_high = page_counter_read(&memcg->memory) >
28608c2ecf20Sopenharmony_ci			READ_ONCE(memcg->memory.high);
28618c2ecf20Sopenharmony_ci		swap_high = page_counter_read(&memcg->swap) >
28628c2ecf20Sopenharmony_ci			READ_ONCE(memcg->swap.high);
28638c2ecf20Sopenharmony_ci
28648c2ecf20Sopenharmony_ci		/* Don't bother a random interrupted task */
28658c2ecf20Sopenharmony_ci		if (in_interrupt()) {
28668c2ecf20Sopenharmony_ci			if (mem_high) {
28678c2ecf20Sopenharmony_ci				schedule_work(&memcg->high_work);
28688c2ecf20Sopenharmony_ci				break;
28698c2ecf20Sopenharmony_ci			}
28708c2ecf20Sopenharmony_ci			continue;
28718c2ecf20Sopenharmony_ci		}
28728c2ecf20Sopenharmony_ci
28738c2ecf20Sopenharmony_ci		if (mem_high || swap_high) {
28748c2ecf20Sopenharmony_ci			/*
28758c2ecf20Sopenharmony_ci			 * The allocating tasks in this cgroup will need to do
28768c2ecf20Sopenharmony_ci			 * reclaim or be throttled to prevent further growth
28778c2ecf20Sopenharmony_ci			 * of the memory or swap footprints.
28788c2ecf20Sopenharmony_ci			 *
28798c2ecf20Sopenharmony_ci			 * Target some best-effort fairness between the tasks,
28808c2ecf20Sopenharmony_ci			 * and distribute reclaim work and delay penalties
28818c2ecf20Sopenharmony_ci			 * based on how much each task is actually allocating.
28828c2ecf20Sopenharmony_ci			 */
28838c2ecf20Sopenharmony_ci			current->memcg_nr_pages_over_high += batch;
28848c2ecf20Sopenharmony_ci			set_notify_resume(current);
28858c2ecf20Sopenharmony_ci			break;
28868c2ecf20Sopenharmony_ci		}
28878c2ecf20Sopenharmony_ci	} while ((memcg = parent_mem_cgroup(memcg)));
28888c2ecf20Sopenharmony_ci
28898c2ecf20Sopenharmony_ci	return 0;
28908c2ecf20Sopenharmony_ci}
28918c2ecf20Sopenharmony_ci
28928c2ecf20Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
28938c2ecf20Sopenharmony_cistatic void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
28948c2ecf20Sopenharmony_ci{
28958c2ecf20Sopenharmony_ci	if (mem_cgroup_is_root(memcg))
28968c2ecf20Sopenharmony_ci		return;
28978c2ecf20Sopenharmony_ci
28988c2ecf20Sopenharmony_ci	page_counter_uncharge(&memcg->memory, nr_pages);
28998c2ecf20Sopenharmony_ci	if (do_memsw_account())
29008c2ecf20Sopenharmony_ci		page_counter_uncharge(&memcg->memsw, nr_pages);
29018c2ecf20Sopenharmony_ci}
29028c2ecf20Sopenharmony_ci#endif
29038c2ecf20Sopenharmony_ci
29048c2ecf20Sopenharmony_cistatic void commit_charge(struct page *page, struct mem_cgroup *memcg)
29058c2ecf20Sopenharmony_ci{
29068c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(page->mem_cgroup, page);
29078c2ecf20Sopenharmony_ci	/*
29088c2ecf20Sopenharmony_ci	 * Any of the following ensures page->mem_cgroup stability:
29098c2ecf20Sopenharmony_ci	 *
29108c2ecf20Sopenharmony_ci	 * - the page lock
29118c2ecf20Sopenharmony_ci	 * - LRU isolation
29128c2ecf20Sopenharmony_ci	 * - lock_page_memcg()
29138c2ecf20Sopenharmony_ci	 * - exclusive reference
29148c2ecf20Sopenharmony_ci	 */
29158c2ecf20Sopenharmony_ci	page->mem_cgroup = memcg;
29168c2ecf20Sopenharmony_ci}
29178c2ecf20Sopenharmony_ci
29188c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
29198c2ecf20Sopenharmony_ci/*
29208c2ecf20Sopenharmony_ci * The allocated objcg pointers array is not accounted directly.
29218c2ecf20Sopenharmony_ci * Moreover, it should not come from a DMA buffer and is not readily
29228c2ecf20Sopenharmony_ci * reclaimable. So those GFP bits should be masked off.
29238c2ecf20Sopenharmony_ci */
29248c2ecf20Sopenharmony_ci#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
29258c2ecf20Sopenharmony_ci				 __GFP_ACCOUNT | __GFP_NOFAIL)
29268c2ecf20Sopenharmony_ci
29278c2ecf20Sopenharmony_ciint memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
29288c2ecf20Sopenharmony_ci				 gfp_t gfp)
29298c2ecf20Sopenharmony_ci{
29308c2ecf20Sopenharmony_ci	unsigned int objects = objs_per_slab_page(s, page);
29318c2ecf20Sopenharmony_ci	void *vec;
29328c2ecf20Sopenharmony_ci
29338c2ecf20Sopenharmony_ci	gfp &= ~OBJCGS_CLEAR_MASK;
29348c2ecf20Sopenharmony_ci	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
29358c2ecf20Sopenharmony_ci			   page_to_nid(page));
29368c2ecf20Sopenharmony_ci	if (!vec)
29378c2ecf20Sopenharmony_ci		return -ENOMEM;
29388c2ecf20Sopenharmony_ci
29398c2ecf20Sopenharmony_ci	if (cmpxchg(&page->obj_cgroups, NULL,
29408c2ecf20Sopenharmony_ci		    (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
29418c2ecf20Sopenharmony_ci		kfree(vec);
29428c2ecf20Sopenharmony_ci	else
29438c2ecf20Sopenharmony_ci		kmemleak_not_leak(vec);
29448c2ecf20Sopenharmony_ci
29458c2ecf20Sopenharmony_ci	return 0;
29468c2ecf20Sopenharmony_ci}
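/*
 * Tagging sketch: the vector is stored as (vec | 0x1UL), so the low bit of
 * page->obj_cgroups distinguishes an obj_cgroup vector from a plain
 * page->mem_cgroup pointer (see mem_cgroup_from_obj() below).  Readers mask
 * the bit off again, roughly:
 *
 *	(struct obj_cgroup **)((unsigned long)page->obj_cgroups & ~0x1UL)
 *
 * before indexing into the vector.
 */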
29478c2ecf20Sopenharmony_ci
29488c2ecf20Sopenharmony_ci/*
29498c2ecf20Sopenharmony_ci * Returns a pointer to the memory cgroup to which the kernel object is charged.
29508c2ecf20Sopenharmony_ci *
29518c2ecf20Sopenharmony_ci * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
29528c2ecf20Sopenharmony_ci * cgroup_mutex, etc.
29538c2ecf20Sopenharmony_ci */
29548c2ecf20Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_obj(void *p)
29558c2ecf20Sopenharmony_ci{
29568c2ecf20Sopenharmony_ci	struct page *page;
29578c2ecf20Sopenharmony_ci
29588c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
29598c2ecf20Sopenharmony_ci		return NULL;
29608c2ecf20Sopenharmony_ci
29618c2ecf20Sopenharmony_ci	page = virt_to_head_page(p);
29628c2ecf20Sopenharmony_ci
29638c2ecf20Sopenharmony_ci	/*
29648c2ecf20Sopenharmony_ci	 * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
29658c2ecf20Sopenharmony_ci	 * or a pointer to obj_cgroup vector. In the latter case the lowest
29668c2ecf20Sopenharmony_ci	 * bit of the pointer is set.
29678c2ecf20Sopenharmony_ci	 * The page->mem_cgroup pointer can be asynchronously changed
29688c2ecf20Sopenharmony_ci	 * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
29698c2ecf20Sopenharmony_ci	 * from a valid memcg pointer to objcg vector or back.
29708c2ecf20Sopenharmony_ci	 */
29718c2ecf20Sopenharmony_ci	if (!page->mem_cgroup)
29728c2ecf20Sopenharmony_ci		return NULL;
29738c2ecf20Sopenharmony_ci
29748c2ecf20Sopenharmony_ci	/*
29758c2ecf20Sopenharmony_ci	 * Slab objects are accounted individually, not per-page.
29768c2ecf20Sopenharmony_ci	 * Memcg membership data for each individual object is saved in
29778c2ecf20Sopenharmony_ci	 * the page->obj_cgroups.
29788c2ecf20Sopenharmony_ci	 */
29798c2ecf20Sopenharmony_ci	if (page_has_obj_cgroups(page)) {
29808c2ecf20Sopenharmony_ci		struct obj_cgroup *objcg;
29818c2ecf20Sopenharmony_ci		unsigned int off;
29828c2ecf20Sopenharmony_ci
29838c2ecf20Sopenharmony_ci		off = obj_to_index(page->slab_cache, page, p);
29848c2ecf20Sopenharmony_ci		objcg = page_obj_cgroups(page)[off];
29858c2ecf20Sopenharmony_ci		if (objcg)
29868c2ecf20Sopenharmony_ci			return obj_cgroup_memcg(objcg);
29878c2ecf20Sopenharmony_ci
29888c2ecf20Sopenharmony_ci		return NULL;
29898c2ecf20Sopenharmony_ci	}
29908c2ecf20Sopenharmony_ci
29918c2ecf20Sopenharmony_ci	/* All other pages use page->mem_cgroup */
29928c2ecf20Sopenharmony_ci	return page->mem_cgroup;
29938c2ecf20Sopenharmony_ci}
29948c2ecf20Sopenharmony_ci
29958c2ecf20Sopenharmony_ci__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
29968c2ecf20Sopenharmony_ci{
29978c2ecf20Sopenharmony_ci	struct obj_cgroup *objcg = NULL;
29988c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
29998c2ecf20Sopenharmony_ci
30008c2ecf20Sopenharmony_ci	if (memcg_kmem_bypass())
30018c2ecf20Sopenharmony_ci		return NULL;
30028c2ecf20Sopenharmony_ci
30038c2ecf20Sopenharmony_ci	rcu_read_lock();
30048c2ecf20Sopenharmony_ci	if (unlikely(active_memcg()))
30058c2ecf20Sopenharmony_ci		memcg = active_memcg();
30068c2ecf20Sopenharmony_ci	else
30078c2ecf20Sopenharmony_ci		memcg = mem_cgroup_from_task(current);
30088c2ecf20Sopenharmony_ci
30098c2ecf20Sopenharmony_ci	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
30108c2ecf20Sopenharmony_ci		objcg = rcu_dereference(memcg->objcg);
30118c2ecf20Sopenharmony_ci		if (objcg && obj_cgroup_tryget(objcg))
30128c2ecf20Sopenharmony_ci			break;
30138c2ecf20Sopenharmony_ci		objcg = NULL;
30148c2ecf20Sopenharmony_ci	}
30158c2ecf20Sopenharmony_ci	rcu_read_unlock();
30168c2ecf20Sopenharmony_ci
30178c2ecf20Sopenharmony_ci	return objcg;
30188c2ecf20Sopenharmony_ci}
30198c2ecf20Sopenharmony_ci
30208c2ecf20Sopenharmony_cistatic int memcg_alloc_cache_id(void)
30218c2ecf20Sopenharmony_ci{
30228c2ecf20Sopenharmony_ci	int id, size;
30238c2ecf20Sopenharmony_ci	int err;
30248c2ecf20Sopenharmony_ci
30258c2ecf20Sopenharmony_ci	id = ida_simple_get(&memcg_cache_ida,
30268c2ecf20Sopenharmony_ci			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
30278c2ecf20Sopenharmony_ci	if (id < 0)
30288c2ecf20Sopenharmony_ci		return id;
30298c2ecf20Sopenharmony_ci
30308c2ecf20Sopenharmony_ci	if (id < memcg_nr_cache_ids)
30318c2ecf20Sopenharmony_ci		return id;
30328c2ecf20Sopenharmony_ci
30338c2ecf20Sopenharmony_ci	/*
30348c2ecf20Sopenharmony_ci	 * There's no space for the new id in memcg_caches arrays,
30358c2ecf20Sopenharmony_ci	 * so we have to grow them.
30368c2ecf20Sopenharmony_ci	 */
30378c2ecf20Sopenharmony_ci	down_write(&memcg_cache_ids_sem);
30388c2ecf20Sopenharmony_ci
30398c2ecf20Sopenharmony_ci	size = 2 * (id + 1);
30408c2ecf20Sopenharmony_ci	if (size < MEMCG_CACHES_MIN_SIZE)
30418c2ecf20Sopenharmony_ci		size = MEMCG_CACHES_MIN_SIZE;
30428c2ecf20Sopenharmony_ci	else if (size > MEMCG_CACHES_MAX_SIZE)
30438c2ecf20Sopenharmony_ci		size = MEMCG_CACHES_MAX_SIZE;
30448c2ecf20Sopenharmony_ci
30458c2ecf20Sopenharmony_ci	err = memcg_update_all_list_lrus(size);
30468c2ecf20Sopenharmony_ci	if (!err)
30478c2ecf20Sopenharmony_ci		memcg_nr_cache_ids = size;
30488c2ecf20Sopenharmony_ci
30498c2ecf20Sopenharmony_ci	up_write(&memcg_cache_ids_sem);
30508c2ecf20Sopenharmony_ci
30518c2ecf20Sopenharmony_ci	if (err) {
30528c2ecf20Sopenharmony_ci		ida_simple_remove(&memcg_cache_ida, id);
30538c2ecf20Sopenharmony_ci		return err;
30548c2ecf20Sopenharmony_ci	}
30558c2ecf20Sopenharmony_ci	return id;
30568c2ecf20Sopenharmony_ci}
30578c2ecf20Sopenharmony_ci
30588c2ecf20Sopenharmony_cistatic void memcg_free_cache_id(int id)
30598c2ecf20Sopenharmony_ci{
30608c2ecf20Sopenharmony_ci	ida_simple_remove(&memcg_cache_ida, id);
30618c2ecf20Sopenharmony_ci}
30628c2ecf20Sopenharmony_ci
30638c2ecf20Sopenharmony_ci/**
30648c2ecf20Sopenharmony_ci * __memcg_kmem_charge: charge a number of kernel pages to a memcg
30658c2ecf20Sopenharmony_ci * @memcg: memory cgroup to charge
30668c2ecf20Sopenharmony_ci * @gfp: reclaim mode
30678c2ecf20Sopenharmony_ci * @nr_pages: number of pages to charge
30688c2ecf20Sopenharmony_ci *
30698c2ecf20Sopenharmony_ci * Returns 0 on success, an error code on failure.
30708c2ecf20Sopenharmony_ci */
30718c2ecf20Sopenharmony_ciint __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
30728c2ecf20Sopenharmony_ci			unsigned int nr_pages)
30738c2ecf20Sopenharmony_ci{
30748c2ecf20Sopenharmony_ci	struct page_counter *counter;
30758c2ecf20Sopenharmony_ci	int ret;
30768c2ecf20Sopenharmony_ci
30778c2ecf20Sopenharmony_ci	ret = try_charge(memcg, gfp, nr_pages);
30788c2ecf20Sopenharmony_ci	if (ret)
30798c2ecf20Sopenharmony_ci		return ret;
30808c2ecf20Sopenharmony_ci
30818c2ecf20Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
30828c2ecf20Sopenharmony_ci	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
30838c2ecf20Sopenharmony_ci
30848c2ecf20Sopenharmony_ci		/*
30858c2ecf20Sopenharmony_ci		 * Enforce __GFP_NOFAIL allocation because callers are not
30868c2ecf20Sopenharmony_ci		 * prepared to see failures and likely do not have any failure
30878c2ecf20Sopenharmony_ci		 * handling code.
30888c2ecf20Sopenharmony_ci		 */
30898c2ecf20Sopenharmony_ci		if (gfp & __GFP_NOFAIL) {
30908c2ecf20Sopenharmony_ci			page_counter_charge(&memcg->kmem, nr_pages);
30918c2ecf20Sopenharmony_ci			return 0;
30928c2ecf20Sopenharmony_ci		}
30938c2ecf20Sopenharmony_ci		cancel_charge(memcg, nr_pages);
30948c2ecf20Sopenharmony_ci		return -ENOMEM;
30958c2ecf20Sopenharmony_ci	}
30968c2ecf20Sopenharmony_ci	return 0;
30978c2ecf20Sopenharmony_ci}
30988c2ecf20Sopenharmony_ci
30998c2ecf20Sopenharmony_ci/**
31008c2ecf20Sopenharmony_ci * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
31018c2ecf20Sopenharmony_ci * @memcg: memcg to uncharge
31028c2ecf20Sopenharmony_ci * @nr_pages: number of pages to uncharge
31038c2ecf20Sopenharmony_ci */
31048c2ecf20Sopenharmony_civoid __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
31058c2ecf20Sopenharmony_ci{
31068c2ecf20Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
31078c2ecf20Sopenharmony_ci		page_counter_uncharge(&memcg->kmem, nr_pages);
31088c2ecf20Sopenharmony_ci
31098c2ecf20Sopenharmony_ci	refill_stock(memcg, nr_pages);
31108c2ecf20Sopenharmony_ci}
31118c2ecf20Sopenharmony_ci
31128c2ecf20Sopenharmony_ci/**
31138c2ecf20Sopenharmony_ci * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
31148c2ecf20Sopenharmony_ci * @page: page to charge
31158c2ecf20Sopenharmony_ci * @gfp: reclaim mode
31168c2ecf20Sopenharmony_ci * @order: allocation order
31178c2ecf20Sopenharmony_ci *
31188c2ecf20Sopenharmony_ci * Returns 0 on success, an error code on failure.
31198c2ecf20Sopenharmony_ci */
31208c2ecf20Sopenharmony_ciint __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
31218c2ecf20Sopenharmony_ci{
31228c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
31238c2ecf20Sopenharmony_ci	int ret = 0;
31248c2ecf20Sopenharmony_ci
31258c2ecf20Sopenharmony_ci	memcg = get_mem_cgroup_from_current();
31268c2ecf20Sopenharmony_ci	if (memcg && !mem_cgroup_is_root(memcg)) {
31278c2ecf20Sopenharmony_ci		ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
31288c2ecf20Sopenharmony_ci		if (!ret) {
31298c2ecf20Sopenharmony_ci			page->mem_cgroup = memcg;
31308c2ecf20Sopenharmony_ci			__SetPageKmemcg(page);
31318c2ecf20Sopenharmony_ci			return 0;
31328c2ecf20Sopenharmony_ci		}
31338c2ecf20Sopenharmony_ci		css_put(&memcg->css);
31348c2ecf20Sopenharmony_ci	}
31358c2ecf20Sopenharmony_ci	return ret;
31368c2ecf20Sopenharmony_ci}
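/*
 * A usage sketch: this is the hook behind accounted page allocations such as
 *
 *	page = alloc_pages(GFP_KERNEL | __GFP_ACCOUNT, order);
 *
 * where the page allocator, with kmem accounting enabled, ends up calling
 * in here and the pages are charged to the allocating task's memcg.
 */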
31378c2ecf20Sopenharmony_ci
31388c2ecf20Sopenharmony_ci/**
31398c2ecf20Sopenharmony_ci * __memcg_kmem_uncharge_page: uncharge a kmem page
31408c2ecf20Sopenharmony_ci * @page: page to uncharge
31418c2ecf20Sopenharmony_ci * @order: allocation order
31428c2ecf20Sopenharmony_ci */
31438c2ecf20Sopenharmony_civoid __memcg_kmem_uncharge_page(struct page *page, int order)
31448c2ecf20Sopenharmony_ci{
31458c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = page->mem_cgroup;
31468c2ecf20Sopenharmony_ci	unsigned int nr_pages = 1 << order;
31478c2ecf20Sopenharmony_ci
31488c2ecf20Sopenharmony_ci	if (!memcg)
31498c2ecf20Sopenharmony_ci		return;
31508c2ecf20Sopenharmony_ci
31518c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
31528c2ecf20Sopenharmony_ci	__memcg_kmem_uncharge(memcg, nr_pages);
31538c2ecf20Sopenharmony_ci	page->mem_cgroup = NULL;
31548c2ecf20Sopenharmony_ci	css_put(&memcg->css);
31558c2ecf20Sopenharmony_ci
31568c2ecf20Sopenharmony_ci	/* slab pages do not have PageKmemcg flag set */
31578c2ecf20Sopenharmony_ci	if (PageKmemcg(page))
31588c2ecf20Sopenharmony_ci		__ClearPageKmemcg(page);
31598c2ecf20Sopenharmony_ci}
31608c2ecf20Sopenharmony_ci
31618c2ecf20Sopenharmony_cistatic bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
31628c2ecf20Sopenharmony_ci{
31638c2ecf20Sopenharmony_ci	struct memcg_stock_pcp *stock;
31648c2ecf20Sopenharmony_ci	unsigned long flags;
31658c2ecf20Sopenharmony_ci	bool ret = false;
31668c2ecf20Sopenharmony_ci
31678c2ecf20Sopenharmony_ci	local_irq_save(flags);
31688c2ecf20Sopenharmony_ci
31698c2ecf20Sopenharmony_ci	stock = this_cpu_ptr(&memcg_stock);
31708c2ecf20Sopenharmony_ci	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
31718c2ecf20Sopenharmony_ci		stock->nr_bytes -= nr_bytes;
31728c2ecf20Sopenharmony_ci		ret = true;
31738c2ecf20Sopenharmony_ci	}
31748c2ecf20Sopenharmony_ci
31758c2ecf20Sopenharmony_ci	local_irq_restore(flags);
31768c2ecf20Sopenharmony_ci
31778c2ecf20Sopenharmony_ci	return ret;
31788c2ecf20Sopenharmony_ci}
31798c2ecf20Sopenharmony_ci
31808c2ecf20Sopenharmony_cistatic void drain_obj_stock(struct memcg_stock_pcp *stock)
31818c2ecf20Sopenharmony_ci{
31828c2ecf20Sopenharmony_ci	struct obj_cgroup *old = stock->cached_objcg;
31838c2ecf20Sopenharmony_ci
31848c2ecf20Sopenharmony_ci	if (!old)
31858c2ecf20Sopenharmony_ci		return;
31868c2ecf20Sopenharmony_ci
31878c2ecf20Sopenharmony_ci	if (stock->nr_bytes) {
31888c2ecf20Sopenharmony_ci		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
31898c2ecf20Sopenharmony_ci		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
31908c2ecf20Sopenharmony_ci
31918c2ecf20Sopenharmony_ci		if (nr_pages) {
31928c2ecf20Sopenharmony_ci			struct mem_cgroup *memcg;
31938c2ecf20Sopenharmony_ci
31948c2ecf20Sopenharmony_ci			rcu_read_lock();
31958c2ecf20Sopenharmony_ciretry:
31968c2ecf20Sopenharmony_ci			memcg = obj_cgroup_memcg(old);
31978c2ecf20Sopenharmony_ci			if (unlikely(!css_tryget(&memcg->css)))
31988c2ecf20Sopenharmony_ci				goto retry;
31998c2ecf20Sopenharmony_ci			rcu_read_unlock();
32008c2ecf20Sopenharmony_ci
32018c2ecf20Sopenharmony_ci			__memcg_kmem_uncharge(memcg, nr_pages);
32028c2ecf20Sopenharmony_ci			css_put(&memcg->css);
32038c2ecf20Sopenharmony_ci		}
32048c2ecf20Sopenharmony_ci
32058c2ecf20Sopenharmony_ci		/*
32068c2ecf20Sopenharmony_ci		 * The leftover is flushed to the centralized per-memcg value.
32078c2ecf20Sopenharmony_ci		 * On the next attempt to refill obj stock it will be moved
32088c2ecf20Sopenharmony_ci		 * to a per-cpu stock (probably on another CPU), see
32098c2ecf20Sopenharmony_ci		 * refill_obj_stock().
32108c2ecf20Sopenharmony_ci		 *
32118c2ecf20Sopenharmony_ci		 * How often it's flushed is a trade-off between the memory
32128c2ecf20Sopenharmony_ci		 * limit enforcement accuracy and potential CPU contention,
32138c2ecf20Sopenharmony_ci		 * so it might be changed in the future.
32148c2ecf20Sopenharmony_ci		 */
32158c2ecf20Sopenharmony_ci		atomic_add(nr_bytes, &old->nr_charged_bytes);
32168c2ecf20Sopenharmony_ci		stock->nr_bytes = 0;
32178c2ecf20Sopenharmony_ci	}
32188c2ecf20Sopenharmony_ci
32198c2ecf20Sopenharmony_ci	obj_cgroup_put(old);
32208c2ecf20Sopenharmony_ci	stock->cached_objcg = NULL;
32218c2ecf20Sopenharmony_ci}
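
/*
 * Example (assuming PAGE_SIZE == 4096): if a drained stock holds
 * nr_bytes = 2 * 4096 + 300, then nr_pages = 2 full pages are uncharged
 * via __memcg_kmem_uncharge() and the remaining 300 bytes are flushed to
 * old->nr_charged_bytes, where a later refill_obj_stock() can pick them
 * up again, possibly on another CPU.
 */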
32228c2ecf20Sopenharmony_ci
32238c2ecf20Sopenharmony_cistatic bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
32248c2ecf20Sopenharmony_ci				     struct mem_cgroup *root_memcg)
32258c2ecf20Sopenharmony_ci{
32268c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
32278c2ecf20Sopenharmony_ci
32288c2ecf20Sopenharmony_ci	if (stock->cached_objcg) {
32298c2ecf20Sopenharmony_ci		memcg = obj_cgroup_memcg(stock->cached_objcg);
32308c2ecf20Sopenharmony_ci		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
32318c2ecf20Sopenharmony_ci			return true;
32328c2ecf20Sopenharmony_ci	}
32338c2ecf20Sopenharmony_ci
32348c2ecf20Sopenharmony_ci	return false;
32358c2ecf20Sopenharmony_ci}
32368c2ecf20Sopenharmony_ci
32378c2ecf20Sopenharmony_cistatic void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
32388c2ecf20Sopenharmony_ci{
32398c2ecf20Sopenharmony_ci	struct memcg_stock_pcp *stock;
32408c2ecf20Sopenharmony_ci	unsigned long flags;
32418c2ecf20Sopenharmony_ci
32428c2ecf20Sopenharmony_ci	local_irq_save(flags);
32438c2ecf20Sopenharmony_ci
32448c2ecf20Sopenharmony_ci	stock = this_cpu_ptr(&memcg_stock);
32458c2ecf20Sopenharmony_ci	if (stock->cached_objcg != objcg) { /* reset if necessary */
32468c2ecf20Sopenharmony_ci		drain_obj_stock(stock);
32478c2ecf20Sopenharmony_ci		obj_cgroup_get(objcg);
32488c2ecf20Sopenharmony_ci		stock->cached_objcg = objcg;
32498c2ecf20Sopenharmony_ci		stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
32508c2ecf20Sopenharmony_ci	}
32518c2ecf20Sopenharmony_ci	stock->nr_bytes += nr_bytes;
32528c2ecf20Sopenharmony_ci
32538c2ecf20Sopenharmony_ci	if (stock->nr_bytes > PAGE_SIZE)
32548c2ecf20Sopenharmony_ci		drain_obj_stock(stock);
32558c2ecf20Sopenharmony_ci
32568c2ecf20Sopenharmony_ci	local_irq_restore(flags);
32578c2ecf20Sopenharmony_ci}
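
/*
 * Example scenario: two refill_obj_stock() calls for different objcgs on
 * the same CPU. The second call sees stock->cached_objcg != objcg, drains
 * whatever the first objcg had cached, takes a reference on the new objcg
 * and seeds the stock from objcg->nr_charged_bytes. Draining as soon as
 * more than PAGE_SIZE bytes accumulate keeps the per-CPU slack for the
 * cached objcg to roughly one page.
 */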
32588c2ecf20Sopenharmony_ci
32598c2ecf20Sopenharmony_ciint obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
32608c2ecf20Sopenharmony_ci{
32618c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
32628c2ecf20Sopenharmony_ci	unsigned int nr_pages, nr_bytes;
32638c2ecf20Sopenharmony_ci	int ret;
32648c2ecf20Sopenharmony_ci
32658c2ecf20Sopenharmony_ci	if (consume_obj_stock(objcg, size))
32668c2ecf20Sopenharmony_ci		return 0;
32678c2ecf20Sopenharmony_ci
32688c2ecf20Sopenharmony_ci	/*
32698c2ecf20Sopenharmony_ci	 * In theory, objcg->nr_charged_bytes can have enough
32708c2ecf20Sopenharmony_ci	 * pre-charged bytes to satisfy the allocation. However,
32718c2ecf20Sopenharmony_ci	 * flushing objcg->nr_charged_bytes requires two atomic
32728c2ecf20Sopenharmony_ci	 * operations, and objcg->nr_charged_bytes can't be big,
32738c2ecf20Sopenharmony_ci	 * so it's better to ignore it and try to grab some new pages.
32748c2ecf20Sopenharmony_ci	 * objcg->nr_charged_bytes will be flushed in
32758c2ecf20Sopenharmony_ci	 * refill_obj_stock(), called from this function or
32768c2ecf20Sopenharmony_ci	 * independently later.
32778c2ecf20Sopenharmony_ci	 */
32788c2ecf20Sopenharmony_ci	rcu_read_lock();
32798c2ecf20Sopenharmony_ciretry:
32808c2ecf20Sopenharmony_ci	memcg = obj_cgroup_memcg(objcg);
32818c2ecf20Sopenharmony_ci	if (unlikely(!css_tryget(&memcg->css)))
32828c2ecf20Sopenharmony_ci		goto retry;
32838c2ecf20Sopenharmony_ci	rcu_read_unlock();
32848c2ecf20Sopenharmony_ci
32858c2ecf20Sopenharmony_ci	nr_pages = size >> PAGE_SHIFT;
32868c2ecf20Sopenharmony_ci	nr_bytes = size & (PAGE_SIZE - 1);
32878c2ecf20Sopenharmony_ci
32888c2ecf20Sopenharmony_ci	if (nr_bytes)
32898c2ecf20Sopenharmony_ci		nr_pages += 1;
32908c2ecf20Sopenharmony_ci
32918c2ecf20Sopenharmony_ci	ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
32928c2ecf20Sopenharmony_ci	if (!ret && nr_bytes)
32938c2ecf20Sopenharmony_ci		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
32948c2ecf20Sopenharmony_ci
32958c2ecf20Sopenharmony_ci	css_put(&memcg->css);
32968c2ecf20Sopenharmony_ci	return ret;
32978c2ecf20Sopenharmony_ci}
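
/*
 * Example (assuming PAGE_SIZE == 4096): obj_cgroup_charge(objcg, gfp, 700)
 * that misses the per-CPU stock charges one full page to the memcg and
 * then calls refill_obj_stock(objcg, 4096 - 700), so subsequent sub-page
 * allocations totalling up to 3396 bytes can be served from the stock
 * without touching the page counters.
 */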
32988c2ecf20Sopenharmony_ci
32998c2ecf20Sopenharmony_civoid obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
33008c2ecf20Sopenharmony_ci{
33018c2ecf20Sopenharmony_ci	refill_obj_stock(objcg, size);
33028c2ecf20Sopenharmony_ci}
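
/*
 * Usage sketch (illustrative only; the 64-byte object size is an arbitrary
 * example, not taken from a real caller):
 *
 *	struct obj_cgroup *objcg = get_obj_cgroup_from_current();
 *
 *	if (objcg) {
 *		if (!obj_cgroup_charge(objcg, GFP_KERNEL, 64)) {
 *			... use the 64-byte object ...
 *			obj_cgroup_uncharge(objcg, 64);
 *		}
 *		obj_cgroup_put(objcg);
 *	}
 */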
33038c2ecf20Sopenharmony_ci
33048c2ecf20Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */
33058c2ecf20Sopenharmony_ci
33068c2ecf20Sopenharmony_ci/*
33078c2ecf20Sopenharmony_ci * Because the tail pages do not have mem_cgroup set, copy it from the head.
33088c2ecf20Sopenharmony_ci */
33098c2ecf20Sopenharmony_civoid split_page_memcg(struct page *head, unsigned int nr)
33108c2ecf20Sopenharmony_ci{
33118c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = head->mem_cgroup;
33128c2ecf20Sopenharmony_ci	int kmemcg = PageKmemcg(head);
33138c2ecf20Sopenharmony_ci	int i;
33148c2ecf20Sopenharmony_ci
33158c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled() || !memcg)
33168c2ecf20Sopenharmony_ci		return;
33178c2ecf20Sopenharmony_ci
33188c2ecf20Sopenharmony_ci	for (i = 1; i < nr; i++) {
33198c2ecf20Sopenharmony_ci		head[i].mem_cgroup = memcg;
33208c2ecf20Sopenharmony_ci		if (kmemcg)
33218c2ecf20Sopenharmony_ci			__SetPageKmemcg(head + i);
33228c2ecf20Sopenharmony_ci	}
33238c2ecf20Sopenharmony_ci	css_get_many(&memcg->css, nr - 1);
33248c2ecf20Sopenharmony_ci}
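
/*
 * Example: splitting a THP (nr == HPAGE_PMD_NR, i.e. 512 with 4K pages on
 * x86-64) copies head->mem_cgroup into the 511 tail pages and takes 511
 * extra css references, so each tail page can later be uncharged
 * independently of its former head.
 */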
33258c2ecf20Sopenharmony_ci
33268c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_SWAP
33278c2ecf20Sopenharmony_ci/**
33288c2ecf20Sopenharmony_ci * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
33298c2ecf20Sopenharmony_ci * @entry: swap entry to be moved
33308c2ecf20Sopenharmony_ci * @from:  mem_cgroup which the entry is moved from
33318c2ecf20Sopenharmony_ci * @to:  mem_cgroup which the entry is moved to
33328c2ecf20Sopenharmony_ci *
33338c2ecf20Sopenharmony_ci * It succeeds only when the swap_cgroup's record for this entry is the same
33348c2ecf20Sopenharmony_ci * as the mem_cgroup's id of @from.
33358c2ecf20Sopenharmony_ci *
33368c2ecf20Sopenharmony_ci * Returns 0 on success, -EINVAL on failure.
33378c2ecf20Sopenharmony_ci *
33388c2ecf20Sopenharmony_ci * The caller must have charged to @to, IOW, called page_counter_charge() on
33398c2ecf20Sopenharmony_ci * both res and memsw, and called css_get().
33408c2ecf20Sopenharmony_ci */
33418c2ecf20Sopenharmony_cistatic int mem_cgroup_move_swap_account(swp_entry_t entry,
33428c2ecf20Sopenharmony_ci				struct mem_cgroup *from, struct mem_cgroup *to)
33438c2ecf20Sopenharmony_ci{
33448c2ecf20Sopenharmony_ci	unsigned short old_id, new_id;
33458c2ecf20Sopenharmony_ci
33468c2ecf20Sopenharmony_ci	old_id = mem_cgroup_id(from);
33478c2ecf20Sopenharmony_ci	new_id = mem_cgroup_id(to);
33488c2ecf20Sopenharmony_ci
33498c2ecf20Sopenharmony_ci	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
33508c2ecf20Sopenharmony_ci		mod_memcg_state(from, MEMCG_SWAP, -1);
33518c2ecf20Sopenharmony_ci		mod_memcg_state(to, MEMCG_SWAP, 1);
33528c2ecf20Sopenharmony_ci		return 0;
33538c2ecf20Sopenharmony_ci	}
33548c2ecf20Sopenharmony_ci	return -EINVAL;
33558c2ecf20Sopenharmony_ci}
33568c2ecf20Sopenharmony_ci#else
33578c2ecf20Sopenharmony_cistatic inline int mem_cgroup_move_swap_account(swp_entry_t entry,
33588c2ecf20Sopenharmony_ci				struct mem_cgroup *from, struct mem_cgroup *to)
33598c2ecf20Sopenharmony_ci{
33608c2ecf20Sopenharmony_ci	return -EINVAL;
33618c2ecf20Sopenharmony_ci}
33628c2ecf20Sopenharmony_ci#endif
33638c2ecf20Sopenharmony_ci
33648c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(memcg_max_mutex);
33658c2ecf20Sopenharmony_ci
33668c2ecf20Sopenharmony_cistatic int mem_cgroup_resize_max(struct mem_cgroup *memcg,
33678c2ecf20Sopenharmony_ci				 unsigned long max, bool memsw)
33688c2ecf20Sopenharmony_ci{
33698c2ecf20Sopenharmony_ci	bool enlarge = false;
33708c2ecf20Sopenharmony_ci	bool drained = false;
33718c2ecf20Sopenharmony_ci	int ret;
33728c2ecf20Sopenharmony_ci	bool limits_invariant;
33738c2ecf20Sopenharmony_ci	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
33748c2ecf20Sopenharmony_ci
33758c2ecf20Sopenharmony_ci	do {
33768c2ecf20Sopenharmony_ci		if (signal_pending(current)) {
33778c2ecf20Sopenharmony_ci			ret = -EINTR;
33788c2ecf20Sopenharmony_ci			break;
33798c2ecf20Sopenharmony_ci		}
33808c2ecf20Sopenharmony_ci
33818c2ecf20Sopenharmony_ci		mutex_lock(&memcg_max_mutex);
33828c2ecf20Sopenharmony_ci		/*
33838c2ecf20Sopenharmony_ci		 * Make sure that the new limit (memsw or memory limit) doesn't
33848c2ecf20Sopenharmony_ci		 * break our basic invariant rule memory.max <= memsw.max.
33858c2ecf20Sopenharmony_ci		 */
33868c2ecf20Sopenharmony_ci		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
33878c2ecf20Sopenharmony_ci					   max <= memcg->memsw.max;
33888c2ecf20Sopenharmony_ci		if (!limits_invariant) {
33898c2ecf20Sopenharmony_ci			mutex_unlock(&memcg_max_mutex);
33908c2ecf20Sopenharmony_ci			ret = -EINVAL;
33918c2ecf20Sopenharmony_ci			break;
33928c2ecf20Sopenharmony_ci		}
33938c2ecf20Sopenharmony_ci		if (max > counter->max)
33948c2ecf20Sopenharmony_ci			enlarge = true;
33958c2ecf20Sopenharmony_ci		ret = page_counter_set_max(counter, max);
33968c2ecf20Sopenharmony_ci		mutex_unlock(&memcg_max_mutex);
33978c2ecf20Sopenharmony_ci
33988c2ecf20Sopenharmony_ci		if (!ret)
33998c2ecf20Sopenharmony_ci			break;
34008c2ecf20Sopenharmony_ci
34018c2ecf20Sopenharmony_ci		if (!drained) {
34028c2ecf20Sopenharmony_ci			drain_all_stock(memcg);
34038c2ecf20Sopenharmony_ci			drained = true;
34048c2ecf20Sopenharmony_ci			continue;
34058c2ecf20Sopenharmony_ci		}
34068c2ecf20Sopenharmony_ci
34078c2ecf20Sopenharmony_ci		if (!try_to_free_mem_cgroup_pages(memcg, 1,
34088c2ecf20Sopenharmony_ci					GFP_KERNEL, !memsw)) {
34098c2ecf20Sopenharmony_ci			ret = -EBUSY;
34108c2ecf20Sopenharmony_ci			break;
34118c2ecf20Sopenharmony_ci		}
34128c2ecf20Sopenharmony_ci	} while (true);
34138c2ecf20Sopenharmony_ci
34148c2ecf20Sopenharmony_ci	if (!ret && enlarge)
34158c2ecf20Sopenharmony_ci		memcg_oom_recover(memcg);
34168c2ecf20Sopenharmony_ci
34178c2ecf20Sopenharmony_ci	return ret;
34188c2ecf20Sopenharmony_ci}
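
/*
 * Example of the invariant above: with memory.max at 100 pages and
 * memsw.max at 120 pages, lowering memsw.max to 80 pages is rejected with
 * -EINVAL because it would drop below memory.max, while lowering
 * memory.max to 80 pages is accepted and may loop through
 * drain_all_stock() and try_to_free_mem_cgroup_pages() until usage fits
 * under the new limit (or -EBUSY/-EINTR is returned).
 */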
34198c2ecf20Sopenharmony_ci
34208c2ecf20Sopenharmony_ciunsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
34218c2ecf20Sopenharmony_ci					    gfp_t gfp_mask,
34228c2ecf20Sopenharmony_ci					    unsigned long *total_scanned)
34238c2ecf20Sopenharmony_ci{
34248c2ecf20Sopenharmony_ci	unsigned long nr_reclaimed = 0;
34258c2ecf20Sopenharmony_ci	struct mem_cgroup_per_node *mz, *next_mz = NULL;
34268c2ecf20Sopenharmony_ci	unsigned long reclaimed;
34278c2ecf20Sopenharmony_ci	int loop = 0;
34288c2ecf20Sopenharmony_ci	struct mem_cgroup_tree_per_node *mctz;
34298c2ecf20Sopenharmony_ci	unsigned long excess;
34308c2ecf20Sopenharmony_ci	unsigned long nr_scanned;
34318c2ecf20Sopenharmony_ci
34328c2ecf20Sopenharmony_ci	if (order > 0)
34338c2ecf20Sopenharmony_ci		return 0;
34348c2ecf20Sopenharmony_ci
34358c2ecf20Sopenharmony_ci	mctz = soft_limit_tree_node(pgdat->node_id);
34368c2ecf20Sopenharmony_ci
34378c2ecf20Sopenharmony_ci	/*
34388c2ecf20Sopenharmony_ci	 * Do not even bother to check the largest node if the root
34398c2ecf20Sopenharmony_ci	 * is empty. Do it lockless to prevent lock bouncing. Races
34408c2ecf20Sopenharmony_ci	 * are acceptable as soft limit is best effort anyway.
34418c2ecf20Sopenharmony_ci	 */
34428c2ecf20Sopenharmony_ci	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
34438c2ecf20Sopenharmony_ci		return 0;
34448c2ecf20Sopenharmony_ci
34458c2ecf20Sopenharmony_ci	/*
34468c2ecf20Sopenharmony_ci	 * This loop can run for a while, especially if mem_cgroups continuously
34478c2ecf20Sopenharmony_ci	 * keep exceeding their soft limit and putting the system under
34488c2ecf20Sopenharmony_ci	 * pressure.
34498c2ecf20Sopenharmony_ci	 */
34508c2ecf20Sopenharmony_ci	do {
34518c2ecf20Sopenharmony_ci		if (next_mz)
34528c2ecf20Sopenharmony_ci			mz = next_mz;
34538c2ecf20Sopenharmony_ci		else
34548c2ecf20Sopenharmony_ci			mz = mem_cgroup_largest_soft_limit_node(mctz);
34558c2ecf20Sopenharmony_ci		if (!mz)
34568c2ecf20Sopenharmony_ci			break;
34578c2ecf20Sopenharmony_ci
34588c2ecf20Sopenharmony_ci		nr_scanned = 0;
34598c2ecf20Sopenharmony_ci		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
34608c2ecf20Sopenharmony_ci						    gfp_mask, &nr_scanned);
34618c2ecf20Sopenharmony_ci		nr_reclaimed += reclaimed;
34628c2ecf20Sopenharmony_ci		*total_scanned += nr_scanned;
34638c2ecf20Sopenharmony_ci		spin_lock_irq(&mctz->lock);
34648c2ecf20Sopenharmony_ci		__mem_cgroup_remove_exceeded(mz, mctz);
34658c2ecf20Sopenharmony_ci
34668c2ecf20Sopenharmony_ci		/*
34678c2ecf20Sopenharmony_ci		 * If we failed to reclaim anything from this memory cgroup
34688c2ecf20Sopenharmony_ci		 * it is time to move on to the next cgroup
34698c2ecf20Sopenharmony_ci		 */
34708c2ecf20Sopenharmony_ci		next_mz = NULL;
34718c2ecf20Sopenharmony_ci		if (!reclaimed)
34728c2ecf20Sopenharmony_ci			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
34738c2ecf20Sopenharmony_ci
34748c2ecf20Sopenharmony_ci		excess = soft_limit_excess(mz->memcg);
34758c2ecf20Sopenharmony_ci		/*
34768c2ecf20Sopenharmony_ci		 * One school of thought says that we should not add
34778c2ecf20Sopenharmony_ci		 * back the node to the tree if reclaim returns 0.
34788c2ecf20Sopenharmony_ci		 * But our reclaim could return 0 simply because, due
34798c2ecf20Sopenharmony_ci		 * to priority, we are exposing a smaller subset of
34808c2ecf20Sopenharmony_ci		 * memory to reclaim from. Consider this as a longer
34818c2ecf20Sopenharmony_ci		 * term TODO.
34828c2ecf20Sopenharmony_ci		 */
34838c2ecf20Sopenharmony_ci		/* If excess == 0, no tree ops */
34848c2ecf20Sopenharmony_ci		__mem_cgroup_insert_exceeded(mz, mctz, excess);
34858c2ecf20Sopenharmony_ci		spin_unlock_irq(&mctz->lock);
34868c2ecf20Sopenharmony_ci		css_put(&mz->memcg->css);
34878c2ecf20Sopenharmony_ci		loop++;
34888c2ecf20Sopenharmony_ci		/*
34898c2ecf20Sopenharmony_ci		 * Could not reclaim anything and there are no more
34908c2ecf20Sopenharmony_ci		 * mem cgroups to try or we seem to be looping without
34918c2ecf20Sopenharmony_ci		 * reclaiming anything.
34928c2ecf20Sopenharmony_ci		 */
34938c2ecf20Sopenharmony_ci		if (!nr_reclaimed &&
34948c2ecf20Sopenharmony_ci			(next_mz == NULL ||
34958c2ecf20Sopenharmony_ci			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
34968c2ecf20Sopenharmony_ci			break;
34978c2ecf20Sopenharmony_ci	} while (!nr_reclaimed);
34988c2ecf20Sopenharmony_ci	if (next_mz)
34998c2ecf20Sopenharmony_ci		css_put(&next_mz->memcg->css);
35008c2ecf20Sopenharmony_ci	return nr_reclaimed;
35018c2ecf20Sopenharmony_ci}
35028c2ecf20Sopenharmony_ci
35038c2ecf20Sopenharmony_ci/*
35048c2ecf20Sopenharmony_ci * Test whether @memcg has children, dead or alive.  Note that this
35058c2ecf20Sopenharmony_ci * function doesn't care whether @memcg has use_hierarchy enabled and
35068c2ecf20Sopenharmony_ci * returns %true if there are child csses according to the cgroup
35078c2ecf20Sopenharmony_ci * hierarchy.  Testing use_hierarchy is the caller's responsibility.
35088c2ecf20Sopenharmony_ci */
35098c2ecf20Sopenharmony_cistatic inline bool memcg_has_children(struct mem_cgroup *memcg)
35108c2ecf20Sopenharmony_ci{
35118c2ecf20Sopenharmony_ci	bool ret;
35128c2ecf20Sopenharmony_ci
35138c2ecf20Sopenharmony_ci	rcu_read_lock();
35148c2ecf20Sopenharmony_ci	ret = css_next_child(NULL, &memcg->css);
35158c2ecf20Sopenharmony_ci	rcu_read_unlock();
35168c2ecf20Sopenharmony_ci	return ret;
35178c2ecf20Sopenharmony_ci}
35188c2ecf20Sopenharmony_ci
35198c2ecf20Sopenharmony_ci/*
35208c2ecf20Sopenharmony_ci * Reclaims as many pages from the given memcg as possible.
35218c2ecf20Sopenharmony_ci *
35228c2ecf20Sopenharmony_ci * Caller is responsible for holding css reference for memcg.
35238c2ecf20Sopenharmony_ci */
35248c2ecf20Sopenharmony_cistatic int mem_cgroup_force_empty(struct mem_cgroup *memcg)
35258c2ecf20Sopenharmony_ci{
35268c2ecf20Sopenharmony_ci	int nr_retries = MAX_RECLAIM_RETRIES;
35278c2ecf20Sopenharmony_ci
35288c2ecf20Sopenharmony_ci	/* we call try-to-free pages to make this cgroup empty */
35298c2ecf20Sopenharmony_ci	lru_add_drain_all();
35308c2ecf20Sopenharmony_ci
35318c2ecf20Sopenharmony_ci	drain_all_stock(memcg);
35328c2ecf20Sopenharmony_ci
35338c2ecf20Sopenharmony_ci	/* try to free all pages in this cgroup */
35348c2ecf20Sopenharmony_ci	while (nr_retries && page_counter_read(&memcg->memory)) {
35358c2ecf20Sopenharmony_ci		int progress;
35368c2ecf20Sopenharmony_ci
35378c2ecf20Sopenharmony_ci		if (signal_pending(current))
35388c2ecf20Sopenharmony_ci			return -EINTR;
35398c2ecf20Sopenharmony_ci
35408c2ecf20Sopenharmony_ci		progress = try_to_free_mem_cgroup_pages(memcg, 1,
35418c2ecf20Sopenharmony_ci							GFP_KERNEL, true);
35428c2ecf20Sopenharmony_ci		if (!progress) {
35438c2ecf20Sopenharmony_ci			nr_retries--;
35448c2ecf20Sopenharmony_ci			/* maybe some writeback is necessary */
35458c2ecf20Sopenharmony_ci			congestion_wait(BLK_RW_ASYNC, HZ/10);
35468c2ecf20Sopenharmony_ci		}
35478c2ecf20Sopenharmony_ci
35488c2ecf20Sopenharmony_ci	}
35498c2ecf20Sopenharmony_ci
35508c2ecf20Sopenharmony_ci	return 0;
35518c2ecf20Sopenharmony_ci}
35528c2ecf20Sopenharmony_ci
35538c2ecf20Sopenharmony_cistatic ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
35548c2ecf20Sopenharmony_ci					    char *buf, size_t nbytes,
35558c2ecf20Sopenharmony_ci					    loff_t off)
35568c2ecf20Sopenharmony_ci{
35578c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
35588c2ecf20Sopenharmony_ci
35598c2ecf20Sopenharmony_ci	if (mem_cgroup_is_root(memcg))
35608c2ecf20Sopenharmony_ci		return -EINVAL;
35618c2ecf20Sopenharmony_ci	return mem_cgroup_force_empty(memcg) ?: nbytes;
35628c2ecf20Sopenharmony_ci}
35638c2ecf20Sopenharmony_ci
35648c2ecf20Sopenharmony_cistatic u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
35658c2ecf20Sopenharmony_ci				     struct cftype *cft)
35668c2ecf20Sopenharmony_ci{
35678c2ecf20Sopenharmony_ci	return mem_cgroup_from_css(css)->use_hierarchy;
35688c2ecf20Sopenharmony_ci}
35698c2ecf20Sopenharmony_ci
35708c2ecf20Sopenharmony_cistatic int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
35718c2ecf20Sopenharmony_ci				      struct cftype *cft, u64 val)
35728c2ecf20Sopenharmony_ci{
35738c2ecf20Sopenharmony_ci	int retval = 0;
35748c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
35758c2ecf20Sopenharmony_ci	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
35768c2ecf20Sopenharmony_ci
35778c2ecf20Sopenharmony_ci	if (memcg->use_hierarchy == val)
35788c2ecf20Sopenharmony_ci		return 0;
35798c2ecf20Sopenharmony_ci
35808c2ecf20Sopenharmony_ci	/*
35818c2ecf20Sopenharmony_ci	 * If parent's use_hierarchy is set, we can't make any modifications
35828c2ecf20Sopenharmony_ci	 * in the child subtrees. If it is unset, then the change can
35838c2ecf20Sopenharmony_ci	 * occur, provided the current cgroup has no children.
35848c2ecf20Sopenharmony_ci	 *
35858c2ecf20Sopenharmony_ci	 * For the root cgroup, parent_memcg is NULL, so we allow the value to
35868c2ecf20Sopenharmony_ci	 * be set if there are no children.
35878c2ecf20Sopenharmony_ci	 */
35888c2ecf20Sopenharmony_ci	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
35898c2ecf20Sopenharmony_ci				(val == 1 || val == 0)) {
35908c2ecf20Sopenharmony_ci		if (!memcg_has_children(memcg))
35918c2ecf20Sopenharmony_ci			memcg->use_hierarchy = val;
35928c2ecf20Sopenharmony_ci		else
35938c2ecf20Sopenharmony_ci			retval = -EBUSY;
35948c2ecf20Sopenharmony_ci	} else
35958c2ecf20Sopenharmony_ci		retval = -EINVAL;
35968c2ecf20Sopenharmony_ci
35978c2ecf20Sopenharmony_ci	return retval;
35988c2ecf20Sopenharmony_ci}
35998c2ecf20Sopenharmony_ci
36008c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
36018c2ecf20Sopenharmony_ci{
36028c2ecf20Sopenharmony_ci	unsigned long val;
36038c2ecf20Sopenharmony_ci
36048c2ecf20Sopenharmony_ci	if (mem_cgroup_is_root(memcg)) {
36058c2ecf20Sopenharmony_ci		val = memcg_page_state(memcg, NR_FILE_PAGES) +
36068c2ecf20Sopenharmony_ci			memcg_page_state(memcg, NR_ANON_MAPPED);
36078c2ecf20Sopenharmony_ci		if (swap)
36088c2ecf20Sopenharmony_ci			val += memcg_page_state(memcg, MEMCG_SWAP);
36098c2ecf20Sopenharmony_ci	} else {
36108c2ecf20Sopenharmony_ci		if (!swap)
36118c2ecf20Sopenharmony_ci			val = page_counter_read(&memcg->memory);
36128c2ecf20Sopenharmony_ci		else
36138c2ecf20Sopenharmony_ci			val = page_counter_read(&memcg->memsw);
36148c2ecf20Sopenharmony_ci	}
36158c2ecf20Sopenharmony_ci	return val;
36168c2ecf20Sopenharmony_ci}
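
/*
 * Example: for the root cgroup, which is not charged to the page counters
 * the way its children are, usage is approximated from the vmstat counters
 * as NR_FILE_PAGES + NR_ANON_MAPPED (plus MEMCG_SWAP when @swap is true);
 * for any other cgroup it is simply the current value of the memory (or
 * memsw) page counter.
 */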
36178c2ecf20Sopenharmony_ci
36188c2ecf20Sopenharmony_cienum {
36198c2ecf20Sopenharmony_ci	RES_USAGE,
36208c2ecf20Sopenharmony_ci	RES_LIMIT,
36218c2ecf20Sopenharmony_ci	RES_MAX_USAGE,
36228c2ecf20Sopenharmony_ci	RES_FAILCNT,
36238c2ecf20Sopenharmony_ci	RES_SOFT_LIMIT,
36248c2ecf20Sopenharmony_ci};
36258c2ecf20Sopenharmony_ci
36268c2ecf20Sopenharmony_cistatic u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
36278c2ecf20Sopenharmony_ci			       struct cftype *cft)
36288c2ecf20Sopenharmony_ci{
36298c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
36308c2ecf20Sopenharmony_ci	struct page_counter *counter;
36318c2ecf20Sopenharmony_ci
36328c2ecf20Sopenharmony_ci	switch (MEMFILE_TYPE(cft->private)) {
36338c2ecf20Sopenharmony_ci	case _MEM:
36348c2ecf20Sopenharmony_ci		counter = &memcg->memory;
36358c2ecf20Sopenharmony_ci		break;
36368c2ecf20Sopenharmony_ci	case _MEMSWAP:
36378c2ecf20Sopenharmony_ci		counter = &memcg->memsw;
36388c2ecf20Sopenharmony_ci		break;
36398c2ecf20Sopenharmony_ci	case _KMEM:
36408c2ecf20Sopenharmony_ci		counter = &memcg->kmem;
36418c2ecf20Sopenharmony_ci		break;
36428c2ecf20Sopenharmony_ci	case _TCP:
36438c2ecf20Sopenharmony_ci		counter = &memcg->tcpmem;
36448c2ecf20Sopenharmony_ci		break;
36458c2ecf20Sopenharmony_ci	default:
36468c2ecf20Sopenharmony_ci		BUG();
36478c2ecf20Sopenharmony_ci	}
36488c2ecf20Sopenharmony_ci
36498c2ecf20Sopenharmony_ci	switch (MEMFILE_ATTR(cft->private)) {
36508c2ecf20Sopenharmony_ci	case RES_USAGE:
36518c2ecf20Sopenharmony_ci		if (counter == &memcg->memory)
36528c2ecf20Sopenharmony_ci			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
36538c2ecf20Sopenharmony_ci		if (counter == &memcg->memsw)
36548c2ecf20Sopenharmony_ci			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
36558c2ecf20Sopenharmony_ci		return (u64)page_counter_read(counter) * PAGE_SIZE;
36568c2ecf20Sopenharmony_ci	case RES_LIMIT:
36578c2ecf20Sopenharmony_ci		return (u64)counter->max * PAGE_SIZE;
36588c2ecf20Sopenharmony_ci	case RES_MAX_USAGE:
36598c2ecf20Sopenharmony_ci		return (u64)counter->watermark * PAGE_SIZE;
36608c2ecf20Sopenharmony_ci	case RES_FAILCNT:
36618c2ecf20Sopenharmony_ci		return counter->failcnt;
36628c2ecf20Sopenharmony_ci	case RES_SOFT_LIMIT:
36638c2ecf20Sopenharmony_ci		return (u64)memcg->soft_limit * PAGE_SIZE;
36648c2ecf20Sopenharmony_ci	default:
36658c2ecf20Sopenharmony_ci		BUG();
36668c2ecf20Sopenharmony_ci	}
36678c2ecf20Sopenharmony_ci}
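
/*
 * Example: all values reported here are in bytes. A cgroup whose memory
 * counter reads 25 pages reports 25 * PAGE_SIZE (102400 with 4K pages)
 * for RES_USAGE, and a soft limit of 1000 pages is reported as 4096000
 * bytes for RES_SOFT_LIMIT. Only RES_FAILCNT is a plain event count.
 */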
36688c2ecf20Sopenharmony_ci
36698c2ecf20Sopenharmony_cistatic void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
36708c2ecf20Sopenharmony_ci{
36718c2ecf20Sopenharmony_ci	unsigned long stat[MEMCG_NR_STAT] = {0};
36728c2ecf20Sopenharmony_ci	struct mem_cgroup *mi;
36738c2ecf20Sopenharmony_ci	int node, cpu, i;
36748c2ecf20Sopenharmony_ci
36758c2ecf20Sopenharmony_ci	for_each_online_cpu(cpu)
36768c2ecf20Sopenharmony_ci		for (i = 0; i < MEMCG_NR_STAT; i++)
36778c2ecf20Sopenharmony_ci			stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
36788c2ecf20Sopenharmony_ci
36798c2ecf20Sopenharmony_ci	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
36808c2ecf20Sopenharmony_ci		for (i = 0; i < MEMCG_NR_STAT; i++)
36818c2ecf20Sopenharmony_ci			atomic_long_add(stat[i], &mi->vmstats[i]);
36828c2ecf20Sopenharmony_ci
36838c2ecf20Sopenharmony_ci	for_each_node(node) {
36848c2ecf20Sopenharmony_ci		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
36858c2ecf20Sopenharmony_ci		struct mem_cgroup_per_node *pi;
36868c2ecf20Sopenharmony_ci
36878c2ecf20Sopenharmony_ci		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
36888c2ecf20Sopenharmony_ci			stat[i] = 0;
36898c2ecf20Sopenharmony_ci
36908c2ecf20Sopenharmony_ci		for_each_online_cpu(cpu)
36918c2ecf20Sopenharmony_ci			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
36928c2ecf20Sopenharmony_ci				stat[i] += per_cpu(
36938c2ecf20Sopenharmony_ci					pn->lruvec_stat_cpu->count[i], cpu);
36948c2ecf20Sopenharmony_ci
36958c2ecf20Sopenharmony_ci		for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
36968c2ecf20Sopenharmony_ci			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
36978c2ecf20Sopenharmony_ci				atomic_long_add(stat[i], &pi->lruvec_stat[i]);
36988c2ecf20Sopenharmony_ci	}
36998c2ecf20Sopenharmony_ci}
37008c2ecf20Sopenharmony_ci
37018c2ecf20Sopenharmony_cistatic void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
37028c2ecf20Sopenharmony_ci{
37038c2ecf20Sopenharmony_ci	unsigned long events[NR_VM_EVENT_ITEMS];
37048c2ecf20Sopenharmony_ci	struct mem_cgroup *mi;
37058c2ecf20Sopenharmony_ci	int cpu, i;
37068c2ecf20Sopenharmony_ci
37078c2ecf20Sopenharmony_ci	for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
37088c2ecf20Sopenharmony_ci		events[i] = 0;
37098c2ecf20Sopenharmony_ci
37108c2ecf20Sopenharmony_ci	for_each_online_cpu(cpu)
37118c2ecf20Sopenharmony_ci		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
37128c2ecf20Sopenharmony_ci			events[i] += per_cpu(memcg->vmstats_percpu->events[i],
37138c2ecf20Sopenharmony_ci					     cpu);
37148c2ecf20Sopenharmony_ci
37158c2ecf20Sopenharmony_ci	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
37168c2ecf20Sopenharmony_ci		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
37178c2ecf20Sopenharmony_ci			atomic_long_add(events[i], &mi->vmevents[i]);
37188c2ecf20Sopenharmony_ci}
37198c2ecf20Sopenharmony_ci
37208c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
37218c2ecf20Sopenharmony_cistatic int memcg_online_kmem(struct mem_cgroup *memcg)
37228c2ecf20Sopenharmony_ci{
37238c2ecf20Sopenharmony_ci	struct obj_cgroup *objcg;
37248c2ecf20Sopenharmony_ci	int memcg_id;
37258c2ecf20Sopenharmony_ci
37268c2ecf20Sopenharmony_ci	if (cgroup_memory_nokmem)
37278c2ecf20Sopenharmony_ci		return 0;
37288c2ecf20Sopenharmony_ci
37298c2ecf20Sopenharmony_ci	BUG_ON(memcg->kmemcg_id >= 0);
37308c2ecf20Sopenharmony_ci	BUG_ON(memcg->kmem_state);
37318c2ecf20Sopenharmony_ci
37328c2ecf20Sopenharmony_ci	memcg_id = memcg_alloc_cache_id();
37338c2ecf20Sopenharmony_ci	if (memcg_id < 0)
37348c2ecf20Sopenharmony_ci		return memcg_id;
37358c2ecf20Sopenharmony_ci
37368c2ecf20Sopenharmony_ci	objcg = obj_cgroup_alloc();
37378c2ecf20Sopenharmony_ci	if (!objcg) {
37388c2ecf20Sopenharmony_ci		memcg_free_cache_id(memcg_id);
37398c2ecf20Sopenharmony_ci		return -ENOMEM;
37408c2ecf20Sopenharmony_ci	}
37418c2ecf20Sopenharmony_ci	objcg->memcg = memcg;
37428c2ecf20Sopenharmony_ci	rcu_assign_pointer(memcg->objcg, objcg);
37438c2ecf20Sopenharmony_ci
37448c2ecf20Sopenharmony_ci	static_branch_enable(&memcg_kmem_enabled_key);
37458c2ecf20Sopenharmony_ci
37468c2ecf20Sopenharmony_ci	/*
37478c2ecf20Sopenharmony_ci	 * A memory cgroup is considered kmem-online as soon as it gets
37488c2ecf20Sopenharmony_ci	 * kmemcg_id. Setting the id after enabling static branching will
37498c2ecf20Sopenharmony_ci	 * guarantee no one starts accounting before all call sites are
37508c2ecf20Sopenharmony_ci	 * patched.
37518c2ecf20Sopenharmony_ci	 */
37528c2ecf20Sopenharmony_ci	memcg->kmemcg_id = memcg_id;
37538c2ecf20Sopenharmony_ci	memcg->kmem_state = KMEM_ONLINE;
37548c2ecf20Sopenharmony_ci
37558c2ecf20Sopenharmony_ci	return 0;
37568c2ecf20Sopenharmony_ci}
37578c2ecf20Sopenharmony_ci
37588c2ecf20Sopenharmony_cistatic void memcg_offline_kmem(struct mem_cgroup *memcg)
37598c2ecf20Sopenharmony_ci{
37608c2ecf20Sopenharmony_ci	struct cgroup_subsys_state *css;
37618c2ecf20Sopenharmony_ci	struct mem_cgroup *parent, *child;
37628c2ecf20Sopenharmony_ci	int kmemcg_id;
37638c2ecf20Sopenharmony_ci
37648c2ecf20Sopenharmony_ci	if (memcg->kmem_state != KMEM_ONLINE)
37658c2ecf20Sopenharmony_ci		return;
37668c2ecf20Sopenharmony_ci
37678c2ecf20Sopenharmony_ci	memcg->kmem_state = KMEM_ALLOCATED;
37688c2ecf20Sopenharmony_ci
37698c2ecf20Sopenharmony_ci	parent = parent_mem_cgroup(memcg);
37708c2ecf20Sopenharmony_ci	if (!parent)
37718c2ecf20Sopenharmony_ci		parent = root_mem_cgroup;
37728c2ecf20Sopenharmony_ci
37738c2ecf20Sopenharmony_ci	memcg_reparent_objcgs(memcg, parent);
37748c2ecf20Sopenharmony_ci
37758c2ecf20Sopenharmony_ci	kmemcg_id = memcg->kmemcg_id;
37768c2ecf20Sopenharmony_ci	BUG_ON(kmemcg_id < 0);
37778c2ecf20Sopenharmony_ci
37788c2ecf20Sopenharmony_ci	/*
37798c2ecf20Sopenharmony_ci	 * Change kmemcg_id of this cgroup and all its descendants to the
37808c2ecf20Sopenharmony_ci	 * parent's id, and then move all entries from this cgroup's list_lrus
37818c2ecf20Sopenharmony_ci	 * to ones of the parent. After we have finished, all list_lrus
37828c2ecf20Sopenharmony_ci	 * corresponding to this cgroup are guaranteed to remain empty. The
37838c2ecf20Sopenharmony_ci	 * ordering is imposed by list_lru_node->lock taken by
37848c2ecf20Sopenharmony_ci	 * memcg_drain_all_list_lrus().
37858c2ecf20Sopenharmony_ci	 */
37868c2ecf20Sopenharmony_ci	rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
37878c2ecf20Sopenharmony_ci	css_for_each_descendant_pre(css, &memcg->css) {
37888c2ecf20Sopenharmony_ci		child = mem_cgroup_from_css(css);
37898c2ecf20Sopenharmony_ci		BUG_ON(child->kmemcg_id != kmemcg_id);
37908c2ecf20Sopenharmony_ci		child->kmemcg_id = parent->kmemcg_id;
37918c2ecf20Sopenharmony_ci		if (!memcg->use_hierarchy)
37928c2ecf20Sopenharmony_ci			break;
37938c2ecf20Sopenharmony_ci	}
37948c2ecf20Sopenharmony_ci	rcu_read_unlock();
37958c2ecf20Sopenharmony_ci
37968c2ecf20Sopenharmony_ci	memcg_drain_all_list_lrus(kmemcg_id, parent);
37978c2ecf20Sopenharmony_ci
37988c2ecf20Sopenharmony_ci	memcg_free_cache_id(kmemcg_id);
37998c2ecf20Sopenharmony_ci}
38008c2ecf20Sopenharmony_ci
38018c2ecf20Sopenharmony_cistatic void memcg_free_kmem(struct mem_cgroup *memcg)
38028c2ecf20Sopenharmony_ci{
38038c2ecf20Sopenharmony_ci	/* css_alloc() failed, offlining didn't happen */
38048c2ecf20Sopenharmony_ci	if (unlikely(memcg->kmem_state == KMEM_ONLINE))
38058c2ecf20Sopenharmony_ci		memcg_offline_kmem(memcg);
38068c2ecf20Sopenharmony_ci}
38078c2ecf20Sopenharmony_ci#else
38088c2ecf20Sopenharmony_cistatic int memcg_online_kmem(struct mem_cgroup *memcg)
38098c2ecf20Sopenharmony_ci{
38108c2ecf20Sopenharmony_ci	return 0;
38118c2ecf20Sopenharmony_ci}
38128c2ecf20Sopenharmony_cistatic void memcg_offline_kmem(struct mem_cgroup *memcg)
38138c2ecf20Sopenharmony_ci{
38148c2ecf20Sopenharmony_ci}
38158c2ecf20Sopenharmony_cistatic void memcg_free_kmem(struct mem_cgroup *memcg)
38168c2ecf20Sopenharmony_ci{
38178c2ecf20Sopenharmony_ci}
38188c2ecf20Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */
38198c2ecf20Sopenharmony_ci
38208c2ecf20Sopenharmony_cistatic int memcg_update_kmem_max(struct mem_cgroup *memcg,
38218c2ecf20Sopenharmony_ci				 unsigned long max)
38228c2ecf20Sopenharmony_ci{
38238c2ecf20Sopenharmony_ci	int ret;
38248c2ecf20Sopenharmony_ci
38258c2ecf20Sopenharmony_ci	mutex_lock(&memcg_max_mutex);
38268c2ecf20Sopenharmony_ci	ret = page_counter_set_max(&memcg->kmem, max);
38278c2ecf20Sopenharmony_ci	mutex_unlock(&memcg_max_mutex);
38288c2ecf20Sopenharmony_ci	return ret;
38298c2ecf20Sopenharmony_ci}
38308c2ecf20Sopenharmony_ci
38318c2ecf20Sopenharmony_cistatic int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
38328c2ecf20Sopenharmony_ci{
38338c2ecf20Sopenharmony_ci	int ret;
38348c2ecf20Sopenharmony_ci
38358c2ecf20Sopenharmony_ci	mutex_lock(&memcg_max_mutex);
38368c2ecf20Sopenharmony_ci
38378c2ecf20Sopenharmony_ci	ret = page_counter_set_max(&memcg->tcpmem, max);
38388c2ecf20Sopenharmony_ci	if (ret)
38398c2ecf20Sopenharmony_ci		goto out;
38408c2ecf20Sopenharmony_ci
38418c2ecf20Sopenharmony_ci	if (!memcg->tcpmem_active) {
38428c2ecf20Sopenharmony_ci		/*
38438c2ecf20Sopenharmony_ci		 * The active flag needs to be written after the static_key
38448c2ecf20Sopenharmony_ci		 * update. This is what guarantees that the socket activation
38458c2ecf20Sopenharmony_ci		 * function is the last one to run. See mem_cgroup_sk_alloc()
38468c2ecf20Sopenharmony_ci		 * for details, and note that we don't mark any socket as
38478c2ecf20Sopenharmony_ci		 * belonging to this memcg until that flag is up.
38488c2ecf20Sopenharmony_ci		 *
38498c2ecf20Sopenharmony_ci		 * We need to do this, because static_keys will span multiple
38508c2ecf20Sopenharmony_ci		 * sites, but we can't control their order. If we mark a socket
38518c2ecf20Sopenharmony_ci		 * as accounted, but the accounting functions are not patched in
38528c2ecf20Sopenharmony_ci		 * yet, we'll lose accounting.
38538c2ecf20Sopenharmony_ci		 *
38548c2ecf20Sopenharmony_ci		 * We never race with the readers in mem_cgroup_sk_alloc(),
38558c2ecf20Sopenharmony_ci		 * because when this value changes, the code to process it is not
38568c2ecf20Sopenharmony_ci		 * patched in yet.
38578c2ecf20Sopenharmony_ci		 */
38588c2ecf20Sopenharmony_ci		static_branch_inc(&memcg_sockets_enabled_key);
38598c2ecf20Sopenharmony_ci		memcg->tcpmem_active = true;
38608c2ecf20Sopenharmony_ci	}
38618c2ecf20Sopenharmony_ciout:
38628c2ecf20Sopenharmony_ci	mutex_unlock(&memcg_max_mutex);
38638c2ecf20Sopenharmony_ci	return ret;
38648c2ecf20Sopenharmony_ci}
38658c2ecf20Sopenharmony_ci
38668c2ecf20Sopenharmony_ci/*
38678c2ecf20Sopenharmony_ci * The users of this function are the writers of
38688c2ecf20Sopenharmony_ci * RES_LIMIT and RES_SOFT_LIMIT.
38698c2ecf20Sopenharmony_ci */
38708c2ecf20Sopenharmony_cistatic ssize_t mem_cgroup_write(struct kernfs_open_file *of,
38718c2ecf20Sopenharmony_ci				char *buf, size_t nbytes, loff_t off)
38728c2ecf20Sopenharmony_ci{
38738c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
38748c2ecf20Sopenharmony_ci	unsigned long nr_pages;
38758c2ecf20Sopenharmony_ci	int ret;
38768c2ecf20Sopenharmony_ci
38778c2ecf20Sopenharmony_ci	buf = strstrip(buf);
38788c2ecf20Sopenharmony_ci	ret = page_counter_memparse(buf, "-1", &nr_pages);
38798c2ecf20Sopenharmony_ci	if (ret)
38808c2ecf20Sopenharmony_ci		return ret;
38818c2ecf20Sopenharmony_ci
38828c2ecf20Sopenharmony_ci	switch (MEMFILE_ATTR(of_cft(of)->private)) {
38838c2ecf20Sopenharmony_ci	case RES_LIMIT:
38848c2ecf20Sopenharmony_ci		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
38858c2ecf20Sopenharmony_ci			ret = -EINVAL;
38868c2ecf20Sopenharmony_ci			break;
38878c2ecf20Sopenharmony_ci		}
38888c2ecf20Sopenharmony_ci		switch (MEMFILE_TYPE(of_cft(of)->private)) {
38898c2ecf20Sopenharmony_ci		case _MEM:
38908c2ecf20Sopenharmony_ci			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
38918c2ecf20Sopenharmony_ci			break;
38928c2ecf20Sopenharmony_ci		case _MEMSWAP:
38938c2ecf20Sopenharmony_ci			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
38948c2ecf20Sopenharmony_ci			break;
38958c2ecf20Sopenharmony_ci		case _KMEM:
38968c2ecf20Sopenharmony_ci			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
38978c2ecf20Sopenharmony_ci				     "Please report your usecase to linux-mm@kvack.org if you "
38988c2ecf20Sopenharmony_ci				     "depend on this functionality.\n");
38998c2ecf20Sopenharmony_ci			ret = memcg_update_kmem_max(memcg, nr_pages);
39008c2ecf20Sopenharmony_ci			break;
39018c2ecf20Sopenharmony_ci		case _TCP:
39028c2ecf20Sopenharmony_ci			ret = memcg_update_tcp_max(memcg, nr_pages);
39038c2ecf20Sopenharmony_ci			break;
39048c2ecf20Sopenharmony_ci		}
39058c2ecf20Sopenharmony_ci		break;
39068c2ecf20Sopenharmony_ci	case RES_SOFT_LIMIT:
39078c2ecf20Sopenharmony_ci		memcg->soft_limit = nr_pages;
39088c2ecf20Sopenharmony_ci		ret = 0;
39098c2ecf20Sopenharmony_ci		break;
39108c2ecf20Sopenharmony_ci	}
39118c2ecf20Sopenharmony_ci	return ret ?: nbytes;
39128c2ecf20Sopenharmony_ci}
39138c2ecf20Sopenharmony_ci
39148c2ecf20Sopenharmony_cistatic ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
39158c2ecf20Sopenharmony_ci				size_t nbytes, loff_t off)
39168c2ecf20Sopenharmony_ci{
39178c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
39188c2ecf20Sopenharmony_ci	struct page_counter *counter;
39198c2ecf20Sopenharmony_ci
39208c2ecf20Sopenharmony_ci	switch (MEMFILE_TYPE(of_cft(of)->private)) {
39218c2ecf20Sopenharmony_ci	case _MEM:
39228c2ecf20Sopenharmony_ci		counter = &memcg->memory;
39238c2ecf20Sopenharmony_ci		break;
39248c2ecf20Sopenharmony_ci	case _MEMSWAP:
39258c2ecf20Sopenharmony_ci		counter = &memcg->memsw;
39268c2ecf20Sopenharmony_ci		break;
39278c2ecf20Sopenharmony_ci	case _KMEM:
39288c2ecf20Sopenharmony_ci		counter = &memcg->kmem;
39298c2ecf20Sopenharmony_ci		break;
39308c2ecf20Sopenharmony_ci	case _TCP:
39318c2ecf20Sopenharmony_ci		counter = &memcg->tcpmem;
39328c2ecf20Sopenharmony_ci		break;
39338c2ecf20Sopenharmony_ci	default:
39348c2ecf20Sopenharmony_ci		BUG();
39358c2ecf20Sopenharmony_ci	}
39368c2ecf20Sopenharmony_ci
39378c2ecf20Sopenharmony_ci	switch (MEMFILE_ATTR(of_cft(of)->private)) {
39388c2ecf20Sopenharmony_ci	case RES_MAX_USAGE:
39398c2ecf20Sopenharmony_ci		page_counter_reset_watermark(counter);
39408c2ecf20Sopenharmony_ci		break;
39418c2ecf20Sopenharmony_ci	case RES_FAILCNT:
39428c2ecf20Sopenharmony_ci		counter->failcnt = 0;
39438c2ecf20Sopenharmony_ci		break;
39448c2ecf20Sopenharmony_ci	default:
39458c2ecf20Sopenharmony_ci		BUG();
39468c2ecf20Sopenharmony_ci	}
39478c2ecf20Sopenharmony_ci
39488c2ecf20Sopenharmony_ci	return nbytes;
39498c2ecf20Sopenharmony_ci}
39508c2ecf20Sopenharmony_ci
39518c2ecf20Sopenharmony_cistatic u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
39528c2ecf20Sopenharmony_ci					struct cftype *cft)
39538c2ecf20Sopenharmony_ci{
39548c2ecf20Sopenharmony_ci	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
39558c2ecf20Sopenharmony_ci}
39568c2ecf20Sopenharmony_ci
39578c2ecf20Sopenharmony_ci#ifdef CONFIG_MMU
39588c2ecf20Sopenharmony_cistatic int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
39598c2ecf20Sopenharmony_ci					struct cftype *cft, u64 val)
39608c2ecf20Sopenharmony_ci{
39618c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
39628c2ecf20Sopenharmony_ci
39638c2ecf20Sopenharmony_ci	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
39648c2ecf20Sopenharmony_ci		     "Please report your usecase to linux-mm@kvack.org if you "
39658c2ecf20Sopenharmony_ci		     "depend on this functionality.\n");
39668c2ecf20Sopenharmony_ci
39678c2ecf20Sopenharmony_ci	if (val & ~MOVE_MASK)
39688c2ecf20Sopenharmony_ci		return -EINVAL;
39698c2ecf20Sopenharmony_ci
39708c2ecf20Sopenharmony_ci	/*
39718c2ecf20Sopenharmony_ci	 * No locking is needed here, because ->can_attach() will
39728c2ecf20Sopenharmony_ci	 * check this value once at the beginning of the process, and then carry
39738c2ecf20Sopenharmony_ci	 * on with stale data. This means that changes to this value will only
39748c2ecf20Sopenharmony_ci	 * affect task migrations starting after the change.
39758c2ecf20Sopenharmony_ci	 */
39768c2ecf20Sopenharmony_ci	memcg->move_charge_at_immigrate = val;
39778c2ecf20Sopenharmony_ci	return 0;
39788c2ecf20Sopenharmony_ci}
39798c2ecf20Sopenharmony_ci#else
39808c2ecf20Sopenharmony_cistatic int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
39818c2ecf20Sopenharmony_ci					struct cftype *cft, u64 val)
39828c2ecf20Sopenharmony_ci{
39838c2ecf20Sopenharmony_ci	return -ENOSYS;
39848c2ecf20Sopenharmony_ci}
39858c2ecf20Sopenharmony_ci#endif
39868c2ecf20Sopenharmony_ci
39878c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA
39888c2ecf20Sopenharmony_ci
39898c2ecf20Sopenharmony_ci#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
39908c2ecf20Sopenharmony_ci#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
39918c2ecf20Sopenharmony_ci#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
39928c2ecf20Sopenharmony_ci
39938c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
39948c2ecf20Sopenharmony_ci				int nid, unsigned int lru_mask, bool tree)
39958c2ecf20Sopenharmony_ci{
39968c2ecf20Sopenharmony_ci	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
39978c2ecf20Sopenharmony_ci	unsigned long nr = 0;
39988c2ecf20Sopenharmony_ci	enum lru_list lru;
39998c2ecf20Sopenharmony_ci
40008c2ecf20Sopenharmony_ci	VM_BUG_ON((unsigned)nid >= nr_node_ids);
40018c2ecf20Sopenharmony_ci
40028c2ecf20Sopenharmony_ci	for_each_lru(lru) {
40038c2ecf20Sopenharmony_ci		if (!(BIT(lru) & lru_mask))
40048c2ecf20Sopenharmony_ci			continue;
40058c2ecf20Sopenharmony_ci		if (tree)
40068c2ecf20Sopenharmony_ci			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
40078c2ecf20Sopenharmony_ci		else
40088c2ecf20Sopenharmony_ci			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
40098c2ecf20Sopenharmony_ci	}
40108c2ecf20Sopenharmony_ci	return nr;
40118c2ecf20Sopenharmony_ci}
40128c2ecf20Sopenharmony_ci
40138c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
40148c2ecf20Sopenharmony_ci					     unsigned int lru_mask,
40158c2ecf20Sopenharmony_ci					     bool tree)
40168c2ecf20Sopenharmony_ci{
40178c2ecf20Sopenharmony_ci	unsigned long nr = 0;
40188c2ecf20Sopenharmony_ci	enum lru_list lru;
40198c2ecf20Sopenharmony_ci
40208c2ecf20Sopenharmony_ci	for_each_lru(lru) {
40218c2ecf20Sopenharmony_ci		if (!(BIT(lru) & lru_mask))
40228c2ecf20Sopenharmony_ci			continue;
40238c2ecf20Sopenharmony_ci		if (tree)
40248c2ecf20Sopenharmony_ci			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
40258c2ecf20Sopenharmony_ci		else
40268c2ecf20Sopenharmony_ci			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
40278c2ecf20Sopenharmony_ci	}
40288c2ecf20Sopenharmony_ci	return nr;
40298c2ecf20Sopenharmony_ci}
40308c2ecf20Sopenharmony_ci
40318c2ecf20Sopenharmony_cistatic int memcg_numa_stat_show(struct seq_file *m, void *v)
40328c2ecf20Sopenharmony_ci{
40338c2ecf20Sopenharmony_ci	struct numa_stat {
40348c2ecf20Sopenharmony_ci		const char *name;
40358c2ecf20Sopenharmony_ci		unsigned int lru_mask;
40368c2ecf20Sopenharmony_ci	};
40378c2ecf20Sopenharmony_ci
40388c2ecf20Sopenharmony_ci	static const struct numa_stat stats[] = {
40398c2ecf20Sopenharmony_ci		{ "total", LRU_ALL },
40408c2ecf20Sopenharmony_ci		{ "file", LRU_ALL_FILE },
40418c2ecf20Sopenharmony_ci		{ "anon", LRU_ALL_ANON },
40428c2ecf20Sopenharmony_ci		{ "unevictable", BIT(LRU_UNEVICTABLE) },
40438c2ecf20Sopenharmony_ci	};
40448c2ecf20Sopenharmony_ci	const struct numa_stat *stat;
40458c2ecf20Sopenharmony_ci	int nid;
40468c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
40478c2ecf20Sopenharmony_ci
40488c2ecf20Sopenharmony_ci	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
40498c2ecf20Sopenharmony_ci		seq_printf(m, "%s=%lu", stat->name,
40508c2ecf20Sopenharmony_ci			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
40518c2ecf20Sopenharmony_ci						   false));
40528c2ecf20Sopenharmony_ci		for_each_node_state(nid, N_MEMORY)
40538c2ecf20Sopenharmony_ci			seq_printf(m, " N%d=%lu", nid,
40548c2ecf20Sopenharmony_ci				   mem_cgroup_node_nr_lru_pages(memcg, nid,
40558c2ecf20Sopenharmony_ci							stat->lru_mask, false));
40568c2ecf20Sopenharmony_ci		seq_putc(m, '\n');
40578c2ecf20Sopenharmony_ci	}
40588c2ecf20Sopenharmony_ci
40598c2ecf20Sopenharmony_ci	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
40608c2ecf20Sopenharmony_ci
40618c2ecf20Sopenharmony_ci		seq_printf(m, "hierarchical_%s=%lu", stat->name,
40628c2ecf20Sopenharmony_ci			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
40638c2ecf20Sopenharmony_ci						   true));
40648c2ecf20Sopenharmony_ci		for_each_node_state(nid, N_MEMORY)
40658c2ecf20Sopenharmony_ci			seq_printf(m, " N%d=%lu", nid,
40668c2ecf20Sopenharmony_ci				   mem_cgroup_node_nr_lru_pages(memcg, nid,
40678c2ecf20Sopenharmony_ci							stat->lru_mask, true));
40688c2ecf20Sopenharmony_ci		seq_putc(m, '\n');
40698c2ecf20Sopenharmony_ci	}
40708c2ecf20Sopenharmony_ci
40718c2ecf20Sopenharmony_ci	return 0;
40728c2ecf20Sopenharmony_ci}
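
/*
 * Example memory.numa_stat output on a hypothetical two-node system
 * (values are page counts, not bytes):
 *
 *	total=2048 N0=1024 N1=1024
 *	file=1536 N0=768 N1=768
 *	anon=512 N0=256 N1=256
 *	unevictable=0 N0=0 N1=0
 *	hierarchical_total=4096 N0=2048 N1=2048
 *	...
 */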
40738c2ecf20Sopenharmony_ci#endif /* CONFIG_NUMA */
40748c2ecf20Sopenharmony_ci
40758c2ecf20Sopenharmony_cistatic const unsigned int memcg1_stats[] = {
40768c2ecf20Sopenharmony_ci	NR_FILE_PAGES,
40778c2ecf20Sopenharmony_ci	NR_ANON_MAPPED,
40788c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
40798c2ecf20Sopenharmony_ci	NR_ANON_THPS,
40808c2ecf20Sopenharmony_ci#endif
40818c2ecf20Sopenharmony_ci	NR_SHMEM,
40828c2ecf20Sopenharmony_ci	NR_FILE_MAPPED,
40838c2ecf20Sopenharmony_ci	NR_FILE_DIRTY,
40848c2ecf20Sopenharmony_ci	NR_WRITEBACK,
40858c2ecf20Sopenharmony_ci	MEMCG_SWAP,
40868c2ecf20Sopenharmony_ci};
40878c2ecf20Sopenharmony_ci
40888c2ecf20Sopenharmony_cistatic const char *const memcg1_stat_names[] = {
40898c2ecf20Sopenharmony_ci	"cache",
40908c2ecf20Sopenharmony_ci	"rss",
40918c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
40928c2ecf20Sopenharmony_ci	"rss_huge",
40938c2ecf20Sopenharmony_ci#endif
40948c2ecf20Sopenharmony_ci	"shmem",
40958c2ecf20Sopenharmony_ci	"mapped_file",
40968c2ecf20Sopenharmony_ci	"dirty",
40978c2ecf20Sopenharmony_ci	"writeback",
40988c2ecf20Sopenharmony_ci	"swap",
40998c2ecf20Sopenharmony_ci};
41008c2ecf20Sopenharmony_ci
41018c2ecf20Sopenharmony_ci/* Universal VM events that cgroup1 shows, in the original sort order */
41028c2ecf20Sopenharmony_cistatic const unsigned int memcg1_events[] = {
41038c2ecf20Sopenharmony_ci	PGPGIN,
41048c2ecf20Sopenharmony_ci	PGPGOUT,
41058c2ecf20Sopenharmony_ci	PGFAULT,
41068c2ecf20Sopenharmony_ci	PGMAJFAULT,
41078c2ecf20Sopenharmony_ci};
41088c2ecf20Sopenharmony_ci
41098c2ecf20Sopenharmony_cistatic int memcg_stat_show(struct seq_file *m, void *v)
41108c2ecf20Sopenharmony_ci{
41118c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
41128c2ecf20Sopenharmony_ci	unsigned long memory, memsw;
41138c2ecf20Sopenharmony_ci	struct mem_cgroup *mi;
41148c2ecf20Sopenharmony_ci	unsigned int i;
41158c2ecf20Sopenharmony_ci
41168c2ecf20Sopenharmony_ci	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
41178c2ecf20Sopenharmony_ci
41188c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
41198c2ecf20Sopenharmony_ci		unsigned long nr;
41208c2ecf20Sopenharmony_ci
41218c2ecf20Sopenharmony_ci		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
41228c2ecf20Sopenharmony_ci			continue;
41238c2ecf20Sopenharmony_ci		nr = memcg_page_state_local(memcg, memcg1_stats[i]);
41248c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
41258c2ecf20Sopenharmony_ci		if (memcg1_stats[i] == NR_ANON_THPS)
41268c2ecf20Sopenharmony_ci			nr *= HPAGE_PMD_NR;
41278c2ecf20Sopenharmony_ci#endif
41288c2ecf20Sopenharmony_ci		seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
41298c2ecf20Sopenharmony_ci	}
41308c2ecf20Sopenharmony_ci
41318c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
41328c2ecf20Sopenharmony_ci		seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
41338c2ecf20Sopenharmony_ci			   memcg_events_local(memcg, memcg1_events[i]));
41348c2ecf20Sopenharmony_ci
41358c2ecf20Sopenharmony_ci	for (i = 0; i < NR_LRU_LISTS; i++) {
41368c2ecf20Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
41378c2ecf20Sopenharmony_ci		if (i == LRU_INACTIVE_PURGEABLE || i == LRU_ACTIVE_PURGEABLE)
41388c2ecf20Sopenharmony_ci			continue;
41398c2ecf20Sopenharmony_ci#endif
41408c2ecf20Sopenharmony_ci		seq_printf(m, "%s %lu\n", lru_list_name(i),
41418c2ecf20Sopenharmony_ci			   memcg_page_state_local(memcg, NR_LRU_BASE + i) *
41428c2ecf20Sopenharmony_ci			   PAGE_SIZE);
41438c2ecf20Sopenharmony_ci	}
41448c2ecf20Sopenharmony_ci
41458c2ecf20Sopenharmony_ci	/* Hierarchical information */
41468c2ecf20Sopenharmony_ci	memory = memsw = PAGE_COUNTER_MAX;
41478c2ecf20Sopenharmony_ci	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
41488c2ecf20Sopenharmony_ci		memory = min(memory, READ_ONCE(mi->memory.max));
41498c2ecf20Sopenharmony_ci		memsw = min(memsw, READ_ONCE(mi->memsw.max));
41508c2ecf20Sopenharmony_ci	}
41518c2ecf20Sopenharmony_ci	seq_printf(m, "hierarchical_memory_limit %llu\n",
41528c2ecf20Sopenharmony_ci		   (u64)memory * PAGE_SIZE);
41538c2ecf20Sopenharmony_ci	if (do_memsw_account())
41548c2ecf20Sopenharmony_ci		seq_printf(m, "hierarchical_memsw_limit %llu\n",
41558c2ecf20Sopenharmony_ci			   (u64)memsw * PAGE_SIZE);
41568c2ecf20Sopenharmony_ci
41578c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
41588c2ecf20Sopenharmony_ci		unsigned long nr;
41598c2ecf20Sopenharmony_ci
41608c2ecf20Sopenharmony_ci		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
41618c2ecf20Sopenharmony_ci			continue;
41628c2ecf20Sopenharmony_ci		nr = memcg_page_state(memcg, memcg1_stats[i]);
41638c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
41648c2ecf20Sopenharmony_ci		if (memcg1_stats[i] == NR_ANON_THPS)
41658c2ecf20Sopenharmony_ci			nr *= HPAGE_PMD_NR;
41668c2ecf20Sopenharmony_ci#endif
41678c2ecf20Sopenharmony_ci		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
41688c2ecf20Sopenharmony_ci						(u64)nr * PAGE_SIZE);
41698c2ecf20Sopenharmony_ci	}
41708c2ecf20Sopenharmony_ci
41718c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
41728c2ecf20Sopenharmony_ci		seq_printf(m, "total_%s %llu\n",
41738c2ecf20Sopenharmony_ci			   vm_event_name(memcg1_events[i]),
41748c2ecf20Sopenharmony_ci			   (u64)memcg_events(memcg, memcg1_events[i]));
41758c2ecf20Sopenharmony_ci
41768c2ecf20Sopenharmony_ci	for (i = 0; i < NR_LRU_LISTS; i++) {
41778c2ecf20Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
41788c2ecf20Sopenharmony_ci		if (i == LRU_INACTIVE_PURGEABLE || i == LRU_ACTIVE_PURGEABLE)
41798c2ecf20Sopenharmony_ci			continue;
41808c2ecf20Sopenharmony_ci#endif
41818c2ecf20Sopenharmony_ci		seq_printf(m, "total_%s %llu\n", lru_list_name(i),
41828c2ecf20Sopenharmony_ci			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
41838c2ecf20Sopenharmony_ci			   PAGE_SIZE);
41848c2ecf20Sopenharmony_ci	}
41858c2ecf20Sopenharmony_ci
41868c2ecf20Sopenharmony_ci#ifdef CONFIG_DEBUG_VM
41878c2ecf20Sopenharmony_ci	{
41888c2ecf20Sopenharmony_ci		pg_data_t *pgdat;
41898c2ecf20Sopenharmony_ci		struct mem_cgroup_per_node *mz;
41908c2ecf20Sopenharmony_ci		unsigned long anon_cost = 0;
41918c2ecf20Sopenharmony_ci		unsigned long file_cost = 0;
41928c2ecf20Sopenharmony_ci
41938c2ecf20Sopenharmony_ci		for_each_online_pgdat(pgdat) {
41948c2ecf20Sopenharmony_ci			mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
41958c2ecf20Sopenharmony_ci
41968c2ecf20Sopenharmony_ci			anon_cost += mz->lruvec.anon_cost;
41978c2ecf20Sopenharmony_ci			file_cost += mz->lruvec.file_cost;
41988c2ecf20Sopenharmony_ci		}
41998c2ecf20Sopenharmony_ci		seq_printf(m, "anon_cost %lu\n", anon_cost);
42008c2ecf20Sopenharmony_ci		seq_printf(m, "file_cost %lu\n", file_cost);
42018c2ecf20Sopenharmony_ci	}
42028c2ecf20Sopenharmony_ci#endif
42038c2ecf20Sopenharmony_ci
42048c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_DEBUG
42058c2ecf20Sopenharmony_ci	memcg_eswap_info_show(m);
42068c2ecf20Sopenharmony_ci#endif
42078c2ecf20Sopenharmony_ci	return 0;
42088c2ecf20Sopenharmony_ci}
42098c2ecf20Sopenharmony_ci
42108c2ecf20Sopenharmony_cistatic u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
42118c2ecf20Sopenharmony_ci				      struct cftype *cft)
42128c2ecf20Sopenharmony_ci{
42138c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
42148c2ecf20Sopenharmony_ci
42158c2ecf20Sopenharmony_ci	return mem_cgroup_swappiness(memcg);
42168c2ecf20Sopenharmony_ci}
42178c2ecf20Sopenharmony_ci
42188c2ecf20Sopenharmony_cistatic int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
42198c2ecf20Sopenharmony_ci				       struct cftype *cft, u64 val)
42208c2ecf20Sopenharmony_ci{
42218c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
42228c2ecf20Sopenharmony_ci
42238c2ecf20Sopenharmony_ci	if (val > 200)
42248c2ecf20Sopenharmony_ci		return -EINVAL;
42258c2ecf20Sopenharmony_ci
42268c2ecf20Sopenharmony_ci	if (css->parent)
42278c2ecf20Sopenharmony_ci		memcg->swappiness = val;
42288c2ecf20Sopenharmony_ci	else
42298c2ecf20Sopenharmony_ci		vm_swappiness = val;
42308c2ecf20Sopenharmony_ci
42318c2ecf20Sopenharmony_ci	return 0;
42328c2ecf20Sopenharmony_ci}
42338c2ecf20Sopenharmony_ci
42348c2ecf20Sopenharmony_cistatic void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
42358c2ecf20Sopenharmony_ci{
42368c2ecf20Sopenharmony_ci	struct mem_cgroup_threshold_ary *t;
42378c2ecf20Sopenharmony_ci	unsigned long usage;
42388c2ecf20Sopenharmony_ci	int i;
42398c2ecf20Sopenharmony_ci
42408c2ecf20Sopenharmony_ci	rcu_read_lock();
42418c2ecf20Sopenharmony_ci	if (!swap)
42428c2ecf20Sopenharmony_ci		t = rcu_dereference(memcg->thresholds.primary);
42438c2ecf20Sopenharmony_ci	else
42448c2ecf20Sopenharmony_ci		t = rcu_dereference(memcg->memsw_thresholds.primary);
42458c2ecf20Sopenharmony_ci
42468c2ecf20Sopenharmony_ci	if (!t)
42478c2ecf20Sopenharmony_ci		goto unlock;
42488c2ecf20Sopenharmony_ci
42498c2ecf20Sopenharmony_ci	usage = mem_cgroup_usage(memcg, swap);
42508c2ecf20Sopenharmony_ci
42518c2ecf20Sopenharmony_ci	/*
42528c2ecf20Sopenharmony_ci	 * current_threshold points to the threshold just below or equal to
42538c2ecf20Sopenharmony_ci	 * usage. If that is no longer true, a threshold was crossed after the
42548c2ecf20Sopenharmony_ci	 * last call of __mem_cgroup_threshold().
42558c2ecf20Sopenharmony_ci	 */
42568c2ecf20Sopenharmony_ci	i = t->current_threshold;
42578c2ecf20Sopenharmony_ci
42588c2ecf20Sopenharmony_ci	/*
42598c2ecf20Sopenharmony_ci	 * Iterate backward over array of thresholds starting from
42608c2ecf20Sopenharmony_ci	 * current_threshold and check if a threshold is crossed.
42618c2ecf20Sopenharmony_ci	 * If none of the thresholds below usage is crossed, we read
42628c2ecf20Sopenharmony_ci	 * only one element of the array here.
42638c2ecf20Sopenharmony_ci	 */
42648c2ecf20Sopenharmony_ci	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
42658c2ecf20Sopenharmony_ci		eventfd_signal(t->entries[i].eventfd, 1);
42668c2ecf20Sopenharmony_ci
42678c2ecf20Sopenharmony_ci	/* i = current_threshold + 1 */
42688c2ecf20Sopenharmony_ci	i++;
42698c2ecf20Sopenharmony_ci
42708c2ecf20Sopenharmony_ci	/*
42718c2ecf20Sopenharmony_ci	 * Iterate forward over array of thresholds starting from
42728c2ecf20Sopenharmony_ci	 * current_threshold+1 and check if a threshold is crossed.
42738c2ecf20Sopenharmony_ci	 * If none of the thresholds above usage is crossed, we read
42748c2ecf20Sopenharmony_ci	 * only one element of the array here.
42758c2ecf20Sopenharmony_ci	 */
42768c2ecf20Sopenharmony_ci	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
42778c2ecf20Sopenharmony_ci		eventfd_signal(t->entries[i].eventfd, 1);
42788c2ecf20Sopenharmony_ci
42798c2ecf20Sopenharmony_ci	/* Update current_threshold */
42808c2ecf20Sopenharmony_ci	t->current_threshold = i - 1;
42818c2ecf20Sopenharmony_ciunlock:
42828c2ecf20Sopenharmony_ci	rcu_read_unlock();
42838c2ecf20Sopenharmony_ci}
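
/*
 * Worked example: with thresholds registered at {4M, 8M, 16M} and
 * current_threshold pointing at the 8M entry, a usage that dropped to 6M
 * signals only the 8M eventfd on the backward pass and leaves
 * current_threshold at the 4M entry; if usage instead grew to 20M, the
 * forward pass signals the 16M eventfd and current_threshold ends up at
 * the 16M entry.
 */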
42848c2ecf20Sopenharmony_ci
42858c2ecf20Sopenharmony_cistatic void mem_cgroup_threshold(struct mem_cgroup *memcg)
42868c2ecf20Sopenharmony_ci{
42878c2ecf20Sopenharmony_ci	while (memcg) {
42888c2ecf20Sopenharmony_ci		__mem_cgroup_threshold(memcg, false);
42898c2ecf20Sopenharmony_ci		if (do_memsw_account())
42908c2ecf20Sopenharmony_ci			__mem_cgroup_threshold(memcg, true);
42918c2ecf20Sopenharmony_ci
42928c2ecf20Sopenharmony_ci		memcg = parent_mem_cgroup(memcg);
42938c2ecf20Sopenharmony_ci	}
42948c2ecf20Sopenharmony_ci}
42958c2ecf20Sopenharmony_ci
42968c2ecf20Sopenharmony_cistatic int compare_thresholds(const void *a, const void *b)
42978c2ecf20Sopenharmony_ci{
42988c2ecf20Sopenharmony_ci	const struct mem_cgroup_threshold *_a = a;
42998c2ecf20Sopenharmony_ci	const struct mem_cgroup_threshold *_b = b;
43008c2ecf20Sopenharmony_ci
43018c2ecf20Sopenharmony_ci	if (_a->threshold > _b->threshold)
43028c2ecf20Sopenharmony_ci		return 1;
43038c2ecf20Sopenharmony_ci
43048c2ecf20Sopenharmony_ci	if (_a->threshold < _b->threshold)
43058c2ecf20Sopenharmony_ci		return -1;
43068c2ecf20Sopenharmony_ci
43078c2ecf20Sopenharmony_ci	return 0;
43088c2ecf20Sopenharmony_ci}
43098c2ecf20Sopenharmony_ci
43108c2ecf20Sopenharmony_cistatic int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
43118c2ecf20Sopenharmony_ci{
43128c2ecf20Sopenharmony_ci	struct mem_cgroup_eventfd_list *ev;
43138c2ecf20Sopenharmony_ci
43148c2ecf20Sopenharmony_ci	spin_lock(&memcg_oom_lock);
43158c2ecf20Sopenharmony_ci
43168c2ecf20Sopenharmony_ci	list_for_each_entry(ev, &memcg->oom_notify, list)
43178c2ecf20Sopenharmony_ci		eventfd_signal(ev->eventfd, 1);
43188c2ecf20Sopenharmony_ci
43198c2ecf20Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
43208c2ecf20Sopenharmony_ci	return 0;
43218c2ecf20Sopenharmony_ci}
43228c2ecf20Sopenharmony_ci
43238c2ecf20Sopenharmony_cistatic void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
43248c2ecf20Sopenharmony_ci{
43258c2ecf20Sopenharmony_ci	struct mem_cgroup *iter;
43268c2ecf20Sopenharmony_ci
43278c2ecf20Sopenharmony_ci	for_each_mem_cgroup_tree(iter, memcg)
43288c2ecf20Sopenharmony_ci		mem_cgroup_oom_notify_cb(iter);
43298c2ecf20Sopenharmony_ci}
43308c2ecf20Sopenharmony_ci
43318c2ecf20Sopenharmony_cistatic int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
43328c2ecf20Sopenharmony_ci	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
43338c2ecf20Sopenharmony_ci{
43348c2ecf20Sopenharmony_ci	struct mem_cgroup_thresholds *thresholds;
43358c2ecf20Sopenharmony_ci	struct mem_cgroup_threshold_ary *new;
43368c2ecf20Sopenharmony_ci	unsigned long threshold;
43378c2ecf20Sopenharmony_ci	unsigned long usage;
43388c2ecf20Sopenharmony_ci	int i, size, ret;
43398c2ecf20Sopenharmony_ci
43408c2ecf20Sopenharmony_ci	ret = page_counter_memparse(args, "-1", &threshold);
43418c2ecf20Sopenharmony_ci	if (ret)
43428c2ecf20Sopenharmony_ci		return ret;
43438c2ecf20Sopenharmony_ci
43448c2ecf20Sopenharmony_ci	mutex_lock(&memcg->thresholds_lock);
43458c2ecf20Sopenharmony_ci
43468c2ecf20Sopenharmony_ci	if (type == _MEM) {
43478c2ecf20Sopenharmony_ci		thresholds = &memcg->thresholds;
43488c2ecf20Sopenharmony_ci		usage = mem_cgroup_usage(memcg, false);
43498c2ecf20Sopenharmony_ci	} else if (type == _MEMSWAP) {
43508c2ecf20Sopenharmony_ci		thresholds = &memcg->memsw_thresholds;
43518c2ecf20Sopenharmony_ci		usage = mem_cgroup_usage(memcg, true);
43528c2ecf20Sopenharmony_ci	} else
43538c2ecf20Sopenharmony_ci		BUG();
43548c2ecf20Sopenharmony_ci
43558c2ecf20Sopenharmony_ci	/* Check if a threshold was crossed before adding a new one */
43568c2ecf20Sopenharmony_ci	if (thresholds->primary)
43578c2ecf20Sopenharmony_ci		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
43588c2ecf20Sopenharmony_ci
43598c2ecf20Sopenharmony_ci	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
43608c2ecf20Sopenharmony_ci
43618c2ecf20Sopenharmony_ci	/* Allocate memory for new array of thresholds */
43628c2ecf20Sopenharmony_ci	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
43638c2ecf20Sopenharmony_ci	if (!new) {
43648c2ecf20Sopenharmony_ci		ret = -ENOMEM;
43658c2ecf20Sopenharmony_ci		goto unlock;
43668c2ecf20Sopenharmony_ci	}
43678c2ecf20Sopenharmony_ci	new->size = size;
43688c2ecf20Sopenharmony_ci
43698c2ecf20Sopenharmony_ci	/* Copy thresholds (if any) to new array */
43708c2ecf20Sopenharmony_ci	if (thresholds->primary)
43718c2ecf20Sopenharmony_ci		memcpy(new->entries, thresholds->primary->entries,
43728c2ecf20Sopenharmony_ci		       flex_array_size(new, entries, size - 1));
43738c2ecf20Sopenharmony_ci
43748c2ecf20Sopenharmony_ci	/* Add new threshold */
43758c2ecf20Sopenharmony_ci	new->entries[size - 1].eventfd = eventfd;
43768c2ecf20Sopenharmony_ci	new->entries[size - 1].threshold = threshold;
43778c2ecf20Sopenharmony_ci
43788c2ecf20Sopenharmony_ci	/* Sort thresholds. Registering a new threshold isn't time-critical */
43798c2ecf20Sopenharmony_ci	sort(new->entries, size, sizeof(*new->entries),
43808c2ecf20Sopenharmony_ci			compare_thresholds, NULL);
43818c2ecf20Sopenharmony_ci
43828c2ecf20Sopenharmony_ci	/* Find current threshold */
43838c2ecf20Sopenharmony_ci	new->current_threshold = -1;
43848c2ecf20Sopenharmony_ci	for (i = 0; i < size; i++) {
43858c2ecf20Sopenharmony_ci		if (new->entries[i].threshold <= usage) {
43868c2ecf20Sopenharmony_ci			/*
43878c2ecf20Sopenharmony_ci			 * new->current_threshold will not be used until
43888c2ecf20Sopenharmony_ci			 * rcu_assign_pointer(), so it's safe to increment
43898c2ecf20Sopenharmony_ci			 * it here.
43908c2ecf20Sopenharmony_ci			 */
43918c2ecf20Sopenharmony_ci			++new->current_threshold;
43928c2ecf20Sopenharmony_ci		} else
43938c2ecf20Sopenharmony_ci			break;
43948c2ecf20Sopenharmony_ci	}
43958c2ecf20Sopenharmony_ci
43968c2ecf20Sopenharmony_ci	/* Free old spare buffer and save old primary buffer as spare */
43978c2ecf20Sopenharmony_ci	kfree(thresholds->spare);
43988c2ecf20Sopenharmony_ci	thresholds->spare = thresholds->primary;
43998c2ecf20Sopenharmony_ci
44008c2ecf20Sopenharmony_ci	rcu_assign_pointer(thresholds->primary, new);
44018c2ecf20Sopenharmony_ci
44028c2ecf20Sopenharmony_ci	/* To be sure that nobody still uses the old thresholds array */
44038c2ecf20Sopenharmony_ci	synchronize_rcu();
44048c2ecf20Sopenharmony_ci
44058c2ecf20Sopenharmony_ciunlock:
44068c2ecf20Sopenharmony_ci	mutex_unlock(&memcg->thresholds_lock);
44078c2ecf20Sopenharmony_ci
44088c2ecf20Sopenharmony_ci	return ret;
44098c2ecf20Sopenharmony_ci}
44108c2ecf20Sopenharmony_ci
44118c2ecf20Sopenharmony_cistatic int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
44128c2ecf20Sopenharmony_ci	struct eventfd_ctx *eventfd, const char *args)
44138c2ecf20Sopenharmony_ci{
44148c2ecf20Sopenharmony_ci	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
44158c2ecf20Sopenharmony_ci}
44168c2ecf20Sopenharmony_ci
44178c2ecf20Sopenharmony_cistatic int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
44188c2ecf20Sopenharmony_ci	struct eventfd_ctx *eventfd, const char *args)
44198c2ecf20Sopenharmony_ci{
44208c2ecf20Sopenharmony_ci	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
44218c2ecf20Sopenharmony_ci}
44228c2ecf20Sopenharmony_ci
44238c2ecf20Sopenharmony_cistatic void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
44248c2ecf20Sopenharmony_ci	struct eventfd_ctx *eventfd, enum res_type type)
44258c2ecf20Sopenharmony_ci{
44268c2ecf20Sopenharmony_ci	struct mem_cgroup_thresholds *thresholds;
44278c2ecf20Sopenharmony_ci	struct mem_cgroup_threshold_ary *new;
44288c2ecf20Sopenharmony_ci	unsigned long usage;
44298c2ecf20Sopenharmony_ci	int i, j, size, entries;
44308c2ecf20Sopenharmony_ci
44318c2ecf20Sopenharmony_ci	mutex_lock(&memcg->thresholds_lock);
44328c2ecf20Sopenharmony_ci
44338c2ecf20Sopenharmony_ci	if (type == _MEM) {
44348c2ecf20Sopenharmony_ci		thresholds = &memcg->thresholds;
44358c2ecf20Sopenharmony_ci		usage = mem_cgroup_usage(memcg, false);
44368c2ecf20Sopenharmony_ci	} else if (type == _MEMSWAP) {
44378c2ecf20Sopenharmony_ci		thresholds = &memcg->memsw_thresholds;
44388c2ecf20Sopenharmony_ci		usage = mem_cgroup_usage(memcg, true);
44398c2ecf20Sopenharmony_ci	} else
44408c2ecf20Sopenharmony_ci		BUG();
44418c2ecf20Sopenharmony_ci
44428c2ecf20Sopenharmony_ci	if (!thresholds->primary)
44438c2ecf20Sopenharmony_ci		goto unlock;
44448c2ecf20Sopenharmony_ci
44458c2ecf20Sopenharmony_ci	/* Check if a threshold was crossed before removing one */
44468c2ecf20Sopenharmony_ci	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
44478c2ecf20Sopenharmony_ci
44488c2ecf20Sopenharmony_ci	/* Calculate the new number of thresholds */
44498c2ecf20Sopenharmony_ci	size = entries = 0;
44508c2ecf20Sopenharmony_ci	for (i = 0; i < thresholds->primary->size; i++) {
44518c2ecf20Sopenharmony_ci		if (thresholds->primary->entries[i].eventfd != eventfd)
44528c2ecf20Sopenharmony_ci			size++;
44538c2ecf20Sopenharmony_ci		else
44548c2ecf20Sopenharmony_ci			entries++;
44558c2ecf20Sopenharmony_ci	}
44568c2ecf20Sopenharmony_ci
44578c2ecf20Sopenharmony_ci	new = thresholds->spare;
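	/*
	 * Reuse the spare buffer saved by a previous registration: it is
	 * always large enough for the shrunken array, which is why
	 * unregistration never needs to allocate memory and cannot fail.
	 */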
44588c2ecf20Sopenharmony_ci
44598c2ecf20Sopenharmony_ci	/* If no entries matching the eventfd were found, there is nothing to do */
44608c2ecf20Sopenharmony_ci	if (!entries)
44618c2ecf20Sopenharmony_ci		goto unlock;
44628c2ecf20Sopenharmony_ci
44638c2ecf20Sopenharmony_ci	/* Set thresholds array to NULL if we don't have thresholds */
44648c2ecf20Sopenharmony_ci	if (!size) {
44658c2ecf20Sopenharmony_ci		kfree(new);
44668c2ecf20Sopenharmony_ci		new = NULL;
44678c2ecf20Sopenharmony_ci		goto swap_buffers;
44688c2ecf20Sopenharmony_ci	}
44698c2ecf20Sopenharmony_ci
44708c2ecf20Sopenharmony_ci	new->size = size;
44718c2ecf20Sopenharmony_ci
44728c2ecf20Sopenharmony_ci	/* Copy thresholds and find current threshold */
44738c2ecf20Sopenharmony_ci	new->current_threshold = -1;
44748c2ecf20Sopenharmony_ci	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
44758c2ecf20Sopenharmony_ci		if (thresholds->primary->entries[i].eventfd == eventfd)
44768c2ecf20Sopenharmony_ci			continue;
44778c2ecf20Sopenharmony_ci
44788c2ecf20Sopenharmony_ci		new->entries[j] = thresholds->primary->entries[i];
44798c2ecf20Sopenharmony_ci		if (new->entries[j].threshold <= usage) {
44808c2ecf20Sopenharmony_ci			/*
44818c2ecf20Sopenharmony_ci			 * new->current_threshold will not be used
44828c2ecf20Sopenharmony_ci			 * until rcu_assign_pointer(), so it's safe to increment
44838c2ecf20Sopenharmony_ci			 * it here.
44848c2ecf20Sopenharmony_ci			 */
44858c2ecf20Sopenharmony_ci			++new->current_threshold;
44868c2ecf20Sopenharmony_ci		}
44878c2ecf20Sopenharmony_ci		j++;
44888c2ecf20Sopenharmony_ci	}
44898c2ecf20Sopenharmony_ci
44908c2ecf20Sopenharmony_ciswap_buffers:
44918c2ecf20Sopenharmony_ci	/* Swap primary and spare array */
44928c2ecf20Sopenharmony_ci	thresholds->spare = thresholds->primary;
44938c2ecf20Sopenharmony_ci
44948c2ecf20Sopenharmony_ci	rcu_assign_pointer(thresholds->primary, new);
44958c2ecf20Sopenharmony_ci
44968c2ecf20Sopenharmony_ci	/* To be sure that nobody still uses the old thresholds array */
44978c2ecf20Sopenharmony_ci	synchronize_rcu();
44988c2ecf20Sopenharmony_ci
44998c2ecf20Sopenharmony_ci	/* If all events are unregistered, free the spare array */
45008c2ecf20Sopenharmony_ci	if (!new) {
45018c2ecf20Sopenharmony_ci		kfree(thresholds->spare);
45028c2ecf20Sopenharmony_ci		thresholds->spare = NULL;
45038c2ecf20Sopenharmony_ci	}
45048c2ecf20Sopenharmony_ciunlock:
45058c2ecf20Sopenharmony_ci	mutex_unlock(&memcg->thresholds_lock);
45068c2ecf20Sopenharmony_ci}
45078c2ecf20Sopenharmony_ci
45088c2ecf20Sopenharmony_cistatic void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
45098c2ecf20Sopenharmony_ci	struct eventfd_ctx *eventfd)
45108c2ecf20Sopenharmony_ci{
45118c2ecf20Sopenharmony_ci	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
45128c2ecf20Sopenharmony_ci}
45138c2ecf20Sopenharmony_ci
45148c2ecf20Sopenharmony_cistatic void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
45158c2ecf20Sopenharmony_ci	struct eventfd_ctx *eventfd)
45168c2ecf20Sopenharmony_ci{
45178c2ecf20Sopenharmony_ci	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
45188c2ecf20Sopenharmony_ci}
45198c2ecf20Sopenharmony_ci
45208c2ecf20Sopenharmony_cistatic int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
45218c2ecf20Sopenharmony_ci	struct eventfd_ctx *eventfd, const char *args)
45228c2ecf20Sopenharmony_ci{
45238c2ecf20Sopenharmony_ci	struct mem_cgroup_eventfd_list *event;
45248c2ecf20Sopenharmony_ci
45258c2ecf20Sopenharmony_ci	event = kmalloc(sizeof(*event),	GFP_KERNEL);
45268c2ecf20Sopenharmony_ci	if (!event)
45278c2ecf20Sopenharmony_ci		return -ENOMEM;
45288c2ecf20Sopenharmony_ci
45298c2ecf20Sopenharmony_ci	spin_lock(&memcg_oom_lock);
45308c2ecf20Sopenharmony_ci
45318c2ecf20Sopenharmony_ci	event->eventfd = eventfd;
45328c2ecf20Sopenharmony_ci	list_add(&event->list, &memcg->oom_notify);
45338c2ecf20Sopenharmony_ci
45348c2ecf20Sopenharmony_ci	/* already in OOM? */
45358c2ecf20Sopenharmony_ci	if (memcg->under_oom)
45368c2ecf20Sopenharmony_ci		eventfd_signal(eventfd, 1);
45378c2ecf20Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
45388c2ecf20Sopenharmony_ci
45398c2ecf20Sopenharmony_ci	return 0;
45408c2ecf20Sopenharmony_ci}
45418c2ecf20Sopenharmony_ci
45428c2ecf20Sopenharmony_cistatic void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
45438c2ecf20Sopenharmony_ci	struct eventfd_ctx *eventfd)
45448c2ecf20Sopenharmony_ci{
45458c2ecf20Sopenharmony_ci	struct mem_cgroup_eventfd_list *ev, *tmp;
45468c2ecf20Sopenharmony_ci
45478c2ecf20Sopenharmony_ci	spin_lock(&memcg_oom_lock);
45488c2ecf20Sopenharmony_ci
45498c2ecf20Sopenharmony_ci	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
45508c2ecf20Sopenharmony_ci		if (ev->eventfd == eventfd) {
45518c2ecf20Sopenharmony_ci			list_del(&ev->list);
45528c2ecf20Sopenharmony_ci			kfree(ev);
45538c2ecf20Sopenharmony_ci		}
45548c2ecf20Sopenharmony_ci	}
45558c2ecf20Sopenharmony_ci
45568c2ecf20Sopenharmony_ci	spin_unlock(&memcg_oom_lock);
45578c2ecf20Sopenharmony_ci}
45588c2ecf20Sopenharmony_ci
45598c2ecf20Sopenharmony_cistatic int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
45608c2ecf20Sopenharmony_ci{
45618c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
45628c2ecf20Sopenharmony_ci
45638c2ecf20Sopenharmony_ci	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
45648c2ecf20Sopenharmony_ci	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
45658c2ecf20Sopenharmony_ci	seq_printf(sf, "oom_kill %lu\n",
45668c2ecf20Sopenharmony_ci		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
45678c2ecf20Sopenharmony_ci	return 0;
45688c2ecf20Sopenharmony_ci}
45698c2ecf20Sopenharmony_ci
45708c2ecf20Sopenharmony_cistatic int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
45718c2ecf20Sopenharmony_ci	struct cftype *cft, u64 val)
45728c2ecf20Sopenharmony_ci{
45738c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
45748c2ecf20Sopenharmony_ci
45758c2ecf20Sopenharmony_ci	/* cannot be set on the root cgroup, and only 0 and 1 are allowed */
45768c2ecf20Sopenharmony_ci	if (!css->parent || !((val == 0) || (val == 1)))
45778c2ecf20Sopenharmony_ci		return -EINVAL;
45788c2ecf20Sopenharmony_ci
45798c2ecf20Sopenharmony_ci	memcg->oom_kill_disable = val;
45808c2ecf20Sopenharmony_ci	if (!val)
45818c2ecf20Sopenharmony_ci		memcg_oom_recover(memcg);
45828c2ecf20Sopenharmony_ci
45838c2ecf20Sopenharmony_ci	return 0;
45848c2ecf20Sopenharmony_ci}
45858c2ecf20Sopenharmony_ci
45868c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK
45878c2ecf20Sopenharmony_ci
45888c2ecf20Sopenharmony_ci#include <trace/events/writeback.h>
45898c2ecf20Sopenharmony_ci
45908c2ecf20Sopenharmony_cistatic int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
45918c2ecf20Sopenharmony_ci{
45928c2ecf20Sopenharmony_ci	return wb_domain_init(&memcg->cgwb_domain, gfp);
45938c2ecf20Sopenharmony_ci}
45948c2ecf20Sopenharmony_ci
45958c2ecf20Sopenharmony_cistatic void memcg_wb_domain_exit(struct mem_cgroup *memcg)
45968c2ecf20Sopenharmony_ci{
45978c2ecf20Sopenharmony_ci	wb_domain_exit(&memcg->cgwb_domain);
45988c2ecf20Sopenharmony_ci}
45998c2ecf20Sopenharmony_ci
46008c2ecf20Sopenharmony_cistatic void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
46018c2ecf20Sopenharmony_ci{
46028c2ecf20Sopenharmony_ci	wb_domain_size_changed(&memcg->cgwb_domain);
46038c2ecf20Sopenharmony_ci}
46048c2ecf20Sopenharmony_ci
46058c2ecf20Sopenharmony_cistruct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
46068c2ecf20Sopenharmony_ci{
46078c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
46088c2ecf20Sopenharmony_ci
46098c2ecf20Sopenharmony_ci	if (!memcg->css.parent)
46108c2ecf20Sopenharmony_ci		return NULL;
46118c2ecf20Sopenharmony_ci
46128c2ecf20Sopenharmony_ci	return &memcg->cgwb_domain;
46138c2ecf20Sopenharmony_ci}
46148c2ecf20Sopenharmony_ci
46158c2ecf20Sopenharmony_ci/*
46168c2ecf20Sopenharmony_ci * idx can be of type enum memcg_stat_item or node_stat_item.
46178c2ecf20Sopenharmony_ci * Keep in sync with memcg_exact_page().
46188c2ecf20Sopenharmony_ci */
46198c2ecf20Sopenharmony_cistatic unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
46208c2ecf20Sopenharmony_ci{
46218c2ecf20Sopenharmony_ci	long x = atomic_long_read(&memcg->vmstats[idx]);
46228c2ecf20Sopenharmony_ci	int cpu;
46238c2ecf20Sopenharmony_ci
46248c2ecf20Sopenharmony_ci	for_each_online_cpu(cpu)
46258c2ecf20Sopenharmony_ci		x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
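	/*
	 * Unflushed per-cpu deltas can race with the flushed atomic counter
	 * and transiently drive the sum negative; clamp it to zero.
	 */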
46268c2ecf20Sopenharmony_ci	if (x < 0)
46278c2ecf20Sopenharmony_ci		x = 0;
46288c2ecf20Sopenharmony_ci	return x;
46298c2ecf20Sopenharmony_ci}
46308c2ecf20Sopenharmony_ci
46318c2ecf20Sopenharmony_ci/**
46328c2ecf20Sopenharmony_ci * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
46338c2ecf20Sopenharmony_ci * @wb: bdi_writeback in question
46348c2ecf20Sopenharmony_ci * @pfilepages: out parameter for number of file pages
46358c2ecf20Sopenharmony_ci * @pheadroom: out parameter for number of allocatable pages according to memcg
46368c2ecf20Sopenharmony_ci * @pdirty: out parameter for number of dirty pages
46378c2ecf20Sopenharmony_ci * @pwriteback: out parameter for number of pages under writeback
46388c2ecf20Sopenharmony_ci *
46398c2ecf20Sopenharmony_ci * Determine the numbers of file, headroom, dirty, and writeback pages in
46408c2ecf20Sopenharmony_ci * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
46418c2ecf20Sopenharmony_ci * is a bit more involved.
46428c2ecf20Sopenharmony_ci *
46438c2ecf20Sopenharmony_ci * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
46448c2ecf20Sopenharmony_ci * headroom is calculated as the lowest headroom of itself and the
46458c2ecf20Sopenharmony_ci * ancestors.  Note that this doesn't consider the actual amount of
46468c2ecf20Sopenharmony_ci * available memory in the system.  The caller should further cap
46478c2ecf20Sopenharmony_ci * *@pheadroom accordingly.
46488c2ecf20Sopenharmony_ci */
46498c2ecf20Sopenharmony_civoid mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
46508c2ecf20Sopenharmony_ci			 unsigned long *pheadroom, unsigned long *pdirty,
46518c2ecf20Sopenharmony_ci			 unsigned long *pwriteback)
46528c2ecf20Sopenharmony_ci{
46538c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
46548c2ecf20Sopenharmony_ci	struct mem_cgroup *parent;
46558c2ecf20Sopenharmony_ci
46568c2ecf20Sopenharmony_ci	*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
46578c2ecf20Sopenharmony_ci
46588c2ecf20Sopenharmony_ci	*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
46598c2ecf20Sopenharmony_ci	*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
46608c2ecf20Sopenharmony_ci			memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
46618c2ecf20Sopenharmony_ci	*pheadroom = PAGE_COUNTER_MAX;
46628c2ecf20Sopenharmony_ci
46638c2ecf20Sopenharmony_ci	while ((parent = parent_mem_cgroup(memcg))) {
46648c2ecf20Sopenharmony_ci		unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
46658c2ecf20Sopenharmony_ci					    READ_ONCE(memcg->memory.high));
46668c2ecf20Sopenharmony_ci		unsigned long used = page_counter_read(&memcg->memory);
46678c2ecf20Sopenharmony_ci
46688c2ecf20Sopenharmony_ci		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
46698c2ecf20Sopenharmony_ci		memcg = parent;
46708c2ecf20Sopenharmony_ci	}
46718c2ecf20Sopenharmony_ci}
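/*
 * Example with hypothetical numbers: a memcg with high=1G and 700M used has
 * 300M of local headroom; if its parent caps out at max=2G with 1.9G used,
 * the parent level only has about 100M of headroom, and *pheadroom is
 * reported as the smaller of the two, about 100M.
 */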
46728c2ecf20Sopenharmony_ci
46738c2ecf20Sopenharmony_ci/*
46748c2ecf20Sopenharmony_ci * Foreign dirty flushing
46758c2ecf20Sopenharmony_ci *
46768c2ecf20Sopenharmony_ci * There's an inherent mismatch between memcg and writeback.  The former
46778c2ecf20Sopenharmony_ci * tracks ownership per-page while the latter does so per-inode.  This was
46788c2ecf20Sopenharmony_ci * a deliberate design decision because honoring per-page ownership in the
46798c2ecf20Sopenharmony_ci * writeback path is complicated, may lead to higher CPU and IO overheads,
46808c2ecf20Sopenharmony_ci * and is deemed unnecessary given that write-sharing an inode across
46818c2ecf20Sopenharmony_ci * different cgroups isn't a common use-case.
46828c2ecf20Sopenharmony_ci *
46838c2ecf20Sopenharmony_ci * Combined with inode majority-writer ownership switching, this works well
46848c2ecf20Sopenharmony_ci * enough in most cases but there are some pathological cases.  For
46858c2ecf20Sopenharmony_ci * example, let's say there are two cgroups A and B which keep writing to
46868c2ecf20Sopenharmony_ci * different but confined parts of the same inode.  B owns the inode and
46878c2ecf20Sopenharmony_ci * A's memory is limited far below B's.  A's dirty ratio can rise enough to
46888c2ecf20Sopenharmony_ci * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
46898c2ecf20Sopenharmony_ci * triggering background writeback.  A will be slowed down without a way to
46908c2ecf20Sopenharmony_ci * make writeback of the dirty pages happen.
46918c2ecf20Sopenharmony_ci *
46928c2ecf20Sopenharmony_ci * Conditions like the above can lead to a cgroup getting repeatedly and
46938c2ecf20Sopenharmony_ci * severely throttled after making some progress each
46948c2ecf20Sopenharmony_ci * dirty_expire_interval while the underlying IO device is almost
46958c2ecf20Sopenharmony_ci * completely idle.
46968c2ecf20Sopenharmony_ci *
46978c2ecf20Sopenharmony_ci * Solving this problem completely requires matching the ownership tracking
46988c2ecf20Sopenharmony_ci * granularities between memcg and writeback in either direction.  However,
46998c2ecf20Sopenharmony_ci * the more egregious behaviors can be avoided by simply remembering the
47008c2ecf20Sopenharmony_ci * most recent foreign dirtying events and initiating remote flushes on
47018c2ecf20Sopenharmony_ci * them when local writeback isn't enough to keep the memory clean enough.
47028c2ecf20Sopenharmony_ci *
47038c2ecf20Sopenharmony_ci * The following two functions implement such a mechanism.  When a foreign
47048c2ecf20Sopenharmony_ci * page - a page whose memcg and writeback ownerships don't match - is
47058c2ecf20Sopenharmony_ci * dirtied, mem_cgroup_track_foreign_dirty() records the bdi_writeback that
47068c2ecf20Sopenharmony_ci * owns the inode in the memcg that owns the page.  When balance_dirty_pages()
47078c2ecf20Sopenharmony_ci * decides that the memcg needs to sleep due to high dirty ratio, it calls
47088c2ecf20Sopenharmony_ci * mem_cgroup_flush_foreign() which queues writeback on the recorded
47098c2ecf20Sopenharmony_ci * foreign bdi_writebacks which haven't expired.  Both the numbers of
47108c2ecf20Sopenharmony_ci * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
47118c2ecf20Sopenharmony_ci * limited to MEMCG_CGWB_FRN_CNT.
47128c2ecf20Sopenharmony_ci *
47138c2ecf20Sopenharmony_ci * The mechanism only remembers IDs and doesn't hold any object references.
47148c2ecf20Sopenharmony_ci * As being wrong occasionally doesn't matter, updates and accesses to the
47158c2ecf20Sopenharmony_ci * records are lockless and racy.
47168c2ecf20Sopenharmony_ci */
47178c2ecf20Sopenharmony_civoid mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
47188c2ecf20Sopenharmony_ci					     struct bdi_writeback *wb)
47198c2ecf20Sopenharmony_ci{
47208c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = page->mem_cgroup;
47218c2ecf20Sopenharmony_ci	struct memcg_cgwb_frn *frn;
47228c2ecf20Sopenharmony_ci	u64 now = get_jiffies_64();
47238c2ecf20Sopenharmony_ci	u64 oldest_at = now;
47248c2ecf20Sopenharmony_ci	int oldest = -1;
47258c2ecf20Sopenharmony_ci	int i;
47268c2ecf20Sopenharmony_ci
47278c2ecf20Sopenharmony_ci	trace_track_foreign_dirty(page, wb);
47288c2ecf20Sopenharmony_ci
47298c2ecf20Sopenharmony_ci	/*
47308c2ecf20Sopenharmony_ci	 * Pick the slot to use.  If there is already a slot for @wb, keep
47318c2ecf20Sopenharmony_ci	 * using it.  If not, replace the oldest one that isn't being
47328c2ecf20Sopenharmony_ci	 * written out.
47338c2ecf20Sopenharmony_ci	 */
47348c2ecf20Sopenharmony_ci	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
47358c2ecf20Sopenharmony_ci		frn = &memcg->cgwb_frn[i];
47368c2ecf20Sopenharmony_ci		if (frn->bdi_id == wb->bdi->id &&
47378c2ecf20Sopenharmony_ci		    frn->memcg_id == wb->memcg_css->id)
47388c2ecf20Sopenharmony_ci			break;
47398c2ecf20Sopenharmony_ci		if (time_before64(frn->at, oldest_at) &&
47408c2ecf20Sopenharmony_ci		    atomic_read(&frn->done.cnt) == 1) {
47418c2ecf20Sopenharmony_ci			oldest = i;
47428c2ecf20Sopenharmony_ci			oldest_at = frn->at;
47438c2ecf20Sopenharmony_ci		}
47448c2ecf20Sopenharmony_ci	}
47458c2ecf20Sopenharmony_ci
47468c2ecf20Sopenharmony_ci	if (i < MEMCG_CGWB_FRN_CNT) {
47478c2ecf20Sopenharmony_ci		/*
47488c2ecf20Sopenharmony_ci		 * Re-using an existing one.  Update the timestamp lazily to
47498c2ecf20Sopenharmony_ci		 * avoid making the cacheline hot.  We want it reasonably
47508c2ecf20Sopenharmony_ci		 * up-to-date, with an update interval significantly shorter
47518c2ecf20Sopenharmony_ci		 * than dirty_expire_interval as that's what expires the
47528c2ecf20Sopenharmony_ci		 * record.  Use the shorter of 1s and dirty_expire_interval / 8.
47538c2ecf20Sopenharmony_ci		 */
47548c2ecf20Sopenharmony_ci		unsigned long update_intv =
47558c2ecf20Sopenharmony_ci			min_t(unsigned long, HZ,
47568c2ecf20Sopenharmony_ci			      msecs_to_jiffies(dirty_expire_interval * 10) / 8);
47578c2ecf20Sopenharmony_ci
47588c2ecf20Sopenharmony_ci		if (time_before64(frn->at, now - update_intv))
47598c2ecf20Sopenharmony_ci			frn->at = now;
47608c2ecf20Sopenharmony_ci	} else if (oldest >= 0) {
47618c2ecf20Sopenharmony_ci		/* replace the oldest free one */
47628c2ecf20Sopenharmony_ci		frn = &memcg->cgwb_frn[oldest];
47638c2ecf20Sopenharmony_ci		frn->bdi_id = wb->bdi->id;
47648c2ecf20Sopenharmony_ci		frn->memcg_id = wb->memcg_css->id;
47658c2ecf20Sopenharmony_ci		frn->at = now;
47668c2ecf20Sopenharmony_ci	}
47678c2ecf20Sopenharmony_ci}
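/*
 * With the default dirty_expire_interval of 30 seconds (3000 centisecs), the
 * update interval above works out to min(HZ, 3.75s) = 1s: a re-used record's
 * timestamp is refreshed at most once per second, while
 * mem_cgroup_flush_foreign() below only kicks writeback for records younger
 * than the full 30 seconds.
 */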
47688c2ecf20Sopenharmony_ci
47698c2ecf20Sopenharmony_ci/* issue foreign writeback flushes for recorded foreign dirtying events */
47708c2ecf20Sopenharmony_civoid mem_cgroup_flush_foreign(struct bdi_writeback *wb)
47718c2ecf20Sopenharmony_ci{
47728c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
47738c2ecf20Sopenharmony_ci	unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
47748c2ecf20Sopenharmony_ci	u64 now = jiffies_64;
47758c2ecf20Sopenharmony_ci	int i;
47768c2ecf20Sopenharmony_ci
47778c2ecf20Sopenharmony_ci	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
47788c2ecf20Sopenharmony_ci		struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
47798c2ecf20Sopenharmony_ci
47808c2ecf20Sopenharmony_ci		/*
47818c2ecf20Sopenharmony_ci		 * If the record is older than dirty_expire_interval,
47828c2ecf20Sopenharmony_ci		 * writeback on it has already started.  No need to kick it
47838c2ecf20Sopenharmony_ci		 * off again.  Also, don't start a new one if there's
47848c2ecf20Sopenharmony_ci		 * already one in flight.
47858c2ecf20Sopenharmony_ci		 */
47868c2ecf20Sopenharmony_ci		if (time_after64(frn->at, now - intv) &&
47878c2ecf20Sopenharmony_ci		    atomic_read(&frn->done.cnt) == 1) {
47888c2ecf20Sopenharmony_ci			frn->at = 0;
47898c2ecf20Sopenharmony_ci			trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
47908c2ecf20Sopenharmony_ci			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
47918c2ecf20Sopenharmony_ci					       WB_REASON_FOREIGN_FLUSH,
47928c2ecf20Sopenharmony_ci					       &frn->done);
47938c2ecf20Sopenharmony_ci		}
47948c2ecf20Sopenharmony_ci	}
47958c2ecf20Sopenharmony_ci}
47968c2ecf20Sopenharmony_ci
47978c2ecf20Sopenharmony_ci#else	/* CONFIG_CGROUP_WRITEBACK */
47988c2ecf20Sopenharmony_ci
47998c2ecf20Sopenharmony_cistatic int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
48008c2ecf20Sopenharmony_ci{
48018c2ecf20Sopenharmony_ci	return 0;
48028c2ecf20Sopenharmony_ci}
48038c2ecf20Sopenharmony_ci
48048c2ecf20Sopenharmony_cistatic void memcg_wb_domain_exit(struct mem_cgroup *memcg)
48058c2ecf20Sopenharmony_ci{
48068c2ecf20Sopenharmony_ci}
48078c2ecf20Sopenharmony_ci
48088c2ecf20Sopenharmony_cistatic void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
48098c2ecf20Sopenharmony_ci{
48108c2ecf20Sopenharmony_ci}
48118c2ecf20Sopenharmony_ci
48128c2ecf20Sopenharmony_ci#endif	/* CONFIG_CGROUP_WRITEBACK */
48138c2ecf20Sopenharmony_ci
48148c2ecf20Sopenharmony_ci/*
48158c2ecf20Sopenharmony_ci * DO NOT USE IN NEW FILES.
48168c2ecf20Sopenharmony_ci *
48178c2ecf20Sopenharmony_ci * "cgroup.event_control" implementation.
48188c2ecf20Sopenharmony_ci *
48198c2ecf20Sopenharmony_ci * This is way over-engineered.  It tries to support fully configurable
48208c2ecf20Sopenharmony_ci * events for each user.  Such a level of flexibility is completely
48218c2ecf20Sopenharmony_ci * unnecessary, especially in light of the planned unified hierarchy.
48228c2ecf20Sopenharmony_ci *
48238c2ecf20Sopenharmony_ci * Please deprecate this and replace with something simpler if at all
48248c2ecf20Sopenharmony_ci * possible.
48258c2ecf20Sopenharmony_ci */
48268c2ecf20Sopenharmony_ci
48278c2ecf20Sopenharmony_ci/*
48288c2ecf20Sopenharmony_ci * Unregister event and free resources.
48298c2ecf20Sopenharmony_ci *
48308c2ecf20Sopenharmony_ci * Gets called from a workqueue.
48318c2ecf20Sopenharmony_ci */
48328c2ecf20Sopenharmony_cistatic void memcg_event_remove(struct work_struct *work)
48338c2ecf20Sopenharmony_ci{
48348c2ecf20Sopenharmony_ci	struct mem_cgroup_event *event =
48358c2ecf20Sopenharmony_ci		container_of(work, struct mem_cgroup_event, remove);
48368c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = event->memcg;
48378c2ecf20Sopenharmony_ci
48388c2ecf20Sopenharmony_ci	remove_wait_queue(event->wqh, &event->wait);
48398c2ecf20Sopenharmony_ci
48408c2ecf20Sopenharmony_ci	event->unregister_event(memcg, event->eventfd);
48418c2ecf20Sopenharmony_ci
48428c2ecf20Sopenharmony_ci	/* Notify userspace the event is going away. */
48438c2ecf20Sopenharmony_ci	eventfd_signal(event->eventfd, 1);
48448c2ecf20Sopenharmony_ci
48458c2ecf20Sopenharmony_ci	eventfd_ctx_put(event->eventfd);
48468c2ecf20Sopenharmony_ci	kfree(event);
48478c2ecf20Sopenharmony_ci	css_put(&memcg->css);
48488c2ecf20Sopenharmony_ci}
48498c2ecf20Sopenharmony_ci
48508c2ecf20Sopenharmony_ci/*
48518c2ecf20Sopenharmony_ci * Gets called on EPOLLHUP on the eventfd when the user closes it.
48528c2ecf20Sopenharmony_ci *
48538c2ecf20Sopenharmony_ci * Called with wqh->lock held and interrupts disabled.
48548c2ecf20Sopenharmony_ci */
48558c2ecf20Sopenharmony_cistatic int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
48568c2ecf20Sopenharmony_ci			    int sync, void *key)
48578c2ecf20Sopenharmony_ci{
48588c2ecf20Sopenharmony_ci	struct mem_cgroup_event *event =
48598c2ecf20Sopenharmony_ci		container_of(wait, struct mem_cgroup_event, wait);
48608c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = event->memcg;
48618c2ecf20Sopenharmony_ci	__poll_t flags = key_to_poll(key);
48628c2ecf20Sopenharmony_ci
48638c2ecf20Sopenharmony_ci	if (flags & EPOLLHUP) {
48648c2ecf20Sopenharmony_ci		/*
48658c2ecf20Sopenharmony_ci		 * If the event has been detached at cgroup removal, we
48668c2ecf20Sopenharmony_ci		 * can simply return knowing the other side will clean up
48678c2ecf20Sopenharmony_ci		 * for us.
48688c2ecf20Sopenharmony_ci		 *
48698c2ecf20Sopenharmony_ci		 * We can't race against event freeing since the other
48708c2ecf20Sopenharmony_ci		 * side will require wqh->lock via remove_wait_queue(),
48718c2ecf20Sopenharmony_ci		 * which we hold.
48728c2ecf20Sopenharmony_ci		 */
48738c2ecf20Sopenharmony_ci		spin_lock(&memcg->event_list_lock);
48748c2ecf20Sopenharmony_ci		if (!list_empty(&event->list)) {
48758c2ecf20Sopenharmony_ci			list_del_init(&event->list);
48768c2ecf20Sopenharmony_ci			/*
48778c2ecf20Sopenharmony_ci			 * We are in atomic context, but memcg_event_remove()
48788c2ecf20Sopenharmony_ci			 * may sleep, so we have to call it from a workqueue.
48798c2ecf20Sopenharmony_ci			 */
48808c2ecf20Sopenharmony_ci			schedule_work(&event->remove);
48818c2ecf20Sopenharmony_ci		}
48828c2ecf20Sopenharmony_ci		spin_unlock(&memcg->event_list_lock);
48838c2ecf20Sopenharmony_ci	}
48848c2ecf20Sopenharmony_ci
48858c2ecf20Sopenharmony_ci	return 0;
48868c2ecf20Sopenharmony_ci}
48878c2ecf20Sopenharmony_ci
48888c2ecf20Sopenharmony_cistatic void memcg_event_ptable_queue_proc(struct file *file,
48898c2ecf20Sopenharmony_ci		wait_queue_head_t *wqh, poll_table *pt)
48908c2ecf20Sopenharmony_ci{
48918c2ecf20Sopenharmony_ci	struct mem_cgroup_event *event =
48928c2ecf20Sopenharmony_ci		container_of(pt, struct mem_cgroup_event, pt);
48938c2ecf20Sopenharmony_ci
48948c2ecf20Sopenharmony_ci	event->wqh = wqh;
48958c2ecf20Sopenharmony_ci	add_wait_queue(wqh, &event->wait);
48968c2ecf20Sopenharmony_ci}
48978c2ecf20Sopenharmony_ci
48988c2ecf20Sopenharmony_ci/*
48998c2ecf20Sopenharmony_ci * DO NOT USE IN NEW FILES.
49008c2ecf20Sopenharmony_ci *
49018c2ecf20Sopenharmony_ci * Parse input and register new cgroup event handler.
49028c2ecf20Sopenharmony_ci *
49038c2ecf20Sopenharmony_ci * Input must be in the format '<event_fd> <control_fd> <args>'.
49048c2ecf20Sopenharmony_ci * Interpretation of args is defined by the control file implementation.
49058c2ecf20Sopenharmony_ci */
49068c2ecf20Sopenharmony_cistatic ssize_t memcg_write_event_control(struct kernfs_open_file *of,
49078c2ecf20Sopenharmony_ci					 char *buf, size_t nbytes, loff_t off)
49088c2ecf20Sopenharmony_ci{
49098c2ecf20Sopenharmony_ci	struct cgroup_subsys_state *css = of_css(of);
49108c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
49118c2ecf20Sopenharmony_ci	struct mem_cgroup_event *event;
49128c2ecf20Sopenharmony_ci	struct cgroup_subsys_state *cfile_css;
49138c2ecf20Sopenharmony_ci	unsigned int efd, cfd;
49148c2ecf20Sopenharmony_ci	struct fd efile;
49158c2ecf20Sopenharmony_ci	struct fd cfile;
49168c2ecf20Sopenharmony_ci	struct dentry *cdentry;
49178c2ecf20Sopenharmony_ci	const char *name;
49188c2ecf20Sopenharmony_ci	char *endp;
49198c2ecf20Sopenharmony_ci	int ret;
49208c2ecf20Sopenharmony_ci
49218c2ecf20Sopenharmony_ci	buf = strstrip(buf);
49228c2ecf20Sopenharmony_ci
49238c2ecf20Sopenharmony_ci	efd = simple_strtoul(buf, &endp, 10);
49248c2ecf20Sopenharmony_ci	if (*endp != ' ')
49258c2ecf20Sopenharmony_ci		return -EINVAL;
49268c2ecf20Sopenharmony_ci	buf = endp + 1;
49278c2ecf20Sopenharmony_ci
49288c2ecf20Sopenharmony_ci	cfd = simple_strtoul(buf, &endp, 10);
49298c2ecf20Sopenharmony_ci	if (*endp == '\0')
49308c2ecf20Sopenharmony_ci		buf = endp;
49318c2ecf20Sopenharmony_ci	else if (*endp == ' ')
49328c2ecf20Sopenharmony_ci		buf = endp + 1;
49338c2ecf20Sopenharmony_ci	else
49348c2ecf20Sopenharmony_ci		return -EINVAL;
49358c2ecf20Sopenharmony_ci
49368c2ecf20Sopenharmony_ci	event = kzalloc(sizeof(*event), GFP_KERNEL);
49378c2ecf20Sopenharmony_ci	if (!event)
49388c2ecf20Sopenharmony_ci		return -ENOMEM;
49398c2ecf20Sopenharmony_ci
49408c2ecf20Sopenharmony_ci	event->memcg = memcg;
49418c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&event->list);
49428c2ecf20Sopenharmony_ci	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
49438c2ecf20Sopenharmony_ci	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
49448c2ecf20Sopenharmony_ci	INIT_WORK(&event->remove, memcg_event_remove);
49458c2ecf20Sopenharmony_ci
49468c2ecf20Sopenharmony_ci	efile = fdget(efd);
49478c2ecf20Sopenharmony_ci	if (!efile.file) {
49488c2ecf20Sopenharmony_ci		ret = -EBADF;
49498c2ecf20Sopenharmony_ci		goto out_kfree;
49508c2ecf20Sopenharmony_ci	}
49518c2ecf20Sopenharmony_ci
49528c2ecf20Sopenharmony_ci	event->eventfd = eventfd_ctx_fileget(efile.file);
49538c2ecf20Sopenharmony_ci	if (IS_ERR(event->eventfd)) {
49548c2ecf20Sopenharmony_ci		ret = PTR_ERR(event->eventfd);
49558c2ecf20Sopenharmony_ci		goto out_put_efile;
49568c2ecf20Sopenharmony_ci	}
49578c2ecf20Sopenharmony_ci
49588c2ecf20Sopenharmony_ci	cfile = fdget(cfd);
49598c2ecf20Sopenharmony_ci	if (!cfile.file) {
49608c2ecf20Sopenharmony_ci		ret = -EBADF;
49618c2ecf20Sopenharmony_ci		goto out_put_eventfd;
49628c2ecf20Sopenharmony_ci	}
49638c2ecf20Sopenharmony_ci
49648c2ecf20Sopenharmony_ci	/* the process needs read permission on the control file */
49658c2ecf20Sopenharmony_ci	/* AV: shouldn't we check that it's been opened for read instead? */
49668c2ecf20Sopenharmony_ci	ret = inode_permission(file_inode(cfile.file), MAY_READ);
49678c2ecf20Sopenharmony_ci	if (ret < 0)
49688c2ecf20Sopenharmony_ci		goto out_put_cfile;
49698c2ecf20Sopenharmony_ci
49708c2ecf20Sopenharmony_ci	/*
49718c2ecf20Sopenharmony_ci	 * The control file must be a regular cgroup1 file. As a regular cgroup
49728c2ecf20Sopenharmony_ci	 * file can't be renamed, it's safe to access its name afterwards.
49738c2ecf20Sopenharmony_ci	 */
49748c2ecf20Sopenharmony_ci	cdentry = cfile.file->f_path.dentry;
49758c2ecf20Sopenharmony_ci	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
49768c2ecf20Sopenharmony_ci		ret = -EINVAL;
49778c2ecf20Sopenharmony_ci		goto out_put_cfile;
49788c2ecf20Sopenharmony_ci	}
49798c2ecf20Sopenharmony_ci
49808c2ecf20Sopenharmony_ci	/*
49818c2ecf20Sopenharmony_ci	 * Determine the event callbacks and set them in @event.  This used
49828c2ecf20Sopenharmony_ci	 * to be done via struct cftype but cgroup core no longer knows
49838c2ecf20Sopenharmony_ci	 * about these events.  The following is crude but the whole thing
49848c2ecf20Sopenharmony_ci	 * is for compatibility anyway.
49858c2ecf20Sopenharmony_ci	 *
49868c2ecf20Sopenharmony_ci	 * DO NOT ADD NEW FILES.
49878c2ecf20Sopenharmony_ci	 */
49888c2ecf20Sopenharmony_ci	name = cdentry->d_name.name;
49898c2ecf20Sopenharmony_ci
49908c2ecf20Sopenharmony_ci	if (!strcmp(name, "memory.usage_in_bytes")) {
49918c2ecf20Sopenharmony_ci		event->register_event = mem_cgroup_usage_register_event;
49928c2ecf20Sopenharmony_ci		event->unregister_event = mem_cgroup_usage_unregister_event;
49938c2ecf20Sopenharmony_ci	} else if (!strcmp(name, "memory.oom_control")) {
49948c2ecf20Sopenharmony_ci		event->register_event = mem_cgroup_oom_register_event;
49958c2ecf20Sopenharmony_ci		event->unregister_event = mem_cgroup_oom_unregister_event;
49968c2ecf20Sopenharmony_ci	} else if (!strcmp(name, "memory.pressure_level")) {
49978c2ecf20Sopenharmony_ci		event->register_event = vmpressure_register_event;
49988c2ecf20Sopenharmony_ci		event->unregister_event = vmpressure_unregister_event;
49998c2ecf20Sopenharmony_ci	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
50008c2ecf20Sopenharmony_ci		event->register_event = memsw_cgroup_usage_register_event;
50018c2ecf20Sopenharmony_ci		event->unregister_event = memsw_cgroup_usage_unregister_event;
50028c2ecf20Sopenharmony_ci	} else {
50038c2ecf20Sopenharmony_ci		ret = -EINVAL;
50048c2ecf20Sopenharmony_ci		goto out_put_cfile;
50058c2ecf20Sopenharmony_ci	}
50068c2ecf20Sopenharmony_ci
50078c2ecf20Sopenharmony_ci	/*
50088c2ecf20Sopenharmony_ci	 * Verify that @cfile belongs to @css.  Also, remaining events are
50098c2ecf20Sopenharmony_ci	 * automatically removed on cgroup destruction but the removal is
50108c2ecf20Sopenharmony_ci	 * asynchronous, so take an extra ref on @css.
50118c2ecf20Sopenharmony_ci	 */
50128c2ecf20Sopenharmony_ci	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
50138c2ecf20Sopenharmony_ci					       &memory_cgrp_subsys);
50148c2ecf20Sopenharmony_ci	ret = -EINVAL;
50158c2ecf20Sopenharmony_ci	if (IS_ERR(cfile_css))
50168c2ecf20Sopenharmony_ci		goto out_put_cfile;
50178c2ecf20Sopenharmony_ci	if (cfile_css != css) {
50188c2ecf20Sopenharmony_ci		css_put(cfile_css);
50198c2ecf20Sopenharmony_ci		goto out_put_cfile;
50208c2ecf20Sopenharmony_ci	}
50218c2ecf20Sopenharmony_ci
50228c2ecf20Sopenharmony_ci	ret = event->register_event(memcg, event->eventfd, buf);
50238c2ecf20Sopenharmony_ci	if (ret)
50248c2ecf20Sopenharmony_ci		goto out_put_css;
50258c2ecf20Sopenharmony_ci
50268c2ecf20Sopenharmony_ci	vfs_poll(efile.file, &event->pt);
50278c2ecf20Sopenharmony_ci
50288c2ecf20Sopenharmony_ci	spin_lock(&memcg->event_list_lock);
50298c2ecf20Sopenharmony_ci	list_add(&event->list, &memcg->event_list);
50308c2ecf20Sopenharmony_ci	spin_unlock(&memcg->event_list_lock);
50318c2ecf20Sopenharmony_ci
50328c2ecf20Sopenharmony_ci	fdput(cfile);
50338c2ecf20Sopenharmony_ci	fdput(efile);
50348c2ecf20Sopenharmony_ci
50358c2ecf20Sopenharmony_ci	return nbytes;
50368c2ecf20Sopenharmony_ci
50378c2ecf20Sopenharmony_ciout_put_css:
50388c2ecf20Sopenharmony_ci	css_put(css);
50398c2ecf20Sopenharmony_ciout_put_cfile:
50408c2ecf20Sopenharmony_ci	fdput(cfile);
50418c2ecf20Sopenharmony_ciout_put_eventfd:
50428c2ecf20Sopenharmony_ci	eventfd_ctx_put(event->eventfd);
50438c2ecf20Sopenharmony_ciout_put_efile:
50448c2ecf20Sopenharmony_ci	fdput(efile);
50458c2ecf20Sopenharmony_ciout_kfree:
50468c2ecf20Sopenharmony_ci	kfree(event);
50478c2ecf20Sopenharmony_ci
50488c2ecf20Sopenharmony_ci	return ret;
50498c2ecf20Sopenharmony_ci}
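/*
 * Sketch of how userspace drives the interface above (illustrative only:
 * the cgroup path, the 20M threshold and the lack of error handling are
 * made up for the example and are not part of the kernel ABI):
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/eventfd.h>
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ecfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
 *			O_WRONLY);
 *	uint64_t ticks;
 *
 *	dprintf(ecfd, "%d %d %llu", efd, cfd, 20971520ULL); // 20M threshold
 *	read(efd, &ticks, sizeof(ticks));   // blocks until 20M is crossed
 *
 * memory.oom_control and memory.pressure_level notifications are registered
 * the same way, against their respective control files and arguments.
 */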
50508c2ecf20Sopenharmony_ci
50518c2ecf20Sopenharmony_cistatic struct cftype mem_cgroup_legacy_files[] = {
50528c2ecf20Sopenharmony_ci	{
50538c2ecf20Sopenharmony_ci		.name = "usage_in_bytes",
50548c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
50558c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
50568c2ecf20Sopenharmony_ci	},
50578c2ecf20Sopenharmony_ci	{
50588c2ecf20Sopenharmony_ci		.name = "max_usage_in_bytes",
50598c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
50608c2ecf20Sopenharmony_ci		.write = mem_cgroup_reset,
50618c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
50628c2ecf20Sopenharmony_ci	},
50638c2ecf20Sopenharmony_ci	{
50648c2ecf20Sopenharmony_ci		.name = "limit_in_bytes",
50658c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
50668c2ecf20Sopenharmony_ci		.write = mem_cgroup_write,
50678c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
50688c2ecf20Sopenharmony_ci	},
50698c2ecf20Sopenharmony_ci	{
50708c2ecf20Sopenharmony_ci		.name = "soft_limit_in_bytes",
50718c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
50728c2ecf20Sopenharmony_ci		.write = mem_cgroup_write,
50738c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
50748c2ecf20Sopenharmony_ci	},
50758c2ecf20Sopenharmony_ci	{
50768c2ecf20Sopenharmony_ci		.name = "failcnt",
50778c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
50788c2ecf20Sopenharmony_ci		.write = mem_cgroup_reset,
50798c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
50808c2ecf20Sopenharmony_ci	},
50818c2ecf20Sopenharmony_ci	{
50828c2ecf20Sopenharmony_ci		.name = "stat",
50838c2ecf20Sopenharmony_ci		.seq_show = memcg_stat_show,
50848c2ecf20Sopenharmony_ci	},
50858c2ecf20Sopenharmony_ci	{
50868c2ecf20Sopenharmony_ci		.name = "force_empty",
50878c2ecf20Sopenharmony_ci		.write = mem_cgroup_force_empty_write,
50888c2ecf20Sopenharmony_ci	},
50898c2ecf20Sopenharmony_ci	{
50908c2ecf20Sopenharmony_ci		.name = "use_hierarchy",
50918c2ecf20Sopenharmony_ci		.write_u64 = mem_cgroup_hierarchy_write,
50928c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_hierarchy_read,
50938c2ecf20Sopenharmony_ci	},
50948c2ecf20Sopenharmony_ci	{
50958c2ecf20Sopenharmony_ci		.name = "cgroup.event_control",		/* XXX: for compat */
50968c2ecf20Sopenharmony_ci		.write = memcg_write_event_control,
50978c2ecf20Sopenharmony_ci		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
50988c2ecf20Sopenharmony_ci	},
50998c2ecf20Sopenharmony_ci	{
51008c2ecf20Sopenharmony_ci		.name = "swappiness",
51018c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_swappiness_read,
51028c2ecf20Sopenharmony_ci		.write_u64 = mem_cgroup_swappiness_write,
51038c2ecf20Sopenharmony_ci	},
51048c2ecf20Sopenharmony_ci	{
51058c2ecf20Sopenharmony_ci		.name = "move_charge_at_immigrate",
51068c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_move_charge_read,
51078c2ecf20Sopenharmony_ci		.write_u64 = mem_cgroup_move_charge_write,
51088c2ecf20Sopenharmony_ci	},
51098c2ecf20Sopenharmony_ci	{
51108c2ecf20Sopenharmony_ci		.name = "oom_control",
51118c2ecf20Sopenharmony_ci		.seq_show = mem_cgroup_oom_control_read,
51128c2ecf20Sopenharmony_ci		.write_u64 = mem_cgroup_oom_control_write,
51138c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
51148c2ecf20Sopenharmony_ci	},
51158c2ecf20Sopenharmony_ci	{
51168c2ecf20Sopenharmony_ci		.name = "pressure_level",
51178c2ecf20Sopenharmony_ci	},
51188c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA
51198c2ecf20Sopenharmony_ci	{
51208c2ecf20Sopenharmony_ci		.name = "numa_stat",
51218c2ecf20Sopenharmony_ci		.seq_show = memcg_numa_stat_show,
51228c2ecf20Sopenharmony_ci	},
51238c2ecf20Sopenharmony_ci#endif
51248c2ecf20Sopenharmony_ci	{
51258c2ecf20Sopenharmony_ci		.name = "kmem.limit_in_bytes",
51268c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
51278c2ecf20Sopenharmony_ci		.write = mem_cgroup_write,
51288c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
51298c2ecf20Sopenharmony_ci	},
51308c2ecf20Sopenharmony_ci	{
51318c2ecf20Sopenharmony_ci		.name = "kmem.usage_in_bytes",
51328c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
51338c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
51348c2ecf20Sopenharmony_ci	},
51358c2ecf20Sopenharmony_ci	{
51368c2ecf20Sopenharmony_ci		.name = "kmem.failcnt",
51378c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
51388c2ecf20Sopenharmony_ci		.write = mem_cgroup_reset,
51398c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
51408c2ecf20Sopenharmony_ci	},
51418c2ecf20Sopenharmony_ci	{
51428c2ecf20Sopenharmony_ci		.name = "kmem.max_usage_in_bytes",
51438c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
51448c2ecf20Sopenharmony_ci		.write = mem_cgroup_reset,
51458c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
51468c2ecf20Sopenharmony_ci	},
51478c2ecf20Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) && \
51488c2ecf20Sopenharmony_ci	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
51498c2ecf20Sopenharmony_ci	{
51508c2ecf20Sopenharmony_ci		.name = "kmem.slabinfo",
51518c2ecf20Sopenharmony_ci		.seq_show = memcg_slab_show,
51528c2ecf20Sopenharmony_ci	},
51538c2ecf20Sopenharmony_ci#endif
51548c2ecf20Sopenharmony_ci	{
51558c2ecf20Sopenharmony_ci		.name = "kmem.tcp.limit_in_bytes",
51568c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
51578c2ecf20Sopenharmony_ci		.write = mem_cgroup_write,
51588c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
51598c2ecf20Sopenharmony_ci	},
51608c2ecf20Sopenharmony_ci	{
51618c2ecf20Sopenharmony_ci		.name = "kmem.tcp.usage_in_bytes",
51628c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
51638c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
51648c2ecf20Sopenharmony_ci	},
51658c2ecf20Sopenharmony_ci	{
51668c2ecf20Sopenharmony_ci		.name = "kmem.tcp.failcnt",
51678c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
51688c2ecf20Sopenharmony_ci		.write = mem_cgroup_reset,
51698c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
51708c2ecf20Sopenharmony_ci	},
51718c2ecf20Sopenharmony_ci	{
51728c2ecf20Sopenharmony_ci		.name = "kmem.tcp.max_usage_in_bytes",
51738c2ecf20Sopenharmony_ci		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
51748c2ecf20Sopenharmony_ci		.write = mem_cgroup_reset,
51758c2ecf20Sopenharmony_ci		.read_u64 = mem_cgroup_read_u64,
51768c2ecf20Sopenharmony_ci	},
51778c2ecf20Sopenharmony_ci	{ },	/* terminate */
51788c2ecf20Sopenharmony_ci};
51798c2ecf20Sopenharmony_ci
51808c2ecf20Sopenharmony_ci/*
51818c2ecf20Sopenharmony_ci * Private memory cgroup IDR
51828c2ecf20Sopenharmony_ci *
51838c2ecf20Sopenharmony_ci * Swap-out records and page cache shadow entries need to store memcg
51848c2ecf20Sopenharmony_ci * references in constrained space, so we maintain an ID space that is
51858c2ecf20Sopenharmony_ci * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
51868c2ecf20Sopenharmony_ci * memory-controlled cgroups to 64k.
51878c2ecf20Sopenharmony_ci *
51888c2ecf20Sopenharmony_ci * However, there usually are many references to the offline CSS after
51898c2ecf20Sopenharmony_ci * the cgroup has been destroyed, such as page cache or reclaimable
51908c2ecf20Sopenharmony_ci * slab objects, that don't need to hang on to the ID. We want to keep
51918c2ecf20Sopenharmony_ci * those dead CSS from occupying IDs, or we might quickly exhaust the
51928c2ecf20Sopenharmony_ci * relatively small ID space and prevent the creation of new cgroups
51938c2ecf20Sopenharmony_ci * even when there are much fewer than 64k cgroups - possibly none.
51948c2ecf20Sopenharmony_ci *
51958c2ecf20Sopenharmony_ci * Maintain a private 16-bit ID space for memcg, and allow the ID to
51968c2ecf20Sopenharmony_ci * be freed and recycled when it's no longer needed, which is usually
51978c2ecf20Sopenharmony_ci * when the CSS is offlined.
51988c2ecf20Sopenharmony_ci *
51998c2ecf20Sopenharmony_ci * The only exception to that are records of swapped out tmpfs/shmem
52008c2ecf20Sopenharmony_ci * pages that need to be attributed to live ancestors on swapin. But
52018c2ecf20Sopenharmony_ci * those references are manageable from userspace.
52028c2ecf20Sopenharmony_ci */
52038c2ecf20Sopenharmony_ci
52048c2ecf20Sopenharmony_cistatic DEFINE_IDR(mem_cgroup_idr);
52058c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(memcg_idr_lock);
52068c2ecf20Sopenharmony_ci
52078c2ecf20Sopenharmony_cistatic int mem_cgroup_alloc_id(void)
52088c2ecf20Sopenharmony_ci{
52098c2ecf20Sopenharmony_ci	int ret;
52108c2ecf20Sopenharmony_ci
52118c2ecf20Sopenharmony_ci	idr_preload(GFP_KERNEL);
52128c2ecf20Sopenharmony_ci	spin_lock(&memcg_idr_lock);
52138c2ecf20Sopenharmony_ci	ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX,
52148c2ecf20Sopenharmony_ci			GFP_NOWAIT);
52158c2ecf20Sopenharmony_ci	spin_unlock(&memcg_idr_lock);
52168c2ecf20Sopenharmony_ci	idr_preload_end();
52178c2ecf20Sopenharmony_ci	return ret;
52188c2ecf20Sopenharmony_ci}
52198c2ecf20Sopenharmony_ci
52208c2ecf20Sopenharmony_cistatic void mem_cgroup_id_remove(struct mem_cgroup *memcg)
52218c2ecf20Sopenharmony_ci{
52228c2ecf20Sopenharmony_ci	if (memcg->id.id > 0) {
52238c2ecf20Sopenharmony_ci		spin_lock(&memcg_idr_lock);
52248c2ecf20Sopenharmony_ci		idr_remove(&mem_cgroup_idr, memcg->id.id);
52258c2ecf20Sopenharmony_ci		spin_unlock(&memcg_idr_lock);
52268c2ecf20Sopenharmony_ci
52278c2ecf20Sopenharmony_ci		memcg->id.id = 0;
52288c2ecf20Sopenharmony_ci	}
52298c2ecf20Sopenharmony_ci}
52308c2ecf20Sopenharmony_ci
52318c2ecf20Sopenharmony_cistatic void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
52328c2ecf20Sopenharmony_ci						  unsigned int n)
52338c2ecf20Sopenharmony_ci{
52348c2ecf20Sopenharmony_ci	refcount_add(n, &memcg->id.ref);
52358c2ecf20Sopenharmony_ci}
52368c2ecf20Sopenharmony_ci
52378c2ecf20Sopenharmony_cistatic void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
52388c2ecf20Sopenharmony_ci{
52398c2ecf20Sopenharmony_ci	if (refcount_sub_and_test(n, &memcg->id.ref)) {
52408c2ecf20Sopenharmony_ci		mem_cgroup_id_remove(memcg);
52418c2ecf20Sopenharmony_ci
52428c2ecf20Sopenharmony_ci		/* Memcg ID pins CSS */
52438c2ecf20Sopenharmony_ci		css_put(&memcg->css);
52448c2ecf20Sopenharmony_ci	}
52458c2ecf20Sopenharmony_ci}
52468c2ecf20Sopenharmony_ci
52478c2ecf20Sopenharmony_cistatic inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
52488c2ecf20Sopenharmony_ci{
52498c2ecf20Sopenharmony_ci	mem_cgroup_id_put_many(memcg, 1);
52508c2ecf20Sopenharmony_ci}
52518c2ecf20Sopenharmony_ci
52528c2ecf20Sopenharmony_ci/**
52538c2ecf20Sopenharmony_ci * mem_cgroup_from_id - look up a memcg from a memcg id
52548c2ecf20Sopenharmony_ci * @id: the memcg id to look up
52558c2ecf20Sopenharmony_ci *
52568c2ecf20Sopenharmony_ci * Caller must hold rcu_read_lock().
52578c2ecf20Sopenharmony_ci */
52588c2ecf20Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_id(unsigned short id)
52598c2ecf20Sopenharmony_ci{
52608c2ecf20Sopenharmony_ci	WARN_ON_ONCE(!rcu_read_lock_held());
52618c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU
52628c2ecf20Sopenharmony_ci	if (id == (unsigned short)-1)
52638c2ecf20Sopenharmony_ci		return NULL;
52648c2ecf20Sopenharmony_ci#endif
52658c2ecf20Sopenharmony_ci	return idr_find(&mem_cgroup_idr, id);
52668c2ecf20Sopenharmony_ci}
52678c2ecf20Sopenharmony_ci
52688c2ecf20Sopenharmony_cistatic int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
52698c2ecf20Sopenharmony_ci{
52708c2ecf20Sopenharmony_ci	struct mem_cgroup_per_node *pn;
52718c2ecf20Sopenharmony_ci	int tmp = node;
52728c2ecf20Sopenharmony_ci	/*
52738c2ecf20Sopenharmony_ci	 * This routine is called for each possible node, but it's a BUG
52748c2ecf20Sopenharmony_ci	 * to call kmalloc() against an offline node.
52758c2ecf20Sopenharmony_ci	 *
52768c2ecf20Sopenharmony_ci	 * TODO: this routine can waste a lot of memory for nodes which will
52778c2ecf20Sopenharmony_ci	 *       never be onlined. It would be better to use a memory hotplug
52788c2ecf20Sopenharmony_ci	 *       callback function.
52798c2ecf20Sopenharmony_ci	 */
52808c2ecf20Sopenharmony_ci	if (!node_state(node, N_NORMAL_MEMORY))
52818c2ecf20Sopenharmony_ci		tmp = -1;
52828c2ecf20Sopenharmony_ci	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
52838c2ecf20Sopenharmony_ci	if (!pn)
52848c2ecf20Sopenharmony_ci		return 1;
52858c2ecf20Sopenharmony_ci
52868c2ecf20Sopenharmony_ci	pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
52878c2ecf20Sopenharmony_ci						 GFP_KERNEL_ACCOUNT);
52888c2ecf20Sopenharmony_ci	if (!pn->lruvec_stat_local) {
52898c2ecf20Sopenharmony_ci		kfree(pn);
52908c2ecf20Sopenharmony_ci		return 1;
52918c2ecf20Sopenharmony_ci	}
52928c2ecf20Sopenharmony_ci
52938c2ecf20Sopenharmony_ci	pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
52948c2ecf20Sopenharmony_ci					       GFP_KERNEL_ACCOUNT);
52958c2ecf20Sopenharmony_ci	if (!pn->lruvec_stat_cpu) {
52968c2ecf20Sopenharmony_ci		free_percpu(pn->lruvec_stat_local);
52978c2ecf20Sopenharmony_ci		kfree(pn);
52988c2ecf20Sopenharmony_ci		return 1;
52998c2ecf20Sopenharmony_ci	}
53008c2ecf20Sopenharmony_ci
53018c2ecf20Sopenharmony_ci	lruvec_init(&pn->lruvec);
53028c2ecf20Sopenharmony_ci	pn->usage_in_excess = 0;
53038c2ecf20Sopenharmony_ci	pn->lruvec.pgdat = NODE_DATA(node);
53048c2ecf20Sopenharmony_ci	pn->on_tree = false;
53058c2ecf20Sopenharmony_ci	pn->memcg = memcg;
53068c2ecf20Sopenharmony_ci
53078c2ecf20Sopenharmony_ci	memcg->nodeinfo[node] = pn;
53088c2ecf20Sopenharmony_ci	return 0;
53098c2ecf20Sopenharmony_ci}
53108c2ecf20Sopenharmony_ci
53118c2ecf20Sopenharmony_cistatic void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
53128c2ecf20Sopenharmony_ci{
53138c2ecf20Sopenharmony_ci	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
53148c2ecf20Sopenharmony_ci
53158c2ecf20Sopenharmony_ci	if (!pn)
53168c2ecf20Sopenharmony_ci		return;
53178c2ecf20Sopenharmony_ci
53188c2ecf20Sopenharmony_ci	free_percpu(pn->lruvec_stat_cpu);
53198c2ecf20Sopenharmony_ci	free_percpu(pn->lruvec_stat_local);
53208c2ecf20Sopenharmony_ci	kfree(pn);
53218c2ecf20Sopenharmony_ci}
53228c2ecf20Sopenharmony_ci
53238c2ecf20Sopenharmony_cistatic void __mem_cgroup_free(struct mem_cgroup *memcg)
53248c2ecf20Sopenharmony_ci{
53258c2ecf20Sopenharmony_ci	int node;
53268c2ecf20Sopenharmony_ci
53278c2ecf20Sopenharmony_ci	for_each_node(node)
53288c2ecf20Sopenharmony_ci		free_mem_cgroup_per_node_info(memcg, node);
53298c2ecf20Sopenharmony_ci	free_percpu(memcg->vmstats_percpu);
53308c2ecf20Sopenharmony_ci	free_percpu(memcg->vmstats_local);
53318c2ecf20Sopenharmony_ci	kfree(memcg);
53328c2ecf20Sopenharmony_ci}
53338c2ecf20Sopenharmony_ci
53348c2ecf20Sopenharmony_cistatic void mem_cgroup_free(struct mem_cgroup *memcg)
53358c2ecf20Sopenharmony_ci{
53368c2ecf20Sopenharmony_ci	memcg_wb_domain_exit(memcg);
53378c2ecf20Sopenharmony_ci	/*
53388c2ecf20Sopenharmony_ci	 * Flush percpu vmstats and vmevents to guarantee correct values
53398c2ecf20Sopenharmony_ci	 * at the parent and all ancestor levels.
53408c2ecf20Sopenharmony_ci	 */
53418c2ecf20Sopenharmony_ci	memcg_flush_percpu_vmstats(memcg);
53428c2ecf20Sopenharmony_ci	memcg_flush_percpu_vmevents(memcg);
53438c2ecf20Sopenharmony_ci	__mem_cgroup_free(memcg);
53448c2ecf20Sopenharmony_ci}
53458c2ecf20Sopenharmony_ci
53468c2ecf20Sopenharmony_cistatic struct mem_cgroup *mem_cgroup_alloc(void)
53478c2ecf20Sopenharmony_ci{
53488c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
53498c2ecf20Sopenharmony_ci	unsigned int size;
53508c2ecf20Sopenharmony_ci	int node;
53518c2ecf20Sopenharmony_ci	int __maybe_unused i;
53528c2ecf20Sopenharmony_ci	long error = -ENOMEM;
53538c2ecf20Sopenharmony_ci
53548c2ecf20Sopenharmony_ci	size = sizeof(struct mem_cgroup);
53558c2ecf20Sopenharmony_ci	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
53568c2ecf20Sopenharmony_ci
53578c2ecf20Sopenharmony_ci	memcg = kzalloc(size, GFP_KERNEL);
53588c2ecf20Sopenharmony_ci	if (!memcg)
53598c2ecf20Sopenharmony_ci		return ERR_PTR(error);
53608c2ecf20Sopenharmony_ci
53618c2ecf20Sopenharmony_ci	memcg->id.id = mem_cgroup_alloc_id();
53628c2ecf20Sopenharmony_ci	if (memcg->id.id < 0) {
53638c2ecf20Sopenharmony_ci		error = memcg->id.id;
53648c2ecf20Sopenharmony_ci		goto fail;
53658c2ecf20Sopenharmony_ci	}
53668c2ecf20Sopenharmony_ci
53678c2ecf20Sopenharmony_ci	memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
53688c2ecf20Sopenharmony_ci						GFP_KERNEL_ACCOUNT);
53698c2ecf20Sopenharmony_ci	if (!memcg->vmstats_local)
53708c2ecf20Sopenharmony_ci		goto fail;
53718c2ecf20Sopenharmony_ci
53728c2ecf20Sopenharmony_ci	memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
53738c2ecf20Sopenharmony_ci						 GFP_KERNEL_ACCOUNT);
53748c2ecf20Sopenharmony_ci	if (!memcg->vmstats_percpu)
53758c2ecf20Sopenharmony_ci		goto fail;
53768c2ecf20Sopenharmony_ci
53778c2ecf20Sopenharmony_ci	for_each_node(node)
53788c2ecf20Sopenharmony_ci		if (alloc_mem_cgroup_per_node_info(memcg, node))
53798c2ecf20Sopenharmony_ci			goto fail;
53808c2ecf20Sopenharmony_ci
53818c2ecf20Sopenharmony_ci	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
53828c2ecf20Sopenharmony_ci		goto fail;
53838c2ecf20Sopenharmony_ci
53848c2ecf20Sopenharmony_ci	INIT_WORK(&memcg->high_work, high_work_func);
53858c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&memcg->oom_notify);
53868c2ecf20Sopenharmony_ci	mutex_init(&memcg->thresholds_lock);
53878c2ecf20Sopenharmony_ci	spin_lock_init(&memcg->move_lock);
53888c2ecf20Sopenharmony_ci	vmpressure_init(&memcg->vmpressure);
53898c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&memcg->event_list);
53908c2ecf20Sopenharmony_ci	spin_lock_init(&memcg->event_list_lock);
53918c2ecf20Sopenharmony_ci	memcg->socket_pressure = jiffies;
53928c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
53938c2ecf20Sopenharmony_ci	memcg->kmemcg_id = -1;
53948c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&memcg->objcg_list);
53958c2ecf20Sopenharmony_ci#endif
53968c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK
53978c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&memcg->cgwb_list);
53988c2ecf20Sopenharmony_ci	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
53998c2ecf20Sopenharmony_ci		memcg->cgwb_frn[i].done =
54008c2ecf20Sopenharmony_ci			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
54018c2ecf20Sopenharmony_ci#endif
54028c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
54038c2ecf20Sopenharmony_ci	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
54048c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
54058c2ecf20Sopenharmony_ci	memcg->deferred_split_queue.split_queue_len = 0;
54068c2ecf20Sopenharmony_ci#endif
54078c2ecf20Sopenharmony_ci
54088c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
54098c2ecf20Sopenharmony_ci	if (unlikely(!score_head_inited)) {
54108c2ecf20Sopenharmony_ci		INIT_LIST_HEAD(&score_head);
54118c2ecf20Sopenharmony_ci		score_head_inited = true;
54128c2ecf20Sopenharmony_ci	}
54138c2ecf20Sopenharmony_ci#endif
54148c2ecf20Sopenharmony_ci
54158c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
54168c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&memcg->score_node);
54178c2ecf20Sopenharmony_ci#endif
54188c2ecf20Sopenharmony_ci	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
54198c2ecf20Sopenharmony_ci	return memcg;
54208c2ecf20Sopenharmony_cifail:
54218c2ecf20Sopenharmony_ci	mem_cgroup_id_remove(memcg);
54228c2ecf20Sopenharmony_ci	__mem_cgroup_free(memcg);
54238c2ecf20Sopenharmony_ci	return ERR_PTR(error);
54248c2ecf20Sopenharmony_ci}
54258c2ecf20Sopenharmony_ci
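/*
 * .css_alloc callback: allocate a memcg for @parent_css's new child, inherit
 * swappiness and the OOM-kill setting from the parent, initialize its page
 * counters against the parent (or root, depending on use_hierarchy), and
 * bring kmem accounting online for non-root groups.
 */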
54268c2ecf20Sopenharmony_cistatic struct cgroup_subsys_state * __ref
54278c2ecf20Sopenharmony_cimem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
54288c2ecf20Sopenharmony_ci{
54298c2ecf20Sopenharmony_ci	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
54308c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg, *old_memcg;
54318c2ecf20Sopenharmony_ci	long error = -ENOMEM;
54328c2ecf20Sopenharmony_ci
54338c2ecf20Sopenharmony_ci	old_memcg = set_active_memcg(parent);
54348c2ecf20Sopenharmony_ci	memcg = mem_cgroup_alloc();
54358c2ecf20Sopenharmony_ci	set_active_memcg(old_memcg);
54368c2ecf20Sopenharmony_ci	if (IS_ERR(memcg))
54378c2ecf20Sopenharmony_ci		return ERR_CAST(memcg);
54388c2ecf20Sopenharmony_ci
54398c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
54408c2ecf20Sopenharmony_ci	atomic64_set(&memcg->memcg_reclaimed.app_score, 300);
54418c2ecf20Sopenharmony_ci#endif
54428c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_ZSWAPD
54438c2ecf20Sopenharmony_ci	atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10);
54448c2ecf20Sopenharmony_ci	atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60);
54458c2ecf20Sopenharmony_ci	atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50);
54468c2ecf20Sopenharmony_ci#endif
54478c2ecf20Sopenharmony_ci	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
54488c2ecf20Sopenharmony_ci	memcg->soft_limit = PAGE_COUNTER_MAX;
54498c2ecf20Sopenharmony_ci	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
54508c2ecf20Sopenharmony_ci	if (parent) {
54518c2ecf20Sopenharmony_ci		memcg->swappiness = mem_cgroup_swappiness(parent);
54528c2ecf20Sopenharmony_ci		memcg->oom_kill_disable = parent->oom_kill_disable;
54538c2ecf20Sopenharmony_ci	}
54548c2ecf20Sopenharmony_ci	if (!parent) {
54558c2ecf20Sopenharmony_ci		page_counter_init(&memcg->memory, NULL);
54568c2ecf20Sopenharmony_ci		page_counter_init(&memcg->swap, NULL);
54578c2ecf20Sopenharmony_ci		page_counter_init(&memcg->kmem, NULL);
54588c2ecf20Sopenharmony_ci		page_counter_init(&memcg->tcpmem, NULL);
54598c2ecf20Sopenharmony_ci	} else if (parent->use_hierarchy) {
54608c2ecf20Sopenharmony_ci		memcg->use_hierarchy = true;
54618c2ecf20Sopenharmony_ci		page_counter_init(&memcg->memory, &parent->memory);
54628c2ecf20Sopenharmony_ci		page_counter_init(&memcg->swap, &parent->swap);
54638c2ecf20Sopenharmony_ci		page_counter_init(&memcg->kmem, &parent->kmem);
54648c2ecf20Sopenharmony_ci		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
54658c2ecf20Sopenharmony_ci	} else {
54668c2ecf20Sopenharmony_ci		page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
54678c2ecf20Sopenharmony_ci		page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
54688c2ecf20Sopenharmony_ci		page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
54698c2ecf20Sopenharmony_ci		page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
54708c2ecf20Sopenharmony_ci		/*
54718c2ecf20Sopenharmony_ci		 * A deeper hierarchy with use_hierarchy == false doesn't make
54728c2ecf20Sopenharmony_ci		 * much sense, so let the cgroup subsystem know about this
54738c2ecf20Sopenharmony_ci		 * unfortunate state in our controller.
54748c2ecf20Sopenharmony_ci		 */
54758c2ecf20Sopenharmony_ci		if (parent != root_mem_cgroup)
54768c2ecf20Sopenharmony_ci			memory_cgrp_subsys.broken_hierarchy = true;
54778c2ecf20Sopenharmony_ci	}
54788c2ecf20Sopenharmony_ci
54798c2ecf20Sopenharmony_ci	/* The following stuff does not apply to the root */
54808c2ecf20Sopenharmony_ci	if (!parent) {
54818c2ecf20Sopenharmony_ci		root_mem_cgroup = memcg;
54828c2ecf20Sopenharmony_ci		return &memcg->css;
54838c2ecf20Sopenharmony_ci	}
54848c2ecf20Sopenharmony_ci
54858c2ecf20Sopenharmony_ci	error = memcg_online_kmem(memcg);
54868c2ecf20Sopenharmony_ci	if (error)
54878c2ecf20Sopenharmony_ci		goto fail;
54888c2ecf20Sopenharmony_ci
54898c2ecf20Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
54908c2ecf20Sopenharmony_ci		static_branch_inc(&memcg_sockets_enabled_key);
54918c2ecf20Sopenharmony_ci
54928c2ecf20Sopenharmony_ci	return &memcg->css;
54938c2ecf20Sopenharmony_cifail:
54948c2ecf20Sopenharmony_ci	mem_cgroup_id_remove(memcg);
54958c2ecf20Sopenharmony_ci	mem_cgroup_free(memcg);
54968c2ecf20Sopenharmony_ci	return ERR_PTR(error);
54978c2ecf20Sopenharmony_ci}
54988c2ecf20Sopenharmony_ci
54998c2ecf20Sopenharmony_cistatic int mem_cgroup_css_online(struct cgroup_subsys_state *css)
55008c2ecf20Sopenharmony_ci{
55018c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
55028c2ecf20Sopenharmony_ci
55038c2ecf20Sopenharmony_ci	/*
55048c2ecf20Sopenharmony_ci	 * A memcg must be visible to memcg_expand_shrinker_maps()
55058c2ecf20Sopenharmony_ci	 * by the time the maps are allocated, so allocate the maps
55068c2ecf20Sopenharmony_ci	 * here, where for_each_mem_cgroup() can no longer skip it.
55078c2ecf20Sopenharmony_ci	 */
55088c2ecf20Sopenharmony_ci	if (memcg_alloc_shrinker_maps(memcg)) {
55098c2ecf20Sopenharmony_ci		mem_cgroup_id_remove(memcg);
55108c2ecf20Sopenharmony_ci		return -ENOMEM;
55118c2ecf20Sopenharmony_ci	}
55128c2ecf20Sopenharmony_ci
55138c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
55148c2ecf20Sopenharmony_ci	memcg_app_score_update(memcg);
55158c2ecf20Sopenharmony_ci	css_get(css);
55168c2ecf20Sopenharmony_ci#endif
55178c2ecf20Sopenharmony_ci
55188c2ecf20Sopenharmony_ci	/* Online state pins memcg ID, memcg ID pins CSS */
55198c2ecf20Sopenharmony_ci	refcount_set(&memcg->id.ref, 1);
55208c2ecf20Sopenharmony_ci	css_get(css);
55218c2ecf20Sopenharmony_ci	return 0;
55228c2ecf20Sopenharmony_ci}
55238c2ecf20Sopenharmony_ci
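/*
 * .css_offline callback: drop the hyperhold score-list linkage (if enabled),
 * unregister pending cgroup1 event notifications, clear memory protections,
 * take kmem and writeback offline, drain the percpu charge stocks, and drop
 * the ID reference that pins the memcg while it is online.
 */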
55248c2ecf20Sopenharmony_cistatic void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
55258c2ecf20Sopenharmony_ci{
55268c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
55278c2ecf20Sopenharmony_ci	struct mem_cgroup_event *event, *tmp;
55288c2ecf20Sopenharmony_ci
55298c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG
55308c2ecf20Sopenharmony_ci	unsigned long flags;
55318c2ecf20Sopenharmony_ci
55328c2ecf20Sopenharmony_ci	write_lock_irqsave(&score_list_lock, flags);
55338c2ecf20Sopenharmony_ci	list_del_init(&memcg->score_node);
55348c2ecf20Sopenharmony_ci	write_unlock_irqrestore(&score_list_lock, flags);
55358c2ecf20Sopenharmony_ci	css_put(css);
55368c2ecf20Sopenharmony_ci#endif
55378c2ecf20Sopenharmony_ci
55388c2ecf20Sopenharmony_ci	/*
55398c2ecf20Sopenharmony_ci	 * Unregister events and notify userspace.
55408c2ecf20Sopenharmony_ci	 * Notify userspace about cgroup removal only after rmdir of the cgroup
55418c2ecf20Sopenharmony_ci	 * directory to avoid a race between userspace and kernelspace.
55428c2ecf20Sopenharmony_ci	 */
55438c2ecf20Sopenharmony_ci	spin_lock(&memcg->event_list_lock);
55448c2ecf20Sopenharmony_ci	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
55458c2ecf20Sopenharmony_ci		list_del_init(&event->list);
55468c2ecf20Sopenharmony_ci		schedule_work(&event->remove);
55478c2ecf20Sopenharmony_ci	}
55488c2ecf20Sopenharmony_ci	spin_unlock(&memcg->event_list_lock);
55498c2ecf20Sopenharmony_ci
55508c2ecf20Sopenharmony_ci	page_counter_set_min(&memcg->memory, 0);
55518c2ecf20Sopenharmony_ci	page_counter_set_low(&memcg->memory, 0);
55528c2ecf20Sopenharmony_ci
55538c2ecf20Sopenharmony_ci	memcg_offline_kmem(memcg);
55548c2ecf20Sopenharmony_ci	wb_memcg_offline(memcg);
55558c2ecf20Sopenharmony_ci
55568c2ecf20Sopenharmony_ci	drain_all_stock(memcg);
55578c2ecf20Sopenharmony_ci
55588c2ecf20Sopenharmony_ci	mem_cgroup_id_put(memcg);
55598c2ecf20Sopenharmony_ci}
55608c2ecf20Sopenharmony_ci
55618c2ecf20Sopenharmony_cistatic void mem_cgroup_css_released(struct cgroup_subsys_state *css)
55628c2ecf20Sopenharmony_ci{
55638c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
55648c2ecf20Sopenharmony_ci
55658c2ecf20Sopenharmony_ci	invalidate_reclaim_iterators(memcg);
55668c2ecf20Sopenharmony_ci}
55678c2ecf20Sopenharmony_ci
55688c2ecf20Sopenharmony_cistatic void mem_cgroup_css_free(struct cgroup_subsys_state *css)
55698c2ecf20Sopenharmony_ci{
55708c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
55718c2ecf20Sopenharmony_ci	int __maybe_unused i;
55728c2ecf20Sopenharmony_ci
55738c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK
55748c2ecf20Sopenharmony_ci	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
55758c2ecf20Sopenharmony_ci		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
55768c2ecf20Sopenharmony_ci#endif
55778c2ecf20Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
55788c2ecf20Sopenharmony_ci		static_branch_dec(&memcg_sockets_enabled_key);
55798c2ecf20Sopenharmony_ci
55808c2ecf20Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
55818c2ecf20Sopenharmony_ci		static_branch_dec(&memcg_sockets_enabled_key);
55828c2ecf20Sopenharmony_ci
55838c2ecf20Sopenharmony_ci	vmpressure_cleanup(&memcg->vmpressure);
55848c2ecf20Sopenharmony_ci	cancel_work_sync(&memcg->high_work);
55858c2ecf20Sopenharmony_ci	mem_cgroup_remove_from_trees(memcg);
55868c2ecf20Sopenharmony_ci	memcg_free_shrinker_maps(memcg);
55878c2ecf20Sopenharmony_ci	memcg_free_kmem(memcg);
55888c2ecf20Sopenharmony_ci	mem_cgroup_free(memcg);
55898c2ecf20Sopenharmony_ci}
55908c2ecf20Sopenharmony_ci
55918c2ecf20Sopenharmony_ci/**
55928c2ecf20Sopenharmony_ci * mem_cgroup_css_reset - reset the states of a mem_cgroup
55938c2ecf20Sopenharmony_ci * @css: the target css
55948c2ecf20Sopenharmony_ci *
55958c2ecf20Sopenharmony_ci * Reset the states of the mem_cgroup associated with @css.  This is
55968c2ecf20Sopenharmony_ci * invoked when the userland requests disabling on the default hierarchy
55978c2ecf20Sopenharmony_ci * but the memcg is pinned through dependency.  The memcg should stop
55988c2ecf20Sopenharmony_ci * applying policies and should revert to the vanilla state as it may be
55998c2ecf20Sopenharmony_ci * made visible again.
56008c2ecf20Sopenharmony_ci *
56018c2ecf20Sopenharmony_ci * The current implementation only resets the essential configurations.
56028c2ecf20Sopenharmony_ci * This needs to be expanded to cover all the visible parts.
56038c2ecf20Sopenharmony_ci */
56048c2ecf20Sopenharmony_cistatic void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
56058c2ecf20Sopenharmony_ci{
56068c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
56078c2ecf20Sopenharmony_ci
56088c2ecf20Sopenharmony_ci	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
56098c2ecf20Sopenharmony_ci	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
56108c2ecf20Sopenharmony_ci	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
56118c2ecf20Sopenharmony_ci	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
56128c2ecf20Sopenharmony_ci	page_counter_set_min(&memcg->memory, 0);
56138c2ecf20Sopenharmony_ci	page_counter_set_low(&memcg->memory, 0);
56148c2ecf20Sopenharmony_ci	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
56158c2ecf20Sopenharmony_ci	memcg->soft_limit = PAGE_COUNTER_MAX;
56168c2ecf20Sopenharmony_ci	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
56178c2ecf20Sopenharmony_ci	memcg_wb_domain_size_changed(memcg);
56188c2ecf20Sopenharmony_ci}
56198c2ecf20Sopenharmony_ci
56208c2ecf20Sopenharmony_ci#ifdef CONFIG_MMU
56218c2ecf20Sopenharmony_ci/* Handlers for move charge at task migration. */
56228c2ecf20Sopenharmony_cistatic int mem_cgroup_do_precharge(unsigned long count)
56238c2ecf20Sopenharmony_ci{
56248c2ecf20Sopenharmony_ci	int ret;
56258c2ecf20Sopenharmony_ci
56268c2ecf20Sopenharmony_ci	/* Try a single bulk charge without reclaim first, kswapd may wake */
56278c2ecf20Sopenharmony_ci	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
56288c2ecf20Sopenharmony_ci	if (!ret) {
56298c2ecf20Sopenharmony_ci		mc.precharge += count;
56308c2ecf20Sopenharmony_ci		return ret;
56318c2ecf20Sopenharmony_ci	}
56328c2ecf20Sopenharmony_ci
56338c2ecf20Sopenharmony_ci	/* Try charges one by one with reclaim, but do not retry */
56348c2ecf20Sopenharmony_ci	while (count--) {
56358c2ecf20Sopenharmony_ci		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
56368c2ecf20Sopenharmony_ci		if (ret)
56378c2ecf20Sopenharmony_ci			return ret;
56388c2ecf20Sopenharmony_ci		mc.precharge++;
56398c2ecf20Sopenharmony_ci		cond_resched();
56408c2ecf20Sopenharmony_ci	}
56418c2ecf20Sopenharmony_ci	return 0;
56428c2ecf20Sopenharmony_ci}
56438c2ecf20Sopenharmony_ci
56448c2ecf20Sopenharmony_ciunion mc_target {
56458c2ecf20Sopenharmony_ci	struct page	*page;
56468c2ecf20Sopenharmony_ci	swp_entry_t	ent;
56478c2ecf20Sopenharmony_ci};
56488c2ecf20Sopenharmony_ci
56498c2ecf20Sopenharmony_cienum mc_target_type {
56508c2ecf20Sopenharmony_ci	MC_TARGET_NONE = 0,
56518c2ecf20Sopenharmony_ci	MC_TARGET_PAGE,
56528c2ecf20Sopenharmony_ci	MC_TARGET_SWAP,
56538c2ecf20Sopenharmony_ci	MC_TARGET_DEVICE,
56548c2ecf20Sopenharmony_ci};
56558c2ecf20Sopenharmony_ci
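/*
 * For a present pte, return the mapped page with an extra reference if it is
 * eligible for charge moving according to mc.flags (MOVE_ANON for anonymous
 * pages, MOVE_FILE for file pages), or NULL otherwise.
 */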
56568c2ecf20Sopenharmony_cistatic struct page *mc_handle_present_pte(struct vm_area_struct *vma,
56578c2ecf20Sopenharmony_ci						unsigned long addr, pte_t ptent)
56588c2ecf20Sopenharmony_ci{
56598c2ecf20Sopenharmony_ci	struct page *page = vm_normal_page(vma, addr, ptent);
56608c2ecf20Sopenharmony_ci
56618c2ecf20Sopenharmony_ci	if (!page || !page_mapped(page))
56628c2ecf20Sopenharmony_ci		return NULL;
56638c2ecf20Sopenharmony_ci	if (PageAnon(page)) {
56648c2ecf20Sopenharmony_ci		if (!(mc.flags & MOVE_ANON))
56658c2ecf20Sopenharmony_ci			return NULL;
56668c2ecf20Sopenharmony_ci	} else {
56678c2ecf20Sopenharmony_ci		if (!(mc.flags & MOVE_FILE))
56688c2ecf20Sopenharmony_ci			return NULL;
56698c2ecf20Sopenharmony_ci	}
56708c2ecf20Sopenharmony_ci	if (!get_page_unless_zero(page))
56718c2ecf20Sopenharmony_ci		return NULL;
56728c2ecf20Sopenharmony_ci
56738c2ecf20Sopenharmony_ci	return page;
56748c2ecf20Sopenharmony_ci}
56758c2ecf20Sopenharmony_ci
56768c2ecf20Sopenharmony_ci#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
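/*
 * For a swap pte, return the backing page when MOVE_ANON is requested:
 * either a device-private page (with an extra reference) or the page cached
 * in the swap address space. For ordinary swap entries the entry itself is
 * also recorded in @entry so the caller can move the swap charge.
 */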
56778c2ecf20Sopenharmony_cistatic struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
56788c2ecf20Sopenharmony_ci			pte_t ptent, swp_entry_t *entry)
56798c2ecf20Sopenharmony_ci{
56808c2ecf20Sopenharmony_ci	struct page *page = NULL;
56818c2ecf20Sopenharmony_ci	swp_entry_t ent = pte_to_swp_entry(ptent);
56828c2ecf20Sopenharmony_ci
56838c2ecf20Sopenharmony_ci	if (!(mc.flags & MOVE_ANON))
56848c2ecf20Sopenharmony_ci		return NULL;
56858c2ecf20Sopenharmony_ci
56868c2ecf20Sopenharmony_ci	/*
56878c2ecf20Sopenharmony_ci	 * Handle MEMORY_DEVICE_PRIVATE, i.e. ZONE_DEVICE pages belonging to
56888c2ecf20Sopenharmony_ci	 * a device; because they are not accessible by the CPU, they are
56898c2ecf20Sopenharmony_ci	 * stored as special swap entries in the CPU page table.
56908c2ecf20Sopenharmony_ci	 */
56918c2ecf20Sopenharmony_ci	if (is_device_private_entry(ent)) {
56928c2ecf20Sopenharmony_ci		page = device_private_entry_to_page(ent);
56938c2ecf20Sopenharmony_ci		/*
56948c2ecf20Sopenharmony_ci		 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
56958c2ecf20Sopenharmony_ci		 * a refcount of 1 when free (unlike a normal page).
56968c2ecf20Sopenharmony_ci		 */
56978c2ecf20Sopenharmony_ci		if (!page_ref_add_unless(page, 1, 1))
56988c2ecf20Sopenharmony_ci			return NULL;
56998c2ecf20Sopenharmony_ci		return page;
57008c2ecf20Sopenharmony_ci	}
57018c2ecf20Sopenharmony_ci
57028c2ecf20Sopenharmony_ci	if (non_swap_entry(ent))
57038c2ecf20Sopenharmony_ci		return NULL;
57048c2ecf20Sopenharmony_ci
57058c2ecf20Sopenharmony_ci	/*
57068c2ecf20Sopenharmony_ci	 * Because lookup_swap_cache() updates some statistics counters,
57078c2ecf20Sopenharmony_ci	 * we call find_get_page() on the swap address space directly.
57088c2ecf20Sopenharmony_ci	 */
57098c2ecf20Sopenharmony_ci	page = find_get_page(swap_address_space(ent), swp_offset(ent));
57108c2ecf20Sopenharmony_ci	entry->val = ent.val;
57118c2ecf20Sopenharmony_ci
57128c2ecf20Sopenharmony_ci	return page;
57138c2ecf20Sopenharmony_ci}
57148c2ecf20Sopenharmony_ci#else
57158c2ecf20Sopenharmony_cistatic struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
57168c2ecf20Sopenharmony_ci			pte_t ptent, swp_entry_t *entry)
57178c2ecf20Sopenharmony_ci{
57188c2ecf20Sopenharmony_ci	return NULL;
57198c2ecf20Sopenharmony_ci}
57208c2ecf20Sopenharmony_ci#endif
57218c2ecf20Sopenharmony_ci
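
/*
 * For a pte_none pte in a file-backed vma, look up the page cache (including
 * shmem pages that are out on swap) at the offset corresponding to @addr and
 * return the page with a reference, provided MOVE_FILE is requested.
 */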
57228c2ecf20Sopenharmony_cistatic struct page *mc_handle_file_pte(struct vm_area_struct *vma,
57238c2ecf20Sopenharmony_ci			unsigned long addr, pte_t ptent, swp_entry_t *entry)
57248c2ecf20Sopenharmony_ci{
57258c2ecf20Sopenharmony_ci	if (!vma->vm_file) /* anonymous vma */
57268c2ecf20Sopenharmony_ci		return NULL;
57278c2ecf20Sopenharmony_ci	if (!(mc.flags & MOVE_FILE))
57288c2ecf20Sopenharmony_ci		return NULL;
57298c2ecf20Sopenharmony_ci
57308c2ecf20Sopenharmony_ci	/* The page is moved even if it's not RSS of this task (page-faulted). */
57318c2ecf20Sopenharmony_ci	/* shmem/tmpfs may report a page out on swap: account for that too. */
57328c2ecf20Sopenharmony_ci	return find_get_incore_page(vma->vm_file->f_mapping,
57338c2ecf20Sopenharmony_ci			linear_page_index(vma, addr));
57348c2ecf20Sopenharmony_ci}
57358c2ecf20Sopenharmony_ci
57368c2ecf20Sopenharmony_ci/**
57378c2ecf20Sopenharmony_ci * mem_cgroup_move_account - move account of the page
57388c2ecf20Sopenharmony_ci * @page: the page
57398c2ecf20Sopenharmony_ci * @compound: charge the page as compound or small page
57408c2ecf20Sopenharmony_ci * @from: mem_cgroup which the page is moved from.
57418c2ecf20Sopenharmony_ci * @to:	mem_cgroup which the page is moved to. @from != @to.
57428c2ecf20Sopenharmony_ci *
57438c2ecf20Sopenharmony_ci * The caller must make sure the page is not on the LRU (isolate_lru_page() is useful).
57448c2ecf20Sopenharmony_ci *
57458c2ecf20Sopenharmony_ci * This function doesn't "charge" the new cgroup and doesn't "uncharge" the
57468c2ecf20Sopenharmony_ci * old cgroup.
57478c2ecf20Sopenharmony_ci */
57488c2ecf20Sopenharmony_cistatic int mem_cgroup_move_account(struct page *page,
57498c2ecf20Sopenharmony_ci				   bool compound,
57508c2ecf20Sopenharmony_ci				   struct mem_cgroup *from,
57518c2ecf20Sopenharmony_ci				   struct mem_cgroup *to)
57528c2ecf20Sopenharmony_ci{
57538c2ecf20Sopenharmony_ci	struct lruvec *from_vec, *to_vec;
57548c2ecf20Sopenharmony_ci	struct pglist_data *pgdat;
57558c2ecf20Sopenharmony_ci	unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
57568c2ecf20Sopenharmony_ci	int ret;
57578c2ecf20Sopenharmony_ci
57588c2ecf20Sopenharmony_ci	VM_BUG_ON(from == to);
57598c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(PageLRU(page), page);
57608c2ecf20Sopenharmony_ci	VM_BUG_ON(compound && !PageTransHuge(page));
57618c2ecf20Sopenharmony_ci
57628c2ecf20Sopenharmony_ci	/*
57638c2ecf20Sopenharmony_ci	 * Prevent mem_cgroup_migrate() from looking at
57648c2ecf20Sopenharmony_ci	 * page->mem_cgroup of its source page while we change it.
57658c2ecf20Sopenharmony_ci	 */
57668c2ecf20Sopenharmony_ci	ret = -EBUSY;
57678c2ecf20Sopenharmony_ci	if (!trylock_page(page))
57688c2ecf20Sopenharmony_ci		goto out;
57698c2ecf20Sopenharmony_ci
57708c2ecf20Sopenharmony_ci	ret = -EINVAL;
57718c2ecf20Sopenharmony_ci	if (page->mem_cgroup != from)
57728c2ecf20Sopenharmony_ci		goto out_unlock;
57738c2ecf20Sopenharmony_ci
57748c2ecf20Sopenharmony_ci	pgdat = page_pgdat(page);
57758c2ecf20Sopenharmony_ci	from_vec = mem_cgroup_lruvec(from, pgdat);
57768c2ecf20Sopenharmony_ci	to_vec = mem_cgroup_lruvec(to, pgdat);
57778c2ecf20Sopenharmony_ci
57788c2ecf20Sopenharmony_ci	lock_page_memcg(page);
57798c2ecf20Sopenharmony_ci
57808c2ecf20Sopenharmony_ci	if (PageAnon(page)) {
57818c2ecf20Sopenharmony_ci		if (page_mapped(page)) {
57828c2ecf20Sopenharmony_ci			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
57838c2ecf20Sopenharmony_ci			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
57848c2ecf20Sopenharmony_ci			if (PageTransHuge(page)) {
57858c2ecf20Sopenharmony_ci				__dec_lruvec_state(from_vec, NR_ANON_THPS);
57868c2ecf20Sopenharmony_ci				__inc_lruvec_state(to_vec, NR_ANON_THPS);
57878c2ecf20Sopenharmony_ci			}
57888c2ecf20Sopenharmony_ci
57898c2ecf20Sopenharmony_ci		}
57908c2ecf20Sopenharmony_ci	} else {
57918c2ecf20Sopenharmony_ci		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
57928c2ecf20Sopenharmony_ci		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
57938c2ecf20Sopenharmony_ci
57948c2ecf20Sopenharmony_ci		if (PageSwapBacked(page)) {
57958c2ecf20Sopenharmony_ci			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
57968c2ecf20Sopenharmony_ci			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
57978c2ecf20Sopenharmony_ci		}
57988c2ecf20Sopenharmony_ci
57998c2ecf20Sopenharmony_ci		if (page_mapped(page)) {
58008c2ecf20Sopenharmony_ci			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
58018c2ecf20Sopenharmony_ci			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
58028c2ecf20Sopenharmony_ci		}
58038c2ecf20Sopenharmony_ci
58048c2ecf20Sopenharmony_ci		if (PageDirty(page)) {
58058c2ecf20Sopenharmony_ci			struct address_space *mapping = page_mapping(page);
58068c2ecf20Sopenharmony_ci
58078c2ecf20Sopenharmony_ci			if (mapping_can_writeback(mapping)) {
58088c2ecf20Sopenharmony_ci				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
58098c2ecf20Sopenharmony_ci						   -nr_pages);
58108c2ecf20Sopenharmony_ci				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
58118c2ecf20Sopenharmony_ci						   nr_pages);
58128c2ecf20Sopenharmony_ci			}
58138c2ecf20Sopenharmony_ci		}
58148c2ecf20Sopenharmony_ci	}
58158c2ecf20Sopenharmony_ci
58168c2ecf20Sopenharmony_ci	if (PageWriteback(page)) {
58178c2ecf20Sopenharmony_ci		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
58188c2ecf20Sopenharmony_ci		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
58198c2ecf20Sopenharmony_ci	}
58208c2ecf20Sopenharmony_ci
58218c2ecf20Sopenharmony_ci	/*
58228c2ecf20Sopenharmony_ci	 * All state has been migrated, let's switch to the new memcg.
58238c2ecf20Sopenharmony_ci	 *
58248c2ecf20Sopenharmony_ci	 * It is safe to change page->mem_cgroup here because the page
58258c2ecf20Sopenharmony_ci	 * is referenced, charged, isolated, and locked: we can't race
58268c2ecf20Sopenharmony_ci	 * with (un)charging, migration, LRU putback, or anything else
58278c2ecf20Sopenharmony_ci	 * that would rely on a stable page->mem_cgroup.
58288c2ecf20Sopenharmony_ci	 *
58298c2ecf20Sopenharmony_ci	 * Note that lock_page_memcg is a memcg lock, not a page lock,
58308c2ecf20Sopenharmony_ci	 * to save space. As soon as we switch page->mem_cgroup to a
58318c2ecf20Sopenharmony_ci	 * new memcg that isn't locked, the above state can change
58328c2ecf20Sopenharmony_ci	 * concurrently again. Make sure we're truly done with it.
58338c2ecf20Sopenharmony_ci	 */
58348c2ecf20Sopenharmony_ci	smp_mb();
58358c2ecf20Sopenharmony_ci
58368c2ecf20Sopenharmony_ci	css_get(&to->css);
58378c2ecf20Sopenharmony_ci	css_put(&from->css);
58388c2ecf20Sopenharmony_ci
58398c2ecf20Sopenharmony_ci	page->mem_cgroup = to;
58408c2ecf20Sopenharmony_ci
58418c2ecf20Sopenharmony_ci	__unlock_page_memcg(from);
58428c2ecf20Sopenharmony_ci
58438c2ecf20Sopenharmony_ci	ret = 0;
58448c2ecf20Sopenharmony_ci
58458c2ecf20Sopenharmony_ci	local_irq_disable();
58468c2ecf20Sopenharmony_ci	mem_cgroup_charge_statistics(to, page, nr_pages);
58478c2ecf20Sopenharmony_ci	memcg_check_events(to, page);
58488c2ecf20Sopenharmony_ci	mem_cgroup_charge_statistics(from, page, -nr_pages);
58498c2ecf20Sopenharmony_ci	memcg_check_events(from, page);
58508c2ecf20Sopenharmony_ci	local_irq_enable();
58518c2ecf20Sopenharmony_ciout_unlock:
58528c2ecf20Sopenharmony_ci	unlock_page(page);
58538c2ecf20Sopenharmony_ciout:
58548c2ecf20Sopenharmony_ci	return ret;
58558c2ecf20Sopenharmony_ci}
58568c2ecf20Sopenharmony_ci
58578c2ecf20Sopenharmony_ci/**
58588c2ecf20Sopenharmony_ci * get_mctgt_type - get target type of moving charge
58598c2ecf20Sopenharmony_ci * @vma: the vma to which the pte to be checked belongs
58608c2ecf20Sopenharmony_ci * @addr: the address corresponding to the pte to be checked
58618c2ecf20Sopenharmony_ci * @ptent: the pte to be checked
58628c2ecf20Sopenharmony_ci * @target: the pointer in which the target page or swap entry is stored (can be NULL)
58638c2ecf20Sopenharmony_ci *
58648c2ecf20Sopenharmony_ci * Returns
58658c2ecf20Sopenharmony_ci *   0 (MC_TARGET_NONE): if the pte is not a target for move charge.
58668c2ecf20Sopenharmony_ci *   1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for
58678c2ecf20Sopenharmony_ci *     move charge. If @target is not NULL, the page is stored in target->page
58688c2ecf20Sopenharmony_ci *     with an extra refcount taken (callers should handle it).
58698c2ecf20Sopenharmony_ci *   2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
58708c2ecf20Sopenharmony_ci *     target for charge migration. If @target is not NULL, the entry is stored
58718c2ecf20Sopenharmony_ci *     in target->ent.
58728c2ecf20Sopenharmony_ci *   3 (MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PRIVATE
58738c2ecf20Sopenharmony_ci *     (so a ZONE_DEVICE page and thus not on the LRU).
58748c2ecf20Sopenharmony_ci *     For now such a page is charged like a regular page would be, as for all
58758c2ecf20Sopenharmony_ci *     intents and purposes it is just special memory taking the place of a
58768c2ecf20Sopenharmony_ci *     regular page.
58778c2ecf20Sopenharmony_ci *
58788c2ecf20Sopenharmony_ci *     See Documentation/vm/hmm.rst and include/linux/hmm.h
58798c2ecf20Sopenharmony_ci *
58808c2ecf20Sopenharmony_ci * Called with pte lock held.
58818c2ecf20Sopenharmony_ci */
58838c2ecf20Sopenharmony_cistatic enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
58848c2ecf20Sopenharmony_ci		unsigned long addr, pte_t ptent, union mc_target *target)
58858c2ecf20Sopenharmony_ci{
58868c2ecf20Sopenharmony_ci	struct page *page = NULL;
58878c2ecf20Sopenharmony_ci	enum mc_target_type ret = MC_TARGET_NONE;
58888c2ecf20Sopenharmony_ci	swp_entry_t ent = { .val = 0 };
58898c2ecf20Sopenharmony_ci
58908c2ecf20Sopenharmony_ci	if (pte_present(ptent))
58918c2ecf20Sopenharmony_ci		page = mc_handle_present_pte(vma, addr, ptent);
58928c2ecf20Sopenharmony_ci	else if (is_swap_pte(ptent))
58938c2ecf20Sopenharmony_ci		page = mc_handle_swap_pte(vma, ptent, &ent);
58948c2ecf20Sopenharmony_ci	else if (pte_none(ptent))
58958c2ecf20Sopenharmony_ci		page = mc_handle_file_pte(vma, addr, ptent, &ent);
58968c2ecf20Sopenharmony_ci
58978c2ecf20Sopenharmony_ci	if (!page && !ent.val)
58988c2ecf20Sopenharmony_ci		return ret;
58998c2ecf20Sopenharmony_ci	if (page) {
59008c2ecf20Sopenharmony_ci		/*
59018c2ecf20Sopenharmony_ci		 * Only do a loose check without serialization.
59028c2ecf20Sopenharmony_ci		 * mem_cgroup_move_account() checks whether the page is
59038c2ecf20Sopenharmony_ci		 * valid under LRU exclusion.
59048c2ecf20Sopenharmony_ci		 */
59058c2ecf20Sopenharmony_ci		if (page->mem_cgroup == mc.from) {
59068c2ecf20Sopenharmony_ci			ret = MC_TARGET_PAGE;
59078c2ecf20Sopenharmony_ci			if (is_device_private_page(page))
59088c2ecf20Sopenharmony_ci				ret = MC_TARGET_DEVICE;
59098c2ecf20Sopenharmony_ci			if (target)
59108c2ecf20Sopenharmony_ci				target->page = page;
59118c2ecf20Sopenharmony_ci		}
59128c2ecf20Sopenharmony_ci		if (!ret || !target)
59138c2ecf20Sopenharmony_ci			put_page(page);
59148c2ecf20Sopenharmony_ci	}
59158c2ecf20Sopenharmony_ci	/*
59168c2ecf20Sopenharmony_ci	 * There is a swap entry and a page doesn't exist or isn't charged.
59178c2ecf20Sopenharmony_ci	 * But we cannot move a tail-page in a THP.
59188c2ecf20Sopenharmony_ci	 */
59198c2ecf20Sopenharmony_ci	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
59208c2ecf20Sopenharmony_ci	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
59218c2ecf20Sopenharmony_ci		ret = MC_TARGET_SWAP;
59228c2ecf20Sopenharmony_ci		if (target)
59238c2ecf20Sopenharmony_ci			target->ent = ent;
59248c2ecf20Sopenharmony_ci	}
59258c2ecf20Sopenharmony_ci	return ret;
59268c2ecf20Sopenharmony_ci}
59278c2ecf20Sopenharmony_ci
59288c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
59298c2ecf20Sopenharmony_ci/*
59308c2ecf20Sopenharmony_ci * We don't consider PMD-mapped swapping or file-mapped pages because THP does
59318c2ecf20Sopenharmony_ci * not support them for now.
59328c2ecf20Sopenharmony_ci * Caller should make sure that pmd_trans_huge(pmd) is true.
59338c2ecf20Sopenharmony_ci */
59348c2ecf20Sopenharmony_cistatic enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
59358c2ecf20Sopenharmony_ci		unsigned long addr, pmd_t pmd, union mc_target *target)
59368c2ecf20Sopenharmony_ci{
59378c2ecf20Sopenharmony_ci	struct page *page = NULL;
59388c2ecf20Sopenharmony_ci	enum mc_target_type ret = MC_TARGET_NONE;
59398c2ecf20Sopenharmony_ci
59408c2ecf20Sopenharmony_ci	if (unlikely(is_swap_pmd(pmd))) {
59418c2ecf20Sopenharmony_ci		VM_BUG_ON(thp_migration_supported() &&
59428c2ecf20Sopenharmony_ci				  !is_pmd_migration_entry(pmd));
59438c2ecf20Sopenharmony_ci		return ret;
59448c2ecf20Sopenharmony_ci	}
59458c2ecf20Sopenharmony_ci	page = pmd_page(pmd);
59468c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
59478c2ecf20Sopenharmony_ci	if (!(mc.flags & MOVE_ANON))
59488c2ecf20Sopenharmony_ci		return ret;
59498c2ecf20Sopenharmony_ci	if (page->mem_cgroup == mc.from) {
59508c2ecf20Sopenharmony_ci		ret = MC_TARGET_PAGE;
59518c2ecf20Sopenharmony_ci		if (target) {
59528c2ecf20Sopenharmony_ci			get_page(page);
59538c2ecf20Sopenharmony_ci			target->page = page;
59548c2ecf20Sopenharmony_ci		}
59558c2ecf20Sopenharmony_ci	}
59568c2ecf20Sopenharmony_ci	return ret;
59578c2ecf20Sopenharmony_ci}
59588c2ecf20Sopenharmony_ci#else
59598c2ecf20Sopenharmony_cistatic inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
59608c2ecf20Sopenharmony_ci		unsigned long addr, pmd_t pmd, union mc_target *target)
59618c2ecf20Sopenharmony_ci{
59628c2ecf20Sopenharmony_ci	return MC_TARGET_NONE;
59638c2ecf20Sopenharmony_ci}
59648c2ecf20Sopenharmony_ci#endif
59658c2ecf20Sopenharmony_ci
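/*
 * Page-walk callback that counts how many charges would have to be moved in
 * this pmd range; the count is accumulated in mc.precharge.
 */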
59668c2ecf20Sopenharmony_cistatic int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
59678c2ecf20Sopenharmony_ci					unsigned long addr, unsigned long end,
59688c2ecf20Sopenharmony_ci					struct mm_walk *walk)
59698c2ecf20Sopenharmony_ci{
59708c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
59718c2ecf20Sopenharmony_ci	pte_t *pte;
59728c2ecf20Sopenharmony_ci	spinlock_t *ptl;
59738c2ecf20Sopenharmony_ci
59748c2ecf20Sopenharmony_ci	ptl = pmd_trans_huge_lock(pmd, vma);
59758c2ecf20Sopenharmony_ci	if (ptl) {
59768c2ecf20Sopenharmony_ci		/*
59778c2ecf20Sopenharmony_ci		 * Note there can not be MC_TARGET_DEVICE for now, as we do not
59788c2ecf20Sopenharmony_ci		 * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but
59798c2ecf20Sopenharmony_ci		 * this might change.
59808c2ecf20Sopenharmony_ci		 */
59818c2ecf20Sopenharmony_ci		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
59828c2ecf20Sopenharmony_ci			mc.precharge += HPAGE_PMD_NR;
59838c2ecf20Sopenharmony_ci		spin_unlock(ptl);
59848c2ecf20Sopenharmony_ci		return 0;
59858c2ecf20Sopenharmony_ci	}
59868c2ecf20Sopenharmony_ci
59878c2ecf20Sopenharmony_ci	if (pmd_trans_unstable(pmd))
59888c2ecf20Sopenharmony_ci		return 0;
59898c2ecf20Sopenharmony_ci	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
59908c2ecf20Sopenharmony_ci	for (; addr != end; pte++, addr += PAGE_SIZE)
59918c2ecf20Sopenharmony_ci		if (get_mctgt_type(vma, addr, *pte, NULL))
59928c2ecf20Sopenharmony_ci			mc.precharge++;	/* increment precharge temporarily */
59938c2ecf20Sopenharmony_ci	pte_unmap_unlock(pte - 1, ptl);
59948c2ecf20Sopenharmony_ci	cond_resched();
59958c2ecf20Sopenharmony_ci
59968c2ecf20Sopenharmony_ci	return 0;
59978c2ecf20Sopenharmony_ci}
59988c2ecf20Sopenharmony_ci
59998c2ecf20Sopenharmony_cistatic const struct mm_walk_ops precharge_walk_ops = {
60008c2ecf20Sopenharmony_ci	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
60018c2ecf20Sopenharmony_ci};
60028c2ecf20Sopenharmony_ci
60038c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
60048c2ecf20Sopenharmony_ci{
60058c2ecf20Sopenharmony_ci	unsigned long precharge;
60068c2ecf20Sopenharmony_ci
60078c2ecf20Sopenharmony_ci	mmap_read_lock(mm);
60088c2ecf20Sopenharmony_ci	walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
60098c2ecf20Sopenharmony_ci	mmap_read_unlock(mm);
60108c2ecf20Sopenharmony_ci
60118c2ecf20Sopenharmony_ci	precharge = mc.precharge;
60128c2ecf20Sopenharmony_ci	mc.precharge = 0;
60138c2ecf20Sopenharmony_ci
60148c2ecf20Sopenharmony_ci	return precharge;
60158c2ecf20Sopenharmony_ci}
60168c2ecf20Sopenharmony_ci
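/*
 * Count the pages of @mm that are candidates for charge moving and
 * pre-charge that many pages to mc.to.
 */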
60178c2ecf20Sopenharmony_cistatic int mem_cgroup_precharge_mc(struct mm_struct *mm)
60188c2ecf20Sopenharmony_ci{
60198c2ecf20Sopenharmony_ci	unsigned long precharge = mem_cgroup_count_precharge(mm);
60208c2ecf20Sopenharmony_ci
60218c2ecf20Sopenharmony_ci	VM_BUG_ON(mc.moving_task);
60228c2ecf20Sopenharmony_ci	mc.moving_task = current;
60238c2ecf20Sopenharmony_ci	return mem_cgroup_do_precharge(precharge);
60248c2ecf20Sopenharmony_ci}
60258c2ecf20Sopenharmony_ci
60268c2ecf20Sopenharmony_ci/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
60278c2ecf20Sopenharmony_cistatic void __mem_cgroup_clear_mc(void)
60288c2ecf20Sopenharmony_ci{
60298c2ecf20Sopenharmony_ci	struct mem_cgroup *from = mc.from;
60308c2ecf20Sopenharmony_ci	struct mem_cgroup *to = mc.to;
60318c2ecf20Sopenharmony_ci
60328c2ecf20Sopenharmony_ci	/* we must uncharge all the leftover precharges from mc.to */
60338c2ecf20Sopenharmony_ci	if (mc.precharge) {
60348c2ecf20Sopenharmony_ci		cancel_charge(mc.to, mc.precharge);
60358c2ecf20Sopenharmony_ci		mc.precharge = 0;
60368c2ecf20Sopenharmony_ci	}
60378c2ecf20Sopenharmony_ci	/*
60388c2ecf20Sopenharmony_ci	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
60398c2ecf20Sopenharmony_ci	 * we must uncharge here.
60408c2ecf20Sopenharmony_ci	 */
60418c2ecf20Sopenharmony_ci	if (mc.moved_charge) {
60428c2ecf20Sopenharmony_ci		cancel_charge(mc.from, mc.moved_charge);
60438c2ecf20Sopenharmony_ci		mc.moved_charge = 0;
60448c2ecf20Sopenharmony_ci	}
60458c2ecf20Sopenharmony_ci	/* we must fixup refcnts and charges */
60468c2ecf20Sopenharmony_ci	if (mc.moved_swap) {
60478c2ecf20Sopenharmony_ci		/* uncharge swap account from the old cgroup */
60488c2ecf20Sopenharmony_ci		if (!mem_cgroup_is_root(mc.from))
60498c2ecf20Sopenharmony_ci			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
60508c2ecf20Sopenharmony_ci
60518c2ecf20Sopenharmony_ci		mem_cgroup_id_put_many(mc.from, mc.moved_swap);
60528c2ecf20Sopenharmony_ci
60538c2ecf20Sopenharmony_ci		/*
60548c2ecf20Sopenharmony_ci		 * we charged both to->memory and to->memsw, so we
60558c2ecf20Sopenharmony_ci		 * should uncharge to->memory.
60568c2ecf20Sopenharmony_ci		 */
60578c2ecf20Sopenharmony_ci		if (!mem_cgroup_is_root(mc.to))
60588c2ecf20Sopenharmony_ci			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
60598c2ecf20Sopenharmony_ci
60608c2ecf20Sopenharmony_ci		mc.moved_swap = 0;
60618c2ecf20Sopenharmony_ci	}
60628c2ecf20Sopenharmony_ci	memcg_oom_recover(from);
60638c2ecf20Sopenharmony_ci	memcg_oom_recover(to);
60648c2ecf20Sopenharmony_ci	wake_up_all(&mc.waitq);
60658c2ecf20Sopenharmony_ci}
60668c2ecf20Sopenharmony_ci
60678c2ecf20Sopenharmony_cistatic void mem_cgroup_clear_mc(void)
60688c2ecf20Sopenharmony_ci{
60698c2ecf20Sopenharmony_ci	struct mm_struct *mm = mc.mm;
60708c2ecf20Sopenharmony_ci
60718c2ecf20Sopenharmony_ci	/*
60728c2ecf20Sopenharmony_ci	 * we must clear moving_task before waking up waiters at the end of
60738c2ecf20Sopenharmony_ci	 * task migration.
60748c2ecf20Sopenharmony_ci	 */
60758c2ecf20Sopenharmony_ci	mc.moving_task = NULL;
60768c2ecf20Sopenharmony_ci	__mem_cgroup_clear_mc();
60778c2ecf20Sopenharmony_ci	spin_lock(&mc.lock);
60788c2ecf20Sopenharmony_ci	mc.from = NULL;
60798c2ecf20Sopenharmony_ci	mc.to = NULL;
60808c2ecf20Sopenharmony_ci	mc.mm = NULL;
60818c2ecf20Sopenharmony_ci	spin_unlock(&mc.lock);
60828c2ecf20Sopenharmony_ci
60838c2ecf20Sopenharmony_ci	mmput(mm);
60848c2ecf20Sopenharmony_ci}
60858c2ecf20Sopenharmony_ci
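/*
 * .can_attach callback: if the destination memcg has move_charge_at_immigrate
 * set (legacy hierarchy only) and the migrating leader owns its mm, record the
 * pending move in 'mc' and pre-charge the destination for the leader's mm.
 */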
60868c2ecf20Sopenharmony_cistatic int mem_cgroup_can_attach(struct cgroup_taskset *tset)
60878c2ecf20Sopenharmony_ci{
60888c2ecf20Sopenharmony_ci	struct cgroup_subsys_state *css;
60898c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
60908c2ecf20Sopenharmony_ci	struct mem_cgroup *from;
60918c2ecf20Sopenharmony_ci	struct task_struct *leader, *p;
60928c2ecf20Sopenharmony_ci	struct mm_struct *mm;
60938c2ecf20Sopenharmony_ci	unsigned long move_flags;
60948c2ecf20Sopenharmony_ci	int ret = 0;
60958c2ecf20Sopenharmony_ci
60968c2ecf20Sopenharmony_ci	/* charge immigration isn't supported on the default hierarchy */
60978c2ecf20Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
60988c2ecf20Sopenharmony_ci		return 0;
60998c2ecf20Sopenharmony_ci
61008c2ecf20Sopenharmony_ci	/*
61018c2ecf20Sopenharmony_ci	 * Multi-process migrations only happen on the default hierarchy
61028c2ecf20Sopenharmony_ci	 * where charge immigration is not used.  Perform charge
61038c2ecf20Sopenharmony_ci	 * immigration if @tset contains a leader and whine if there are
61048c2ecf20Sopenharmony_ci	 * multiple.
61058c2ecf20Sopenharmony_ci	 */
61068c2ecf20Sopenharmony_ci	p = NULL;
61078c2ecf20Sopenharmony_ci	cgroup_taskset_for_each_leader(leader, css, tset) {
61088c2ecf20Sopenharmony_ci		WARN_ON_ONCE(p);
61098c2ecf20Sopenharmony_ci		p = leader;
61108c2ecf20Sopenharmony_ci		memcg = mem_cgroup_from_css(css);
61118c2ecf20Sopenharmony_ci	}
61128c2ecf20Sopenharmony_ci	if (!p)
61138c2ecf20Sopenharmony_ci		return 0;
61148c2ecf20Sopenharmony_ci
61158c2ecf20Sopenharmony_ci	/*
61168c2ecf20Sopenharmony_ci	 * We are now committed to this value, whatever it is. Changes in this
61178c2ecf20Sopenharmony_ci	 * tunable will only affect upcoming migrations, not the current one.
61188c2ecf20Sopenharmony_ci	 * So we need to save it, and keep it going.
61198c2ecf20Sopenharmony_ci	 */
61208c2ecf20Sopenharmony_ci	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
61218c2ecf20Sopenharmony_ci	if (!move_flags)
61228c2ecf20Sopenharmony_ci		return 0;
61238c2ecf20Sopenharmony_ci
61248c2ecf20Sopenharmony_ci	from = mem_cgroup_from_task(p);
61258c2ecf20Sopenharmony_ci
61268c2ecf20Sopenharmony_ci	VM_BUG_ON(from == memcg);
61278c2ecf20Sopenharmony_ci
61288c2ecf20Sopenharmony_ci	mm = get_task_mm(p);
61298c2ecf20Sopenharmony_ci	if (!mm)
61308c2ecf20Sopenharmony_ci		return 0;
61318c2ecf20Sopenharmony_ci	/* We move charges only when we move an owner of the mm */
61328c2ecf20Sopenharmony_ci	if (mm->owner == p) {
61338c2ecf20Sopenharmony_ci		VM_BUG_ON(mc.from);
61348c2ecf20Sopenharmony_ci		VM_BUG_ON(mc.to);
61358c2ecf20Sopenharmony_ci		VM_BUG_ON(mc.precharge);
61368c2ecf20Sopenharmony_ci		VM_BUG_ON(mc.moved_charge);
61378c2ecf20Sopenharmony_ci		VM_BUG_ON(mc.moved_swap);
61388c2ecf20Sopenharmony_ci
61398c2ecf20Sopenharmony_ci		spin_lock(&mc.lock);
61408c2ecf20Sopenharmony_ci		mc.mm = mm;
61418c2ecf20Sopenharmony_ci		mc.from = from;
61428c2ecf20Sopenharmony_ci		mc.to = memcg;
61438c2ecf20Sopenharmony_ci		mc.flags = move_flags;
61448c2ecf20Sopenharmony_ci		spin_unlock(&mc.lock);
61458c2ecf20Sopenharmony_ci		/* We set mc.moving_task later */
61468c2ecf20Sopenharmony_ci
61478c2ecf20Sopenharmony_ci		ret = mem_cgroup_precharge_mc(mm);
61488c2ecf20Sopenharmony_ci		if (ret)
61498c2ecf20Sopenharmony_ci			mem_cgroup_clear_mc();
61508c2ecf20Sopenharmony_ci	} else {
61518c2ecf20Sopenharmony_ci		mmput(mm);
61528c2ecf20Sopenharmony_ci	}
61538c2ecf20Sopenharmony_ci	return ret;
61548c2ecf20Sopenharmony_ci}
61558c2ecf20Sopenharmony_ci
61568c2ecf20Sopenharmony_cistatic void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
61578c2ecf20Sopenharmony_ci{
61588c2ecf20Sopenharmony_ci	if (mc.to)
61598c2ecf20Sopenharmony_ci		mem_cgroup_clear_mc();
61608c2ecf20Sopenharmony_ci}
61618c2ecf20Sopenharmony_ci
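/*
 * Page-walk callback that performs the actual charge moving for this pmd
 * range, handling THPs, regular pages, device-private pages and swap
 * entries, and consuming mc.precharge as it goes.
 */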
61628c2ecf20Sopenharmony_cistatic int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
61638c2ecf20Sopenharmony_ci				unsigned long addr, unsigned long end,
61648c2ecf20Sopenharmony_ci				struct mm_walk *walk)
61658c2ecf20Sopenharmony_ci{
61668c2ecf20Sopenharmony_ci	int ret = 0;
61678c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
61688c2ecf20Sopenharmony_ci	pte_t *pte;
61698c2ecf20Sopenharmony_ci	spinlock_t *ptl;
61708c2ecf20Sopenharmony_ci	enum mc_target_type target_type;
61718c2ecf20Sopenharmony_ci	union mc_target target;
61728c2ecf20Sopenharmony_ci	struct page *page;
61738c2ecf20Sopenharmony_ci
61748c2ecf20Sopenharmony_ci	ptl = pmd_trans_huge_lock(pmd, vma);
61758c2ecf20Sopenharmony_ci	if (ptl) {
61768c2ecf20Sopenharmony_ci		if (mc.precharge < HPAGE_PMD_NR) {
61778c2ecf20Sopenharmony_ci			spin_unlock(ptl);
61788c2ecf20Sopenharmony_ci			return 0;
61798c2ecf20Sopenharmony_ci		}
61808c2ecf20Sopenharmony_ci		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
61818c2ecf20Sopenharmony_ci		if (target_type == MC_TARGET_PAGE) {
61828c2ecf20Sopenharmony_ci			page = target.page;
61838c2ecf20Sopenharmony_ci			if (!isolate_lru_page(page)) {
61848c2ecf20Sopenharmony_ci				if (!mem_cgroup_move_account(page, true,
61858c2ecf20Sopenharmony_ci							     mc.from, mc.to)) {
61868c2ecf20Sopenharmony_ci					mc.precharge -= HPAGE_PMD_NR;
61878c2ecf20Sopenharmony_ci					mc.moved_charge += HPAGE_PMD_NR;
61888c2ecf20Sopenharmony_ci				}
61898c2ecf20Sopenharmony_ci				putback_lru_page(page);
61908c2ecf20Sopenharmony_ci			}
61918c2ecf20Sopenharmony_ci			put_page(page);
61928c2ecf20Sopenharmony_ci		} else if (target_type == MC_TARGET_DEVICE) {
61938c2ecf20Sopenharmony_ci			page = target.page;
61948c2ecf20Sopenharmony_ci			if (!mem_cgroup_move_account(page, true,
61958c2ecf20Sopenharmony_ci						     mc.from, mc.to)) {
61968c2ecf20Sopenharmony_ci				mc.precharge -= HPAGE_PMD_NR;
61978c2ecf20Sopenharmony_ci				mc.moved_charge += HPAGE_PMD_NR;
61988c2ecf20Sopenharmony_ci			}
61998c2ecf20Sopenharmony_ci			put_page(page);
62008c2ecf20Sopenharmony_ci		}
62018c2ecf20Sopenharmony_ci		spin_unlock(ptl);
62028c2ecf20Sopenharmony_ci		return 0;
62038c2ecf20Sopenharmony_ci	}
62048c2ecf20Sopenharmony_ci
62058c2ecf20Sopenharmony_ci	if (pmd_trans_unstable(pmd))
62068c2ecf20Sopenharmony_ci		return 0;
62078c2ecf20Sopenharmony_ciretry:
62088c2ecf20Sopenharmony_ci	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
62098c2ecf20Sopenharmony_ci	for (; addr != end; addr += PAGE_SIZE) {
62108c2ecf20Sopenharmony_ci		pte_t ptent = *(pte++);
62118c2ecf20Sopenharmony_ci		bool device = false;
62128c2ecf20Sopenharmony_ci		swp_entry_t ent;
62138c2ecf20Sopenharmony_ci
62148c2ecf20Sopenharmony_ci		if (!mc.precharge)
62158c2ecf20Sopenharmony_ci			break;
62168c2ecf20Sopenharmony_ci
62178c2ecf20Sopenharmony_ci		switch (get_mctgt_type(vma, addr, ptent, &target)) {
62188c2ecf20Sopenharmony_ci		case MC_TARGET_DEVICE:
62198c2ecf20Sopenharmony_ci			device = true;
62208c2ecf20Sopenharmony_ci			fallthrough;
62218c2ecf20Sopenharmony_ci		case MC_TARGET_PAGE:
62228c2ecf20Sopenharmony_ci			page = target.page;
62238c2ecf20Sopenharmony_ci			/*
62248c2ecf20Sopenharmony_ci			 * We can have a part of a split pmd here. Moving it
62258c2ecf20Sopenharmony_ci			 * could be done, but it would be too convoluted, so simply
62268c2ecf20Sopenharmony_ci			 * ignore such a partial THP and keep it in the original
62278c2ecf20Sopenharmony_ci			 * memcg. There should be somebody mapping the head page.
62288c2ecf20Sopenharmony_ci			 */
62298c2ecf20Sopenharmony_ci			if (PageTransCompound(page))
62308c2ecf20Sopenharmony_ci				goto put;
62318c2ecf20Sopenharmony_ci			if (!device && isolate_lru_page(page))
62328c2ecf20Sopenharmony_ci				goto put;
62338c2ecf20Sopenharmony_ci			if (!mem_cgroup_move_account(page, false,
62348c2ecf20Sopenharmony_ci						mc.from, mc.to)) {
62358c2ecf20Sopenharmony_ci				mc.precharge--;
62368c2ecf20Sopenharmony_ci				/* we uncharge from mc.from later. */
62378c2ecf20Sopenharmony_ci				mc.moved_charge++;
62388c2ecf20Sopenharmony_ci			}
62398c2ecf20Sopenharmony_ci			if (!device)
62408c2ecf20Sopenharmony_ci				putback_lru_page(page);
62418c2ecf20Sopenharmony_ciput:			/* get_mctgt_type() gets the page */
62428c2ecf20Sopenharmony_ci			put_page(page);
62438c2ecf20Sopenharmony_ci			break;
62448c2ecf20Sopenharmony_ci		case MC_TARGET_SWAP:
62458c2ecf20Sopenharmony_ci			ent = target.ent;
62468c2ecf20Sopenharmony_ci			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
62478c2ecf20Sopenharmony_ci				mc.precharge--;
62488c2ecf20Sopenharmony_ci				mem_cgroup_id_get_many(mc.to, 1);
62498c2ecf20Sopenharmony_ci				/* we fixup other refcnts and charges later. */
62508c2ecf20Sopenharmony_ci				mc.moved_swap++;
62518c2ecf20Sopenharmony_ci			}
62528c2ecf20Sopenharmony_ci			break;
62538c2ecf20Sopenharmony_ci		default:
62548c2ecf20Sopenharmony_ci			break;
62558c2ecf20Sopenharmony_ci		}
62568c2ecf20Sopenharmony_ci	}
62578c2ecf20Sopenharmony_ci	pte_unmap_unlock(pte - 1, ptl);
62588c2ecf20Sopenharmony_ci	cond_resched();
62598c2ecf20Sopenharmony_ci
62608c2ecf20Sopenharmony_ci	if (addr != end) {
62618c2ecf20Sopenharmony_ci		/*
62628c2ecf20Sopenharmony_ci		 * We have consumed all the precharges we got in can_attach().
62638c2ecf20Sopenharmony_ci		 * We try to charge one by one, but don't do any additional
62648c2ecf20Sopenharmony_ci		 * charges to mc.to if we have already failed a charge in the
62658c2ecf20Sopenharmony_ci		 * attach() phase.
62668c2ecf20Sopenharmony_ci		 */
62678c2ecf20Sopenharmony_ci		ret = mem_cgroup_do_precharge(1);
62688c2ecf20Sopenharmony_ci		if (!ret)
62698c2ecf20Sopenharmony_ci			goto retry;
62708c2ecf20Sopenharmony_ci	}
62718c2ecf20Sopenharmony_ci
62728c2ecf20Sopenharmony_ci	return ret;
62738c2ecf20Sopenharmony_ci}
62748c2ecf20Sopenharmony_ci
62758c2ecf20Sopenharmony_cistatic const struct mm_walk_ops charge_walk_ops = {
62768c2ecf20Sopenharmony_ci	.pmd_entry	= mem_cgroup_move_charge_pte_range,
62778c2ecf20Sopenharmony_ci};
62788c2ecf20Sopenharmony_ci
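/*
 * Walk mc.mm and move all eligible charges from mc.from to mc.to. Raising
 * mc.from->moving_account makes lock_page_memcg() take the memcg move_lock
 * while the pages are being moved.
 */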
62798c2ecf20Sopenharmony_cistatic void mem_cgroup_move_charge(void)
62808c2ecf20Sopenharmony_ci{
62818c2ecf20Sopenharmony_ci	lru_add_drain_all();
62828c2ecf20Sopenharmony_ci	/*
62838c2ecf20Sopenharmony_ci	 * Signal lock_page_memcg() to take the memcg's move_lock
62848c2ecf20Sopenharmony_ci	 * while we're moving its pages to another memcg. Then wait
62858c2ecf20Sopenharmony_ci	 * for already started RCU-only updates to finish.
62868c2ecf20Sopenharmony_ci	 */
62878c2ecf20Sopenharmony_ci	atomic_inc(&mc.from->moving_account);
62888c2ecf20Sopenharmony_ci	synchronize_rcu();
62898c2ecf20Sopenharmony_ciretry:
62908c2ecf20Sopenharmony_ci	if (unlikely(!mmap_read_trylock(mc.mm))) {
62918c2ecf20Sopenharmony_ci		/*
62928c2ecf20Sopenharmony_ci		 * Someone who is holding the mmap_lock might be waiting in
62938c2ecf20Sopenharmony_ci		 * waitq. So we cancel all extra charges, wake up all waiters,
62948c2ecf20Sopenharmony_ci		 * and retry. Because we cancel precharges, we might not be able
62958c2ecf20Sopenharmony_ci		 * to move enough charges, but moving charge is a best-effort
62968c2ecf20Sopenharmony_ci		 * feature anyway, so it wouldn't be a big problem.
62978c2ecf20Sopenharmony_ci		 */
62988c2ecf20Sopenharmony_ci		__mem_cgroup_clear_mc();
62998c2ecf20Sopenharmony_ci		cond_resched();
63008c2ecf20Sopenharmony_ci		goto retry;
63018c2ecf20Sopenharmony_ci	}
63028c2ecf20Sopenharmony_ci	/*
63038c2ecf20Sopenharmony_ci	 * When we have consumed all precharges and failed to do an
63048c2ecf20Sopenharmony_ci	 * additional charge, the page walk just aborts.
63058c2ecf20Sopenharmony_ci	 */
63068c2ecf20Sopenharmony_ci	walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
63078c2ecf20Sopenharmony_ci			NULL);
63088c2ecf20Sopenharmony_ci
63098c2ecf20Sopenharmony_ci	mmap_read_unlock(mc.mm);
63108c2ecf20Sopenharmony_ci	atomic_dec(&mc.from->moving_account);
63118c2ecf20Sopenharmony_ci}
63128c2ecf20Sopenharmony_ci
63138c2ecf20Sopenharmony_cistatic void mem_cgroup_move_task(void)
63148c2ecf20Sopenharmony_ci{
63158c2ecf20Sopenharmony_ci	if (mc.to) {
63168c2ecf20Sopenharmony_ci		mem_cgroup_move_charge();
63178c2ecf20Sopenharmony_ci		mem_cgroup_clear_mc();
63188c2ecf20Sopenharmony_ci	}
63198c2ecf20Sopenharmony_ci}
63208c2ecf20Sopenharmony_ci#else	/* !CONFIG_MMU */
63218c2ecf20Sopenharmony_cistatic int mem_cgroup_can_attach(struct cgroup_taskset *tset)
63228c2ecf20Sopenharmony_ci{
63238c2ecf20Sopenharmony_ci	return 0;
63248c2ecf20Sopenharmony_ci}
63258c2ecf20Sopenharmony_cistatic void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
63268c2ecf20Sopenharmony_ci{
63278c2ecf20Sopenharmony_ci}
63288c2ecf20Sopenharmony_cistatic void mem_cgroup_move_task(void)
63298c2ecf20Sopenharmony_ci{
63308c2ecf20Sopenharmony_ci}
63318c2ecf20Sopenharmony_ci#endif
63328c2ecf20Sopenharmony_ci
63338c2ecf20Sopenharmony_ci/*
63348c2ecf20Sopenharmony_ci * Cgroup retains root cgroups across [un]mount cycles making it necessary
63358c2ecf20Sopenharmony_ci * to verify whether we're attached to the default hierarchy on each mount
63368c2ecf20Sopenharmony_ci * attempt.
63378c2ecf20Sopenharmony_ci */
63388c2ecf20Sopenharmony_cistatic void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
63398c2ecf20Sopenharmony_ci{
63408c2ecf20Sopenharmony_ci	/*
63418c2ecf20Sopenharmony_ci	 * use_hierarchy is forced on the default hierarchy.  cgroup core
63428c2ecf20Sopenharmony_ci	 * guarantees that @root doesn't have any children, so turning it
63438c2ecf20Sopenharmony_ci	 * on for the root memcg is enough.
63448c2ecf20Sopenharmony_ci	 */
63458c2ecf20Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
63468c2ecf20Sopenharmony_ci		root_mem_cgroup->use_hierarchy = true;
63478c2ecf20Sopenharmony_ci	else
63488c2ecf20Sopenharmony_ci		root_mem_cgroup->use_hierarchy = false;
63498c2ecf20Sopenharmony_ci}
63508c2ecf20Sopenharmony_ci
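/* Print a page counter tunable in bytes, or "max" for PAGE_COUNTER_MAX. */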
63518c2ecf20Sopenharmony_cistatic int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
63528c2ecf20Sopenharmony_ci{
63538c2ecf20Sopenharmony_ci	if (value == PAGE_COUNTER_MAX)
63548c2ecf20Sopenharmony_ci		seq_puts(m, "max\n");
63558c2ecf20Sopenharmony_ci	else
63568c2ecf20Sopenharmony_ci		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
63578c2ecf20Sopenharmony_ci
63588c2ecf20Sopenharmony_ci	return 0;
63598c2ecf20Sopenharmony_ci}
63608c2ecf20Sopenharmony_ci
63618c2ecf20Sopenharmony_cistatic u64 memory_current_read(struct cgroup_subsys_state *css,
63628c2ecf20Sopenharmony_ci			       struct cftype *cft)
63638c2ecf20Sopenharmony_ci{
63648c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
63658c2ecf20Sopenharmony_ci
63668c2ecf20Sopenharmony_ci	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
63678c2ecf20Sopenharmony_ci}
63688c2ecf20Sopenharmony_ci
63698c2ecf20Sopenharmony_cistatic int memory_min_show(struct seq_file *m, void *v)
63708c2ecf20Sopenharmony_ci{
63718c2ecf20Sopenharmony_ci	return seq_puts_memcg_tunable(m,
63728c2ecf20Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
63738c2ecf20Sopenharmony_ci}
63748c2ecf20Sopenharmony_ci
63758c2ecf20Sopenharmony_cistatic ssize_t memory_min_write(struct kernfs_open_file *of,
63768c2ecf20Sopenharmony_ci				char *buf, size_t nbytes, loff_t off)
63778c2ecf20Sopenharmony_ci{
63788c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
63798c2ecf20Sopenharmony_ci	unsigned long min;
63808c2ecf20Sopenharmony_ci	int err;
63818c2ecf20Sopenharmony_ci
63828c2ecf20Sopenharmony_ci	buf = strstrip(buf);
63838c2ecf20Sopenharmony_ci	err = page_counter_memparse(buf, "max", &min);
63848c2ecf20Sopenharmony_ci	if (err)
63858c2ecf20Sopenharmony_ci		return err;
63868c2ecf20Sopenharmony_ci
63878c2ecf20Sopenharmony_ci	page_counter_set_min(&memcg->memory, min);
63888c2ecf20Sopenharmony_ci
63898c2ecf20Sopenharmony_ci	return nbytes;
63908c2ecf20Sopenharmony_ci}
63918c2ecf20Sopenharmony_ci
63928c2ecf20Sopenharmony_cistatic int memory_low_show(struct seq_file *m, void *v)
63938c2ecf20Sopenharmony_ci{
63948c2ecf20Sopenharmony_ci	return seq_puts_memcg_tunable(m,
63958c2ecf20Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
63968c2ecf20Sopenharmony_ci}
63978c2ecf20Sopenharmony_ci
63988c2ecf20Sopenharmony_cistatic ssize_t memory_low_write(struct kernfs_open_file *of,
63998c2ecf20Sopenharmony_ci				char *buf, size_t nbytes, loff_t off)
64008c2ecf20Sopenharmony_ci{
64018c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
64028c2ecf20Sopenharmony_ci	unsigned long low;
64038c2ecf20Sopenharmony_ci	int err;
64048c2ecf20Sopenharmony_ci
64058c2ecf20Sopenharmony_ci	buf = strstrip(buf);
64068c2ecf20Sopenharmony_ci	err = page_counter_memparse(buf, "max", &low);
64078c2ecf20Sopenharmony_ci	if (err)
64088c2ecf20Sopenharmony_ci		return err;
64098c2ecf20Sopenharmony_ci
64108c2ecf20Sopenharmony_ci	page_counter_set_low(&memcg->memory, low);
64118c2ecf20Sopenharmony_ci
64128c2ecf20Sopenharmony_ci	return nbytes;
64138c2ecf20Sopenharmony_ci}
64148c2ecf20Sopenharmony_ci
64158c2ecf20Sopenharmony_cistatic int memory_high_show(struct seq_file *m, void *v)
64168c2ecf20Sopenharmony_ci{
64178c2ecf20Sopenharmony_ci	return seq_puts_memcg_tunable(m,
64188c2ecf20Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
64198c2ecf20Sopenharmony_ci}
64208c2ecf20Sopenharmony_ci
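/*
 * Write handler for memory.high: update the high limit and then try to
 * reclaim the group back below it, draining the percpu stocks once and
 * giving up after MAX_RECLAIM_RETRIES unsuccessful passes or on a pending
 * signal.
 */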
64218c2ecf20Sopenharmony_cistatic ssize_t memory_high_write(struct kernfs_open_file *of,
64228c2ecf20Sopenharmony_ci				 char *buf, size_t nbytes, loff_t off)
64238c2ecf20Sopenharmony_ci{
64248c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
64258c2ecf20Sopenharmony_ci	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
64268c2ecf20Sopenharmony_ci	bool drained = false;
64278c2ecf20Sopenharmony_ci	unsigned long high;
64288c2ecf20Sopenharmony_ci	int err;
64298c2ecf20Sopenharmony_ci
64308c2ecf20Sopenharmony_ci	buf = strstrip(buf);
64318c2ecf20Sopenharmony_ci	err = page_counter_memparse(buf, "max", &high);
64328c2ecf20Sopenharmony_ci	if (err)
64338c2ecf20Sopenharmony_ci		return err;
64348c2ecf20Sopenharmony_ci
64358c2ecf20Sopenharmony_ci	page_counter_set_high(&memcg->memory, high);
64368c2ecf20Sopenharmony_ci
64378c2ecf20Sopenharmony_ci	for (;;) {
64388c2ecf20Sopenharmony_ci		unsigned long nr_pages = page_counter_read(&memcg->memory);
64398c2ecf20Sopenharmony_ci		unsigned long reclaimed;
64408c2ecf20Sopenharmony_ci
64418c2ecf20Sopenharmony_ci		if (nr_pages <= high)
64428c2ecf20Sopenharmony_ci			break;
64438c2ecf20Sopenharmony_ci
64448c2ecf20Sopenharmony_ci		if (signal_pending(current))
64458c2ecf20Sopenharmony_ci			break;
64468c2ecf20Sopenharmony_ci
64478c2ecf20Sopenharmony_ci		if (!drained) {
64488c2ecf20Sopenharmony_ci			drain_all_stock(memcg);
64498c2ecf20Sopenharmony_ci			drained = true;
64508c2ecf20Sopenharmony_ci			continue;
64518c2ecf20Sopenharmony_ci		}
64528c2ecf20Sopenharmony_ci
64538c2ecf20Sopenharmony_ci		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
64548c2ecf20Sopenharmony_ci							 GFP_KERNEL, true);
64558c2ecf20Sopenharmony_ci
64568c2ecf20Sopenharmony_ci		if (!reclaimed && !nr_retries--)
64578c2ecf20Sopenharmony_ci			break;
64588c2ecf20Sopenharmony_ci	}
64598c2ecf20Sopenharmony_ci
64608c2ecf20Sopenharmony_ci	memcg_wb_domain_size_changed(memcg);
64618c2ecf20Sopenharmony_ci	return nbytes;
64628c2ecf20Sopenharmony_ci}
64638c2ecf20Sopenharmony_ci
64648c2ecf20Sopenharmony_cistatic int memory_max_show(struct seq_file *m, void *v)
64658c2ecf20Sopenharmony_ci{
64668c2ecf20Sopenharmony_ci	return seq_puts_memcg_tunable(m,
64678c2ecf20Sopenharmony_ci		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
64688c2ecf20Sopenharmony_ci}
64698c2ecf20Sopenharmony_ci
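/*
 * memory.max is the hard limit. The write below installs the new limit
 * immediately, reclaims what it can, and once MAX_RECLAIM_RETRIES
 * reclaim attempts have failed it falls back to the in-cgroup OOM
 * killer until usage fits or no further victim can be killed. A pending
 * signal aborts the loop but leaves the new limit in place.
 * Illustrative usage (not part of this file):
 *	echo 1G > <cgroup>/memory.max
 */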
64708c2ecf20Sopenharmony_cistatic ssize_t memory_max_write(struct kernfs_open_file *of,
64718c2ecf20Sopenharmony_ci				char *buf, size_t nbytes, loff_t off)
64728c2ecf20Sopenharmony_ci{
64738c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
64748c2ecf20Sopenharmony_ci	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
64758c2ecf20Sopenharmony_ci	bool drained = false;
64768c2ecf20Sopenharmony_ci	unsigned long max;
64778c2ecf20Sopenharmony_ci	int err;
64788c2ecf20Sopenharmony_ci
64798c2ecf20Sopenharmony_ci	buf = strstrip(buf);
64808c2ecf20Sopenharmony_ci	err = page_counter_memparse(buf, "max", &max);
64818c2ecf20Sopenharmony_ci	if (err)
64828c2ecf20Sopenharmony_ci		return err;
64838c2ecf20Sopenharmony_ci
64848c2ecf20Sopenharmony_ci	xchg(&memcg->memory.max, max);
64858c2ecf20Sopenharmony_ci
64868c2ecf20Sopenharmony_ci	for (;;) {
64878c2ecf20Sopenharmony_ci		unsigned long nr_pages = page_counter_read(&memcg->memory);
64888c2ecf20Sopenharmony_ci
64898c2ecf20Sopenharmony_ci		if (nr_pages <= max)
64908c2ecf20Sopenharmony_ci			break;
64918c2ecf20Sopenharmony_ci
64928c2ecf20Sopenharmony_ci		if (signal_pending(current))
64938c2ecf20Sopenharmony_ci			break;
64948c2ecf20Sopenharmony_ci
64958c2ecf20Sopenharmony_ci		if (!drained) {
64968c2ecf20Sopenharmony_ci			drain_all_stock(memcg);
64978c2ecf20Sopenharmony_ci			drained = true;
64988c2ecf20Sopenharmony_ci			continue;
64998c2ecf20Sopenharmony_ci		}
65008c2ecf20Sopenharmony_ci
65018c2ecf20Sopenharmony_ci		if (nr_reclaims) {
65028c2ecf20Sopenharmony_ci			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
65038c2ecf20Sopenharmony_ci							  GFP_KERNEL, true))
65048c2ecf20Sopenharmony_ci				nr_reclaims--;
65058c2ecf20Sopenharmony_ci			continue;
65068c2ecf20Sopenharmony_ci		}
65078c2ecf20Sopenharmony_ci
65088c2ecf20Sopenharmony_ci		memcg_memory_event(memcg, MEMCG_OOM);
65098c2ecf20Sopenharmony_ci		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
65108c2ecf20Sopenharmony_ci			break;
65118c2ecf20Sopenharmony_ci	}
65128c2ecf20Sopenharmony_ci
65138c2ecf20Sopenharmony_ci	memcg_wb_domain_size_changed(memcg);
65148c2ecf20Sopenharmony_ci	return nbytes;
65158c2ecf20Sopenharmony_ci}
65168c2ecf20Sopenharmony_ci
65178c2ecf20Sopenharmony_cistatic void __memory_events_show(struct seq_file *m, atomic_long_t *events)
65188c2ecf20Sopenharmony_ci{
65198c2ecf20Sopenharmony_ci	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
65208c2ecf20Sopenharmony_ci	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
65218c2ecf20Sopenharmony_ci	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
65228c2ecf20Sopenharmony_ci	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
65238c2ecf20Sopenharmony_ci	seq_printf(m, "oom_kill %lu\n",
65248c2ecf20Sopenharmony_ci		   atomic_long_read(&events[MEMCG_OOM_KILL]));
65258c2ecf20Sopenharmony_ci}
65268c2ecf20Sopenharmony_ci
65278c2ecf20Sopenharmony_cistatic int memory_events_show(struct seq_file *m, void *v)
65288c2ecf20Sopenharmony_ci{
65298c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
65308c2ecf20Sopenharmony_ci
65318c2ecf20Sopenharmony_ci	__memory_events_show(m, memcg->memory_events);
65328c2ecf20Sopenharmony_ci	return 0;
65338c2ecf20Sopenharmony_ci}
65348c2ecf20Sopenharmony_ci
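/*
 * memory.events.local differs from memory.events in that it only counts
 * events raised in this cgroup itself; events from descendant cgroups
 * are not folded in (see memcg_memory_event()).
 */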
65358c2ecf20Sopenharmony_cistatic int memory_events_local_show(struct seq_file *m, void *v)
65368c2ecf20Sopenharmony_ci{
65378c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
65388c2ecf20Sopenharmony_ci
65398c2ecf20Sopenharmony_ci	__memory_events_show(m, memcg->memory_events_local);
65408c2ecf20Sopenharmony_ci	return 0;
65418c2ecf20Sopenharmony_ci}
65428c2ecf20Sopenharmony_ci
65438c2ecf20Sopenharmony_cistatic int memory_stat_show(struct seq_file *m, void *v)
65448c2ecf20Sopenharmony_ci{
65458c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
65468c2ecf20Sopenharmony_ci	char *buf;
65478c2ecf20Sopenharmony_ci
65488c2ecf20Sopenharmony_ci	buf = memory_stat_format(memcg);
65498c2ecf20Sopenharmony_ci	if (!buf)
65508c2ecf20Sopenharmony_ci		return -ENOMEM;
65518c2ecf20Sopenharmony_ci	seq_puts(m, buf);
65528c2ecf20Sopenharmony_ci	kfree(buf);
65538c2ecf20Sopenharmony_ci	return 0;
65548c2ecf20Sopenharmony_ci}
65558c2ecf20Sopenharmony_ci
65568c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA
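/*
 * Per-node breakdown of the memory_stats[] entries that have a
 * node-level counterpart. Each output line looks like, e.g.
 * (illustrative):
 *	anon N0=1048576 N1=0
 * with all values in bytes.
 */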
65578c2ecf20Sopenharmony_cistatic int memory_numa_stat_show(struct seq_file *m, void *v)
65588c2ecf20Sopenharmony_ci{
65598c2ecf20Sopenharmony_ci	int i;
65608c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
65618c2ecf20Sopenharmony_ci
65628c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
65638c2ecf20Sopenharmony_ci		int nid;
65648c2ecf20Sopenharmony_ci
65658c2ecf20Sopenharmony_ci		if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
65668c2ecf20Sopenharmony_ci			continue;
65678c2ecf20Sopenharmony_ci
65688c2ecf20Sopenharmony_ci		seq_printf(m, "%s", memory_stats[i].name);
65698c2ecf20Sopenharmony_ci		for_each_node_state(nid, N_MEMORY) {
65708c2ecf20Sopenharmony_ci			u64 size;
65718c2ecf20Sopenharmony_ci			struct lruvec *lruvec;
65728c2ecf20Sopenharmony_ci
65738c2ecf20Sopenharmony_ci			lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
65748c2ecf20Sopenharmony_ci			size = lruvec_page_state(lruvec, memory_stats[i].idx);
65758c2ecf20Sopenharmony_ci			size *= memory_stats[i].ratio;
65768c2ecf20Sopenharmony_ci			seq_printf(m, " N%d=%llu", nid, size);
65778c2ecf20Sopenharmony_ci		}
65788c2ecf20Sopenharmony_ci		seq_putc(m, '\n');
65798c2ecf20Sopenharmony_ci	}
65808c2ecf20Sopenharmony_ci
65818c2ecf20Sopenharmony_ci	return 0;
65828c2ecf20Sopenharmony_ci}
65838c2ecf20Sopenharmony_ci#endif
65848c2ecf20Sopenharmony_ci
65858c2ecf20Sopenharmony_cistatic int memory_oom_group_show(struct seq_file *m, void *v)
65868c2ecf20Sopenharmony_ci{
65878c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
65888c2ecf20Sopenharmony_ci
65898c2ecf20Sopenharmony_ci	seq_printf(m, "%d\n", memcg->oom_group);
65908c2ecf20Sopenharmony_ci
65918c2ecf20Sopenharmony_ci	return 0;
65928c2ecf20Sopenharmony_ci}
65938c2ecf20Sopenharmony_ci
65948c2ecf20Sopenharmony_cistatic ssize_t memory_oom_group_write(struct kernfs_open_file *of,
65958c2ecf20Sopenharmony_ci				      char *buf, size_t nbytes, loff_t off)
65968c2ecf20Sopenharmony_ci{
65978c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
65988c2ecf20Sopenharmony_ci	int ret, oom_group;
65998c2ecf20Sopenharmony_ci
66008c2ecf20Sopenharmony_ci	buf = strstrip(buf);
66018c2ecf20Sopenharmony_ci	if (!buf)
66028c2ecf20Sopenharmony_ci		return -EINVAL;
66038c2ecf20Sopenharmony_ci
66048c2ecf20Sopenharmony_ci	ret = kstrtoint(buf, 0, &oom_group);
66058c2ecf20Sopenharmony_ci	if (ret)
66068c2ecf20Sopenharmony_ci		return ret;
66078c2ecf20Sopenharmony_ci
66088c2ecf20Sopenharmony_ci	if (oom_group != 0 && oom_group != 1)
66098c2ecf20Sopenharmony_ci		return -EINVAL;
66108c2ecf20Sopenharmony_ci
66118c2ecf20Sopenharmony_ci	memcg->oom_group = oom_group;
66128c2ecf20Sopenharmony_ci
66138c2ecf20Sopenharmony_ci	return nbytes;
66148c2ecf20Sopenharmony_ci}
66158c2ecf20Sopenharmony_ci
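/*
 * cgroup v2 (default hierarchy) interface of the memory controller.
 * These entries show up in each cgroup directory as memory.current,
 * memory.min, memory.low, memory.high, memory.max, memory.events,
 * memory.events.local, memory.stat, memory.numa_stat and
 * memory.oom.group.
 */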
66168c2ecf20Sopenharmony_cistatic struct cftype memory_files[] = {
66178c2ecf20Sopenharmony_ci	{
66188c2ecf20Sopenharmony_ci		.name = "current",
66198c2ecf20Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
66208c2ecf20Sopenharmony_ci		.read_u64 = memory_current_read,
66218c2ecf20Sopenharmony_ci	},
66228c2ecf20Sopenharmony_ci	{
66238c2ecf20Sopenharmony_ci		.name = "min",
66248c2ecf20Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
66258c2ecf20Sopenharmony_ci		.seq_show = memory_min_show,
66268c2ecf20Sopenharmony_ci		.write = memory_min_write,
66278c2ecf20Sopenharmony_ci	},
66288c2ecf20Sopenharmony_ci	{
66298c2ecf20Sopenharmony_ci		.name = "low",
66308c2ecf20Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
66318c2ecf20Sopenharmony_ci		.seq_show = memory_low_show,
66328c2ecf20Sopenharmony_ci		.write = memory_low_write,
66338c2ecf20Sopenharmony_ci	},
66348c2ecf20Sopenharmony_ci	{
66358c2ecf20Sopenharmony_ci		.name = "high",
66368c2ecf20Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
66378c2ecf20Sopenharmony_ci		.seq_show = memory_high_show,
66388c2ecf20Sopenharmony_ci		.write = memory_high_write,
66398c2ecf20Sopenharmony_ci	},
66408c2ecf20Sopenharmony_ci	{
66418c2ecf20Sopenharmony_ci		.name = "max",
66428c2ecf20Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
66438c2ecf20Sopenharmony_ci		.seq_show = memory_max_show,
66448c2ecf20Sopenharmony_ci		.write = memory_max_write,
66458c2ecf20Sopenharmony_ci	},
66468c2ecf20Sopenharmony_ci	{
66478c2ecf20Sopenharmony_ci		.name = "events",
66488c2ecf20Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
66498c2ecf20Sopenharmony_ci		.file_offset = offsetof(struct mem_cgroup, events_file),
66508c2ecf20Sopenharmony_ci		.seq_show = memory_events_show,
66518c2ecf20Sopenharmony_ci	},
66528c2ecf20Sopenharmony_ci	{
66538c2ecf20Sopenharmony_ci		.name = "events.local",
66548c2ecf20Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT,
66558c2ecf20Sopenharmony_ci		.file_offset = offsetof(struct mem_cgroup, events_local_file),
66568c2ecf20Sopenharmony_ci		.seq_show = memory_events_local_show,
66578c2ecf20Sopenharmony_ci	},
66588c2ecf20Sopenharmony_ci	{
66598c2ecf20Sopenharmony_ci		.name = "stat",
66608c2ecf20Sopenharmony_ci		.seq_show = memory_stat_show,
66618c2ecf20Sopenharmony_ci	},
66628c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA
66638c2ecf20Sopenharmony_ci	{
66648c2ecf20Sopenharmony_ci		.name = "numa_stat",
66658c2ecf20Sopenharmony_ci		.seq_show = memory_numa_stat_show,
66668c2ecf20Sopenharmony_ci	},
66678c2ecf20Sopenharmony_ci#endif
66688c2ecf20Sopenharmony_ci	{
66698c2ecf20Sopenharmony_ci		.name = "oom.group",
66708c2ecf20Sopenharmony_ci		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
66718c2ecf20Sopenharmony_ci		.seq_show = memory_oom_group_show,
66728c2ecf20Sopenharmony_ci		.write = memory_oom_group_write,
66738c2ecf20Sopenharmony_ci	},
66748c2ecf20Sopenharmony_ci	{ }	/* terminate */
66758c2ecf20Sopenharmony_ci};
66768c2ecf20Sopenharmony_ci
66778c2ecf20Sopenharmony_cistruct cgroup_subsys memory_cgrp_subsys = {
66788c2ecf20Sopenharmony_ci	.css_alloc = mem_cgroup_css_alloc,
66798c2ecf20Sopenharmony_ci	.css_online = mem_cgroup_css_online,
66808c2ecf20Sopenharmony_ci	.css_offline = mem_cgroup_css_offline,
66818c2ecf20Sopenharmony_ci	.css_released = mem_cgroup_css_released,
66828c2ecf20Sopenharmony_ci	.css_free = mem_cgroup_css_free,
66838c2ecf20Sopenharmony_ci	.css_reset = mem_cgroup_css_reset,
66848c2ecf20Sopenharmony_ci	.can_attach = mem_cgroup_can_attach,
66858c2ecf20Sopenharmony_ci	.cancel_attach = mem_cgroup_cancel_attach,
66868c2ecf20Sopenharmony_ci	.post_attach = mem_cgroup_move_task,
66878c2ecf20Sopenharmony_ci	.bind = mem_cgroup_bind,
66888c2ecf20Sopenharmony_ci	.dfl_cftypes = memory_files,
66898c2ecf20Sopenharmony_ci	.legacy_cftypes = mem_cgroup_legacy_files,
66908c2ecf20Sopenharmony_ci	.early_init = 0,
66918c2ecf20Sopenharmony_ci};
66928c2ecf20Sopenharmony_ci
66938c2ecf20Sopenharmony_ci/*
66948c2ecf20Sopenharmony_ci * This function calculates an individual cgroup's effective
66958c2ecf20Sopenharmony_ci * protection which is derived from its own memory.min/low, its
66968c2ecf20Sopenharmony_ci * parent's and siblings' settings, as well as the actual memory
66978c2ecf20Sopenharmony_ci * distribution in the tree.
66988c2ecf20Sopenharmony_ci *
66998c2ecf20Sopenharmony_ci * The following rules apply to the effective protection values:
67008c2ecf20Sopenharmony_ci *
67018c2ecf20Sopenharmony_ci * 1. At the first level of reclaim, effective protection is equal to
67028c2ecf20Sopenharmony_ci *    the declared protection in memory.min and memory.low.
67038c2ecf20Sopenharmony_ci *
67048c2ecf20Sopenharmony_ci * 2. To enable safe delegation of the protection configuration, at
67058c2ecf20Sopenharmony_ci *    subsequent levels the effective protection is capped to the
67068c2ecf20Sopenharmony_ci *    parent's effective protection.
67078c2ecf20Sopenharmony_ci *
67088c2ecf20Sopenharmony_ci * 3. To make complex and dynamic subtrees easier to configure, the
67098c2ecf20Sopenharmony_ci *    user is allowed to overcommit the declared protection at a given
67108c2ecf20Sopenharmony_ci *    level. If that is the case, the parent's effective protection is
67118c2ecf20Sopenharmony_ci *    distributed to the children in proportion to how much protection
67128c2ecf20Sopenharmony_ci *    they have declared and how much of it they are utilizing.
67138c2ecf20Sopenharmony_ci *
67148c2ecf20Sopenharmony_ci *    This makes distribution proportional, but also work-conserving:
67158c2ecf20Sopenharmony_ci *    if one cgroup claims much more protection than the memory it
67168c2ecf20Sopenharmony_ci *    uses, the unused remainder is available to its siblings.
67178c2ecf20Sopenharmony_ci *
67188c2ecf20Sopenharmony_ci * 4. Conversely, when the declared protection is undercommitted at a
67198c2ecf20Sopenharmony_ci *    given level, the distribution of the larger parental protection
67208c2ecf20Sopenharmony_ci *    budget is NOT proportional. A cgroup's protection from a sibling
67218c2ecf20Sopenharmony_ci *    is capped to its own memory.min/low setting.
67228c2ecf20Sopenharmony_ci *
67238c2ecf20Sopenharmony_ci * 5. However, to allow protecting recursive subtrees from each other
67248c2ecf20Sopenharmony_ci *    without having to declare each individual cgroup's fixed share
67258c2ecf20Sopenharmony_ci *    of the ancestor's claim to protection, any unutilized -
67268c2ecf20Sopenharmony_ci *    "floating" - protection from up the tree is distributed in
67278c2ecf20Sopenharmony_ci *    proportion to each cgroup's *usage*. This makes the protection
67288c2ecf20Sopenharmony_ci *    neutral wrt sibling cgroups and lets them compete freely over
67298c2ecf20Sopenharmony_ci *    the shared parental protection budget, but it protects the
67308c2ecf20Sopenharmony_ci *    subtree as a whole from neighboring subtrees.
67318c2ecf20Sopenharmony_ci *
67328c2ecf20Sopenharmony_ci * Note that 4. and 5. are not in conflict: 4. is about protecting
67338c2ecf20Sopenharmony_ci * against immediate siblings whereas 5. is about protecting against
67348c2ecf20Sopenharmony_ci * neighboring subtrees.
67358c2ecf20Sopenharmony_ci */
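/*
 * Worked example with illustrative numbers: a parent with an effective
 * low of 10G has two children, A with low=8G using 6G and B with
 * low=6G using 5G. Utilized protection is min(usage, setting), i.e.
 * 6G + 5G = 11G, which exceeds the parent's 10G, so rule 3 applies and
 * each child receives a proportional share: A gets 6G * 10G / 11G
 * (~5.5G) and B gets 5G * 10G / 11G (~4.5G). Had the children together
 * utilized less than 10G, each would get min(usage, setting) outright
 * and, with the memory_recursive_prot mount option enabled,
 * additionally a share of the unclaimed remainder in proportion to its
 * unprotected usage (rule 5).
 */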
67368c2ecf20Sopenharmony_cistatic unsigned long effective_protection(unsigned long usage,
67378c2ecf20Sopenharmony_ci					  unsigned long parent_usage,
67388c2ecf20Sopenharmony_ci					  unsigned long setting,
67398c2ecf20Sopenharmony_ci					  unsigned long parent_effective,
67408c2ecf20Sopenharmony_ci					  unsigned long siblings_protected)
67418c2ecf20Sopenharmony_ci{
67428c2ecf20Sopenharmony_ci	unsigned long protected;
67438c2ecf20Sopenharmony_ci	unsigned long ep;
67448c2ecf20Sopenharmony_ci
67458c2ecf20Sopenharmony_ci	protected = min(usage, setting);
67468c2ecf20Sopenharmony_ci	/*
67478c2ecf20Sopenharmony_ci	 * If all cgroups at this level combined claim and use more
67488c2ecf20Sopenharmony_ci	 * protection than what the parent affords them, distribute
67498c2ecf20Sopenharmony_ci	 * shares in proportion to utilization.
67508c2ecf20Sopenharmony_ci	 *
67518c2ecf20Sopenharmony_ci	 * We are using actual utilization rather than the statically
67528c2ecf20Sopenharmony_ci	 * claimed protection in order to be work-conserving: claimed
67538c2ecf20Sopenharmony_ci	 * but unused protection is available to siblings that would
67548c2ecf20Sopenharmony_ci	 * otherwise get a smaller chunk than what they claimed.
67558c2ecf20Sopenharmony_ci	 */
67568c2ecf20Sopenharmony_ci	if (siblings_protected > parent_effective)
67578c2ecf20Sopenharmony_ci		return protected * parent_effective / siblings_protected;
67588c2ecf20Sopenharmony_ci
67598c2ecf20Sopenharmony_ci	/*
67608c2ecf20Sopenharmony_ci	 * Ok, utilized protection of all children is within what the
67618c2ecf20Sopenharmony_ci	 * parent affords them, so we know whatever this child claims
67628c2ecf20Sopenharmony_ci	 * and utilizes is effectively protected.
67638c2ecf20Sopenharmony_ci	 *
67648c2ecf20Sopenharmony_ci	 * If there is unprotected usage beyond this value, reclaim
67658c2ecf20Sopenharmony_ci	 * will apply pressure in proportion to that amount.
67668c2ecf20Sopenharmony_ci	 *
67678c2ecf20Sopenharmony_ci	 * If there is unutilized protection, the cgroup will be fully
67688c2ecf20Sopenharmony_ci	 * shielded from reclaim, but we do return a smaller value for
67698c2ecf20Sopenharmony_ci	 * protection than what the group could enjoy in theory. This
67708c2ecf20Sopenharmony_ci	 * is okay. With the overcommit distribution above, effective
67718c2ecf20Sopenharmony_ci	 * protection is always dependent on how memory is actually
67728c2ecf20Sopenharmony_ci	 * consumed among the siblings anyway.
67738c2ecf20Sopenharmony_ci	 */
67748c2ecf20Sopenharmony_ci	ep = protected;
67758c2ecf20Sopenharmony_ci
67768c2ecf20Sopenharmony_ci	/*
67778c2ecf20Sopenharmony_ci	 * If the children aren't claiming (all of) the protection
67788c2ecf20Sopenharmony_ci	 * afforded to them by the parent, distribute the remainder in
67798c2ecf20Sopenharmony_ci	 * proportion to the (unprotected) memory of each cgroup. That
67808c2ecf20Sopenharmony_ci	 * way, cgroups that aren't explicitly prioritized wrt each
67818c2ecf20Sopenharmony_ci	 * other compete freely over the allowance, but they are
67828c2ecf20Sopenharmony_ci	 * collectively protected from neighboring trees.
67838c2ecf20Sopenharmony_ci	 *
67848c2ecf20Sopenharmony_ci	 * We're using unprotected memory for the weight so that if
67858c2ecf20Sopenharmony_ci	 * some cgroups DO claim explicit protection, we don't protect
67868c2ecf20Sopenharmony_ci	 * the same bytes twice.
67878c2ecf20Sopenharmony_ci	 *
67888c2ecf20Sopenharmony_ci	 * Check both usage and parent_usage against the respective
67898c2ecf20Sopenharmony_ci	 * protected values. One should imply the other, but they
67908c2ecf20Sopenharmony_ci	 * aren't read atomically - make sure the division is sane.
67918c2ecf20Sopenharmony_ci	 */
67928c2ecf20Sopenharmony_ci	if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
67938c2ecf20Sopenharmony_ci		return ep;
67948c2ecf20Sopenharmony_ci	if (parent_effective > siblings_protected &&
67958c2ecf20Sopenharmony_ci	    parent_usage > siblings_protected &&
67968c2ecf20Sopenharmony_ci	    usage > protected) {
67978c2ecf20Sopenharmony_ci		unsigned long unclaimed;
67988c2ecf20Sopenharmony_ci
67998c2ecf20Sopenharmony_ci		unclaimed = parent_effective - siblings_protected;
68008c2ecf20Sopenharmony_ci		unclaimed *= usage - protected;
68018c2ecf20Sopenharmony_ci		unclaimed /= parent_usage - siblings_protected;
68028c2ecf20Sopenharmony_ci
68038c2ecf20Sopenharmony_ci		ep += unclaimed;
68048c2ecf20Sopenharmony_ci	}
68058c2ecf20Sopenharmony_ci
68068c2ecf20Sopenharmony_ci	return ep;
68078c2ecf20Sopenharmony_ci}
68088c2ecf20Sopenharmony_ci
68098c2ecf20Sopenharmony_ci/**
68108c2ecf20Sopenharmony_ci * mem_cgroup_calculate_protection - calculate a memcg's effective memory protection
68118c2ecf20Sopenharmony_ci * @root: the top ancestor of the sub-tree being checked
68128c2ecf20Sopenharmony_ci * @memcg: the memory cgroup to check
68138c2ecf20Sopenharmony_ci *
68148c2ecf20Sopenharmony_ci * WARNING: This function is not stateless! It can only be used as part
68158c2ecf20Sopenharmony_ci *          of a top-down tree iteration, not for isolated queries.
68168c2ecf20Sopenharmony_ci */
68178c2ecf20Sopenharmony_civoid mem_cgroup_calculate_protection(struct mem_cgroup *root,
68188c2ecf20Sopenharmony_ci				     struct mem_cgroup *memcg)
68198c2ecf20Sopenharmony_ci{
68208c2ecf20Sopenharmony_ci	unsigned long usage, parent_usage;
68218c2ecf20Sopenharmony_ci	struct mem_cgroup *parent;
68228c2ecf20Sopenharmony_ci
68238c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
68248c2ecf20Sopenharmony_ci		return;
68258c2ecf20Sopenharmony_ci
68268c2ecf20Sopenharmony_ci	if (!root)
68278c2ecf20Sopenharmony_ci		root = root_mem_cgroup;
68288c2ecf20Sopenharmony_ci
68298c2ecf20Sopenharmony_ci	/*
68308c2ecf20Sopenharmony_ci	 * Effective values of the reclaim targets are ignored so they
68318c2ecf20Sopenharmony_ci	 * can be stale. Have a look at mem_cgroup_protection for more
68328c2ecf20Sopenharmony_ci	 * details.
68338c2ecf20Sopenharmony_ci	 * TODO: calculation should be more robust so that we do not need
68348c2ecf20Sopenharmony_ci	 * that special casing.
68358c2ecf20Sopenharmony_ci	 */
68368c2ecf20Sopenharmony_ci	if (memcg == root)
68378c2ecf20Sopenharmony_ci		return;
68388c2ecf20Sopenharmony_ci
68398c2ecf20Sopenharmony_ci	usage = page_counter_read(&memcg->memory);
68408c2ecf20Sopenharmony_ci	if (!usage)
68418c2ecf20Sopenharmony_ci		return;
68428c2ecf20Sopenharmony_ci
68438c2ecf20Sopenharmony_ci	parent = parent_mem_cgroup(memcg);
68448c2ecf20Sopenharmony_ci	/* No parent means non-hierarchical mode on a v1 memcg */
68458c2ecf20Sopenharmony_ci	if (!parent)
68468c2ecf20Sopenharmony_ci		return;
68478c2ecf20Sopenharmony_ci
68488c2ecf20Sopenharmony_ci	if (parent == root) {
68498c2ecf20Sopenharmony_ci		memcg->memory.emin = READ_ONCE(memcg->memory.min);
68508c2ecf20Sopenharmony_ci		memcg->memory.elow = READ_ONCE(memcg->memory.low);
68518c2ecf20Sopenharmony_ci		return;
68528c2ecf20Sopenharmony_ci	}
68538c2ecf20Sopenharmony_ci
68548c2ecf20Sopenharmony_ci	parent_usage = page_counter_read(&parent->memory);
68558c2ecf20Sopenharmony_ci
68568c2ecf20Sopenharmony_ci	WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
68578c2ecf20Sopenharmony_ci			READ_ONCE(memcg->memory.min),
68588c2ecf20Sopenharmony_ci			READ_ONCE(parent->memory.emin),
68598c2ecf20Sopenharmony_ci			atomic_long_read(&parent->memory.children_min_usage)));
68608c2ecf20Sopenharmony_ci
68618c2ecf20Sopenharmony_ci	WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
68628c2ecf20Sopenharmony_ci			READ_ONCE(memcg->memory.low),
68638c2ecf20Sopenharmony_ci			READ_ONCE(parent->memory.elow),
68648c2ecf20Sopenharmony_ci			atomic_long_read(&parent->memory.children_low_usage)));
68658c2ecf20Sopenharmony_ci}
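/*
 * The emin/elow values computed above are consumed later in the same
 * top-down reclaim walk, via mem_cgroup_protection() and the
 * mem_cgroup_below_min()/mem_cgroup_below_low() helpers declared in
 * include/linux/memcontrol.h, to decide how much pressure to apply to
 * this memcg.
 */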
68668c2ecf20Sopenharmony_ci
68678c2ecf20Sopenharmony_ci/**
68688c2ecf20Sopenharmony_ci * mem_cgroup_charge - charge a newly allocated page to a cgroup
68698c2ecf20Sopenharmony_ci * @page: page to charge
68708c2ecf20Sopenharmony_ci * @mm: mm context of the victim
68718c2ecf20Sopenharmony_ci * @gfp_mask: reclaim mode
68728c2ecf20Sopenharmony_ci *
68738c2ecf20Sopenharmony_ci * Try to charge @page to the memcg that @mm belongs to, reclaiming
68748c2ecf20Sopenharmony_ci * pages according to @gfp_mask if necessary.
68758c2ecf20Sopenharmony_ci *
68768c2ecf20Sopenharmony_ci * Returns 0 on success. Otherwise, an error code is returned.
68778c2ecf20Sopenharmony_ci */
68788c2ecf20Sopenharmony_ciint mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
68798c2ecf20Sopenharmony_ci{
68808c2ecf20Sopenharmony_ci	unsigned int nr_pages = thp_nr_pages(page);
68818c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = NULL;
68828c2ecf20Sopenharmony_ci	int ret = 0;
68838c2ecf20Sopenharmony_ci
68848c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
68858c2ecf20Sopenharmony_ci		goto out;
68868c2ecf20Sopenharmony_ci
68878c2ecf20Sopenharmony_ci	if (PageSwapCache(page)) {
68888c2ecf20Sopenharmony_ci		swp_entry_t ent = { .val = page_private(page), };
68898c2ecf20Sopenharmony_ci		unsigned short id;
68908c2ecf20Sopenharmony_ci
68918c2ecf20Sopenharmony_ci		/*
68928c2ecf20Sopenharmony_ci		 * Every swap fault against a single page tries to charge the
68938c2ecf20Sopenharmony_ci		 * page, bail as early as possible.  shmem_unuse() encounters
68948c2ecf20Sopenharmony_ci		 * already charged pages, too.  page->mem_cgroup is protected
68958c2ecf20Sopenharmony_ci		 * by the page lock, which serializes swap cache removal, which
68968c2ecf20Sopenharmony_ci		 * in turn serializes uncharging.
68978c2ecf20Sopenharmony_ci		 */
68988c2ecf20Sopenharmony_ci		VM_BUG_ON_PAGE(!PageLocked(page), page);
68998c2ecf20Sopenharmony_ci		if (compound_head(page)->mem_cgroup)
69008c2ecf20Sopenharmony_ci			goto out;
69018c2ecf20Sopenharmony_ci
69028c2ecf20Sopenharmony_ci		id = lookup_swap_cgroup_id(ent);
69038c2ecf20Sopenharmony_ci		rcu_read_lock();
69048c2ecf20Sopenharmony_ci		memcg = mem_cgroup_from_id(id);
69058c2ecf20Sopenharmony_ci		if (memcg && !css_tryget_online(&memcg->css))
69068c2ecf20Sopenharmony_ci			memcg = NULL;
69078c2ecf20Sopenharmony_ci		rcu_read_unlock();
69088c2ecf20Sopenharmony_ci	}
69098c2ecf20Sopenharmony_ci
69108c2ecf20Sopenharmony_ci	if (!memcg)
69118c2ecf20Sopenharmony_ci		memcg = get_mem_cgroup_from_mm(mm);
69128c2ecf20Sopenharmony_ci
69138c2ecf20Sopenharmony_ci	ret = try_charge(memcg, gfp_mask, nr_pages);
69148c2ecf20Sopenharmony_ci	if (ret)
69158c2ecf20Sopenharmony_ci		goto out_put;
69168c2ecf20Sopenharmony_ci
69178c2ecf20Sopenharmony_ci	css_get(&memcg->css);
69188c2ecf20Sopenharmony_ci	commit_charge(page, memcg);
69198c2ecf20Sopenharmony_ci
69208c2ecf20Sopenharmony_ci	local_irq_disable();
69218c2ecf20Sopenharmony_ci	mem_cgroup_charge_statistics(memcg, page, nr_pages);
69228c2ecf20Sopenharmony_ci	memcg_check_events(memcg, page);
69238c2ecf20Sopenharmony_ci	local_irq_enable();
69248c2ecf20Sopenharmony_ci
69258c2ecf20Sopenharmony_ci	/*
69268c2ecf20Sopenharmony_ci	 * Cgroup1's unified memory+swap counter has been charged with the
69278c2ecf20Sopenharmony_ci	 * new swapcache page, finish the transfer by uncharging the swap
69288c2ecf20Sopenharmony_ci	 * slot. The swap slot would also get uncharged when it dies, but
69298c2ecf20Sopenharmony_ci	 * it can stick around indefinitely and we'd count the page twice
69308c2ecf20Sopenharmony_ci	 * the entire time.
69318c2ecf20Sopenharmony_ci	 *
69328c2ecf20Sopenharmony_ci	 * Cgroup2 has separate resource counters for memory and swap,
69338c2ecf20Sopenharmony_ci	 * so this is a non-issue here. Memory and swap charge lifetimes
69348c2ecf20Sopenharmony_ci	 * correspond 1:1 to page and swap slot lifetimes: we charge the
69358c2ecf20Sopenharmony_ci	 * page to memory here, and uncharge swap when the slot is freed.
69368c2ecf20Sopenharmony_ci	 */
69378c2ecf20Sopenharmony_ci	if (do_memsw_account() && PageSwapCache(page)) {
69388c2ecf20Sopenharmony_ci		swp_entry_t entry = { .val = page_private(page) };
69398c2ecf20Sopenharmony_ci		/*
69408c2ecf20Sopenharmony_ci		 * The swap entry might not get freed for a long time,
69418c2ecf20Sopenharmony_ci		 * let's not wait for it.  The page already received a
69428c2ecf20Sopenharmony_ci		 * memory+swap charge, drop the swap entry duplicate.
69438c2ecf20Sopenharmony_ci		 */
69448c2ecf20Sopenharmony_ci		mem_cgroup_uncharge_swap(entry, nr_pages);
69458c2ecf20Sopenharmony_ci	}
69468c2ecf20Sopenharmony_ci
69478c2ecf20Sopenharmony_ciout_put:
69488c2ecf20Sopenharmony_ci	css_put(&memcg->css);
69498c2ecf20Sopenharmony_ciout:
69508c2ecf20Sopenharmony_ci	return ret;
69518c2ecf20Sopenharmony_ci}
69528c2ecf20Sopenharmony_ci
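/*
 * Uncharging is batched: uncharge_page() gathers pages that belong to
 * the same memcg into a struct uncharge_gather, and uncharge_batch()
 * then applies the page_counter updates, statistics and events for the
 * whole run at once.
 */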
69538c2ecf20Sopenharmony_cistruct uncharge_gather {
69548c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
69558c2ecf20Sopenharmony_ci	unsigned long nr_pages;
69568c2ecf20Sopenharmony_ci	unsigned long pgpgout;
69578c2ecf20Sopenharmony_ci	unsigned long nr_kmem;
69588c2ecf20Sopenharmony_ci	struct page *dummy_page;
69598c2ecf20Sopenharmony_ci};
69608c2ecf20Sopenharmony_ci
69618c2ecf20Sopenharmony_cistatic inline void uncharge_gather_clear(struct uncharge_gather *ug)
69628c2ecf20Sopenharmony_ci{
69638c2ecf20Sopenharmony_ci	memset(ug, 0, sizeof(*ug));
69648c2ecf20Sopenharmony_ci}
69658c2ecf20Sopenharmony_ci
69668c2ecf20Sopenharmony_cistatic void uncharge_batch(const struct uncharge_gather *ug)
69678c2ecf20Sopenharmony_ci{
69688c2ecf20Sopenharmony_ci	unsigned long flags;
69698c2ecf20Sopenharmony_ci
69708c2ecf20Sopenharmony_ci	if (!mem_cgroup_is_root(ug->memcg)) {
69718c2ecf20Sopenharmony_ci		page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
69728c2ecf20Sopenharmony_ci		if (do_memsw_account())
69738c2ecf20Sopenharmony_ci			page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
69748c2ecf20Sopenharmony_ci		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
69758c2ecf20Sopenharmony_ci			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
69768c2ecf20Sopenharmony_ci		memcg_oom_recover(ug->memcg);
69778c2ecf20Sopenharmony_ci	}
69788c2ecf20Sopenharmony_ci
69798c2ecf20Sopenharmony_ci	local_irq_save(flags);
69808c2ecf20Sopenharmony_ci	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
69818c2ecf20Sopenharmony_ci	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
69828c2ecf20Sopenharmony_ci	memcg_check_events(ug->memcg, ug->dummy_page);
69838c2ecf20Sopenharmony_ci	local_irq_restore(flags);
69848c2ecf20Sopenharmony_ci
69858c2ecf20Sopenharmony_ci	/* drop reference from uncharge_page */
69868c2ecf20Sopenharmony_ci	css_put(&ug->memcg->css);
69878c2ecf20Sopenharmony_ci}
69888c2ecf20Sopenharmony_ci
69898c2ecf20Sopenharmony_cistatic void uncharge_page(struct page *page, struct uncharge_gather *ug)
69908c2ecf20Sopenharmony_ci{
69918c2ecf20Sopenharmony_ci	unsigned long nr_pages;
69928c2ecf20Sopenharmony_ci
69938c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(PageLRU(page), page);
69948c2ecf20Sopenharmony_ci
69958c2ecf20Sopenharmony_ci	if (!page->mem_cgroup)
69968c2ecf20Sopenharmony_ci		return;
69978c2ecf20Sopenharmony_ci
69988c2ecf20Sopenharmony_ci	/*
69998c2ecf20Sopenharmony_ci	 * Nobody should be changing or seriously looking at
70008c2ecf20Sopenharmony_ci	 * page->mem_cgroup at this point, we have fully
70018c2ecf20Sopenharmony_ci	 * exclusive access to the page.
70028c2ecf20Sopenharmony_ci	 */
70038c2ecf20Sopenharmony_ci
70048c2ecf20Sopenharmony_ci	if (ug->memcg != page->mem_cgroup) {
70058c2ecf20Sopenharmony_ci		if (ug->memcg) {
70068c2ecf20Sopenharmony_ci			uncharge_batch(ug);
70078c2ecf20Sopenharmony_ci			uncharge_gather_clear(ug);
70088c2ecf20Sopenharmony_ci		}
70098c2ecf20Sopenharmony_ci		ug->memcg = page->mem_cgroup;
70108c2ecf20Sopenharmony_ci
70118c2ecf20Sopenharmony_ci		/* pairs with css_put in uncharge_batch */
70128c2ecf20Sopenharmony_ci		css_get(&ug->memcg->css);
70138c2ecf20Sopenharmony_ci	}
70148c2ecf20Sopenharmony_ci
70158c2ecf20Sopenharmony_ci	nr_pages = compound_nr(page);
70168c2ecf20Sopenharmony_ci	ug->nr_pages += nr_pages;
70178c2ecf20Sopenharmony_ci
70188c2ecf20Sopenharmony_ci	if (!PageKmemcg(page)) {
70198c2ecf20Sopenharmony_ci		ug->pgpgout++;
70208c2ecf20Sopenharmony_ci	} else {
70218c2ecf20Sopenharmony_ci		ug->nr_kmem += nr_pages;
70228c2ecf20Sopenharmony_ci		__ClearPageKmemcg(page);
70238c2ecf20Sopenharmony_ci	}
70248c2ecf20Sopenharmony_ci
70258c2ecf20Sopenharmony_ci	ug->dummy_page = page;
70268c2ecf20Sopenharmony_ci	page->mem_cgroup = NULL;
70278c2ecf20Sopenharmony_ci	css_put(&ug->memcg->css);
70288c2ecf20Sopenharmony_ci}
70298c2ecf20Sopenharmony_ci
70308c2ecf20Sopenharmony_cistatic void uncharge_list(struct list_head *page_list)
70318c2ecf20Sopenharmony_ci{
70328c2ecf20Sopenharmony_ci	struct uncharge_gather ug;
70338c2ecf20Sopenharmony_ci	struct list_head *next;
70348c2ecf20Sopenharmony_ci
70358c2ecf20Sopenharmony_ci	uncharge_gather_clear(&ug);
70368c2ecf20Sopenharmony_ci
70378c2ecf20Sopenharmony_ci	/*
70388c2ecf20Sopenharmony_ci	 * Note that the list can be a single page->lru; hence the
70398c2ecf20Sopenharmony_ci	 * do-while loop instead of a simple list_for_each_entry().
70408c2ecf20Sopenharmony_ci	 */
70418c2ecf20Sopenharmony_ci	next = page_list->next;
70428c2ecf20Sopenharmony_ci	do {
70438c2ecf20Sopenharmony_ci		struct page *page;
70448c2ecf20Sopenharmony_ci
70458c2ecf20Sopenharmony_ci		page = list_entry(next, struct page, lru);
70468c2ecf20Sopenharmony_ci		next = page->lru.next;
70478c2ecf20Sopenharmony_ci
70488c2ecf20Sopenharmony_ci		uncharge_page(page, &ug);
70498c2ecf20Sopenharmony_ci	} while (next != page_list);
70508c2ecf20Sopenharmony_ci
70518c2ecf20Sopenharmony_ci	if (ug.memcg)
70528c2ecf20Sopenharmony_ci		uncharge_batch(&ug);
70538c2ecf20Sopenharmony_ci}
70548c2ecf20Sopenharmony_ci
70558c2ecf20Sopenharmony_ci/**
70568c2ecf20Sopenharmony_ci * mem_cgroup_uncharge - uncharge a page
70578c2ecf20Sopenharmony_ci * @page: page to uncharge
70588c2ecf20Sopenharmony_ci *
70598c2ecf20Sopenharmony_ci * Uncharge a page previously charged with mem_cgroup_charge().
70608c2ecf20Sopenharmony_ci */
70618c2ecf20Sopenharmony_civoid mem_cgroup_uncharge(struct page *page)
70628c2ecf20Sopenharmony_ci{
70638c2ecf20Sopenharmony_ci	struct uncharge_gather ug;
70648c2ecf20Sopenharmony_ci
70658c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
70668c2ecf20Sopenharmony_ci		return;
70678c2ecf20Sopenharmony_ci
70688c2ecf20Sopenharmony_ci	/* Don't touch page->lru of any random page, pre-check: */
70698c2ecf20Sopenharmony_ci	if (!page->mem_cgroup)
70708c2ecf20Sopenharmony_ci		return;
70718c2ecf20Sopenharmony_ci
70728c2ecf20Sopenharmony_ci	uncharge_gather_clear(&ug);
70738c2ecf20Sopenharmony_ci	uncharge_page(page, &ug);
70748c2ecf20Sopenharmony_ci	uncharge_batch(&ug);
70758c2ecf20Sopenharmony_ci}
70768c2ecf20Sopenharmony_ci
70778c2ecf20Sopenharmony_ci/**
70788c2ecf20Sopenharmony_ci * mem_cgroup_uncharge_list - uncharge a list of pages
70798c2ecf20Sopenharmony_ci * @page_list: list of pages to uncharge
70808c2ecf20Sopenharmony_ci *
70818c2ecf20Sopenharmony_ci * Uncharge a list of pages previously charged with
70828c2ecf20Sopenharmony_ci * mem_cgroup_charge().
70838c2ecf20Sopenharmony_ci */
70848c2ecf20Sopenharmony_civoid mem_cgroup_uncharge_list(struct list_head *page_list)
70858c2ecf20Sopenharmony_ci{
70868c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
70878c2ecf20Sopenharmony_ci		return;
70888c2ecf20Sopenharmony_ci
70898c2ecf20Sopenharmony_ci	if (!list_empty(page_list))
70908c2ecf20Sopenharmony_ci		uncharge_list(page_list);
70918c2ecf20Sopenharmony_ci}
70928c2ecf20Sopenharmony_ci
70938c2ecf20Sopenharmony_ci/**
70948c2ecf20Sopenharmony_ci * mem_cgroup_migrate - charge a page's replacement
70958c2ecf20Sopenharmony_ci * @oldpage: currently circulating page
70968c2ecf20Sopenharmony_ci * @newpage: replacement page
70978c2ecf20Sopenharmony_ci *
70988c2ecf20Sopenharmony_ci * Charge @newpage as a replacement page for @oldpage. @oldpage will
70998c2ecf20Sopenharmony_ci * be uncharged upon free.
71008c2ecf20Sopenharmony_ci *
71018c2ecf20Sopenharmony_ci * Both pages must be locked, @newpage->mapping must be set up.
71028c2ecf20Sopenharmony_ci */
71038c2ecf20Sopenharmony_civoid mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
71048c2ecf20Sopenharmony_ci{
71058c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
71068c2ecf20Sopenharmony_ci	unsigned int nr_pages;
71078c2ecf20Sopenharmony_ci	unsigned long flags;
71088c2ecf20Sopenharmony_ci
71098c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
71108c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
71118c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
71128c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
71138c2ecf20Sopenharmony_ci		       newpage);
71148c2ecf20Sopenharmony_ci
71158c2ecf20Sopenharmony_ci	if (mem_cgroup_disabled())
71168c2ecf20Sopenharmony_ci		return;
71178c2ecf20Sopenharmony_ci
71188c2ecf20Sopenharmony_ci	/* Page cache replacement: new page already charged? */
71198c2ecf20Sopenharmony_ci	if (newpage->mem_cgroup)
71208c2ecf20Sopenharmony_ci		return;
71218c2ecf20Sopenharmony_ci
71228c2ecf20Sopenharmony_ci	/* Swapcache readahead pages can get replaced before being charged */
71238c2ecf20Sopenharmony_ci	memcg = oldpage->mem_cgroup;
71248c2ecf20Sopenharmony_ci	if (!memcg)
71258c2ecf20Sopenharmony_ci		return;
71268c2ecf20Sopenharmony_ci
71278c2ecf20Sopenharmony_ci	/* Force-charge the new page. The old one will be freed soon */
71288c2ecf20Sopenharmony_ci	nr_pages = thp_nr_pages(newpage);
71298c2ecf20Sopenharmony_ci
71308c2ecf20Sopenharmony_ci	page_counter_charge(&memcg->memory, nr_pages);
71318c2ecf20Sopenharmony_ci	if (do_memsw_account())
71328c2ecf20Sopenharmony_ci		page_counter_charge(&memcg->memsw, nr_pages);
71338c2ecf20Sopenharmony_ci
71348c2ecf20Sopenharmony_ci	css_get(&memcg->css);
71358c2ecf20Sopenharmony_ci	commit_charge(newpage, memcg);
71368c2ecf20Sopenharmony_ci
71378c2ecf20Sopenharmony_ci	local_irq_save(flags);
71388c2ecf20Sopenharmony_ci	mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
71398c2ecf20Sopenharmony_ci	memcg_check_events(memcg, newpage);
71408c2ecf20Sopenharmony_ci	local_irq_restore(flags);
71418c2ecf20Sopenharmony_ci}
71428c2ecf20Sopenharmony_ci
71438c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
71448c2ecf20Sopenharmony_ciEXPORT_SYMBOL(memcg_sockets_enabled_key);
71458c2ecf20Sopenharmony_ci
71468c2ecf20Sopenharmony_civoid mem_cgroup_sk_alloc(struct sock *sk)
71478c2ecf20Sopenharmony_ci{
71488c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
71498c2ecf20Sopenharmony_ci
71508c2ecf20Sopenharmony_ci	if (!mem_cgroup_sockets_enabled)
71518c2ecf20Sopenharmony_ci		return;
71528c2ecf20Sopenharmony_ci
71538c2ecf20Sopenharmony_ci	/* Do not associate the sock with unrelated interrupted task's memcg. */
71548c2ecf20Sopenharmony_ci	if (in_interrupt())
71558c2ecf20Sopenharmony_ci		return;
71568c2ecf20Sopenharmony_ci
71578c2ecf20Sopenharmony_ci	rcu_read_lock();
71588c2ecf20Sopenharmony_ci	memcg = mem_cgroup_from_task(current);
71598c2ecf20Sopenharmony_ci	if (memcg == root_mem_cgroup)
71608c2ecf20Sopenharmony_ci		goto out;
71618c2ecf20Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
71628c2ecf20Sopenharmony_ci		goto out;
71638c2ecf20Sopenharmony_ci	if (css_tryget(&memcg->css))
71648c2ecf20Sopenharmony_ci		sk->sk_memcg = memcg;
71658c2ecf20Sopenharmony_ciout:
71668c2ecf20Sopenharmony_ci	rcu_read_unlock();
71678c2ecf20Sopenharmony_ci}
71688c2ecf20Sopenharmony_ci
71698c2ecf20Sopenharmony_civoid mem_cgroup_sk_free(struct sock *sk)
71708c2ecf20Sopenharmony_ci{
71718c2ecf20Sopenharmony_ci	if (sk->sk_memcg)
71728c2ecf20Sopenharmony_ci		css_put(&sk->sk_memcg->css);
71738c2ecf20Sopenharmony_ci}
71748c2ecf20Sopenharmony_ci
71758c2ecf20Sopenharmony_ci/**
71768c2ecf20Sopenharmony_ci * mem_cgroup_charge_skmem - charge socket memory
71778c2ecf20Sopenharmony_ci * @memcg: memcg to charge
71788c2ecf20Sopenharmony_ci * @nr_pages: number of pages to charge
71798c2ecf20Sopenharmony_ci *
71808c2ecf20Sopenharmony_ci * Charges @nr_pages to @memcg. Returns %true if the charge fit within
71818c2ecf20Sopenharmony_ci * @memcg's configured limit, %false if the charge had to be forced.
71828c2ecf20Sopenharmony_ci */
71838c2ecf20Sopenharmony_cibool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
71848c2ecf20Sopenharmony_ci{
71858c2ecf20Sopenharmony_ci	gfp_t gfp_mask = GFP_KERNEL;
71868c2ecf20Sopenharmony_ci
71878c2ecf20Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
71888c2ecf20Sopenharmony_ci		struct page_counter *fail;
71898c2ecf20Sopenharmony_ci
71908c2ecf20Sopenharmony_ci		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
71918c2ecf20Sopenharmony_ci			memcg->tcpmem_pressure = 0;
71928c2ecf20Sopenharmony_ci			return true;
71938c2ecf20Sopenharmony_ci		}
71948c2ecf20Sopenharmony_ci		page_counter_charge(&memcg->tcpmem, nr_pages);
71958c2ecf20Sopenharmony_ci		memcg->tcpmem_pressure = 1;
71968c2ecf20Sopenharmony_ci		return false;
71978c2ecf20Sopenharmony_ci	}
71988c2ecf20Sopenharmony_ci
71998c2ecf20Sopenharmony_ci	/* Don't block in the packet receive path */
72008c2ecf20Sopenharmony_ci	if (in_softirq())
72018c2ecf20Sopenharmony_ci		gfp_mask = GFP_NOWAIT;
72028c2ecf20Sopenharmony_ci
72038c2ecf20Sopenharmony_ci	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
72048c2ecf20Sopenharmony_ci
72058c2ecf20Sopenharmony_ci	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
72068c2ecf20Sopenharmony_ci		return true;
72078c2ecf20Sopenharmony_ci
72088c2ecf20Sopenharmony_ci	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
72098c2ecf20Sopenharmony_ci	return false;
72108c2ecf20Sopenharmony_ci}
72118c2ecf20Sopenharmony_ci
72128c2ecf20Sopenharmony_ci/**
72138c2ecf20Sopenharmony_ci * mem_cgroup_uncharge_skmem - uncharge socket memory
72148c2ecf20Sopenharmony_ci * @memcg: memcg to uncharge
72158c2ecf20Sopenharmony_ci * @nr_pages: number of pages to uncharge
72168c2ecf20Sopenharmony_ci */
72178c2ecf20Sopenharmony_civoid mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
72188c2ecf20Sopenharmony_ci{
72198c2ecf20Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
72208c2ecf20Sopenharmony_ci		page_counter_uncharge(&memcg->tcpmem, nr_pages);
72218c2ecf20Sopenharmony_ci		return;
72228c2ecf20Sopenharmony_ci	}
72238c2ecf20Sopenharmony_ci
72248c2ecf20Sopenharmony_ci	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
72258c2ecf20Sopenharmony_ci
72268c2ecf20Sopenharmony_ci	refill_stock(memcg, nr_pages);
72278c2ecf20Sopenharmony_ci}
72288c2ecf20Sopenharmony_ci
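/*
 * "cgroup.memory=" boot parameter. Tokens are comma-separated; this
 * version understands "nosocket", "nokmem" and "kmem", e.g.
 * (illustrative): cgroup.memory=nokmem,nosocket
 */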
72298c2ecf20Sopenharmony_cistatic int __init cgroup_memory(char *s)
72308c2ecf20Sopenharmony_ci{
72318c2ecf20Sopenharmony_ci	char *token;
72328c2ecf20Sopenharmony_ci
72338c2ecf20Sopenharmony_ci	while ((token = strsep(&s, ",")) != NULL) {
72348c2ecf20Sopenharmony_ci		if (!*token)
72358c2ecf20Sopenharmony_ci			continue;
72368c2ecf20Sopenharmony_ci		if (!strcmp(token, "nosocket"))
72378c2ecf20Sopenharmony_ci			cgroup_memory_nosocket = true;
72388c2ecf20Sopenharmony_ci		if (!strcmp(token, "nokmem"))
72398c2ecf20Sopenharmony_ci			cgroup_memory_nokmem = true;
72408c2ecf20Sopenharmony_ci		else if (!strcmp(token, "kmem"))
72418c2ecf20Sopenharmony_ci			cgroup_memory_nokmem = false;
72428c2ecf20Sopenharmony_ci	}
72438c2ecf20Sopenharmony_ci	return 1;
72448c2ecf20Sopenharmony_ci}
72458c2ecf20Sopenharmony_ci__setup("cgroup.memory=", cgroup_memory);
72468c2ecf20Sopenharmony_ci
72478c2ecf20Sopenharmony_ci/*
72488c2ecf20Sopenharmony_ci * subsys_initcall() for memory controller.
72498c2ecf20Sopenharmony_ci *
72508c2ecf20Sopenharmony_ci * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
72518c2ecf20Sopenharmony_ci * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
72528c2ecf20Sopenharmony_ci * basically everything that doesn't depend on a specific mem_cgroup structure
72538c2ecf20Sopenharmony_ci * should be initialized from here.
72548c2ecf20Sopenharmony_ci */
72558c2ecf20Sopenharmony_cistatic int __init mem_cgroup_init(void)
72568c2ecf20Sopenharmony_ci{
72578c2ecf20Sopenharmony_ci	int cpu, node;
72588c2ecf20Sopenharmony_ci
72598c2ecf20Sopenharmony_ci	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
72608c2ecf20Sopenharmony_ci				  memcg_hotplug_cpu_dead);
72618c2ecf20Sopenharmony_ci
72628c2ecf20Sopenharmony_ci	for_each_possible_cpu(cpu)
72638c2ecf20Sopenharmony_ci		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
72648c2ecf20Sopenharmony_ci			  drain_local_stock);
72658c2ecf20Sopenharmony_ci
72668c2ecf20Sopenharmony_ci	for_each_node(node) {
72678c2ecf20Sopenharmony_ci		struct mem_cgroup_tree_per_node *rtpn;
72688c2ecf20Sopenharmony_ci
72698c2ecf20Sopenharmony_ci		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
72708c2ecf20Sopenharmony_ci				    node_online(node) ? node : NUMA_NO_NODE);
72718c2ecf20Sopenharmony_ci
72728c2ecf20Sopenharmony_ci		rtpn->rb_root = RB_ROOT;
72738c2ecf20Sopenharmony_ci		rtpn->rb_rightmost = NULL;
72748c2ecf20Sopenharmony_ci		spin_lock_init(&rtpn->lock);
72758c2ecf20Sopenharmony_ci		soft_limit_tree.rb_tree_per_node[node] = rtpn;
72768c2ecf20Sopenharmony_ci	}
72778c2ecf20Sopenharmony_ci
72788c2ecf20Sopenharmony_ci	return 0;
72798c2ecf20Sopenharmony_ci}
72808c2ecf20Sopenharmony_cisubsys_initcall(mem_cgroup_init);
72818c2ecf20Sopenharmony_ci
72828c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_SWAP
72838c2ecf20Sopenharmony_cistatic struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
72848c2ecf20Sopenharmony_ci{
72858c2ecf20Sopenharmony_ci	while (!refcount_inc_not_zero(&memcg->id.ref)) {
72868c2ecf20Sopenharmony_ci		/*
72878c2ecf20Sopenharmony_ci		 * The root cgroup cannot be destroyed, so its refcount must
72888c2ecf20Sopenharmony_ci		 * always be >= 1.
72898c2ecf20Sopenharmony_ci		 */
72908c2ecf20Sopenharmony_ci		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
72918c2ecf20Sopenharmony_ci			VM_BUG_ON(1);
72928c2ecf20Sopenharmony_ci			break;
72938c2ecf20Sopenharmony_ci		}
72948c2ecf20Sopenharmony_ci		memcg = parent_mem_cgroup(memcg);
72958c2ecf20Sopenharmony_ci		if (!memcg)
72968c2ecf20Sopenharmony_ci			memcg = root_mem_cgroup;
72978c2ecf20Sopenharmony_ci	}
72988c2ecf20Sopenharmony_ci	return memcg;
72998c2ecf20Sopenharmony_ci}
73008c2ecf20Sopenharmony_ci
73018c2ecf20Sopenharmony_ci/**
73028c2ecf20Sopenharmony_ci * mem_cgroup_swapout - transfer a memsw charge to swap
73038c2ecf20Sopenharmony_ci * @page: page whose memsw charge to transfer
73048c2ecf20Sopenharmony_ci * @entry: swap entry to move the charge to
73058c2ecf20Sopenharmony_ci *
73068c2ecf20Sopenharmony_ci * Transfer the memsw charge of @page to @entry.
73078c2ecf20Sopenharmony_ci */
73088c2ecf20Sopenharmony_civoid mem_cgroup_swapout(struct page *page, swp_entry_t entry)
73098c2ecf20Sopenharmony_ci{
73108c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg, *swap_memcg;
73118c2ecf20Sopenharmony_ci	unsigned int nr_entries;
73128c2ecf20Sopenharmony_ci	unsigned short oldid;
73138c2ecf20Sopenharmony_ci
73148c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(PageLRU(page), page);
73158c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(page_count(page), page);
73168c2ecf20Sopenharmony_ci
73178c2ecf20Sopenharmony_ci	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
73188c2ecf20Sopenharmony_ci		return;
73198c2ecf20Sopenharmony_ci
73208c2ecf20Sopenharmony_ci	memcg = page->mem_cgroup;
73218c2ecf20Sopenharmony_ci
73228c2ecf20Sopenharmony_ci	/* Readahead page, never charged */
73238c2ecf20Sopenharmony_ci	if (!memcg)
73248c2ecf20Sopenharmony_ci		return;
73258c2ecf20Sopenharmony_ci
73268c2ecf20Sopenharmony_ci	/*
73278c2ecf20Sopenharmony_ci	 * In case the memcg owning these pages has been offlined and doesn't
73288c2ecf20Sopenharmony_ci	 * have an ID allocated to it anymore, charge the closest online
73298c2ecf20Sopenharmony_ci	 * ancestor for the swap instead and transfer the memory+swap charge.
73308c2ecf20Sopenharmony_ci	 */
73318c2ecf20Sopenharmony_ci	swap_memcg = mem_cgroup_id_get_online(memcg);
73328c2ecf20Sopenharmony_ci	nr_entries = thp_nr_pages(page);
73338c2ecf20Sopenharmony_ci	/* Get references for the tail pages, too */
73348c2ecf20Sopenharmony_ci	if (nr_entries > 1)
73358c2ecf20Sopenharmony_ci		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
73368c2ecf20Sopenharmony_ci	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
73378c2ecf20Sopenharmony_ci				   nr_entries);
73388c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(oldid, page);
73398c2ecf20Sopenharmony_ci	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
73408c2ecf20Sopenharmony_ci
73418c2ecf20Sopenharmony_ci	page->mem_cgroup = NULL;
73428c2ecf20Sopenharmony_ci
73438c2ecf20Sopenharmony_ci	if (!mem_cgroup_is_root(memcg))
73448c2ecf20Sopenharmony_ci		page_counter_uncharge(&memcg->memory, nr_entries);
73458c2ecf20Sopenharmony_ci
73468c2ecf20Sopenharmony_ci	if (!cgroup_memory_noswap && memcg != swap_memcg) {
73478c2ecf20Sopenharmony_ci		if (!mem_cgroup_is_root(swap_memcg))
73488c2ecf20Sopenharmony_ci			page_counter_charge(&swap_memcg->memsw, nr_entries);
73498c2ecf20Sopenharmony_ci		page_counter_uncharge(&memcg->memsw, nr_entries);
73508c2ecf20Sopenharmony_ci	}
73518c2ecf20Sopenharmony_ci
73528c2ecf20Sopenharmony_ci	/*
73538c2ecf20Sopenharmony_ci	 * Interrupts should be disabled here because the caller holds the
73548c2ecf20Sopenharmony_ci	 * i_pages lock, which is taken with interrupts off. Keeping
73558c2ecf20Sopenharmony_ci	 * interrupts disabled matters because it is the only
73568c2ecf20Sopenharmony_ci	 * synchronisation we have for updating the per-CPU variables.
73578c2ecf20Sopenharmony_ci	 */
73588c2ecf20Sopenharmony_ci	VM_BUG_ON(!irqs_disabled());
73598c2ecf20Sopenharmony_ci	mem_cgroup_charge_statistics(memcg, page, -nr_entries);
73608c2ecf20Sopenharmony_ci	memcg_check_events(memcg, page);
73618c2ecf20Sopenharmony_ci
73628c2ecf20Sopenharmony_ci	css_put(&memcg->css);
73638c2ecf20Sopenharmony_ci}
73648c2ecf20Sopenharmony_ci
73658c2ecf20Sopenharmony_ci/**
73668c2ecf20Sopenharmony_ci * mem_cgroup_try_charge_swap - try charging swap space for a page
73678c2ecf20Sopenharmony_ci * @page: page being added to swap
73688c2ecf20Sopenharmony_ci * @entry: swap entry to charge
73698c2ecf20Sopenharmony_ci *
73708c2ecf20Sopenharmony_ci * Try to charge @page's memcg for the swap space at @entry.
73718c2ecf20Sopenharmony_ci *
73728c2ecf20Sopenharmony_ci * Returns 0 on success, -ENOMEM on failure.
73738c2ecf20Sopenharmony_ci */
73748c2ecf20Sopenharmony_ciint mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
73758c2ecf20Sopenharmony_ci{
73768c2ecf20Sopenharmony_ci	unsigned int nr_pages = thp_nr_pages(page);
73778c2ecf20Sopenharmony_ci	struct page_counter *counter;
73788c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
73798c2ecf20Sopenharmony_ci	unsigned short oldid;
73808c2ecf20Sopenharmony_ci
73818c2ecf20Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
73828c2ecf20Sopenharmony_ci		return 0;
73838c2ecf20Sopenharmony_ci
73848c2ecf20Sopenharmony_ci	memcg = page->mem_cgroup;
73858c2ecf20Sopenharmony_ci
73868c2ecf20Sopenharmony_ci	/* Readahead page, never charged */
73878c2ecf20Sopenharmony_ci	if (!memcg)
73888c2ecf20Sopenharmony_ci		return 0;
73898c2ecf20Sopenharmony_ci
73908c2ecf20Sopenharmony_ci	if (!entry.val) {
73918c2ecf20Sopenharmony_ci		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
73928c2ecf20Sopenharmony_ci		return 0;
73938c2ecf20Sopenharmony_ci	}
73948c2ecf20Sopenharmony_ci
73958c2ecf20Sopenharmony_ci	memcg = mem_cgroup_id_get_online(memcg);
73968c2ecf20Sopenharmony_ci
73978c2ecf20Sopenharmony_ci	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
73988c2ecf20Sopenharmony_ci	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
73998c2ecf20Sopenharmony_ci		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
74008c2ecf20Sopenharmony_ci		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
74018c2ecf20Sopenharmony_ci		mem_cgroup_id_put(memcg);
74028c2ecf20Sopenharmony_ci		return -ENOMEM;
74038c2ecf20Sopenharmony_ci	}
74048c2ecf20Sopenharmony_ci
74058c2ecf20Sopenharmony_ci	/* Get references for the tail pages, too */
74068c2ecf20Sopenharmony_ci	if (nr_pages > 1)
74078c2ecf20Sopenharmony_ci		mem_cgroup_id_get_many(memcg, nr_pages - 1);
74088c2ecf20Sopenharmony_ci	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
74098c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(oldid, page);
74108c2ecf20Sopenharmony_ci	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
74118c2ecf20Sopenharmony_ci
74128c2ecf20Sopenharmony_ci	return 0;
74138c2ecf20Sopenharmony_ci}
74148c2ecf20Sopenharmony_ci
74158c2ecf20Sopenharmony_ci/**
74168c2ecf20Sopenharmony_ci * mem_cgroup_uncharge_swap - uncharge swap space
74178c2ecf20Sopenharmony_ci * @entry: swap entry to uncharge
74188c2ecf20Sopenharmony_ci * @nr_pages: the amount of swap space to uncharge
74198c2ecf20Sopenharmony_ci */
74208c2ecf20Sopenharmony_civoid mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
74218c2ecf20Sopenharmony_ci{
74228c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
74238c2ecf20Sopenharmony_ci	unsigned short id;
74248c2ecf20Sopenharmony_ci
74258c2ecf20Sopenharmony_ci	id = swap_cgroup_record(entry, 0, nr_pages);
74268c2ecf20Sopenharmony_ci	rcu_read_lock();
74278c2ecf20Sopenharmony_ci	memcg = mem_cgroup_from_id(id);
74288c2ecf20Sopenharmony_ci	if (memcg) {
74298c2ecf20Sopenharmony_ci		if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
74308c2ecf20Sopenharmony_ci			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
74318c2ecf20Sopenharmony_ci				page_counter_uncharge(&memcg->swap, nr_pages);
74328c2ecf20Sopenharmony_ci			else
74338c2ecf20Sopenharmony_ci				page_counter_uncharge(&memcg->memsw, nr_pages);
74348c2ecf20Sopenharmony_ci		}
74358c2ecf20Sopenharmony_ci		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
74368c2ecf20Sopenharmony_ci		mem_cgroup_id_put_many(memcg, nr_pages);
74378c2ecf20Sopenharmony_ci	}
74388c2ecf20Sopenharmony_ci	rcu_read_unlock();
74398c2ecf20Sopenharmony_ci}
74408c2ecf20Sopenharmony_ci
74418c2ecf20Sopenharmony_cilong mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
74428c2ecf20Sopenharmony_ci{
74438c2ecf20Sopenharmony_ci	long nr_swap_pages = get_nr_swap_pages();
74448c2ecf20Sopenharmony_ci
74458c2ecf20Sopenharmony_ci	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
74468c2ecf20Sopenharmony_ci		return nr_swap_pages;
74478c2ecf20Sopenharmony_ci	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
74488c2ecf20Sopenharmony_ci		nr_swap_pages = min_t(long, nr_swap_pages,
74498c2ecf20Sopenharmony_ci				      READ_ONCE(memcg->swap.max) -
74508c2ecf20Sopenharmony_ci				      page_counter_read(&memcg->swap));
74518c2ecf20Sopenharmony_ci	return nr_swap_pages;
74528c2ecf20Sopenharmony_ci}
74538c2ecf20Sopenharmony_ci
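/*
 * A cgroup's swap is considered "full" once usage crosses half of
 * swap.high or swap.max anywhere up the hierarchy, roughly mirroring
 * the global vm_swap_full() heuristic.
 */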
74548c2ecf20Sopenharmony_cibool mem_cgroup_swap_full(struct page *page)
74558c2ecf20Sopenharmony_ci{
74568c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg;
74578c2ecf20Sopenharmony_ci
74588c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(!PageLocked(page), page);
74598c2ecf20Sopenharmony_ci
74608c2ecf20Sopenharmony_ci	if (vm_swap_full())
74618c2ecf20Sopenharmony_ci		return true;
74628c2ecf20Sopenharmony_ci	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
74638c2ecf20Sopenharmony_ci		return false;
74648c2ecf20Sopenharmony_ci
74658c2ecf20Sopenharmony_ci	memcg = page->mem_cgroup;
74668c2ecf20Sopenharmony_ci	if (!memcg)
74678c2ecf20Sopenharmony_ci		return false;
74688c2ecf20Sopenharmony_ci
74698c2ecf20Sopenharmony_ci	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
74708c2ecf20Sopenharmony_ci		unsigned long usage = page_counter_read(&memcg->swap);
74718c2ecf20Sopenharmony_ci
74728c2ecf20Sopenharmony_ci		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
74738c2ecf20Sopenharmony_ci		    usage * 2 >= READ_ONCE(memcg->swap.max))
74748c2ecf20Sopenharmony_ci			return true;
74758c2ecf20Sopenharmony_ci	}
74768c2ecf20Sopenharmony_ci
74778c2ecf20Sopenharmony_ci	return false;
74788c2ecf20Sopenharmony_ci}
74798c2ecf20Sopenharmony_ci
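/*
 * Legacy "swapaccount=" boot parameter: "swapaccount=0" disables swap
 * accounting (cgroup_memory_noswap = 1), "swapaccount=1" enables it.
 * Any other value is silently ignored.
 */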
74808c2ecf20Sopenharmony_cistatic int __init setup_swap_account(char *s)
74818c2ecf20Sopenharmony_ci{
74828c2ecf20Sopenharmony_ci	if (!strcmp(s, "1"))
74838c2ecf20Sopenharmony_ci		cgroup_memory_noswap = 0;
74848c2ecf20Sopenharmony_ci	else if (!strcmp(s, "0"))
74858c2ecf20Sopenharmony_ci		cgroup_memory_noswap = 1;
74868c2ecf20Sopenharmony_ci	return 1;
74878c2ecf20Sopenharmony_ci}
74888c2ecf20Sopenharmony_ci__setup("swapaccount=", setup_swap_account);
74898c2ecf20Sopenharmony_ci
74908c2ecf20Sopenharmony_cistatic u64 swap_current_read(struct cgroup_subsys_state *css,
74918c2ecf20Sopenharmony_ci			     struct cftype *cft)
74928c2ecf20Sopenharmony_ci{
74938c2ecf20Sopenharmony_ci	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
74948c2ecf20Sopenharmony_ci
74958c2ecf20Sopenharmony_ci	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
74968c2ecf20Sopenharmony_ci}
74978c2ecf20Sopenharmony_ci
static int swap_high_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
}

static ssize_t swap_high_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	page_counter_set_high(&memcg->swap, high);

	return nbytes;
}

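/*
 * Read and write handlers for memory.swap.max, the hard limit on the
 * cgroup's swap usage.  As with swap.high, a write such as "1G" sets the
 * limit and "max" removes it.
 */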
static int swap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}

static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->swap.max, max);

	return nbytes;
}

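/*
 * memory.swap.events: how often swap.high was exceeded, how often swap.max
 * was hit, and how many swap charge attempts failed (e.g. because no swap
 * entry could be allocated).
 */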
static int swap_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "high %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
	seq_printf(m, "max %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
	seq_printf(m, "fail %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));

	return 0;
}

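/*
 * Swap control files for the default hierarchy (cgroup v2), exposed as
 * memory.swap.* and registered via cgroup_add_dfl_cftypes() in
 * mem_cgroup_swap_init().
 */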
static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_high_show,
		.write = swap_high_write,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};

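/*
 * Legacy (cgroup v1) combined memory+swap accounting files, exposed as
 * memory.memsw.* and registered via cgroup_add_legacy_cftypes() in
 * mem_cgroup_swap_init().
 */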
static struct cftype memsw_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

/*
 * If mem_cgroup_swap_init() were a subsys_initcall() instead of a
 * core_initcall(), cgroup_memory_noswap could remain false even when memcg
 * is disabled via the "cgroup_disable=memory" boot parameter, which may
 * lead to a premature oops in mem_cgroup_get_nr_swap_pages() in corner
 * cases.
 */
static int __init mem_cgroup_swap_init(void)
{
	/* No memory control -> no swap control */
	if (mem_cgroup_disabled())
		cgroup_memory_noswap = true;

	if (cgroup_memory_noswap)
		return 0;

	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));

	return 0;
}
core_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_MEMCG_SWAP */