// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>
#include <linux/zswapd.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled */
static bool cgroup_memory_nokmem = true;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
bool cgroup_memory_noswap __read_mostly;
#else
#define cgroup_memory_noswap		1
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	struct mem_cgroup *memcg;
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * As a result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	spin_lock_irqsave(&objcg_lock, flags);
	memcg = obj_cgroup_memcg(objcg);
	if (nr_pages)
		__memcg_kmem_uncharge(memcg, nr_pages);
	list_del(&objcg->list);
	mem_cgroup_put(memcg);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* Move active objcg to the parent's list */
	xchg(&objcg->memcg, parent);
	css_get(&parent->css);
	list_add(&objcg->list, &parent->objcg_list);

	/* Move already reparented objcgs to the parent's list */
	list_for_each_entry(iter, &memcg->objcg_list, list) {
		css_get(&parent->css);
		xchg(&iter->memcg, parent);
		css_put(&memcg->css);
	}
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * This will be used as a shrinker list's index.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and such to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif

static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);

static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
{
	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
}

static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
					 int size, int old_size)
{
	struct memcg_shrinker_map *new, *old;
	int nid;

	lockdep_assert_held(&memcg_shrinker_map_mutex);

	for_each_node(nid) {
		old = rcu_dereference_protected(
			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
		/* Not yet online memcg */
		if (!old)
			return 0;

		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		/* Set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_size);
		memset((void *)new->map + old_size, 0, size - old_size);

		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
	}

	return 0;
}

static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct memcg_shrinker_map *map;
	int nid;

	if (mem_cgroup_is_root(memcg))
		return;

	for_each_node(nid) {
		pn = mem_cgroup_nodeinfo(memcg, nid);
		map = rcu_dereference_protected(pn->shrinker_map, true);
		if (map)
			kvfree(map);
		rcu_assign_pointer(pn->shrinker_map, NULL);
	}
}

static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	struct memcg_shrinker_map *map;
	int nid, size, ret = 0;

	if (mem_cgroup_is_root(memcg))
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	size = memcg_shrinker_map_size;
	for_each_node(nid) {
		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
		if (!map) {
			memcg_free_shrinker_maps(memcg);
			ret = -ENOMEM;
			break;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
	}
	mutex_unlock(&memcg_shrinker_map_mutex);

	return ret;
}

int memcg_expand_shrinker_maps(int new_id)
{
	int size, old_size, ret = 0;
	struct mem_cgroup *memcg;

	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
	old_size = memcg_shrinker_map_size;
	if (size <= old_size)
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	if (!root_mem_cgroup)
		goto unlock;

	for_each_mem_cgroup(memcg) {
		if (mem_cgroup_is_root(memcg))
			continue;
		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto unlock;
		}
	}
unlock:
	if (!ret)
		memcg_shrinker_map_size = size;
	mutex_unlock(&memcg_shrinker_map_mutex);
	return ret;
}

void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct memcg_shrinker_map *map;

		rcu_read_lock();
		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, map->map);
		rcu_read_unlock();
	}
}

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned. The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it should only be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = page->mem_cgroup;

	/*
	 * The lowest bit set means that memcg isn't a valid
	 * memcg pointer, but an obj_cgroups pointer.
	 * In this case the page is shared and doesn't belong
	 * to any specific memory cgroup.
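	 * (Typically this is a slab page whose objects may be charged to
	 *  several different memcgs through per-object obj_cgroup
	 *  references.)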
	 */
	if ((unsigned long) memcg & 0x1UL)
		memcg = NULL;

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
				   tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}
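
/*
 * The soft-limit tree keeps mem_cgroup_per_node entries ordered by
 * usage_in_excess, so the rightmost node is always the group exceeding its
 * soft limit by the largest amount; soft limit reclaim picks its victims
 * via mctz->rb_rightmost (see __mem_cgroup_largest_soft_limit_node()).
 */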

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, 0);
	struct lruvec *lruvec = &mz->lruvec;
	unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
			MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
			MAX_NR_ZONES);
#else
	unsigned long nr_pages = page_counter_read(&memcg->memory);
#endif
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * if the memcg is over its soft limit.
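		 * In both cases the node is removed and then re-inserted so
		 * that its position reflects the new usage_in_excess value.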
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long x, threshold = MEMCG_CHARGE_BATCH;

	if (mem_cgroup_disabled())
		return;

	if (memcg_stat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(x) > threshold)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
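		 * The per-cpu delta is only folded into the atomic counters
		 * of this memcg and all of its ancestors once it grows
		 * beyond MEMCG_CHARGE_BATCH (scaled to bytes for
		 * byte-sized stat items).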
		 */
		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmstats[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
}

static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *parent;

	parent = parent_mem_cgroup(pn->memcg);
	if (!parent)
		return NULL;
	return mem_cgroup_nodeinfo(parent, nid);
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	long x, threshold = MEMCG_CHARGE_BATCH;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

	if (vmstat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(x) > threshold)) {
		pg_data_t *pgdat = lruvec_pgdat(lruvec);
		struct mem_cgroup_per_node *pi;

		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
			atomic_long_add(x, &pi->lruvec_stat[idx]);
		x = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
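 *
 * For example, accounting one newly mapped anonymous page on this
 * node/memcg pair would be __mod_lruvec_state(lruvec, NR_ANON_MAPPED, 1)
 * (an illustrative call only; the real call sites live in rmap/vmscan).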
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled()) {
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		if (is_node_lruvec(lruvec))
			return;
#endif
		__mod_memcg_lruvec_state(lruvec, idx, val);
	}
}

void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

void mod_memcg_obj_state(void *p, int idx, int val)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);
	if (memcg)
		mod_memcg_state(memcg, idx, val);
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	unsigned long x;

	if (mem_cgroup_disabled())
		return;
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (!memcg)
		return;
#endif

	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->events[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmevents[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	return atomic_long_read(&memcg->vmevents[event]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_local->events[event], cpu);
	return x;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
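		 * Those get charged to the root cgroup.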
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtain a reference on page->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	/* Page should not get uncharged and freed memcg under us. */
	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
		memcg = root_mem_cgroup;
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);

static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (in_interrupt())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}

static __always_inline struct mem_cgroup *get_active_memcg(void)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = active_memcg();
	/* remote memcg must hold a ref. */
	if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
		memcg = root_mem_cgroup;
	rcu_read_unlock();

	return memcg;
}

static __always_inline bool memcg_kmem_bypass(void)
{
	/* Allow remote memcg charging from any context. */
	if (unlikely(active_memcg()))
		return false;

	/* Memcg to charge can't be determined. */
*/ 11198c2ecf20Sopenharmony_ci if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) 11208c2ecf20Sopenharmony_ci return true; 11218c2ecf20Sopenharmony_ci 11228c2ecf20Sopenharmony_ci return false; 11238c2ecf20Sopenharmony_ci} 11248c2ecf20Sopenharmony_ci 11258c2ecf20Sopenharmony_ci/** 11268c2ecf20Sopenharmony_ci * If active memcg is set, do not fallback to current->mm->memcg. 11278c2ecf20Sopenharmony_ci */ 11288c2ecf20Sopenharmony_cistatic __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) 11298c2ecf20Sopenharmony_ci{ 11308c2ecf20Sopenharmony_ci if (memcg_kmem_bypass()) 11318c2ecf20Sopenharmony_ci return NULL; 11328c2ecf20Sopenharmony_ci 11338c2ecf20Sopenharmony_ci if (unlikely(active_memcg())) 11348c2ecf20Sopenharmony_ci return get_active_memcg(); 11358c2ecf20Sopenharmony_ci 11368c2ecf20Sopenharmony_ci return get_mem_cgroup_from_mm(current->mm); 11378c2ecf20Sopenharmony_ci} 11388c2ecf20Sopenharmony_ci 11398c2ecf20Sopenharmony_ci/** 11408c2ecf20Sopenharmony_ci * mem_cgroup_iter - iterate over memory cgroup hierarchy 11418c2ecf20Sopenharmony_ci * @root: hierarchy root 11428c2ecf20Sopenharmony_ci * @prev: previously returned memcg, NULL on first invocation 11438c2ecf20Sopenharmony_ci * @reclaim: cookie for shared reclaim walks, NULL for full walks 11448c2ecf20Sopenharmony_ci * 11458c2ecf20Sopenharmony_ci * Returns references to children of the hierarchy below @root, or 11468c2ecf20Sopenharmony_ci * @root itself, or %NULL after a full round-trip. 11478c2ecf20Sopenharmony_ci * 11488c2ecf20Sopenharmony_ci * Caller must pass the return value in @prev on subsequent 11498c2ecf20Sopenharmony_ci * invocations for reference counting, or use mem_cgroup_iter_break() 11508c2ecf20Sopenharmony_ci * to cancel a hierarchy walk before the round-trip is complete. 11518c2ecf20Sopenharmony_ci * 11528c2ecf20Sopenharmony_ci * Reclaimers can specify a node in @reclaim to divide up the memcgs 11538c2ecf20Sopenharmony_ci * in the hierarchy among all concurrent reclaimers operating on the 11548c2ecf20Sopenharmony_ci * same node. 
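 *
 * A minimal full-walk sketch (illustrative only, not lifted from a
 * specific caller in this file):
 *
 *	struct mem_cgroup *memcg;
 *
 *	memcg = mem_cgroup_iter(root, NULL, NULL);
 *	while (memcg) {
 *		... work on memcg ...
 *		memcg = mem_cgroup_iter(root, memcg, NULL);
 *	}
 *
 * A walk that stops early must call mem_cgroup_iter_break(root, memcg)
 * instead, so the reference held on the last returned memcg is dropped.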
11558c2ecf20Sopenharmony_ci */ 11568c2ecf20Sopenharmony_cistruct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 11578c2ecf20Sopenharmony_ci struct mem_cgroup *prev, 11588c2ecf20Sopenharmony_ci struct mem_cgroup_reclaim_cookie *reclaim) 11598c2ecf20Sopenharmony_ci{ 11608c2ecf20Sopenharmony_ci struct mem_cgroup_reclaim_iter *iter; 11618c2ecf20Sopenharmony_ci struct cgroup_subsys_state *css = NULL; 11628c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = NULL; 11638c2ecf20Sopenharmony_ci struct mem_cgroup *pos = NULL; 11648c2ecf20Sopenharmony_ci 11658c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 11668c2ecf20Sopenharmony_ci return NULL; 11678c2ecf20Sopenharmony_ci 11688c2ecf20Sopenharmony_ci if (!root) 11698c2ecf20Sopenharmony_ci root = root_mem_cgroup; 11708c2ecf20Sopenharmony_ci 11718c2ecf20Sopenharmony_ci if (prev && !reclaim) 11728c2ecf20Sopenharmony_ci pos = prev; 11738c2ecf20Sopenharmony_ci 11748c2ecf20Sopenharmony_ci if (!root->use_hierarchy && root != root_mem_cgroup) { 11758c2ecf20Sopenharmony_ci if (prev) 11768c2ecf20Sopenharmony_ci goto out; 11778c2ecf20Sopenharmony_ci return root; 11788c2ecf20Sopenharmony_ci } 11798c2ecf20Sopenharmony_ci 11808c2ecf20Sopenharmony_ci rcu_read_lock(); 11818c2ecf20Sopenharmony_ci 11828c2ecf20Sopenharmony_ci if (reclaim) { 11838c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *mz; 11848c2ecf20Sopenharmony_ci 11858c2ecf20Sopenharmony_ci mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); 11868c2ecf20Sopenharmony_ci iter = &mz->iter; 11878c2ecf20Sopenharmony_ci 11888c2ecf20Sopenharmony_ci if (prev && reclaim->generation != iter->generation) 11898c2ecf20Sopenharmony_ci goto out_unlock; 11908c2ecf20Sopenharmony_ci 11918c2ecf20Sopenharmony_ci while (1) { 11928c2ecf20Sopenharmony_ci pos = READ_ONCE(iter->position); 11938c2ecf20Sopenharmony_ci if (!pos || css_tryget(&pos->css)) 11948c2ecf20Sopenharmony_ci break; 11958c2ecf20Sopenharmony_ci /* 11968c2ecf20Sopenharmony_ci * css reference reached zero, so iter->position will 11978c2ecf20Sopenharmony_ci * be cleared by ->css_released. However, we should not 11988c2ecf20Sopenharmony_ci * rely on this happening soon, because ->css_released 11998c2ecf20Sopenharmony_ci * is called from a work queue, and by busy-waiting we 12008c2ecf20Sopenharmony_ci * might block it. So we clear iter->position right 12018c2ecf20Sopenharmony_ci * away. 12028c2ecf20Sopenharmony_ci */ 12038c2ecf20Sopenharmony_ci (void)cmpxchg(&iter->position, pos, NULL); 12048c2ecf20Sopenharmony_ci } 12058c2ecf20Sopenharmony_ci } 12068c2ecf20Sopenharmony_ci 12078c2ecf20Sopenharmony_ci if (pos) 12088c2ecf20Sopenharmony_ci css = &pos->css; 12098c2ecf20Sopenharmony_ci 12108c2ecf20Sopenharmony_ci for (;;) { 12118c2ecf20Sopenharmony_ci css = css_next_descendant_pre(css, &root->css); 12128c2ecf20Sopenharmony_ci if (!css) { 12138c2ecf20Sopenharmony_ci /* 12148c2ecf20Sopenharmony_ci * Reclaimers share the hierarchy walk, and a 12158c2ecf20Sopenharmony_ci * new one might jump in right at the end of 12168c2ecf20Sopenharmony_ci * the hierarchy - make sure they see at least 12178c2ecf20Sopenharmony_ci * one group and restart from the beginning. 12188c2ecf20Sopenharmony_ci */ 12198c2ecf20Sopenharmony_ci if (!prev) 12208c2ecf20Sopenharmony_ci continue; 12218c2ecf20Sopenharmony_ci break; 12228c2ecf20Sopenharmony_ci } 12238c2ecf20Sopenharmony_ci 12248c2ecf20Sopenharmony_ci /* 12258c2ecf20Sopenharmony_ci * Verify the css and acquire a reference. 
The root 12268c2ecf20Sopenharmony_ci * is provided by the caller, so we know it's alive 12278c2ecf20Sopenharmony_ci * and kicking, and don't take an extra reference. 12288c2ecf20Sopenharmony_ci */ 12298c2ecf20Sopenharmony_ci memcg = mem_cgroup_from_css(css); 12308c2ecf20Sopenharmony_ci 12318c2ecf20Sopenharmony_ci if (css == &root->css) 12328c2ecf20Sopenharmony_ci break; 12338c2ecf20Sopenharmony_ci 12348c2ecf20Sopenharmony_ci if (css_tryget(css)) 12358c2ecf20Sopenharmony_ci break; 12368c2ecf20Sopenharmony_ci 12378c2ecf20Sopenharmony_ci memcg = NULL; 12388c2ecf20Sopenharmony_ci } 12398c2ecf20Sopenharmony_ci 12408c2ecf20Sopenharmony_ci if (reclaim) { 12418c2ecf20Sopenharmony_ci /* 12428c2ecf20Sopenharmony_ci * The position could have already been updated by a competing 12438c2ecf20Sopenharmony_ci * thread, so check that the value hasn't changed since we read 12448c2ecf20Sopenharmony_ci * it to avoid reclaiming from the same cgroup twice. 12458c2ecf20Sopenharmony_ci */ 12468c2ecf20Sopenharmony_ci (void)cmpxchg(&iter->position, pos, memcg); 12478c2ecf20Sopenharmony_ci 12488c2ecf20Sopenharmony_ci if (pos) 12498c2ecf20Sopenharmony_ci css_put(&pos->css); 12508c2ecf20Sopenharmony_ci 12518c2ecf20Sopenharmony_ci if (!memcg) 12528c2ecf20Sopenharmony_ci iter->generation++; 12538c2ecf20Sopenharmony_ci else if (!prev) 12548c2ecf20Sopenharmony_ci reclaim->generation = iter->generation; 12558c2ecf20Sopenharmony_ci } 12568c2ecf20Sopenharmony_ci 12578c2ecf20Sopenharmony_ciout_unlock: 12588c2ecf20Sopenharmony_ci rcu_read_unlock(); 12598c2ecf20Sopenharmony_ciout: 12608c2ecf20Sopenharmony_ci if (prev && prev != root) 12618c2ecf20Sopenharmony_ci css_put(&prev->css); 12628c2ecf20Sopenharmony_ci 12638c2ecf20Sopenharmony_ci return memcg; 12648c2ecf20Sopenharmony_ci} 12658c2ecf20Sopenharmony_ci 12668c2ecf20Sopenharmony_ci/** 12678c2ecf20Sopenharmony_ci * mem_cgroup_iter_break - abort a hierarchy walk prematurely 12688c2ecf20Sopenharmony_ci * @root: hierarchy root 12698c2ecf20Sopenharmony_ci * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 12708c2ecf20Sopenharmony_ci */ 12718c2ecf20Sopenharmony_civoid mem_cgroup_iter_break(struct mem_cgroup *root, 12728c2ecf20Sopenharmony_ci struct mem_cgroup *prev) 12738c2ecf20Sopenharmony_ci{ 12748c2ecf20Sopenharmony_ci if (!root) 12758c2ecf20Sopenharmony_ci root = root_mem_cgroup; 12768c2ecf20Sopenharmony_ci if (prev && prev != root) 12778c2ecf20Sopenharmony_ci css_put(&prev->css); 12788c2ecf20Sopenharmony_ci} 12798c2ecf20Sopenharmony_ci 12808c2ecf20Sopenharmony_cistatic void __invalidate_reclaim_iterators(struct mem_cgroup *from, 12818c2ecf20Sopenharmony_ci struct mem_cgroup *dead_memcg) 12828c2ecf20Sopenharmony_ci{ 12838c2ecf20Sopenharmony_ci struct mem_cgroup_reclaim_iter *iter; 12848c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *mz; 12858c2ecf20Sopenharmony_ci int nid; 12868c2ecf20Sopenharmony_ci 12878c2ecf20Sopenharmony_ci for_each_node(nid) { 12888c2ecf20Sopenharmony_ci mz = mem_cgroup_nodeinfo(from, nid); 12898c2ecf20Sopenharmony_ci iter = &mz->iter; 12908c2ecf20Sopenharmony_ci cmpxchg(&iter->position, dead_memcg, NULL); 12918c2ecf20Sopenharmony_ci } 12928c2ecf20Sopenharmony_ci} 12938c2ecf20Sopenharmony_ci 12948c2ecf20Sopenharmony_cistatic void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) 12958c2ecf20Sopenharmony_ci{ 12968c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = dead_memcg; 12978c2ecf20Sopenharmony_ci struct mem_cgroup *last; 12988c2ecf20Sopenharmony_ci 12998c2ecf20Sopenharmony_ci do { 13008c2ecf20Sopenharmony_ci 
__invalidate_reclaim_iterators(memcg, dead_memcg);
13018c2ecf20Sopenharmony_ci last = memcg;
13028c2ecf20Sopenharmony_ci } while ((memcg = parent_mem_cgroup(memcg)));
13038c2ecf20Sopenharmony_ci
13048c2ecf20Sopenharmony_ci /*
13058c2ecf20Sopenharmony_ci * When cgroup1 non-hierarchy mode is used,
13068c2ecf20Sopenharmony_ci * parent_mem_cgroup() does not walk all the way up to the
13078c2ecf20Sopenharmony_ci * cgroup root (root_mem_cgroup). So we have to handle
13088c2ecf20Sopenharmony_ci * dead_memcg from cgroup root separately.
13098c2ecf20Sopenharmony_ci */
13108c2ecf20Sopenharmony_ci if (last != root_mem_cgroup)
13118c2ecf20Sopenharmony_ci __invalidate_reclaim_iterators(root_mem_cgroup,
13128c2ecf20Sopenharmony_ci dead_memcg);
13138c2ecf20Sopenharmony_ci}
13148c2ecf20Sopenharmony_ci
13158c2ecf20Sopenharmony_ci/**
13168c2ecf20Sopenharmony_ci * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
13178c2ecf20Sopenharmony_ci * @memcg: hierarchy root
13188c2ecf20Sopenharmony_ci * @fn: function to call for each task
13198c2ecf20Sopenharmony_ci * @arg: argument passed to @fn
13208c2ecf20Sopenharmony_ci *
13218c2ecf20Sopenharmony_ci * This function iterates over tasks attached to @memcg or to any of its
13228c2ecf20Sopenharmony_ci * descendants and calls @fn for each task. If @fn returns a non-zero
13238c2ecf20Sopenharmony_ci * value, the function breaks the iteration loop and returns the value.
13248c2ecf20Sopenharmony_ci * Otherwise, it will iterate over all tasks and return 0.
13258c2ecf20Sopenharmony_ci *
13268c2ecf20Sopenharmony_ci * This function must not be called for the root memory cgroup.
13278c2ecf20Sopenharmony_ci */
13288c2ecf20Sopenharmony_ciint mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
13298c2ecf20Sopenharmony_ci int (*fn)(struct task_struct *, void *), void *arg)
13308c2ecf20Sopenharmony_ci{
13318c2ecf20Sopenharmony_ci struct mem_cgroup *iter;
13328c2ecf20Sopenharmony_ci int ret = 0;
13338c2ecf20Sopenharmony_ci
13348c2ecf20Sopenharmony_ci BUG_ON(memcg == root_mem_cgroup);
13358c2ecf20Sopenharmony_ci
13368c2ecf20Sopenharmony_ci for_each_mem_cgroup_tree(iter, memcg) {
13378c2ecf20Sopenharmony_ci struct css_task_iter it;
13388c2ecf20Sopenharmony_ci struct task_struct *task;
13398c2ecf20Sopenharmony_ci
13408c2ecf20Sopenharmony_ci css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
13418c2ecf20Sopenharmony_ci while (!ret && (task = css_task_iter_next(&it)))
13428c2ecf20Sopenharmony_ci ret = fn(task, arg);
13438c2ecf20Sopenharmony_ci css_task_iter_end(&it);
13448c2ecf20Sopenharmony_ci if (ret) {
13458c2ecf20Sopenharmony_ci mem_cgroup_iter_break(memcg, iter);
13468c2ecf20Sopenharmony_ci break;
13478c2ecf20Sopenharmony_ci }
13488c2ecf20Sopenharmony_ci }
13498c2ecf20Sopenharmony_ci return ret;
13508c2ecf20Sopenharmony_ci}
13518c2ecf20Sopenharmony_ci
13528c2ecf20Sopenharmony_ci/**
13538c2ecf20Sopenharmony_ci * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
13548c2ecf20Sopenharmony_ci * @page: the page
13558c2ecf20Sopenharmony_ci * @pgdat: pgdat of the page
13568c2ecf20Sopenharmony_ci *
13578c2ecf20Sopenharmony_ci * This function relies on page->mem_cgroup being stable - see the
13588c2ecf20Sopenharmony_ci * access rules in commit_charge().
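 *
 * Typical call pattern (a sketch, assuming the pgdat lru_lock rules of
 * this kernel rather than copying any one call site):
 *
 *	spin_lock_irq(&pgdat->lru_lock);
 *	lruvec = mem_cgroup_page_lruvec(page, pgdat);
 *	... add the page to / remove it from lruvec->lists[lru] ...
 *	spin_unlock_irq(&pgdat->lru_lock);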
13598c2ecf20Sopenharmony_ci */ 13608c2ecf20Sopenharmony_cistruct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) 13618c2ecf20Sopenharmony_ci{ 13628c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *mz; 13638c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 13648c2ecf20Sopenharmony_ci struct lruvec *lruvec; 13658c2ecf20Sopenharmony_ci 13668c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) { 13678c2ecf20Sopenharmony_ci lruvec = &pgdat->__lruvec; 13688c2ecf20Sopenharmony_ci goto out; 13698c2ecf20Sopenharmony_ci } 13708c2ecf20Sopenharmony_ci 13718c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU 13728c2ecf20Sopenharmony_ci if (page_is_file_lru(page) && 13738c2ecf20Sopenharmony_ci !is_prot_page(page)) { 13748c2ecf20Sopenharmony_ci lruvec = node_lruvec(pgdat); 13758c2ecf20Sopenharmony_ci goto out; 13768c2ecf20Sopenharmony_ci } 13778c2ecf20Sopenharmony_ci#endif 13788c2ecf20Sopenharmony_ci memcg = page->mem_cgroup; 13798c2ecf20Sopenharmony_ci /* 13808c2ecf20Sopenharmony_ci * Swapcache readahead pages are added to the LRU - and 13818c2ecf20Sopenharmony_ci * possibly migrated - before they are charged. 13828c2ecf20Sopenharmony_ci */ 13838c2ecf20Sopenharmony_ci if (!memcg) 13848c2ecf20Sopenharmony_ci memcg = root_mem_cgroup; 13858c2ecf20Sopenharmony_ci 13868c2ecf20Sopenharmony_ci mz = mem_cgroup_page_nodeinfo(memcg, page); 13878c2ecf20Sopenharmony_ci lruvec = &mz->lruvec; 13888c2ecf20Sopenharmony_ciout: 13898c2ecf20Sopenharmony_ci /* 13908c2ecf20Sopenharmony_ci * Since a node can be onlined after the mem_cgroup was created, 13918c2ecf20Sopenharmony_ci * we have to be prepared to initialize lruvec->zone here; 13928c2ecf20Sopenharmony_ci * and if offlined then reonlined, we need to reinitialize it. 13938c2ecf20Sopenharmony_ci */ 13948c2ecf20Sopenharmony_ci if (unlikely(lruvec->pgdat != pgdat)) 13958c2ecf20Sopenharmony_ci lruvec->pgdat = pgdat; 13968c2ecf20Sopenharmony_ci return lruvec; 13978c2ecf20Sopenharmony_ci} 13988c2ecf20Sopenharmony_ci 13998c2ecf20Sopenharmony_ci/** 14008c2ecf20Sopenharmony_ci * mem_cgroup_update_lru_size - account for adding or removing an lru page 14018c2ecf20Sopenharmony_ci * @lruvec: mem_cgroup per zone lru vector 14028c2ecf20Sopenharmony_ci * @lru: index of lru list the page is sitting on 14038c2ecf20Sopenharmony_ci * @zid: zone id of the accounted pages 14048c2ecf20Sopenharmony_ci * @nr_pages: positive when adding or negative when removing 14058c2ecf20Sopenharmony_ci * 14068c2ecf20Sopenharmony_ci * This function must be called under lru_lock, just before a page is added 14078c2ecf20Sopenharmony_ci * to or just after a page is removed from an lru list (that ordering being 14088c2ecf20Sopenharmony_ci * so as to allow it to check that lru_size 0 is consistent with list_empty). 
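 *
 * Sketch of the required ordering (illustrative; real callers go through
 * the add/del page-to-lru helpers):
 *
 *	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 *	list_add(&page->lru, &lruvec->lists[lru]);
 *	...
 *	list_del(&page->lru);
 *	mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_pages);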
14098c2ecf20Sopenharmony_ci */ 14108c2ecf20Sopenharmony_civoid mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 14118c2ecf20Sopenharmony_ci int zid, int nr_pages) 14128c2ecf20Sopenharmony_ci{ 14138c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *mz; 14148c2ecf20Sopenharmony_ci unsigned long *lru_size; 14158c2ecf20Sopenharmony_ci long size; 14168c2ecf20Sopenharmony_ci 14178c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 14188c2ecf20Sopenharmony_ci return; 14198c2ecf20Sopenharmony_ci 14208c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU 14218c2ecf20Sopenharmony_ci if (is_node_lruvec(lruvec)) 14228c2ecf20Sopenharmony_ci return; 14238c2ecf20Sopenharmony_ci#endif 14248c2ecf20Sopenharmony_ci mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 14258c2ecf20Sopenharmony_ci lru_size = &mz->lru_zone_size[zid][lru]; 14268c2ecf20Sopenharmony_ci 14278c2ecf20Sopenharmony_ci if (nr_pages < 0) 14288c2ecf20Sopenharmony_ci *lru_size += nr_pages; 14298c2ecf20Sopenharmony_ci 14308c2ecf20Sopenharmony_ci size = *lru_size; 14318c2ecf20Sopenharmony_ci if (WARN_ONCE(size < 0, 14328c2ecf20Sopenharmony_ci "%s(%p, %d, %d): lru_size %ld\n", 14338c2ecf20Sopenharmony_ci __func__, lruvec, lru, nr_pages, size)) { 14348c2ecf20Sopenharmony_ci VM_BUG_ON(1); 14358c2ecf20Sopenharmony_ci *lru_size = 0; 14368c2ecf20Sopenharmony_ci } 14378c2ecf20Sopenharmony_ci 14388c2ecf20Sopenharmony_ci if (nr_pages > 0) 14398c2ecf20Sopenharmony_ci *lru_size += nr_pages; 14408c2ecf20Sopenharmony_ci} 14418c2ecf20Sopenharmony_ci 14428c2ecf20Sopenharmony_ci/** 14438c2ecf20Sopenharmony_ci * mem_cgroup_margin - calculate chargeable space of a memory cgroup 14448c2ecf20Sopenharmony_ci * @memcg: the memory cgroup 14458c2ecf20Sopenharmony_ci * 14468c2ecf20Sopenharmony_ci * Returns the maximum amount of memory @mem can be charged with, in 14478c2ecf20Sopenharmony_ci * pages. 14488c2ecf20Sopenharmony_ci */ 14498c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 14508c2ecf20Sopenharmony_ci{ 14518c2ecf20Sopenharmony_ci unsigned long margin = 0; 14528c2ecf20Sopenharmony_ci unsigned long count; 14538c2ecf20Sopenharmony_ci unsigned long limit; 14548c2ecf20Sopenharmony_ci 14558c2ecf20Sopenharmony_ci count = page_counter_read(&memcg->memory); 14568c2ecf20Sopenharmony_ci limit = READ_ONCE(memcg->memory.max); 14578c2ecf20Sopenharmony_ci if (count < limit) 14588c2ecf20Sopenharmony_ci margin = limit - count; 14598c2ecf20Sopenharmony_ci 14608c2ecf20Sopenharmony_ci if (do_memsw_account()) { 14618c2ecf20Sopenharmony_ci count = page_counter_read(&memcg->memsw); 14628c2ecf20Sopenharmony_ci limit = READ_ONCE(memcg->memsw.max); 14638c2ecf20Sopenharmony_ci if (count < limit) 14648c2ecf20Sopenharmony_ci margin = min(margin, limit - count); 14658c2ecf20Sopenharmony_ci else 14668c2ecf20Sopenharmony_ci margin = 0; 14678c2ecf20Sopenharmony_ci } 14688c2ecf20Sopenharmony_ci 14698c2ecf20Sopenharmony_ci return margin; 14708c2ecf20Sopenharmony_ci} 14718c2ecf20Sopenharmony_ci 14728c2ecf20Sopenharmony_ci/* 14738c2ecf20Sopenharmony_ci * A routine for checking "mem" is under move_account() or not. 14748c2ecf20Sopenharmony_ci * 14758c2ecf20Sopenharmony_ci * Checking a cgroup is mc.from or mc.to or under hierarchy of 14768c2ecf20Sopenharmony_ci * moving cgroups. This is for waiting at high-memory pressure 14778c2ecf20Sopenharmony_ci * caused by "move". 
14788c2ecf20Sopenharmony_ci */ 14798c2ecf20Sopenharmony_cistatic bool mem_cgroup_under_move(struct mem_cgroup *memcg) 14808c2ecf20Sopenharmony_ci{ 14818c2ecf20Sopenharmony_ci struct mem_cgroup *from; 14828c2ecf20Sopenharmony_ci struct mem_cgroup *to; 14838c2ecf20Sopenharmony_ci bool ret = false; 14848c2ecf20Sopenharmony_ci /* 14858c2ecf20Sopenharmony_ci * Unlike task_move routines, we access mc.to, mc.from not under 14868c2ecf20Sopenharmony_ci * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 14878c2ecf20Sopenharmony_ci */ 14888c2ecf20Sopenharmony_ci spin_lock(&mc.lock); 14898c2ecf20Sopenharmony_ci from = mc.from; 14908c2ecf20Sopenharmony_ci to = mc.to; 14918c2ecf20Sopenharmony_ci if (!from) 14928c2ecf20Sopenharmony_ci goto unlock; 14938c2ecf20Sopenharmony_ci 14948c2ecf20Sopenharmony_ci ret = mem_cgroup_is_descendant(from, memcg) || 14958c2ecf20Sopenharmony_ci mem_cgroup_is_descendant(to, memcg); 14968c2ecf20Sopenharmony_ciunlock: 14978c2ecf20Sopenharmony_ci spin_unlock(&mc.lock); 14988c2ecf20Sopenharmony_ci return ret; 14998c2ecf20Sopenharmony_ci} 15008c2ecf20Sopenharmony_ci 15018c2ecf20Sopenharmony_cistatic bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 15028c2ecf20Sopenharmony_ci{ 15038c2ecf20Sopenharmony_ci if (mc.moving_task && current != mc.moving_task) { 15048c2ecf20Sopenharmony_ci if (mem_cgroup_under_move(memcg)) { 15058c2ecf20Sopenharmony_ci DEFINE_WAIT(wait); 15068c2ecf20Sopenharmony_ci prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 15078c2ecf20Sopenharmony_ci /* moving charge context might have finished. */ 15088c2ecf20Sopenharmony_ci if (mc.moving_task) 15098c2ecf20Sopenharmony_ci schedule(); 15108c2ecf20Sopenharmony_ci finish_wait(&mc.waitq, &wait); 15118c2ecf20Sopenharmony_ci return true; 15128c2ecf20Sopenharmony_ci } 15138c2ecf20Sopenharmony_ci } 15148c2ecf20Sopenharmony_ci return false; 15158c2ecf20Sopenharmony_ci} 15168c2ecf20Sopenharmony_ci 15178c2ecf20Sopenharmony_cistruct memory_stat { 15188c2ecf20Sopenharmony_ci const char *name; 15198c2ecf20Sopenharmony_ci unsigned int ratio; 15208c2ecf20Sopenharmony_ci unsigned int idx; 15218c2ecf20Sopenharmony_ci}; 15228c2ecf20Sopenharmony_ci 15238c2ecf20Sopenharmony_cistatic struct memory_stat memory_stats[] = { 15248c2ecf20Sopenharmony_ci { "anon", PAGE_SIZE, NR_ANON_MAPPED }, 15258c2ecf20Sopenharmony_ci { "file", PAGE_SIZE, NR_FILE_PAGES }, 15268c2ecf20Sopenharmony_ci { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, 15278c2ecf20Sopenharmony_ci { "percpu", 1, MEMCG_PERCPU_B }, 15288c2ecf20Sopenharmony_ci { "sock", PAGE_SIZE, MEMCG_SOCK }, 15298c2ecf20Sopenharmony_ci { "shmem", PAGE_SIZE, NR_SHMEM }, 15308c2ecf20Sopenharmony_ci { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED }, 15318c2ecf20Sopenharmony_ci { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, 15328c2ecf20Sopenharmony_ci { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, 15338c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 15348c2ecf20Sopenharmony_ci /* 15358c2ecf20Sopenharmony_ci * The ratio will be initialized in memory_stats_init(). Because 15368c2ecf20Sopenharmony_ci * on some architectures, the macro of HPAGE_PMD_SIZE is not 15378c2ecf20Sopenharmony_ci * constant(e.g. powerpc). 
15388c2ecf20Sopenharmony_ci */ 15398c2ecf20Sopenharmony_ci { "anon_thp", 0, NR_ANON_THPS }, 15408c2ecf20Sopenharmony_ci#endif 15418c2ecf20Sopenharmony_ci { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, 15428c2ecf20Sopenharmony_ci { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, 15438c2ecf20Sopenharmony_ci { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE }, 15448c2ecf20Sopenharmony_ci { "active_file", PAGE_SIZE, NR_ACTIVE_FILE }, 15458c2ecf20Sopenharmony_ci { "unevictable", PAGE_SIZE, NR_UNEVICTABLE }, 15468c2ecf20Sopenharmony_ci 15478c2ecf20Sopenharmony_ci /* 15488c2ecf20Sopenharmony_ci * Note: The slab_reclaimable and slab_unreclaimable must be 15498c2ecf20Sopenharmony_ci * together and slab_reclaimable must be in front. 15508c2ecf20Sopenharmony_ci */ 15518c2ecf20Sopenharmony_ci { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B }, 15528c2ecf20Sopenharmony_ci { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B }, 15538c2ecf20Sopenharmony_ci 15548c2ecf20Sopenharmony_ci /* The memory events */ 15558c2ecf20Sopenharmony_ci { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON }, 15568c2ecf20Sopenharmony_ci { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE }, 15578c2ecf20Sopenharmony_ci { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON }, 15588c2ecf20Sopenharmony_ci { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE }, 15598c2ecf20Sopenharmony_ci { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON }, 15608c2ecf20Sopenharmony_ci { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE }, 15618c2ecf20Sopenharmony_ci { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, 15628c2ecf20Sopenharmony_ci}; 15638c2ecf20Sopenharmony_ci 15648c2ecf20Sopenharmony_cistatic int __init memory_stats_init(void) 15658c2ecf20Sopenharmony_ci{ 15668c2ecf20Sopenharmony_ci int i; 15678c2ecf20Sopenharmony_ci 15688c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 15698c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 15708c2ecf20Sopenharmony_ci if (memory_stats[i].idx == NR_ANON_THPS) 15718c2ecf20Sopenharmony_ci memory_stats[i].ratio = HPAGE_PMD_SIZE; 15728c2ecf20Sopenharmony_ci#endif 15738c2ecf20Sopenharmony_ci VM_BUG_ON(!memory_stats[i].ratio); 15748c2ecf20Sopenharmony_ci VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); 15758c2ecf20Sopenharmony_ci } 15768c2ecf20Sopenharmony_ci 15778c2ecf20Sopenharmony_ci return 0; 15788c2ecf20Sopenharmony_ci} 15798c2ecf20Sopenharmony_cipure_initcall(memory_stats_init); 15808c2ecf20Sopenharmony_ci 15818c2ecf20Sopenharmony_cistatic char *memory_stat_format(struct mem_cgroup *memcg) 15828c2ecf20Sopenharmony_ci{ 15838c2ecf20Sopenharmony_ci struct seq_buf s; 15848c2ecf20Sopenharmony_ci int i; 15858c2ecf20Sopenharmony_ci 15868c2ecf20Sopenharmony_ci seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); 15878c2ecf20Sopenharmony_ci if (!s.buffer) 15888c2ecf20Sopenharmony_ci return NULL; 15898c2ecf20Sopenharmony_ci 15908c2ecf20Sopenharmony_ci /* 15918c2ecf20Sopenharmony_ci * Provide statistics on the state of the memory subsystem as 15928c2ecf20Sopenharmony_ci * well as cumulative event counters that show past behavior. 
15938c2ecf20Sopenharmony_ci * 15948c2ecf20Sopenharmony_ci * This list is ordered following a combination of these gradients: 15958c2ecf20Sopenharmony_ci * 1) generic big picture -> specifics and details 15968c2ecf20Sopenharmony_ci * 2) reflecting userspace activity -> reflecting kernel heuristics 15978c2ecf20Sopenharmony_ci * 15988c2ecf20Sopenharmony_ci * Current memory state: 15998c2ecf20Sopenharmony_ci */ 16008c2ecf20Sopenharmony_ci 16018c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 16028c2ecf20Sopenharmony_ci u64 size; 16038c2ecf20Sopenharmony_ci 16048c2ecf20Sopenharmony_ci size = memcg_page_state(memcg, memory_stats[i].idx); 16058c2ecf20Sopenharmony_ci size *= memory_stats[i].ratio; 16068c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); 16078c2ecf20Sopenharmony_ci 16088c2ecf20Sopenharmony_ci if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { 16098c2ecf20Sopenharmony_ci size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + 16108c2ecf20Sopenharmony_ci memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); 16118c2ecf20Sopenharmony_ci seq_buf_printf(&s, "slab %llu\n", size); 16128c2ecf20Sopenharmony_ci } 16138c2ecf20Sopenharmony_ci } 16148c2ecf20Sopenharmony_ci 16158c2ecf20Sopenharmony_ci /* Accumulated memory events */ 16168c2ecf20Sopenharmony_ci 16178c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT), 16188c2ecf20Sopenharmony_ci memcg_events(memcg, PGFAULT)); 16198c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), 16208c2ecf20Sopenharmony_ci memcg_events(memcg, PGMAJFAULT)); 16218c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), 16228c2ecf20Sopenharmony_ci memcg_events(memcg, PGREFILL)); 16238c2ecf20Sopenharmony_ci seq_buf_printf(&s, "pgscan %lu\n", 16248c2ecf20Sopenharmony_ci memcg_events(memcg, PGSCAN_KSWAPD) + 16258c2ecf20Sopenharmony_ci memcg_events(memcg, PGSCAN_DIRECT)); 16268c2ecf20Sopenharmony_ci seq_buf_printf(&s, "pgsteal %lu\n", 16278c2ecf20Sopenharmony_ci memcg_events(memcg, PGSTEAL_KSWAPD) + 16288c2ecf20Sopenharmony_ci memcg_events(memcg, PGSTEAL_DIRECT)); 16298c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE), 16308c2ecf20Sopenharmony_ci memcg_events(memcg, PGACTIVATE)); 16318c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE), 16328c2ecf20Sopenharmony_ci memcg_events(memcg, PGDEACTIVATE)); 16338c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE), 16348c2ecf20Sopenharmony_ci memcg_events(memcg, PGLAZYFREE)); 16358c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), 16368c2ecf20Sopenharmony_ci memcg_events(memcg, PGLAZYFREED)); 16378c2ecf20Sopenharmony_ci 16388c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 16398c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), 16408c2ecf20Sopenharmony_ci memcg_events(memcg, THP_FAULT_ALLOC)); 16418c2ecf20Sopenharmony_ci seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), 16428c2ecf20Sopenharmony_ci memcg_events(memcg, THP_COLLAPSE_ALLOC)); 16438c2ecf20Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 16448c2ecf20Sopenharmony_ci 16458c2ecf20Sopenharmony_ci /* The above should easily fit into one page */ 16468c2ecf20Sopenharmony_ci WARN_ON_ONCE(seq_buf_has_overflowed(&s)); 16478c2ecf20Sopenharmony_ci 16488c2ecf20Sopenharmony_ci return s.buffer; 16498c2ecf20Sopenharmony_ci} 16508c2ecf20Sopenharmony_ci 
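/*
 * The buffer built above is a newline-separated "key value" list, with
 * values in bytes (or raw event counts). A made-up sketch of the start
 * of the output, using keys from memory_stats[]:
 *
 *	anon 8364032
 *	file 13709312
 *	kernel_stack 49152
 *	...
 *	pgfault 1024
 */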
16518c2ecf20Sopenharmony_ci#define K(x) ((x) << (PAGE_SHIFT-10)) 16528c2ecf20Sopenharmony_ci/** 16538c2ecf20Sopenharmony_ci * mem_cgroup_print_oom_context: Print OOM information relevant to 16548c2ecf20Sopenharmony_ci * memory controller. 16558c2ecf20Sopenharmony_ci * @memcg: The memory cgroup that went over limit 16568c2ecf20Sopenharmony_ci * @p: Task that is going to be killed 16578c2ecf20Sopenharmony_ci * 16588c2ecf20Sopenharmony_ci * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 16598c2ecf20Sopenharmony_ci * enabled 16608c2ecf20Sopenharmony_ci */ 16618c2ecf20Sopenharmony_civoid mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) 16628c2ecf20Sopenharmony_ci{ 16638c2ecf20Sopenharmony_ci rcu_read_lock(); 16648c2ecf20Sopenharmony_ci 16658c2ecf20Sopenharmony_ci if (memcg) { 16668c2ecf20Sopenharmony_ci pr_cont(",oom_memcg="); 16678c2ecf20Sopenharmony_ci pr_cont_cgroup_path(memcg->css.cgroup); 16688c2ecf20Sopenharmony_ci } else 16698c2ecf20Sopenharmony_ci pr_cont(",global_oom"); 16708c2ecf20Sopenharmony_ci if (p) { 16718c2ecf20Sopenharmony_ci pr_cont(",task_memcg="); 16728c2ecf20Sopenharmony_ci pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 16738c2ecf20Sopenharmony_ci } 16748c2ecf20Sopenharmony_ci rcu_read_unlock(); 16758c2ecf20Sopenharmony_ci} 16768c2ecf20Sopenharmony_ci 16778c2ecf20Sopenharmony_ci/** 16788c2ecf20Sopenharmony_ci * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to 16798c2ecf20Sopenharmony_ci * memory controller. 16808c2ecf20Sopenharmony_ci * @memcg: The memory cgroup that went over limit 16818c2ecf20Sopenharmony_ci */ 16828c2ecf20Sopenharmony_civoid mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) 16838c2ecf20Sopenharmony_ci{ 16848c2ecf20Sopenharmony_ci char *buf; 16858c2ecf20Sopenharmony_ci 16868c2ecf20Sopenharmony_ci pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 16878c2ecf20Sopenharmony_ci K((u64)page_counter_read(&memcg->memory)), 16888c2ecf20Sopenharmony_ci K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); 16898c2ecf20Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 16908c2ecf20Sopenharmony_ci pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", 16918c2ecf20Sopenharmony_ci K((u64)page_counter_read(&memcg->swap)), 16928c2ecf20Sopenharmony_ci K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); 16938c2ecf20Sopenharmony_ci else { 16948c2ecf20Sopenharmony_ci pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 16958c2ecf20Sopenharmony_ci K((u64)page_counter_read(&memcg->memsw)), 16968c2ecf20Sopenharmony_ci K((u64)memcg->memsw.max), memcg->memsw.failcnt); 16978c2ecf20Sopenharmony_ci pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 16988c2ecf20Sopenharmony_ci K((u64)page_counter_read(&memcg->kmem)), 16998c2ecf20Sopenharmony_ci K((u64)memcg->kmem.max), memcg->kmem.failcnt); 17008c2ecf20Sopenharmony_ci } 17018c2ecf20Sopenharmony_ci 17028c2ecf20Sopenharmony_ci pr_info("Memory cgroup stats for "); 17038c2ecf20Sopenharmony_ci pr_cont_cgroup_path(memcg->css.cgroup); 17048c2ecf20Sopenharmony_ci pr_cont(":"); 17058c2ecf20Sopenharmony_ci buf = memory_stat_format(memcg); 17068c2ecf20Sopenharmony_ci if (!buf) 17078c2ecf20Sopenharmony_ci return; 17088c2ecf20Sopenharmony_ci pr_info("%s", buf); 17098c2ecf20Sopenharmony_ci kfree(buf); 17108c2ecf20Sopenharmony_ci} 17118c2ecf20Sopenharmony_ci 17128c2ecf20Sopenharmony_ci/* 17138c2ecf20Sopenharmony_ci * Return the memory (and swap, if configured) limit for a memcg. 
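 *
 * Roughly (see the body below for the precise conditions):
 *
 *	v2: max = memory.max + min(swap.max, total_swap_pages)
 *	v1: max = memory.max + min(memsw.max - memory.max, total_swap_pages)
 *
 * where the swap term is only added when swappiness is non-zero.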
17148c2ecf20Sopenharmony_ci */ 17158c2ecf20Sopenharmony_ciunsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) 17168c2ecf20Sopenharmony_ci{ 17178c2ecf20Sopenharmony_ci unsigned long max = READ_ONCE(memcg->memory.max); 17188c2ecf20Sopenharmony_ci 17198c2ecf20Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 17208c2ecf20Sopenharmony_ci if (mem_cgroup_swappiness(memcg)) 17218c2ecf20Sopenharmony_ci max += min(READ_ONCE(memcg->swap.max), 17228c2ecf20Sopenharmony_ci (unsigned long)total_swap_pages); 17238c2ecf20Sopenharmony_ci } else { /* v1 */ 17248c2ecf20Sopenharmony_ci if (mem_cgroup_swappiness(memcg)) { 17258c2ecf20Sopenharmony_ci /* Calculate swap excess capacity from memsw limit */ 17268c2ecf20Sopenharmony_ci unsigned long swap = READ_ONCE(memcg->memsw.max) - max; 17278c2ecf20Sopenharmony_ci 17288c2ecf20Sopenharmony_ci max += min(swap, (unsigned long)total_swap_pages); 17298c2ecf20Sopenharmony_ci } 17308c2ecf20Sopenharmony_ci } 17318c2ecf20Sopenharmony_ci return max; 17328c2ecf20Sopenharmony_ci} 17338c2ecf20Sopenharmony_ci 17348c2ecf20Sopenharmony_ciunsigned long mem_cgroup_size(struct mem_cgroup *memcg) 17358c2ecf20Sopenharmony_ci{ 17368c2ecf20Sopenharmony_ci return page_counter_read(&memcg->memory); 17378c2ecf20Sopenharmony_ci} 17388c2ecf20Sopenharmony_ci 17398c2ecf20Sopenharmony_cistatic bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 17408c2ecf20Sopenharmony_ci int order) 17418c2ecf20Sopenharmony_ci{ 17428c2ecf20Sopenharmony_ci struct oom_control oc = { 17438c2ecf20Sopenharmony_ci .zonelist = NULL, 17448c2ecf20Sopenharmony_ci .nodemask = NULL, 17458c2ecf20Sopenharmony_ci .memcg = memcg, 17468c2ecf20Sopenharmony_ci .gfp_mask = gfp_mask, 17478c2ecf20Sopenharmony_ci .order = order, 17488c2ecf20Sopenharmony_ci }; 17498c2ecf20Sopenharmony_ci bool ret = true; 17508c2ecf20Sopenharmony_ci 17518c2ecf20Sopenharmony_ci if (mutex_lock_killable(&oom_lock)) 17528c2ecf20Sopenharmony_ci return true; 17538c2ecf20Sopenharmony_ci 17548c2ecf20Sopenharmony_ci if (mem_cgroup_margin(memcg) >= (1 << order)) 17558c2ecf20Sopenharmony_ci goto unlock; 17568c2ecf20Sopenharmony_ci 17578c2ecf20Sopenharmony_ci /* 17588c2ecf20Sopenharmony_ci * A few threads which were not waiting at mutex_lock_killable() can 17598c2ecf20Sopenharmony_ci * fail to bail out. Therefore, check again after holding oom_lock. 
17608c2ecf20Sopenharmony_ci */
17618c2ecf20Sopenharmony_ci ret = task_is_dying() || out_of_memory(&oc);
17628c2ecf20Sopenharmony_ci
17638c2ecf20Sopenharmony_ciunlock:
17648c2ecf20Sopenharmony_ci mutex_unlock(&oom_lock);
17658c2ecf20Sopenharmony_ci return ret;
17668c2ecf20Sopenharmony_ci}
17678c2ecf20Sopenharmony_ci
17688c2ecf20Sopenharmony_cistatic int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
17698c2ecf20Sopenharmony_ci pg_data_t *pgdat,
17708c2ecf20Sopenharmony_ci gfp_t gfp_mask,
17718c2ecf20Sopenharmony_ci unsigned long *total_scanned)
17728c2ecf20Sopenharmony_ci{
17738c2ecf20Sopenharmony_ci struct mem_cgroup *victim = NULL;
17748c2ecf20Sopenharmony_ci int total = 0;
17758c2ecf20Sopenharmony_ci int loop = 0;
17768c2ecf20Sopenharmony_ci unsigned long excess;
17778c2ecf20Sopenharmony_ci unsigned long nr_scanned;
17788c2ecf20Sopenharmony_ci struct mem_cgroup_reclaim_cookie reclaim = {
17798c2ecf20Sopenharmony_ci .pgdat = pgdat,
17808c2ecf20Sopenharmony_ci };
17818c2ecf20Sopenharmony_ci
17828c2ecf20Sopenharmony_ci excess = soft_limit_excess(root_memcg);
17838c2ecf20Sopenharmony_ci
17848c2ecf20Sopenharmony_ci while (1) {
17858c2ecf20Sopenharmony_ci victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
17868c2ecf20Sopenharmony_ci if (!victim) {
17878c2ecf20Sopenharmony_ci loop++;
17888c2ecf20Sopenharmony_ci if (loop >= 2) {
17898c2ecf20Sopenharmony_ci /*
17908c2ecf20Sopenharmony_ci * If we have not been able to reclaim
17918c2ecf20Sopenharmony_ci * anything, it might be because there are
17928c2ecf20Sopenharmony_ci * no reclaimable pages under this hierarchy
17938c2ecf20Sopenharmony_ci */
17948c2ecf20Sopenharmony_ci if (!total)
17958c2ecf20Sopenharmony_ci break;
17968c2ecf20Sopenharmony_ci /*
17978c2ecf20Sopenharmony_ci * We want to do more targeted reclaim.
17988c2ecf20Sopenharmony_ci * excess >> 2 is not too excessive, so we do not
17998c2ecf20Sopenharmony_ci * reclaim too much, nor too little, which would keep
18008c2ecf20Sopenharmony_ci * us coming back to reclaim from this cgroup
18018c2ecf20Sopenharmony_ci */
18028c2ecf20Sopenharmony_ci if (total >= (excess >> 2) ||
18038c2ecf20Sopenharmony_ci (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
18048c2ecf20Sopenharmony_ci break;
18058c2ecf20Sopenharmony_ci }
18068c2ecf20Sopenharmony_ci continue;
18078c2ecf20Sopenharmony_ci }
18088c2ecf20Sopenharmony_ci total += mem_cgroup_shrink_node(victim, gfp_mask, false,
18098c2ecf20Sopenharmony_ci pgdat, &nr_scanned);
18108c2ecf20Sopenharmony_ci *total_scanned += nr_scanned;
18118c2ecf20Sopenharmony_ci if (!soft_limit_excess(root_memcg))
18128c2ecf20Sopenharmony_ci break;
18138c2ecf20Sopenharmony_ci }
18148c2ecf20Sopenharmony_ci mem_cgroup_iter_break(root_memcg, victim);
18158c2ecf20Sopenharmony_ci return total;
18168c2ecf20Sopenharmony_ci}
18178c2ecf20Sopenharmony_ci
18188c2ecf20Sopenharmony_ci#ifdef CONFIG_LOCKDEP
18198c2ecf20Sopenharmony_cistatic struct lockdep_map memcg_oom_lock_dep_map = {
18208c2ecf20Sopenharmony_ci .name = "memcg_oom_lock",
18218c2ecf20Sopenharmony_ci};
18228c2ecf20Sopenharmony_ci#endif
18238c2ecf20Sopenharmony_ci
18248c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(memcg_oom_lock);
18258c2ecf20Sopenharmony_ci
18268c2ecf20Sopenharmony_ci/*
18278c2ecf20Sopenharmony_ci * Check whether the OOM killer is already running under our hierarchy.
18288c2ecf20Sopenharmony_ci * If someone is running, return false.
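 *
 * Pairs with mem_cgroup_oom_unlock(); both walk the hierarchy under
 * memcg_oom_lock. Usage sketch (see mem_cgroup_oom() below):
 *
 *	locked = mem_cgroup_oom_trylock(memcg);
 *	... notify / run the OOM killer ...
 *	if (locked)
 *		mem_cgroup_oom_unlock(memcg);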
18298c2ecf20Sopenharmony_ci */
18308c2ecf20Sopenharmony_cistatic bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
18318c2ecf20Sopenharmony_ci{
18328c2ecf20Sopenharmony_ci struct mem_cgroup *iter, *failed = NULL;
18338c2ecf20Sopenharmony_ci
18348c2ecf20Sopenharmony_ci spin_lock(&memcg_oom_lock);
18358c2ecf20Sopenharmony_ci
18368c2ecf20Sopenharmony_ci for_each_mem_cgroup_tree(iter, memcg) {
18378c2ecf20Sopenharmony_ci if (iter->oom_lock) {
18388c2ecf20Sopenharmony_ci /*
18398c2ecf20Sopenharmony_ci * this subtree of our hierarchy is already locked
18408c2ecf20Sopenharmony_ci * so we cannot give a lock.
18418c2ecf20Sopenharmony_ci */
18428c2ecf20Sopenharmony_ci failed = iter;
18438c2ecf20Sopenharmony_ci mem_cgroup_iter_break(memcg, iter);
18448c2ecf20Sopenharmony_ci break;
18458c2ecf20Sopenharmony_ci } else
18468c2ecf20Sopenharmony_ci iter->oom_lock = true;
18478c2ecf20Sopenharmony_ci }
18488c2ecf20Sopenharmony_ci
18498c2ecf20Sopenharmony_ci if (failed) {
18508c2ecf20Sopenharmony_ci /*
18518c2ecf20Sopenharmony_ci * OK, we failed to lock the whole subtree so we have
18528c2ecf20Sopenharmony_ci * to clean up what we set up, up to the failing subtree
18538c2ecf20Sopenharmony_ci */
18548c2ecf20Sopenharmony_ci for_each_mem_cgroup_tree(iter, memcg) {
18558c2ecf20Sopenharmony_ci if (iter == failed) {
18568c2ecf20Sopenharmony_ci mem_cgroup_iter_break(memcg, iter);
18578c2ecf20Sopenharmony_ci break;
18588c2ecf20Sopenharmony_ci }
18598c2ecf20Sopenharmony_ci iter->oom_lock = false;
18608c2ecf20Sopenharmony_ci }
18618c2ecf20Sopenharmony_ci } else
18628c2ecf20Sopenharmony_ci mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
18638c2ecf20Sopenharmony_ci
18648c2ecf20Sopenharmony_ci spin_unlock(&memcg_oom_lock);
18658c2ecf20Sopenharmony_ci
18668c2ecf20Sopenharmony_ci return !failed;
18678c2ecf20Sopenharmony_ci}
18688c2ecf20Sopenharmony_ci
18698c2ecf20Sopenharmony_cistatic void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
18708c2ecf20Sopenharmony_ci{
18718c2ecf20Sopenharmony_ci struct mem_cgroup *iter;
18728c2ecf20Sopenharmony_ci
18738c2ecf20Sopenharmony_ci spin_lock(&memcg_oom_lock);
18748c2ecf20Sopenharmony_ci mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
18758c2ecf20Sopenharmony_ci for_each_mem_cgroup_tree(iter, memcg)
18768c2ecf20Sopenharmony_ci iter->oom_lock = false;
18778c2ecf20Sopenharmony_ci spin_unlock(&memcg_oom_lock);
18788c2ecf20Sopenharmony_ci}
18798c2ecf20Sopenharmony_ci
18808c2ecf20Sopenharmony_cistatic void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
18818c2ecf20Sopenharmony_ci{
18828c2ecf20Sopenharmony_ci struct mem_cgroup *iter;
18838c2ecf20Sopenharmony_ci
18848c2ecf20Sopenharmony_ci spin_lock(&memcg_oom_lock);
18858c2ecf20Sopenharmony_ci for_each_mem_cgroup_tree(iter, memcg)
18868c2ecf20Sopenharmony_ci iter->under_oom++;
18878c2ecf20Sopenharmony_ci spin_unlock(&memcg_oom_lock);
18888c2ecf20Sopenharmony_ci}
18898c2ecf20Sopenharmony_ci
18908c2ecf20Sopenharmony_cistatic void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
18918c2ecf20Sopenharmony_ci{
18928c2ecf20Sopenharmony_ci struct mem_cgroup *iter;
18938c2ecf20Sopenharmony_ci
18948c2ecf20Sopenharmony_ci /*
18958c2ecf20Sopenharmony_ci * Be careful about under_oom underflows because a child memcg
18968c2ecf20Sopenharmony_ci * could have been added after mem_cgroup_mark_under_oom.
18978c2ecf20Sopenharmony_ci */ 18988c2ecf20Sopenharmony_ci spin_lock(&memcg_oom_lock); 18998c2ecf20Sopenharmony_ci for_each_mem_cgroup_tree(iter, memcg) 19008c2ecf20Sopenharmony_ci if (iter->under_oom > 0) 19018c2ecf20Sopenharmony_ci iter->under_oom--; 19028c2ecf20Sopenharmony_ci spin_unlock(&memcg_oom_lock); 19038c2ecf20Sopenharmony_ci} 19048c2ecf20Sopenharmony_ci 19058c2ecf20Sopenharmony_cistatic DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 19068c2ecf20Sopenharmony_ci 19078c2ecf20Sopenharmony_cistruct oom_wait_info { 19088c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 19098c2ecf20Sopenharmony_ci wait_queue_entry_t wait; 19108c2ecf20Sopenharmony_ci}; 19118c2ecf20Sopenharmony_ci 19128c2ecf20Sopenharmony_cistatic int memcg_oom_wake_function(wait_queue_entry_t *wait, 19138c2ecf20Sopenharmony_ci unsigned mode, int sync, void *arg) 19148c2ecf20Sopenharmony_ci{ 19158c2ecf20Sopenharmony_ci struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 19168c2ecf20Sopenharmony_ci struct mem_cgroup *oom_wait_memcg; 19178c2ecf20Sopenharmony_ci struct oom_wait_info *oom_wait_info; 19188c2ecf20Sopenharmony_ci 19198c2ecf20Sopenharmony_ci oom_wait_info = container_of(wait, struct oom_wait_info, wait); 19208c2ecf20Sopenharmony_ci oom_wait_memcg = oom_wait_info->memcg; 19218c2ecf20Sopenharmony_ci 19228c2ecf20Sopenharmony_ci if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 19238c2ecf20Sopenharmony_ci !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 19248c2ecf20Sopenharmony_ci return 0; 19258c2ecf20Sopenharmony_ci return autoremove_wake_function(wait, mode, sync, arg); 19268c2ecf20Sopenharmony_ci} 19278c2ecf20Sopenharmony_ci 19288c2ecf20Sopenharmony_cistatic void memcg_oom_recover(struct mem_cgroup *memcg) 19298c2ecf20Sopenharmony_ci{ 19308c2ecf20Sopenharmony_ci /* 19318c2ecf20Sopenharmony_ci * For the following lockless ->under_oom test, the only required 19328c2ecf20Sopenharmony_ci * guarantee is that it must see the state asserted by an OOM when 19338c2ecf20Sopenharmony_ci * this function is called as a result of userland actions 19348c2ecf20Sopenharmony_ci * triggered by the notification of the OOM. This is trivially 19358c2ecf20Sopenharmony_ci * achieved by invoking mem_cgroup_mark_under_oom() before 19368c2ecf20Sopenharmony_ci * triggering notification. 
19378c2ecf20Sopenharmony_ci */ 19388c2ecf20Sopenharmony_ci if (memcg && memcg->under_oom) 19398c2ecf20Sopenharmony_ci __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 19408c2ecf20Sopenharmony_ci} 19418c2ecf20Sopenharmony_ci 19428c2ecf20Sopenharmony_cienum oom_status { 19438c2ecf20Sopenharmony_ci OOM_SUCCESS, 19448c2ecf20Sopenharmony_ci OOM_FAILED, 19458c2ecf20Sopenharmony_ci OOM_ASYNC, 19468c2ecf20Sopenharmony_ci OOM_SKIPPED 19478c2ecf20Sopenharmony_ci}; 19488c2ecf20Sopenharmony_ci 19498c2ecf20Sopenharmony_cistatic enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 19508c2ecf20Sopenharmony_ci{ 19518c2ecf20Sopenharmony_ci enum oom_status ret; 19528c2ecf20Sopenharmony_ci bool locked; 19538c2ecf20Sopenharmony_ci 19548c2ecf20Sopenharmony_ci if (order > PAGE_ALLOC_COSTLY_ORDER) 19558c2ecf20Sopenharmony_ci return OOM_SKIPPED; 19568c2ecf20Sopenharmony_ci 19578c2ecf20Sopenharmony_ci memcg_memory_event(memcg, MEMCG_OOM); 19588c2ecf20Sopenharmony_ci 19598c2ecf20Sopenharmony_ci /* 19608c2ecf20Sopenharmony_ci * We are in the middle of the charge context here, so we 19618c2ecf20Sopenharmony_ci * don't want to block when potentially sitting on a callstack 19628c2ecf20Sopenharmony_ci * that holds all kinds of filesystem and mm locks. 19638c2ecf20Sopenharmony_ci * 19648c2ecf20Sopenharmony_ci * cgroup1 allows disabling the OOM killer and waiting for outside 19658c2ecf20Sopenharmony_ci * handling until the charge can succeed; remember the context and put 19668c2ecf20Sopenharmony_ci * the task to sleep at the end of the page fault when all locks are 19678c2ecf20Sopenharmony_ci * released. 19688c2ecf20Sopenharmony_ci * 19698c2ecf20Sopenharmony_ci * On the other hand, in-kernel OOM killer allows for an async victim 19708c2ecf20Sopenharmony_ci * memory reclaim (oom_reaper) and that means that we are not solely 19718c2ecf20Sopenharmony_ci * relying on the oom victim to make a forward progress and we can 19728c2ecf20Sopenharmony_ci * invoke the oom killer here. 19738c2ecf20Sopenharmony_ci * 19748c2ecf20Sopenharmony_ci * Please note that mem_cgroup_out_of_memory might fail to find a 19758c2ecf20Sopenharmony_ci * victim and then we have to bail out from the charge path. 
19768c2ecf20Sopenharmony_ci */ 19778c2ecf20Sopenharmony_ci if (memcg->oom_kill_disable) { 19788c2ecf20Sopenharmony_ci if (!current->in_user_fault) 19798c2ecf20Sopenharmony_ci return OOM_SKIPPED; 19808c2ecf20Sopenharmony_ci css_get(&memcg->css); 19818c2ecf20Sopenharmony_ci current->memcg_in_oom = memcg; 19828c2ecf20Sopenharmony_ci current->memcg_oom_gfp_mask = mask; 19838c2ecf20Sopenharmony_ci current->memcg_oom_order = order; 19848c2ecf20Sopenharmony_ci 19858c2ecf20Sopenharmony_ci return OOM_ASYNC; 19868c2ecf20Sopenharmony_ci } 19878c2ecf20Sopenharmony_ci 19888c2ecf20Sopenharmony_ci mem_cgroup_mark_under_oom(memcg); 19898c2ecf20Sopenharmony_ci 19908c2ecf20Sopenharmony_ci locked = mem_cgroup_oom_trylock(memcg); 19918c2ecf20Sopenharmony_ci 19928c2ecf20Sopenharmony_ci if (locked) 19938c2ecf20Sopenharmony_ci mem_cgroup_oom_notify(memcg); 19948c2ecf20Sopenharmony_ci 19958c2ecf20Sopenharmony_ci mem_cgroup_unmark_under_oom(memcg); 19968c2ecf20Sopenharmony_ci if (mem_cgroup_out_of_memory(memcg, mask, order)) 19978c2ecf20Sopenharmony_ci ret = OOM_SUCCESS; 19988c2ecf20Sopenharmony_ci else 19998c2ecf20Sopenharmony_ci ret = OOM_FAILED; 20008c2ecf20Sopenharmony_ci 20018c2ecf20Sopenharmony_ci if (locked) 20028c2ecf20Sopenharmony_ci mem_cgroup_oom_unlock(memcg); 20038c2ecf20Sopenharmony_ci 20048c2ecf20Sopenharmony_ci return ret; 20058c2ecf20Sopenharmony_ci} 20068c2ecf20Sopenharmony_ci 20078c2ecf20Sopenharmony_ci/** 20088c2ecf20Sopenharmony_ci * mem_cgroup_oom_synchronize - complete memcg OOM handling 20098c2ecf20Sopenharmony_ci * @handle: actually kill/wait or just clean up the OOM state 20108c2ecf20Sopenharmony_ci * 20118c2ecf20Sopenharmony_ci * This has to be called at the end of a page fault if the memcg OOM 20128c2ecf20Sopenharmony_ci * handler was enabled. 20138c2ecf20Sopenharmony_ci * 20148c2ecf20Sopenharmony_ci * Memcg supports userspace OOM handling where failed allocations must 20158c2ecf20Sopenharmony_ci * sleep on a waitqueue until the userspace task resolves the 20168c2ecf20Sopenharmony_ci * situation. Sleeping directly in the charge context with all kinds 20178c2ecf20Sopenharmony_ci * of locks held is not a good idea, instead we remember an OOM state 20188c2ecf20Sopenharmony_ci * in the task and mem_cgroup_oom_synchronize() has to be called at 20198c2ecf20Sopenharmony_ci * the end of the page fault to complete the OOM handling. 20208c2ecf20Sopenharmony_ci * 20218c2ecf20Sopenharmony_ci * Returns %true if an ongoing memcg OOM situation was detected and 20228c2ecf20Sopenharmony_ci * completed, %false otherwise. 
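 *
 * Sketch of the expected call at the end of the fault path (the real
 * caller lives outside this file, in pagefault_out_of_memory()):
 *
 *	if (mem_cgroup_oom_synchronize(true))
 *		return;
 *
 * A %true return means the memcg OOM situation was handled and the
 * fault can simply be retried.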
20238c2ecf20Sopenharmony_ci */
20248c2ecf20Sopenharmony_cibool mem_cgroup_oom_synchronize(bool handle)
20258c2ecf20Sopenharmony_ci{
20268c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = current->memcg_in_oom;
20278c2ecf20Sopenharmony_ci struct oom_wait_info owait;
20288c2ecf20Sopenharmony_ci bool locked;
20298c2ecf20Sopenharmony_ci
20308c2ecf20Sopenharmony_ci /* OOM is global, do not handle */
20318c2ecf20Sopenharmony_ci if (!memcg)
20328c2ecf20Sopenharmony_ci return false;
20338c2ecf20Sopenharmony_ci
20348c2ecf20Sopenharmony_ci if (!handle)
20358c2ecf20Sopenharmony_ci goto cleanup;
20368c2ecf20Sopenharmony_ci
20378c2ecf20Sopenharmony_ci owait.memcg = memcg;
20388c2ecf20Sopenharmony_ci owait.wait.flags = 0;
20398c2ecf20Sopenharmony_ci owait.wait.func = memcg_oom_wake_function;
20408c2ecf20Sopenharmony_ci owait.wait.private = current;
20418c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&owait.wait.entry);
20428c2ecf20Sopenharmony_ci
20438c2ecf20Sopenharmony_ci prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
20448c2ecf20Sopenharmony_ci mem_cgroup_mark_under_oom(memcg);
20458c2ecf20Sopenharmony_ci
20468c2ecf20Sopenharmony_ci locked = mem_cgroup_oom_trylock(memcg);
20478c2ecf20Sopenharmony_ci
20488c2ecf20Sopenharmony_ci if (locked)
20498c2ecf20Sopenharmony_ci mem_cgroup_oom_notify(memcg);
20508c2ecf20Sopenharmony_ci
20518c2ecf20Sopenharmony_ci if (locked && !memcg->oom_kill_disable) {
20528c2ecf20Sopenharmony_ci mem_cgroup_unmark_under_oom(memcg);
20538c2ecf20Sopenharmony_ci finish_wait(&memcg_oom_waitq, &owait.wait);
20548c2ecf20Sopenharmony_ci mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
20558c2ecf20Sopenharmony_ci current->memcg_oom_order);
20568c2ecf20Sopenharmony_ci } else {
20578c2ecf20Sopenharmony_ci schedule();
20588c2ecf20Sopenharmony_ci mem_cgroup_unmark_under_oom(memcg);
20598c2ecf20Sopenharmony_ci finish_wait(&memcg_oom_waitq, &owait.wait);
20608c2ecf20Sopenharmony_ci }
20618c2ecf20Sopenharmony_ci
20628c2ecf20Sopenharmony_ci if (locked) {
20638c2ecf20Sopenharmony_ci mem_cgroup_oom_unlock(memcg);
20648c2ecf20Sopenharmony_ci /*
20658c2ecf20Sopenharmony_ci * There is no guarantee that an OOM-lock contender
20668c2ecf20Sopenharmony_ci * sees the wakeups triggered by the OOM kill
20678c2ecf20Sopenharmony_ci * uncharges. Wake any sleepers explicitly.
20688c2ecf20Sopenharmony_ci */
20698c2ecf20Sopenharmony_ci memcg_oom_recover(memcg);
20708c2ecf20Sopenharmony_ci }
20718c2ecf20Sopenharmony_cicleanup:
20728c2ecf20Sopenharmony_ci current->memcg_in_oom = NULL;
20738c2ecf20Sopenharmony_ci css_put(&memcg->css);
20748c2ecf20Sopenharmony_ci return true;
20758c2ecf20Sopenharmony_ci}
20768c2ecf20Sopenharmony_ci
20778c2ecf20Sopenharmony_ci/**
20788c2ecf20Sopenharmony_ci * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
20798c2ecf20Sopenharmony_ci * @victim: task to be killed by the OOM killer
20808c2ecf20Sopenharmony_ci * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
20818c2ecf20Sopenharmony_ci *
20828c2ecf20Sopenharmony_ci * Returns a pointer to a memory cgroup, which has to be cleaned up
20838c2ecf20Sopenharmony_ci * by killing all OOM-killable tasks that belong to it.
20848c2ecf20Sopenharmony_ci *
20858c2ecf20Sopenharmony_ci * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
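 *
 * Expected usage on the OOM-killer side (a sketch; the real user is the
 * OOM killer in mm/oom_kill.c):
 *
 *	oom_group = mem_cgroup_get_oom_group(victim, oom_domain);
 *	if (oom_group) {
 *		mem_cgroup_print_oom_group(oom_group);
 *		... kill every task in the group, e.g. via mem_cgroup_scan_tasks() ...
 *		mem_cgroup_put(oom_group);
 *	}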
20868c2ecf20Sopenharmony_ci */ 20878c2ecf20Sopenharmony_cistruct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 20888c2ecf20Sopenharmony_ci struct mem_cgroup *oom_domain) 20898c2ecf20Sopenharmony_ci{ 20908c2ecf20Sopenharmony_ci struct mem_cgroup *oom_group = NULL; 20918c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 20928c2ecf20Sopenharmony_ci 20938c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 20948c2ecf20Sopenharmony_ci return NULL; 20958c2ecf20Sopenharmony_ci 20968c2ecf20Sopenharmony_ci if (!oom_domain) 20978c2ecf20Sopenharmony_ci oom_domain = root_mem_cgroup; 20988c2ecf20Sopenharmony_ci 20998c2ecf20Sopenharmony_ci rcu_read_lock(); 21008c2ecf20Sopenharmony_ci 21018c2ecf20Sopenharmony_ci memcg = mem_cgroup_from_task(victim); 21028c2ecf20Sopenharmony_ci if (memcg == root_mem_cgroup) 21038c2ecf20Sopenharmony_ci goto out; 21048c2ecf20Sopenharmony_ci 21058c2ecf20Sopenharmony_ci /* 21068c2ecf20Sopenharmony_ci * If the victim task has been asynchronously moved to a different 21078c2ecf20Sopenharmony_ci * memory cgroup, we might end up killing tasks outside oom_domain. 21088c2ecf20Sopenharmony_ci * In this case it's better to ignore memory.group.oom. 21098c2ecf20Sopenharmony_ci */ 21108c2ecf20Sopenharmony_ci if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 21118c2ecf20Sopenharmony_ci goto out; 21128c2ecf20Sopenharmony_ci 21138c2ecf20Sopenharmony_ci /* 21148c2ecf20Sopenharmony_ci * Traverse the memory cgroup hierarchy from the victim task's 21158c2ecf20Sopenharmony_ci * cgroup up to the OOMing cgroup (or root) to find the 21168c2ecf20Sopenharmony_ci * highest-level memory cgroup with oom.group set. 21178c2ecf20Sopenharmony_ci */ 21188c2ecf20Sopenharmony_ci for (; memcg; memcg = parent_mem_cgroup(memcg)) { 21198c2ecf20Sopenharmony_ci if (memcg->oom_group) 21208c2ecf20Sopenharmony_ci oom_group = memcg; 21218c2ecf20Sopenharmony_ci 21228c2ecf20Sopenharmony_ci if (memcg == oom_domain) 21238c2ecf20Sopenharmony_ci break; 21248c2ecf20Sopenharmony_ci } 21258c2ecf20Sopenharmony_ci 21268c2ecf20Sopenharmony_ci if (oom_group) 21278c2ecf20Sopenharmony_ci css_get(&oom_group->css); 21288c2ecf20Sopenharmony_ciout: 21298c2ecf20Sopenharmony_ci rcu_read_unlock(); 21308c2ecf20Sopenharmony_ci 21318c2ecf20Sopenharmony_ci return oom_group; 21328c2ecf20Sopenharmony_ci} 21338c2ecf20Sopenharmony_ci 21348c2ecf20Sopenharmony_civoid mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 21358c2ecf20Sopenharmony_ci{ 21368c2ecf20Sopenharmony_ci pr_info("Tasks in "); 21378c2ecf20Sopenharmony_ci pr_cont_cgroup_path(memcg->css.cgroup); 21388c2ecf20Sopenharmony_ci pr_cont(" are going to be killed due to memory.oom.group set\n"); 21398c2ecf20Sopenharmony_ci} 21408c2ecf20Sopenharmony_ci 21418c2ecf20Sopenharmony_ci/** 21428c2ecf20Sopenharmony_ci * lock_page_memcg - lock a page->mem_cgroup binding 21438c2ecf20Sopenharmony_ci * @page: the page 21448c2ecf20Sopenharmony_ci * 21458c2ecf20Sopenharmony_ci * This function protects unlocked LRU pages from being moved to 21468c2ecf20Sopenharmony_ci * another cgroup. 21478c2ecf20Sopenharmony_ci * 21488c2ecf20Sopenharmony_ci * It ensures lifetime of the returned memcg. Caller is responsible 21498c2ecf20Sopenharmony_ci * for the lifetime of the page; __unlock_page_memcg() is available 21508c2ecf20Sopenharmony_ci * when @page might get freed inside the locked section. 
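 *
 * Typical pairing (illustrative); use __unlock_page_memcg(memcg) instead
 * of unlock_page_memcg(page) when the page may already be freed:
 *
 *	memcg = lock_page_memcg(page);
 *	... update per-memcg page state ...
 *	unlock_page_memcg(page);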
21518c2ecf20Sopenharmony_ci */ 21528c2ecf20Sopenharmony_cistruct mem_cgroup *lock_page_memcg(struct page *page) 21538c2ecf20Sopenharmony_ci{ 21548c2ecf20Sopenharmony_ci struct page *head = compound_head(page); /* rmap on tail pages */ 21558c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 21568c2ecf20Sopenharmony_ci unsigned long flags; 21578c2ecf20Sopenharmony_ci 21588c2ecf20Sopenharmony_ci /* 21598c2ecf20Sopenharmony_ci * The RCU lock is held throughout the transaction. The fast 21608c2ecf20Sopenharmony_ci * path can get away without acquiring the memcg->move_lock 21618c2ecf20Sopenharmony_ci * because page moving starts with an RCU grace period. 21628c2ecf20Sopenharmony_ci * 21638c2ecf20Sopenharmony_ci * The RCU lock also protects the memcg from being freed when 21648c2ecf20Sopenharmony_ci * the page state that is going to change is the only thing 21658c2ecf20Sopenharmony_ci * preventing the page itself from being freed. E.g. writeback 21668c2ecf20Sopenharmony_ci * doesn't hold a page reference and relies on PG_writeback to 21678c2ecf20Sopenharmony_ci * keep off truncation, migration and so forth. 21688c2ecf20Sopenharmony_ci */ 21698c2ecf20Sopenharmony_ci rcu_read_lock(); 21708c2ecf20Sopenharmony_ci 21718c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 21728c2ecf20Sopenharmony_ci return NULL; 21738c2ecf20Sopenharmony_ciagain: 21748c2ecf20Sopenharmony_ci memcg = head->mem_cgroup; 21758c2ecf20Sopenharmony_ci if (unlikely(!memcg)) 21768c2ecf20Sopenharmony_ci return NULL; 21778c2ecf20Sopenharmony_ci 21788c2ecf20Sopenharmony_ci if (atomic_read(&memcg->moving_account) <= 0) 21798c2ecf20Sopenharmony_ci return memcg; 21808c2ecf20Sopenharmony_ci 21818c2ecf20Sopenharmony_ci spin_lock_irqsave(&memcg->move_lock, flags); 21828c2ecf20Sopenharmony_ci if (memcg != head->mem_cgroup) { 21838c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&memcg->move_lock, flags); 21848c2ecf20Sopenharmony_ci goto again; 21858c2ecf20Sopenharmony_ci } 21868c2ecf20Sopenharmony_ci 21878c2ecf20Sopenharmony_ci /* 21888c2ecf20Sopenharmony_ci * When charge migration first begins, we can have locked and 21898c2ecf20Sopenharmony_ci * unlocked page stat updates happening concurrently. Track 21908c2ecf20Sopenharmony_ci * the task who has the lock for unlock_page_memcg(). 21918c2ecf20Sopenharmony_ci */ 21928c2ecf20Sopenharmony_ci memcg->move_lock_task = current; 21938c2ecf20Sopenharmony_ci memcg->move_lock_flags = flags; 21948c2ecf20Sopenharmony_ci 21958c2ecf20Sopenharmony_ci return memcg; 21968c2ecf20Sopenharmony_ci} 21978c2ecf20Sopenharmony_ciEXPORT_SYMBOL(lock_page_memcg); 21988c2ecf20Sopenharmony_ci 21998c2ecf20Sopenharmony_ci/** 22008c2ecf20Sopenharmony_ci * __unlock_page_memcg - unlock and unpin a memcg 22018c2ecf20Sopenharmony_ci * @memcg: the memcg 22028c2ecf20Sopenharmony_ci * 22038c2ecf20Sopenharmony_ci * Unlock and unpin a memcg returned by lock_page_memcg(). 
22048c2ecf20Sopenharmony_ci */ 22058c2ecf20Sopenharmony_civoid __unlock_page_memcg(struct mem_cgroup *memcg) 22068c2ecf20Sopenharmony_ci{ 22078c2ecf20Sopenharmony_ci if (memcg && memcg->move_lock_task == current) { 22088c2ecf20Sopenharmony_ci unsigned long flags = memcg->move_lock_flags; 22098c2ecf20Sopenharmony_ci 22108c2ecf20Sopenharmony_ci memcg->move_lock_task = NULL; 22118c2ecf20Sopenharmony_ci memcg->move_lock_flags = 0; 22128c2ecf20Sopenharmony_ci 22138c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&memcg->move_lock, flags); 22148c2ecf20Sopenharmony_ci } 22158c2ecf20Sopenharmony_ci 22168c2ecf20Sopenharmony_ci rcu_read_unlock(); 22178c2ecf20Sopenharmony_ci} 22188c2ecf20Sopenharmony_ci 22198c2ecf20Sopenharmony_ci/** 22208c2ecf20Sopenharmony_ci * unlock_page_memcg - unlock a page->mem_cgroup binding 22218c2ecf20Sopenharmony_ci * @page: the page 22228c2ecf20Sopenharmony_ci */ 22238c2ecf20Sopenharmony_civoid unlock_page_memcg(struct page *page) 22248c2ecf20Sopenharmony_ci{ 22258c2ecf20Sopenharmony_ci struct page *head = compound_head(page); 22268c2ecf20Sopenharmony_ci 22278c2ecf20Sopenharmony_ci __unlock_page_memcg(head->mem_cgroup); 22288c2ecf20Sopenharmony_ci} 22298c2ecf20Sopenharmony_ciEXPORT_SYMBOL(unlock_page_memcg); 22308c2ecf20Sopenharmony_ci 22318c2ecf20Sopenharmony_cistruct memcg_stock_pcp { 22328c2ecf20Sopenharmony_ci struct mem_cgroup *cached; /* this never be root cgroup */ 22338c2ecf20Sopenharmony_ci unsigned int nr_pages; 22348c2ecf20Sopenharmony_ci 22358c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM 22368c2ecf20Sopenharmony_ci struct obj_cgroup *cached_objcg; 22378c2ecf20Sopenharmony_ci unsigned int nr_bytes; 22388c2ecf20Sopenharmony_ci#endif 22398c2ecf20Sopenharmony_ci 22408c2ecf20Sopenharmony_ci struct work_struct work; 22418c2ecf20Sopenharmony_ci unsigned long flags; 22428c2ecf20Sopenharmony_ci#define FLUSHING_CACHED_CHARGE 0 22438c2ecf20Sopenharmony_ci}; 22448c2ecf20Sopenharmony_cistatic DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 22458c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(percpu_charge_mutex); 22468c2ecf20Sopenharmony_ci 22478c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM 22488c2ecf20Sopenharmony_cistatic void drain_obj_stock(struct memcg_stock_pcp *stock); 22498c2ecf20Sopenharmony_cistatic bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 22508c2ecf20Sopenharmony_ci struct mem_cgroup *root_memcg); 22518c2ecf20Sopenharmony_ci 22528c2ecf20Sopenharmony_ci#else 22538c2ecf20Sopenharmony_cistatic inline void drain_obj_stock(struct memcg_stock_pcp *stock) 22548c2ecf20Sopenharmony_ci{ 22558c2ecf20Sopenharmony_ci} 22568c2ecf20Sopenharmony_cistatic bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 22578c2ecf20Sopenharmony_ci struct mem_cgroup *root_memcg) 22588c2ecf20Sopenharmony_ci{ 22598c2ecf20Sopenharmony_ci return false; 22608c2ecf20Sopenharmony_ci} 22618c2ecf20Sopenharmony_ci#endif 22628c2ecf20Sopenharmony_ci 22638c2ecf20Sopenharmony_ci/** 22648c2ecf20Sopenharmony_ci * consume_stock: Try to consume stocked charge on this cpu. 22658c2ecf20Sopenharmony_ci * @memcg: memcg to consume from. 22668c2ecf20Sopenharmony_ci * @nr_pages: how many pages to charge. 22678c2ecf20Sopenharmony_ci * 22688c2ecf20Sopenharmony_ci * The charges will only happen if @memcg matches the current cpu's memcg 22698c2ecf20Sopenharmony_ci * stock, and at least @nr_pages are available in that stock. Failure to 22708c2ecf20Sopenharmony_ci * service an allocation will refill the stock. 
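 *
 * For illustration (editor's example): with a full MEMCG_CHARGE_BATCH
 * stocked for this memcg on the local CPU, a request for 3 pages is
 * served from the stock; a request larger than MEMCG_CHARGE_BATCH is
 * never served from the stock (see the check below).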
22718c2ecf20Sopenharmony_ci * 22728c2ecf20Sopenharmony_ci * returns true if successful, false otherwise. 22738c2ecf20Sopenharmony_ci */ 22748c2ecf20Sopenharmony_cistatic bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 22758c2ecf20Sopenharmony_ci{ 22768c2ecf20Sopenharmony_ci struct memcg_stock_pcp *stock; 22778c2ecf20Sopenharmony_ci unsigned long flags; 22788c2ecf20Sopenharmony_ci bool ret = false; 22798c2ecf20Sopenharmony_ci 22808c2ecf20Sopenharmony_ci if (nr_pages > MEMCG_CHARGE_BATCH) 22818c2ecf20Sopenharmony_ci return ret; 22828c2ecf20Sopenharmony_ci 22838c2ecf20Sopenharmony_ci local_irq_save(flags); 22848c2ecf20Sopenharmony_ci 22858c2ecf20Sopenharmony_ci stock = this_cpu_ptr(&memcg_stock); 22868c2ecf20Sopenharmony_ci if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 22878c2ecf20Sopenharmony_ci stock->nr_pages -= nr_pages; 22888c2ecf20Sopenharmony_ci ret = true; 22898c2ecf20Sopenharmony_ci } 22908c2ecf20Sopenharmony_ci 22918c2ecf20Sopenharmony_ci local_irq_restore(flags); 22928c2ecf20Sopenharmony_ci 22938c2ecf20Sopenharmony_ci return ret; 22948c2ecf20Sopenharmony_ci} 22958c2ecf20Sopenharmony_ci 22968c2ecf20Sopenharmony_ci/* 22978c2ecf20Sopenharmony_ci * Returns stocks cached in percpu and reset cached information. 22988c2ecf20Sopenharmony_ci */ 22998c2ecf20Sopenharmony_cistatic void drain_stock(struct memcg_stock_pcp *stock) 23008c2ecf20Sopenharmony_ci{ 23018c2ecf20Sopenharmony_ci struct mem_cgroup *old = stock->cached; 23028c2ecf20Sopenharmony_ci 23038c2ecf20Sopenharmony_ci if (!old) 23048c2ecf20Sopenharmony_ci return; 23058c2ecf20Sopenharmony_ci 23068c2ecf20Sopenharmony_ci if (stock->nr_pages) { 23078c2ecf20Sopenharmony_ci page_counter_uncharge(&old->memory, stock->nr_pages); 23088c2ecf20Sopenharmony_ci if (do_memsw_account()) 23098c2ecf20Sopenharmony_ci page_counter_uncharge(&old->memsw, stock->nr_pages); 23108c2ecf20Sopenharmony_ci stock->nr_pages = 0; 23118c2ecf20Sopenharmony_ci } 23128c2ecf20Sopenharmony_ci 23138c2ecf20Sopenharmony_ci css_put(&old->css); 23148c2ecf20Sopenharmony_ci stock->cached = NULL; 23158c2ecf20Sopenharmony_ci} 23168c2ecf20Sopenharmony_ci 23178c2ecf20Sopenharmony_cistatic void drain_local_stock(struct work_struct *dummy) 23188c2ecf20Sopenharmony_ci{ 23198c2ecf20Sopenharmony_ci struct memcg_stock_pcp *stock; 23208c2ecf20Sopenharmony_ci unsigned long flags; 23218c2ecf20Sopenharmony_ci 23228c2ecf20Sopenharmony_ci /* 23238c2ecf20Sopenharmony_ci * The only protection from memory hotplug vs. drain_stock races is 23248c2ecf20Sopenharmony_ci * that we always operate on local CPU stock here with IRQ disabled 23258c2ecf20Sopenharmony_ci */ 23268c2ecf20Sopenharmony_ci local_irq_save(flags); 23278c2ecf20Sopenharmony_ci 23288c2ecf20Sopenharmony_ci stock = this_cpu_ptr(&memcg_stock); 23298c2ecf20Sopenharmony_ci drain_obj_stock(stock); 23308c2ecf20Sopenharmony_ci drain_stock(stock); 23318c2ecf20Sopenharmony_ci clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 23328c2ecf20Sopenharmony_ci 23338c2ecf20Sopenharmony_ci local_irq_restore(flags); 23348c2ecf20Sopenharmony_ci} 23358c2ecf20Sopenharmony_ci 23368c2ecf20Sopenharmony_ci/* 23378c2ecf20Sopenharmony_ci * Cache charges(val) to local per_cpu area. 23388c2ecf20Sopenharmony_ci * This will be consumed by consume_stock() function, later. 
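 *
 * Editor's note (illustrative): try_charge() charges a full
 * MEMCG_CHARGE_BATCH to the page counters for small requests and parks
 * the excess here, e.g. with a batch of 32 a 1-page charge leaves 31
 * pages stocked for later consume_stock() hits on this CPU.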
23398c2ecf20Sopenharmony_ci */
23408c2ecf20Sopenharmony_cistatic void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23418c2ecf20Sopenharmony_ci{
23428c2ecf20Sopenharmony_ci struct memcg_stock_pcp *stock;
23438c2ecf20Sopenharmony_ci unsigned long flags;
23448c2ecf20Sopenharmony_ci
23458c2ecf20Sopenharmony_ci local_irq_save(flags);
23468c2ecf20Sopenharmony_ci
23478c2ecf20Sopenharmony_ci stock = this_cpu_ptr(&memcg_stock);
23488c2ecf20Sopenharmony_ci if (stock->cached != memcg) { /* reset if necessary */
23498c2ecf20Sopenharmony_ci drain_stock(stock);
23508c2ecf20Sopenharmony_ci css_get(&memcg->css);
23518c2ecf20Sopenharmony_ci stock->cached = memcg;
23528c2ecf20Sopenharmony_ci }
23538c2ecf20Sopenharmony_ci stock->nr_pages += nr_pages;
23548c2ecf20Sopenharmony_ci
23558c2ecf20Sopenharmony_ci if (stock->nr_pages > MEMCG_CHARGE_BATCH)
23568c2ecf20Sopenharmony_ci drain_stock(stock);
23578c2ecf20Sopenharmony_ci
23588c2ecf20Sopenharmony_ci local_irq_restore(flags);
23598c2ecf20Sopenharmony_ci}
23608c2ecf20Sopenharmony_ci
23618c2ecf20Sopenharmony_ci/*
23628c2ecf20Sopenharmony_ci * Drains all per-CPU charge caches for the given root_memcg and the
23638c2ecf20Sopenharmony_ci * subtree of the hierarchy under it.
23648c2ecf20Sopenharmony_ci */
23658c2ecf20Sopenharmony_cistatic void drain_all_stock(struct mem_cgroup *root_memcg)
23668c2ecf20Sopenharmony_ci{
23678c2ecf20Sopenharmony_ci int cpu, curcpu;
23688c2ecf20Sopenharmony_ci
23698c2ecf20Sopenharmony_ci /* If someone's already draining, avoid running more workers. */
23708c2ecf20Sopenharmony_ci if (!mutex_trylock(&percpu_charge_mutex))
23718c2ecf20Sopenharmony_ci return;
23728c2ecf20Sopenharmony_ci /*
23738c2ecf20Sopenharmony_ci * Notify other cpus that a system-wide "drain" is running.
23748c2ecf20Sopenharmony_ci * We do not care about races with the cpu hotplug because cpu down
23758c2ecf20Sopenharmony_ci * as well as workers from this path always operate on the local
23768c2ecf20Sopenharmony_ci * per-cpu data. CPU up doesn't touch memcg_stock at all.
23778c2ecf20Sopenharmony_ci */ 23788c2ecf20Sopenharmony_ci curcpu = get_cpu(); 23798c2ecf20Sopenharmony_ci for_each_online_cpu(cpu) { 23808c2ecf20Sopenharmony_ci struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 23818c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 23828c2ecf20Sopenharmony_ci bool flush = false; 23838c2ecf20Sopenharmony_ci 23848c2ecf20Sopenharmony_ci rcu_read_lock(); 23858c2ecf20Sopenharmony_ci memcg = stock->cached; 23868c2ecf20Sopenharmony_ci if (memcg && stock->nr_pages && 23878c2ecf20Sopenharmony_ci mem_cgroup_is_descendant(memcg, root_memcg)) 23888c2ecf20Sopenharmony_ci flush = true; 23898c2ecf20Sopenharmony_ci if (obj_stock_flush_required(stock, root_memcg)) 23908c2ecf20Sopenharmony_ci flush = true; 23918c2ecf20Sopenharmony_ci rcu_read_unlock(); 23928c2ecf20Sopenharmony_ci 23938c2ecf20Sopenharmony_ci if (flush && 23948c2ecf20Sopenharmony_ci !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 23958c2ecf20Sopenharmony_ci if (cpu == curcpu) 23968c2ecf20Sopenharmony_ci drain_local_stock(&stock->work); 23978c2ecf20Sopenharmony_ci else 23988c2ecf20Sopenharmony_ci schedule_work_on(cpu, &stock->work); 23998c2ecf20Sopenharmony_ci } 24008c2ecf20Sopenharmony_ci } 24018c2ecf20Sopenharmony_ci put_cpu(); 24028c2ecf20Sopenharmony_ci mutex_unlock(&percpu_charge_mutex); 24038c2ecf20Sopenharmony_ci} 24048c2ecf20Sopenharmony_ci 24058c2ecf20Sopenharmony_cistatic int memcg_hotplug_cpu_dead(unsigned int cpu) 24068c2ecf20Sopenharmony_ci{ 24078c2ecf20Sopenharmony_ci struct memcg_stock_pcp *stock; 24088c2ecf20Sopenharmony_ci struct mem_cgroup *memcg, *mi; 24098c2ecf20Sopenharmony_ci 24108c2ecf20Sopenharmony_ci stock = &per_cpu(memcg_stock, cpu); 24118c2ecf20Sopenharmony_ci drain_stock(stock); 24128c2ecf20Sopenharmony_ci 24138c2ecf20Sopenharmony_ci for_each_mem_cgroup(memcg) { 24148c2ecf20Sopenharmony_ci int i; 24158c2ecf20Sopenharmony_ci 24168c2ecf20Sopenharmony_ci for (i = 0; i < MEMCG_NR_STAT; i++) { 24178c2ecf20Sopenharmony_ci int nid; 24188c2ecf20Sopenharmony_ci long x; 24198c2ecf20Sopenharmony_ci 24208c2ecf20Sopenharmony_ci x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); 24218c2ecf20Sopenharmony_ci if (x) 24228c2ecf20Sopenharmony_ci for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 24238c2ecf20Sopenharmony_ci atomic_long_add(x, &memcg->vmstats[i]); 24248c2ecf20Sopenharmony_ci 24258c2ecf20Sopenharmony_ci if (i >= NR_VM_NODE_STAT_ITEMS) 24268c2ecf20Sopenharmony_ci continue; 24278c2ecf20Sopenharmony_ci 24288c2ecf20Sopenharmony_ci for_each_node(nid) { 24298c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *pn; 24308c2ecf20Sopenharmony_ci 24318c2ecf20Sopenharmony_ci pn = mem_cgroup_nodeinfo(memcg, nid); 24328c2ecf20Sopenharmony_ci x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); 24338c2ecf20Sopenharmony_ci if (x) 24348c2ecf20Sopenharmony_ci do { 24358c2ecf20Sopenharmony_ci atomic_long_add(x, &pn->lruvec_stat[i]); 24368c2ecf20Sopenharmony_ci } while ((pn = parent_nodeinfo(pn, nid))); 24378c2ecf20Sopenharmony_ci } 24388c2ecf20Sopenharmony_ci } 24398c2ecf20Sopenharmony_ci 24408c2ecf20Sopenharmony_ci for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 24418c2ecf20Sopenharmony_ci long x; 24428c2ecf20Sopenharmony_ci 24438c2ecf20Sopenharmony_ci x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); 24448c2ecf20Sopenharmony_ci if (x) 24458c2ecf20Sopenharmony_ci for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 24468c2ecf20Sopenharmony_ci atomic_long_add(x, &memcg->vmevents[i]); 24478c2ecf20Sopenharmony_ci } 24488c2ecf20Sopenharmony_ci } 24498c2ecf20Sopenharmony_ci 
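 /*
  * Editor's summary (added for clarity): at this point the dead CPU's
  * cached charge has been handed back to the page counters and its
  * per-cpu stat and event deltas have been folded into the hierarchical
  * atomic counters above.
  */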
24508c2ecf20Sopenharmony_ci return 0;
24518c2ecf20Sopenharmony_ci}
24528c2ecf20Sopenharmony_ci
24538c2ecf20Sopenharmony_cistatic unsigned long reclaim_high(struct mem_cgroup *memcg,
24548c2ecf20Sopenharmony_ci unsigned int nr_pages,
24558c2ecf20Sopenharmony_ci gfp_t gfp_mask)
24568c2ecf20Sopenharmony_ci{
24578c2ecf20Sopenharmony_ci unsigned long nr_reclaimed = 0;
24588c2ecf20Sopenharmony_ci
24598c2ecf20Sopenharmony_ci do {
24608c2ecf20Sopenharmony_ci unsigned long pflags;
24618c2ecf20Sopenharmony_ci
24628c2ecf20Sopenharmony_ci if (page_counter_read(&memcg->memory) <=
24638c2ecf20Sopenharmony_ci READ_ONCE(memcg->memory.high))
24648c2ecf20Sopenharmony_ci continue;
24658c2ecf20Sopenharmony_ci
24668c2ecf20Sopenharmony_ci memcg_memory_event(memcg, MEMCG_HIGH);
24678c2ecf20Sopenharmony_ci
24688c2ecf20Sopenharmony_ci psi_memstall_enter(&pflags);
24698c2ecf20Sopenharmony_ci nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
24708c2ecf20Sopenharmony_ci gfp_mask, true);
24718c2ecf20Sopenharmony_ci psi_memstall_leave(&pflags);
24728c2ecf20Sopenharmony_ci } while ((memcg = parent_mem_cgroup(memcg)) &&
24738c2ecf20Sopenharmony_ci !mem_cgroup_is_root(memcg));
24748c2ecf20Sopenharmony_ci
24758c2ecf20Sopenharmony_ci return nr_reclaimed;
24768c2ecf20Sopenharmony_ci}
24778c2ecf20Sopenharmony_ci
24788c2ecf20Sopenharmony_cistatic void high_work_func(struct work_struct *work)
24798c2ecf20Sopenharmony_ci{
24808c2ecf20Sopenharmony_ci struct mem_cgroup *memcg;
24818c2ecf20Sopenharmony_ci
24828c2ecf20Sopenharmony_ci memcg = container_of(work, struct mem_cgroup, high_work);
24838c2ecf20Sopenharmony_ci reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
24848c2ecf20Sopenharmony_ci}
24858c2ecf20Sopenharmony_ci
24868c2ecf20Sopenharmony_ci/*
24878c2ecf20Sopenharmony_ci * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
24888c2ecf20Sopenharmony_ci * enough to still cause a significant slowdown in most cases, while still
24898c2ecf20Sopenharmony_ci * allowing diagnostics and tracing to proceed without becoming stuck.
24908c2ecf20Sopenharmony_ci */
24918c2ecf20Sopenharmony_ci#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
24928c2ecf20Sopenharmony_ci
24938c2ecf20Sopenharmony_ci/*
24948c2ecf20Sopenharmony_ci * When calculating the delay, we use these on either side of the exponentiation to
24958c2ecf20Sopenharmony_ci * maintain precision and scale to a reasonable number of jiffies (see the table
24968c2ecf20Sopenharmony_ci * below).
24978c2ecf20Sopenharmony_ci *
24988c2ecf20Sopenharmony_ci * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
24998c2ecf20Sopenharmony_ci * overage ratio to a delay.
25008c2ecf20Sopenharmony_ci * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
25018c2ecf20Sopenharmony_ci * proposed penalty in order to reduce to a reasonable number of jiffies, and
25028c2ecf20Sopenharmony_ci * to produce a reasonable delay curve.
25038c2ecf20Sopenharmony_ci *
25048c2ecf20Sopenharmony_ci * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
25058c2ecf20Sopenharmony_ci * reasonable delay curve compared to precision-adjusted overage, not
25068c2ecf20Sopenharmony_ci * penalising heavily at first, but still making sure that growth beyond the
25078c2ecf20Sopenharmony_ci * limit penalises misbehaving cgroups by slowing them down exponentially.
For 25088c2ecf20Sopenharmony_ci * example, with a high of 100 megabytes: 25098c2ecf20Sopenharmony_ci * 25108c2ecf20Sopenharmony_ci * +-------+------------------------+ 25118c2ecf20Sopenharmony_ci * | usage | time to allocate in ms | 25128c2ecf20Sopenharmony_ci * +-------+------------------------+ 25138c2ecf20Sopenharmony_ci * | 100M | 0 | 25148c2ecf20Sopenharmony_ci * | 101M | 6 | 25158c2ecf20Sopenharmony_ci * | 102M | 25 | 25168c2ecf20Sopenharmony_ci * | 103M | 57 | 25178c2ecf20Sopenharmony_ci * | 104M | 102 | 25188c2ecf20Sopenharmony_ci * | 105M | 159 | 25198c2ecf20Sopenharmony_ci * | 106M | 230 | 25208c2ecf20Sopenharmony_ci * | 107M | 313 | 25218c2ecf20Sopenharmony_ci * | 108M | 409 | 25228c2ecf20Sopenharmony_ci * | 109M | 518 | 25238c2ecf20Sopenharmony_ci * | 110M | 639 | 25248c2ecf20Sopenharmony_ci * | 111M | 774 | 25258c2ecf20Sopenharmony_ci * | 112M | 921 | 25268c2ecf20Sopenharmony_ci * | 113M | 1081 | 25278c2ecf20Sopenharmony_ci * | 114M | 1254 | 25288c2ecf20Sopenharmony_ci * | 115M | 1439 | 25298c2ecf20Sopenharmony_ci * | 116M | 1638 | 25308c2ecf20Sopenharmony_ci * | 117M | 1849 | 25318c2ecf20Sopenharmony_ci * | 118M | 2000 | 25328c2ecf20Sopenharmony_ci * | 119M | 2000 | 25338c2ecf20Sopenharmony_ci * | 120M | 2000 | 25348c2ecf20Sopenharmony_ci * +-------+------------------------+ 25358c2ecf20Sopenharmony_ci */ 25368c2ecf20Sopenharmony_ci #define MEMCG_DELAY_PRECISION_SHIFT 20 25378c2ecf20Sopenharmony_ci #define MEMCG_DELAY_SCALING_SHIFT 14 25388c2ecf20Sopenharmony_ci 25398c2ecf20Sopenharmony_cistatic u64 calculate_overage(unsigned long usage, unsigned long high) 25408c2ecf20Sopenharmony_ci{ 25418c2ecf20Sopenharmony_ci u64 overage; 25428c2ecf20Sopenharmony_ci 25438c2ecf20Sopenharmony_ci if (usage <= high) 25448c2ecf20Sopenharmony_ci return 0; 25458c2ecf20Sopenharmony_ci 25468c2ecf20Sopenharmony_ci /* 25478c2ecf20Sopenharmony_ci * Prevent division by 0 in overage calculation by acting as if 25488c2ecf20Sopenharmony_ci * it was a threshold of 1 page 25498c2ecf20Sopenharmony_ci */ 25508c2ecf20Sopenharmony_ci high = max(high, 1UL); 25518c2ecf20Sopenharmony_ci 25528c2ecf20Sopenharmony_ci overage = usage - high; 25538c2ecf20Sopenharmony_ci overage <<= MEMCG_DELAY_PRECISION_SHIFT; 25548c2ecf20Sopenharmony_ci return div64_u64(overage, high); 25558c2ecf20Sopenharmony_ci} 25568c2ecf20Sopenharmony_ci 25578c2ecf20Sopenharmony_cistatic u64 mem_find_max_overage(struct mem_cgroup *memcg) 25588c2ecf20Sopenharmony_ci{ 25598c2ecf20Sopenharmony_ci u64 overage, max_overage = 0; 25608c2ecf20Sopenharmony_ci 25618c2ecf20Sopenharmony_ci do { 25628c2ecf20Sopenharmony_ci overage = calculate_overage(page_counter_read(&memcg->memory), 25638c2ecf20Sopenharmony_ci READ_ONCE(memcg->memory.high)); 25648c2ecf20Sopenharmony_ci max_overage = max(overage, max_overage); 25658c2ecf20Sopenharmony_ci } while ((memcg = parent_mem_cgroup(memcg)) && 25668c2ecf20Sopenharmony_ci !mem_cgroup_is_root(memcg)); 25678c2ecf20Sopenharmony_ci 25688c2ecf20Sopenharmony_ci return max_overage; 25698c2ecf20Sopenharmony_ci} 25708c2ecf20Sopenharmony_ci 25718c2ecf20Sopenharmony_cistatic u64 swap_find_max_overage(struct mem_cgroup *memcg) 25728c2ecf20Sopenharmony_ci{ 25738c2ecf20Sopenharmony_ci u64 overage, max_overage = 0; 25748c2ecf20Sopenharmony_ci 25758c2ecf20Sopenharmony_ci do { 25768c2ecf20Sopenharmony_ci overage = calculate_overage(page_counter_read(&memcg->swap), 25778c2ecf20Sopenharmony_ci READ_ONCE(memcg->swap.high)); 25788c2ecf20Sopenharmony_ci if (overage) 25798c2ecf20Sopenharmony_ci memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 
25808c2ecf20Sopenharmony_ci max_overage = max(overage, max_overage);
25818c2ecf20Sopenharmony_ci } while ((memcg = parent_mem_cgroup(memcg)) &&
25828c2ecf20Sopenharmony_ci !mem_cgroup_is_root(memcg));
25838c2ecf20Sopenharmony_ci
25848c2ecf20Sopenharmony_ci return max_overage;
25858c2ecf20Sopenharmony_ci}
25868c2ecf20Sopenharmony_ci
25878c2ecf20Sopenharmony_ci/*
25888c2ecf20Sopenharmony_ci * Get the number of jiffies that we should penalise a mischievous cgroup which
25898c2ecf20Sopenharmony_ci * is exceeding its memory.high by checking both it and its ancestors.
25908c2ecf20Sopenharmony_ci */
25918c2ecf20Sopenharmony_cistatic unsigned long calculate_high_delay(struct mem_cgroup *memcg,
25928c2ecf20Sopenharmony_ci unsigned int nr_pages,
25938c2ecf20Sopenharmony_ci u64 max_overage)
25948c2ecf20Sopenharmony_ci{
25958c2ecf20Sopenharmony_ci unsigned long penalty_jiffies;
25968c2ecf20Sopenharmony_ci
25978c2ecf20Sopenharmony_ci if (!max_overage)
25988c2ecf20Sopenharmony_ci return 0;
25998c2ecf20Sopenharmony_ci
26008c2ecf20Sopenharmony_ci /*
26018c2ecf20Sopenharmony_ci * We use overage compared to memory.high to calculate the number of
26028c2ecf20Sopenharmony_ci * jiffies to sleep (penalty_jiffies). Ideally this value should be
26038c2ecf20Sopenharmony_ci * fairly lenient on small overages, and increasingly harsh when the
26048c2ecf20Sopenharmony_ci * memcg in question makes it clear that it has no intention of stopping
26058c2ecf20Sopenharmony_ci * its crazy behaviour, so we exponentially increase the delay based on
26068c2ecf20Sopenharmony_ci * overage amount.
26078c2ecf20Sopenharmony_ci */
26088c2ecf20Sopenharmony_ci penalty_jiffies = max_overage * max_overage * HZ;
26098c2ecf20Sopenharmony_ci penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
26108c2ecf20Sopenharmony_ci penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
26118c2ecf20Sopenharmony_ci
26128c2ecf20Sopenharmony_ci /*
26138c2ecf20Sopenharmony_ci * Factor in the task's own contribution to the overage, such that four
26148c2ecf20Sopenharmony_ci * N-sized allocations are throttled approximately the same as one
26158c2ecf20Sopenharmony_ci * 4N-sized allocation.
26168c2ecf20Sopenharmony_ci *
26178c2ecf20Sopenharmony_ci * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
26188c2ecf20Sopenharmony_ci * larger the current charge batch is than that.
26198c2ecf20Sopenharmony_ci */
26208c2ecf20Sopenharmony_ci return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
26218c2ecf20Sopenharmony_ci}
26228c2ecf20Sopenharmony_ci
26238c2ecf20Sopenharmony_ci/*
26248c2ecf20Sopenharmony_ci * Scheduled by try_charge() to be executed from the userland return path
26258c2ecf20Sopenharmony_ci * and reclaims memory over the high limit.
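 *
 * Editor's worked example (assuming HZ == 1000 and the 100M table above):
 * at 110M of usage against a 100M high, calculate_overage() returns about
 * (0.1 << 20) == 104857, and calculate_high_delay() turns that into
 * 104857^2 * HZ >> 20 >> 14 == ~639 jiffies (~0.64s) for a full
 * MEMCG_CHARGE_BATCH worth of pages, matching the 110M row of the delay
 * table.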
26268c2ecf20Sopenharmony_ci */ 26278c2ecf20Sopenharmony_civoid mem_cgroup_handle_over_high(void) 26288c2ecf20Sopenharmony_ci{ 26298c2ecf20Sopenharmony_ci unsigned long penalty_jiffies; 26308c2ecf20Sopenharmony_ci unsigned long pflags; 26318c2ecf20Sopenharmony_ci unsigned long nr_reclaimed; 26328c2ecf20Sopenharmony_ci unsigned int nr_pages = current->memcg_nr_pages_over_high; 26338c2ecf20Sopenharmony_ci int nr_retries = MAX_RECLAIM_RETRIES; 26348c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 26358c2ecf20Sopenharmony_ci bool in_retry = false; 26368c2ecf20Sopenharmony_ci 26378c2ecf20Sopenharmony_ci if (likely(!nr_pages)) 26388c2ecf20Sopenharmony_ci return; 26398c2ecf20Sopenharmony_ci 26408c2ecf20Sopenharmony_ci memcg = get_mem_cgroup_from_mm(current->mm); 26418c2ecf20Sopenharmony_ci current->memcg_nr_pages_over_high = 0; 26428c2ecf20Sopenharmony_ci 26438c2ecf20Sopenharmony_ciretry_reclaim: 26448c2ecf20Sopenharmony_ci /* 26458c2ecf20Sopenharmony_ci * The allocating task should reclaim at least the batch size, but for 26468c2ecf20Sopenharmony_ci * subsequent retries we only want to do what's necessary to prevent oom 26478c2ecf20Sopenharmony_ci * or breaching resource isolation. 26488c2ecf20Sopenharmony_ci * 26498c2ecf20Sopenharmony_ci * This is distinct from memory.max or page allocator behaviour because 26508c2ecf20Sopenharmony_ci * memory.high is currently batched, whereas memory.max and the page 26518c2ecf20Sopenharmony_ci * allocator run every time an allocation is made. 26528c2ecf20Sopenharmony_ci */ 26538c2ecf20Sopenharmony_ci nr_reclaimed = reclaim_high(memcg, 26548c2ecf20Sopenharmony_ci in_retry ? SWAP_CLUSTER_MAX : nr_pages, 26558c2ecf20Sopenharmony_ci GFP_KERNEL); 26568c2ecf20Sopenharmony_ci 26578c2ecf20Sopenharmony_ci /* 26588c2ecf20Sopenharmony_ci * memory.high is breached and reclaim is unable to keep up. Throttle 26598c2ecf20Sopenharmony_ci * allocators proactively to slow down excessive growth. 26608c2ecf20Sopenharmony_ci */ 26618c2ecf20Sopenharmony_ci penalty_jiffies = calculate_high_delay(memcg, nr_pages, 26628c2ecf20Sopenharmony_ci mem_find_max_overage(memcg)); 26638c2ecf20Sopenharmony_ci 26648c2ecf20Sopenharmony_ci penalty_jiffies += calculate_high_delay(memcg, nr_pages, 26658c2ecf20Sopenharmony_ci swap_find_max_overage(memcg)); 26668c2ecf20Sopenharmony_ci 26678c2ecf20Sopenharmony_ci /* 26688c2ecf20Sopenharmony_ci * Clamp the max delay per usermode return so as to still keep the 26698c2ecf20Sopenharmony_ci * application moving forwards and also permit diagnostics, albeit 26708c2ecf20Sopenharmony_ci * extremely slowly. 26718c2ecf20Sopenharmony_ci */ 26728c2ecf20Sopenharmony_ci penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 26738c2ecf20Sopenharmony_ci 26748c2ecf20Sopenharmony_ci /* 26758c2ecf20Sopenharmony_ci * Don't sleep if the amount of jiffies this memcg owes us is so low 26768c2ecf20Sopenharmony_ci * that it's not even worth doing, in an attempt to be nice to those who 26778c2ecf20Sopenharmony_ci * go only a small amount over their memory.high value and maybe haven't 26788c2ecf20Sopenharmony_ci * been aggressively reclaimed enough yet. 26798c2ecf20Sopenharmony_ci */ 26808c2ecf20Sopenharmony_ci if (penalty_jiffies <= HZ / 100) 26818c2ecf20Sopenharmony_ci goto out; 26828c2ecf20Sopenharmony_ci 26838c2ecf20Sopenharmony_ci /* 26848c2ecf20Sopenharmony_ci * If reclaim is making forward progress but we're still over 26858c2ecf20Sopenharmony_ci * memory.high, we want to encourage that rather than doing allocator 26868c2ecf20Sopenharmony_ci * throttling. 
26878c2ecf20Sopenharmony_ci */ 26888c2ecf20Sopenharmony_ci if (nr_reclaimed || nr_retries--) { 26898c2ecf20Sopenharmony_ci in_retry = true; 26908c2ecf20Sopenharmony_ci goto retry_reclaim; 26918c2ecf20Sopenharmony_ci } 26928c2ecf20Sopenharmony_ci 26938c2ecf20Sopenharmony_ci /* 26948c2ecf20Sopenharmony_ci * If we exit early, we're guaranteed to die (since 26958c2ecf20Sopenharmony_ci * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 26968c2ecf20Sopenharmony_ci * need to account for any ill-begotten jiffies to pay them off later. 26978c2ecf20Sopenharmony_ci */ 26988c2ecf20Sopenharmony_ci psi_memstall_enter(&pflags); 26998c2ecf20Sopenharmony_ci schedule_timeout_killable(penalty_jiffies); 27008c2ecf20Sopenharmony_ci psi_memstall_leave(&pflags); 27018c2ecf20Sopenharmony_ci 27028c2ecf20Sopenharmony_ciout: 27038c2ecf20Sopenharmony_ci css_put(&memcg->css); 27048c2ecf20Sopenharmony_ci} 27058c2ecf20Sopenharmony_ci 27068c2ecf20Sopenharmony_cistatic int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 27078c2ecf20Sopenharmony_ci unsigned int nr_pages) 27088c2ecf20Sopenharmony_ci{ 27098c2ecf20Sopenharmony_ci unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 27108c2ecf20Sopenharmony_ci int nr_retries = MAX_RECLAIM_RETRIES; 27118c2ecf20Sopenharmony_ci struct mem_cgroup *mem_over_limit; 27128c2ecf20Sopenharmony_ci struct page_counter *counter; 27138c2ecf20Sopenharmony_ci enum oom_status oom_status; 27148c2ecf20Sopenharmony_ci unsigned long nr_reclaimed; 27158c2ecf20Sopenharmony_ci bool passed_oom = false; 27168c2ecf20Sopenharmony_ci bool may_swap = true; 27178c2ecf20Sopenharmony_ci bool drained = false; 27188c2ecf20Sopenharmony_ci unsigned long pflags; 27198c2ecf20Sopenharmony_ci 27208c2ecf20Sopenharmony_ci if (mem_cgroup_is_root(memcg)) 27218c2ecf20Sopenharmony_ci return 0; 27228c2ecf20Sopenharmony_ciretry: 27238c2ecf20Sopenharmony_ci if (consume_stock(memcg, nr_pages)) 27248c2ecf20Sopenharmony_ci return 0; 27258c2ecf20Sopenharmony_ci 27268c2ecf20Sopenharmony_ci if (!do_memsw_account() || 27278c2ecf20Sopenharmony_ci page_counter_try_charge(&memcg->memsw, batch, &counter)) { 27288c2ecf20Sopenharmony_ci if (page_counter_try_charge(&memcg->memory, batch, &counter)) 27298c2ecf20Sopenharmony_ci goto done_restock; 27308c2ecf20Sopenharmony_ci if (do_memsw_account()) 27318c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->memsw, batch); 27328c2ecf20Sopenharmony_ci mem_over_limit = mem_cgroup_from_counter(counter, memory); 27338c2ecf20Sopenharmony_ci } else { 27348c2ecf20Sopenharmony_ci mem_over_limit = mem_cgroup_from_counter(counter, memsw); 27358c2ecf20Sopenharmony_ci may_swap = false; 27368c2ecf20Sopenharmony_ci } 27378c2ecf20Sopenharmony_ci 27388c2ecf20Sopenharmony_ci if (batch > nr_pages) { 27398c2ecf20Sopenharmony_ci batch = nr_pages; 27408c2ecf20Sopenharmony_ci goto retry; 27418c2ecf20Sopenharmony_ci } 27428c2ecf20Sopenharmony_ci 27438c2ecf20Sopenharmony_ci /* 27448c2ecf20Sopenharmony_ci * Memcg doesn't have a dedicated reserve for atomic 27458c2ecf20Sopenharmony_ci * allocations. But like the global atomic pool, we need to 27468c2ecf20Sopenharmony_ci * put the burden of reclaim on regular allocation requests 27478c2ecf20Sopenharmony_ci * and let these go through as privileged allocations. 
27488c2ecf20Sopenharmony_ci */ 27498c2ecf20Sopenharmony_ci if (gfp_mask & __GFP_ATOMIC) 27508c2ecf20Sopenharmony_ci goto force; 27518c2ecf20Sopenharmony_ci 27528c2ecf20Sopenharmony_ci /* 27538c2ecf20Sopenharmony_ci * Prevent unbounded recursion when reclaim operations need to 27548c2ecf20Sopenharmony_ci * allocate memory. This might exceed the limits temporarily, 27558c2ecf20Sopenharmony_ci * but we prefer facilitating memory reclaim and getting back 27568c2ecf20Sopenharmony_ci * under the limit over triggering OOM kills in these cases. 27578c2ecf20Sopenharmony_ci */ 27588c2ecf20Sopenharmony_ci if (unlikely(current->flags & PF_MEMALLOC)) 27598c2ecf20Sopenharmony_ci goto force; 27608c2ecf20Sopenharmony_ci 27618c2ecf20Sopenharmony_ci if (unlikely(task_in_memcg_oom(current))) 27628c2ecf20Sopenharmony_ci goto nomem; 27638c2ecf20Sopenharmony_ci 27648c2ecf20Sopenharmony_ci if (!gfpflags_allow_blocking(gfp_mask)) 27658c2ecf20Sopenharmony_ci goto nomem; 27668c2ecf20Sopenharmony_ci 27678c2ecf20Sopenharmony_ci memcg_memory_event(mem_over_limit, MEMCG_MAX); 27688c2ecf20Sopenharmony_ci 27698c2ecf20Sopenharmony_ci psi_memstall_enter(&pflags); 27708c2ecf20Sopenharmony_ci nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 27718c2ecf20Sopenharmony_ci gfp_mask, may_swap); 27728c2ecf20Sopenharmony_ci psi_memstall_leave(&pflags); 27738c2ecf20Sopenharmony_ci 27748c2ecf20Sopenharmony_ci if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 27758c2ecf20Sopenharmony_ci goto retry; 27768c2ecf20Sopenharmony_ci 27778c2ecf20Sopenharmony_ci if (!drained) { 27788c2ecf20Sopenharmony_ci drain_all_stock(mem_over_limit); 27798c2ecf20Sopenharmony_ci drained = true; 27808c2ecf20Sopenharmony_ci goto retry; 27818c2ecf20Sopenharmony_ci } 27828c2ecf20Sopenharmony_ci 27838c2ecf20Sopenharmony_ci if (gfp_mask & __GFP_NORETRY) 27848c2ecf20Sopenharmony_ci goto nomem; 27858c2ecf20Sopenharmony_ci /* 27868c2ecf20Sopenharmony_ci * Even though the limit is exceeded at this point, reclaim 27878c2ecf20Sopenharmony_ci * may have been able to free some pages. Retry the charge 27888c2ecf20Sopenharmony_ci * before killing the task. 27898c2ecf20Sopenharmony_ci * 27908c2ecf20Sopenharmony_ci * Only for regular pages, though: huge pages are rather 27918c2ecf20Sopenharmony_ci * unlikely to succeed so close to the limit, and we fall back 27928c2ecf20Sopenharmony_ci * to regular pages anyway in case of failure. 27938c2ecf20Sopenharmony_ci */ 27948c2ecf20Sopenharmony_ci if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 27958c2ecf20Sopenharmony_ci goto retry; 27968c2ecf20Sopenharmony_ci /* 27978c2ecf20Sopenharmony_ci * At task move, charge accounts can be doubly counted. So, it's 27988c2ecf20Sopenharmony_ci * better to wait until the end of task_move if something is going on. 
27998c2ecf20Sopenharmony_ci */ 28008c2ecf20Sopenharmony_ci if (mem_cgroup_wait_acct_move(mem_over_limit)) 28018c2ecf20Sopenharmony_ci goto retry; 28028c2ecf20Sopenharmony_ci 28038c2ecf20Sopenharmony_ci if (nr_retries--) 28048c2ecf20Sopenharmony_ci goto retry; 28058c2ecf20Sopenharmony_ci 28068c2ecf20Sopenharmony_ci if (gfp_mask & __GFP_RETRY_MAYFAIL) 28078c2ecf20Sopenharmony_ci goto nomem; 28088c2ecf20Sopenharmony_ci 28098c2ecf20Sopenharmony_ci if (gfp_mask & __GFP_NOFAIL) 28108c2ecf20Sopenharmony_ci goto force; 28118c2ecf20Sopenharmony_ci 28128c2ecf20Sopenharmony_ci /* Avoid endless loop for tasks bypassed by the oom killer */ 28138c2ecf20Sopenharmony_ci if (passed_oom && task_is_dying()) 28148c2ecf20Sopenharmony_ci goto nomem; 28158c2ecf20Sopenharmony_ci 28168c2ecf20Sopenharmony_ci /* 28178c2ecf20Sopenharmony_ci * keep retrying as long as the memcg oom killer is able to make 28188c2ecf20Sopenharmony_ci * a forward progress or bypass the charge if the oom killer 28198c2ecf20Sopenharmony_ci * couldn't make any progress. 28208c2ecf20Sopenharmony_ci */ 28218c2ecf20Sopenharmony_ci oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, 28228c2ecf20Sopenharmony_ci get_order(nr_pages * PAGE_SIZE)); 28238c2ecf20Sopenharmony_ci if (oom_status == OOM_SUCCESS) { 28248c2ecf20Sopenharmony_ci passed_oom = true; 28258c2ecf20Sopenharmony_ci nr_retries = MAX_RECLAIM_RETRIES; 28268c2ecf20Sopenharmony_ci goto retry; 28278c2ecf20Sopenharmony_ci } 28288c2ecf20Sopenharmony_cinomem: 28298c2ecf20Sopenharmony_ci if (!(gfp_mask & __GFP_NOFAIL)) 28308c2ecf20Sopenharmony_ci return -ENOMEM; 28318c2ecf20Sopenharmony_ciforce: 28328c2ecf20Sopenharmony_ci /* 28338c2ecf20Sopenharmony_ci * The allocation either can't fail or will lead to more memory 28348c2ecf20Sopenharmony_ci * being freed very soon. Allow memory usage go over the limit 28358c2ecf20Sopenharmony_ci * temporarily by force charging it. 28368c2ecf20Sopenharmony_ci */ 28378c2ecf20Sopenharmony_ci page_counter_charge(&memcg->memory, nr_pages); 28388c2ecf20Sopenharmony_ci if (do_memsw_account()) 28398c2ecf20Sopenharmony_ci page_counter_charge(&memcg->memsw, nr_pages); 28408c2ecf20Sopenharmony_ci 28418c2ecf20Sopenharmony_ci return 0; 28428c2ecf20Sopenharmony_ci 28438c2ecf20Sopenharmony_cidone_restock: 28448c2ecf20Sopenharmony_ci if (batch > nr_pages) 28458c2ecf20Sopenharmony_ci refill_stock(memcg, batch - nr_pages); 28468c2ecf20Sopenharmony_ci 28478c2ecf20Sopenharmony_ci /* 28488c2ecf20Sopenharmony_ci * If the hierarchy is above the normal consumption range, schedule 28498c2ecf20Sopenharmony_ci * reclaim on returning to userland. We can perform reclaim here 28508c2ecf20Sopenharmony_ci * if __GFP_RECLAIM but let's always punt for simplicity and so that 28518c2ecf20Sopenharmony_ci * GFP_KERNEL can consistently be used during reclaim. @memcg is 28528c2ecf20Sopenharmony_ci * not recorded as it most likely matches current's and won't 28538c2ecf20Sopenharmony_ci * change in the meantime. As high limit is checked again before 28548c2ecf20Sopenharmony_ci * reclaim, the cost of mismatch is negligible. 
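 *
 * Editor's note: the deferred work is mem_cgroup_handle_over_high(),
 * run from the return-to-userspace path once set_notify_resume() below
 * has flagged the task.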
28558c2ecf20Sopenharmony_ci */ 28568c2ecf20Sopenharmony_ci do { 28578c2ecf20Sopenharmony_ci bool mem_high, swap_high; 28588c2ecf20Sopenharmony_ci 28598c2ecf20Sopenharmony_ci mem_high = page_counter_read(&memcg->memory) > 28608c2ecf20Sopenharmony_ci READ_ONCE(memcg->memory.high); 28618c2ecf20Sopenharmony_ci swap_high = page_counter_read(&memcg->swap) > 28628c2ecf20Sopenharmony_ci READ_ONCE(memcg->swap.high); 28638c2ecf20Sopenharmony_ci 28648c2ecf20Sopenharmony_ci /* Don't bother a random interrupted task */ 28658c2ecf20Sopenharmony_ci if (in_interrupt()) { 28668c2ecf20Sopenharmony_ci if (mem_high) { 28678c2ecf20Sopenharmony_ci schedule_work(&memcg->high_work); 28688c2ecf20Sopenharmony_ci break; 28698c2ecf20Sopenharmony_ci } 28708c2ecf20Sopenharmony_ci continue; 28718c2ecf20Sopenharmony_ci } 28728c2ecf20Sopenharmony_ci 28738c2ecf20Sopenharmony_ci if (mem_high || swap_high) { 28748c2ecf20Sopenharmony_ci /* 28758c2ecf20Sopenharmony_ci * The allocating tasks in this cgroup will need to do 28768c2ecf20Sopenharmony_ci * reclaim or be throttled to prevent further growth 28778c2ecf20Sopenharmony_ci * of the memory or swap footprints. 28788c2ecf20Sopenharmony_ci * 28798c2ecf20Sopenharmony_ci * Target some best-effort fairness between the tasks, 28808c2ecf20Sopenharmony_ci * and distribute reclaim work and delay penalties 28818c2ecf20Sopenharmony_ci * based on how much each task is actually allocating. 28828c2ecf20Sopenharmony_ci */ 28838c2ecf20Sopenharmony_ci current->memcg_nr_pages_over_high += batch; 28848c2ecf20Sopenharmony_ci set_notify_resume(current); 28858c2ecf20Sopenharmony_ci break; 28868c2ecf20Sopenharmony_ci } 28878c2ecf20Sopenharmony_ci } while ((memcg = parent_mem_cgroup(memcg))); 28888c2ecf20Sopenharmony_ci 28898c2ecf20Sopenharmony_ci return 0; 28908c2ecf20Sopenharmony_ci} 28918c2ecf20Sopenharmony_ci 28928c2ecf20Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) 28938c2ecf20Sopenharmony_cistatic void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 28948c2ecf20Sopenharmony_ci{ 28958c2ecf20Sopenharmony_ci if (mem_cgroup_is_root(memcg)) 28968c2ecf20Sopenharmony_ci return; 28978c2ecf20Sopenharmony_ci 28988c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->memory, nr_pages); 28998c2ecf20Sopenharmony_ci if (do_memsw_account()) 29008c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->memsw, nr_pages); 29018c2ecf20Sopenharmony_ci} 29028c2ecf20Sopenharmony_ci#endif 29038c2ecf20Sopenharmony_ci 29048c2ecf20Sopenharmony_cistatic void commit_charge(struct page *page, struct mem_cgroup *memcg) 29058c2ecf20Sopenharmony_ci{ 29068c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(page->mem_cgroup, page); 29078c2ecf20Sopenharmony_ci /* 29088c2ecf20Sopenharmony_ci * Any of the following ensures page->mem_cgroup stability: 29098c2ecf20Sopenharmony_ci * 29108c2ecf20Sopenharmony_ci * - the page lock 29118c2ecf20Sopenharmony_ci * - LRU isolation 29128c2ecf20Sopenharmony_ci * - lock_page_memcg() 29138c2ecf20Sopenharmony_ci * - exclusive reference 29148c2ecf20Sopenharmony_ci */ 29158c2ecf20Sopenharmony_ci page->mem_cgroup = memcg; 29168c2ecf20Sopenharmony_ci} 29178c2ecf20Sopenharmony_ci 29188c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM 29198c2ecf20Sopenharmony_ci/* 29208c2ecf20Sopenharmony_ci * The allocated objcg pointers array is not accounted directly. 29218c2ecf20Sopenharmony_ci * Moreover, it should not come from DMA buffer and is not readily 29228c2ecf20Sopenharmony_ci * reclaimable. So those GFP bits should be masked off. 
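 *
 * Editor's example: a GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE slab
 * allocation therefore allocates its obj_cgroup vector below with plain
 * GFP_KERNEL semantics, since __GFP_ACCOUNT and __GFP_RECLAIMABLE are
 * cleared by the mask.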
29238c2ecf20Sopenharmony_ci */ 29248c2ecf20Sopenharmony_ci#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ 29258c2ecf20Sopenharmony_ci __GFP_ACCOUNT | __GFP_NOFAIL) 29268c2ecf20Sopenharmony_ci 29278c2ecf20Sopenharmony_ciint memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, 29288c2ecf20Sopenharmony_ci gfp_t gfp) 29298c2ecf20Sopenharmony_ci{ 29308c2ecf20Sopenharmony_ci unsigned int objects = objs_per_slab_page(s, page); 29318c2ecf20Sopenharmony_ci void *vec; 29328c2ecf20Sopenharmony_ci 29338c2ecf20Sopenharmony_ci gfp &= ~OBJCGS_CLEAR_MASK; 29348c2ecf20Sopenharmony_ci vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 29358c2ecf20Sopenharmony_ci page_to_nid(page)); 29368c2ecf20Sopenharmony_ci if (!vec) 29378c2ecf20Sopenharmony_ci return -ENOMEM; 29388c2ecf20Sopenharmony_ci 29398c2ecf20Sopenharmony_ci if (cmpxchg(&page->obj_cgroups, NULL, 29408c2ecf20Sopenharmony_ci (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) 29418c2ecf20Sopenharmony_ci kfree(vec); 29428c2ecf20Sopenharmony_ci else 29438c2ecf20Sopenharmony_ci kmemleak_not_leak(vec); 29448c2ecf20Sopenharmony_ci 29458c2ecf20Sopenharmony_ci return 0; 29468c2ecf20Sopenharmony_ci} 29478c2ecf20Sopenharmony_ci 29488c2ecf20Sopenharmony_ci/* 29498c2ecf20Sopenharmony_ci * Returns a pointer to the memory cgroup to which the kernel object is charged. 29508c2ecf20Sopenharmony_ci * 29518c2ecf20Sopenharmony_ci * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 29528c2ecf20Sopenharmony_ci * cgroup_mutex, etc. 29538c2ecf20Sopenharmony_ci */ 29548c2ecf20Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_obj(void *p) 29558c2ecf20Sopenharmony_ci{ 29568c2ecf20Sopenharmony_ci struct page *page; 29578c2ecf20Sopenharmony_ci 29588c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 29598c2ecf20Sopenharmony_ci return NULL; 29608c2ecf20Sopenharmony_ci 29618c2ecf20Sopenharmony_ci page = virt_to_head_page(p); 29628c2ecf20Sopenharmony_ci 29638c2ecf20Sopenharmony_ci /* 29648c2ecf20Sopenharmony_ci * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer 29658c2ecf20Sopenharmony_ci * or a pointer to obj_cgroup vector. In the latter case the lowest 29668c2ecf20Sopenharmony_ci * bit of the pointer is set. 29678c2ecf20Sopenharmony_ci * The page->mem_cgroup pointer can be asynchronously changed 29688c2ecf20Sopenharmony_ci * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed 29698c2ecf20Sopenharmony_ci * from a valid memcg pointer to objcg vector or back. 29708c2ecf20Sopenharmony_ci */ 29718c2ecf20Sopenharmony_ci if (!page->mem_cgroup) 29728c2ecf20Sopenharmony_ci return NULL; 29738c2ecf20Sopenharmony_ci 29748c2ecf20Sopenharmony_ci /* 29758c2ecf20Sopenharmony_ci * Slab objects are accounted individually, not per-page. 29768c2ecf20Sopenharmony_ci * Memcg membership data for each individual object is saved in 29778c2ecf20Sopenharmony_ci * the page->obj_cgroups. 
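 *
 * Editor's note: the vector is stored with its low bit set (see
 * memcg_alloc_page_obj_cgroups() above), which is what
 * page_has_obj_cgroups() below tests to distinguish it from a plain
 * memcg pointer.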
29788c2ecf20Sopenharmony_ci */ 29798c2ecf20Sopenharmony_ci if (page_has_obj_cgroups(page)) { 29808c2ecf20Sopenharmony_ci struct obj_cgroup *objcg; 29818c2ecf20Sopenharmony_ci unsigned int off; 29828c2ecf20Sopenharmony_ci 29838c2ecf20Sopenharmony_ci off = obj_to_index(page->slab_cache, page, p); 29848c2ecf20Sopenharmony_ci objcg = page_obj_cgroups(page)[off]; 29858c2ecf20Sopenharmony_ci if (objcg) 29868c2ecf20Sopenharmony_ci return obj_cgroup_memcg(objcg); 29878c2ecf20Sopenharmony_ci 29888c2ecf20Sopenharmony_ci return NULL; 29898c2ecf20Sopenharmony_ci } 29908c2ecf20Sopenharmony_ci 29918c2ecf20Sopenharmony_ci /* All other pages use page->mem_cgroup */ 29928c2ecf20Sopenharmony_ci return page->mem_cgroup; 29938c2ecf20Sopenharmony_ci} 29948c2ecf20Sopenharmony_ci 29958c2ecf20Sopenharmony_ci__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 29968c2ecf20Sopenharmony_ci{ 29978c2ecf20Sopenharmony_ci struct obj_cgroup *objcg = NULL; 29988c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 29998c2ecf20Sopenharmony_ci 30008c2ecf20Sopenharmony_ci if (memcg_kmem_bypass()) 30018c2ecf20Sopenharmony_ci return NULL; 30028c2ecf20Sopenharmony_ci 30038c2ecf20Sopenharmony_ci rcu_read_lock(); 30048c2ecf20Sopenharmony_ci if (unlikely(active_memcg())) 30058c2ecf20Sopenharmony_ci memcg = active_memcg(); 30068c2ecf20Sopenharmony_ci else 30078c2ecf20Sopenharmony_ci memcg = mem_cgroup_from_task(current); 30088c2ecf20Sopenharmony_ci 30098c2ecf20Sopenharmony_ci for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 30108c2ecf20Sopenharmony_ci objcg = rcu_dereference(memcg->objcg); 30118c2ecf20Sopenharmony_ci if (objcg && obj_cgroup_tryget(objcg)) 30128c2ecf20Sopenharmony_ci break; 30138c2ecf20Sopenharmony_ci objcg = NULL; 30148c2ecf20Sopenharmony_ci } 30158c2ecf20Sopenharmony_ci rcu_read_unlock(); 30168c2ecf20Sopenharmony_ci 30178c2ecf20Sopenharmony_ci return objcg; 30188c2ecf20Sopenharmony_ci} 30198c2ecf20Sopenharmony_ci 30208c2ecf20Sopenharmony_cistatic int memcg_alloc_cache_id(void) 30218c2ecf20Sopenharmony_ci{ 30228c2ecf20Sopenharmony_ci int id, size; 30238c2ecf20Sopenharmony_ci int err; 30248c2ecf20Sopenharmony_ci 30258c2ecf20Sopenharmony_ci id = ida_simple_get(&memcg_cache_ida, 30268c2ecf20Sopenharmony_ci 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 30278c2ecf20Sopenharmony_ci if (id < 0) 30288c2ecf20Sopenharmony_ci return id; 30298c2ecf20Sopenharmony_ci 30308c2ecf20Sopenharmony_ci if (id < memcg_nr_cache_ids) 30318c2ecf20Sopenharmony_ci return id; 30328c2ecf20Sopenharmony_ci 30338c2ecf20Sopenharmony_ci /* 30348c2ecf20Sopenharmony_ci * There's no space for the new id in memcg_caches arrays, 30358c2ecf20Sopenharmony_ci * so we have to grow them. 
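 *
 * Editor's example of the growth policy below: a new id of 64 resizes
 * the arrays to 2 * (64 + 1) == 130 entries, clamped to the
 * MEMCG_CACHES_MIN_SIZE/MEMCG_CACHES_MAX_SIZE bounds.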
30368c2ecf20Sopenharmony_ci */ 30378c2ecf20Sopenharmony_ci down_write(&memcg_cache_ids_sem); 30388c2ecf20Sopenharmony_ci 30398c2ecf20Sopenharmony_ci size = 2 * (id + 1); 30408c2ecf20Sopenharmony_ci if (size < MEMCG_CACHES_MIN_SIZE) 30418c2ecf20Sopenharmony_ci size = MEMCG_CACHES_MIN_SIZE; 30428c2ecf20Sopenharmony_ci else if (size > MEMCG_CACHES_MAX_SIZE) 30438c2ecf20Sopenharmony_ci size = MEMCG_CACHES_MAX_SIZE; 30448c2ecf20Sopenharmony_ci 30458c2ecf20Sopenharmony_ci err = memcg_update_all_list_lrus(size); 30468c2ecf20Sopenharmony_ci if (!err) 30478c2ecf20Sopenharmony_ci memcg_nr_cache_ids = size; 30488c2ecf20Sopenharmony_ci 30498c2ecf20Sopenharmony_ci up_write(&memcg_cache_ids_sem); 30508c2ecf20Sopenharmony_ci 30518c2ecf20Sopenharmony_ci if (err) { 30528c2ecf20Sopenharmony_ci ida_simple_remove(&memcg_cache_ida, id); 30538c2ecf20Sopenharmony_ci return err; 30548c2ecf20Sopenharmony_ci } 30558c2ecf20Sopenharmony_ci return id; 30568c2ecf20Sopenharmony_ci} 30578c2ecf20Sopenharmony_ci 30588c2ecf20Sopenharmony_cistatic void memcg_free_cache_id(int id) 30598c2ecf20Sopenharmony_ci{ 30608c2ecf20Sopenharmony_ci ida_simple_remove(&memcg_cache_ida, id); 30618c2ecf20Sopenharmony_ci} 30628c2ecf20Sopenharmony_ci 30638c2ecf20Sopenharmony_ci/** 30648c2ecf20Sopenharmony_ci * __memcg_kmem_charge: charge a number of kernel pages to a memcg 30658c2ecf20Sopenharmony_ci * @memcg: memory cgroup to charge 30668c2ecf20Sopenharmony_ci * @gfp: reclaim mode 30678c2ecf20Sopenharmony_ci * @nr_pages: number of pages to charge 30688c2ecf20Sopenharmony_ci * 30698c2ecf20Sopenharmony_ci * Returns 0 on success, an error code on failure. 30708c2ecf20Sopenharmony_ci */ 30718c2ecf20Sopenharmony_ciint __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, 30728c2ecf20Sopenharmony_ci unsigned int nr_pages) 30738c2ecf20Sopenharmony_ci{ 30748c2ecf20Sopenharmony_ci struct page_counter *counter; 30758c2ecf20Sopenharmony_ci int ret; 30768c2ecf20Sopenharmony_ci 30778c2ecf20Sopenharmony_ci ret = try_charge(memcg, gfp, nr_pages); 30788c2ecf20Sopenharmony_ci if (ret) 30798c2ecf20Sopenharmony_ci return ret; 30808c2ecf20Sopenharmony_ci 30818c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && 30828c2ecf20Sopenharmony_ci !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { 30838c2ecf20Sopenharmony_ci 30848c2ecf20Sopenharmony_ci /* 30858c2ecf20Sopenharmony_ci * Enforce __GFP_NOFAIL allocation because callers are not 30868c2ecf20Sopenharmony_ci * prepared to see failures and likely do not have any failure 30878c2ecf20Sopenharmony_ci * handling code. 
30888c2ecf20Sopenharmony_ci */ 30898c2ecf20Sopenharmony_ci if (gfp & __GFP_NOFAIL) { 30908c2ecf20Sopenharmony_ci page_counter_charge(&memcg->kmem, nr_pages); 30918c2ecf20Sopenharmony_ci return 0; 30928c2ecf20Sopenharmony_ci } 30938c2ecf20Sopenharmony_ci cancel_charge(memcg, nr_pages); 30948c2ecf20Sopenharmony_ci return -ENOMEM; 30958c2ecf20Sopenharmony_ci } 30968c2ecf20Sopenharmony_ci return 0; 30978c2ecf20Sopenharmony_ci} 30988c2ecf20Sopenharmony_ci 30998c2ecf20Sopenharmony_ci/** 31008c2ecf20Sopenharmony_ci * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg 31018c2ecf20Sopenharmony_ci * @memcg: memcg to uncharge 31028c2ecf20Sopenharmony_ci * @nr_pages: number of pages to uncharge 31038c2ecf20Sopenharmony_ci */ 31048c2ecf20Sopenharmony_civoid __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) 31058c2ecf20Sopenharmony_ci{ 31068c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 31078c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->kmem, nr_pages); 31088c2ecf20Sopenharmony_ci 31098c2ecf20Sopenharmony_ci refill_stock(memcg, nr_pages); 31108c2ecf20Sopenharmony_ci} 31118c2ecf20Sopenharmony_ci 31128c2ecf20Sopenharmony_ci/** 31138c2ecf20Sopenharmony_ci * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 31148c2ecf20Sopenharmony_ci * @page: page to charge 31158c2ecf20Sopenharmony_ci * @gfp: reclaim mode 31168c2ecf20Sopenharmony_ci * @order: allocation order 31178c2ecf20Sopenharmony_ci * 31188c2ecf20Sopenharmony_ci * Returns 0 on success, an error code on failure. 31198c2ecf20Sopenharmony_ci */ 31208c2ecf20Sopenharmony_ciint __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 31218c2ecf20Sopenharmony_ci{ 31228c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 31238c2ecf20Sopenharmony_ci int ret = 0; 31248c2ecf20Sopenharmony_ci 31258c2ecf20Sopenharmony_ci memcg = get_mem_cgroup_from_current(); 31268c2ecf20Sopenharmony_ci if (memcg && !mem_cgroup_is_root(memcg)) { 31278c2ecf20Sopenharmony_ci ret = __memcg_kmem_charge(memcg, gfp, 1 << order); 31288c2ecf20Sopenharmony_ci if (!ret) { 31298c2ecf20Sopenharmony_ci page->mem_cgroup = memcg; 31308c2ecf20Sopenharmony_ci __SetPageKmemcg(page); 31318c2ecf20Sopenharmony_ci return 0; 31328c2ecf20Sopenharmony_ci } 31338c2ecf20Sopenharmony_ci css_put(&memcg->css); 31348c2ecf20Sopenharmony_ci } 31358c2ecf20Sopenharmony_ci return ret; 31368c2ecf20Sopenharmony_ci} 31378c2ecf20Sopenharmony_ci 31388c2ecf20Sopenharmony_ci/** 31398c2ecf20Sopenharmony_ci * __memcg_kmem_uncharge_page: uncharge a kmem page 31408c2ecf20Sopenharmony_ci * @page: page to uncharge 31418c2ecf20Sopenharmony_ci * @order: allocation order 31428c2ecf20Sopenharmony_ci */ 31438c2ecf20Sopenharmony_civoid __memcg_kmem_uncharge_page(struct page *page, int order) 31448c2ecf20Sopenharmony_ci{ 31458c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = page->mem_cgroup; 31468c2ecf20Sopenharmony_ci unsigned int nr_pages = 1 << order; 31478c2ecf20Sopenharmony_ci 31488c2ecf20Sopenharmony_ci if (!memcg) 31498c2ecf20Sopenharmony_ci return; 31508c2ecf20Sopenharmony_ci 31518c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 31528c2ecf20Sopenharmony_ci __memcg_kmem_uncharge(memcg, nr_pages); 31538c2ecf20Sopenharmony_ci page->mem_cgroup = NULL; 31548c2ecf20Sopenharmony_ci css_put(&memcg->css); 31558c2ecf20Sopenharmony_ci 31568c2ecf20Sopenharmony_ci /* slab pages do not have PageKmemcg flag set */ 31578c2ecf20Sopenharmony_ci if (PageKmemcg(page)) 31588c2ecf20Sopenharmony_ci __ClearPageKmemcg(page); 
31598c2ecf20Sopenharmony_ci} 31608c2ecf20Sopenharmony_ci 31618c2ecf20Sopenharmony_cistatic bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 31628c2ecf20Sopenharmony_ci{ 31638c2ecf20Sopenharmony_ci struct memcg_stock_pcp *stock; 31648c2ecf20Sopenharmony_ci unsigned long flags; 31658c2ecf20Sopenharmony_ci bool ret = false; 31668c2ecf20Sopenharmony_ci 31678c2ecf20Sopenharmony_ci local_irq_save(flags); 31688c2ecf20Sopenharmony_ci 31698c2ecf20Sopenharmony_ci stock = this_cpu_ptr(&memcg_stock); 31708c2ecf20Sopenharmony_ci if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { 31718c2ecf20Sopenharmony_ci stock->nr_bytes -= nr_bytes; 31728c2ecf20Sopenharmony_ci ret = true; 31738c2ecf20Sopenharmony_ci } 31748c2ecf20Sopenharmony_ci 31758c2ecf20Sopenharmony_ci local_irq_restore(flags); 31768c2ecf20Sopenharmony_ci 31778c2ecf20Sopenharmony_ci return ret; 31788c2ecf20Sopenharmony_ci} 31798c2ecf20Sopenharmony_ci 31808c2ecf20Sopenharmony_cistatic void drain_obj_stock(struct memcg_stock_pcp *stock) 31818c2ecf20Sopenharmony_ci{ 31828c2ecf20Sopenharmony_ci struct obj_cgroup *old = stock->cached_objcg; 31838c2ecf20Sopenharmony_ci 31848c2ecf20Sopenharmony_ci if (!old) 31858c2ecf20Sopenharmony_ci return; 31868c2ecf20Sopenharmony_ci 31878c2ecf20Sopenharmony_ci if (stock->nr_bytes) { 31888c2ecf20Sopenharmony_ci unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 31898c2ecf20Sopenharmony_ci unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 31908c2ecf20Sopenharmony_ci 31918c2ecf20Sopenharmony_ci if (nr_pages) { 31928c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 31938c2ecf20Sopenharmony_ci 31948c2ecf20Sopenharmony_ci rcu_read_lock(); 31958c2ecf20Sopenharmony_ciretry: 31968c2ecf20Sopenharmony_ci memcg = obj_cgroup_memcg(old); 31978c2ecf20Sopenharmony_ci if (unlikely(!css_tryget(&memcg->css))) 31988c2ecf20Sopenharmony_ci goto retry; 31998c2ecf20Sopenharmony_ci rcu_read_unlock(); 32008c2ecf20Sopenharmony_ci 32018c2ecf20Sopenharmony_ci __memcg_kmem_uncharge(memcg, nr_pages); 32028c2ecf20Sopenharmony_ci css_put(&memcg->css); 32038c2ecf20Sopenharmony_ci } 32048c2ecf20Sopenharmony_ci 32058c2ecf20Sopenharmony_ci /* 32068c2ecf20Sopenharmony_ci * The leftover is flushed to the centralized per-memcg value. 32078c2ecf20Sopenharmony_ci * On the next attempt to refill obj stock it will be moved 32088c2ecf20Sopenharmony_ci * to a per-cpu stock (probably, on an other CPU), see 32098c2ecf20Sopenharmony_ci * refill_obj_stock(). 32108c2ecf20Sopenharmony_ci * 32118c2ecf20Sopenharmony_ci * How often it's flushed is a trade-off between the memory 32128c2ecf20Sopenharmony_ci * limit enforcement accuracy and potential CPU contention, 32138c2ecf20Sopenharmony_ci * so it might be changed in the future. 
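 *
 * Editor's example (4K pages assumed): with stock->nr_bytes == 4296,
 * one full page was uncharged above and the leftover 200 bytes are
 * flushed to old->nr_charged_bytes just below.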
32148c2ecf20Sopenharmony_ci */ 32158c2ecf20Sopenharmony_ci atomic_add(nr_bytes, &old->nr_charged_bytes); 32168c2ecf20Sopenharmony_ci stock->nr_bytes = 0; 32178c2ecf20Sopenharmony_ci } 32188c2ecf20Sopenharmony_ci 32198c2ecf20Sopenharmony_ci obj_cgroup_put(old); 32208c2ecf20Sopenharmony_ci stock->cached_objcg = NULL; 32218c2ecf20Sopenharmony_ci} 32228c2ecf20Sopenharmony_ci 32238c2ecf20Sopenharmony_cistatic bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 32248c2ecf20Sopenharmony_ci struct mem_cgroup *root_memcg) 32258c2ecf20Sopenharmony_ci{ 32268c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 32278c2ecf20Sopenharmony_ci 32288c2ecf20Sopenharmony_ci if (stock->cached_objcg) { 32298c2ecf20Sopenharmony_ci memcg = obj_cgroup_memcg(stock->cached_objcg); 32308c2ecf20Sopenharmony_ci if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 32318c2ecf20Sopenharmony_ci return true; 32328c2ecf20Sopenharmony_ci } 32338c2ecf20Sopenharmony_ci 32348c2ecf20Sopenharmony_ci return false; 32358c2ecf20Sopenharmony_ci} 32368c2ecf20Sopenharmony_ci 32378c2ecf20Sopenharmony_cistatic void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 32388c2ecf20Sopenharmony_ci{ 32398c2ecf20Sopenharmony_ci struct memcg_stock_pcp *stock; 32408c2ecf20Sopenharmony_ci unsigned long flags; 32418c2ecf20Sopenharmony_ci 32428c2ecf20Sopenharmony_ci local_irq_save(flags); 32438c2ecf20Sopenharmony_ci 32448c2ecf20Sopenharmony_ci stock = this_cpu_ptr(&memcg_stock); 32458c2ecf20Sopenharmony_ci if (stock->cached_objcg != objcg) { /* reset if necessary */ 32468c2ecf20Sopenharmony_ci drain_obj_stock(stock); 32478c2ecf20Sopenharmony_ci obj_cgroup_get(objcg); 32488c2ecf20Sopenharmony_ci stock->cached_objcg = objcg; 32498c2ecf20Sopenharmony_ci stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0); 32508c2ecf20Sopenharmony_ci } 32518c2ecf20Sopenharmony_ci stock->nr_bytes += nr_bytes; 32528c2ecf20Sopenharmony_ci 32538c2ecf20Sopenharmony_ci if (stock->nr_bytes > PAGE_SIZE) 32548c2ecf20Sopenharmony_ci drain_obj_stock(stock); 32558c2ecf20Sopenharmony_ci 32568c2ecf20Sopenharmony_ci local_irq_restore(flags); 32578c2ecf20Sopenharmony_ci} 32588c2ecf20Sopenharmony_ci 32598c2ecf20Sopenharmony_ciint obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 32608c2ecf20Sopenharmony_ci{ 32618c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 32628c2ecf20Sopenharmony_ci unsigned int nr_pages, nr_bytes; 32638c2ecf20Sopenharmony_ci int ret; 32648c2ecf20Sopenharmony_ci 32658c2ecf20Sopenharmony_ci if (consume_obj_stock(objcg, size)) 32668c2ecf20Sopenharmony_ci return 0; 32678c2ecf20Sopenharmony_ci 32688c2ecf20Sopenharmony_ci /* 32698c2ecf20Sopenharmony_ci * In theory, memcg->nr_charged_bytes can have enough 32708c2ecf20Sopenharmony_ci * pre-charged bytes to satisfy the allocation. However, 32718c2ecf20Sopenharmony_ci * flushing memcg->nr_charged_bytes requires two atomic 32728c2ecf20Sopenharmony_ci * operations, and memcg->nr_charged_bytes can't be big, 32738c2ecf20Sopenharmony_ci * so it's better to ignore it and try grab some new pages. 32748c2ecf20Sopenharmony_ci * memcg->nr_charged_bytes will be flushed in 32758c2ecf20Sopenharmony_ci * refill_obj_stock(), called from this function or 32768c2ecf20Sopenharmony_ci * independently later. 
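 *
 * Editor's example (4K pages assumed): a 700-byte charge rounds up to
 * one page below; __memcg_kmem_charge() charges that page and the
 * unused PAGE_SIZE - 700 == 3396 bytes are handed back to the per-cpu
 * stock via refill_obj_stock().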
32778c2ecf20Sopenharmony_ci */ 32788c2ecf20Sopenharmony_ci rcu_read_lock(); 32798c2ecf20Sopenharmony_ciretry: 32808c2ecf20Sopenharmony_ci memcg = obj_cgroup_memcg(objcg); 32818c2ecf20Sopenharmony_ci if (unlikely(!css_tryget(&memcg->css))) 32828c2ecf20Sopenharmony_ci goto retry; 32838c2ecf20Sopenharmony_ci rcu_read_unlock(); 32848c2ecf20Sopenharmony_ci 32858c2ecf20Sopenharmony_ci nr_pages = size >> PAGE_SHIFT; 32868c2ecf20Sopenharmony_ci nr_bytes = size & (PAGE_SIZE - 1); 32878c2ecf20Sopenharmony_ci 32888c2ecf20Sopenharmony_ci if (nr_bytes) 32898c2ecf20Sopenharmony_ci nr_pages += 1; 32908c2ecf20Sopenharmony_ci 32918c2ecf20Sopenharmony_ci ret = __memcg_kmem_charge(memcg, gfp, nr_pages); 32928c2ecf20Sopenharmony_ci if (!ret && nr_bytes) 32938c2ecf20Sopenharmony_ci refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); 32948c2ecf20Sopenharmony_ci 32958c2ecf20Sopenharmony_ci css_put(&memcg->css); 32968c2ecf20Sopenharmony_ci return ret; 32978c2ecf20Sopenharmony_ci} 32988c2ecf20Sopenharmony_ci 32998c2ecf20Sopenharmony_civoid obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 33008c2ecf20Sopenharmony_ci{ 33018c2ecf20Sopenharmony_ci refill_obj_stock(objcg, size); 33028c2ecf20Sopenharmony_ci} 33038c2ecf20Sopenharmony_ci 33048c2ecf20Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */ 33058c2ecf20Sopenharmony_ci 33068c2ecf20Sopenharmony_ci/* 33078c2ecf20Sopenharmony_ci * Because head->mem_cgroup is not set on tails, set it now. 33088c2ecf20Sopenharmony_ci */ 33098c2ecf20Sopenharmony_civoid split_page_memcg(struct page *head, unsigned int nr) 33108c2ecf20Sopenharmony_ci{ 33118c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = head->mem_cgroup; 33128c2ecf20Sopenharmony_ci int kmemcg = PageKmemcg(head); 33138c2ecf20Sopenharmony_ci int i; 33148c2ecf20Sopenharmony_ci 33158c2ecf20Sopenharmony_ci if (mem_cgroup_disabled() || !memcg) 33168c2ecf20Sopenharmony_ci return; 33178c2ecf20Sopenharmony_ci 33188c2ecf20Sopenharmony_ci for (i = 1; i < nr; i++) { 33198c2ecf20Sopenharmony_ci head[i].mem_cgroup = memcg; 33208c2ecf20Sopenharmony_ci if (kmemcg) 33218c2ecf20Sopenharmony_ci __SetPageKmemcg(head + i); 33228c2ecf20Sopenharmony_ci } 33238c2ecf20Sopenharmony_ci css_get_many(&memcg->css, nr - 1); 33248c2ecf20Sopenharmony_ci} 33258c2ecf20Sopenharmony_ci 33268c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_SWAP 33278c2ecf20Sopenharmony_ci/** 33288c2ecf20Sopenharmony_ci * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 33298c2ecf20Sopenharmony_ci * @entry: swap entry to be moved 33308c2ecf20Sopenharmony_ci * @from: mem_cgroup which the entry is moved from 33318c2ecf20Sopenharmony_ci * @to: mem_cgroup which the entry is moved to 33328c2ecf20Sopenharmony_ci * 33338c2ecf20Sopenharmony_ci * It succeeds only when the swap_cgroup's record for this entry is the same 33348c2ecf20Sopenharmony_ci * as the mem_cgroup's id of @from. 33358c2ecf20Sopenharmony_ci * 33368c2ecf20Sopenharmony_ci * Returns 0 on success, -EINVAL on failure. 33378c2ecf20Sopenharmony_ci * 33388c2ecf20Sopenharmony_ci * The caller must have charged to @to, IOW, called page_counter_charge() about 33398c2ecf20Sopenharmony_ci * both res and memsw, and called css_get(). 
33408c2ecf20Sopenharmony_ci */ 33418c2ecf20Sopenharmony_cistatic int mem_cgroup_move_swap_account(swp_entry_t entry, 33428c2ecf20Sopenharmony_ci struct mem_cgroup *from, struct mem_cgroup *to) 33438c2ecf20Sopenharmony_ci{ 33448c2ecf20Sopenharmony_ci unsigned short old_id, new_id; 33458c2ecf20Sopenharmony_ci 33468c2ecf20Sopenharmony_ci old_id = mem_cgroup_id(from); 33478c2ecf20Sopenharmony_ci new_id = mem_cgroup_id(to); 33488c2ecf20Sopenharmony_ci 33498c2ecf20Sopenharmony_ci if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 33508c2ecf20Sopenharmony_ci mod_memcg_state(from, MEMCG_SWAP, -1); 33518c2ecf20Sopenharmony_ci mod_memcg_state(to, MEMCG_SWAP, 1); 33528c2ecf20Sopenharmony_ci return 0; 33538c2ecf20Sopenharmony_ci } 33548c2ecf20Sopenharmony_ci return -EINVAL; 33558c2ecf20Sopenharmony_ci} 33568c2ecf20Sopenharmony_ci#else 33578c2ecf20Sopenharmony_cistatic inline int mem_cgroup_move_swap_account(swp_entry_t entry, 33588c2ecf20Sopenharmony_ci struct mem_cgroup *from, struct mem_cgroup *to) 33598c2ecf20Sopenharmony_ci{ 33608c2ecf20Sopenharmony_ci return -EINVAL; 33618c2ecf20Sopenharmony_ci} 33628c2ecf20Sopenharmony_ci#endif 33638c2ecf20Sopenharmony_ci 33648c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(memcg_max_mutex); 33658c2ecf20Sopenharmony_ci 33668c2ecf20Sopenharmony_cistatic int mem_cgroup_resize_max(struct mem_cgroup *memcg, 33678c2ecf20Sopenharmony_ci unsigned long max, bool memsw) 33688c2ecf20Sopenharmony_ci{ 33698c2ecf20Sopenharmony_ci bool enlarge = false; 33708c2ecf20Sopenharmony_ci bool drained = false; 33718c2ecf20Sopenharmony_ci int ret; 33728c2ecf20Sopenharmony_ci bool limits_invariant; 33738c2ecf20Sopenharmony_ci struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 33748c2ecf20Sopenharmony_ci 33758c2ecf20Sopenharmony_ci do { 33768c2ecf20Sopenharmony_ci if (signal_pending(current)) { 33778c2ecf20Sopenharmony_ci ret = -EINTR; 33788c2ecf20Sopenharmony_ci break; 33798c2ecf20Sopenharmony_ci } 33808c2ecf20Sopenharmony_ci 33818c2ecf20Sopenharmony_ci mutex_lock(&memcg_max_mutex); 33828c2ecf20Sopenharmony_ci /* 33838c2ecf20Sopenharmony_ci * Make sure that the new limit (memsw or memory limit) doesn't 33848c2ecf20Sopenharmony_ci * break our basic invariant rule memory.max <= memsw.max. 33858c2ecf20Sopenharmony_ci */ 33868c2ecf20Sopenharmony_ci limits_invariant = memsw ? 
max >= READ_ONCE(memcg->memory.max) : 33878c2ecf20Sopenharmony_ci max <= memcg->memsw.max; 33888c2ecf20Sopenharmony_ci if (!limits_invariant) { 33898c2ecf20Sopenharmony_ci mutex_unlock(&memcg_max_mutex); 33908c2ecf20Sopenharmony_ci ret = -EINVAL; 33918c2ecf20Sopenharmony_ci break; 33928c2ecf20Sopenharmony_ci } 33938c2ecf20Sopenharmony_ci if (max > counter->max) 33948c2ecf20Sopenharmony_ci enlarge = true; 33958c2ecf20Sopenharmony_ci ret = page_counter_set_max(counter, max); 33968c2ecf20Sopenharmony_ci mutex_unlock(&memcg_max_mutex); 33978c2ecf20Sopenharmony_ci 33988c2ecf20Sopenharmony_ci if (!ret) 33998c2ecf20Sopenharmony_ci break; 34008c2ecf20Sopenharmony_ci 34018c2ecf20Sopenharmony_ci if (!drained) { 34028c2ecf20Sopenharmony_ci drain_all_stock(memcg); 34038c2ecf20Sopenharmony_ci drained = true; 34048c2ecf20Sopenharmony_ci continue; 34058c2ecf20Sopenharmony_ci } 34068c2ecf20Sopenharmony_ci 34078c2ecf20Sopenharmony_ci if (!try_to_free_mem_cgroup_pages(memcg, 1, 34088c2ecf20Sopenharmony_ci GFP_KERNEL, !memsw)) { 34098c2ecf20Sopenharmony_ci ret = -EBUSY; 34108c2ecf20Sopenharmony_ci break; 34118c2ecf20Sopenharmony_ci } 34128c2ecf20Sopenharmony_ci } while (true); 34138c2ecf20Sopenharmony_ci 34148c2ecf20Sopenharmony_ci if (!ret && enlarge) 34158c2ecf20Sopenharmony_ci memcg_oom_recover(memcg); 34168c2ecf20Sopenharmony_ci 34178c2ecf20Sopenharmony_ci return ret; 34188c2ecf20Sopenharmony_ci} 34198c2ecf20Sopenharmony_ci 34208c2ecf20Sopenharmony_ciunsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 34218c2ecf20Sopenharmony_ci gfp_t gfp_mask, 34228c2ecf20Sopenharmony_ci unsigned long *total_scanned) 34238c2ecf20Sopenharmony_ci{ 34248c2ecf20Sopenharmony_ci unsigned long nr_reclaimed = 0; 34258c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *mz, *next_mz = NULL; 34268c2ecf20Sopenharmony_ci unsigned long reclaimed; 34278c2ecf20Sopenharmony_ci int loop = 0; 34288c2ecf20Sopenharmony_ci struct mem_cgroup_tree_per_node *mctz; 34298c2ecf20Sopenharmony_ci unsigned long excess; 34308c2ecf20Sopenharmony_ci unsigned long nr_scanned; 34318c2ecf20Sopenharmony_ci 34328c2ecf20Sopenharmony_ci if (order > 0) 34338c2ecf20Sopenharmony_ci return 0; 34348c2ecf20Sopenharmony_ci 34358c2ecf20Sopenharmony_ci mctz = soft_limit_tree_node(pgdat->node_id); 34368c2ecf20Sopenharmony_ci 34378c2ecf20Sopenharmony_ci /* 34388c2ecf20Sopenharmony_ci * Do not even bother to check the largest node if the root 34398c2ecf20Sopenharmony_ci * is empty. Do it lockless to prevent lock bouncing. Races 34408c2ecf20Sopenharmony_ci * are acceptable as soft limit is best effort anyway. 
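 *
 * (Editorial note, illustrative only: a memcg ends up on this per-node
 *  tree once its usage exceeds the soft limit configured through the
 *  cgroup v1 interface, e.g.
 *      echo 256M > /sys/fs/cgroup/memory/<group>/memory.soft_limit_in_bytes
 *  which is handled by the RES_SOFT_LIMIT case of mem_cgroup_write()
 *  further down in this file; the mount path and value are assumed
 *  examples, not taken from this source.)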
34418c2ecf20Sopenharmony_ci */ 34428c2ecf20Sopenharmony_ci if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 34438c2ecf20Sopenharmony_ci return 0; 34448c2ecf20Sopenharmony_ci 34458c2ecf20Sopenharmony_ci /* 34468c2ecf20Sopenharmony_ci * This loop can run a while, specially if mem_cgroup's continuously 34478c2ecf20Sopenharmony_ci * keep exceeding their soft limit and putting the system under 34488c2ecf20Sopenharmony_ci * pressure 34498c2ecf20Sopenharmony_ci */ 34508c2ecf20Sopenharmony_ci do { 34518c2ecf20Sopenharmony_ci if (next_mz) 34528c2ecf20Sopenharmony_ci mz = next_mz; 34538c2ecf20Sopenharmony_ci else 34548c2ecf20Sopenharmony_ci mz = mem_cgroup_largest_soft_limit_node(mctz); 34558c2ecf20Sopenharmony_ci if (!mz) 34568c2ecf20Sopenharmony_ci break; 34578c2ecf20Sopenharmony_ci 34588c2ecf20Sopenharmony_ci nr_scanned = 0; 34598c2ecf20Sopenharmony_ci reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 34608c2ecf20Sopenharmony_ci gfp_mask, &nr_scanned); 34618c2ecf20Sopenharmony_ci nr_reclaimed += reclaimed; 34628c2ecf20Sopenharmony_ci *total_scanned += nr_scanned; 34638c2ecf20Sopenharmony_ci spin_lock_irq(&mctz->lock); 34648c2ecf20Sopenharmony_ci __mem_cgroup_remove_exceeded(mz, mctz); 34658c2ecf20Sopenharmony_ci 34668c2ecf20Sopenharmony_ci /* 34678c2ecf20Sopenharmony_ci * If we failed to reclaim anything from this memory cgroup 34688c2ecf20Sopenharmony_ci * it is time to move on to the next cgroup 34698c2ecf20Sopenharmony_ci */ 34708c2ecf20Sopenharmony_ci next_mz = NULL; 34718c2ecf20Sopenharmony_ci if (!reclaimed) 34728c2ecf20Sopenharmony_ci next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 34738c2ecf20Sopenharmony_ci 34748c2ecf20Sopenharmony_ci excess = soft_limit_excess(mz->memcg); 34758c2ecf20Sopenharmony_ci /* 34768c2ecf20Sopenharmony_ci * One school of thought says that we should not add 34778c2ecf20Sopenharmony_ci * back the node to the tree if reclaim returns 0. 34788c2ecf20Sopenharmony_ci * But our reclaim could return 0, simply because due 34798c2ecf20Sopenharmony_ci * to priority we are exposing a smaller subset of 34808c2ecf20Sopenharmony_ci * memory to reclaim from. Consider this as a longer 34818c2ecf20Sopenharmony_ci * term TODO. 34828c2ecf20Sopenharmony_ci */ 34838c2ecf20Sopenharmony_ci /* If excess == 0, no tree ops */ 34848c2ecf20Sopenharmony_ci __mem_cgroup_insert_exceeded(mz, mctz, excess); 34858c2ecf20Sopenharmony_ci spin_unlock_irq(&mctz->lock); 34868c2ecf20Sopenharmony_ci css_put(&mz->memcg->css); 34878c2ecf20Sopenharmony_ci loop++; 34888c2ecf20Sopenharmony_ci /* 34898c2ecf20Sopenharmony_ci * Could not reclaim anything and there are no more 34908c2ecf20Sopenharmony_ci * mem cgroups to try or we seem to be looping without 34918c2ecf20Sopenharmony_ci * reclaiming anything. 34928c2ecf20Sopenharmony_ci */ 34938c2ecf20Sopenharmony_ci if (!nr_reclaimed && 34948c2ecf20Sopenharmony_ci (next_mz == NULL || 34958c2ecf20Sopenharmony_ci loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 34968c2ecf20Sopenharmony_ci break; 34978c2ecf20Sopenharmony_ci } while (!nr_reclaimed); 34988c2ecf20Sopenharmony_ci if (next_mz) 34998c2ecf20Sopenharmony_ci css_put(&next_mz->memcg->css); 35008c2ecf20Sopenharmony_ci return nr_reclaimed; 35018c2ecf20Sopenharmony_ci} 35028c2ecf20Sopenharmony_ci 35038c2ecf20Sopenharmony_ci/* 35048c2ecf20Sopenharmony_ci * Test whether @memcg has children, dead or alive. 
Note that this 35058c2ecf20Sopenharmony_ci * function doesn't care whether @memcg has use_hierarchy enabled and 35068c2ecf20Sopenharmony_ci * returns %true if there are child csses according to the cgroup 35078c2ecf20Sopenharmony_ci * hierarchy. Testing use_hierarchy is the caller's responsibility. 35088c2ecf20Sopenharmony_ci */ 35098c2ecf20Sopenharmony_cistatic inline bool memcg_has_children(struct mem_cgroup *memcg) 35108c2ecf20Sopenharmony_ci{ 35118c2ecf20Sopenharmony_ci bool ret; 35128c2ecf20Sopenharmony_ci 35138c2ecf20Sopenharmony_ci rcu_read_lock(); 35148c2ecf20Sopenharmony_ci ret = css_next_child(NULL, &memcg->css); 35158c2ecf20Sopenharmony_ci rcu_read_unlock(); 35168c2ecf20Sopenharmony_ci return ret; 35178c2ecf20Sopenharmony_ci} 35188c2ecf20Sopenharmony_ci 35198c2ecf20Sopenharmony_ci/* 35208c2ecf20Sopenharmony_ci * Reclaims as many pages from the given memcg as possible. 35218c2ecf20Sopenharmony_ci * 35228c2ecf20Sopenharmony_ci * Caller is responsible for holding css reference for memcg. 35238c2ecf20Sopenharmony_ci */ 35248c2ecf20Sopenharmony_cistatic int mem_cgroup_force_empty(struct mem_cgroup *memcg) 35258c2ecf20Sopenharmony_ci{ 35268c2ecf20Sopenharmony_ci int nr_retries = MAX_RECLAIM_RETRIES; 35278c2ecf20Sopenharmony_ci 35288c2ecf20Sopenharmony_ci /* we call try-to-free pages for make this cgroup empty */ 35298c2ecf20Sopenharmony_ci lru_add_drain_all(); 35308c2ecf20Sopenharmony_ci 35318c2ecf20Sopenharmony_ci drain_all_stock(memcg); 35328c2ecf20Sopenharmony_ci 35338c2ecf20Sopenharmony_ci /* try to free all pages in this cgroup */ 35348c2ecf20Sopenharmony_ci while (nr_retries && page_counter_read(&memcg->memory)) { 35358c2ecf20Sopenharmony_ci int progress; 35368c2ecf20Sopenharmony_ci 35378c2ecf20Sopenharmony_ci if (signal_pending(current)) 35388c2ecf20Sopenharmony_ci return -EINTR; 35398c2ecf20Sopenharmony_ci 35408c2ecf20Sopenharmony_ci progress = try_to_free_mem_cgroup_pages(memcg, 1, 35418c2ecf20Sopenharmony_ci GFP_KERNEL, true); 35428c2ecf20Sopenharmony_ci if (!progress) { 35438c2ecf20Sopenharmony_ci nr_retries--; 35448c2ecf20Sopenharmony_ci /* maybe some writeback is necessary */ 35458c2ecf20Sopenharmony_ci congestion_wait(BLK_RW_ASYNC, HZ/10); 35468c2ecf20Sopenharmony_ci } 35478c2ecf20Sopenharmony_ci 35488c2ecf20Sopenharmony_ci } 35498c2ecf20Sopenharmony_ci 35508c2ecf20Sopenharmony_ci return 0; 35518c2ecf20Sopenharmony_ci} 35528c2ecf20Sopenharmony_ci 35538c2ecf20Sopenharmony_cistatic ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 35548c2ecf20Sopenharmony_ci char *buf, size_t nbytes, 35558c2ecf20Sopenharmony_ci loff_t off) 35568c2ecf20Sopenharmony_ci{ 35578c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 35588c2ecf20Sopenharmony_ci 35598c2ecf20Sopenharmony_ci if (mem_cgroup_is_root(memcg)) 35608c2ecf20Sopenharmony_ci return -EINVAL; 35618c2ecf20Sopenharmony_ci return mem_cgroup_force_empty(memcg) ?: nbytes; 35628c2ecf20Sopenharmony_ci} 35638c2ecf20Sopenharmony_ci 35648c2ecf20Sopenharmony_cistatic u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 35658c2ecf20Sopenharmony_ci struct cftype *cft) 35668c2ecf20Sopenharmony_ci{ 35678c2ecf20Sopenharmony_ci return mem_cgroup_from_css(css)->use_hierarchy; 35688c2ecf20Sopenharmony_ci} 35698c2ecf20Sopenharmony_ci 35708c2ecf20Sopenharmony_cistatic int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 35718c2ecf20Sopenharmony_ci struct cftype *cft, u64 val) 35728c2ecf20Sopenharmony_ci{ 35738c2ecf20Sopenharmony_ci int retval = 0; 35748c2ecf20Sopenharmony_ci struct 
mem_cgroup *memcg = mem_cgroup_from_css(css); 35758c2ecf20Sopenharmony_ci struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 35768c2ecf20Sopenharmony_ci 35778c2ecf20Sopenharmony_ci if (memcg->use_hierarchy == val) 35788c2ecf20Sopenharmony_ci return 0; 35798c2ecf20Sopenharmony_ci 35808c2ecf20Sopenharmony_ci /* 35818c2ecf20Sopenharmony_ci * If parent's use_hierarchy is set, we can't make any modifications 35828c2ecf20Sopenharmony_ci * in the child subtrees. If it is unset, then the change can 35838c2ecf20Sopenharmony_ci * occur, provided the current cgroup has no children. 35848c2ecf20Sopenharmony_ci * 35858c2ecf20Sopenharmony_ci * For the root cgroup, parent_mem is NULL, we allow value to be 35868c2ecf20Sopenharmony_ci * set if there are no children. 35878c2ecf20Sopenharmony_ci */ 35888c2ecf20Sopenharmony_ci if ((!parent_memcg || !parent_memcg->use_hierarchy) && 35898c2ecf20Sopenharmony_ci (val == 1 || val == 0)) { 35908c2ecf20Sopenharmony_ci if (!memcg_has_children(memcg)) 35918c2ecf20Sopenharmony_ci memcg->use_hierarchy = val; 35928c2ecf20Sopenharmony_ci else 35938c2ecf20Sopenharmony_ci retval = -EBUSY; 35948c2ecf20Sopenharmony_ci } else 35958c2ecf20Sopenharmony_ci retval = -EINVAL; 35968c2ecf20Sopenharmony_ci 35978c2ecf20Sopenharmony_ci return retval; 35988c2ecf20Sopenharmony_ci} 35998c2ecf20Sopenharmony_ci 36008c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 36018c2ecf20Sopenharmony_ci{ 36028c2ecf20Sopenharmony_ci unsigned long val; 36038c2ecf20Sopenharmony_ci 36048c2ecf20Sopenharmony_ci if (mem_cgroup_is_root(memcg)) { 36058c2ecf20Sopenharmony_ci val = memcg_page_state(memcg, NR_FILE_PAGES) + 36068c2ecf20Sopenharmony_ci memcg_page_state(memcg, NR_ANON_MAPPED); 36078c2ecf20Sopenharmony_ci if (swap) 36088c2ecf20Sopenharmony_ci val += memcg_page_state(memcg, MEMCG_SWAP); 36098c2ecf20Sopenharmony_ci } else { 36108c2ecf20Sopenharmony_ci if (!swap) 36118c2ecf20Sopenharmony_ci val = page_counter_read(&memcg->memory); 36128c2ecf20Sopenharmony_ci else 36138c2ecf20Sopenharmony_ci val = page_counter_read(&memcg->memsw); 36148c2ecf20Sopenharmony_ci } 36158c2ecf20Sopenharmony_ci return val; 36168c2ecf20Sopenharmony_ci} 36178c2ecf20Sopenharmony_ci 36188c2ecf20Sopenharmony_cienum { 36198c2ecf20Sopenharmony_ci RES_USAGE, 36208c2ecf20Sopenharmony_ci RES_LIMIT, 36218c2ecf20Sopenharmony_ci RES_MAX_USAGE, 36228c2ecf20Sopenharmony_ci RES_FAILCNT, 36238c2ecf20Sopenharmony_ci RES_SOFT_LIMIT, 36248c2ecf20Sopenharmony_ci}; 36258c2ecf20Sopenharmony_ci 36268c2ecf20Sopenharmony_cistatic u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 36278c2ecf20Sopenharmony_ci struct cftype *cft) 36288c2ecf20Sopenharmony_ci{ 36298c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 36308c2ecf20Sopenharmony_ci struct page_counter *counter; 36318c2ecf20Sopenharmony_ci 36328c2ecf20Sopenharmony_ci switch (MEMFILE_TYPE(cft->private)) { 36338c2ecf20Sopenharmony_ci case _MEM: 36348c2ecf20Sopenharmony_ci counter = &memcg->memory; 36358c2ecf20Sopenharmony_ci break; 36368c2ecf20Sopenharmony_ci case _MEMSWAP: 36378c2ecf20Sopenharmony_ci counter = &memcg->memsw; 36388c2ecf20Sopenharmony_ci break; 36398c2ecf20Sopenharmony_ci case _KMEM: 36408c2ecf20Sopenharmony_ci counter = &memcg->kmem; 36418c2ecf20Sopenharmony_ci break; 36428c2ecf20Sopenharmony_ci case _TCP: 36438c2ecf20Sopenharmony_ci counter = &memcg->tcpmem; 36448c2ecf20Sopenharmony_ci break; 36458c2ecf20Sopenharmony_ci default: 36468c2ecf20Sopenharmony_ci BUG(); 
36478c2ecf20Sopenharmony_ci } 36488c2ecf20Sopenharmony_ci 36498c2ecf20Sopenharmony_ci switch (MEMFILE_ATTR(cft->private)) { 36508c2ecf20Sopenharmony_ci case RES_USAGE: 36518c2ecf20Sopenharmony_ci if (counter == &memcg->memory) 36528c2ecf20Sopenharmony_ci return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 36538c2ecf20Sopenharmony_ci if (counter == &memcg->memsw) 36548c2ecf20Sopenharmony_ci return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 36558c2ecf20Sopenharmony_ci return (u64)page_counter_read(counter) * PAGE_SIZE; 36568c2ecf20Sopenharmony_ci case RES_LIMIT: 36578c2ecf20Sopenharmony_ci return (u64)counter->max * PAGE_SIZE; 36588c2ecf20Sopenharmony_ci case RES_MAX_USAGE: 36598c2ecf20Sopenharmony_ci return (u64)counter->watermark * PAGE_SIZE; 36608c2ecf20Sopenharmony_ci case RES_FAILCNT: 36618c2ecf20Sopenharmony_ci return counter->failcnt; 36628c2ecf20Sopenharmony_ci case RES_SOFT_LIMIT: 36638c2ecf20Sopenharmony_ci return (u64)memcg->soft_limit * PAGE_SIZE; 36648c2ecf20Sopenharmony_ci default: 36658c2ecf20Sopenharmony_ci BUG(); 36668c2ecf20Sopenharmony_ci } 36678c2ecf20Sopenharmony_ci} 36688c2ecf20Sopenharmony_ci 36698c2ecf20Sopenharmony_cistatic void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) 36708c2ecf20Sopenharmony_ci{ 36718c2ecf20Sopenharmony_ci unsigned long stat[MEMCG_NR_STAT] = {0}; 36728c2ecf20Sopenharmony_ci struct mem_cgroup *mi; 36738c2ecf20Sopenharmony_ci int node, cpu, i; 36748c2ecf20Sopenharmony_ci 36758c2ecf20Sopenharmony_ci for_each_online_cpu(cpu) 36768c2ecf20Sopenharmony_ci for (i = 0; i < MEMCG_NR_STAT; i++) 36778c2ecf20Sopenharmony_ci stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); 36788c2ecf20Sopenharmony_ci 36798c2ecf20Sopenharmony_ci for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 36808c2ecf20Sopenharmony_ci for (i = 0; i < MEMCG_NR_STAT; i++) 36818c2ecf20Sopenharmony_ci atomic_long_add(stat[i], &mi->vmstats[i]); 36828c2ecf20Sopenharmony_ci 36838c2ecf20Sopenharmony_ci for_each_node(node) { 36848c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 36858c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *pi; 36868c2ecf20Sopenharmony_ci 36878c2ecf20Sopenharmony_ci for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 36888c2ecf20Sopenharmony_ci stat[i] = 0; 36898c2ecf20Sopenharmony_ci 36908c2ecf20Sopenharmony_ci for_each_online_cpu(cpu) 36918c2ecf20Sopenharmony_ci for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 36928c2ecf20Sopenharmony_ci stat[i] += per_cpu( 36938c2ecf20Sopenharmony_ci pn->lruvec_stat_cpu->count[i], cpu); 36948c2ecf20Sopenharmony_ci 36958c2ecf20Sopenharmony_ci for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) 36968c2ecf20Sopenharmony_ci for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 36978c2ecf20Sopenharmony_ci atomic_long_add(stat[i], &pi->lruvec_stat[i]); 36988c2ecf20Sopenharmony_ci } 36998c2ecf20Sopenharmony_ci} 37008c2ecf20Sopenharmony_ci 37018c2ecf20Sopenharmony_cistatic void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) 37028c2ecf20Sopenharmony_ci{ 37038c2ecf20Sopenharmony_ci unsigned long events[NR_VM_EVENT_ITEMS]; 37048c2ecf20Sopenharmony_ci struct mem_cgroup *mi; 37058c2ecf20Sopenharmony_ci int cpu, i; 37068c2ecf20Sopenharmony_ci 37078c2ecf20Sopenharmony_ci for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 37088c2ecf20Sopenharmony_ci events[i] = 0; 37098c2ecf20Sopenharmony_ci 37108c2ecf20Sopenharmony_ci for_each_online_cpu(cpu) 37118c2ecf20Sopenharmony_ci for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 37128c2ecf20Sopenharmony_ci events[i] += per_cpu(memcg->vmstats_percpu->events[i], 37138c2ecf20Sopenharmony_ci cpu); 
37148c2ecf20Sopenharmony_ci 37158c2ecf20Sopenharmony_ci for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 37168c2ecf20Sopenharmony_ci for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 37178c2ecf20Sopenharmony_ci atomic_long_add(events[i], &mi->vmevents[i]); 37188c2ecf20Sopenharmony_ci} 37198c2ecf20Sopenharmony_ci 37208c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM 37218c2ecf20Sopenharmony_cistatic int memcg_online_kmem(struct mem_cgroup *memcg) 37228c2ecf20Sopenharmony_ci{ 37238c2ecf20Sopenharmony_ci struct obj_cgroup *objcg; 37248c2ecf20Sopenharmony_ci int memcg_id; 37258c2ecf20Sopenharmony_ci 37268c2ecf20Sopenharmony_ci if (cgroup_memory_nokmem) 37278c2ecf20Sopenharmony_ci return 0; 37288c2ecf20Sopenharmony_ci 37298c2ecf20Sopenharmony_ci BUG_ON(memcg->kmemcg_id >= 0); 37308c2ecf20Sopenharmony_ci BUG_ON(memcg->kmem_state); 37318c2ecf20Sopenharmony_ci 37328c2ecf20Sopenharmony_ci memcg_id = memcg_alloc_cache_id(); 37338c2ecf20Sopenharmony_ci if (memcg_id < 0) 37348c2ecf20Sopenharmony_ci return memcg_id; 37358c2ecf20Sopenharmony_ci 37368c2ecf20Sopenharmony_ci objcg = obj_cgroup_alloc(); 37378c2ecf20Sopenharmony_ci if (!objcg) { 37388c2ecf20Sopenharmony_ci memcg_free_cache_id(memcg_id); 37398c2ecf20Sopenharmony_ci return -ENOMEM; 37408c2ecf20Sopenharmony_ci } 37418c2ecf20Sopenharmony_ci objcg->memcg = memcg; 37428c2ecf20Sopenharmony_ci rcu_assign_pointer(memcg->objcg, objcg); 37438c2ecf20Sopenharmony_ci 37448c2ecf20Sopenharmony_ci static_branch_enable(&memcg_kmem_enabled_key); 37458c2ecf20Sopenharmony_ci 37468c2ecf20Sopenharmony_ci /* 37478c2ecf20Sopenharmony_ci * A memory cgroup is considered kmem-online as soon as it gets 37488c2ecf20Sopenharmony_ci * kmemcg_id. Setting the id after enabling static branching will 37498c2ecf20Sopenharmony_ci * guarantee no one starts accounting before all call sites are 37508c2ecf20Sopenharmony_ci * patched. 37518c2ecf20Sopenharmony_ci */ 37528c2ecf20Sopenharmony_ci memcg->kmemcg_id = memcg_id; 37538c2ecf20Sopenharmony_ci memcg->kmem_state = KMEM_ONLINE; 37548c2ecf20Sopenharmony_ci 37558c2ecf20Sopenharmony_ci return 0; 37568c2ecf20Sopenharmony_ci} 37578c2ecf20Sopenharmony_ci 37588c2ecf20Sopenharmony_cistatic void memcg_offline_kmem(struct mem_cgroup *memcg) 37598c2ecf20Sopenharmony_ci{ 37608c2ecf20Sopenharmony_ci struct cgroup_subsys_state *css; 37618c2ecf20Sopenharmony_ci struct mem_cgroup *parent, *child; 37628c2ecf20Sopenharmony_ci int kmemcg_id; 37638c2ecf20Sopenharmony_ci 37648c2ecf20Sopenharmony_ci if (memcg->kmem_state != KMEM_ONLINE) 37658c2ecf20Sopenharmony_ci return; 37668c2ecf20Sopenharmony_ci 37678c2ecf20Sopenharmony_ci memcg->kmem_state = KMEM_ALLOCATED; 37688c2ecf20Sopenharmony_ci 37698c2ecf20Sopenharmony_ci parent = parent_mem_cgroup(memcg); 37708c2ecf20Sopenharmony_ci if (!parent) 37718c2ecf20Sopenharmony_ci parent = root_mem_cgroup; 37728c2ecf20Sopenharmony_ci 37738c2ecf20Sopenharmony_ci memcg_reparent_objcgs(memcg, parent); 37748c2ecf20Sopenharmony_ci 37758c2ecf20Sopenharmony_ci kmemcg_id = memcg->kmemcg_id; 37768c2ecf20Sopenharmony_ci BUG_ON(kmemcg_id < 0); 37778c2ecf20Sopenharmony_ci 37788c2ecf20Sopenharmony_ci /* 37798c2ecf20Sopenharmony_ci * Change kmemcg_id of this cgroup and all its descendants to the 37808c2ecf20Sopenharmony_ci * parent's id, and then move all entries from this cgroup's list_lrus 37818c2ecf20Sopenharmony_ci * to ones of the parent. After we have finished, all list_lrus 37828c2ecf20Sopenharmony_ci * corresponding to this cgroup are guaranteed to remain empty. 
The 37838c2ecf20Sopenharmony_ci * ordering is imposed by list_lru_node->lock taken by 37848c2ecf20Sopenharmony_ci * memcg_drain_all_list_lrus(). 37858c2ecf20Sopenharmony_ci */ 37868c2ecf20Sopenharmony_ci rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ 37878c2ecf20Sopenharmony_ci css_for_each_descendant_pre(css, &memcg->css) { 37888c2ecf20Sopenharmony_ci child = mem_cgroup_from_css(css); 37898c2ecf20Sopenharmony_ci BUG_ON(child->kmemcg_id != kmemcg_id); 37908c2ecf20Sopenharmony_ci child->kmemcg_id = parent->kmemcg_id; 37918c2ecf20Sopenharmony_ci if (!memcg->use_hierarchy) 37928c2ecf20Sopenharmony_ci break; 37938c2ecf20Sopenharmony_ci } 37948c2ecf20Sopenharmony_ci rcu_read_unlock(); 37958c2ecf20Sopenharmony_ci 37968c2ecf20Sopenharmony_ci memcg_drain_all_list_lrus(kmemcg_id, parent); 37978c2ecf20Sopenharmony_ci 37988c2ecf20Sopenharmony_ci memcg_free_cache_id(kmemcg_id); 37998c2ecf20Sopenharmony_ci} 38008c2ecf20Sopenharmony_ci 38018c2ecf20Sopenharmony_cistatic void memcg_free_kmem(struct mem_cgroup *memcg) 38028c2ecf20Sopenharmony_ci{ 38038c2ecf20Sopenharmony_ci /* css_alloc() failed, offlining didn't happen */ 38048c2ecf20Sopenharmony_ci if (unlikely(memcg->kmem_state == KMEM_ONLINE)) 38058c2ecf20Sopenharmony_ci memcg_offline_kmem(memcg); 38068c2ecf20Sopenharmony_ci} 38078c2ecf20Sopenharmony_ci#else 38088c2ecf20Sopenharmony_cistatic int memcg_online_kmem(struct mem_cgroup *memcg) 38098c2ecf20Sopenharmony_ci{ 38108c2ecf20Sopenharmony_ci return 0; 38118c2ecf20Sopenharmony_ci} 38128c2ecf20Sopenharmony_cistatic void memcg_offline_kmem(struct mem_cgroup *memcg) 38138c2ecf20Sopenharmony_ci{ 38148c2ecf20Sopenharmony_ci} 38158c2ecf20Sopenharmony_cistatic void memcg_free_kmem(struct mem_cgroup *memcg) 38168c2ecf20Sopenharmony_ci{ 38178c2ecf20Sopenharmony_ci} 38188c2ecf20Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */ 38198c2ecf20Sopenharmony_ci 38208c2ecf20Sopenharmony_cistatic int memcg_update_kmem_max(struct mem_cgroup *memcg, 38218c2ecf20Sopenharmony_ci unsigned long max) 38228c2ecf20Sopenharmony_ci{ 38238c2ecf20Sopenharmony_ci int ret; 38248c2ecf20Sopenharmony_ci 38258c2ecf20Sopenharmony_ci mutex_lock(&memcg_max_mutex); 38268c2ecf20Sopenharmony_ci ret = page_counter_set_max(&memcg->kmem, max); 38278c2ecf20Sopenharmony_ci mutex_unlock(&memcg_max_mutex); 38288c2ecf20Sopenharmony_ci return ret; 38298c2ecf20Sopenharmony_ci} 38308c2ecf20Sopenharmony_ci 38318c2ecf20Sopenharmony_cistatic int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 38328c2ecf20Sopenharmony_ci{ 38338c2ecf20Sopenharmony_ci int ret; 38348c2ecf20Sopenharmony_ci 38358c2ecf20Sopenharmony_ci mutex_lock(&memcg_max_mutex); 38368c2ecf20Sopenharmony_ci 38378c2ecf20Sopenharmony_ci ret = page_counter_set_max(&memcg->tcpmem, max); 38388c2ecf20Sopenharmony_ci if (ret) 38398c2ecf20Sopenharmony_ci goto out; 38408c2ecf20Sopenharmony_ci 38418c2ecf20Sopenharmony_ci if (!memcg->tcpmem_active) { 38428c2ecf20Sopenharmony_ci /* 38438c2ecf20Sopenharmony_ci * The active flag needs to be written after the static_key 38448c2ecf20Sopenharmony_ci * update. This is what guarantees that the socket activation 38458c2ecf20Sopenharmony_ci * function is the last one to run. See mem_cgroup_sk_alloc() 38468c2ecf20Sopenharmony_ci * for details, and note that we don't mark any socket as 38478c2ecf20Sopenharmony_ci * belonging to this memcg until that flag is up. 
38488c2ecf20Sopenharmony_ci * 38498c2ecf20Sopenharmony_ci * We need to do this, because static_keys will span multiple 38508c2ecf20Sopenharmony_ci * sites, but we can't control their order. If we mark a socket 38518c2ecf20Sopenharmony_ci * as accounted, but the accounting functions are not patched in 38528c2ecf20Sopenharmony_ci * yet, we'll lose accounting. 38538c2ecf20Sopenharmony_ci * 38548c2ecf20Sopenharmony_ci * We never race with the readers in mem_cgroup_sk_alloc(), 38558c2ecf20Sopenharmony_ci * because when this value change, the code to process it is not 38568c2ecf20Sopenharmony_ci * patched in yet. 38578c2ecf20Sopenharmony_ci */ 38588c2ecf20Sopenharmony_ci static_branch_inc(&memcg_sockets_enabled_key); 38598c2ecf20Sopenharmony_ci memcg->tcpmem_active = true; 38608c2ecf20Sopenharmony_ci } 38618c2ecf20Sopenharmony_ciout: 38628c2ecf20Sopenharmony_ci mutex_unlock(&memcg_max_mutex); 38638c2ecf20Sopenharmony_ci return ret; 38648c2ecf20Sopenharmony_ci} 38658c2ecf20Sopenharmony_ci 38668c2ecf20Sopenharmony_ci/* 38678c2ecf20Sopenharmony_ci * The user of this function is... 38688c2ecf20Sopenharmony_ci * RES_LIMIT. 38698c2ecf20Sopenharmony_ci */ 38708c2ecf20Sopenharmony_cistatic ssize_t mem_cgroup_write(struct kernfs_open_file *of, 38718c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 38728c2ecf20Sopenharmony_ci{ 38738c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 38748c2ecf20Sopenharmony_ci unsigned long nr_pages; 38758c2ecf20Sopenharmony_ci int ret; 38768c2ecf20Sopenharmony_ci 38778c2ecf20Sopenharmony_ci buf = strstrip(buf); 38788c2ecf20Sopenharmony_ci ret = page_counter_memparse(buf, "-1", &nr_pages); 38798c2ecf20Sopenharmony_ci if (ret) 38808c2ecf20Sopenharmony_ci return ret; 38818c2ecf20Sopenharmony_ci 38828c2ecf20Sopenharmony_ci switch (MEMFILE_ATTR(of_cft(of)->private)) { 38838c2ecf20Sopenharmony_ci case RES_LIMIT: 38848c2ecf20Sopenharmony_ci if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 38858c2ecf20Sopenharmony_ci ret = -EINVAL; 38868c2ecf20Sopenharmony_ci break; 38878c2ecf20Sopenharmony_ci } 38888c2ecf20Sopenharmony_ci switch (MEMFILE_TYPE(of_cft(of)->private)) { 38898c2ecf20Sopenharmony_ci case _MEM: 38908c2ecf20Sopenharmony_ci ret = mem_cgroup_resize_max(memcg, nr_pages, false); 38918c2ecf20Sopenharmony_ci break; 38928c2ecf20Sopenharmony_ci case _MEMSWAP: 38938c2ecf20Sopenharmony_ci ret = mem_cgroup_resize_max(memcg, nr_pages, true); 38948c2ecf20Sopenharmony_ci break; 38958c2ecf20Sopenharmony_ci case _KMEM: 38968c2ecf20Sopenharmony_ci pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. 
" 38978c2ecf20Sopenharmony_ci "Please report your usecase to linux-mm@kvack.org if you " 38988c2ecf20Sopenharmony_ci "depend on this functionality.\n"); 38998c2ecf20Sopenharmony_ci ret = memcg_update_kmem_max(memcg, nr_pages); 39008c2ecf20Sopenharmony_ci break; 39018c2ecf20Sopenharmony_ci case _TCP: 39028c2ecf20Sopenharmony_ci ret = memcg_update_tcp_max(memcg, nr_pages); 39038c2ecf20Sopenharmony_ci break; 39048c2ecf20Sopenharmony_ci } 39058c2ecf20Sopenharmony_ci break; 39068c2ecf20Sopenharmony_ci case RES_SOFT_LIMIT: 39078c2ecf20Sopenharmony_ci memcg->soft_limit = nr_pages; 39088c2ecf20Sopenharmony_ci ret = 0; 39098c2ecf20Sopenharmony_ci break; 39108c2ecf20Sopenharmony_ci } 39118c2ecf20Sopenharmony_ci return ret ?: nbytes; 39128c2ecf20Sopenharmony_ci} 39138c2ecf20Sopenharmony_ci 39148c2ecf20Sopenharmony_cistatic ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 39158c2ecf20Sopenharmony_ci size_t nbytes, loff_t off) 39168c2ecf20Sopenharmony_ci{ 39178c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 39188c2ecf20Sopenharmony_ci struct page_counter *counter; 39198c2ecf20Sopenharmony_ci 39208c2ecf20Sopenharmony_ci switch (MEMFILE_TYPE(of_cft(of)->private)) { 39218c2ecf20Sopenharmony_ci case _MEM: 39228c2ecf20Sopenharmony_ci counter = &memcg->memory; 39238c2ecf20Sopenharmony_ci break; 39248c2ecf20Sopenharmony_ci case _MEMSWAP: 39258c2ecf20Sopenharmony_ci counter = &memcg->memsw; 39268c2ecf20Sopenharmony_ci break; 39278c2ecf20Sopenharmony_ci case _KMEM: 39288c2ecf20Sopenharmony_ci counter = &memcg->kmem; 39298c2ecf20Sopenharmony_ci break; 39308c2ecf20Sopenharmony_ci case _TCP: 39318c2ecf20Sopenharmony_ci counter = &memcg->tcpmem; 39328c2ecf20Sopenharmony_ci break; 39338c2ecf20Sopenharmony_ci default: 39348c2ecf20Sopenharmony_ci BUG(); 39358c2ecf20Sopenharmony_ci } 39368c2ecf20Sopenharmony_ci 39378c2ecf20Sopenharmony_ci switch (MEMFILE_ATTR(of_cft(of)->private)) { 39388c2ecf20Sopenharmony_ci case RES_MAX_USAGE: 39398c2ecf20Sopenharmony_ci page_counter_reset_watermark(counter); 39408c2ecf20Sopenharmony_ci break; 39418c2ecf20Sopenharmony_ci case RES_FAILCNT: 39428c2ecf20Sopenharmony_ci counter->failcnt = 0; 39438c2ecf20Sopenharmony_ci break; 39448c2ecf20Sopenharmony_ci default: 39458c2ecf20Sopenharmony_ci BUG(); 39468c2ecf20Sopenharmony_ci } 39478c2ecf20Sopenharmony_ci 39488c2ecf20Sopenharmony_ci return nbytes; 39498c2ecf20Sopenharmony_ci} 39508c2ecf20Sopenharmony_ci 39518c2ecf20Sopenharmony_cistatic u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 39528c2ecf20Sopenharmony_ci struct cftype *cft) 39538c2ecf20Sopenharmony_ci{ 39548c2ecf20Sopenharmony_ci return mem_cgroup_from_css(css)->move_charge_at_immigrate; 39558c2ecf20Sopenharmony_ci} 39568c2ecf20Sopenharmony_ci 39578c2ecf20Sopenharmony_ci#ifdef CONFIG_MMU 39588c2ecf20Sopenharmony_cistatic int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 39598c2ecf20Sopenharmony_ci struct cftype *cft, u64 val) 39608c2ecf20Sopenharmony_ci{ 39618c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 39628c2ecf20Sopenharmony_ci 39638c2ecf20Sopenharmony_ci pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. 
" 39648c2ecf20Sopenharmony_ci "Please report your usecase to linux-mm@kvack.org if you " 39658c2ecf20Sopenharmony_ci "depend on this functionality.\n"); 39668c2ecf20Sopenharmony_ci 39678c2ecf20Sopenharmony_ci if (val & ~MOVE_MASK) 39688c2ecf20Sopenharmony_ci return -EINVAL; 39698c2ecf20Sopenharmony_ci 39708c2ecf20Sopenharmony_ci /* 39718c2ecf20Sopenharmony_ci * No kind of locking is needed in here, because ->can_attach() will 39728c2ecf20Sopenharmony_ci * check this value once in the beginning of the process, and then carry 39738c2ecf20Sopenharmony_ci * on with stale data. This means that changes to this value will only 39748c2ecf20Sopenharmony_ci * affect task migrations starting after the change. 39758c2ecf20Sopenharmony_ci */ 39768c2ecf20Sopenharmony_ci memcg->move_charge_at_immigrate = val; 39778c2ecf20Sopenharmony_ci return 0; 39788c2ecf20Sopenharmony_ci} 39798c2ecf20Sopenharmony_ci#else 39808c2ecf20Sopenharmony_cistatic int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 39818c2ecf20Sopenharmony_ci struct cftype *cft, u64 val) 39828c2ecf20Sopenharmony_ci{ 39838c2ecf20Sopenharmony_ci return -ENOSYS; 39848c2ecf20Sopenharmony_ci} 39858c2ecf20Sopenharmony_ci#endif 39868c2ecf20Sopenharmony_ci 39878c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 39888c2ecf20Sopenharmony_ci 39898c2ecf20Sopenharmony_ci#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 39908c2ecf20Sopenharmony_ci#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 39918c2ecf20Sopenharmony_ci#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 39928c2ecf20Sopenharmony_ci 39938c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 39948c2ecf20Sopenharmony_ci int nid, unsigned int lru_mask, bool tree) 39958c2ecf20Sopenharmony_ci{ 39968c2ecf20Sopenharmony_ci struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 39978c2ecf20Sopenharmony_ci unsigned long nr = 0; 39988c2ecf20Sopenharmony_ci enum lru_list lru; 39998c2ecf20Sopenharmony_ci 40008c2ecf20Sopenharmony_ci VM_BUG_ON((unsigned)nid >= nr_node_ids); 40018c2ecf20Sopenharmony_ci 40028c2ecf20Sopenharmony_ci for_each_lru(lru) { 40038c2ecf20Sopenharmony_ci if (!(BIT(lru) & lru_mask)) 40048c2ecf20Sopenharmony_ci continue; 40058c2ecf20Sopenharmony_ci if (tree) 40068c2ecf20Sopenharmony_ci nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 40078c2ecf20Sopenharmony_ci else 40088c2ecf20Sopenharmony_ci nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 40098c2ecf20Sopenharmony_ci } 40108c2ecf20Sopenharmony_ci return nr; 40118c2ecf20Sopenharmony_ci} 40128c2ecf20Sopenharmony_ci 40138c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 40148c2ecf20Sopenharmony_ci unsigned int lru_mask, 40158c2ecf20Sopenharmony_ci bool tree) 40168c2ecf20Sopenharmony_ci{ 40178c2ecf20Sopenharmony_ci unsigned long nr = 0; 40188c2ecf20Sopenharmony_ci enum lru_list lru; 40198c2ecf20Sopenharmony_ci 40208c2ecf20Sopenharmony_ci for_each_lru(lru) { 40218c2ecf20Sopenharmony_ci if (!(BIT(lru) & lru_mask)) 40228c2ecf20Sopenharmony_ci continue; 40238c2ecf20Sopenharmony_ci if (tree) 40248c2ecf20Sopenharmony_ci nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 40258c2ecf20Sopenharmony_ci else 40268c2ecf20Sopenharmony_ci nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 40278c2ecf20Sopenharmony_ci } 40288c2ecf20Sopenharmony_ci return nr; 40298c2ecf20Sopenharmony_ci} 40308c2ecf20Sopenharmony_ci 40318c2ecf20Sopenharmony_cistatic int memcg_numa_stat_show(struct seq_file *m, void *v) 
40328c2ecf20Sopenharmony_ci{ 40338c2ecf20Sopenharmony_ci struct numa_stat { 40348c2ecf20Sopenharmony_ci const char *name; 40358c2ecf20Sopenharmony_ci unsigned int lru_mask; 40368c2ecf20Sopenharmony_ci }; 40378c2ecf20Sopenharmony_ci 40388c2ecf20Sopenharmony_ci static const struct numa_stat stats[] = { 40398c2ecf20Sopenharmony_ci { "total", LRU_ALL }, 40408c2ecf20Sopenharmony_ci { "file", LRU_ALL_FILE }, 40418c2ecf20Sopenharmony_ci { "anon", LRU_ALL_ANON }, 40428c2ecf20Sopenharmony_ci { "unevictable", BIT(LRU_UNEVICTABLE) }, 40438c2ecf20Sopenharmony_ci }; 40448c2ecf20Sopenharmony_ci const struct numa_stat *stat; 40458c2ecf20Sopenharmony_ci int nid; 40468c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 40478c2ecf20Sopenharmony_ci 40488c2ecf20Sopenharmony_ci for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 40498c2ecf20Sopenharmony_ci seq_printf(m, "%s=%lu", stat->name, 40508c2ecf20Sopenharmony_ci mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 40518c2ecf20Sopenharmony_ci false)); 40528c2ecf20Sopenharmony_ci for_each_node_state(nid, N_MEMORY) 40538c2ecf20Sopenharmony_ci seq_printf(m, " N%d=%lu", nid, 40548c2ecf20Sopenharmony_ci mem_cgroup_node_nr_lru_pages(memcg, nid, 40558c2ecf20Sopenharmony_ci stat->lru_mask, false)); 40568c2ecf20Sopenharmony_ci seq_putc(m, '\n'); 40578c2ecf20Sopenharmony_ci } 40588c2ecf20Sopenharmony_ci 40598c2ecf20Sopenharmony_ci for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 40608c2ecf20Sopenharmony_ci 40618c2ecf20Sopenharmony_ci seq_printf(m, "hierarchical_%s=%lu", stat->name, 40628c2ecf20Sopenharmony_ci mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 40638c2ecf20Sopenharmony_ci true)); 40648c2ecf20Sopenharmony_ci for_each_node_state(nid, N_MEMORY) 40658c2ecf20Sopenharmony_ci seq_printf(m, " N%d=%lu", nid, 40668c2ecf20Sopenharmony_ci mem_cgroup_node_nr_lru_pages(memcg, nid, 40678c2ecf20Sopenharmony_ci stat->lru_mask, true)); 40688c2ecf20Sopenharmony_ci seq_putc(m, '\n'); 40698c2ecf20Sopenharmony_ci } 40708c2ecf20Sopenharmony_ci 40718c2ecf20Sopenharmony_ci return 0; 40728c2ecf20Sopenharmony_ci} 40738c2ecf20Sopenharmony_ci#endif /* CONFIG_NUMA */ 40748c2ecf20Sopenharmony_ci 40758c2ecf20Sopenharmony_cistatic const unsigned int memcg1_stats[] = { 40768c2ecf20Sopenharmony_ci NR_FILE_PAGES, 40778c2ecf20Sopenharmony_ci NR_ANON_MAPPED, 40788c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 40798c2ecf20Sopenharmony_ci NR_ANON_THPS, 40808c2ecf20Sopenharmony_ci#endif 40818c2ecf20Sopenharmony_ci NR_SHMEM, 40828c2ecf20Sopenharmony_ci NR_FILE_MAPPED, 40838c2ecf20Sopenharmony_ci NR_FILE_DIRTY, 40848c2ecf20Sopenharmony_ci NR_WRITEBACK, 40858c2ecf20Sopenharmony_ci MEMCG_SWAP, 40868c2ecf20Sopenharmony_ci}; 40878c2ecf20Sopenharmony_ci 40888c2ecf20Sopenharmony_cistatic const char *const memcg1_stat_names[] = { 40898c2ecf20Sopenharmony_ci "cache", 40908c2ecf20Sopenharmony_ci "rss", 40918c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 40928c2ecf20Sopenharmony_ci "rss_huge", 40938c2ecf20Sopenharmony_ci#endif 40948c2ecf20Sopenharmony_ci "shmem", 40958c2ecf20Sopenharmony_ci "mapped_file", 40968c2ecf20Sopenharmony_ci "dirty", 40978c2ecf20Sopenharmony_ci "writeback", 40988c2ecf20Sopenharmony_ci "swap", 40998c2ecf20Sopenharmony_ci}; 41008c2ecf20Sopenharmony_ci 41018c2ecf20Sopenharmony_ci/* Universal VM events cgroup1 shows, original sort order */ 41028c2ecf20Sopenharmony_cistatic const unsigned int memcg1_events[] = { 41038c2ecf20Sopenharmony_ci PGPGIN, 41048c2ecf20Sopenharmony_ci PGPGOUT, 41058c2ecf20Sopenharmony_ci PGFAULT, 
41068c2ecf20Sopenharmony_ci PGMAJFAULT, 41078c2ecf20Sopenharmony_ci}; 41088c2ecf20Sopenharmony_ci 41098c2ecf20Sopenharmony_cistatic int memcg_stat_show(struct seq_file *m, void *v) 41108c2ecf20Sopenharmony_ci{ 41118c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 41128c2ecf20Sopenharmony_ci unsigned long memory, memsw; 41138c2ecf20Sopenharmony_ci struct mem_cgroup *mi; 41148c2ecf20Sopenharmony_ci unsigned int i; 41158c2ecf20Sopenharmony_ci 41168c2ecf20Sopenharmony_ci BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 41178c2ecf20Sopenharmony_ci 41188c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 41198c2ecf20Sopenharmony_ci unsigned long nr; 41208c2ecf20Sopenharmony_ci 41218c2ecf20Sopenharmony_ci if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 41228c2ecf20Sopenharmony_ci continue; 41238c2ecf20Sopenharmony_ci nr = memcg_page_state_local(memcg, memcg1_stats[i]); 41248c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 41258c2ecf20Sopenharmony_ci if (memcg1_stats[i] == NR_ANON_THPS) 41268c2ecf20Sopenharmony_ci nr *= HPAGE_PMD_NR; 41278c2ecf20Sopenharmony_ci#endif 41288c2ecf20Sopenharmony_ci seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); 41298c2ecf20Sopenharmony_ci } 41308c2ecf20Sopenharmony_ci 41318c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 41328c2ecf20Sopenharmony_ci seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), 41338c2ecf20Sopenharmony_ci memcg_events_local(memcg, memcg1_events[i])); 41348c2ecf20Sopenharmony_ci 41358c2ecf20Sopenharmony_ci for (i = 0; i < NR_LRU_LISTS; i++) { 41368c2ecf20Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE 41378c2ecf20Sopenharmony_ci if (i == LRU_INACTIVE_PURGEABLE || i == LRU_ACTIVE_PURGEABLE) 41388c2ecf20Sopenharmony_ci continue; 41398c2ecf20Sopenharmony_ci#endif 41408c2ecf20Sopenharmony_ci seq_printf(m, "%s %lu\n", lru_list_name(i), 41418c2ecf20Sopenharmony_ci memcg_page_state_local(memcg, NR_LRU_BASE + i) * 41428c2ecf20Sopenharmony_ci PAGE_SIZE); 41438c2ecf20Sopenharmony_ci } 41448c2ecf20Sopenharmony_ci 41458c2ecf20Sopenharmony_ci /* Hierarchical information */ 41468c2ecf20Sopenharmony_ci memory = memsw = PAGE_COUNTER_MAX; 41478c2ecf20Sopenharmony_ci for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 41488c2ecf20Sopenharmony_ci memory = min(memory, READ_ONCE(mi->memory.max)); 41498c2ecf20Sopenharmony_ci memsw = min(memsw, READ_ONCE(mi->memsw.max)); 41508c2ecf20Sopenharmony_ci } 41518c2ecf20Sopenharmony_ci seq_printf(m, "hierarchical_memory_limit %llu\n", 41528c2ecf20Sopenharmony_ci (u64)memory * PAGE_SIZE); 41538c2ecf20Sopenharmony_ci if (do_memsw_account()) 41548c2ecf20Sopenharmony_ci seq_printf(m, "hierarchical_memsw_limit %llu\n", 41558c2ecf20Sopenharmony_ci (u64)memsw * PAGE_SIZE); 41568c2ecf20Sopenharmony_ci 41578c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 41588c2ecf20Sopenharmony_ci unsigned long nr; 41598c2ecf20Sopenharmony_ci 41608c2ecf20Sopenharmony_ci if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 41618c2ecf20Sopenharmony_ci continue; 41628c2ecf20Sopenharmony_ci nr = memcg_page_state(memcg, memcg1_stats[i]); 41638c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 41648c2ecf20Sopenharmony_ci if (memcg1_stats[i] == NR_ANON_THPS) 41658c2ecf20Sopenharmony_ci nr *= HPAGE_PMD_NR; 41668c2ecf20Sopenharmony_ci#endif 41678c2ecf20Sopenharmony_ci seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 41688c2ecf20Sopenharmony_ci (u64)nr * PAGE_SIZE); 41698c2ecf20Sopenharmony_ci } 
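 /*
  * (Editorial illustration, not from the original source: the lines
  * emitted above and below are what userspace sees when reading the
  * cgroup v1 memory.stat file, e.g.
  *     cache 12288
  *     rss 8192
  *     ...
  *     hierarchical_memory_limit 268435456
  *     total_cache 12288
  *     total_pgfault 42
  * The values are invented; only the line format mirrors the
  * seq_printf() calls in this function.)
  */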
41708c2ecf20Sopenharmony_ci 41718c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 41728c2ecf20Sopenharmony_ci seq_printf(m, "total_%s %llu\n", 41738c2ecf20Sopenharmony_ci vm_event_name(memcg1_events[i]), 41748c2ecf20Sopenharmony_ci (u64)memcg_events(memcg, memcg1_events[i])); 41758c2ecf20Sopenharmony_ci 41768c2ecf20Sopenharmony_ci for (i = 0; i < NR_LRU_LISTS; i++) { 41778c2ecf20Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE 41788c2ecf20Sopenharmony_ci if (i == LRU_INACTIVE_PURGEABLE || i == LRU_ACTIVE_PURGEABLE) 41798c2ecf20Sopenharmony_ci continue; 41808c2ecf20Sopenharmony_ci#endif 41818c2ecf20Sopenharmony_ci seq_printf(m, "total_%s %llu\n", lru_list_name(i), 41828c2ecf20Sopenharmony_ci (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 41838c2ecf20Sopenharmony_ci PAGE_SIZE); 41848c2ecf20Sopenharmony_ci } 41858c2ecf20Sopenharmony_ci 41868c2ecf20Sopenharmony_ci#ifdef CONFIG_DEBUG_VM 41878c2ecf20Sopenharmony_ci { 41888c2ecf20Sopenharmony_ci pg_data_t *pgdat; 41898c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *mz; 41908c2ecf20Sopenharmony_ci unsigned long anon_cost = 0; 41918c2ecf20Sopenharmony_ci unsigned long file_cost = 0; 41928c2ecf20Sopenharmony_ci 41938c2ecf20Sopenharmony_ci for_each_online_pgdat(pgdat) { 41948c2ecf20Sopenharmony_ci mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); 41958c2ecf20Sopenharmony_ci 41968c2ecf20Sopenharmony_ci anon_cost += mz->lruvec.anon_cost; 41978c2ecf20Sopenharmony_ci file_cost += mz->lruvec.file_cost; 41988c2ecf20Sopenharmony_ci } 41998c2ecf20Sopenharmony_ci seq_printf(m, "anon_cost %lu\n", anon_cost); 42008c2ecf20Sopenharmony_ci seq_printf(m, "file_cost %lu\n", file_cost); 42018c2ecf20Sopenharmony_ci } 42028c2ecf20Sopenharmony_ci#endif 42038c2ecf20Sopenharmony_ci 42048c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_DEBUG 42058c2ecf20Sopenharmony_ci memcg_eswap_info_show(m); 42068c2ecf20Sopenharmony_ci#endif 42078c2ecf20Sopenharmony_ci return 0; 42088c2ecf20Sopenharmony_ci} 42098c2ecf20Sopenharmony_ci 42108c2ecf20Sopenharmony_cistatic u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 42118c2ecf20Sopenharmony_ci struct cftype *cft) 42128c2ecf20Sopenharmony_ci{ 42138c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 42148c2ecf20Sopenharmony_ci 42158c2ecf20Sopenharmony_ci return mem_cgroup_swappiness(memcg); 42168c2ecf20Sopenharmony_ci} 42178c2ecf20Sopenharmony_ci 42188c2ecf20Sopenharmony_cistatic int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 42198c2ecf20Sopenharmony_ci struct cftype *cft, u64 val) 42208c2ecf20Sopenharmony_ci{ 42218c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 42228c2ecf20Sopenharmony_ci 42238c2ecf20Sopenharmony_ci if (val > 200) 42248c2ecf20Sopenharmony_ci return -EINVAL; 42258c2ecf20Sopenharmony_ci 42268c2ecf20Sopenharmony_ci if (css->parent) 42278c2ecf20Sopenharmony_ci memcg->swappiness = val; 42288c2ecf20Sopenharmony_ci else 42298c2ecf20Sopenharmony_ci vm_swappiness = val; 42308c2ecf20Sopenharmony_ci 42318c2ecf20Sopenharmony_ci return 0; 42328c2ecf20Sopenharmony_ci} 42338c2ecf20Sopenharmony_ci 42348c2ecf20Sopenharmony_cistatic void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 42358c2ecf20Sopenharmony_ci{ 42368c2ecf20Sopenharmony_ci struct mem_cgroup_threshold_ary *t; 42378c2ecf20Sopenharmony_ci unsigned long usage; 42388c2ecf20Sopenharmony_ci int i; 42398c2ecf20Sopenharmony_ci 42408c2ecf20Sopenharmony_ci rcu_read_lock(); 42418c2ecf20Sopenharmony_ci if (!swap) 42428c2ecf20Sopenharmony_ci t = 
rcu_dereference(memcg->thresholds.primary); 42438c2ecf20Sopenharmony_ci else 42448c2ecf20Sopenharmony_ci t = rcu_dereference(memcg->memsw_thresholds.primary); 42458c2ecf20Sopenharmony_ci 42468c2ecf20Sopenharmony_ci if (!t) 42478c2ecf20Sopenharmony_ci goto unlock; 42488c2ecf20Sopenharmony_ci 42498c2ecf20Sopenharmony_ci usage = mem_cgroup_usage(memcg, swap); 42508c2ecf20Sopenharmony_ci 42518c2ecf20Sopenharmony_ci /* 42528c2ecf20Sopenharmony_ci * current_threshold points to threshold just below or equal to usage. 42538c2ecf20Sopenharmony_ci * If it's not true, a threshold was crossed after last 42548c2ecf20Sopenharmony_ci * call of __mem_cgroup_threshold(). 42558c2ecf20Sopenharmony_ci */ 42568c2ecf20Sopenharmony_ci i = t->current_threshold; 42578c2ecf20Sopenharmony_ci 42588c2ecf20Sopenharmony_ci /* 42598c2ecf20Sopenharmony_ci * Iterate backward over array of thresholds starting from 42608c2ecf20Sopenharmony_ci * current_threshold and check if a threshold is crossed. 42618c2ecf20Sopenharmony_ci * If none of thresholds below usage is crossed, we read 42628c2ecf20Sopenharmony_ci * only one element of the array here. 42638c2ecf20Sopenharmony_ci */ 42648c2ecf20Sopenharmony_ci for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 42658c2ecf20Sopenharmony_ci eventfd_signal(t->entries[i].eventfd, 1); 42668c2ecf20Sopenharmony_ci 42678c2ecf20Sopenharmony_ci /* i = current_threshold + 1 */ 42688c2ecf20Sopenharmony_ci i++; 42698c2ecf20Sopenharmony_ci 42708c2ecf20Sopenharmony_ci /* 42718c2ecf20Sopenharmony_ci * Iterate forward over array of thresholds starting from 42728c2ecf20Sopenharmony_ci * current_threshold+1 and check if a threshold is crossed. 42738c2ecf20Sopenharmony_ci * If none of thresholds above usage is crossed, we read 42748c2ecf20Sopenharmony_ci * only one element of the array here. 
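 *
 * (Editorial example, not part of the original comment: with sorted
 *  thresholds {4M, 8M, 16M} and current_threshold sitting at 4M, a
 *  usage that has risen to 12M makes the backward walk above signal
 *  nothing, while this forward walk signals the 8M eventfd and stops
 *  at 16M; the update below then leaves current_threshold pointing at
 *  8M. The sizes are arbitrary illustration values.)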
42758c2ecf20Sopenharmony_ci */ 42768c2ecf20Sopenharmony_ci for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 42778c2ecf20Sopenharmony_ci eventfd_signal(t->entries[i].eventfd, 1); 42788c2ecf20Sopenharmony_ci 42798c2ecf20Sopenharmony_ci /* Update current_threshold */ 42808c2ecf20Sopenharmony_ci t->current_threshold = i - 1; 42818c2ecf20Sopenharmony_ciunlock: 42828c2ecf20Sopenharmony_ci rcu_read_unlock(); 42838c2ecf20Sopenharmony_ci} 42848c2ecf20Sopenharmony_ci 42858c2ecf20Sopenharmony_cistatic void mem_cgroup_threshold(struct mem_cgroup *memcg) 42868c2ecf20Sopenharmony_ci{ 42878c2ecf20Sopenharmony_ci while (memcg) { 42888c2ecf20Sopenharmony_ci __mem_cgroup_threshold(memcg, false); 42898c2ecf20Sopenharmony_ci if (do_memsw_account()) 42908c2ecf20Sopenharmony_ci __mem_cgroup_threshold(memcg, true); 42918c2ecf20Sopenharmony_ci 42928c2ecf20Sopenharmony_ci memcg = parent_mem_cgroup(memcg); 42938c2ecf20Sopenharmony_ci } 42948c2ecf20Sopenharmony_ci} 42958c2ecf20Sopenharmony_ci 42968c2ecf20Sopenharmony_cistatic int compare_thresholds(const void *a, const void *b) 42978c2ecf20Sopenharmony_ci{ 42988c2ecf20Sopenharmony_ci const struct mem_cgroup_threshold *_a = a; 42998c2ecf20Sopenharmony_ci const struct mem_cgroup_threshold *_b = b; 43008c2ecf20Sopenharmony_ci 43018c2ecf20Sopenharmony_ci if (_a->threshold > _b->threshold) 43028c2ecf20Sopenharmony_ci return 1; 43038c2ecf20Sopenharmony_ci 43048c2ecf20Sopenharmony_ci if (_a->threshold < _b->threshold) 43058c2ecf20Sopenharmony_ci return -1; 43068c2ecf20Sopenharmony_ci 43078c2ecf20Sopenharmony_ci return 0; 43088c2ecf20Sopenharmony_ci} 43098c2ecf20Sopenharmony_ci 43108c2ecf20Sopenharmony_cistatic int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 43118c2ecf20Sopenharmony_ci{ 43128c2ecf20Sopenharmony_ci struct mem_cgroup_eventfd_list *ev; 43138c2ecf20Sopenharmony_ci 43148c2ecf20Sopenharmony_ci spin_lock(&memcg_oom_lock); 43158c2ecf20Sopenharmony_ci 43168c2ecf20Sopenharmony_ci list_for_each_entry(ev, &memcg->oom_notify, list) 43178c2ecf20Sopenharmony_ci eventfd_signal(ev->eventfd, 1); 43188c2ecf20Sopenharmony_ci 43198c2ecf20Sopenharmony_ci spin_unlock(&memcg_oom_lock); 43208c2ecf20Sopenharmony_ci return 0; 43218c2ecf20Sopenharmony_ci} 43228c2ecf20Sopenharmony_ci 43238c2ecf20Sopenharmony_cistatic void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 43248c2ecf20Sopenharmony_ci{ 43258c2ecf20Sopenharmony_ci struct mem_cgroup *iter; 43268c2ecf20Sopenharmony_ci 43278c2ecf20Sopenharmony_ci for_each_mem_cgroup_tree(iter, memcg) 43288c2ecf20Sopenharmony_ci mem_cgroup_oom_notify_cb(iter); 43298c2ecf20Sopenharmony_ci} 43308c2ecf20Sopenharmony_ci 43318c2ecf20Sopenharmony_cistatic int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 43328c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args, enum res_type type) 43338c2ecf20Sopenharmony_ci{ 43348c2ecf20Sopenharmony_ci struct mem_cgroup_thresholds *thresholds; 43358c2ecf20Sopenharmony_ci struct mem_cgroup_threshold_ary *new; 43368c2ecf20Sopenharmony_ci unsigned long threshold; 43378c2ecf20Sopenharmony_ci unsigned long usage; 43388c2ecf20Sopenharmony_ci int i, size, ret; 43398c2ecf20Sopenharmony_ci 43408c2ecf20Sopenharmony_ci ret = page_counter_memparse(args, "-1", &threshold); 43418c2ecf20Sopenharmony_ci if (ret) 43428c2ecf20Sopenharmony_ci return ret; 43438c2ecf20Sopenharmony_ci 43448c2ecf20Sopenharmony_ci mutex_lock(&memcg->thresholds_lock); 43458c2ecf20Sopenharmony_ci 43468c2ecf20Sopenharmony_ci if (type == _MEM) { 43478c2ecf20Sopenharmony_ci thresholds = 
&memcg->thresholds; 43488c2ecf20Sopenharmony_ci usage = mem_cgroup_usage(memcg, false); 43498c2ecf20Sopenharmony_ci } else if (type == _MEMSWAP) { 43508c2ecf20Sopenharmony_ci thresholds = &memcg->memsw_thresholds; 43518c2ecf20Sopenharmony_ci usage = mem_cgroup_usage(memcg, true); 43528c2ecf20Sopenharmony_ci } else 43538c2ecf20Sopenharmony_ci BUG(); 43548c2ecf20Sopenharmony_ci 43558c2ecf20Sopenharmony_ci /* Check if a threshold crossed before adding a new one */ 43568c2ecf20Sopenharmony_ci if (thresholds->primary) 43578c2ecf20Sopenharmony_ci __mem_cgroup_threshold(memcg, type == _MEMSWAP); 43588c2ecf20Sopenharmony_ci 43598c2ecf20Sopenharmony_ci size = thresholds->primary ? thresholds->primary->size + 1 : 1; 43608c2ecf20Sopenharmony_ci 43618c2ecf20Sopenharmony_ci /* Allocate memory for new array of thresholds */ 43628c2ecf20Sopenharmony_ci new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 43638c2ecf20Sopenharmony_ci if (!new) { 43648c2ecf20Sopenharmony_ci ret = -ENOMEM; 43658c2ecf20Sopenharmony_ci goto unlock; 43668c2ecf20Sopenharmony_ci } 43678c2ecf20Sopenharmony_ci new->size = size; 43688c2ecf20Sopenharmony_ci 43698c2ecf20Sopenharmony_ci /* Copy thresholds (if any) to new array */ 43708c2ecf20Sopenharmony_ci if (thresholds->primary) 43718c2ecf20Sopenharmony_ci memcpy(new->entries, thresholds->primary->entries, 43728c2ecf20Sopenharmony_ci flex_array_size(new, entries, size - 1)); 43738c2ecf20Sopenharmony_ci 43748c2ecf20Sopenharmony_ci /* Add new threshold */ 43758c2ecf20Sopenharmony_ci new->entries[size - 1].eventfd = eventfd; 43768c2ecf20Sopenharmony_ci new->entries[size - 1].threshold = threshold; 43778c2ecf20Sopenharmony_ci 43788c2ecf20Sopenharmony_ci /* Sort thresholds. Registering of new threshold isn't time-critical */ 43798c2ecf20Sopenharmony_ci sort(new->entries, size, sizeof(*new->entries), 43808c2ecf20Sopenharmony_ci compare_thresholds, NULL); 43818c2ecf20Sopenharmony_ci 43828c2ecf20Sopenharmony_ci /* Find current threshold */ 43838c2ecf20Sopenharmony_ci new->current_threshold = -1; 43848c2ecf20Sopenharmony_ci for (i = 0; i < size; i++) { 43858c2ecf20Sopenharmony_ci if (new->entries[i].threshold <= usage) { 43868c2ecf20Sopenharmony_ci /* 43878c2ecf20Sopenharmony_ci * new->current_threshold will not be used until 43888c2ecf20Sopenharmony_ci * rcu_assign_pointer(), so it's safe to increment 43898c2ecf20Sopenharmony_ci * it here. 
43908c2ecf20Sopenharmony_ci */ 43918c2ecf20Sopenharmony_ci ++new->current_threshold; 43928c2ecf20Sopenharmony_ci } else 43938c2ecf20Sopenharmony_ci break; 43948c2ecf20Sopenharmony_ci } 43958c2ecf20Sopenharmony_ci 43968c2ecf20Sopenharmony_ci /* Free old spare buffer and save old primary buffer as spare */ 43978c2ecf20Sopenharmony_ci kfree(thresholds->spare); 43988c2ecf20Sopenharmony_ci thresholds->spare = thresholds->primary; 43998c2ecf20Sopenharmony_ci 44008c2ecf20Sopenharmony_ci rcu_assign_pointer(thresholds->primary, new); 44018c2ecf20Sopenharmony_ci 44028c2ecf20Sopenharmony_ci /* To be sure that nobody uses thresholds */ 44038c2ecf20Sopenharmony_ci synchronize_rcu(); 44048c2ecf20Sopenharmony_ci 44058c2ecf20Sopenharmony_ciunlock: 44068c2ecf20Sopenharmony_ci mutex_unlock(&memcg->thresholds_lock); 44078c2ecf20Sopenharmony_ci 44088c2ecf20Sopenharmony_ci return ret; 44098c2ecf20Sopenharmony_ci} 44108c2ecf20Sopenharmony_ci 44118c2ecf20Sopenharmony_cistatic int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 44128c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args) 44138c2ecf20Sopenharmony_ci{ 44148c2ecf20Sopenharmony_ci return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 44158c2ecf20Sopenharmony_ci} 44168c2ecf20Sopenharmony_ci 44178c2ecf20Sopenharmony_cistatic int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 44188c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args) 44198c2ecf20Sopenharmony_ci{ 44208c2ecf20Sopenharmony_ci return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 44218c2ecf20Sopenharmony_ci} 44228c2ecf20Sopenharmony_ci 44238c2ecf20Sopenharmony_cistatic void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 44248c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd, enum res_type type) 44258c2ecf20Sopenharmony_ci{ 44268c2ecf20Sopenharmony_ci struct mem_cgroup_thresholds *thresholds; 44278c2ecf20Sopenharmony_ci struct mem_cgroup_threshold_ary *new; 44288c2ecf20Sopenharmony_ci unsigned long usage; 44298c2ecf20Sopenharmony_ci int i, j, size, entries; 44308c2ecf20Sopenharmony_ci 44318c2ecf20Sopenharmony_ci mutex_lock(&memcg->thresholds_lock); 44328c2ecf20Sopenharmony_ci 44338c2ecf20Sopenharmony_ci if (type == _MEM) { 44348c2ecf20Sopenharmony_ci thresholds = &memcg->thresholds; 44358c2ecf20Sopenharmony_ci usage = mem_cgroup_usage(memcg, false); 44368c2ecf20Sopenharmony_ci } else if (type == _MEMSWAP) { 44378c2ecf20Sopenharmony_ci thresholds = &memcg->memsw_thresholds; 44388c2ecf20Sopenharmony_ci usage = mem_cgroup_usage(memcg, true); 44398c2ecf20Sopenharmony_ci } else 44408c2ecf20Sopenharmony_ci BUG(); 44418c2ecf20Sopenharmony_ci 44428c2ecf20Sopenharmony_ci if (!thresholds->primary) 44438c2ecf20Sopenharmony_ci goto unlock; 44448c2ecf20Sopenharmony_ci 44458c2ecf20Sopenharmony_ci /* Check if a threshold crossed before removing */ 44468c2ecf20Sopenharmony_ci __mem_cgroup_threshold(memcg, type == _MEMSWAP); 44478c2ecf20Sopenharmony_ci 44488c2ecf20Sopenharmony_ci /* Calculate new number of threshold */ 44498c2ecf20Sopenharmony_ci size = entries = 0; 44508c2ecf20Sopenharmony_ci for (i = 0; i < thresholds->primary->size; i++) { 44518c2ecf20Sopenharmony_ci if (thresholds->primary->entries[i].eventfd != eventfd) 44528c2ecf20Sopenharmony_ci size++; 44538c2ecf20Sopenharmony_ci else 44548c2ecf20Sopenharmony_ci entries++; 44558c2ecf20Sopenharmony_ci } 44568c2ecf20Sopenharmony_ci 44578c2ecf20Sopenharmony_ci new = thresholds->spare; 44588c2ecf20Sopenharmony_ci 44598c2ecf20Sopenharmony_ci /* If no 
items related to eventfd have been cleared, nothing to do */ 44608c2ecf20Sopenharmony_ci if (!entries) 44618c2ecf20Sopenharmony_ci goto unlock; 44628c2ecf20Sopenharmony_ci 44638c2ecf20Sopenharmony_ci /* Set thresholds array to NULL if we don't have thresholds */ 44648c2ecf20Sopenharmony_ci if (!size) { 44658c2ecf20Sopenharmony_ci kfree(new); 44668c2ecf20Sopenharmony_ci new = NULL; 44678c2ecf20Sopenharmony_ci goto swap_buffers; 44688c2ecf20Sopenharmony_ci } 44698c2ecf20Sopenharmony_ci 44708c2ecf20Sopenharmony_ci new->size = size; 44718c2ecf20Sopenharmony_ci 44728c2ecf20Sopenharmony_ci /* Copy thresholds and find current threshold */ 44738c2ecf20Sopenharmony_ci new->current_threshold = -1; 44748c2ecf20Sopenharmony_ci for (i = 0, j = 0; i < thresholds->primary->size; i++) { 44758c2ecf20Sopenharmony_ci if (thresholds->primary->entries[i].eventfd == eventfd) 44768c2ecf20Sopenharmony_ci continue; 44778c2ecf20Sopenharmony_ci 44788c2ecf20Sopenharmony_ci new->entries[j] = thresholds->primary->entries[i]; 44798c2ecf20Sopenharmony_ci if (new->entries[j].threshold <= usage) { 44808c2ecf20Sopenharmony_ci /* 44818c2ecf20Sopenharmony_ci * new->current_threshold will not be used 44828c2ecf20Sopenharmony_ci * until rcu_assign_pointer(), so it's safe to increment 44838c2ecf20Sopenharmony_ci * it here. 44848c2ecf20Sopenharmony_ci */ 44858c2ecf20Sopenharmony_ci ++new->current_threshold; 44868c2ecf20Sopenharmony_ci } 44878c2ecf20Sopenharmony_ci j++; 44888c2ecf20Sopenharmony_ci } 44898c2ecf20Sopenharmony_ci 44908c2ecf20Sopenharmony_ciswap_buffers: 44918c2ecf20Sopenharmony_ci /* Swap primary and spare array */ 44928c2ecf20Sopenharmony_ci thresholds->spare = thresholds->primary; 44938c2ecf20Sopenharmony_ci 44948c2ecf20Sopenharmony_ci rcu_assign_pointer(thresholds->primary, new); 44958c2ecf20Sopenharmony_ci 44968c2ecf20Sopenharmony_ci /* To be sure that nobody uses thresholds */ 44978c2ecf20Sopenharmony_ci synchronize_rcu(); 44988c2ecf20Sopenharmony_ci 44998c2ecf20Sopenharmony_ci /* If all events are unregistered, free the spare array */ 45008c2ecf20Sopenharmony_ci if (!new) { 45018c2ecf20Sopenharmony_ci kfree(thresholds->spare); 45028c2ecf20Sopenharmony_ci thresholds->spare = NULL; 45038c2ecf20Sopenharmony_ci } 45048c2ecf20Sopenharmony_ciunlock: 45058c2ecf20Sopenharmony_ci mutex_unlock(&memcg->thresholds_lock); 45068c2ecf20Sopenharmony_ci} 45078c2ecf20Sopenharmony_ci 45088c2ecf20Sopenharmony_cistatic void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 45098c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd) 45108c2ecf20Sopenharmony_ci{ 45118c2ecf20Sopenharmony_ci return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 45128c2ecf20Sopenharmony_ci} 45138c2ecf20Sopenharmony_ci 45148c2ecf20Sopenharmony_cistatic void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 45158c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd) 45168c2ecf20Sopenharmony_ci{ 45178c2ecf20Sopenharmony_ci return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 45188c2ecf20Sopenharmony_ci} 45198c2ecf20Sopenharmony_ci 45208c2ecf20Sopenharmony_cistatic int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 45218c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args) 45228c2ecf20Sopenharmony_ci{ 45238c2ecf20Sopenharmony_ci struct mem_cgroup_eventfd_list *event; 45248c2ecf20Sopenharmony_ci 45258c2ecf20Sopenharmony_ci event = kmalloc(sizeof(*event), GFP_KERNEL); 45268c2ecf20Sopenharmony_ci if (!event) 45278c2ecf20Sopenharmony_ci return -ENOMEM; 45288c2ecf20Sopenharmony_ci 
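        /*
         * For context, a sketch of how userspace typically reaches this
         * handler (illustrative only; the mount point, group name and fd
         * variables are assumptions, not taken from this file):
         *
         *	int efd = eventfd(0, 0);
         *	int cfd = open("/sys/fs/cgroup/memory/grp/memory.oom_control", O_RDONLY);
         *	int ctl = open("/sys/fs/cgroup/memory/grp/cgroup.event_control", O_WRONLY);
         *	char buf[32];
         *	uint64_t cnt;
         *
         *	snprintf(buf, sizeof(buf), "%d %d", efd, cfd);
         *	write(ctl, buf, strlen(buf));	// routed here via memcg_write_event_control()
         *	read(efd, &cnt, sizeof(cnt));	// blocks until an OOM event is signalled
         */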
45298c2ecf20Sopenharmony_ci spin_lock(&memcg_oom_lock); 45308c2ecf20Sopenharmony_ci 45318c2ecf20Sopenharmony_ci event->eventfd = eventfd; 45328c2ecf20Sopenharmony_ci list_add(&event->list, &memcg->oom_notify); 45338c2ecf20Sopenharmony_ci 45348c2ecf20Sopenharmony_ci /* already in OOM ? */ 45358c2ecf20Sopenharmony_ci if (memcg->under_oom) 45368c2ecf20Sopenharmony_ci eventfd_signal(eventfd, 1); 45378c2ecf20Sopenharmony_ci spin_unlock(&memcg_oom_lock); 45388c2ecf20Sopenharmony_ci 45398c2ecf20Sopenharmony_ci return 0; 45408c2ecf20Sopenharmony_ci} 45418c2ecf20Sopenharmony_ci 45428c2ecf20Sopenharmony_cistatic void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 45438c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd) 45448c2ecf20Sopenharmony_ci{ 45458c2ecf20Sopenharmony_ci struct mem_cgroup_eventfd_list *ev, *tmp; 45468c2ecf20Sopenharmony_ci 45478c2ecf20Sopenharmony_ci spin_lock(&memcg_oom_lock); 45488c2ecf20Sopenharmony_ci 45498c2ecf20Sopenharmony_ci list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 45508c2ecf20Sopenharmony_ci if (ev->eventfd == eventfd) { 45518c2ecf20Sopenharmony_ci list_del(&ev->list); 45528c2ecf20Sopenharmony_ci kfree(ev); 45538c2ecf20Sopenharmony_ci } 45548c2ecf20Sopenharmony_ci } 45558c2ecf20Sopenharmony_ci 45568c2ecf20Sopenharmony_ci spin_unlock(&memcg_oom_lock); 45578c2ecf20Sopenharmony_ci} 45588c2ecf20Sopenharmony_ci 45598c2ecf20Sopenharmony_cistatic int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 45608c2ecf20Sopenharmony_ci{ 45618c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 45628c2ecf20Sopenharmony_ci 45638c2ecf20Sopenharmony_ci seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 45648c2ecf20Sopenharmony_ci seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 45658c2ecf20Sopenharmony_ci seq_printf(sf, "oom_kill %lu\n", 45668c2ecf20Sopenharmony_ci atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 45678c2ecf20Sopenharmony_ci return 0; 45688c2ecf20Sopenharmony_ci} 45698c2ecf20Sopenharmony_ci 45708c2ecf20Sopenharmony_cistatic int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 45718c2ecf20Sopenharmony_ci struct cftype *cft, u64 val) 45728c2ecf20Sopenharmony_ci{ 45738c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 45748c2ecf20Sopenharmony_ci 45758c2ecf20Sopenharmony_ci /* cannot set to root cgroup and only 0 and 1 are allowed */ 45768c2ecf20Sopenharmony_ci if (!css->parent || !((val == 0) || (val == 1))) 45778c2ecf20Sopenharmony_ci return -EINVAL; 45788c2ecf20Sopenharmony_ci 45798c2ecf20Sopenharmony_ci memcg->oom_kill_disable = val; 45808c2ecf20Sopenharmony_ci if (!val) 45818c2ecf20Sopenharmony_ci memcg_oom_recover(memcg); 45828c2ecf20Sopenharmony_ci 45838c2ecf20Sopenharmony_ci return 0; 45848c2ecf20Sopenharmony_ci} 45858c2ecf20Sopenharmony_ci 45868c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 45878c2ecf20Sopenharmony_ci 45888c2ecf20Sopenharmony_ci#include <trace/events/writeback.h> 45898c2ecf20Sopenharmony_ci 45908c2ecf20Sopenharmony_cistatic int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 45918c2ecf20Sopenharmony_ci{ 45928c2ecf20Sopenharmony_ci return wb_domain_init(&memcg->cgwb_domain, gfp); 45938c2ecf20Sopenharmony_ci} 45948c2ecf20Sopenharmony_ci 45958c2ecf20Sopenharmony_cistatic void memcg_wb_domain_exit(struct mem_cgroup *memcg) 45968c2ecf20Sopenharmony_ci{ 45978c2ecf20Sopenharmony_ci wb_domain_exit(&memcg->cgwb_domain); 45988c2ecf20Sopenharmony_ci} 45998c2ecf20Sopenharmony_ci 46008c2ecf20Sopenharmony_cistatic void 
memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 46018c2ecf20Sopenharmony_ci{ 46028c2ecf20Sopenharmony_ci wb_domain_size_changed(&memcg->cgwb_domain); 46038c2ecf20Sopenharmony_ci} 46048c2ecf20Sopenharmony_ci 46058c2ecf20Sopenharmony_cistruct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 46068c2ecf20Sopenharmony_ci{ 46078c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 46088c2ecf20Sopenharmony_ci 46098c2ecf20Sopenharmony_ci if (!memcg->css.parent) 46108c2ecf20Sopenharmony_ci return NULL; 46118c2ecf20Sopenharmony_ci 46128c2ecf20Sopenharmony_ci return &memcg->cgwb_domain; 46138c2ecf20Sopenharmony_ci} 46148c2ecf20Sopenharmony_ci 46158c2ecf20Sopenharmony_ci/* 46168c2ecf20Sopenharmony_ci * idx can be of type enum memcg_stat_item or node_stat_item. 46178c2ecf20Sopenharmony_ci * Keep in sync with memcg_exact_page(). 46188c2ecf20Sopenharmony_ci */ 46198c2ecf20Sopenharmony_cistatic unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) 46208c2ecf20Sopenharmony_ci{ 46218c2ecf20Sopenharmony_ci long x = atomic_long_read(&memcg->vmstats[idx]); 46228c2ecf20Sopenharmony_ci int cpu; 46238c2ecf20Sopenharmony_ci 46248c2ecf20Sopenharmony_ci for_each_online_cpu(cpu) 46258c2ecf20Sopenharmony_ci x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; 46268c2ecf20Sopenharmony_ci if (x < 0) 46278c2ecf20Sopenharmony_ci x = 0; 46288c2ecf20Sopenharmony_ci return x; 46298c2ecf20Sopenharmony_ci} 46308c2ecf20Sopenharmony_ci 46318c2ecf20Sopenharmony_ci/** 46328c2ecf20Sopenharmony_ci * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 46338c2ecf20Sopenharmony_ci * @wb: bdi_writeback in question 46348c2ecf20Sopenharmony_ci * @pfilepages: out parameter for number of file pages 46358c2ecf20Sopenharmony_ci * @pheadroom: out parameter for number of allocatable pages according to memcg 46368c2ecf20Sopenharmony_ci * @pdirty: out parameter for number of dirty pages 46378c2ecf20Sopenharmony_ci * @pwriteback: out parameter for number of pages under writeback 46388c2ecf20Sopenharmony_ci * 46398c2ecf20Sopenharmony_ci * Determine the numbers of file, headroom, dirty, and writeback pages in 46408c2ecf20Sopenharmony_ci * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 46418c2ecf20Sopenharmony_ci * is a bit more involved. 46428c2ecf20Sopenharmony_ci * 46438c2ecf20Sopenharmony_ci * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 46448c2ecf20Sopenharmony_ci * headroom is calculated as the lowest headroom of itself and the 46458c2ecf20Sopenharmony_ci * ancestors. Note that this doesn't consider the actual amount of 46468c2ecf20Sopenharmony_ci * available memory in the system. The caller should further cap 46478c2ecf20Sopenharmony_ci * *@pheadroom accordingly. 
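 *
 * Worked example (illustrative numbers, not from this file): a memcg with
 * max = 1G, high = 512M and 384M of charged memory contributes
 * min(1G, 512M) - 384M = 128M of headroom at its level; *@pheadroom ends up
 * as the smallest such value along the ancestry.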
 */
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;

        *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);

        *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
        *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
                      memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
        *pheadroom = PAGE_COUNTER_MAX;

        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
                                            READ_ONCE(memcg->memory.high));
                unsigned long used = page_counter_read(&memcg->memory);

                *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
                memcg = parent;
        }
}

/*
 * Foreign dirty flushing
 *
 * There's an inherent mismatch between memcg and writeback. The former
 * tracks ownership per-page while the latter per-inode. This was a
 * deliberate design decision because honoring per-page ownership in the
 * writeback path is complicated, may lead to higher CPU and IO overheads
 * and was deemed unnecessary given that write-sharing an inode across
 * different cgroups isn't a common use-case.
 *
 * Combined with inode majority-writer ownership switching, this works well
 * enough in most cases but there are some pathological cases. For
 * example, let's say there are two cgroups A and B which keep writing to
 * different but confined parts of the same inode. B owns the inode and
 * A's memory is limited far below B's. A's dirty ratio can rise enough to
 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
 * triggering background writeback. A will be slowed down without a way to
 * make writeback of the dirty pages happen.
 *
 * Conditions like the above can lead to a cgroup getting repeatedly and
 * severely throttled after making some progress after each
 * dirty_expire_interval while the underlying IO device is almost
 * completely idle.
 *
 * Solving this problem completely requires matching the ownership tracking
 * granularities between memcg and writeback in either direction.
However, 46998c2ecf20Sopenharmony_ci * the more egregious behaviors can be avoided by simply remembering the 47008c2ecf20Sopenharmony_ci * most recent foreign dirtying events and initiating remote flushes on 47018c2ecf20Sopenharmony_ci * them when local writeback isn't enough to keep the memory clean enough. 47028c2ecf20Sopenharmony_ci * 47038c2ecf20Sopenharmony_ci * The following two functions implement such mechanism. When a foreign 47048c2ecf20Sopenharmony_ci * page - a page whose memcg and writeback ownerships don't match - is 47058c2ecf20Sopenharmony_ci * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 47068c2ecf20Sopenharmony_ci * bdi_writeback on the page owning memcg. When balance_dirty_pages() 47078c2ecf20Sopenharmony_ci * decides that the memcg needs to sleep due to high dirty ratio, it calls 47088c2ecf20Sopenharmony_ci * mem_cgroup_flush_foreign() which queues writeback on the recorded 47098c2ecf20Sopenharmony_ci * foreign bdi_writebacks which haven't expired. Both the numbers of 47108c2ecf20Sopenharmony_ci * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 47118c2ecf20Sopenharmony_ci * limited to MEMCG_CGWB_FRN_CNT. 47128c2ecf20Sopenharmony_ci * 47138c2ecf20Sopenharmony_ci * The mechanism only remembers IDs and doesn't hold any object references. 47148c2ecf20Sopenharmony_ci * As being wrong occasionally doesn't matter, updates and accesses to the 47158c2ecf20Sopenharmony_ci * records are lockless and racy. 47168c2ecf20Sopenharmony_ci */ 47178c2ecf20Sopenharmony_civoid mem_cgroup_track_foreign_dirty_slowpath(struct page *page, 47188c2ecf20Sopenharmony_ci struct bdi_writeback *wb) 47198c2ecf20Sopenharmony_ci{ 47208c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = page->mem_cgroup; 47218c2ecf20Sopenharmony_ci struct memcg_cgwb_frn *frn; 47228c2ecf20Sopenharmony_ci u64 now = get_jiffies_64(); 47238c2ecf20Sopenharmony_ci u64 oldest_at = now; 47248c2ecf20Sopenharmony_ci int oldest = -1; 47258c2ecf20Sopenharmony_ci int i; 47268c2ecf20Sopenharmony_ci 47278c2ecf20Sopenharmony_ci trace_track_foreign_dirty(page, wb); 47288c2ecf20Sopenharmony_ci 47298c2ecf20Sopenharmony_ci /* 47308c2ecf20Sopenharmony_ci * Pick the slot to use. If there is already a slot for @wb, keep 47318c2ecf20Sopenharmony_ci * using it. If not replace the oldest one which isn't being 47328c2ecf20Sopenharmony_ci * written out. 47338c2ecf20Sopenharmony_ci */ 47348c2ecf20Sopenharmony_ci for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 47358c2ecf20Sopenharmony_ci frn = &memcg->cgwb_frn[i]; 47368c2ecf20Sopenharmony_ci if (frn->bdi_id == wb->bdi->id && 47378c2ecf20Sopenharmony_ci frn->memcg_id == wb->memcg_css->id) 47388c2ecf20Sopenharmony_ci break; 47398c2ecf20Sopenharmony_ci if (time_before64(frn->at, oldest_at) && 47408c2ecf20Sopenharmony_ci atomic_read(&frn->done.cnt) == 1) { 47418c2ecf20Sopenharmony_ci oldest = i; 47428c2ecf20Sopenharmony_ci oldest_at = frn->at; 47438c2ecf20Sopenharmony_ci } 47448c2ecf20Sopenharmony_ci } 47458c2ecf20Sopenharmony_ci 47468c2ecf20Sopenharmony_ci if (i < MEMCG_CGWB_FRN_CNT) { 47478c2ecf20Sopenharmony_ci /* 47488c2ecf20Sopenharmony_ci * Re-using an existing one. Update timestamp lazily to 47498c2ecf20Sopenharmony_ci * avoid making the cacheline hot. We want them to be 47508c2ecf20Sopenharmony_ci * reasonably up-to-date and significantly shorter than 47518c2ecf20Sopenharmony_ci * dirty_expire_interval as that's what expires the record. 47528c2ecf20Sopenharmony_ci * Use the shorter of 1s and dirty_expire_interval / 8. 
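		 * For example (illustrative arithmetic): with the default
		 * dirty_expire_interval of 3000 centisecs, the candidate is
		 * msecs_to_jiffies(30000) / 8, roughly 3.75 seconds, so
		 * update_intv is capped at HZ, i.e. one second.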
47538c2ecf20Sopenharmony_ci */ 47548c2ecf20Sopenharmony_ci unsigned long update_intv = 47558c2ecf20Sopenharmony_ci min_t(unsigned long, HZ, 47568c2ecf20Sopenharmony_ci msecs_to_jiffies(dirty_expire_interval * 10) / 8); 47578c2ecf20Sopenharmony_ci 47588c2ecf20Sopenharmony_ci if (time_before64(frn->at, now - update_intv)) 47598c2ecf20Sopenharmony_ci frn->at = now; 47608c2ecf20Sopenharmony_ci } else if (oldest >= 0) { 47618c2ecf20Sopenharmony_ci /* replace the oldest free one */ 47628c2ecf20Sopenharmony_ci frn = &memcg->cgwb_frn[oldest]; 47638c2ecf20Sopenharmony_ci frn->bdi_id = wb->bdi->id; 47648c2ecf20Sopenharmony_ci frn->memcg_id = wb->memcg_css->id; 47658c2ecf20Sopenharmony_ci frn->at = now; 47668c2ecf20Sopenharmony_ci } 47678c2ecf20Sopenharmony_ci} 47688c2ecf20Sopenharmony_ci 47698c2ecf20Sopenharmony_ci/* issue foreign writeback flushes for recorded foreign dirtying events */ 47708c2ecf20Sopenharmony_civoid mem_cgroup_flush_foreign(struct bdi_writeback *wb) 47718c2ecf20Sopenharmony_ci{ 47728c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 47738c2ecf20Sopenharmony_ci unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 47748c2ecf20Sopenharmony_ci u64 now = jiffies_64; 47758c2ecf20Sopenharmony_ci int i; 47768c2ecf20Sopenharmony_ci 47778c2ecf20Sopenharmony_ci for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 47788c2ecf20Sopenharmony_ci struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 47798c2ecf20Sopenharmony_ci 47808c2ecf20Sopenharmony_ci /* 47818c2ecf20Sopenharmony_ci * If the record is older than dirty_expire_interval, 47828c2ecf20Sopenharmony_ci * writeback on it has already started. No need to kick it 47838c2ecf20Sopenharmony_ci * off again. Also, don't start a new one if there's 47848c2ecf20Sopenharmony_ci * already one in flight. 47858c2ecf20Sopenharmony_ci */ 47868c2ecf20Sopenharmony_ci if (time_after64(frn->at, now - intv) && 47878c2ecf20Sopenharmony_ci atomic_read(&frn->done.cnt) == 1) { 47888c2ecf20Sopenharmony_ci frn->at = 0; 47898c2ecf20Sopenharmony_ci trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 47908c2ecf20Sopenharmony_ci cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, 47918c2ecf20Sopenharmony_ci WB_REASON_FOREIGN_FLUSH, 47928c2ecf20Sopenharmony_ci &frn->done); 47938c2ecf20Sopenharmony_ci } 47948c2ecf20Sopenharmony_ci } 47958c2ecf20Sopenharmony_ci} 47968c2ecf20Sopenharmony_ci 47978c2ecf20Sopenharmony_ci#else /* CONFIG_CGROUP_WRITEBACK */ 47988c2ecf20Sopenharmony_ci 47998c2ecf20Sopenharmony_cistatic int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 48008c2ecf20Sopenharmony_ci{ 48018c2ecf20Sopenharmony_ci return 0; 48028c2ecf20Sopenharmony_ci} 48038c2ecf20Sopenharmony_ci 48048c2ecf20Sopenharmony_cistatic void memcg_wb_domain_exit(struct mem_cgroup *memcg) 48058c2ecf20Sopenharmony_ci{ 48068c2ecf20Sopenharmony_ci} 48078c2ecf20Sopenharmony_ci 48088c2ecf20Sopenharmony_cistatic void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 48098c2ecf20Sopenharmony_ci{ 48108c2ecf20Sopenharmony_ci} 48118c2ecf20Sopenharmony_ci 48128c2ecf20Sopenharmony_ci#endif /* CONFIG_CGROUP_WRITEBACK */ 48138c2ecf20Sopenharmony_ci 48148c2ecf20Sopenharmony_ci/* 48158c2ecf20Sopenharmony_ci * DO NOT USE IN NEW FILES. 48168c2ecf20Sopenharmony_ci * 48178c2ecf20Sopenharmony_ci * "cgroup.event_control" implementation. 48188c2ecf20Sopenharmony_ci * 48198c2ecf20Sopenharmony_ci * This is way over-engineered. It tries to support fully configurable 48208c2ecf20Sopenharmony_ci * events for each user. 
Such level of flexibility is completely 48218c2ecf20Sopenharmony_ci * unnecessary especially in the light of the planned unified hierarchy. 48228c2ecf20Sopenharmony_ci * 48238c2ecf20Sopenharmony_ci * Please deprecate this and replace with something simpler if at all 48248c2ecf20Sopenharmony_ci * possible. 48258c2ecf20Sopenharmony_ci */ 48268c2ecf20Sopenharmony_ci 48278c2ecf20Sopenharmony_ci/* 48288c2ecf20Sopenharmony_ci * Unregister event and free resources. 48298c2ecf20Sopenharmony_ci * 48308c2ecf20Sopenharmony_ci * Gets called from workqueue. 48318c2ecf20Sopenharmony_ci */ 48328c2ecf20Sopenharmony_cistatic void memcg_event_remove(struct work_struct *work) 48338c2ecf20Sopenharmony_ci{ 48348c2ecf20Sopenharmony_ci struct mem_cgroup_event *event = 48358c2ecf20Sopenharmony_ci container_of(work, struct mem_cgroup_event, remove); 48368c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = event->memcg; 48378c2ecf20Sopenharmony_ci 48388c2ecf20Sopenharmony_ci remove_wait_queue(event->wqh, &event->wait); 48398c2ecf20Sopenharmony_ci 48408c2ecf20Sopenharmony_ci event->unregister_event(memcg, event->eventfd); 48418c2ecf20Sopenharmony_ci 48428c2ecf20Sopenharmony_ci /* Notify userspace the event is going away. */ 48438c2ecf20Sopenharmony_ci eventfd_signal(event->eventfd, 1); 48448c2ecf20Sopenharmony_ci 48458c2ecf20Sopenharmony_ci eventfd_ctx_put(event->eventfd); 48468c2ecf20Sopenharmony_ci kfree(event); 48478c2ecf20Sopenharmony_ci css_put(&memcg->css); 48488c2ecf20Sopenharmony_ci} 48498c2ecf20Sopenharmony_ci 48508c2ecf20Sopenharmony_ci/* 48518c2ecf20Sopenharmony_ci * Gets called on EPOLLHUP on eventfd when user closes it. 48528c2ecf20Sopenharmony_ci * 48538c2ecf20Sopenharmony_ci * Called with wqh->lock held and interrupts disabled. 48548c2ecf20Sopenharmony_ci */ 48558c2ecf20Sopenharmony_cistatic int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 48568c2ecf20Sopenharmony_ci int sync, void *key) 48578c2ecf20Sopenharmony_ci{ 48588c2ecf20Sopenharmony_ci struct mem_cgroup_event *event = 48598c2ecf20Sopenharmony_ci container_of(wait, struct mem_cgroup_event, wait); 48608c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = event->memcg; 48618c2ecf20Sopenharmony_ci __poll_t flags = key_to_poll(key); 48628c2ecf20Sopenharmony_ci 48638c2ecf20Sopenharmony_ci if (flags & EPOLLHUP) { 48648c2ecf20Sopenharmony_ci /* 48658c2ecf20Sopenharmony_ci * If the event has been detached at cgroup removal, we 48668c2ecf20Sopenharmony_ci * can simply return knowing the other side will cleanup 48678c2ecf20Sopenharmony_ci * for us. 48688c2ecf20Sopenharmony_ci * 48698c2ecf20Sopenharmony_ci * We can't race against event freeing since the other 48708c2ecf20Sopenharmony_ci * side will require wqh->lock via remove_wait_queue(), 48718c2ecf20Sopenharmony_ci * which we hold. 48728c2ecf20Sopenharmony_ci */ 48738c2ecf20Sopenharmony_ci spin_lock(&memcg->event_list_lock); 48748c2ecf20Sopenharmony_ci if (!list_empty(&event->list)) { 48758c2ecf20Sopenharmony_ci list_del_init(&event->list); 48768c2ecf20Sopenharmony_ci /* 48778c2ecf20Sopenharmony_ci * We are in atomic context, but cgroup_event_remove() 48788c2ecf20Sopenharmony_ci * may sleep, so we have to call it in workqueue. 
48798c2ecf20Sopenharmony_ci */ 48808c2ecf20Sopenharmony_ci schedule_work(&event->remove); 48818c2ecf20Sopenharmony_ci } 48828c2ecf20Sopenharmony_ci spin_unlock(&memcg->event_list_lock); 48838c2ecf20Sopenharmony_ci } 48848c2ecf20Sopenharmony_ci 48858c2ecf20Sopenharmony_ci return 0; 48868c2ecf20Sopenharmony_ci} 48878c2ecf20Sopenharmony_ci 48888c2ecf20Sopenharmony_cistatic void memcg_event_ptable_queue_proc(struct file *file, 48898c2ecf20Sopenharmony_ci wait_queue_head_t *wqh, poll_table *pt) 48908c2ecf20Sopenharmony_ci{ 48918c2ecf20Sopenharmony_ci struct mem_cgroup_event *event = 48928c2ecf20Sopenharmony_ci container_of(pt, struct mem_cgroup_event, pt); 48938c2ecf20Sopenharmony_ci 48948c2ecf20Sopenharmony_ci event->wqh = wqh; 48958c2ecf20Sopenharmony_ci add_wait_queue(wqh, &event->wait); 48968c2ecf20Sopenharmony_ci} 48978c2ecf20Sopenharmony_ci 48988c2ecf20Sopenharmony_ci/* 48998c2ecf20Sopenharmony_ci * DO NOT USE IN NEW FILES. 49008c2ecf20Sopenharmony_ci * 49018c2ecf20Sopenharmony_ci * Parse input and register new cgroup event handler. 49028c2ecf20Sopenharmony_ci * 49038c2ecf20Sopenharmony_ci * Input must be in format '<event_fd> <control_fd> <args>'. 49048c2ecf20Sopenharmony_ci * Interpretation of args is defined by control file implementation. 49058c2ecf20Sopenharmony_ci */ 49068c2ecf20Sopenharmony_cistatic ssize_t memcg_write_event_control(struct kernfs_open_file *of, 49078c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 49088c2ecf20Sopenharmony_ci{ 49098c2ecf20Sopenharmony_ci struct cgroup_subsys_state *css = of_css(of); 49108c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 49118c2ecf20Sopenharmony_ci struct mem_cgroup_event *event; 49128c2ecf20Sopenharmony_ci struct cgroup_subsys_state *cfile_css; 49138c2ecf20Sopenharmony_ci unsigned int efd, cfd; 49148c2ecf20Sopenharmony_ci struct fd efile; 49158c2ecf20Sopenharmony_ci struct fd cfile; 49168c2ecf20Sopenharmony_ci struct dentry *cdentry; 49178c2ecf20Sopenharmony_ci const char *name; 49188c2ecf20Sopenharmony_ci char *endp; 49198c2ecf20Sopenharmony_ci int ret; 49208c2ecf20Sopenharmony_ci 49218c2ecf20Sopenharmony_ci buf = strstrip(buf); 49228c2ecf20Sopenharmony_ci 49238c2ecf20Sopenharmony_ci efd = simple_strtoul(buf, &endp, 10); 49248c2ecf20Sopenharmony_ci if (*endp != ' ') 49258c2ecf20Sopenharmony_ci return -EINVAL; 49268c2ecf20Sopenharmony_ci buf = endp + 1; 49278c2ecf20Sopenharmony_ci 49288c2ecf20Sopenharmony_ci cfd = simple_strtoul(buf, &endp, 10); 49298c2ecf20Sopenharmony_ci if (*endp == '\0') 49308c2ecf20Sopenharmony_ci buf = endp; 49318c2ecf20Sopenharmony_ci else if (*endp == ' ') 49328c2ecf20Sopenharmony_ci buf = endp + 1; 49338c2ecf20Sopenharmony_ci else 49348c2ecf20Sopenharmony_ci return -EINVAL; 49358c2ecf20Sopenharmony_ci 49368c2ecf20Sopenharmony_ci event = kzalloc(sizeof(*event), GFP_KERNEL); 49378c2ecf20Sopenharmony_ci if (!event) 49388c2ecf20Sopenharmony_ci return -ENOMEM; 49398c2ecf20Sopenharmony_ci 49408c2ecf20Sopenharmony_ci event->memcg = memcg; 49418c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&event->list); 49428c2ecf20Sopenharmony_ci init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 49438c2ecf20Sopenharmony_ci init_waitqueue_func_entry(&event->wait, memcg_event_wake); 49448c2ecf20Sopenharmony_ci INIT_WORK(&event->remove, memcg_event_remove); 49458c2ecf20Sopenharmony_ci 49468c2ecf20Sopenharmony_ci efile = fdget(efd); 49478c2ecf20Sopenharmony_ci if (!efile.file) { 49488c2ecf20Sopenharmony_ci ret = -EBADF; 49498c2ecf20Sopenharmony_ci goto out_kfree; 49508c2ecf20Sopenharmony_ci } 
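        /*
         * At this point efd and cfd have been parsed from the user's string
         * and buf points at the remaining arguments. An illustrative input
         * (fd numbers are made up) is "13 14 67108864": eventfd 13, control
         * fd 14, and "67108864" later handed to the control file's
         * register_event() callback as @args.
         */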

        event->eventfd = eventfd_ctx_fileget(efile.file);
        if (IS_ERR(event->eventfd)) {
                ret = PTR_ERR(event->eventfd);
                goto out_put_efile;
        }

        cfile = fdget(cfd);
        if (!cfile.file) {
                ret = -EBADF;
                goto out_put_eventfd;
        }

        /* the process needs read permission on the control file */
        /* AV: shouldn't we check that it's been opened for read instead? */
        ret = inode_permission(file_inode(cfile.file), MAY_READ);
        if (ret < 0)
                goto out_put_cfile;

        /*
         * The control file must be a regular cgroup1 file. As a regular cgroup
         * file can't be renamed, it's safe to access its name afterwards.
         */
        cdentry = cfile.file->f_path.dentry;
        if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
                ret = -EINVAL;
                goto out_put_cfile;
        }

        /*
         * Determine the event callbacks and set them in @event. This used
         * to be done via struct cftype but cgroup core no longer knows
         * about these events. The following is crude but the whole thing
         * is for compatibility anyway.
         *
         * DO NOT ADD NEW FILES.
         */
        name = cdentry->d_name.name;

        if (!strcmp(name, "memory.usage_in_bytes")) {
                event->register_event = mem_cgroup_usage_register_event;
                event->unregister_event = mem_cgroup_usage_unregister_event;
        } else if (!strcmp(name, "memory.oom_control")) {
                event->register_event = mem_cgroup_oom_register_event;
                event->unregister_event = mem_cgroup_oom_unregister_event;
        } else if (!strcmp(name, "memory.pressure_level")) {
                event->register_event = vmpressure_register_event;
                event->unregister_event = vmpressure_unregister_event;
        } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
                event->register_event = memsw_cgroup_usage_register_event;
                event->unregister_event = memsw_cgroup_usage_unregister_event;
        } else {
                ret = -EINVAL;
                goto out_put_cfile;
        }

        /*
         * Verify that @cfile belongs to @css.
Also, remaining events are 50098c2ecf20Sopenharmony_ci * automatically removed on cgroup destruction but the removal is 50108c2ecf20Sopenharmony_ci * asynchronous, so take an extra ref on @css. 50118c2ecf20Sopenharmony_ci */ 50128c2ecf20Sopenharmony_ci cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 50138c2ecf20Sopenharmony_ci &memory_cgrp_subsys); 50148c2ecf20Sopenharmony_ci ret = -EINVAL; 50158c2ecf20Sopenharmony_ci if (IS_ERR(cfile_css)) 50168c2ecf20Sopenharmony_ci goto out_put_cfile; 50178c2ecf20Sopenharmony_ci if (cfile_css != css) { 50188c2ecf20Sopenharmony_ci css_put(cfile_css); 50198c2ecf20Sopenharmony_ci goto out_put_cfile; 50208c2ecf20Sopenharmony_ci } 50218c2ecf20Sopenharmony_ci 50228c2ecf20Sopenharmony_ci ret = event->register_event(memcg, event->eventfd, buf); 50238c2ecf20Sopenharmony_ci if (ret) 50248c2ecf20Sopenharmony_ci goto out_put_css; 50258c2ecf20Sopenharmony_ci 50268c2ecf20Sopenharmony_ci vfs_poll(efile.file, &event->pt); 50278c2ecf20Sopenharmony_ci 50288c2ecf20Sopenharmony_ci spin_lock(&memcg->event_list_lock); 50298c2ecf20Sopenharmony_ci list_add(&event->list, &memcg->event_list); 50308c2ecf20Sopenharmony_ci spin_unlock(&memcg->event_list_lock); 50318c2ecf20Sopenharmony_ci 50328c2ecf20Sopenharmony_ci fdput(cfile); 50338c2ecf20Sopenharmony_ci fdput(efile); 50348c2ecf20Sopenharmony_ci 50358c2ecf20Sopenharmony_ci return nbytes; 50368c2ecf20Sopenharmony_ci 50378c2ecf20Sopenharmony_ciout_put_css: 50388c2ecf20Sopenharmony_ci css_put(css); 50398c2ecf20Sopenharmony_ciout_put_cfile: 50408c2ecf20Sopenharmony_ci fdput(cfile); 50418c2ecf20Sopenharmony_ciout_put_eventfd: 50428c2ecf20Sopenharmony_ci eventfd_ctx_put(event->eventfd); 50438c2ecf20Sopenharmony_ciout_put_efile: 50448c2ecf20Sopenharmony_ci fdput(efile); 50458c2ecf20Sopenharmony_ciout_kfree: 50468c2ecf20Sopenharmony_ci kfree(event); 50478c2ecf20Sopenharmony_ci 50488c2ecf20Sopenharmony_ci return ret; 50498c2ecf20Sopenharmony_ci} 50508c2ecf20Sopenharmony_ci 50518c2ecf20Sopenharmony_cistatic struct cftype mem_cgroup_legacy_files[] = { 50528c2ecf20Sopenharmony_ci { 50538c2ecf20Sopenharmony_ci .name = "usage_in_bytes", 50548c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 50558c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 50568c2ecf20Sopenharmony_ci }, 50578c2ecf20Sopenharmony_ci { 50588c2ecf20Sopenharmony_ci .name = "max_usage_in_bytes", 50598c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 50608c2ecf20Sopenharmony_ci .write = mem_cgroup_reset, 50618c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 50628c2ecf20Sopenharmony_ci }, 50638c2ecf20Sopenharmony_ci { 50648c2ecf20Sopenharmony_ci .name = "limit_in_bytes", 50658c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 50668c2ecf20Sopenharmony_ci .write = mem_cgroup_write, 50678c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 50688c2ecf20Sopenharmony_ci }, 50698c2ecf20Sopenharmony_ci { 50708c2ecf20Sopenharmony_ci .name = "soft_limit_in_bytes", 50718c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 50728c2ecf20Sopenharmony_ci .write = mem_cgroup_write, 50738c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 50748c2ecf20Sopenharmony_ci }, 50758c2ecf20Sopenharmony_ci { 50768c2ecf20Sopenharmony_ci .name = "failcnt", 50778c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 50788c2ecf20Sopenharmony_ci .write = mem_cgroup_reset, 50798c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 50808c2ecf20Sopenharmony_ci }, 50818c2ecf20Sopenharmony_ci { 
50828c2ecf20Sopenharmony_ci .name = "stat", 50838c2ecf20Sopenharmony_ci .seq_show = memcg_stat_show, 50848c2ecf20Sopenharmony_ci }, 50858c2ecf20Sopenharmony_ci { 50868c2ecf20Sopenharmony_ci .name = "force_empty", 50878c2ecf20Sopenharmony_ci .write = mem_cgroup_force_empty_write, 50888c2ecf20Sopenharmony_ci }, 50898c2ecf20Sopenharmony_ci { 50908c2ecf20Sopenharmony_ci .name = "use_hierarchy", 50918c2ecf20Sopenharmony_ci .write_u64 = mem_cgroup_hierarchy_write, 50928c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_hierarchy_read, 50938c2ecf20Sopenharmony_ci }, 50948c2ecf20Sopenharmony_ci { 50958c2ecf20Sopenharmony_ci .name = "cgroup.event_control", /* XXX: for compat */ 50968c2ecf20Sopenharmony_ci .write = memcg_write_event_control, 50978c2ecf20Sopenharmony_ci .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 50988c2ecf20Sopenharmony_ci }, 50998c2ecf20Sopenharmony_ci { 51008c2ecf20Sopenharmony_ci .name = "swappiness", 51018c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_swappiness_read, 51028c2ecf20Sopenharmony_ci .write_u64 = mem_cgroup_swappiness_write, 51038c2ecf20Sopenharmony_ci }, 51048c2ecf20Sopenharmony_ci { 51058c2ecf20Sopenharmony_ci .name = "move_charge_at_immigrate", 51068c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_move_charge_read, 51078c2ecf20Sopenharmony_ci .write_u64 = mem_cgroup_move_charge_write, 51088c2ecf20Sopenharmony_ci }, 51098c2ecf20Sopenharmony_ci { 51108c2ecf20Sopenharmony_ci .name = "oom_control", 51118c2ecf20Sopenharmony_ci .seq_show = mem_cgroup_oom_control_read, 51128c2ecf20Sopenharmony_ci .write_u64 = mem_cgroup_oom_control_write, 51138c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 51148c2ecf20Sopenharmony_ci }, 51158c2ecf20Sopenharmony_ci { 51168c2ecf20Sopenharmony_ci .name = "pressure_level", 51178c2ecf20Sopenharmony_ci }, 51188c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 51198c2ecf20Sopenharmony_ci { 51208c2ecf20Sopenharmony_ci .name = "numa_stat", 51218c2ecf20Sopenharmony_ci .seq_show = memcg_numa_stat_show, 51228c2ecf20Sopenharmony_ci }, 51238c2ecf20Sopenharmony_ci#endif 51248c2ecf20Sopenharmony_ci { 51258c2ecf20Sopenharmony_ci .name = "kmem.limit_in_bytes", 51268c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 51278c2ecf20Sopenharmony_ci .write = mem_cgroup_write, 51288c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 51298c2ecf20Sopenharmony_ci }, 51308c2ecf20Sopenharmony_ci { 51318c2ecf20Sopenharmony_ci .name = "kmem.usage_in_bytes", 51328c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 51338c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 51348c2ecf20Sopenharmony_ci }, 51358c2ecf20Sopenharmony_ci { 51368c2ecf20Sopenharmony_ci .name = "kmem.failcnt", 51378c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 51388c2ecf20Sopenharmony_ci .write = mem_cgroup_reset, 51398c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 51408c2ecf20Sopenharmony_ci }, 51418c2ecf20Sopenharmony_ci { 51428c2ecf20Sopenharmony_ci .name = "kmem.max_usage_in_bytes", 51438c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 51448c2ecf20Sopenharmony_ci .write = mem_cgroup_reset, 51458c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 51468c2ecf20Sopenharmony_ci }, 51478c2ecf20Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) && \ 51488c2ecf20Sopenharmony_ci (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 51498c2ecf20Sopenharmony_ci { 51508c2ecf20Sopenharmony_ci .name = "kmem.slabinfo", 51518c2ecf20Sopenharmony_ci .seq_show = memcg_slab_show, 
51528c2ecf20Sopenharmony_ci }, 51538c2ecf20Sopenharmony_ci#endif 51548c2ecf20Sopenharmony_ci { 51558c2ecf20Sopenharmony_ci .name = "kmem.tcp.limit_in_bytes", 51568c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 51578c2ecf20Sopenharmony_ci .write = mem_cgroup_write, 51588c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 51598c2ecf20Sopenharmony_ci }, 51608c2ecf20Sopenharmony_ci { 51618c2ecf20Sopenharmony_ci .name = "kmem.tcp.usage_in_bytes", 51628c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 51638c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 51648c2ecf20Sopenharmony_ci }, 51658c2ecf20Sopenharmony_ci { 51668c2ecf20Sopenharmony_ci .name = "kmem.tcp.failcnt", 51678c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 51688c2ecf20Sopenharmony_ci .write = mem_cgroup_reset, 51698c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 51708c2ecf20Sopenharmony_ci }, 51718c2ecf20Sopenharmony_ci { 51728c2ecf20Sopenharmony_ci .name = "kmem.tcp.max_usage_in_bytes", 51738c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 51748c2ecf20Sopenharmony_ci .write = mem_cgroup_reset, 51758c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 51768c2ecf20Sopenharmony_ci }, 51778c2ecf20Sopenharmony_ci { }, /* terminate */ 51788c2ecf20Sopenharmony_ci}; 51798c2ecf20Sopenharmony_ci 51808c2ecf20Sopenharmony_ci/* 51818c2ecf20Sopenharmony_ci * Private memory cgroup IDR 51828c2ecf20Sopenharmony_ci * 51838c2ecf20Sopenharmony_ci * Swap-out records and page cache shadow entries need to store memcg 51848c2ecf20Sopenharmony_ci * references in constrained space, so we maintain an ID space that is 51858c2ecf20Sopenharmony_ci * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 51868c2ecf20Sopenharmony_ci * memory-controlled cgroups to 64k. 51878c2ecf20Sopenharmony_ci * 51888c2ecf20Sopenharmony_ci * However, there usually are many references to the offline CSS after 51898c2ecf20Sopenharmony_ci * the cgroup has been destroyed, such as page cache or reclaimable 51908c2ecf20Sopenharmony_ci * slab objects, that don't need to hang on to the ID. We want to keep 51918c2ecf20Sopenharmony_ci * those dead CSS from occupying IDs, or we might quickly exhaust the 51928c2ecf20Sopenharmony_ci * relatively small ID space and prevent the creation of new cgroups 51938c2ecf20Sopenharmony_ci * even when there are much fewer than 64k cgroups - possibly none. 51948c2ecf20Sopenharmony_ci * 51958c2ecf20Sopenharmony_ci * Maintain a private 16-bit ID space for memcg, and allow the ID to 51968c2ecf20Sopenharmony_ci * be freed and recycled when it's no longer needed, which is usually 51978c2ecf20Sopenharmony_ci * when the CSS is offlined. 51988c2ecf20Sopenharmony_ci * 51998c2ecf20Sopenharmony_ci * The only exception to that are records of swapped out tmpfs/shmem 52008c2ecf20Sopenharmony_ci * pages that need to be attributed to live ancestors on swapin. But 52018c2ecf20Sopenharmony_ci * those references are manageable from userspace. 
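 *
 * A sketch of the typical lookup pattern this enables (assuming the caller
 * wants to pin the result; not a verbatim excerpt from this file):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();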
52028c2ecf20Sopenharmony_ci */ 52038c2ecf20Sopenharmony_ci 52048c2ecf20Sopenharmony_cistatic DEFINE_IDR(mem_cgroup_idr); 52058c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(memcg_idr_lock); 52068c2ecf20Sopenharmony_ci 52078c2ecf20Sopenharmony_cistatic int mem_cgroup_alloc_id(void) 52088c2ecf20Sopenharmony_ci{ 52098c2ecf20Sopenharmony_ci int ret; 52108c2ecf20Sopenharmony_ci 52118c2ecf20Sopenharmony_ci idr_preload(GFP_KERNEL); 52128c2ecf20Sopenharmony_ci spin_lock(&memcg_idr_lock); 52138c2ecf20Sopenharmony_ci ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX, 52148c2ecf20Sopenharmony_ci GFP_NOWAIT); 52158c2ecf20Sopenharmony_ci spin_unlock(&memcg_idr_lock); 52168c2ecf20Sopenharmony_ci idr_preload_end(); 52178c2ecf20Sopenharmony_ci return ret; 52188c2ecf20Sopenharmony_ci} 52198c2ecf20Sopenharmony_ci 52208c2ecf20Sopenharmony_cistatic void mem_cgroup_id_remove(struct mem_cgroup *memcg) 52218c2ecf20Sopenharmony_ci{ 52228c2ecf20Sopenharmony_ci if (memcg->id.id > 0) { 52238c2ecf20Sopenharmony_ci spin_lock(&memcg_idr_lock); 52248c2ecf20Sopenharmony_ci idr_remove(&mem_cgroup_idr, memcg->id.id); 52258c2ecf20Sopenharmony_ci spin_unlock(&memcg_idr_lock); 52268c2ecf20Sopenharmony_ci 52278c2ecf20Sopenharmony_ci memcg->id.id = 0; 52288c2ecf20Sopenharmony_ci } 52298c2ecf20Sopenharmony_ci} 52308c2ecf20Sopenharmony_ci 52318c2ecf20Sopenharmony_cistatic void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 52328c2ecf20Sopenharmony_ci unsigned int n) 52338c2ecf20Sopenharmony_ci{ 52348c2ecf20Sopenharmony_ci refcount_add(n, &memcg->id.ref); 52358c2ecf20Sopenharmony_ci} 52368c2ecf20Sopenharmony_ci 52378c2ecf20Sopenharmony_cistatic void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 52388c2ecf20Sopenharmony_ci{ 52398c2ecf20Sopenharmony_ci if (refcount_sub_and_test(n, &memcg->id.ref)) { 52408c2ecf20Sopenharmony_ci mem_cgroup_id_remove(memcg); 52418c2ecf20Sopenharmony_ci 52428c2ecf20Sopenharmony_ci /* Memcg ID pins CSS */ 52438c2ecf20Sopenharmony_ci css_put(&memcg->css); 52448c2ecf20Sopenharmony_ci } 52458c2ecf20Sopenharmony_ci} 52468c2ecf20Sopenharmony_ci 52478c2ecf20Sopenharmony_cistatic inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 52488c2ecf20Sopenharmony_ci{ 52498c2ecf20Sopenharmony_ci mem_cgroup_id_put_many(memcg, 1); 52508c2ecf20Sopenharmony_ci} 52518c2ecf20Sopenharmony_ci 52528c2ecf20Sopenharmony_ci/** 52538c2ecf20Sopenharmony_ci * mem_cgroup_from_id - look up a memcg from a memcg id 52548c2ecf20Sopenharmony_ci * @id: the memcg id to look up 52558c2ecf20Sopenharmony_ci * 52568c2ecf20Sopenharmony_ci * Caller must hold rcu_read_lock(). 52578c2ecf20Sopenharmony_ci */ 52588c2ecf20Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_id(unsigned short id) 52598c2ecf20Sopenharmony_ci{ 52608c2ecf20Sopenharmony_ci WARN_ON_ONCE(!rcu_read_lock_held()); 52618c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU 52628c2ecf20Sopenharmony_ci if (id == -1) 52638c2ecf20Sopenharmony_ci return NULL; 52648c2ecf20Sopenharmony_ci#endif 52658c2ecf20Sopenharmony_ci return idr_find(&mem_cgroup_idr, id); 52668c2ecf20Sopenharmony_ci} 52678c2ecf20Sopenharmony_ci 52688c2ecf20Sopenharmony_cistatic int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 52698c2ecf20Sopenharmony_ci{ 52708c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *pn; 52718c2ecf20Sopenharmony_ci int tmp = node; 52728c2ecf20Sopenharmony_ci /* 52738c2ecf20Sopenharmony_ci * This routine is called against possible nodes. 52748c2ecf20Sopenharmony_ci * But it's BUG to call kmalloc() against offline node. 
52758c2ecf20Sopenharmony_ci * 52768c2ecf20Sopenharmony_ci * TODO: this routine can waste much memory for nodes which will 52778c2ecf20Sopenharmony_ci * never be onlined. It's better to use memory hotplug callback 52788c2ecf20Sopenharmony_ci * function. 52798c2ecf20Sopenharmony_ci */ 52808c2ecf20Sopenharmony_ci if (!node_state(node, N_NORMAL_MEMORY)) 52818c2ecf20Sopenharmony_ci tmp = -1; 52828c2ecf20Sopenharmony_ci pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 52838c2ecf20Sopenharmony_ci if (!pn) 52848c2ecf20Sopenharmony_ci return 1; 52858c2ecf20Sopenharmony_ci 52868c2ecf20Sopenharmony_ci pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, 52878c2ecf20Sopenharmony_ci GFP_KERNEL_ACCOUNT); 52888c2ecf20Sopenharmony_ci if (!pn->lruvec_stat_local) { 52898c2ecf20Sopenharmony_ci kfree(pn); 52908c2ecf20Sopenharmony_ci return 1; 52918c2ecf20Sopenharmony_ci } 52928c2ecf20Sopenharmony_ci 52938c2ecf20Sopenharmony_ci pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, 52948c2ecf20Sopenharmony_ci GFP_KERNEL_ACCOUNT); 52958c2ecf20Sopenharmony_ci if (!pn->lruvec_stat_cpu) { 52968c2ecf20Sopenharmony_ci free_percpu(pn->lruvec_stat_local); 52978c2ecf20Sopenharmony_ci kfree(pn); 52988c2ecf20Sopenharmony_ci return 1; 52998c2ecf20Sopenharmony_ci } 53008c2ecf20Sopenharmony_ci 53018c2ecf20Sopenharmony_ci lruvec_init(&pn->lruvec); 53028c2ecf20Sopenharmony_ci pn->usage_in_excess = 0; 53038c2ecf20Sopenharmony_ci pn->lruvec.pgdat = NODE_DATA(node); 53048c2ecf20Sopenharmony_ci pn->on_tree = false; 53058c2ecf20Sopenharmony_ci pn->memcg = memcg; 53068c2ecf20Sopenharmony_ci 53078c2ecf20Sopenharmony_ci memcg->nodeinfo[node] = pn; 53088c2ecf20Sopenharmony_ci return 0; 53098c2ecf20Sopenharmony_ci} 53108c2ecf20Sopenharmony_ci 53118c2ecf20Sopenharmony_cistatic void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 53128c2ecf20Sopenharmony_ci{ 53138c2ecf20Sopenharmony_ci struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 53148c2ecf20Sopenharmony_ci 53158c2ecf20Sopenharmony_ci if (!pn) 53168c2ecf20Sopenharmony_ci return; 53178c2ecf20Sopenharmony_ci 53188c2ecf20Sopenharmony_ci free_percpu(pn->lruvec_stat_cpu); 53198c2ecf20Sopenharmony_ci free_percpu(pn->lruvec_stat_local); 53208c2ecf20Sopenharmony_ci kfree(pn); 53218c2ecf20Sopenharmony_ci} 53228c2ecf20Sopenharmony_ci 53238c2ecf20Sopenharmony_cistatic void __mem_cgroup_free(struct mem_cgroup *memcg) 53248c2ecf20Sopenharmony_ci{ 53258c2ecf20Sopenharmony_ci int node; 53268c2ecf20Sopenharmony_ci 53278c2ecf20Sopenharmony_ci for_each_node(node) 53288c2ecf20Sopenharmony_ci free_mem_cgroup_per_node_info(memcg, node); 53298c2ecf20Sopenharmony_ci free_percpu(memcg->vmstats_percpu); 53308c2ecf20Sopenharmony_ci free_percpu(memcg->vmstats_local); 53318c2ecf20Sopenharmony_ci kfree(memcg); 53328c2ecf20Sopenharmony_ci} 53338c2ecf20Sopenharmony_ci 53348c2ecf20Sopenharmony_cistatic void mem_cgroup_free(struct mem_cgroup *memcg) 53358c2ecf20Sopenharmony_ci{ 53368c2ecf20Sopenharmony_ci memcg_wb_domain_exit(memcg); 53378c2ecf20Sopenharmony_ci /* 53388c2ecf20Sopenharmony_ci * Flush percpu vmstats and vmevents to guarantee the value correctness 53398c2ecf20Sopenharmony_ci * on parent's and all ancestor levels. 
53408c2ecf20Sopenharmony_ci */ 53418c2ecf20Sopenharmony_ci memcg_flush_percpu_vmstats(memcg); 53428c2ecf20Sopenharmony_ci memcg_flush_percpu_vmevents(memcg); 53438c2ecf20Sopenharmony_ci __mem_cgroup_free(memcg); 53448c2ecf20Sopenharmony_ci} 53458c2ecf20Sopenharmony_ci 53468c2ecf20Sopenharmony_cistatic struct mem_cgroup *mem_cgroup_alloc(void) 53478c2ecf20Sopenharmony_ci{ 53488c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 53498c2ecf20Sopenharmony_ci unsigned int size; 53508c2ecf20Sopenharmony_ci int node; 53518c2ecf20Sopenharmony_ci int __maybe_unused i; 53528c2ecf20Sopenharmony_ci long error = -ENOMEM; 53538c2ecf20Sopenharmony_ci 53548c2ecf20Sopenharmony_ci size = sizeof(struct mem_cgroup); 53558c2ecf20Sopenharmony_ci size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 53568c2ecf20Sopenharmony_ci 53578c2ecf20Sopenharmony_ci memcg = kzalloc(size, GFP_KERNEL); 53588c2ecf20Sopenharmony_ci if (!memcg) 53598c2ecf20Sopenharmony_ci return ERR_PTR(error); 53608c2ecf20Sopenharmony_ci 53618c2ecf20Sopenharmony_ci memcg->id.id = mem_cgroup_alloc_id(); 53628c2ecf20Sopenharmony_ci if (memcg->id.id < 0) { 53638c2ecf20Sopenharmony_ci error = memcg->id.id; 53648c2ecf20Sopenharmony_ci goto fail; 53658c2ecf20Sopenharmony_ci } 53668c2ecf20Sopenharmony_ci 53678c2ecf20Sopenharmony_ci memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, 53688c2ecf20Sopenharmony_ci GFP_KERNEL_ACCOUNT); 53698c2ecf20Sopenharmony_ci if (!memcg->vmstats_local) 53708c2ecf20Sopenharmony_ci goto fail; 53718c2ecf20Sopenharmony_ci 53728c2ecf20Sopenharmony_ci memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 53738c2ecf20Sopenharmony_ci GFP_KERNEL_ACCOUNT); 53748c2ecf20Sopenharmony_ci if (!memcg->vmstats_percpu) 53758c2ecf20Sopenharmony_ci goto fail; 53768c2ecf20Sopenharmony_ci 53778c2ecf20Sopenharmony_ci for_each_node(node) 53788c2ecf20Sopenharmony_ci if (alloc_mem_cgroup_per_node_info(memcg, node)) 53798c2ecf20Sopenharmony_ci goto fail; 53808c2ecf20Sopenharmony_ci 53818c2ecf20Sopenharmony_ci if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 53828c2ecf20Sopenharmony_ci goto fail; 53838c2ecf20Sopenharmony_ci 53848c2ecf20Sopenharmony_ci INIT_WORK(&memcg->high_work, high_work_func); 53858c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&memcg->oom_notify); 53868c2ecf20Sopenharmony_ci mutex_init(&memcg->thresholds_lock); 53878c2ecf20Sopenharmony_ci spin_lock_init(&memcg->move_lock); 53888c2ecf20Sopenharmony_ci vmpressure_init(&memcg->vmpressure); 53898c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&memcg->event_list); 53908c2ecf20Sopenharmony_ci spin_lock_init(&memcg->event_list_lock); 53918c2ecf20Sopenharmony_ci memcg->socket_pressure = jiffies; 53928c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM 53938c2ecf20Sopenharmony_ci memcg->kmemcg_id = -1; 53948c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&memcg->objcg_list); 53958c2ecf20Sopenharmony_ci#endif 53968c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 53978c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&memcg->cgwb_list); 53988c2ecf20Sopenharmony_ci for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 53998c2ecf20Sopenharmony_ci memcg->cgwb_frn[i].done = 54008c2ecf20Sopenharmony_ci __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 54018c2ecf20Sopenharmony_ci#endif 54028c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 54038c2ecf20Sopenharmony_ci spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 54048c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 54058c2ecf20Sopenharmony_ci memcg->deferred_split_queue.split_queue_len = 0; 
54068c2ecf20Sopenharmony_ci#endif 54078c2ecf20Sopenharmony_ci 54088c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG 54098c2ecf20Sopenharmony_ci if (unlikely(!score_head_inited)) { 54108c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&score_head); 54118c2ecf20Sopenharmony_ci score_head_inited = true; 54128c2ecf20Sopenharmony_ci } 54138c2ecf20Sopenharmony_ci#endif 54148c2ecf20Sopenharmony_ci 54158c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG 54168c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&memcg->score_node); 54178c2ecf20Sopenharmony_ci#endif 54188c2ecf20Sopenharmony_ci idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 54198c2ecf20Sopenharmony_ci return memcg; 54208c2ecf20Sopenharmony_cifail: 54218c2ecf20Sopenharmony_ci mem_cgroup_id_remove(memcg); 54228c2ecf20Sopenharmony_ci __mem_cgroup_free(memcg); 54238c2ecf20Sopenharmony_ci return ERR_PTR(error); 54248c2ecf20Sopenharmony_ci} 54258c2ecf20Sopenharmony_ci 54268c2ecf20Sopenharmony_cistatic struct cgroup_subsys_state * __ref 54278c2ecf20Sopenharmony_cimem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 54288c2ecf20Sopenharmony_ci{ 54298c2ecf20Sopenharmony_ci struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 54308c2ecf20Sopenharmony_ci struct mem_cgroup *memcg, *old_memcg; 54318c2ecf20Sopenharmony_ci long error = -ENOMEM; 54328c2ecf20Sopenharmony_ci 54338c2ecf20Sopenharmony_ci old_memcg = set_active_memcg(parent); 54348c2ecf20Sopenharmony_ci memcg = mem_cgroup_alloc(); 54358c2ecf20Sopenharmony_ci set_active_memcg(old_memcg); 54368c2ecf20Sopenharmony_ci if (IS_ERR(memcg)) 54378c2ecf20Sopenharmony_ci return ERR_CAST(memcg); 54388c2ecf20Sopenharmony_ci 54398c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG 54408c2ecf20Sopenharmony_ci atomic64_set(&memcg->memcg_reclaimed.app_score, 300); 54418c2ecf20Sopenharmony_ci#endif 54428c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_ZSWAPD 54438c2ecf20Sopenharmony_ci atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10); 54448c2ecf20Sopenharmony_ci atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60); 54458c2ecf20Sopenharmony_ci atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50); 54468c2ecf20Sopenharmony_ci#endif 54478c2ecf20Sopenharmony_ci page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 54488c2ecf20Sopenharmony_ci memcg->soft_limit = PAGE_COUNTER_MAX; 54498c2ecf20Sopenharmony_ci page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 54508c2ecf20Sopenharmony_ci if (parent) { 54518c2ecf20Sopenharmony_ci memcg->swappiness = mem_cgroup_swappiness(parent); 54528c2ecf20Sopenharmony_ci memcg->oom_kill_disable = parent->oom_kill_disable; 54538c2ecf20Sopenharmony_ci } 54548c2ecf20Sopenharmony_ci if (!parent) { 54558c2ecf20Sopenharmony_ci page_counter_init(&memcg->memory, NULL); 54568c2ecf20Sopenharmony_ci page_counter_init(&memcg->swap, NULL); 54578c2ecf20Sopenharmony_ci page_counter_init(&memcg->kmem, NULL); 54588c2ecf20Sopenharmony_ci page_counter_init(&memcg->tcpmem, NULL); 54598c2ecf20Sopenharmony_ci } else if (parent->use_hierarchy) { 54608c2ecf20Sopenharmony_ci memcg->use_hierarchy = true; 54618c2ecf20Sopenharmony_ci page_counter_init(&memcg->memory, &parent->memory); 54628c2ecf20Sopenharmony_ci page_counter_init(&memcg->swap, &parent->swap); 54638c2ecf20Sopenharmony_ci page_counter_init(&memcg->kmem, &parent->kmem); 54648c2ecf20Sopenharmony_ci page_counter_init(&memcg->tcpmem, &parent->tcpmem); 54658c2ecf20Sopenharmony_ci } else { 54668c2ecf20Sopenharmony_ci page_counter_init(&memcg->memory, &root_mem_cgroup->memory); 54678c2ecf20Sopenharmony_ci 
                page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
                page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
                page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
                /*
                 * A deeper hierarchy with use_hierarchy == false doesn't make
                 * much sense, so let the cgroup subsystem know about this
                 * unfortunate state in our controller.
                 */
                if (parent != root_mem_cgroup)
                        memory_cgrp_subsys.broken_hierarchy = true;
        }

        /* The following stuff does not apply to the root */
        if (!parent) {
                root_mem_cgroup = memcg;
                return &memcg->css;
        }

        error = memcg_online_kmem(memcg);
        if (error)
                goto fail;

        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
                static_branch_inc(&memcg_sockets_enabled_key);

        return &memcg->css;
fail:
        mem_cgroup_id_remove(memcg);
        mem_cgroup_free(memcg);
        return ERR_PTR(error);
}

static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        /*
         * A memcg must be visible for memcg_expand_shrinker_maps()
         * by the time the maps are allocated. So, we allocate maps
         * here, when for_each_mem_cgroup() can't skip it.
         */
        if (memcg_alloc_shrinker_maps(memcg)) {
                mem_cgroup_id_remove(memcg);
                return -ENOMEM;
        }

#ifdef CONFIG_HYPERHOLD_MEMCG
        memcg_app_score_update(memcg);
        css_get(css);
#endif

        /* Online state pins memcg ID, memcg ID pins CSS */
        refcount_set(&memcg->id.ref, 1);
        css_get(css);
        return 0;
}

static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_event *event, *tmp;

#ifdef CONFIG_HYPERHOLD_MEMCG
        unsigned long flags;

        write_lock_irqsave(&score_list_lock, flags);
        list_del_init(&memcg->score_node);
        write_unlock_irqrestore(&score_list_lock, flags);
        css_put(css);
#endif

        /*
         * Unregister events and notify userspace.
         * Notify userspace about cgroup removing only after rmdir of cgroup
         * directory to avoid race between userspace and kernelspace.
55428c2ecf20Sopenharmony_ci */ 55438c2ecf20Sopenharmony_ci spin_lock(&memcg->event_list_lock); 55448c2ecf20Sopenharmony_ci list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 55458c2ecf20Sopenharmony_ci list_del_init(&event->list); 55468c2ecf20Sopenharmony_ci schedule_work(&event->remove); 55478c2ecf20Sopenharmony_ci } 55488c2ecf20Sopenharmony_ci spin_unlock(&memcg->event_list_lock); 55498c2ecf20Sopenharmony_ci 55508c2ecf20Sopenharmony_ci page_counter_set_min(&memcg->memory, 0); 55518c2ecf20Sopenharmony_ci page_counter_set_low(&memcg->memory, 0); 55528c2ecf20Sopenharmony_ci 55538c2ecf20Sopenharmony_ci memcg_offline_kmem(memcg); 55548c2ecf20Sopenharmony_ci wb_memcg_offline(memcg); 55558c2ecf20Sopenharmony_ci 55568c2ecf20Sopenharmony_ci drain_all_stock(memcg); 55578c2ecf20Sopenharmony_ci 55588c2ecf20Sopenharmony_ci mem_cgroup_id_put(memcg); 55598c2ecf20Sopenharmony_ci} 55608c2ecf20Sopenharmony_ci 55618c2ecf20Sopenharmony_cistatic void mem_cgroup_css_released(struct cgroup_subsys_state *css) 55628c2ecf20Sopenharmony_ci{ 55638c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 55648c2ecf20Sopenharmony_ci 55658c2ecf20Sopenharmony_ci invalidate_reclaim_iterators(memcg); 55668c2ecf20Sopenharmony_ci} 55678c2ecf20Sopenharmony_ci 55688c2ecf20Sopenharmony_cistatic void mem_cgroup_css_free(struct cgroup_subsys_state *css) 55698c2ecf20Sopenharmony_ci{ 55708c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 55718c2ecf20Sopenharmony_ci int __maybe_unused i; 55728c2ecf20Sopenharmony_ci 55738c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 55748c2ecf20Sopenharmony_ci for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 55758c2ecf20Sopenharmony_ci wb_wait_for_completion(&memcg->cgwb_frn[i].done); 55768c2ecf20Sopenharmony_ci#endif 55778c2ecf20Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 55788c2ecf20Sopenharmony_ci static_branch_dec(&memcg_sockets_enabled_key); 55798c2ecf20Sopenharmony_ci 55808c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 55818c2ecf20Sopenharmony_ci static_branch_dec(&memcg_sockets_enabled_key); 55828c2ecf20Sopenharmony_ci 55838c2ecf20Sopenharmony_ci vmpressure_cleanup(&memcg->vmpressure); 55848c2ecf20Sopenharmony_ci cancel_work_sync(&memcg->high_work); 55858c2ecf20Sopenharmony_ci mem_cgroup_remove_from_trees(memcg); 55868c2ecf20Sopenharmony_ci memcg_free_shrinker_maps(memcg); 55878c2ecf20Sopenharmony_ci memcg_free_kmem(memcg); 55888c2ecf20Sopenharmony_ci mem_cgroup_free(memcg); 55898c2ecf20Sopenharmony_ci} 55908c2ecf20Sopenharmony_ci 55918c2ecf20Sopenharmony_ci/** 55928c2ecf20Sopenharmony_ci * mem_cgroup_css_reset - reset the states of a mem_cgroup 55938c2ecf20Sopenharmony_ci * @css: the target css 55948c2ecf20Sopenharmony_ci * 55958c2ecf20Sopenharmony_ci * Reset the states of the mem_cgroup associated with @css. This is 55968c2ecf20Sopenharmony_ci * invoked when the userland requests disabling on the default hierarchy 55978c2ecf20Sopenharmony_ci * but the memcg is pinned through dependency. The memcg should stop 55988c2ecf20Sopenharmony_ci * applying policies and should revert to the vanilla state as it may be 55998c2ecf20Sopenharmony_ci * made visible again. 56008c2ecf20Sopenharmony_ci * 56018c2ecf20Sopenharmony_ci * The current implementation only resets the essential configurations. 56028c2ecf20Sopenharmony_ci * This needs to be expanded to cover all the visible parts. 
56038c2ecf20Sopenharmony_ci */ 56048c2ecf20Sopenharmony_cistatic void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 56058c2ecf20Sopenharmony_ci{ 56068c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 56078c2ecf20Sopenharmony_ci 56088c2ecf20Sopenharmony_ci page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 56098c2ecf20Sopenharmony_ci page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 56108c2ecf20Sopenharmony_ci page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 56118c2ecf20Sopenharmony_ci page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 56128c2ecf20Sopenharmony_ci page_counter_set_min(&memcg->memory, 0); 56138c2ecf20Sopenharmony_ci page_counter_set_low(&memcg->memory, 0); 56148c2ecf20Sopenharmony_ci page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 56158c2ecf20Sopenharmony_ci memcg->soft_limit = PAGE_COUNTER_MAX; 56168c2ecf20Sopenharmony_ci page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 56178c2ecf20Sopenharmony_ci memcg_wb_domain_size_changed(memcg); 56188c2ecf20Sopenharmony_ci} 56198c2ecf20Sopenharmony_ci 56208c2ecf20Sopenharmony_ci#ifdef CONFIG_MMU 56218c2ecf20Sopenharmony_ci/* Handlers for move charge at task migration. */ 56228c2ecf20Sopenharmony_cistatic int mem_cgroup_do_precharge(unsigned long count) 56238c2ecf20Sopenharmony_ci{ 56248c2ecf20Sopenharmony_ci int ret; 56258c2ecf20Sopenharmony_ci 56268c2ecf20Sopenharmony_ci /* Try a single bulk charge without reclaim first, kswapd may wake */ 56278c2ecf20Sopenharmony_ci ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 56288c2ecf20Sopenharmony_ci if (!ret) { 56298c2ecf20Sopenharmony_ci mc.precharge += count; 56308c2ecf20Sopenharmony_ci return ret; 56318c2ecf20Sopenharmony_ci } 56328c2ecf20Sopenharmony_ci 56338c2ecf20Sopenharmony_ci /* Try charges one by one with reclaim, but do not retry */ 56348c2ecf20Sopenharmony_ci while (count--) { 56358c2ecf20Sopenharmony_ci ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 56368c2ecf20Sopenharmony_ci if (ret) 56378c2ecf20Sopenharmony_ci return ret; 56388c2ecf20Sopenharmony_ci mc.precharge++; 56398c2ecf20Sopenharmony_ci cond_resched(); 56408c2ecf20Sopenharmony_ci } 56418c2ecf20Sopenharmony_ci return 0; 56428c2ecf20Sopenharmony_ci} 56438c2ecf20Sopenharmony_ci 56448c2ecf20Sopenharmony_ciunion mc_target { 56458c2ecf20Sopenharmony_ci struct page *page; 56468c2ecf20Sopenharmony_ci swp_entry_t ent; 56478c2ecf20Sopenharmony_ci}; 56488c2ecf20Sopenharmony_ci 56498c2ecf20Sopenharmony_cienum mc_target_type { 56508c2ecf20Sopenharmony_ci MC_TARGET_NONE = 0, 56518c2ecf20Sopenharmony_ci MC_TARGET_PAGE, 56528c2ecf20Sopenharmony_ci MC_TARGET_SWAP, 56538c2ecf20Sopenharmony_ci MC_TARGET_DEVICE, 56548c2ecf20Sopenharmony_ci}; 56558c2ecf20Sopenharmony_ci 56568c2ecf20Sopenharmony_cistatic struct page *mc_handle_present_pte(struct vm_area_struct *vma, 56578c2ecf20Sopenharmony_ci unsigned long addr, pte_t ptent) 56588c2ecf20Sopenharmony_ci{ 56598c2ecf20Sopenharmony_ci struct page *page = vm_normal_page(vma, addr, ptent); 56608c2ecf20Sopenharmony_ci 56618c2ecf20Sopenharmony_ci if (!page || !page_mapped(page)) 56628c2ecf20Sopenharmony_ci return NULL; 56638c2ecf20Sopenharmony_ci if (PageAnon(page)) { 56648c2ecf20Sopenharmony_ci if (!(mc.flags & MOVE_ANON)) 56658c2ecf20Sopenharmony_ci return NULL; 56668c2ecf20Sopenharmony_ci } else { 56678c2ecf20Sopenharmony_ci if (!(mc.flags & MOVE_FILE)) 56688c2ecf20Sopenharmony_ci return NULL; 56698c2ecf20Sopenharmony_ci } 56708c2ecf20Sopenharmony_ci if (!get_page_unless_zero(page)) 
56718c2ecf20Sopenharmony_ci return NULL; 56728c2ecf20Sopenharmony_ci 56738c2ecf20Sopenharmony_ci return page; 56748c2ecf20Sopenharmony_ci} 56758c2ecf20Sopenharmony_ci 56768c2ecf20Sopenharmony_ci#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 56778c2ecf20Sopenharmony_cistatic struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 56788c2ecf20Sopenharmony_ci pte_t ptent, swp_entry_t *entry) 56798c2ecf20Sopenharmony_ci{ 56808c2ecf20Sopenharmony_ci struct page *page = NULL; 56818c2ecf20Sopenharmony_ci swp_entry_t ent = pte_to_swp_entry(ptent); 56828c2ecf20Sopenharmony_ci 56838c2ecf20Sopenharmony_ci if (!(mc.flags & MOVE_ANON)) 56848c2ecf20Sopenharmony_ci return NULL; 56858c2ecf20Sopenharmony_ci 56868c2ecf20Sopenharmony_ci /* 56878c2ecf20Sopenharmony_ci * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to 56888c2ecf20Sopenharmony_ci * a device and because they are not accessible by CPU they are store 56898c2ecf20Sopenharmony_ci * as special swap entry in the CPU page table. 56908c2ecf20Sopenharmony_ci */ 56918c2ecf20Sopenharmony_ci if (is_device_private_entry(ent)) { 56928c2ecf20Sopenharmony_ci page = device_private_entry_to_page(ent); 56938c2ecf20Sopenharmony_ci /* 56948c2ecf20Sopenharmony_ci * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have 56958c2ecf20Sopenharmony_ci * a refcount of 1 when free (unlike normal page) 56968c2ecf20Sopenharmony_ci */ 56978c2ecf20Sopenharmony_ci if (!page_ref_add_unless(page, 1, 1)) 56988c2ecf20Sopenharmony_ci return NULL; 56998c2ecf20Sopenharmony_ci return page; 57008c2ecf20Sopenharmony_ci } 57018c2ecf20Sopenharmony_ci 57028c2ecf20Sopenharmony_ci if (non_swap_entry(ent)) 57038c2ecf20Sopenharmony_ci return NULL; 57048c2ecf20Sopenharmony_ci 57058c2ecf20Sopenharmony_ci /* 57068c2ecf20Sopenharmony_ci * Because lookup_swap_cache() updates some statistics counter, 57078c2ecf20Sopenharmony_ci * we call find_get_page() with swapper_space directly. 57088c2ecf20Sopenharmony_ci */ 57098c2ecf20Sopenharmony_ci page = find_get_page(swap_address_space(ent), swp_offset(ent)); 57108c2ecf20Sopenharmony_ci entry->val = ent.val; 57118c2ecf20Sopenharmony_ci 57128c2ecf20Sopenharmony_ci return page; 57138c2ecf20Sopenharmony_ci} 57148c2ecf20Sopenharmony_ci#else 57158c2ecf20Sopenharmony_cistatic struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 57168c2ecf20Sopenharmony_ci pte_t ptent, swp_entry_t *entry) 57178c2ecf20Sopenharmony_ci{ 57188c2ecf20Sopenharmony_ci return NULL; 57198c2ecf20Sopenharmony_ci} 57208c2ecf20Sopenharmony_ci#endif 57218c2ecf20Sopenharmony_ci 57228c2ecf20Sopenharmony_cistatic struct page *mc_handle_file_pte(struct vm_area_struct *vma, 57238c2ecf20Sopenharmony_ci unsigned long addr, pte_t ptent, swp_entry_t *entry) 57248c2ecf20Sopenharmony_ci{ 57258c2ecf20Sopenharmony_ci if (!vma->vm_file) /* anonymous vma */ 57268c2ecf20Sopenharmony_ci return NULL; 57278c2ecf20Sopenharmony_ci if (!(mc.flags & MOVE_FILE)) 57288c2ecf20Sopenharmony_ci return NULL; 57298c2ecf20Sopenharmony_ci 57308c2ecf20Sopenharmony_ci /* page is moved even if it's not RSS of this task(page-faulted). */ 57318c2ecf20Sopenharmony_ci /* shmem/tmpfs may report page out on swap: account for that too. 
*/ 57328c2ecf20Sopenharmony_ci return find_get_incore_page(vma->vm_file->f_mapping, 57338c2ecf20Sopenharmony_ci linear_page_index(vma, addr)); 57348c2ecf20Sopenharmony_ci} 57358c2ecf20Sopenharmony_ci 57368c2ecf20Sopenharmony_ci/** 57378c2ecf20Sopenharmony_ci * mem_cgroup_move_account - move account of the page 57388c2ecf20Sopenharmony_ci * @page: the page 57398c2ecf20Sopenharmony_ci * @compound: charge the page as compound or small page 57408c2ecf20Sopenharmony_ci * @from: mem_cgroup which the page is moved from. 57418c2ecf20Sopenharmony_ci * @to: mem_cgroup which the page is moved to. @from != @to. 57428c2ecf20Sopenharmony_ci * 57438c2ecf20Sopenharmony_ci * The caller must make sure the page is not on LRU (isolate_page() is useful.) 57448c2ecf20Sopenharmony_ci * 57458c2ecf20Sopenharmony_ci * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 57468c2ecf20Sopenharmony_ci * from old cgroup. 57478c2ecf20Sopenharmony_ci */ 57488c2ecf20Sopenharmony_cistatic int mem_cgroup_move_account(struct page *page, 57498c2ecf20Sopenharmony_ci bool compound, 57508c2ecf20Sopenharmony_ci struct mem_cgroup *from, 57518c2ecf20Sopenharmony_ci struct mem_cgroup *to) 57528c2ecf20Sopenharmony_ci{ 57538c2ecf20Sopenharmony_ci struct lruvec *from_vec, *to_vec; 57548c2ecf20Sopenharmony_ci struct pglist_data *pgdat; 57558c2ecf20Sopenharmony_ci unsigned int nr_pages = compound ? thp_nr_pages(page) : 1; 57568c2ecf20Sopenharmony_ci int ret; 57578c2ecf20Sopenharmony_ci 57588c2ecf20Sopenharmony_ci VM_BUG_ON(from == to); 57598c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageLRU(page), page); 57608c2ecf20Sopenharmony_ci VM_BUG_ON(compound && !PageTransHuge(page)); 57618c2ecf20Sopenharmony_ci 57628c2ecf20Sopenharmony_ci /* 57638c2ecf20Sopenharmony_ci * Prevent mem_cgroup_migrate() from looking at 57648c2ecf20Sopenharmony_ci * page->mem_cgroup of its source page while we change it. 
57658c2ecf20Sopenharmony_ci */ 57668c2ecf20Sopenharmony_ci ret = -EBUSY; 57678c2ecf20Sopenharmony_ci if (!trylock_page(page)) 57688c2ecf20Sopenharmony_ci goto out; 57698c2ecf20Sopenharmony_ci 57708c2ecf20Sopenharmony_ci ret = -EINVAL; 57718c2ecf20Sopenharmony_ci if (page->mem_cgroup != from) 57728c2ecf20Sopenharmony_ci goto out_unlock; 57738c2ecf20Sopenharmony_ci 57748c2ecf20Sopenharmony_ci pgdat = page_pgdat(page); 57758c2ecf20Sopenharmony_ci from_vec = mem_cgroup_lruvec(from, pgdat); 57768c2ecf20Sopenharmony_ci to_vec = mem_cgroup_lruvec(to, pgdat); 57778c2ecf20Sopenharmony_ci 57788c2ecf20Sopenharmony_ci lock_page_memcg(page); 57798c2ecf20Sopenharmony_ci 57808c2ecf20Sopenharmony_ci if (PageAnon(page)) { 57818c2ecf20Sopenharmony_ci if (page_mapped(page)) { 57828c2ecf20Sopenharmony_ci __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 57838c2ecf20Sopenharmony_ci __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 57848c2ecf20Sopenharmony_ci if (PageTransHuge(page)) { 57858c2ecf20Sopenharmony_ci __dec_lruvec_state(from_vec, NR_ANON_THPS); 57868c2ecf20Sopenharmony_ci __inc_lruvec_state(to_vec, NR_ANON_THPS); 57878c2ecf20Sopenharmony_ci } 57888c2ecf20Sopenharmony_ci 57898c2ecf20Sopenharmony_ci } 57908c2ecf20Sopenharmony_ci } else { 57918c2ecf20Sopenharmony_ci __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 57928c2ecf20Sopenharmony_ci __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 57938c2ecf20Sopenharmony_ci 57948c2ecf20Sopenharmony_ci if (PageSwapBacked(page)) { 57958c2ecf20Sopenharmony_ci __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 57968c2ecf20Sopenharmony_ci __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 57978c2ecf20Sopenharmony_ci } 57988c2ecf20Sopenharmony_ci 57998c2ecf20Sopenharmony_ci if (page_mapped(page)) { 58008c2ecf20Sopenharmony_ci __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 58018c2ecf20Sopenharmony_ci __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 58028c2ecf20Sopenharmony_ci } 58038c2ecf20Sopenharmony_ci 58048c2ecf20Sopenharmony_ci if (PageDirty(page)) { 58058c2ecf20Sopenharmony_ci struct address_space *mapping = page_mapping(page); 58068c2ecf20Sopenharmony_ci 58078c2ecf20Sopenharmony_ci if (mapping_can_writeback(mapping)) { 58088c2ecf20Sopenharmony_ci __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 58098c2ecf20Sopenharmony_ci -nr_pages); 58108c2ecf20Sopenharmony_ci __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 58118c2ecf20Sopenharmony_ci nr_pages); 58128c2ecf20Sopenharmony_ci } 58138c2ecf20Sopenharmony_ci } 58148c2ecf20Sopenharmony_ci } 58158c2ecf20Sopenharmony_ci 58168c2ecf20Sopenharmony_ci if (PageWriteback(page)) { 58178c2ecf20Sopenharmony_ci __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 58188c2ecf20Sopenharmony_ci __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 58198c2ecf20Sopenharmony_ci } 58208c2ecf20Sopenharmony_ci 58218c2ecf20Sopenharmony_ci /* 58228c2ecf20Sopenharmony_ci * All state has been migrated, let's switch to the new memcg. 58238c2ecf20Sopenharmony_ci * 58248c2ecf20Sopenharmony_ci * It is safe to change page->mem_cgroup here because the page 58258c2ecf20Sopenharmony_ci * is referenced, charged, isolated, and locked: we can't race 58268c2ecf20Sopenharmony_ci * with (un)charging, migration, LRU putback, or anything else 58278c2ecf20Sopenharmony_ci * that would rely on a stable page->mem_cgroup. 58288c2ecf20Sopenharmony_ci * 58298c2ecf20Sopenharmony_ci * Note that lock_page_memcg is a memcg lock, not a page lock, 58308c2ecf20Sopenharmony_ci * to save space. 
As soon as we switch page->mem_cgroup to a 58318c2ecf20Sopenharmony_ci * new memcg that isn't locked, the above state can change 58328c2ecf20Sopenharmony_ci * concurrently again. Make sure we're truly done with it. 58338c2ecf20Sopenharmony_ci */ 58348c2ecf20Sopenharmony_ci smp_mb(); 58358c2ecf20Sopenharmony_ci 58368c2ecf20Sopenharmony_ci css_get(&to->css); 58378c2ecf20Sopenharmony_ci css_put(&from->css); 58388c2ecf20Sopenharmony_ci 58398c2ecf20Sopenharmony_ci page->mem_cgroup = to; 58408c2ecf20Sopenharmony_ci 58418c2ecf20Sopenharmony_ci __unlock_page_memcg(from); 58428c2ecf20Sopenharmony_ci 58438c2ecf20Sopenharmony_ci ret = 0; 58448c2ecf20Sopenharmony_ci 58458c2ecf20Sopenharmony_ci local_irq_disable(); 58468c2ecf20Sopenharmony_ci mem_cgroup_charge_statistics(to, page, nr_pages); 58478c2ecf20Sopenharmony_ci memcg_check_events(to, page); 58488c2ecf20Sopenharmony_ci mem_cgroup_charge_statistics(from, page, -nr_pages); 58498c2ecf20Sopenharmony_ci memcg_check_events(from, page); 58508c2ecf20Sopenharmony_ci local_irq_enable(); 58518c2ecf20Sopenharmony_ciout_unlock: 58528c2ecf20Sopenharmony_ci unlock_page(page); 58538c2ecf20Sopenharmony_ciout: 58548c2ecf20Sopenharmony_ci return ret; 58558c2ecf20Sopenharmony_ci} 58568c2ecf20Sopenharmony_ci 58578c2ecf20Sopenharmony_ci/** 58588c2ecf20Sopenharmony_ci * get_mctgt_type - get target type of moving charge 58598c2ecf20Sopenharmony_ci * @vma: the vma that the pte to be checked belongs to 58608c2ecf20Sopenharmony_ci * @addr: the address corresponding to the pte to be checked 58618c2ecf20Sopenharmony_ci * @ptent: the pte to be checked 58628c2ecf20Sopenharmony_ci * @target: the pointer where the target page or swap entry will be stored (can be NULL) 58638c2ecf20Sopenharmony_ci * 58648c2ecf20Sopenharmony_ci * Returns 58658c2ecf20Sopenharmony_ci * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 58668c2ecf20Sopenharmony_ci * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 58678c2ecf20Sopenharmony_ci * move charge. if @target is not NULL, the page is stored in target->page 58688c2ecf20Sopenharmony_ci * with an extra refcount taken (callers should handle it). 58698c2ecf20Sopenharmony_ci * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 58708c2ecf20Sopenharmony_ci * target for charge migration. if @target is not NULL, the entry is stored 58718c2ecf20Sopenharmony_ci * in target->ent. 58728c2ecf20Sopenharmony_ci * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE 58738c2ecf20Sopenharmony_ci * (so a ZONE_DEVICE page and thus not on the lru). 58748c2ecf20Sopenharmony_ci * For now such a page is charged like a regular page would be, as for all 58758c2ecf20Sopenharmony_ci * intents and purposes it is just special memory taking the place of a 58768c2ecf20Sopenharmony_ci * regular page. 58778c2ecf20Sopenharmony_ci * 58788c2ecf20Sopenharmony_ci * See Documentation/vm/hmm.rst and include/linux/hmm.h 58798c2ecf20Sopenharmony_ci * 58808c2ecf20Sopenharmony_ci * Called with pte lock held.
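 *
 * Illustrative caller pattern (a condensed sketch of what
 * mem_cgroup_move_charge_pte_range() below does with the result):
 *
 *	union mc_target target;
 *
 *	switch (get_mctgt_type(vma, addr, ptent, &target)) {
 *	case MC_TARGET_PAGE:
 *		page = target.page;
 *		... move the charge for page ...
 *		put_page(page);		(drop the reference taken for the caller)
 *		break;
 *	case MC_TARGET_SWAP:
 *		... move the swap charge for target.ent ...
 *		break;
 *	default:
 *		break;
 *	}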
58818c2ecf20Sopenharmony_ci */ 58828c2ecf20Sopenharmony_ci 58838c2ecf20Sopenharmony_cistatic enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 58848c2ecf20Sopenharmony_ci unsigned long addr, pte_t ptent, union mc_target *target) 58858c2ecf20Sopenharmony_ci{ 58868c2ecf20Sopenharmony_ci struct page *page = NULL; 58878c2ecf20Sopenharmony_ci enum mc_target_type ret = MC_TARGET_NONE; 58888c2ecf20Sopenharmony_ci swp_entry_t ent = { .val = 0 }; 58898c2ecf20Sopenharmony_ci 58908c2ecf20Sopenharmony_ci if (pte_present(ptent)) 58918c2ecf20Sopenharmony_ci page = mc_handle_present_pte(vma, addr, ptent); 58928c2ecf20Sopenharmony_ci else if (is_swap_pte(ptent)) 58938c2ecf20Sopenharmony_ci page = mc_handle_swap_pte(vma, ptent, &ent); 58948c2ecf20Sopenharmony_ci else if (pte_none(ptent)) 58958c2ecf20Sopenharmony_ci page = mc_handle_file_pte(vma, addr, ptent, &ent); 58968c2ecf20Sopenharmony_ci 58978c2ecf20Sopenharmony_ci if (!page && !ent.val) 58988c2ecf20Sopenharmony_ci return ret; 58998c2ecf20Sopenharmony_ci if (page) { 59008c2ecf20Sopenharmony_ci /* 59018c2ecf20Sopenharmony_ci * Do only loose check w/o serialization. 59028c2ecf20Sopenharmony_ci * mem_cgroup_move_account() checks the page is valid or 59038c2ecf20Sopenharmony_ci * not under LRU exclusion. 59048c2ecf20Sopenharmony_ci */ 59058c2ecf20Sopenharmony_ci if (page->mem_cgroup == mc.from) { 59068c2ecf20Sopenharmony_ci ret = MC_TARGET_PAGE; 59078c2ecf20Sopenharmony_ci if (is_device_private_page(page)) 59088c2ecf20Sopenharmony_ci ret = MC_TARGET_DEVICE; 59098c2ecf20Sopenharmony_ci if (target) 59108c2ecf20Sopenharmony_ci target->page = page; 59118c2ecf20Sopenharmony_ci } 59128c2ecf20Sopenharmony_ci if (!ret || !target) 59138c2ecf20Sopenharmony_ci put_page(page); 59148c2ecf20Sopenharmony_ci } 59158c2ecf20Sopenharmony_ci /* 59168c2ecf20Sopenharmony_ci * There is a swap entry and a page doesn't exist or isn't charged. 59178c2ecf20Sopenharmony_ci * But we cannot move a tail-page in a THP. 59188c2ecf20Sopenharmony_ci */ 59198c2ecf20Sopenharmony_ci if (ent.val && !ret && (!page || !PageTransCompound(page)) && 59208c2ecf20Sopenharmony_ci mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 59218c2ecf20Sopenharmony_ci ret = MC_TARGET_SWAP; 59228c2ecf20Sopenharmony_ci if (target) 59238c2ecf20Sopenharmony_ci target->ent = ent; 59248c2ecf20Sopenharmony_ci } 59258c2ecf20Sopenharmony_ci return ret; 59268c2ecf20Sopenharmony_ci} 59278c2ecf20Sopenharmony_ci 59288c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 59298c2ecf20Sopenharmony_ci/* 59308c2ecf20Sopenharmony_ci * We don't consider PMD mapped swapping or file mapped pages because THP does 59318c2ecf20Sopenharmony_ci * not support them for now. 59328c2ecf20Sopenharmony_ci * Caller should make sure that pmd_trans_huge(pmd) is true. 
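 * (In this file that is ensured by taking pmd_trans_huge_lock() first, as
 * both mem_cgroup_count_precharge_pte_range() and
 * mem_cgroup_move_charge_pte_range() do before calling this helper.)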
59338c2ecf20Sopenharmony_ci */ 59348c2ecf20Sopenharmony_cistatic enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 59358c2ecf20Sopenharmony_ci unsigned long addr, pmd_t pmd, union mc_target *target) 59368c2ecf20Sopenharmony_ci{ 59378c2ecf20Sopenharmony_ci struct page *page = NULL; 59388c2ecf20Sopenharmony_ci enum mc_target_type ret = MC_TARGET_NONE; 59398c2ecf20Sopenharmony_ci 59408c2ecf20Sopenharmony_ci if (unlikely(is_swap_pmd(pmd))) { 59418c2ecf20Sopenharmony_ci VM_BUG_ON(thp_migration_supported() && 59428c2ecf20Sopenharmony_ci !is_pmd_migration_entry(pmd)); 59438c2ecf20Sopenharmony_ci return ret; 59448c2ecf20Sopenharmony_ci } 59458c2ecf20Sopenharmony_ci page = pmd_page(pmd); 59468c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!page || !PageHead(page), page); 59478c2ecf20Sopenharmony_ci if (!(mc.flags & MOVE_ANON)) 59488c2ecf20Sopenharmony_ci return ret; 59498c2ecf20Sopenharmony_ci if (page->mem_cgroup == mc.from) { 59508c2ecf20Sopenharmony_ci ret = MC_TARGET_PAGE; 59518c2ecf20Sopenharmony_ci if (target) { 59528c2ecf20Sopenharmony_ci get_page(page); 59538c2ecf20Sopenharmony_ci target->page = page; 59548c2ecf20Sopenharmony_ci } 59558c2ecf20Sopenharmony_ci } 59568c2ecf20Sopenharmony_ci return ret; 59578c2ecf20Sopenharmony_ci} 59588c2ecf20Sopenharmony_ci#else 59598c2ecf20Sopenharmony_cistatic inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 59608c2ecf20Sopenharmony_ci unsigned long addr, pmd_t pmd, union mc_target *target) 59618c2ecf20Sopenharmony_ci{ 59628c2ecf20Sopenharmony_ci return MC_TARGET_NONE; 59638c2ecf20Sopenharmony_ci} 59648c2ecf20Sopenharmony_ci#endif 59658c2ecf20Sopenharmony_ci 59668c2ecf20Sopenharmony_cistatic int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 59678c2ecf20Sopenharmony_ci unsigned long addr, unsigned long end, 59688c2ecf20Sopenharmony_ci struct mm_walk *walk) 59698c2ecf20Sopenharmony_ci{ 59708c2ecf20Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 59718c2ecf20Sopenharmony_ci pte_t *pte; 59728c2ecf20Sopenharmony_ci spinlock_t *ptl; 59738c2ecf20Sopenharmony_ci 59748c2ecf20Sopenharmony_ci ptl = pmd_trans_huge_lock(pmd, vma); 59758c2ecf20Sopenharmony_ci if (ptl) { 59768c2ecf20Sopenharmony_ci /* 59778c2ecf20Sopenharmony_ci * Note their can not be MC_TARGET_DEVICE for now as we do not 59788c2ecf20Sopenharmony_ci * support transparent huge page with MEMORY_DEVICE_PRIVATE but 59798c2ecf20Sopenharmony_ci * this might change. 
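 *
 * For example, a PMD-mapped THP found here adds HPAGE_PMD_NR to
 * mc.precharge in one step (512 with 4 KiB base pages and 2 MiB PMDs),
 * while the pte loop below counts ordinary pages one at a time.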
59808c2ecf20Sopenharmony_ci */ 59818c2ecf20Sopenharmony_ci if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 59828c2ecf20Sopenharmony_ci mc.precharge += HPAGE_PMD_NR; 59838c2ecf20Sopenharmony_ci spin_unlock(ptl); 59848c2ecf20Sopenharmony_ci return 0; 59858c2ecf20Sopenharmony_ci } 59868c2ecf20Sopenharmony_ci 59878c2ecf20Sopenharmony_ci if (pmd_trans_unstable(pmd)) 59888c2ecf20Sopenharmony_ci return 0; 59898c2ecf20Sopenharmony_ci pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 59908c2ecf20Sopenharmony_ci for (; addr != end; pte++, addr += PAGE_SIZE) 59918c2ecf20Sopenharmony_ci if (get_mctgt_type(vma, addr, *pte, NULL)) 59928c2ecf20Sopenharmony_ci mc.precharge++; /* increment precharge temporarily */ 59938c2ecf20Sopenharmony_ci pte_unmap_unlock(pte - 1, ptl); 59948c2ecf20Sopenharmony_ci cond_resched(); 59958c2ecf20Sopenharmony_ci 59968c2ecf20Sopenharmony_ci return 0; 59978c2ecf20Sopenharmony_ci} 59988c2ecf20Sopenharmony_ci 59998c2ecf20Sopenharmony_cistatic const struct mm_walk_ops precharge_walk_ops = { 60008c2ecf20Sopenharmony_ci .pmd_entry = mem_cgroup_count_precharge_pte_range, 60018c2ecf20Sopenharmony_ci}; 60028c2ecf20Sopenharmony_ci 60038c2ecf20Sopenharmony_cistatic unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 60048c2ecf20Sopenharmony_ci{ 60058c2ecf20Sopenharmony_ci unsigned long precharge; 60068c2ecf20Sopenharmony_ci 60078c2ecf20Sopenharmony_ci mmap_read_lock(mm); 60088c2ecf20Sopenharmony_ci walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); 60098c2ecf20Sopenharmony_ci mmap_read_unlock(mm); 60108c2ecf20Sopenharmony_ci 60118c2ecf20Sopenharmony_ci precharge = mc.precharge; 60128c2ecf20Sopenharmony_ci mc.precharge = 0; 60138c2ecf20Sopenharmony_ci 60148c2ecf20Sopenharmony_ci return precharge; 60158c2ecf20Sopenharmony_ci} 60168c2ecf20Sopenharmony_ci 60178c2ecf20Sopenharmony_cistatic int mem_cgroup_precharge_mc(struct mm_struct *mm) 60188c2ecf20Sopenharmony_ci{ 60198c2ecf20Sopenharmony_ci unsigned long precharge = mem_cgroup_count_precharge(mm); 60208c2ecf20Sopenharmony_ci 60218c2ecf20Sopenharmony_ci VM_BUG_ON(mc.moving_task); 60228c2ecf20Sopenharmony_ci mc.moving_task = current; 60238c2ecf20Sopenharmony_ci return mem_cgroup_do_precharge(precharge); 60248c2ecf20Sopenharmony_ci} 60258c2ecf20Sopenharmony_ci 60268c2ecf20Sopenharmony_ci/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 60278c2ecf20Sopenharmony_cistatic void __mem_cgroup_clear_mc(void) 60288c2ecf20Sopenharmony_ci{ 60298c2ecf20Sopenharmony_ci struct mem_cgroup *from = mc.from; 60308c2ecf20Sopenharmony_ci struct mem_cgroup *to = mc.to; 60318c2ecf20Sopenharmony_ci 60328c2ecf20Sopenharmony_ci /* we must uncharge all the leftover precharges from mc.to */ 60338c2ecf20Sopenharmony_ci if (mc.precharge) { 60348c2ecf20Sopenharmony_ci cancel_charge(mc.to, mc.precharge); 60358c2ecf20Sopenharmony_ci mc.precharge = 0; 60368c2ecf20Sopenharmony_ci } 60378c2ecf20Sopenharmony_ci /* 60388c2ecf20Sopenharmony_ci * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 60398c2ecf20Sopenharmony_ci * we must uncharge here. 
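 *
 * For example, if ten pages were moved, mc.moved_charge is 10 and the
 * ten page charges still held against mc.from are dropped right below
 * via cancel_charge(mc.from, mc.moved_charge).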
60408c2ecf20Sopenharmony_ci */ 60418c2ecf20Sopenharmony_ci if (mc.moved_charge) { 60428c2ecf20Sopenharmony_ci cancel_charge(mc.from, mc.moved_charge); 60438c2ecf20Sopenharmony_ci mc.moved_charge = 0; 60448c2ecf20Sopenharmony_ci } 60458c2ecf20Sopenharmony_ci /* we must fixup refcnts and charges */ 60468c2ecf20Sopenharmony_ci if (mc.moved_swap) { 60478c2ecf20Sopenharmony_ci /* uncharge swap account from the old cgroup */ 60488c2ecf20Sopenharmony_ci if (!mem_cgroup_is_root(mc.from)) 60498c2ecf20Sopenharmony_ci page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 60508c2ecf20Sopenharmony_ci 60518c2ecf20Sopenharmony_ci mem_cgroup_id_put_many(mc.from, mc.moved_swap); 60528c2ecf20Sopenharmony_ci 60538c2ecf20Sopenharmony_ci /* 60548c2ecf20Sopenharmony_ci * we charged both to->memory and to->memsw, so we 60558c2ecf20Sopenharmony_ci * should uncharge to->memory. 60568c2ecf20Sopenharmony_ci */ 60578c2ecf20Sopenharmony_ci if (!mem_cgroup_is_root(mc.to)) 60588c2ecf20Sopenharmony_ci page_counter_uncharge(&mc.to->memory, mc.moved_swap); 60598c2ecf20Sopenharmony_ci 60608c2ecf20Sopenharmony_ci mc.moved_swap = 0; 60618c2ecf20Sopenharmony_ci } 60628c2ecf20Sopenharmony_ci memcg_oom_recover(from); 60638c2ecf20Sopenharmony_ci memcg_oom_recover(to); 60648c2ecf20Sopenharmony_ci wake_up_all(&mc.waitq); 60658c2ecf20Sopenharmony_ci} 60668c2ecf20Sopenharmony_ci 60678c2ecf20Sopenharmony_cistatic void mem_cgroup_clear_mc(void) 60688c2ecf20Sopenharmony_ci{ 60698c2ecf20Sopenharmony_ci struct mm_struct *mm = mc.mm; 60708c2ecf20Sopenharmony_ci 60718c2ecf20Sopenharmony_ci /* 60728c2ecf20Sopenharmony_ci * we must clear moving_task before waking up waiters at the end of 60738c2ecf20Sopenharmony_ci * task migration. 60748c2ecf20Sopenharmony_ci */ 60758c2ecf20Sopenharmony_ci mc.moving_task = NULL; 60768c2ecf20Sopenharmony_ci __mem_cgroup_clear_mc(); 60778c2ecf20Sopenharmony_ci spin_lock(&mc.lock); 60788c2ecf20Sopenharmony_ci mc.from = NULL; 60798c2ecf20Sopenharmony_ci mc.to = NULL; 60808c2ecf20Sopenharmony_ci mc.mm = NULL; 60818c2ecf20Sopenharmony_ci spin_unlock(&mc.lock); 60828c2ecf20Sopenharmony_ci 60838c2ecf20Sopenharmony_ci mmput(mm); 60848c2ecf20Sopenharmony_ci} 60858c2ecf20Sopenharmony_ci 60868c2ecf20Sopenharmony_cistatic int mem_cgroup_can_attach(struct cgroup_taskset *tset) 60878c2ecf20Sopenharmony_ci{ 60888c2ecf20Sopenharmony_ci struct cgroup_subsys_state *css; 60898c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 60908c2ecf20Sopenharmony_ci struct mem_cgroup *from; 60918c2ecf20Sopenharmony_ci struct task_struct *leader, *p; 60928c2ecf20Sopenharmony_ci struct mm_struct *mm; 60938c2ecf20Sopenharmony_ci unsigned long move_flags; 60948c2ecf20Sopenharmony_ci int ret = 0; 60958c2ecf20Sopenharmony_ci 60968c2ecf20Sopenharmony_ci /* charge immigration isn't supported on the default hierarchy */ 60978c2ecf20Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 60988c2ecf20Sopenharmony_ci return 0; 60998c2ecf20Sopenharmony_ci 61008c2ecf20Sopenharmony_ci /* 61018c2ecf20Sopenharmony_ci * Multi-process migrations only happen on the default hierarchy 61028c2ecf20Sopenharmony_ci * where charge immigration is not used. Perform charge 61038c2ecf20Sopenharmony_ci * immigration if @tset contains a leader and whine if there are 61048c2ecf20Sopenharmony_ci * multiple. 
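 *
 * (On the legacy hierarchy, where this code is reachable, each migration
 * carries at most one thread-group leader, so finding a second leader in
 * @tset would indicate a bug - hence the WARN_ON_ONCE() below.)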
61058c2ecf20Sopenharmony_ci */ 61068c2ecf20Sopenharmony_ci p = NULL; 61078c2ecf20Sopenharmony_ci cgroup_taskset_for_each_leader(leader, css, tset) { 61088c2ecf20Sopenharmony_ci WARN_ON_ONCE(p); 61098c2ecf20Sopenharmony_ci p = leader; 61108c2ecf20Sopenharmony_ci memcg = mem_cgroup_from_css(css); 61118c2ecf20Sopenharmony_ci } 61128c2ecf20Sopenharmony_ci if (!p) 61138c2ecf20Sopenharmony_ci return 0; 61148c2ecf20Sopenharmony_ci 61158c2ecf20Sopenharmony_ci /* 61168c2ecf20Sopenharmony_ci * We are now commited to this value whatever it is. Changes in this 61178c2ecf20Sopenharmony_ci * tunable will only affect upcoming migrations, not the current one. 61188c2ecf20Sopenharmony_ci * So we need to save it, and keep it going. 61198c2ecf20Sopenharmony_ci */ 61208c2ecf20Sopenharmony_ci move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 61218c2ecf20Sopenharmony_ci if (!move_flags) 61228c2ecf20Sopenharmony_ci return 0; 61238c2ecf20Sopenharmony_ci 61248c2ecf20Sopenharmony_ci from = mem_cgroup_from_task(p); 61258c2ecf20Sopenharmony_ci 61268c2ecf20Sopenharmony_ci VM_BUG_ON(from == memcg); 61278c2ecf20Sopenharmony_ci 61288c2ecf20Sopenharmony_ci mm = get_task_mm(p); 61298c2ecf20Sopenharmony_ci if (!mm) 61308c2ecf20Sopenharmony_ci return 0; 61318c2ecf20Sopenharmony_ci /* We move charges only when we move a owner of the mm */ 61328c2ecf20Sopenharmony_ci if (mm->owner == p) { 61338c2ecf20Sopenharmony_ci VM_BUG_ON(mc.from); 61348c2ecf20Sopenharmony_ci VM_BUG_ON(mc.to); 61358c2ecf20Sopenharmony_ci VM_BUG_ON(mc.precharge); 61368c2ecf20Sopenharmony_ci VM_BUG_ON(mc.moved_charge); 61378c2ecf20Sopenharmony_ci VM_BUG_ON(mc.moved_swap); 61388c2ecf20Sopenharmony_ci 61398c2ecf20Sopenharmony_ci spin_lock(&mc.lock); 61408c2ecf20Sopenharmony_ci mc.mm = mm; 61418c2ecf20Sopenharmony_ci mc.from = from; 61428c2ecf20Sopenharmony_ci mc.to = memcg; 61438c2ecf20Sopenharmony_ci mc.flags = move_flags; 61448c2ecf20Sopenharmony_ci spin_unlock(&mc.lock); 61458c2ecf20Sopenharmony_ci /* We set mc.moving_task later */ 61468c2ecf20Sopenharmony_ci 61478c2ecf20Sopenharmony_ci ret = mem_cgroup_precharge_mc(mm); 61488c2ecf20Sopenharmony_ci if (ret) 61498c2ecf20Sopenharmony_ci mem_cgroup_clear_mc(); 61508c2ecf20Sopenharmony_ci } else { 61518c2ecf20Sopenharmony_ci mmput(mm); 61528c2ecf20Sopenharmony_ci } 61538c2ecf20Sopenharmony_ci return ret; 61548c2ecf20Sopenharmony_ci} 61558c2ecf20Sopenharmony_ci 61568c2ecf20Sopenharmony_cistatic void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 61578c2ecf20Sopenharmony_ci{ 61588c2ecf20Sopenharmony_ci if (mc.to) 61598c2ecf20Sopenharmony_ci mem_cgroup_clear_mc(); 61608c2ecf20Sopenharmony_ci} 61618c2ecf20Sopenharmony_ci 61628c2ecf20Sopenharmony_cistatic int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 61638c2ecf20Sopenharmony_ci unsigned long addr, unsigned long end, 61648c2ecf20Sopenharmony_ci struct mm_walk *walk) 61658c2ecf20Sopenharmony_ci{ 61668c2ecf20Sopenharmony_ci int ret = 0; 61678c2ecf20Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 61688c2ecf20Sopenharmony_ci pte_t *pte; 61698c2ecf20Sopenharmony_ci spinlock_t *ptl; 61708c2ecf20Sopenharmony_ci enum mc_target_type target_type; 61718c2ecf20Sopenharmony_ci union mc_target target; 61728c2ecf20Sopenharmony_ci struct page *page; 61738c2ecf20Sopenharmony_ci 61748c2ecf20Sopenharmony_ci ptl = pmd_trans_huge_lock(pmd, vma); 61758c2ecf20Sopenharmony_ci if (ptl) { 61768c2ecf20Sopenharmony_ci if (mc.precharge < HPAGE_PMD_NR) { 61778c2ecf20Sopenharmony_ci spin_unlock(ptl); 61788c2ecf20Sopenharmony_ci return 0; 61798c2ecf20Sopenharmony_ci } 
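		/*
		 * mc.precharge is known to cover a whole THP at this point
		 * (checked above), so the huge page can be moved in one go
		 * without splitting it.
		 */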
61808c2ecf20Sopenharmony_ci target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 61818c2ecf20Sopenharmony_ci if (target_type == MC_TARGET_PAGE) { 61828c2ecf20Sopenharmony_ci page = target.page; 61838c2ecf20Sopenharmony_ci if (!isolate_lru_page(page)) { 61848c2ecf20Sopenharmony_ci if (!mem_cgroup_move_account(page, true, 61858c2ecf20Sopenharmony_ci mc.from, mc.to)) { 61868c2ecf20Sopenharmony_ci mc.precharge -= HPAGE_PMD_NR; 61878c2ecf20Sopenharmony_ci mc.moved_charge += HPAGE_PMD_NR; 61888c2ecf20Sopenharmony_ci } 61898c2ecf20Sopenharmony_ci putback_lru_page(page); 61908c2ecf20Sopenharmony_ci } 61918c2ecf20Sopenharmony_ci put_page(page); 61928c2ecf20Sopenharmony_ci } else if (target_type == MC_TARGET_DEVICE) { 61938c2ecf20Sopenharmony_ci page = target.page; 61948c2ecf20Sopenharmony_ci if (!mem_cgroup_move_account(page, true, 61958c2ecf20Sopenharmony_ci mc.from, mc.to)) { 61968c2ecf20Sopenharmony_ci mc.precharge -= HPAGE_PMD_NR; 61978c2ecf20Sopenharmony_ci mc.moved_charge += HPAGE_PMD_NR; 61988c2ecf20Sopenharmony_ci } 61998c2ecf20Sopenharmony_ci put_page(page); 62008c2ecf20Sopenharmony_ci } 62018c2ecf20Sopenharmony_ci spin_unlock(ptl); 62028c2ecf20Sopenharmony_ci return 0; 62038c2ecf20Sopenharmony_ci } 62048c2ecf20Sopenharmony_ci 62058c2ecf20Sopenharmony_ci if (pmd_trans_unstable(pmd)) 62068c2ecf20Sopenharmony_ci return 0; 62078c2ecf20Sopenharmony_ciretry: 62088c2ecf20Sopenharmony_ci pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 62098c2ecf20Sopenharmony_ci for (; addr != end; addr += PAGE_SIZE) { 62108c2ecf20Sopenharmony_ci pte_t ptent = *(pte++); 62118c2ecf20Sopenharmony_ci bool device = false; 62128c2ecf20Sopenharmony_ci swp_entry_t ent; 62138c2ecf20Sopenharmony_ci 62148c2ecf20Sopenharmony_ci if (!mc.precharge) 62158c2ecf20Sopenharmony_ci break; 62168c2ecf20Sopenharmony_ci 62178c2ecf20Sopenharmony_ci switch (get_mctgt_type(vma, addr, ptent, &target)) { 62188c2ecf20Sopenharmony_ci case MC_TARGET_DEVICE: 62198c2ecf20Sopenharmony_ci device = true; 62208c2ecf20Sopenharmony_ci fallthrough; 62218c2ecf20Sopenharmony_ci case MC_TARGET_PAGE: 62228c2ecf20Sopenharmony_ci page = target.page; 62238c2ecf20Sopenharmony_ci /* 62248c2ecf20Sopenharmony_ci * We can have a part of the split pmd here. Moving it 62258c2ecf20Sopenharmony_ci * can be done but it would be too convoluted so simply 62268c2ecf20Sopenharmony_ci * ignore such a partial THP and keep it in original 62278c2ecf20Sopenharmony_ci * memcg. There should be somebody mapping the head. 62288c2ecf20Sopenharmony_ci */ 62298c2ecf20Sopenharmony_ci if (PageTransCompound(page)) 62308c2ecf20Sopenharmony_ci goto put; 62318c2ecf20Sopenharmony_ci if (!device && isolate_lru_page(page)) 62328c2ecf20Sopenharmony_ci goto put; 62338c2ecf20Sopenharmony_ci if (!mem_cgroup_move_account(page, false, 62348c2ecf20Sopenharmony_ci mc.from, mc.to)) { 62358c2ecf20Sopenharmony_ci mc.precharge--; 62368c2ecf20Sopenharmony_ci /* we uncharge from mc.from later. 
*/ 62378c2ecf20Sopenharmony_ci mc.moved_charge++; 62388c2ecf20Sopenharmony_ci } 62398c2ecf20Sopenharmony_ci if (!device) 62408c2ecf20Sopenharmony_ci putback_lru_page(page); 62418c2ecf20Sopenharmony_ciput: /* get_mctgt_type() gets the page */ 62428c2ecf20Sopenharmony_ci put_page(page); 62438c2ecf20Sopenharmony_ci break; 62448c2ecf20Sopenharmony_ci case MC_TARGET_SWAP: 62458c2ecf20Sopenharmony_ci ent = target.ent; 62468c2ecf20Sopenharmony_ci if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 62478c2ecf20Sopenharmony_ci mc.precharge--; 62488c2ecf20Sopenharmony_ci mem_cgroup_id_get_many(mc.to, 1); 62498c2ecf20Sopenharmony_ci /* we fixup other refcnts and charges later. */ 62508c2ecf20Sopenharmony_ci mc.moved_swap++; 62518c2ecf20Sopenharmony_ci } 62528c2ecf20Sopenharmony_ci break; 62538c2ecf20Sopenharmony_ci default: 62548c2ecf20Sopenharmony_ci break; 62558c2ecf20Sopenharmony_ci } 62568c2ecf20Sopenharmony_ci } 62578c2ecf20Sopenharmony_ci pte_unmap_unlock(pte - 1, ptl); 62588c2ecf20Sopenharmony_ci cond_resched(); 62598c2ecf20Sopenharmony_ci 62608c2ecf20Sopenharmony_ci if (addr != end) { 62618c2ecf20Sopenharmony_ci /* 62628c2ecf20Sopenharmony_ci * We have consumed all precharges we got in can_attach(). 62638c2ecf20Sopenharmony_ci * We try charge one by one, but don't do any additional 62648c2ecf20Sopenharmony_ci * charges to mc.to if we have failed in charge once in attach() 62658c2ecf20Sopenharmony_ci * phase. 62668c2ecf20Sopenharmony_ci */ 62678c2ecf20Sopenharmony_ci ret = mem_cgroup_do_precharge(1); 62688c2ecf20Sopenharmony_ci if (!ret) 62698c2ecf20Sopenharmony_ci goto retry; 62708c2ecf20Sopenharmony_ci } 62718c2ecf20Sopenharmony_ci 62728c2ecf20Sopenharmony_ci return ret; 62738c2ecf20Sopenharmony_ci} 62748c2ecf20Sopenharmony_ci 62758c2ecf20Sopenharmony_cistatic const struct mm_walk_ops charge_walk_ops = { 62768c2ecf20Sopenharmony_ci .pmd_entry = mem_cgroup_move_charge_pte_range, 62778c2ecf20Sopenharmony_ci}; 62788c2ecf20Sopenharmony_ci 62798c2ecf20Sopenharmony_cistatic void mem_cgroup_move_charge(void) 62808c2ecf20Sopenharmony_ci{ 62818c2ecf20Sopenharmony_ci lru_add_drain_all(); 62828c2ecf20Sopenharmony_ci /* 62838c2ecf20Sopenharmony_ci * Signal lock_page_memcg() to take the memcg's move_lock 62848c2ecf20Sopenharmony_ci * while we're moving its pages to another memcg. Then wait 62858c2ecf20Sopenharmony_ci * for already started RCU-only updates to finish. 62868c2ecf20Sopenharmony_ci */ 62878c2ecf20Sopenharmony_ci atomic_inc(&mc.from->moving_account); 62888c2ecf20Sopenharmony_ci synchronize_rcu(); 62898c2ecf20Sopenharmony_ciretry: 62908c2ecf20Sopenharmony_ci if (unlikely(!mmap_read_trylock(mc.mm))) { 62918c2ecf20Sopenharmony_ci /* 62928c2ecf20Sopenharmony_ci * Someone who are holding the mmap_lock might be waiting in 62938c2ecf20Sopenharmony_ci * waitq. So we cancel all extra charges, wake up all waiters, 62948c2ecf20Sopenharmony_ci * and retry. Because we cancel precharges, we might not be able 62958c2ecf20Sopenharmony_ci * to move enough charges, but moving charge is a best-effort 62968c2ecf20Sopenharmony_ci * feature anyway, so it wouldn't be a big problem. 62978c2ecf20Sopenharmony_ci */ 62988c2ecf20Sopenharmony_ci __mem_cgroup_clear_mc(); 62998c2ecf20Sopenharmony_ci cond_resched(); 63008c2ecf20Sopenharmony_ci goto retry; 63018c2ecf20Sopenharmony_ci } 63028c2ecf20Sopenharmony_ci /* 63038c2ecf20Sopenharmony_ci * When we have consumed all precharges and failed in doing 63048c2ecf20Sopenharmony_ci * additional charge, the page walk just aborts. 
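 *
 * (Concretely, mem_cgroup_move_charge_pte_range() returns the error from
 * mem_cgroup_do_precharge(1), and walk_page_range() stops at the first
 * non-zero return value.)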
63058c2ecf20Sopenharmony_ci */ 63068c2ecf20Sopenharmony_ci walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, 63078c2ecf20Sopenharmony_ci NULL); 63088c2ecf20Sopenharmony_ci 63098c2ecf20Sopenharmony_ci mmap_read_unlock(mc.mm); 63108c2ecf20Sopenharmony_ci atomic_dec(&mc.from->moving_account); 63118c2ecf20Sopenharmony_ci} 63128c2ecf20Sopenharmony_ci 63138c2ecf20Sopenharmony_cistatic void mem_cgroup_move_task(void) 63148c2ecf20Sopenharmony_ci{ 63158c2ecf20Sopenharmony_ci if (mc.to) { 63168c2ecf20Sopenharmony_ci mem_cgroup_move_charge(); 63178c2ecf20Sopenharmony_ci mem_cgroup_clear_mc(); 63188c2ecf20Sopenharmony_ci } 63198c2ecf20Sopenharmony_ci} 63208c2ecf20Sopenharmony_ci#else /* !CONFIG_MMU */ 63218c2ecf20Sopenharmony_cistatic int mem_cgroup_can_attach(struct cgroup_taskset *tset) 63228c2ecf20Sopenharmony_ci{ 63238c2ecf20Sopenharmony_ci return 0; 63248c2ecf20Sopenharmony_ci} 63258c2ecf20Sopenharmony_cistatic void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 63268c2ecf20Sopenharmony_ci{ 63278c2ecf20Sopenharmony_ci} 63288c2ecf20Sopenharmony_cistatic void mem_cgroup_move_task(void) 63298c2ecf20Sopenharmony_ci{ 63308c2ecf20Sopenharmony_ci} 63318c2ecf20Sopenharmony_ci#endif 63328c2ecf20Sopenharmony_ci 63338c2ecf20Sopenharmony_ci/* 63348c2ecf20Sopenharmony_ci * Cgroup retains root cgroups across [un]mount cycles making it necessary 63358c2ecf20Sopenharmony_ci * to verify whether we're attached to the default hierarchy on each mount 63368c2ecf20Sopenharmony_ci * attempt. 63378c2ecf20Sopenharmony_ci */ 63388c2ecf20Sopenharmony_cistatic void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 63398c2ecf20Sopenharmony_ci{ 63408c2ecf20Sopenharmony_ci /* 63418c2ecf20Sopenharmony_ci * use_hierarchy is forced on the default hierarchy. cgroup core 63428c2ecf20Sopenharmony_ci * guarantees that @root doesn't have any children, so turning it 63438c2ecf20Sopenharmony_ci * on for the root memcg is enough. 
63448c2ecf20Sopenharmony_ci */ 63458c2ecf20Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 63468c2ecf20Sopenharmony_ci root_mem_cgroup->use_hierarchy = true; 63478c2ecf20Sopenharmony_ci else 63488c2ecf20Sopenharmony_ci root_mem_cgroup->use_hierarchy = false; 63498c2ecf20Sopenharmony_ci} 63508c2ecf20Sopenharmony_ci 63518c2ecf20Sopenharmony_cistatic int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 63528c2ecf20Sopenharmony_ci{ 63538c2ecf20Sopenharmony_ci if (value == PAGE_COUNTER_MAX) 63548c2ecf20Sopenharmony_ci seq_puts(m, "max\n"); 63558c2ecf20Sopenharmony_ci else 63568c2ecf20Sopenharmony_ci seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 63578c2ecf20Sopenharmony_ci 63588c2ecf20Sopenharmony_ci return 0; 63598c2ecf20Sopenharmony_ci} 63608c2ecf20Sopenharmony_ci 63618c2ecf20Sopenharmony_cistatic u64 memory_current_read(struct cgroup_subsys_state *css, 63628c2ecf20Sopenharmony_ci struct cftype *cft) 63638c2ecf20Sopenharmony_ci{ 63648c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 63658c2ecf20Sopenharmony_ci 63668c2ecf20Sopenharmony_ci return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 63678c2ecf20Sopenharmony_ci} 63688c2ecf20Sopenharmony_ci 63698c2ecf20Sopenharmony_cistatic int memory_min_show(struct seq_file *m, void *v) 63708c2ecf20Sopenharmony_ci{ 63718c2ecf20Sopenharmony_ci return seq_puts_memcg_tunable(m, 63728c2ecf20Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 63738c2ecf20Sopenharmony_ci} 63748c2ecf20Sopenharmony_ci 63758c2ecf20Sopenharmony_cistatic ssize_t memory_min_write(struct kernfs_open_file *of, 63768c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 63778c2ecf20Sopenharmony_ci{ 63788c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 63798c2ecf20Sopenharmony_ci unsigned long min; 63808c2ecf20Sopenharmony_ci int err; 63818c2ecf20Sopenharmony_ci 63828c2ecf20Sopenharmony_ci buf = strstrip(buf); 63838c2ecf20Sopenharmony_ci err = page_counter_memparse(buf, "max", &min); 63848c2ecf20Sopenharmony_ci if (err) 63858c2ecf20Sopenharmony_ci return err; 63868c2ecf20Sopenharmony_ci 63878c2ecf20Sopenharmony_ci page_counter_set_min(&memcg->memory, min); 63888c2ecf20Sopenharmony_ci 63898c2ecf20Sopenharmony_ci return nbytes; 63908c2ecf20Sopenharmony_ci} 63918c2ecf20Sopenharmony_ci 63928c2ecf20Sopenharmony_cistatic int memory_low_show(struct seq_file *m, void *v) 63938c2ecf20Sopenharmony_ci{ 63948c2ecf20Sopenharmony_ci return seq_puts_memcg_tunable(m, 63958c2ecf20Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 63968c2ecf20Sopenharmony_ci} 63978c2ecf20Sopenharmony_ci 63988c2ecf20Sopenharmony_cistatic ssize_t memory_low_write(struct kernfs_open_file *of, 63998c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 64008c2ecf20Sopenharmony_ci{ 64018c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 64028c2ecf20Sopenharmony_ci unsigned long low; 64038c2ecf20Sopenharmony_ci int err; 64048c2ecf20Sopenharmony_ci 64058c2ecf20Sopenharmony_ci buf = strstrip(buf); 64068c2ecf20Sopenharmony_ci err = page_counter_memparse(buf, "max", &low); 64078c2ecf20Sopenharmony_ci if (err) 64088c2ecf20Sopenharmony_ci return err; 64098c2ecf20Sopenharmony_ci 64108c2ecf20Sopenharmony_ci page_counter_set_low(&memcg->memory, low); 64118c2ecf20Sopenharmony_ci 64128c2ecf20Sopenharmony_ci return nbytes; 64138c2ecf20Sopenharmony_ci} 64148c2ecf20Sopenharmony_ci 64158c2ecf20Sopenharmony_cistatic int memory_high_show(struct seq_file *m, void *v) 
64168c2ecf20Sopenharmony_ci{ 64178c2ecf20Sopenharmony_ci return seq_puts_memcg_tunable(m, 64188c2ecf20Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 64198c2ecf20Sopenharmony_ci} 64208c2ecf20Sopenharmony_ci 64218c2ecf20Sopenharmony_cistatic ssize_t memory_high_write(struct kernfs_open_file *of, 64228c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 64238c2ecf20Sopenharmony_ci{ 64248c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 64258c2ecf20Sopenharmony_ci unsigned int nr_retries = MAX_RECLAIM_RETRIES; 64268c2ecf20Sopenharmony_ci bool drained = false; 64278c2ecf20Sopenharmony_ci unsigned long high; 64288c2ecf20Sopenharmony_ci int err; 64298c2ecf20Sopenharmony_ci 64308c2ecf20Sopenharmony_ci buf = strstrip(buf); 64318c2ecf20Sopenharmony_ci err = page_counter_memparse(buf, "max", &high); 64328c2ecf20Sopenharmony_ci if (err) 64338c2ecf20Sopenharmony_ci return err; 64348c2ecf20Sopenharmony_ci 64358c2ecf20Sopenharmony_ci page_counter_set_high(&memcg->memory, high); 64368c2ecf20Sopenharmony_ci 64378c2ecf20Sopenharmony_ci for (;;) { 64388c2ecf20Sopenharmony_ci unsigned long nr_pages = page_counter_read(&memcg->memory); 64398c2ecf20Sopenharmony_ci unsigned long reclaimed; 64408c2ecf20Sopenharmony_ci 64418c2ecf20Sopenharmony_ci if (nr_pages <= high) 64428c2ecf20Sopenharmony_ci break; 64438c2ecf20Sopenharmony_ci 64448c2ecf20Sopenharmony_ci if (signal_pending(current)) 64458c2ecf20Sopenharmony_ci break; 64468c2ecf20Sopenharmony_ci 64478c2ecf20Sopenharmony_ci if (!drained) { 64488c2ecf20Sopenharmony_ci drain_all_stock(memcg); 64498c2ecf20Sopenharmony_ci drained = true; 64508c2ecf20Sopenharmony_ci continue; 64518c2ecf20Sopenharmony_ci } 64528c2ecf20Sopenharmony_ci 64538c2ecf20Sopenharmony_ci reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 64548c2ecf20Sopenharmony_ci GFP_KERNEL, true); 64558c2ecf20Sopenharmony_ci 64568c2ecf20Sopenharmony_ci if (!reclaimed && !nr_retries--) 64578c2ecf20Sopenharmony_ci break; 64588c2ecf20Sopenharmony_ci } 64598c2ecf20Sopenharmony_ci 64608c2ecf20Sopenharmony_ci memcg_wb_domain_size_changed(memcg); 64618c2ecf20Sopenharmony_ci return nbytes; 64628c2ecf20Sopenharmony_ci} 64638c2ecf20Sopenharmony_ci 64648c2ecf20Sopenharmony_cistatic int memory_max_show(struct seq_file *m, void *v) 64658c2ecf20Sopenharmony_ci{ 64668c2ecf20Sopenharmony_ci return seq_puts_memcg_tunable(m, 64678c2ecf20Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 64688c2ecf20Sopenharmony_ci} 64698c2ecf20Sopenharmony_ci 64708c2ecf20Sopenharmony_cistatic ssize_t memory_max_write(struct kernfs_open_file *of, 64718c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 64728c2ecf20Sopenharmony_ci{ 64738c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 64748c2ecf20Sopenharmony_ci unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 64758c2ecf20Sopenharmony_ci bool drained = false; 64768c2ecf20Sopenharmony_ci unsigned long max; 64778c2ecf20Sopenharmony_ci int err; 64788c2ecf20Sopenharmony_ci 64798c2ecf20Sopenharmony_ci buf = strstrip(buf); 64808c2ecf20Sopenharmony_ci err = page_counter_memparse(buf, "max", &max); 64818c2ecf20Sopenharmony_ci if (err) 64828c2ecf20Sopenharmony_ci return err; 64838c2ecf20Sopenharmony_ci 64848c2ecf20Sopenharmony_ci xchg(&memcg->memory.max, max); 64858c2ecf20Sopenharmony_ci 64868c2ecf20Sopenharmony_ci for (;;) { 64878c2ecf20Sopenharmony_ci unsigned long nr_pages = page_counter_read(&memcg->memory); 64888c2ecf20Sopenharmony_ci 64898c2ecf20Sopenharmony_ci if 
(nr_pages <= max) 64908c2ecf20Sopenharmony_ci break; 64918c2ecf20Sopenharmony_ci 64928c2ecf20Sopenharmony_ci if (signal_pending(current)) 64938c2ecf20Sopenharmony_ci break; 64948c2ecf20Sopenharmony_ci 64958c2ecf20Sopenharmony_ci if (!drained) { 64968c2ecf20Sopenharmony_ci drain_all_stock(memcg); 64978c2ecf20Sopenharmony_ci drained = true; 64988c2ecf20Sopenharmony_ci continue; 64998c2ecf20Sopenharmony_ci } 65008c2ecf20Sopenharmony_ci 65018c2ecf20Sopenharmony_ci if (nr_reclaims) { 65028c2ecf20Sopenharmony_ci if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 65038c2ecf20Sopenharmony_ci GFP_KERNEL, true)) 65048c2ecf20Sopenharmony_ci nr_reclaims--; 65058c2ecf20Sopenharmony_ci continue; 65068c2ecf20Sopenharmony_ci } 65078c2ecf20Sopenharmony_ci 65088c2ecf20Sopenharmony_ci memcg_memory_event(memcg, MEMCG_OOM); 65098c2ecf20Sopenharmony_ci if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 65108c2ecf20Sopenharmony_ci break; 65118c2ecf20Sopenharmony_ci } 65128c2ecf20Sopenharmony_ci 65138c2ecf20Sopenharmony_ci memcg_wb_domain_size_changed(memcg); 65148c2ecf20Sopenharmony_ci return nbytes; 65158c2ecf20Sopenharmony_ci} 65168c2ecf20Sopenharmony_ci 65178c2ecf20Sopenharmony_cistatic void __memory_events_show(struct seq_file *m, atomic_long_t *events) 65188c2ecf20Sopenharmony_ci{ 65198c2ecf20Sopenharmony_ci seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 65208c2ecf20Sopenharmony_ci seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 65218c2ecf20Sopenharmony_ci seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 65228c2ecf20Sopenharmony_ci seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 65238c2ecf20Sopenharmony_ci seq_printf(m, "oom_kill %lu\n", 65248c2ecf20Sopenharmony_ci atomic_long_read(&events[MEMCG_OOM_KILL])); 65258c2ecf20Sopenharmony_ci} 65268c2ecf20Sopenharmony_ci 65278c2ecf20Sopenharmony_cistatic int memory_events_show(struct seq_file *m, void *v) 65288c2ecf20Sopenharmony_ci{ 65298c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 65308c2ecf20Sopenharmony_ci 65318c2ecf20Sopenharmony_ci __memory_events_show(m, memcg->memory_events); 65328c2ecf20Sopenharmony_ci return 0; 65338c2ecf20Sopenharmony_ci} 65348c2ecf20Sopenharmony_ci 65358c2ecf20Sopenharmony_cistatic int memory_events_local_show(struct seq_file *m, void *v) 65368c2ecf20Sopenharmony_ci{ 65378c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 65388c2ecf20Sopenharmony_ci 65398c2ecf20Sopenharmony_ci __memory_events_show(m, memcg->memory_events_local); 65408c2ecf20Sopenharmony_ci return 0; 65418c2ecf20Sopenharmony_ci} 65428c2ecf20Sopenharmony_ci 65438c2ecf20Sopenharmony_cistatic int memory_stat_show(struct seq_file *m, void *v) 65448c2ecf20Sopenharmony_ci{ 65458c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 65468c2ecf20Sopenharmony_ci char *buf; 65478c2ecf20Sopenharmony_ci 65488c2ecf20Sopenharmony_ci buf = memory_stat_format(memcg); 65498c2ecf20Sopenharmony_ci if (!buf) 65508c2ecf20Sopenharmony_ci return -ENOMEM; 65518c2ecf20Sopenharmony_ci seq_puts(m, buf); 65528c2ecf20Sopenharmony_ci kfree(buf); 65538c2ecf20Sopenharmony_ci return 0; 65548c2ecf20Sopenharmony_ci} 65558c2ecf20Sopenharmony_ci 65568c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 65578c2ecf20Sopenharmony_cistatic int memory_numa_stat_show(struct seq_file *m, void *v) 65588c2ecf20Sopenharmony_ci{ 65598c2ecf20Sopenharmony_ci int i; 65608c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 65618c2ecf20Sopenharmony_ci 
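	/*
	 * One line is emitted per entry in memory_stats[], e.g.
	 * (illustrative values):
	 *
	 *	anon N0=1040384 N1=0
	 *
	 * with per-node values in bytes (the raw node stat scaled by
	 * memory_stats[i].ratio, as done below).
	 */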
65628c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 65638c2ecf20Sopenharmony_ci int nid; 65648c2ecf20Sopenharmony_ci 65658c2ecf20Sopenharmony_ci if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 65668c2ecf20Sopenharmony_ci continue; 65678c2ecf20Sopenharmony_ci 65688c2ecf20Sopenharmony_ci seq_printf(m, "%s", memory_stats[i].name); 65698c2ecf20Sopenharmony_ci for_each_node_state(nid, N_MEMORY) { 65708c2ecf20Sopenharmony_ci u64 size; 65718c2ecf20Sopenharmony_ci struct lruvec *lruvec; 65728c2ecf20Sopenharmony_ci 65738c2ecf20Sopenharmony_ci lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 65748c2ecf20Sopenharmony_ci size = lruvec_page_state(lruvec, memory_stats[i].idx); 65758c2ecf20Sopenharmony_ci size *= memory_stats[i].ratio; 65768c2ecf20Sopenharmony_ci seq_printf(m, " N%d=%llu", nid, size); 65778c2ecf20Sopenharmony_ci } 65788c2ecf20Sopenharmony_ci seq_putc(m, '\n'); 65798c2ecf20Sopenharmony_ci } 65808c2ecf20Sopenharmony_ci 65818c2ecf20Sopenharmony_ci return 0; 65828c2ecf20Sopenharmony_ci} 65838c2ecf20Sopenharmony_ci#endif 65848c2ecf20Sopenharmony_ci 65858c2ecf20Sopenharmony_cistatic int memory_oom_group_show(struct seq_file *m, void *v) 65868c2ecf20Sopenharmony_ci{ 65878c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 65888c2ecf20Sopenharmony_ci 65898c2ecf20Sopenharmony_ci seq_printf(m, "%d\n", memcg->oom_group); 65908c2ecf20Sopenharmony_ci 65918c2ecf20Sopenharmony_ci return 0; 65928c2ecf20Sopenharmony_ci} 65938c2ecf20Sopenharmony_ci 65948c2ecf20Sopenharmony_cistatic ssize_t memory_oom_group_write(struct kernfs_open_file *of, 65958c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 65968c2ecf20Sopenharmony_ci{ 65978c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 65988c2ecf20Sopenharmony_ci int ret, oom_group; 65998c2ecf20Sopenharmony_ci 66008c2ecf20Sopenharmony_ci buf = strstrip(buf); 66018c2ecf20Sopenharmony_ci if (!buf) 66028c2ecf20Sopenharmony_ci return -EINVAL; 66038c2ecf20Sopenharmony_ci 66048c2ecf20Sopenharmony_ci ret = kstrtoint(buf, 0, &oom_group); 66058c2ecf20Sopenharmony_ci if (ret) 66068c2ecf20Sopenharmony_ci return ret; 66078c2ecf20Sopenharmony_ci 66088c2ecf20Sopenharmony_ci if (oom_group != 0 && oom_group != 1) 66098c2ecf20Sopenharmony_ci return -EINVAL; 66108c2ecf20Sopenharmony_ci 66118c2ecf20Sopenharmony_ci memcg->oom_group = oom_group; 66128c2ecf20Sopenharmony_ci 66138c2ecf20Sopenharmony_ci return nbytes; 66148c2ecf20Sopenharmony_ci} 66158c2ecf20Sopenharmony_ci 66168c2ecf20Sopenharmony_cistatic struct cftype memory_files[] = { 66178c2ecf20Sopenharmony_ci { 66188c2ecf20Sopenharmony_ci .name = "current", 66198c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 66208c2ecf20Sopenharmony_ci .read_u64 = memory_current_read, 66218c2ecf20Sopenharmony_ci }, 66228c2ecf20Sopenharmony_ci { 66238c2ecf20Sopenharmony_ci .name = "min", 66248c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 66258c2ecf20Sopenharmony_ci .seq_show = memory_min_show, 66268c2ecf20Sopenharmony_ci .write = memory_min_write, 66278c2ecf20Sopenharmony_ci }, 66288c2ecf20Sopenharmony_ci { 66298c2ecf20Sopenharmony_ci .name = "low", 66308c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 66318c2ecf20Sopenharmony_ci .seq_show = memory_low_show, 66328c2ecf20Sopenharmony_ci .write = memory_low_write, 66338c2ecf20Sopenharmony_ci }, 66348c2ecf20Sopenharmony_ci { 66358c2ecf20Sopenharmony_ci .name = "high", 66368c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 66378c2ecf20Sopenharmony_ci .seq_show = memory_high_show, 
66388c2ecf20Sopenharmony_ci .write = memory_high_write, 66398c2ecf20Sopenharmony_ci }, 66408c2ecf20Sopenharmony_ci { 66418c2ecf20Sopenharmony_ci .name = "max", 66428c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 66438c2ecf20Sopenharmony_ci .seq_show = memory_max_show, 66448c2ecf20Sopenharmony_ci .write = memory_max_write, 66458c2ecf20Sopenharmony_ci }, 66468c2ecf20Sopenharmony_ci { 66478c2ecf20Sopenharmony_ci .name = "events", 66488c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 66498c2ecf20Sopenharmony_ci .file_offset = offsetof(struct mem_cgroup, events_file), 66508c2ecf20Sopenharmony_ci .seq_show = memory_events_show, 66518c2ecf20Sopenharmony_ci }, 66528c2ecf20Sopenharmony_ci { 66538c2ecf20Sopenharmony_ci .name = "events.local", 66548c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 66558c2ecf20Sopenharmony_ci .file_offset = offsetof(struct mem_cgroup, events_local_file), 66568c2ecf20Sopenharmony_ci .seq_show = memory_events_local_show, 66578c2ecf20Sopenharmony_ci }, 66588c2ecf20Sopenharmony_ci { 66598c2ecf20Sopenharmony_ci .name = "stat", 66608c2ecf20Sopenharmony_ci .seq_show = memory_stat_show, 66618c2ecf20Sopenharmony_ci }, 66628c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 66638c2ecf20Sopenharmony_ci { 66648c2ecf20Sopenharmony_ci .name = "numa_stat", 66658c2ecf20Sopenharmony_ci .seq_show = memory_numa_stat_show, 66668c2ecf20Sopenharmony_ci }, 66678c2ecf20Sopenharmony_ci#endif 66688c2ecf20Sopenharmony_ci { 66698c2ecf20Sopenharmony_ci .name = "oom.group", 66708c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 66718c2ecf20Sopenharmony_ci .seq_show = memory_oom_group_show, 66728c2ecf20Sopenharmony_ci .write = memory_oom_group_write, 66738c2ecf20Sopenharmony_ci }, 66748c2ecf20Sopenharmony_ci { } /* terminate */ 66758c2ecf20Sopenharmony_ci}; 66768c2ecf20Sopenharmony_ci 66778c2ecf20Sopenharmony_cistruct cgroup_subsys memory_cgrp_subsys = { 66788c2ecf20Sopenharmony_ci .css_alloc = mem_cgroup_css_alloc, 66798c2ecf20Sopenharmony_ci .css_online = mem_cgroup_css_online, 66808c2ecf20Sopenharmony_ci .css_offline = mem_cgroup_css_offline, 66818c2ecf20Sopenharmony_ci .css_released = mem_cgroup_css_released, 66828c2ecf20Sopenharmony_ci .css_free = mem_cgroup_css_free, 66838c2ecf20Sopenharmony_ci .css_reset = mem_cgroup_css_reset, 66848c2ecf20Sopenharmony_ci .can_attach = mem_cgroup_can_attach, 66858c2ecf20Sopenharmony_ci .cancel_attach = mem_cgroup_cancel_attach, 66868c2ecf20Sopenharmony_ci .post_attach = mem_cgroup_move_task, 66878c2ecf20Sopenharmony_ci .bind = mem_cgroup_bind, 66888c2ecf20Sopenharmony_ci .dfl_cftypes = memory_files, 66898c2ecf20Sopenharmony_ci .legacy_cftypes = mem_cgroup_legacy_files, 66908c2ecf20Sopenharmony_ci .early_init = 0, 66918c2ecf20Sopenharmony_ci}; 66928c2ecf20Sopenharmony_ci 66938c2ecf20Sopenharmony_ci/* 66948c2ecf20Sopenharmony_ci * This function calculates an individual cgroup's effective 66958c2ecf20Sopenharmony_ci * protection which is derived from its own memory.min/low, its 66968c2ecf20Sopenharmony_ci * parent's and siblings' settings, as well as the actual memory 66978c2ecf20Sopenharmony_ci * distribution in the tree. 66988c2ecf20Sopenharmony_ci * 66998c2ecf20Sopenharmony_ci * The following rules apply to the effective protection values: 67008c2ecf20Sopenharmony_ci * 67018c2ecf20Sopenharmony_ci * 1. At the first level of reclaim, effective protection is equal to 67028c2ecf20Sopenharmony_ci * the declared protection in memory.min and memory.low. 67038c2ecf20Sopenharmony_ci * 67048c2ecf20Sopenharmony_ci * 2. 
To enable safe delegation of the protection configuration, at 67058c2ecf20Sopenharmony_ci * subsequent levels the effective protection is capped to the 67068c2ecf20Sopenharmony_ci * parent's effective protection. 67078c2ecf20Sopenharmony_ci * 67088c2ecf20Sopenharmony_ci * 3. To make complex and dynamic subtrees easier to configure, the 67098c2ecf20Sopenharmony_ci * user is allowed to overcommit the declared protection at a given 67108c2ecf20Sopenharmony_ci * level. If that is the case, the parent's effective protection is 67118c2ecf20Sopenharmony_ci * distributed to the children in proportion to how much protection 67128c2ecf20Sopenharmony_ci * they have declared and how much of it they are utilizing. 67138c2ecf20Sopenharmony_ci * 67148c2ecf20Sopenharmony_ci * This makes distribution proportional, but also work-conserving: 67158c2ecf20Sopenharmony_ci * if one cgroup claims much more protection than it uses memory, 67168c2ecf20Sopenharmony_ci * the unused remainder is available to its siblings. 67178c2ecf20Sopenharmony_ci * 67188c2ecf20Sopenharmony_ci * 4. Conversely, when the declared protection is undercommitted at a 67198c2ecf20Sopenharmony_ci * given level, the distribution of the larger parental protection 67208c2ecf20Sopenharmony_ci * budget is NOT proportional. A cgroup's protection from a sibling 67218c2ecf20Sopenharmony_ci * is capped to its own memory.min/low setting. 67228c2ecf20Sopenharmony_ci * 67238c2ecf20Sopenharmony_ci * 5. However, to allow protecting recursive subtrees from each other 67248c2ecf20Sopenharmony_ci * without having to declare each individual cgroup's fixed share 67258c2ecf20Sopenharmony_ci * of the ancestor's claim to protection, any unutilized - 67268c2ecf20Sopenharmony_ci * "floating" - protection from up the tree is distributed in 67278c2ecf20Sopenharmony_ci * proportion to each cgroup's *usage*. This makes the protection 67288c2ecf20Sopenharmony_ci * neutral wrt sibling cgroups and lets them compete freely over 67298c2ecf20Sopenharmony_ci * the shared parental protection budget, but it protects the 67308c2ecf20Sopenharmony_ci * subtree as a whole from neighboring subtrees. 67318c2ecf20Sopenharmony_ci * 67328c2ecf20Sopenharmony_ci * Note that 4. and 5. are not in conflict: 4. is about protecting 67338c2ecf20Sopenharmony_ci * against immediate siblings whereas 5. is about protecting against 67348c2ecf20Sopenharmony_ci * neighboring subtrees. 67358c2ecf20Sopenharmony_ci */ 67368c2ecf20Sopenharmony_cistatic unsigned long effective_protection(unsigned long usage, 67378c2ecf20Sopenharmony_ci unsigned long parent_usage, 67388c2ecf20Sopenharmony_ci unsigned long setting, 67398c2ecf20Sopenharmony_ci unsigned long parent_effective, 67408c2ecf20Sopenharmony_ci unsigned long siblings_protected) 67418c2ecf20Sopenharmony_ci{ 67428c2ecf20Sopenharmony_ci unsigned long protected; 67438c2ecf20Sopenharmony_ci unsigned long ep; 67448c2ecf20Sopenharmony_ci 67458c2ecf20Sopenharmony_ci protected = min(usage, setting); 67468c2ecf20Sopenharmony_ci /* 67478c2ecf20Sopenharmony_ci * If all cgroups at this level combined claim and use more 67488c2ecf20Sopenharmony_ci * protection than what the parent affords them, distribute 67498c2ecf20Sopenharmony_ci * shares in proportion to utilization.
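 *
 * A purely illustrative worked example of this proportional case
 * (the numbers are assumptions for the sake of arithmetic, not
 * values taken from anywhere in this file): say the parent's
 * effective protection is 100 pages and two children each declare
 * and use 80 pages.  Then siblings_protected is 160 while
 * parent_effective is 100, so each child's effective protection
 * comes out as 80 * 100 / 160 = 50 pages.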
67508c2ecf20Sopenharmony_ci * 67518c2ecf20Sopenharmony_ci * We are using actual utilization rather than the statically 67528c2ecf20Sopenharmony_ci * claimed protection in order to be work-conserving: claimed 67538c2ecf20Sopenharmony_ci * but unused protection is available to siblings that would 67548c2ecf20Sopenharmony_ci * otherwise get a smaller chunk than what they claimed. 67558c2ecf20Sopenharmony_ci */ 67568c2ecf20Sopenharmony_ci if (siblings_protected > parent_effective) 67578c2ecf20Sopenharmony_ci return protected * parent_effective / siblings_protected; 67588c2ecf20Sopenharmony_ci 67598c2ecf20Sopenharmony_ci /* 67608c2ecf20Sopenharmony_ci * Ok, utilized protection of all children is within what the 67618c2ecf20Sopenharmony_ci * parent affords them, so we know whatever this child claims 67628c2ecf20Sopenharmony_ci * and utilizes is effectively protected. 67638c2ecf20Sopenharmony_ci * 67648c2ecf20Sopenharmony_ci * If there is unprotected usage beyond this value, reclaim 67658c2ecf20Sopenharmony_ci * will apply pressure in proportion to that amount. 67668c2ecf20Sopenharmony_ci * 67678c2ecf20Sopenharmony_ci * If there is unutilized protection, the cgroup will be fully 67688c2ecf20Sopenharmony_ci * shielded from reclaim, but we do return a smaller value for 67698c2ecf20Sopenharmony_ci * protection than what the group could enjoy in theory. This 67708c2ecf20Sopenharmony_ci * is okay. With the overcommit distribution above, effective 67718c2ecf20Sopenharmony_ci * protection is always dependent on how memory is actually 67728c2ecf20Sopenharmony_ci * consumed among the siblings anyway. 67738c2ecf20Sopenharmony_ci */ 67748c2ecf20Sopenharmony_ci ep = protected; 67758c2ecf20Sopenharmony_ci 67768c2ecf20Sopenharmony_ci /* 67778c2ecf20Sopenharmony_ci * If the children aren't claiming (all of) the protection 67788c2ecf20Sopenharmony_ci * afforded to them by the parent, distribute the remainder in 67798c2ecf20Sopenharmony_ci * proportion to the (unprotected) memory of each cgroup. That 67808c2ecf20Sopenharmony_ci * way, cgroups that aren't explicitly prioritized wrt each 67818c2ecf20Sopenharmony_ci * other compete freely over the allowance, but they are 67828c2ecf20Sopenharmony_ci * collectively protected from neighboring trees. 67838c2ecf20Sopenharmony_ci * 67848c2ecf20Sopenharmony_ci * We're using unprotected memory for the weight so that if 67858c2ecf20Sopenharmony_ci * some cgroups DO claim explicit protection, we don't protect 67868c2ecf20Sopenharmony_ci * the same bytes twice. 67878c2ecf20Sopenharmony_ci * 67888c2ecf20Sopenharmony_ci * Check both usage and parent_usage against the respective 67898c2ecf20Sopenharmony_ci * protected values. One should imply the other, but they 67908c2ecf20Sopenharmony_ci * aren't read atomically - make sure the division is sane. 
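 *
 * A hedged numerical illustration of the recursive-protection case
 * handled below (hypothetical numbers; this branch only runs when
 * the cgroup2 hierarchy is mounted with the memory_recursiveprot
 * option): with parent_effective = 100, siblings_protected = 60,
 * parent_usage = 200, usage = 70 and protected = 30, the unclaimed
 * budget is 100 - 60 = 40 pages, and this child's share of it is
 * 40 * (70 - 30) / (200 - 60) = 11 pages (integer division), so
 * ep grows from 30 to 41 pages.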
67918c2ecf20Sopenharmony_ci */ 67928c2ecf20Sopenharmony_ci if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 67938c2ecf20Sopenharmony_ci return ep; 67948c2ecf20Sopenharmony_ci if (parent_effective > siblings_protected && 67958c2ecf20Sopenharmony_ci parent_usage > siblings_protected && 67968c2ecf20Sopenharmony_ci usage > protected) { 67978c2ecf20Sopenharmony_ci unsigned long unclaimed; 67988c2ecf20Sopenharmony_ci 67998c2ecf20Sopenharmony_ci unclaimed = parent_effective - siblings_protected; 68008c2ecf20Sopenharmony_ci unclaimed *= usage - protected; 68018c2ecf20Sopenharmony_ci unclaimed /= parent_usage - siblings_protected; 68028c2ecf20Sopenharmony_ci 68038c2ecf20Sopenharmony_ci ep += unclaimed; 68048c2ecf20Sopenharmony_ci } 68058c2ecf20Sopenharmony_ci 68068c2ecf20Sopenharmony_ci return ep; 68078c2ecf20Sopenharmony_ci} 68088c2ecf20Sopenharmony_ci 68098c2ecf20Sopenharmony_ci/** 68108c2ecf20Sopenharmony_ci * mem_cgroup_calculate_protection - check if memory consumption is in the normal range 68118c2ecf20Sopenharmony_ci * @root: the top ancestor of the sub-tree being checked 68128c2ecf20Sopenharmony_ci * @memcg: the memory cgroup to check 68138c2ecf20Sopenharmony_ci * 68148c2ecf20Sopenharmony_ci * WARNING: This function is not stateless! It can only be used as part 68158c2ecf20Sopenharmony_ci * of a top-down tree iteration, not for isolated queries. 68168c2ecf20Sopenharmony_ci */ 68178c2ecf20Sopenharmony_civoid mem_cgroup_calculate_protection(struct mem_cgroup *root, 68188c2ecf20Sopenharmony_ci struct mem_cgroup *memcg) 68198c2ecf20Sopenharmony_ci{ 68208c2ecf20Sopenharmony_ci unsigned long usage, parent_usage; 68218c2ecf20Sopenharmony_ci struct mem_cgroup *parent; 68228c2ecf20Sopenharmony_ci 68238c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 68248c2ecf20Sopenharmony_ci return; 68258c2ecf20Sopenharmony_ci 68268c2ecf20Sopenharmony_ci if (!root) 68278c2ecf20Sopenharmony_ci root = root_mem_cgroup; 68288c2ecf20Sopenharmony_ci 68298c2ecf20Sopenharmony_ci /* 68308c2ecf20Sopenharmony_ci * Effective values of the reclaim targets are ignored so they 68318c2ecf20Sopenharmony_ci * can be stale. Have a look at mem_cgroup_protection for more 68328c2ecf20Sopenharmony_ci * details. 68338c2ecf20Sopenharmony_ci * TODO: calculation should be more robust so that we do not need 68348c2ecf20Sopenharmony_ci * that special casing.
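 *
 * Illustration of the top-down usage that the WARNING in the
 * kernel-doc above refers to: a minimal sketch loosely modelled on
 * the reclaim loop, not a verbatim quote of mm/vmscan.c:
 *
 *	struct mem_cgroup *iter = mem_cgroup_iter(root, NULL, NULL);
 *
 *	do {
 *		mem_cgroup_calculate_protection(root, iter);
 *		// reclaim then consults iter->memory.emin and .elow
 *	} while ((iter = mem_cgroup_iter(root, iter, NULL)));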
68358c2ecf20Sopenharmony_ci */ 68368c2ecf20Sopenharmony_ci if (memcg == root) 68378c2ecf20Sopenharmony_ci return; 68388c2ecf20Sopenharmony_ci 68398c2ecf20Sopenharmony_ci usage = page_counter_read(&memcg->memory); 68408c2ecf20Sopenharmony_ci if (!usage) 68418c2ecf20Sopenharmony_ci return; 68428c2ecf20Sopenharmony_ci 68438c2ecf20Sopenharmony_ci parent = parent_mem_cgroup(memcg); 68448c2ecf20Sopenharmony_ci /* No parent means a non-hierarchical mode on v1 memcg */ 68458c2ecf20Sopenharmony_ci if (!parent) 68468c2ecf20Sopenharmony_ci return; 68478c2ecf20Sopenharmony_ci 68488c2ecf20Sopenharmony_ci if (parent == root) { 68498c2ecf20Sopenharmony_ci memcg->memory.emin = READ_ONCE(memcg->memory.min); 68508c2ecf20Sopenharmony_ci memcg->memory.elow = READ_ONCE(memcg->memory.low); 68518c2ecf20Sopenharmony_ci return; 68528c2ecf20Sopenharmony_ci } 68538c2ecf20Sopenharmony_ci 68548c2ecf20Sopenharmony_ci parent_usage = page_counter_read(&parent->memory); 68558c2ecf20Sopenharmony_ci 68568c2ecf20Sopenharmony_ci WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 68578c2ecf20Sopenharmony_ci READ_ONCE(memcg->memory.min), 68588c2ecf20Sopenharmony_ci READ_ONCE(parent->memory.emin), 68598c2ecf20Sopenharmony_ci atomic_long_read(&parent->memory.children_min_usage))); 68608c2ecf20Sopenharmony_ci 68618c2ecf20Sopenharmony_ci WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 68628c2ecf20Sopenharmony_ci READ_ONCE(memcg->memory.low), 68638c2ecf20Sopenharmony_ci READ_ONCE(parent->memory.elow), 68648c2ecf20Sopenharmony_ci atomic_long_read(&parent->memory.children_low_usage))); 68658c2ecf20Sopenharmony_ci} 68668c2ecf20Sopenharmony_ci 68678c2ecf20Sopenharmony_ci/** 68688c2ecf20Sopenharmony_ci * mem_cgroup_charge - charge a newly allocated page to a cgroup 68698c2ecf20Sopenharmony_ci * @page: page to charge 68708c2ecf20Sopenharmony_ci * @mm: mm context of the victim 68718c2ecf20Sopenharmony_ci * @gfp_mask: reclaim mode 68728c2ecf20Sopenharmony_ci * 68738c2ecf20Sopenharmony_ci * Try to charge @page to the memcg that @mm belongs to, reclaiming 68748c2ecf20Sopenharmony_ci * pages according to @gfp_mask if necessary. 68758c2ecf20Sopenharmony_ci * 68768c2ecf20Sopenharmony_ci * Returns 0 on success. Otherwise, an error code is returned. 68778c2ecf20Sopenharmony_ci */ 68788c2ecf20Sopenharmony_ciint mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 68798c2ecf20Sopenharmony_ci{ 68808c2ecf20Sopenharmony_ci unsigned int nr_pages = thp_nr_pages(page); 68818c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = NULL; 68828c2ecf20Sopenharmony_ci int ret = 0; 68838c2ecf20Sopenharmony_ci 68848c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 68858c2ecf20Sopenharmony_ci goto out; 68868c2ecf20Sopenharmony_ci 68878c2ecf20Sopenharmony_ci if (PageSwapCache(page)) { 68888c2ecf20Sopenharmony_ci swp_entry_t ent = { .val = page_private(page), }; 68898c2ecf20Sopenharmony_ci unsigned short id; 68908c2ecf20Sopenharmony_ci 68918c2ecf20Sopenharmony_ci /* 68928c2ecf20Sopenharmony_ci * Every swap fault against a single page tries to charge the 68938c2ecf20Sopenharmony_ci * page, bail as early as possible. shmem_unuse() encounters 68948c2ecf20Sopenharmony_ci * already charged pages, too. page->mem_cgroup is protected 68958c2ecf20Sopenharmony_ci * by the page lock, which serializes swap cache removal, which 68968c2ecf20Sopenharmony_ci * in turn serializes uncharging. 
68978c2ecf20Sopenharmony_ci */ 68988c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(page), page); 68998c2ecf20Sopenharmony_ci if (compound_head(page)->mem_cgroup) 69008c2ecf20Sopenharmony_ci goto out; 69018c2ecf20Sopenharmony_ci 69028c2ecf20Sopenharmony_ci id = lookup_swap_cgroup_id(ent); 69038c2ecf20Sopenharmony_ci rcu_read_lock(); 69048c2ecf20Sopenharmony_ci memcg = mem_cgroup_from_id(id); 69058c2ecf20Sopenharmony_ci if (memcg && !css_tryget_online(&memcg->css)) 69068c2ecf20Sopenharmony_ci memcg = NULL; 69078c2ecf20Sopenharmony_ci rcu_read_unlock(); 69088c2ecf20Sopenharmony_ci } 69098c2ecf20Sopenharmony_ci 69108c2ecf20Sopenharmony_ci if (!memcg) 69118c2ecf20Sopenharmony_ci memcg = get_mem_cgroup_from_mm(mm); 69128c2ecf20Sopenharmony_ci 69138c2ecf20Sopenharmony_ci ret = try_charge(memcg, gfp_mask, nr_pages); 69148c2ecf20Sopenharmony_ci if (ret) 69158c2ecf20Sopenharmony_ci goto out_put; 69168c2ecf20Sopenharmony_ci 69178c2ecf20Sopenharmony_ci css_get(&memcg->css); 69188c2ecf20Sopenharmony_ci commit_charge(page, memcg); 69198c2ecf20Sopenharmony_ci 69208c2ecf20Sopenharmony_ci local_irq_disable(); 69218c2ecf20Sopenharmony_ci mem_cgroup_charge_statistics(memcg, page, nr_pages); 69228c2ecf20Sopenharmony_ci memcg_check_events(memcg, page); 69238c2ecf20Sopenharmony_ci local_irq_enable(); 69248c2ecf20Sopenharmony_ci 69258c2ecf20Sopenharmony_ci /* 69268c2ecf20Sopenharmony_ci * Cgroup1's unified memory+swap counter has been charged with the 69278c2ecf20Sopenharmony_ci * new swapcache page, finish the transfer by uncharging the swap 69288c2ecf20Sopenharmony_ci * slot. The swap slot would also get uncharged when it dies, but 69298c2ecf20Sopenharmony_ci * it can stick around indefinitely and we'd count the page twice 69308c2ecf20Sopenharmony_ci * the entire time. 69318c2ecf20Sopenharmony_ci * 69328c2ecf20Sopenharmony_ci * Cgroup2 has separate resource counters for memory and swap, 69338c2ecf20Sopenharmony_ci * so this is a non-issue here. Memory and swap charge lifetimes 69348c2ecf20Sopenharmony_ci * correspond 1:1 to page and swap slot lifetimes: we charge the 69358c2ecf20Sopenharmony_ci * page to memory here, and uncharge swap when the slot is freed. 69368c2ecf20Sopenharmony_ci */ 69378c2ecf20Sopenharmony_ci if (do_memsw_account() && PageSwapCache(page)) { 69388c2ecf20Sopenharmony_ci swp_entry_t entry = { .val = page_private(page) }; 69398c2ecf20Sopenharmony_ci /* 69408c2ecf20Sopenharmony_ci * The swap entry might not get freed for a long time, 69418c2ecf20Sopenharmony_ci * let's not wait for it. The page already received a 69428c2ecf20Sopenharmony_ci * memory+swap charge, drop the swap entry duplicate. 
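 *
 * A small worked example of the double-count this avoids (numbers
 * are illustrative only): under cgroup1, a page that was swapped
 * out leaves 1 page charged to memsw for the swap slot.  The
 * try_charge() above added another memsw page for the swapcache
 * copy, so memsw would read 2 pages for one piece of data until
 * the slot dies; uncharging the swap entry below brings it back
 * to 1.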
69438c2ecf20Sopenharmony_ci */ 69448c2ecf20Sopenharmony_ci mem_cgroup_uncharge_swap(entry, nr_pages); 69458c2ecf20Sopenharmony_ci } 69468c2ecf20Sopenharmony_ci 69478c2ecf20Sopenharmony_ciout_put: 69488c2ecf20Sopenharmony_ci css_put(&memcg->css); 69498c2ecf20Sopenharmony_ciout: 69508c2ecf20Sopenharmony_ci return ret; 69518c2ecf20Sopenharmony_ci} 69528c2ecf20Sopenharmony_ci 69538c2ecf20Sopenharmony_cistruct uncharge_gather { 69548c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 69558c2ecf20Sopenharmony_ci unsigned long nr_pages; 69568c2ecf20Sopenharmony_ci unsigned long pgpgout; 69578c2ecf20Sopenharmony_ci unsigned long nr_kmem; 69588c2ecf20Sopenharmony_ci struct page *dummy_page; 69598c2ecf20Sopenharmony_ci}; 69608c2ecf20Sopenharmony_ci 69618c2ecf20Sopenharmony_cistatic inline void uncharge_gather_clear(struct uncharge_gather *ug) 69628c2ecf20Sopenharmony_ci{ 69638c2ecf20Sopenharmony_ci memset(ug, 0, sizeof(*ug)); 69648c2ecf20Sopenharmony_ci} 69658c2ecf20Sopenharmony_ci 69668c2ecf20Sopenharmony_cistatic void uncharge_batch(const struct uncharge_gather *ug) 69678c2ecf20Sopenharmony_ci{ 69688c2ecf20Sopenharmony_ci unsigned long flags; 69698c2ecf20Sopenharmony_ci 69708c2ecf20Sopenharmony_ci if (!mem_cgroup_is_root(ug->memcg)) { 69718c2ecf20Sopenharmony_ci page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); 69728c2ecf20Sopenharmony_ci if (do_memsw_account()) 69738c2ecf20Sopenharmony_ci page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); 69748c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) 69758c2ecf20Sopenharmony_ci page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); 69768c2ecf20Sopenharmony_ci memcg_oom_recover(ug->memcg); 69778c2ecf20Sopenharmony_ci } 69788c2ecf20Sopenharmony_ci 69798c2ecf20Sopenharmony_ci local_irq_save(flags); 69808c2ecf20Sopenharmony_ci __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 69818c2ecf20Sopenharmony_ci __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); 69828c2ecf20Sopenharmony_ci memcg_check_events(ug->memcg, ug->dummy_page); 69838c2ecf20Sopenharmony_ci local_irq_restore(flags); 69848c2ecf20Sopenharmony_ci 69858c2ecf20Sopenharmony_ci /* drop reference from uncharge_page */ 69868c2ecf20Sopenharmony_ci css_put(&ug->memcg->css); 69878c2ecf20Sopenharmony_ci} 69888c2ecf20Sopenharmony_ci 69898c2ecf20Sopenharmony_cistatic void uncharge_page(struct page *page, struct uncharge_gather *ug) 69908c2ecf20Sopenharmony_ci{ 69918c2ecf20Sopenharmony_ci unsigned long nr_pages; 69928c2ecf20Sopenharmony_ci 69938c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageLRU(page), page); 69948c2ecf20Sopenharmony_ci 69958c2ecf20Sopenharmony_ci if (!page->mem_cgroup) 69968c2ecf20Sopenharmony_ci return; 69978c2ecf20Sopenharmony_ci 69988c2ecf20Sopenharmony_ci /* 69998c2ecf20Sopenharmony_ci * Nobody should be changing or seriously looking at 70008c2ecf20Sopenharmony_ci * page->mem_cgroup at this point, we have fully 70018c2ecf20Sopenharmony_ci * exclusive access to the page. 
70028c2ecf20Sopenharmony_ci */ 70038c2ecf20Sopenharmony_ci 70048c2ecf20Sopenharmony_ci if (ug->memcg != page->mem_cgroup) { 70058c2ecf20Sopenharmony_ci if (ug->memcg) { 70068c2ecf20Sopenharmony_ci uncharge_batch(ug); 70078c2ecf20Sopenharmony_ci uncharge_gather_clear(ug); 70088c2ecf20Sopenharmony_ci } 70098c2ecf20Sopenharmony_ci ug->memcg = page->mem_cgroup; 70108c2ecf20Sopenharmony_ci 70118c2ecf20Sopenharmony_ci /* pairs with css_put in uncharge_batch */ 70128c2ecf20Sopenharmony_ci css_get(&ug->memcg->css); 70138c2ecf20Sopenharmony_ci } 70148c2ecf20Sopenharmony_ci 70158c2ecf20Sopenharmony_ci nr_pages = compound_nr(page); 70168c2ecf20Sopenharmony_ci ug->nr_pages += nr_pages; 70178c2ecf20Sopenharmony_ci 70188c2ecf20Sopenharmony_ci if (!PageKmemcg(page)) { 70198c2ecf20Sopenharmony_ci ug->pgpgout++; 70208c2ecf20Sopenharmony_ci } else { 70218c2ecf20Sopenharmony_ci ug->nr_kmem += nr_pages; 70228c2ecf20Sopenharmony_ci __ClearPageKmemcg(page); 70238c2ecf20Sopenharmony_ci } 70248c2ecf20Sopenharmony_ci 70258c2ecf20Sopenharmony_ci ug->dummy_page = page; 70268c2ecf20Sopenharmony_ci page->mem_cgroup = NULL; 70278c2ecf20Sopenharmony_ci css_put(&ug->memcg->css); 70288c2ecf20Sopenharmony_ci} 70298c2ecf20Sopenharmony_ci 70308c2ecf20Sopenharmony_cistatic void uncharge_list(struct list_head *page_list) 70318c2ecf20Sopenharmony_ci{ 70328c2ecf20Sopenharmony_ci struct uncharge_gather ug; 70338c2ecf20Sopenharmony_ci struct list_head *next; 70348c2ecf20Sopenharmony_ci 70358c2ecf20Sopenharmony_ci uncharge_gather_clear(&ug); 70368c2ecf20Sopenharmony_ci 70378c2ecf20Sopenharmony_ci /* 70388c2ecf20Sopenharmony_ci * Note that the list can be a single page->lru; hence the 70398c2ecf20Sopenharmony_ci * do-while loop instead of a simple list_for_each_entry(). 70408c2ecf20Sopenharmony_ci */ 70418c2ecf20Sopenharmony_ci next = page_list->next; 70428c2ecf20Sopenharmony_ci do { 70438c2ecf20Sopenharmony_ci struct page *page; 70448c2ecf20Sopenharmony_ci 70458c2ecf20Sopenharmony_ci page = list_entry(next, struct page, lru); 70468c2ecf20Sopenharmony_ci next = page->lru.next; 70478c2ecf20Sopenharmony_ci 70488c2ecf20Sopenharmony_ci uncharge_page(page, &ug); 70498c2ecf20Sopenharmony_ci } while (next != page_list); 70508c2ecf20Sopenharmony_ci 70518c2ecf20Sopenharmony_ci if (ug.memcg) 70528c2ecf20Sopenharmony_ci uncharge_batch(&ug); 70538c2ecf20Sopenharmony_ci} 70548c2ecf20Sopenharmony_ci 70558c2ecf20Sopenharmony_ci/** 70568c2ecf20Sopenharmony_ci * mem_cgroup_uncharge - uncharge a page 70578c2ecf20Sopenharmony_ci * @page: page to uncharge 70588c2ecf20Sopenharmony_ci * 70598c2ecf20Sopenharmony_ci * Uncharge a page previously charged with mem_cgroup_charge(). 
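 *
 * Purely hypothetical pairing sketch (pages are normally uncharged
 * from the generic page freeing path rather than by the code that
 * charged them; this only shows the API symmetry):
 *
 *	if (mem_cgroup_charge(page, mm, GFP_KERNEL))
 *		return -ENOMEM;
 *	...
 *	mem_cgroup_uncharge(page);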
70608c2ecf20Sopenharmony_ci */ 70618c2ecf20Sopenharmony_civoid mem_cgroup_uncharge(struct page *page) 70628c2ecf20Sopenharmony_ci{ 70638c2ecf20Sopenharmony_ci struct uncharge_gather ug; 70648c2ecf20Sopenharmony_ci 70658c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 70668c2ecf20Sopenharmony_ci return; 70678c2ecf20Sopenharmony_ci 70688c2ecf20Sopenharmony_ci /* Don't touch page->lru of any random page, pre-check: */ 70698c2ecf20Sopenharmony_ci if (!page->mem_cgroup) 70708c2ecf20Sopenharmony_ci return; 70718c2ecf20Sopenharmony_ci 70728c2ecf20Sopenharmony_ci uncharge_gather_clear(&ug); 70738c2ecf20Sopenharmony_ci uncharge_page(page, &ug); 70748c2ecf20Sopenharmony_ci uncharge_batch(&ug); 70758c2ecf20Sopenharmony_ci} 70768c2ecf20Sopenharmony_ci 70778c2ecf20Sopenharmony_ci/** 70788c2ecf20Sopenharmony_ci * mem_cgroup_uncharge_list - uncharge a list of pages 70798c2ecf20Sopenharmony_ci * @page_list: list of pages to uncharge 70808c2ecf20Sopenharmony_ci * 70818c2ecf20Sopenharmony_ci * Uncharge a list of pages previously charged with 70828c2ecf20Sopenharmony_ci * mem_cgroup_charge(). 70838c2ecf20Sopenharmony_ci */ 70848c2ecf20Sopenharmony_civoid mem_cgroup_uncharge_list(struct list_head *page_list) 70858c2ecf20Sopenharmony_ci{ 70868c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 70878c2ecf20Sopenharmony_ci return; 70888c2ecf20Sopenharmony_ci 70898c2ecf20Sopenharmony_ci if (!list_empty(page_list)) 70908c2ecf20Sopenharmony_ci uncharge_list(page_list); 70918c2ecf20Sopenharmony_ci} 70928c2ecf20Sopenharmony_ci 70938c2ecf20Sopenharmony_ci/** 70948c2ecf20Sopenharmony_ci * mem_cgroup_migrate - charge a page's replacement 70958c2ecf20Sopenharmony_ci * @oldpage: currently circulating page 70968c2ecf20Sopenharmony_ci * @newpage: replacement page 70978c2ecf20Sopenharmony_ci * 70988c2ecf20Sopenharmony_ci * Charge @newpage as a replacement page for @oldpage. @oldpage will 70998c2ecf20Sopenharmony_ci * be uncharged upon free. 71008c2ecf20Sopenharmony_ci * 71018c2ecf20Sopenharmony_ci * Both pages must be locked, @newpage->mapping must be set up. 71028c2ecf20Sopenharmony_ci */ 71038c2ecf20Sopenharmony_civoid mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 71048c2ecf20Sopenharmony_ci{ 71058c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 71068c2ecf20Sopenharmony_ci unsigned int nr_pages; 71078c2ecf20Sopenharmony_ci unsigned long flags; 71088c2ecf20Sopenharmony_ci 71098c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 71108c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 71118c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 71128c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 71138c2ecf20Sopenharmony_ci newpage); 71148c2ecf20Sopenharmony_ci 71158c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 71168c2ecf20Sopenharmony_ci return; 71178c2ecf20Sopenharmony_ci 71188c2ecf20Sopenharmony_ci /* Page cache replacement: new page already charged? */ 71198c2ecf20Sopenharmony_ci if (newpage->mem_cgroup) 71208c2ecf20Sopenharmony_ci return; 71218c2ecf20Sopenharmony_ci 71228c2ecf20Sopenharmony_ci /* Swapcache readahead pages can get replaced before being charged */ 71238c2ecf20Sopenharmony_ci memcg = oldpage->mem_cgroup; 71248c2ecf20Sopenharmony_ci if (!memcg) 71258c2ecf20Sopenharmony_ci return; 71268c2ecf20Sopenharmony_ci 71278c2ecf20Sopenharmony_ci /* Force-charge the new page.
The old one will be freed soon */ 71288c2ecf20Sopenharmony_ci nr_pages = thp_nr_pages(newpage); 71298c2ecf20Sopenharmony_ci 71308c2ecf20Sopenharmony_ci page_counter_charge(&memcg->memory, nr_pages); 71318c2ecf20Sopenharmony_ci if (do_memsw_account()) 71328c2ecf20Sopenharmony_ci page_counter_charge(&memcg->memsw, nr_pages); 71338c2ecf20Sopenharmony_ci 71348c2ecf20Sopenharmony_ci css_get(&memcg->css); 71358c2ecf20Sopenharmony_ci commit_charge(newpage, memcg); 71368c2ecf20Sopenharmony_ci 71378c2ecf20Sopenharmony_ci local_irq_save(flags); 71388c2ecf20Sopenharmony_ci mem_cgroup_charge_statistics(memcg, newpage, nr_pages); 71398c2ecf20Sopenharmony_ci memcg_check_events(memcg, newpage); 71408c2ecf20Sopenharmony_ci local_irq_restore(flags); 71418c2ecf20Sopenharmony_ci} 71428c2ecf20Sopenharmony_ci 71438c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 71448c2ecf20Sopenharmony_ciEXPORT_SYMBOL(memcg_sockets_enabled_key); 71458c2ecf20Sopenharmony_ci 71468c2ecf20Sopenharmony_civoid mem_cgroup_sk_alloc(struct sock *sk) 71478c2ecf20Sopenharmony_ci{ 71488c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 71498c2ecf20Sopenharmony_ci 71508c2ecf20Sopenharmony_ci if (!mem_cgroup_sockets_enabled) 71518c2ecf20Sopenharmony_ci return; 71528c2ecf20Sopenharmony_ci 71538c2ecf20Sopenharmony_ci /* Do not associate the sock with unrelated interrupted task's memcg. */ 71548c2ecf20Sopenharmony_ci if (in_interrupt()) 71558c2ecf20Sopenharmony_ci return; 71568c2ecf20Sopenharmony_ci 71578c2ecf20Sopenharmony_ci rcu_read_lock(); 71588c2ecf20Sopenharmony_ci memcg = mem_cgroup_from_task(current); 71598c2ecf20Sopenharmony_ci if (memcg == root_mem_cgroup) 71608c2ecf20Sopenharmony_ci goto out; 71618c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 71628c2ecf20Sopenharmony_ci goto out; 71638c2ecf20Sopenharmony_ci if (css_tryget(&memcg->css)) 71648c2ecf20Sopenharmony_ci sk->sk_memcg = memcg; 71658c2ecf20Sopenharmony_ciout: 71668c2ecf20Sopenharmony_ci rcu_read_unlock(); 71678c2ecf20Sopenharmony_ci} 71688c2ecf20Sopenharmony_ci 71698c2ecf20Sopenharmony_civoid mem_cgroup_sk_free(struct sock *sk) 71708c2ecf20Sopenharmony_ci{ 71718c2ecf20Sopenharmony_ci if (sk->sk_memcg) 71728c2ecf20Sopenharmony_ci css_put(&sk->sk_memcg->css); 71738c2ecf20Sopenharmony_ci} 71748c2ecf20Sopenharmony_ci 71758c2ecf20Sopenharmony_ci/** 71768c2ecf20Sopenharmony_ci * mem_cgroup_charge_skmem - charge socket memory 71778c2ecf20Sopenharmony_ci * @memcg: memcg to charge 71788c2ecf20Sopenharmony_ci * @nr_pages: number of pages to charge 71798c2ecf20Sopenharmony_ci * 71808c2ecf20Sopenharmony_ci * Charges @nr_pages to @memcg. Returns %true if the charge fit within 71818c2ecf20Sopenharmony_ci * @memcg's configured limit, %false if the charge had to be forced. 
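 *
 * A hedged sketch of the intended caller pattern (loosely modelled
 * on the socket memory accounting code; "suppress_allocation" is a
 * label in that hypothetical caller, not something defined here):
 *
 *	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 *	    !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages))
 *		goto suppress_allocation;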
71828c2ecf20Sopenharmony_ci */ 71838c2ecf20Sopenharmony_cibool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 71848c2ecf20Sopenharmony_ci{ 71858c2ecf20Sopenharmony_ci gfp_t gfp_mask = GFP_KERNEL; 71868c2ecf20Sopenharmony_ci 71878c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 71888c2ecf20Sopenharmony_ci struct page_counter *fail; 71898c2ecf20Sopenharmony_ci 71908c2ecf20Sopenharmony_ci if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 71918c2ecf20Sopenharmony_ci memcg->tcpmem_pressure = 0; 71928c2ecf20Sopenharmony_ci return true; 71938c2ecf20Sopenharmony_ci } 71948c2ecf20Sopenharmony_ci page_counter_charge(&memcg->tcpmem, nr_pages); 71958c2ecf20Sopenharmony_ci memcg->tcpmem_pressure = 1; 71968c2ecf20Sopenharmony_ci return false; 71978c2ecf20Sopenharmony_ci } 71988c2ecf20Sopenharmony_ci 71998c2ecf20Sopenharmony_ci /* Don't block in the packet receive path */ 72008c2ecf20Sopenharmony_ci if (in_softirq()) 72018c2ecf20Sopenharmony_ci gfp_mask = GFP_NOWAIT; 72028c2ecf20Sopenharmony_ci 72038c2ecf20Sopenharmony_ci mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 72048c2ecf20Sopenharmony_ci 72058c2ecf20Sopenharmony_ci if (try_charge(memcg, gfp_mask, nr_pages) == 0) 72068c2ecf20Sopenharmony_ci return true; 72078c2ecf20Sopenharmony_ci 72088c2ecf20Sopenharmony_ci try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); 72098c2ecf20Sopenharmony_ci return false; 72108c2ecf20Sopenharmony_ci} 72118c2ecf20Sopenharmony_ci 72128c2ecf20Sopenharmony_ci/** 72138c2ecf20Sopenharmony_ci * mem_cgroup_uncharge_skmem - uncharge socket memory 72148c2ecf20Sopenharmony_ci * @memcg: memcg to uncharge 72158c2ecf20Sopenharmony_ci * @nr_pages: number of pages to uncharge 72168c2ecf20Sopenharmony_ci */ 72178c2ecf20Sopenharmony_civoid mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 72188c2ecf20Sopenharmony_ci{ 72198c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 72208c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->tcpmem, nr_pages); 72218c2ecf20Sopenharmony_ci return; 72228c2ecf20Sopenharmony_ci } 72238c2ecf20Sopenharmony_ci 72248c2ecf20Sopenharmony_ci mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 72258c2ecf20Sopenharmony_ci 72268c2ecf20Sopenharmony_ci refill_stock(memcg, nr_pages); 72278c2ecf20Sopenharmony_ci} 72288c2ecf20Sopenharmony_ci 72298c2ecf20Sopenharmony_cistatic int __init cgroup_memory(char *s) 72308c2ecf20Sopenharmony_ci{ 72318c2ecf20Sopenharmony_ci char *token; 72328c2ecf20Sopenharmony_ci 72338c2ecf20Sopenharmony_ci while ((token = strsep(&s, ",")) != NULL) { 72348c2ecf20Sopenharmony_ci if (!*token) 72358c2ecf20Sopenharmony_ci continue; 72368c2ecf20Sopenharmony_ci if (!strcmp(token, "nosocket")) 72378c2ecf20Sopenharmony_ci cgroup_memory_nosocket = true; 72388c2ecf20Sopenharmony_ci if (!strcmp(token, "nokmem")) 72398c2ecf20Sopenharmony_ci cgroup_memory_nokmem = true; 72408c2ecf20Sopenharmony_ci else if (!strcmp(token, "kmem")) 72418c2ecf20Sopenharmony_ci cgroup_memory_nokmem = false; 72428c2ecf20Sopenharmony_ci } 72438c2ecf20Sopenharmony_ci return 1; 72448c2ecf20Sopenharmony_ci} 72458c2ecf20Sopenharmony_ci__setup("cgroup.memory=", cgroup_memory); 72468c2ecf20Sopenharmony_ci 72478c2ecf20Sopenharmony_ci/* 72488c2ecf20Sopenharmony_ci * subsys_initcall() for memory controller. 
72498c2ecf20Sopenharmony_ci * 72508c2ecf20Sopenharmony_ci * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 72518c2ecf20Sopenharmony_ci * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 72528c2ecf20Sopenharmony_ci * basically everything that doesn't depend on a specific mem_cgroup structure 72538c2ecf20Sopenharmony_ci * should be initialized from here. 72548c2ecf20Sopenharmony_ci */ 72558c2ecf20Sopenharmony_cistatic int __init mem_cgroup_init(void) 72568c2ecf20Sopenharmony_ci{ 72578c2ecf20Sopenharmony_ci int cpu, node; 72588c2ecf20Sopenharmony_ci 72598c2ecf20Sopenharmony_ci cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 72608c2ecf20Sopenharmony_ci memcg_hotplug_cpu_dead); 72618c2ecf20Sopenharmony_ci 72628c2ecf20Sopenharmony_ci for_each_possible_cpu(cpu) 72638c2ecf20Sopenharmony_ci INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 72648c2ecf20Sopenharmony_ci drain_local_stock); 72658c2ecf20Sopenharmony_ci 72668c2ecf20Sopenharmony_ci for_each_node(node) { 72678c2ecf20Sopenharmony_ci struct mem_cgroup_tree_per_node *rtpn; 72688c2ecf20Sopenharmony_ci 72698c2ecf20Sopenharmony_ci rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 72708c2ecf20Sopenharmony_ci node_online(node) ? node : NUMA_NO_NODE); 72718c2ecf20Sopenharmony_ci 72728c2ecf20Sopenharmony_ci rtpn->rb_root = RB_ROOT; 72738c2ecf20Sopenharmony_ci rtpn->rb_rightmost = NULL; 72748c2ecf20Sopenharmony_ci spin_lock_init(&rtpn->lock); 72758c2ecf20Sopenharmony_ci soft_limit_tree.rb_tree_per_node[node] = rtpn; 72768c2ecf20Sopenharmony_ci } 72778c2ecf20Sopenharmony_ci 72788c2ecf20Sopenharmony_ci return 0; 72798c2ecf20Sopenharmony_ci} 72808c2ecf20Sopenharmony_cisubsys_initcall(mem_cgroup_init); 72818c2ecf20Sopenharmony_ci 72828c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_SWAP 72838c2ecf20Sopenharmony_cistatic struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 72848c2ecf20Sopenharmony_ci{ 72858c2ecf20Sopenharmony_ci while (!refcount_inc_not_zero(&memcg->id.ref)) { 72868c2ecf20Sopenharmony_ci /* 72878c2ecf20Sopenharmony_ci * The root cgroup cannot be destroyed, so its refcount must 72888c2ecf20Sopenharmony_ci * always be >= 1. 72898c2ecf20Sopenharmony_ci */ 72908c2ecf20Sopenharmony_ci if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { 72918c2ecf20Sopenharmony_ci VM_BUG_ON(1); 72928c2ecf20Sopenharmony_ci break; 72938c2ecf20Sopenharmony_ci } 72948c2ecf20Sopenharmony_ci memcg = parent_mem_cgroup(memcg); 72958c2ecf20Sopenharmony_ci if (!memcg) 72968c2ecf20Sopenharmony_ci memcg = root_mem_cgroup; 72978c2ecf20Sopenharmony_ci } 72988c2ecf20Sopenharmony_ci return memcg; 72998c2ecf20Sopenharmony_ci} 73008c2ecf20Sopenharmony_ci 73018c2ecf20Sopenharmony_ci/** 73028c2ecf20Sopenharmony_ci * mem_cgroup_swapout - transfer a memsw charge to swap 73038c2ecf20Sopenharmony_ci * @page: page whose memsw charge to transfer 73048c2ecf20Sopenharmony_ci * @entry: swap entry to move the charge to 73058c2ecf20Sopenharmony_ci * 73068c2ecf20Sopenharmony_ci * Transfer the memsw charge of @page to @entry.
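 *
 * A rough, hypothetical sketch of the calling context (an assumption
 * based on the locking comment further down, not a verbatim quote of
 * the reclaim code; it also glosses over details such as freezing
 * the page's refcount first):
 *
 *	xa_lock_irq(&swap_address_space(entry)->i_pages);
 *	mem_cgroup_swapout(page, entry);
 *	... delete @page from the swap cache ...
 *	xa_unlock_irq(&swap_address_space(entry)->i_pages);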
73078c2ecf20Sopenharmony_ci */ 73088c2ecf20Sopenharmony_civoid mem_cgroup_swapout(struct page *page, swp_entry_t entry) 73098c2ecf20Sopenharmony_ci{ 73108c2ecf20Sopenharmony_ci struct mem_cgroup *memcg, *swap_memcg; 73118c2ecf20Sopenharmony_ci unsigned int nr_entries; 73128c2ecf20Sopenharmony_ci unsigned short oldid; 73138c2ecf20Sopenharmony_ci 73148c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageLRU(page), page); 73158c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(page_count(page), page); 73168c2ecf20Sopenharmony_ci 73178c2ecf20Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 73188c2ecf20Sopenharmony_ci return; 73198c2ecf20Sopenharmony_ci 73208c2ecf20Sopenharmony_ci memcg = page->mem_cgroup; 73218c2ecf20Sopenharmony_ci 73228c2ecf20Sopenharmony_ci /* Readahead page, never charged */ 73238c2ecf20Sopenharmony_ci if (!memcg) 73248c2ecf20Sopenharmony_ci return; 73258c2ecf20Sopenharmony_ci 73268c2ecf20Sopenharmony_ci /* 73278c2ecf20Sopenharmony_ci * In case the memcg owning these pages has been offlined and doesn't 73288c2ecf20Sopenharmony_ci * have an ID allocated to it anymore, charge the closest online 73298c2ecf20Sopenharmony_ci * ancestor for the swap instead and transfer the memory+swap charge. 73308c2ecf20Sopenharmony_ci */ 73318c2ecf20Sopenharmony_ci swap_memcg = mem_cgroup_id_get_online(memcg); 73328c2ecf20Sopenharmony_ci nr_entries = thp_nr_pages(page); 73338c2ecf20Sopenharmony_ci /* Get references for the tail pages, too */ 73348c2ecf20Sopenharmony_ci if (nr_entries > 1) 73358c2ecf20Sopenharmony_ci mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 73368c2ecf20Sopenharmony_ci oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 73378c2ecf20Sopenharmony_ci nr_entries); 73388c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(oldid, page); 73398c2ecf20Sopenharmony_ci mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 73408c2ecf20Sopenharmony_ci 73418c2ecf20Sopenharmony_ci page->mem_cgroup = NULL; 73428c2ecf20Sopenharmony_ci 73438c2ecf20Sopenharmony_ci if (!mem_cgroup_is_root(memcg)) 73448c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->memory, nr_entries); 73458c2ecf20Sopenharmony_ci 73468c2ecf20Sopenharmony_ci if (!cgroup_memory_noswap && memcg != swap_memcg) { 73478c2ecf20Sopenharmony_ci if (!mem_cgroup_is_root(swap_memcg)) 73488c2ecf20Sopenharmony_ci page_counter_charge(&swap_memcg->memsw, nr_entries); 73498c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->memsw, nr_entries); 73508c2ecf20Sopenharmony_ci } 73518c2ecf20Sopenharmony_ci 73528c2ecf20Sopenharmony_ci /* 73538c2ecf20Sopenharmony_ci * Interrupts should be disabled here because the caller holds the 73548c2ecf20Sopenharmony_ci * i_pages lock which is taken with interrupts-off. It is 73558c2ecf20Sopenharmony_ci * important here to have the interrupts disabled because it is the 73568c2ecf20Sopenharmony_ci * only synchronisation we have for updating the per-CPU variables. 
73578c2ecf20Sopenharmony_ci */ 73588c2ecf20Sopenharmony_ci VM_BUG_ON(!irqs_disabled()); 73598c2ecf20Sopenharmony_ci mem_cgroup_charge_statistics(memcg, page, -nr_entries); 73608c2ecf20Sopenharmony_ci memcg_check_events(memcg, page); 73618c2ecf20Sopenharmony_ci 73628c2ecf20Sopenharmony_ci css_put(&memcg->css); 73638c2ecf20Sopenharmony_ci} 73648c2ecf20Sopenharmony_ci 73658c2ecf20Sopenharmony_ci/** 73668c2ecf20Sopenharmony_ci * mem_cgroup_try_charge_swap - try charging swap space for a page 73678c2ecf20Sopenharmony_ci * @page: page being added to swap 73688c2ecf20Sopenharmony_ci * @entry: swap entry to charge 73698c2ecf20Sopenharmony_ci * 73708c2ecf20Sopenharmony_ci * Try to charge @page's memcg for the swap space at @entry. 73718c2ecf20Sopenharmony_ci * 73728c2ecf20Sopenharmony_ci * Returns 0 on success, -ENOMEM on failure. 73738c2ecf20Sopenharmony_ci */ 73748c2ecf20Sopenharmony_ciint mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 73758c2ecf20Sopenharmony_ci{ 73768c2ecf20Sopenharmony_ci unsigned int nr_pages = thp_nr_pages(page); 73778c2ecf20Sopenharmony_ci struct page_counter *counter; 73788c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 73798c2ecf20Sopenharmony_ci unsigned short oldid; 73808c2ecf20Sopenharmony_ci 73818c2ecf20Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 73828c2ecf20Sopenharmony_ci return 0; 73838c2ecf20Sopenharmony_ci 73848c2ecf20Sopenharmony_ci memcg = page->mem_cgroup; 73858c2ecf20Sopenharmony_ci 73868c2ecf20Sopenharmony_ci /* Readahead page, never charged */ 73878c2ecf20Sopenharmony_ci if (!memcg) 73888c2ecf20Sopenharmony_ci return 0; 73898c2ecf20Sopenharmony_ci 73908c2ecf20Sopenharmony_ci if (!entry.val) { 73918c2ecf20Sopenharmony_ci memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 73928c2ecf20Sopenharmony_ci return 0; 73938c2ecf20Sopenharmony_ci } 73948c2ecf20Sopenharmony_ci 73958c2ecf20Sopenharmony_ci memcg = mem_cgroup_id_get_online(memcg); 73968c2ecf20Sopenharmony_ci 73978c2ecf20Sopenharmony_ci if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && 73988c2ecf20Sopenharmony_ci !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 73998c2ecf20Sopenharmony_ci memcg_memory_event(memcg, MEMCG_SWAP_MAX); 74008c2ecf20Sopenharmony_ci memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 74018c2ecf20Sopenharmony_ci mem_cgroup_id_put(memcg); 74028c2ecf20Sopenharmony_ci return -ENOMEM; 74038c2ecf20Sopenharmony_ci } 74048c2ecf20Sopenharmony_ci 74058c2ecf20Sopenharmony_ci /* Get references for the tail pages, too */ 74068c2ecf20Sopenharmony_ci if (nr_pages > 1) 74078c2ecf20Sopenharmony_ci mem_cgroup_id_get_many(memcg, nr_pages - 1); 74088c2ecf20Sopenharmony_ci oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 74098c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(oldid, page); 74108c2ecf20Sopenharmony_ci mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 74118c2ecf20Sopenharmony_ci 74128c2ecf20Sopenharmony_ci return 0; 74138c2ecf20Sopenharmony_ci} 74148c2ecf20Sopenharmony_ci 74158c2ecf20Sopenharmony_ci/** 74168c2ecf20Sopenharmony_ci * mem_cgroup_uncharge_swap - uncharge swap space 74178c2ecf20Sopenharmony_ci * @entry: swap entry to uncharge 74188c2ecf20Sopenharmony_ci * @nr_pages: the amount of swap space to uncharge 74198c2ecf20Sopenharmony_ci */ 74208c2ecf20Sopenharmony_civoid mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 74218c2ecf20Sopenharmony_ci{ 74228c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 74238c2ecf20Sopenharmony_ci unsigned short id; 74248c2ecf20Sopenharmony_ci 74258c2ecf20Sopenharmony_ci id = 
swap_cgroup_record(entry, 0, nr_pages); 74268c2ecf20Sopenharmony_ci rcu_read_lock(); 74278c2ecf20Sopenharmony_ci memcg = mem_cgroup_from_id(id); 74288c2ecf20Sopenharmony_ci if (memcg) { 74298c2ecf20Sopenharmony_ci if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { 74308c2ecf20Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 74318c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->swap, nr_pages); 74328c2ecf20Sopenharmony_ci else 74338c2ecf20Sopenharmony_ci page_counter_uncharge(&memcg->memsw, nr_pages); 74348c2ecf20Sopenharmony_ci } 74358c2ecf20Sopenharmony_ci mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 74368c2ecf20Sopenharmony_ci mem_cgroup_id_put_many(memcg, nr_pages); 74378c2ecf20Sopenharmony_ci } 74388c2ecf20Sopenharmony_ci rcu_read_unlock(); 74398c2ecf20Sopenharmony_ci} 74408c2ecf20Sopenharmony_ci 74418c2ecf20Sopenharmony_cilong mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 74428c2ecf20Sopenharmony_ci{ 74438c2ecf20Sopenharmony_ci long nr_swap_pages = get_nr_swap_pages(); 74448c2ecf20Sopenharmony_ci 74458c2ecf20Sopenharmony_ci if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 74468c2ecf20Sopenharmony_ci return nr_swap_pages; 74478c2ecf20Sopenharmony_ci for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 74488c2ecf20Sopenharmony_ci nr_swap_pages = min_t(long, nr_swap_pages, 74498c2ecf20Sopenharmony_ci READ_ONCE(memcg->swap.max) - 74508c2ecf20Sopenharmony_ci page_counter_read(&memcg->swap)); 74518c2ecf20Sopenharmony_ci return nr_swap_pages; 74528c2ecf20Sopenharmony_ci} 74538c2ecf20Sopenharmony_ci 74548c2ecf20Sopenharmony_cibool mem_cgroup_swap_full(struct page *page) 74558c2ecf20Sopenharmony_ci{ 74568c2ecf20Sopenharmony_ci struct mem_cgroup *memcg; 74578c2ecf20Sopenharmony_ci 74588c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(page), page); 74598c2ecf20Sopenharmony_ci 74608c2ecf20Sopenharmony_ci if (vm_swap_full()) 74618c2ecf20Sopenharmony_ci return true; 74628c2ecf20Sopenharmony_ci if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 74638c2ecf20Sopenharmony_ci return false; 74648c2ecf20Sopenharmony_ci 74658c2ecf20Sopenharmony_ci memcg = page->mem_cgroup; 74668c2ecf20Sopenharmony_ci if (!memcg) 74678c2ecf20Sopenharmony_ci return false; 74688c2ecf20Sopenharmony_ci 74698c2ecf20Sopenharmony_ci for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 74708c2ecf20Sopenharmony_ci unsigned long usage = page_counter_read(&memcg->swap); 74718c2ecf20Sopenharmony_ci 74728c2ecf20Sopenharmony_ci if (usage * 2 >= READ_ONCE(memcg->swap.high) || 74738c2ecf20Sopenharmony_ci usage * 2 >= READ_ONCE(memcg->swap.max)) 74748c2ecf20Sopenharmony_ci return true; 74758c2ecf20Sopenharmony_ci } 74768c2ecf20Sopenharmony_ci 74778c2ecf20Sopenharmony_ci return false; 74788c2ecf20Sopenharmony_ci} 74798c2ecf20Sopenharmony_ci 74808c2ecf20Sopenharmony_cistatic int __init setup_swap_account(char *s) 74818c2ecf20Sopenharmony_ci{ 74828c2ecf20Sopenharmony_ci if (!strcmp(s, "1")) 74838c2ecf20Sopenharmony_ci cgroup_memory_noswap = 0; 74848c2ecf20Sopenharmony_ci else if (!strcmp(s, "0")) 74858c2ecf20Sopenharmony_ci cgroup_memory_noswap = 1; 74868c2ecf20Sopenharmony_ci return 1; 74878c2ecf20Sopenharmony_ci} 74888c2ecf20Sopenharmony_ci__setup("swapaccount=", setup_swap_account); 74898c2ecf20Sopenharmony_ci 74908c2ecf20Sopenharmony_cistatic u64 swap_current_read(struct cgroup_subsys_state *css, 74918c2ecf20Sopenharmony_ci struct cftype *cft) 74928c2ecf20Sopenharmony_ci{ 74938c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = 
mem_cgroup_from_css(css); 74948c2ecf20Sopenharmony_ci 74958c2ecf20Sopenharmony_ci return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 74968c2ecf20Sopenharmony_ci} 74978c2ecf20Sopenharmony_ci 74988c2ecf20Sopenharmony_cistatic int swap_high_show(struct seq_file *m, void *v) 74998c2ecf20Sopenharmony_ci{ 75008c2ecf20Sopenharmony_ci return seq_puts_memcg_tunable(m, 75018c2ecf20Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 75028c2ecf20Sopenharmony_ci} 75038c2ecf20Sopenharmony_ci 75048c2ecf20Sopenharmony_cistatic ssize_t swap_high_write(struct kernfs_open_file *of, 75058c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 75068c2ecf20Sopenharmony_ci{ 75078c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 75088c2ecf20Sopenharmony_ci unsigned long high; 75098c2ecf20Sopenharmony_ci int err; 75108c2ecf20Sopenharmony_ci 75118c2ecf20Sopenharmony_ci buf = strstrip(buf); 75128c2ecf20Sopenharmony_ci err = page_counter_memparse(buf, "max", &high); 75138c2ecf20Sopenharmony_ci if (err) 75148c2ecf20Sopenharmony_ci return err; 75158c2ecf20Sopenharmony_ci 75168c2ecf20Sopenharmony_ci page_counter_set_high(&memcg->swap, high); 75178c2ecf20Sopenharmony_ci 75188c2ecf20Sopenharmony_ci return nbytes; 75198c2ecf20Sopenharmony_ci} 75208c2ecf20Sopenharmony_ci 75218c2ecf20Sopenharmony_cistatic int swap_max_show(struct seq_file *m, void *v) 75228c2ecf20Sopenharmony_ci{ 75238c2ecf20Sopenharmony_ci return seq_puts_memcg_tunable(m, 75248c2ecf20Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 75258c2ecf20Sopenharmony_ci} 75268c2ecf20Sopenharmony_ci 75278c2ecf20Sopenharmony_cistatic ssize_t swap_max_write(struct kernfs_open_file *of, 75288c2ecf20Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 75298c2ecf20Sopenharmony_ci{ 75308c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 75318c2ecf20Sopenharmony_ci unsigned long max; 75328c2ecf20Sopenharmony_ci int err; 75338c2ecf20Sopenharmony_ci 75348c2ecf20Sopenharmony_ci buf = strstrip(buf); 75358c2ecf20Sopenharmony_ci err = page_counter_memparse(buf, "max", &max); 75368c2ecf20Sopenharmony_ci if (err) 75378c2ecf20Sopenharmony_ci return err; 75388c2ecf20Sopenharmony_ci 75398c2ecf20Sopenharmony_ci xchg(&memcg->swap.max, max); 75408c2ecf20Sopenharmony_ci 75418c2ecf20Sopenharmony_ci return nbytes; 75428c2ecf20Sopenharmony_ci} 75438c2ecf20Sopenharmony_ci 75448c2ecf20Sopenharmony_cistatic int swap_events_show(struct seq_file *m, void *v) 75458c2ecf20Sopenharmony_ci{ 75468c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 75478c2ecf20Sopenharmony_ci 75488c2ecf20Sopenharmony_ci seq_printf(m, "high %lu\n", 75498c2ecf20Sopenharmony_ci atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 75508c2ecf20Sopenharmony_ci seq_printf(m, "max %lu\n", 75518c2ecf20Sopenharmony_ci atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 75528c2ecf20Sopenharmony_ci seq_printf(m, "fail %lu\n", 75538c2ecf20Sopenharmony_ci atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 75548c2ecf20Sopenharmony_ci 75558c2ecf20Sopenharmony_ci return 0; 75568c2ecf20Sopenharmony_ci} 75578c2ecf20Sopenharmony_ci 75588c2ecf20Sopenharmony_cistatic struct cftype swap_files[] = { 75598c2ecf20Sopenharmony_ci { 75608c2ecf20Sopenharmony_ci .name = "swap.current", 75618c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 75628c2ecf20Sopenharmony_ci .read_u64 = swap_current_read, 75638c2ecf20Sopenharmony_ci }, 75648c2ecf20Sopenharmony_ci { 75658c2ecf20Sopenharmony_ci .name = "swap.high", 
75668c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 75678c2ecf20Sopenharmony_ci .seq_show = swap_high_show, 75688c2ecf20Sopenharmony_ci .write = swap_high_write, 75698c2ecf20Sopenharmony_ci }, 75708c2ecf20Sopenharmony_ci { 75718c2ecf20Sopenharmony_ci .name = "swap.max", 75728c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 75738c2ecf20Sopenharmony_ci .seq_show = swap_max_show, 75748c2ecf20Sopenharmony_ci .write = swap_max_write, 75758c2ecf20Sopenharmony_ci }, 75768c2ecf20Sopenharmony_ci { 75778c2ecf20Sopenharmony_ci .name = "swap.events", 75788c2ecf20Sopenharmony_ci .flags = CFTYPE_NOT_ON_ROOT, 75798c2ecf20Sopenharmony_ci .file_offset = offsetof(struct mem_cgroup, swap_events_file), 75808c2ecf20Sopenharmony_ci .seq_show = swap_events_show, 75818c2ecf20Sopenharmony_ci }, 75828c2ecf20Sopenharmony_ci { } /* terminate */ 75838c2ecf20Sopenharmony_ci}; 75848c2ecf20Sopenharmony_ci 75858c2ecf20Sopenharmony_cistatic struct cftype memsw_files[] = { 75868c2ecf20Sopenharmony_ci { 75878c2ecf20Sopenharmony_ci .name = "memsw.usage_in_bytes", 75888c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 75898c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 75908c2ecf20Sopenharmony_ci }, 75918c2ecf20Sopenharmony_ci { 75928c2ecf20Sopenharmony_ci .name = "memsw.max_usage_in_bytes", 75938c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 75948c2ecf20Sopenharmony_ci .write = mem_cgroup_reset, 75958c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 75968c2ecf20Sopenharmony_ci }, 75978c2ecf20Sopenharmony_ci { 75988c2ecf20Sopenharmony_ci .name = "memsw.limit_in_bytes", 75998c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 76008c2ecf20Sopenharmony_ci .write = mem_cgroup_write, 76018c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 76028c2ecf20Sopenharmony_ci }, 76038c2ecf20Sopenharmony_ci { 76048c2ecf20Sopenharmony_ci .name = "memsw.failcnt", 76058c2ecf20Sopenharmony_ci .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 76068c2ecf20Sopenharmony_ci .write = mem_cgroup_reset, 76078c2ecf20Sopenharmony_ci .read_u64 = mem_cgroup_read_u64, 76088c2ecf20Sopenharmony_ci }, 76098c2ecf20Sopenharmony_ci { }, /* terminate */ 76108c2ecf20Sopenharmony_ci}; 76118c2ecf20Sopenharmony_ci 76128c2ecf20Sopenharmony_ci/* 76138c2ecf20Sopenharmony_ci * If mem_cgroup_swap_init() is implemented as a subsys_initcall() 76148c2ecf20Sopenharmony_ci * instead of a core_initcall(), this could mean cgroup_memory_noswap still 76158c2ecf20Sopenharmony_ci * remains set to false even when memcg is disabled via "cgroup_disable=memory" 76168c2ecf20Sopenharmony_ci * boot parameter. This may result in premature OOPS inside 76178c2ecf20Sopenharmony_ci * mem_cgroup_get_nr_swap_pages() function in corner cases. 
76188c2ecf20Sopenharmony_ci */ 76198c2ecf20Sopenharmony_cistatic int __init mem_cgroup_swap_init(void) 76208c2ecf20Sopenharmony_ci{ 76218c2ecf20Sopenharmony_ci /* No memory control -> no swap control */ 76228c2ecf20Sopenharmony_ci if (mem_cgroup_disabled()) 76238c2ecf20Sopenharmony_ci cgroup_memory_noswap = true; 76248c2ecf20Sopenharmony_ci 76258c2ecf20Sopenharmony_ci if (cgroup_memory_noswap) 76268c2ecf20Sopenharmony_ci return 0; 76278c2ecf20Sopenharmony_ci 76288c2ecf20Sopenharmony_ci WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 76298c2ecf20Sopenharmony_ci WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 76308c2ecf20Sopenharmony_ci 76318c2ecf20Sopenharmony_ci return 0; 76328c2ecf20Sopenharmony_ci} 76338c2ecf20Sopenharmony_cicore_initcall(mem_cgroup_swap_init); 76348c2ecf20Sopenharmony_ci 76358c2ecf20Sopenharmony_ci#endif /* CONFIG_MEMCG_SWAP */ 7636